From e17d43b93e544f5016c0251d2074c15568d5d963 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Tue, 12 May 2020 15:19:08 +0300
Subject: perf: Add perf text poke event

Record (single instruction) changes to the kernel text (i.e.
self-modifying code) in order to support tracers like Intel PT and
ARM CoreSight.

A copy of the running kernel code is needed as a reference point (e.g.
from /proc/kcore). The text poke event records the old bytes and the
new bytes so that the event can be processed forwards or backwards.

The basic problem is recording the modified instruction in an
unambiguous manner given SMP instruction cache (in)coherence. That is,
when modifying an instruction concurrently any solution with one or
multiple timestamps is not sufficient:

	CPU0				CPU1
 0
 1	write insn A
 2					execute insn A
 3	sync-I$
 4

Due to I$, CPU1 might execute either the old or new A. No matter where
we record tracepoints on CPU0, one simply cannot tell what CPU1 will
have observed, except that at 0 it must be the old one and at 4 it
must be the new one.

To solve this, take inspiration from x86 text poking, which has to
solve this exact problem due to variable length instruction encoding
and I-fetch windows.

 1) overwrite the instruction with a breakpoint and sync I$

This guarantees that that code flow will never hit the target
instruction anymore, on any CPU (or rather, it will cause an
exception).

 2) issue the TEXT_POKE event

 3) overwrite the breakpoint with the new instruction and sync I$

Now we know that any execution after the TEXT_POKE event will either
observe the breakpoint (and hit the exception) or the new instruction.

So by guarding the TEXT_POKE event with an exception on either side;
we can now tell, without doubt, which instruction another CPU will
have observed.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200512121922.8997-2-adrian.hunter@intel.com
---
 kernel/events/core.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 89 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 856d98c36f56..9b8f92500833 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -394,6 +394,7 @@ static atomic_t nr_switch_events __read_mostly;
 static atomic_t nr_ksymbol_events __read_mostly;
 static atomic_t nr_bpf_events __read_mostly;
 static atomic_t nr_cgroup_events __read_mostly;
+static atomic_t nr_text_poke_events __read_mostly;
 
 static LIST_HEAD(pmus);
 static DEFINE_MUTEX(pmus_lock);
@@ -4575,7 +4576,7 @@ static bool is_sb_event(struct perf_event *event)
 	if (attr->mmap || attr->mmap_data || attr->mmap2 ||
 	    attr->comm || attr->comm_exec ||
 	    attr->task || attr->ksymbol ||
-	    attr->context_switch ||
+	    attr->context_switch || attr->text_poke ||
 	    attr->bpf_event)
 		return true;
 	return false;
@@ -4651,6 +4652,8 @@ static void unaccount_event(struct perf_event *event)
 		atomic_dec(&nr_ksymbol_events);
 	if (event->attr.bpf_event)
 		atomic_dec(&nr_bpf_events);
+	if (event->attr.text_poke)
+		atomic_dec(&nr_text_poke_events);
 
 	if (dec) {
 		if (!atomic_add_unless(&perf_sched_count, -1, 1))
@@ -8628,6 +8631,89 @@ void perf_event_bpf_event(struct bpf_prog *prog,
 	perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
 }
 
+struct perf_text_poke_event {
+	const void		*old_bytes;
+	const void		*new_bytes;
+	size_t			pad;
+	u16			old_len;
+	u16			new_len;
+
+	struct {
+		struct perf_event_header	header;
+
+		u64				addr;
+	} event_id;
+};
+
+static int perf_event_text_poke_match(struct perf_event *event)
+{
+	return event->attr.text_poke;
+}
+
+static void perf_event_text_poke_output(struct perf_event *event, void *data)
+{
+	struct perf_text_poke_event *text_poke_event = data;
+	struct perf_output_handle handle;
+	struct perf_sample_data sample;
+	u64 padding = 0;
+	int ret;
+
+	if (!perf_event_text_poke_match(event))
+		return;
+
+	perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
+
+	ret = perf_output_begin(&handle, event, text_poke_event->event_id.header.size);
+	if (ret)
+		return;
+
+	perf_output_put(&handle, text_poke_event->event_id);
+	perf_output_put(&handle, text_poke_event->old_len);
+	perf_output_put(&handle, text_poke_event->new_len);
+
+	__output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);
+	__output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);
+
+	if (text_poke_event->pad)
+		__output_copy(&handle, &padding, text_poke_event->pad);
+
+	perf_event__output_id_sample(event, &handle, &sample);
+
+	perf_output_end(&handle);
+}
+
+void perf_event_text_poke(const void *addr, const void *old_bytes,
+			  size_t old_len, const void *new_bytes, size_t new_len)
+{
+	struct perf_text_poke_event text_poke_event;
+	size_t tot, pad;
+
+	if (!atomic_read(&nr_text_poke_events))
+		return;
+
+	tot  = sizeof(text_poke_event.old_len) + old_len;
+	tot += sizeof(text_poke_event.new_len) + new_len;
+	pad  = ALIGN(tot, sizeof(u64)) - tot;
+
+	text_poke_event = (struct perf_text_poke_event){
+		.old_bytes    = old_bytes,
+		.new_bytes    = new_bytes,
+		.pad          = pad,
+		.old_len      = old_len,
+		.new_len      = new_len,
+		.event_id  = {
+			.header = {
+				.type = PERF_RECORD_TEXT_POKE,
+				.misc = PERF_RECORD_MISC_KERNEL,
+				.size = sizeof(text_poke_event.event_id) + tot + pad,
+			},
+			.addr = (unsigned long)addr,
+		},
+	};
+
+	perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
+}
+
 void perf_event_itrace_started(struct perf_event *event)
 {
 	event->attach_state |= PERF_ATTACH_ITRACE;
@@ -10945,6 +11031,8 @@ static void account_event(struct perf_event *event)
 		atomic_inc(&nr_ksymbol_events);
 	if (event->attr.bpf_event)
 		atomic_inc(&nr_bpf_events);
+	if (event->attr.text_poke)
+		atomic_inc(&nr_text_poke_events);
 
 	if (inc) {
 		/*
-- 
cgit v1.2.3


From d002b8bc6dbc20e9043e279196cff8795dba05fe Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Thu, 28 May 2020 11:00:58 +0300
Subject: kprobes: Add symbols for kprobe insn pages

Symbols are needed for tools to describe instruction addresses. Pages
allocated for kprobe's purposes need symbols to be created for them.
Add such symbols to be visible via /proc/kallsyms.

Note: kprobe insn pages are not used if ftrace is configured. To see the
effect of this patch, the kernel must be configured with:

	# CONFIG_FUNCTION_TRACER is not set
	CONFIG_KPROBES=y

and for optimised kprobes:

	CONFIG_OPTPROBES=y

Example on x86:

	# perf probe __schedule
	Added new event:
	  probe:__schedule     (on __schedule)
	# cat /proc/kallsyms | grep '\[__builtin__kprobes\]'
	ffffffffc00d4000 t kprobe_insn_page     [__builtin__kprobes]
	ffffffffc00d6000 t kprobe_optinsn_page  [__builtin__kprobes]

Note: This patch adds "__builtin__kprobes" as a module name in
/proc/kallsyms for symbols for pages allocated for kprobes' purposes, even
though "__builtin__kprobes" is not a module.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Link: https://lkml.kernel.org/r/20200528080058.20230-1-adrian.hunter@intel.com
---
 include/linux/kprobes.h | 15 +++++++++++++++
 kernel/kallsyms.c       | 37 +++++++++++++++++++++++++++++++++----
 kernel/kprobes.c        | 45 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 93 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 594265bfd390..13fc58a74c04 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -242,6 +242,7 @@ struct kprobe_insn_cache {
 	struct mutex mutex;
 	void *(*alloc)(void);	/* allocate insn page */
 	void (*free)(void *);	/* free insn page */
+	const char *sym;	/* symbol for insn pages */
 	struct list_head pages; /* list of kprobe_insn_page */
 	size_t insn_size;	/* size of instruction slot */
 	int nr_garbage;
@@ -272,6 +273,10 @@ static inline bool is_kprobe_##__name##_slot(unsigned long addr)	\
 {									\
 	return __is_insn_slot_addr(&kprobe_##__name##_slots, addr);	\
 }
+#define KPROBE_INSN_PAGE_SYM		"kprobe_insn_page"
+#define KPROBE_OPTINSN_PAGE_SYM		"kprobe_optinsn_page"
+int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
+			     unsigned long *value, char *type, char *sym);
 #else /* __ARCH_WANT_KPROBES_INSN_SLOT */
 #define DEFINE_INSN_CACHE_OPS(__name)					\
 static inline bool is_kprobe_##__name##_slot(unsigned long addr)	\
@@ -373,6 +378,11 @@ void dump_kprobe(struct kprobe *kp);
 void *alloc_insn_page(void);
 void free_insn_page(void *page);
 
+int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+		       char *sym);
+
+int arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
+			    char *type, char *sym);
 #else /* !CONFIG_KPROBES: */
 
 static inline int kprobes_built_in(void)
@@ -435,6 +445,11 @@ static inline bool within_kprobe_blacklist(unsigned long addr)
 {
 	return true;
 }
+static inline int kprobe_get_kallsym(unsigned int symnum, unsigned long *value,
+				     char *type, char *sym)
+{
+	return -ERANGE;
+}
 #endif /* CONFIG_KPROBES */
 static inline int disable_kretprobe(struct kretprobe *rp)
 {
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 16c8c605f4b0..c6cc293c0e67 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -24,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/filter.h>
 #include <linux/ftrace.h>
+#include <linux/kprobes.h>
 #include <linux/compiler.h>
 
 /*
@@ -437,6 +438,7 @@ struct kallsym_iter {
 	loff_t pos_arch_end;
 	loff_t pos_mod_end;
 	loff_t pos_ftrace_mod_end;
+	loff_t pos_bpf_end;
 	unsigned long value;
 	unsigned int nameoff; /* If iterating in core kernel symbols. */
 	char type;
@@ -496,11 +498,33 @@ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)
 
 static int get_ksymbol_bpf(struct kallsym_iter *iter)
 {
+	int ret;
+
 	strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN);
 	iter->exported = 0;
-	return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
-			       &iter->value, &iter->type,
-			       iter->name) < 0 ? 0 : 1;
+	ret = bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
+			      &iter->value, &iter->type,
+			      iter->name);
+	if (ret < 0) {
+		iter->pos_bpf_end = iter->pos;
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * This uses "__builtin__kprobes" as a module name for symbols for pages
+ * allocated for kprobes' purposes, even though "__builtin__kprobes" is not a
+ * module.
+ */
+static int get_ksymbol_kprobe(struct kallsym_iter *iter)
+{
+	strlcpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN);
+	iter->exported = 0;
+	return kprobe_get_kallsym(iter->pos - iter->pos_bpf_end,
+				  &iter->value, &iter->type,
+				  iter->name) < 0 ? 0 : 1;
 }
 
 /* Returns space to next name. */
@@ -527,6 +551,7 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
 		iter->pos_arch_end = 0;
 		iter->pos_mod_end = 0;
 		iter->pos_ftrace_mod_end = 0;
+		iter->pos_bpf_end = 0;
 	}
 }
 
@@ -551,7 +576,11 @@ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos)
 	    get_ksymbol_ftrace_mod(iter))
 		return 1;
 
-	return get_ksymbol_bpf(iter);
+	if ((!iter->pos_bpf_end || iter->pos_bpf_end > pos) &&
+	    get_ksymbol_bpf(iter))
+		return 1;
+
+	return get_ksymbol_kprobe(iter);
 }
 
 /* Returns false if pos at or past end of file. */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 50cd84f53df0..058c0be3464b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -118,6 +118,7 @@ struct kprobe_insn_cache kprobe_insn_slots = {
 	.mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex),
 	.alloc = alloc_insn_page,
 	.free = free_insn_page,
+	.sym = KPROBE_INSN_PAGE_SYM,
 	.pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
 	.insn_size = MAX_INSN_SIZE,
 	.nr_garbage = 0,
@@ -290,12 +291,34 @@ bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr)
 	return ret;
 }
 
+int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
+			     unsigned long *value, char *type, char *sym)
+{
+	struct kprobe_insn_page *kip;
+	int ret = -ERANGE;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(kip, &c->pages, list) {
+		if ((*symnum)--)
+			continue;
+		strlcpy(sym, c->sym, KSYM_NAME_LEN);
+		*type = 't';
+		*value = (unsigned long)kip->insns;
+		ret = 0;
+		break;
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
 #ifdef CONFIG_OPTPROBES
 /* For optimized_kprobe buffer */
 struct kprobe_insn_cache kprobe_optinsn_slots = {
 	.mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex),
 	.alloc = alloc_insn_page,
 	.free = free_insn_page,
+	.sym = KPROBE_OPTINSN_PAGE_SYM,
 	.pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
 	/* .insn_size is initialized later */
 	.nr_garbage = 0,
@@ -2197,6 +2220,28 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry)
 	kprobe_remove_area_blacklist(entry, entry + 1);
 }
 
+int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
+				   char *type, char *sym)
+{
+	return -ERANGE;
+}
+
+int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+		       char *sym)
+{
+#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
+	if (!kprobe_cache_get_kallsym(&kprobe_insn_slots, &symnum, value, type, sym))
+		return 0;
+#ifdef CONFIG_OPTPROBES
+	if (!kprobe_cache_get_kallsym(&kprobe_optinsn_slots, &symnum, value, type, sym))
+		return 0;
+#endif
+#endif
+	if (!arch_kprobe_get_kallsym(&symnum, value, type, sym))
+		return 0;
+	return -ERANGE;
+}
+
 int __init __weak arch_populate_kprobe_blacklist(void)
 {
 	return 0;
-- 
cgit v1.2.3


From 69e49088692899d25dedfa22f00dfb9761e86ed7 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Tue, 12 May 2020 15:19:11 +0300
Subject: kprobes: Add perf ksymbol events for kprobe insn pages

Symbols are needed for tools to describe instruction addresses. Pages
allocated for kprobe's purposes need symbols to be created for them.
Add such symbols to be visible via perf ksymbol events.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Link: https://lkml.kernel.org/r/20200512121922.8997-5-adrian.hunter@intel.com
---
 include/uapi/linux/perf_event.h |  5 +++++
 kernel/kprobes.c                | 12 ++++++++++++
 2 files changed, 17 insertions(+)

(limited to 'kernel')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index e5bee6c17b86..e1a4179144a1 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -1049,6 +1049,11 @@ enum perf_event_type {
 enum perf_record_ksymbol_type {
 	PERF_RECORD_KSYMBOL_TYPE_UNKNOWN	= 0,
 	PERF_RECORD_KSYMBOL_TYPE_BPF		= 1,
+	/*
+	 * Out of line code such as kprobe-replaced instructions or optimized
+	 * kprobes.
+	 */
+	PERF_RECORD_KSYMBOL_TYPE_OOL		= 2,
 	PERF_RECORD_KSYMBOL_TYPE_MAX		/* non-ABI */
 };
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 058c0be3464b..2b58740ca0f3 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -35,6 +35,7 @@
 #include <linux/ftrace.h>
 #include <linux/cpu.h>
 #include <linux/jump_label.h>
+#include <linux/perf_event.h>
 
 #include <asm/sections.h>
 #include <asm/cacheflush.h>
@@ -184,6 +185,10 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
 	kip->cache = c;
 	list_add_rcu(&kip->list, &c->pages);
 	slot = kip->insns;
+
+	/* Record the perf ksymbol register event after adding the page */
+	perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns,
+			   PAGE_SIZE, false, c->sym);
 out:
 	mutex_unlock(&c->mutex);
 	return slot;
@@ -202,6 +207,13 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
 		 * next time somebody inserts a probe.
 		 */
 		if (!list_is_singular(&kip->list)) {
+			/*
+			 * Record perf ksymbol unregister event before removing
+			 * the page.
+			 */
+			perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
+					   (unsigned long)kip->insns, PAGE_SIZE, true,
+					   kip->cache->sym);
 			list_del_rcu(&kip->list);
 			synchronize_rcu();
 			kip->cache->free(kip->insns);
-- 
cgit v1.2.3


From fc0ea795f53c8d7040fa42471f74fe51d78d0834 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Tue, 12 May 2020 15:19:13 +0300
Subject: ftrace: Add symbols for ftrace trampolines

Symbols are needed for tools to describe instruction addresses. Pages
allocated for ftrace's purposes need symbols to be created for them.
Add such symbols to be visible via /proc/kallsyms.

Example on x86 with CONFIG_DYNAMIC_FTRACE=y

	# echo function > /sys/kernel/debug/tracing/current_tracer
	# cat /proc/kallsyms | grep '\[__builtin__ftrace\]'
	ffffffffc0238000 t ftrace_trampoline    [__builtin__ftrace]

Note: This patch adds "__builtin__ftrace" as a module name in /proc/kallsyms for
symbols for pages allocated for ftrace's purposes, even though "__builtin__ftrace"
is not a module.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200512121922.8997-7-adrian.hunter@intel.com
---
 include/linux/ftrace.h | 12 +++++---
 kernel/kallsyms.c      |  5 ++++
 kernel/trace/ftrace.c  | 77 ++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 88 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index e339dac91ee6..ce2c06f72e86 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -58,9 +58,6 @@ struct ftrace_direct_func;
 const char *
 ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
 		   unsigned long *off, char **modname, char *sym);
-int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
-			   char *type, char *name,
-			   char *module_name, int *exported);
 #else
 static inline const char *
 ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
@@ -68,6 +65,13 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
 {
 	return NULL;
 }
+#endif
+
+#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE)
+int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
+			   char *type, char *name,
+			   char *module_name, int *exported);
+#else
 static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
 					 char *type, char *name,
 					 char *module_name, int *exported)
@@ -76,7 +80,6 @@ static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *val
 }
 #endif
 
-
 #ifdef CONFIG_FUNCTION_TRACER
 
 extern int ftrace_enabled;
@@ -207,6 +210,7 @@ struct ftrace_ops {
 	struct ftrace_ops_hash		old_hash;
 	unsigned long			trampoline;
 	unsigned long			trampoline_size;
+	struct list_head		list;
 #endif
 };
 
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index c6cc293c0e67..834bfdc43235 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -482,6 +482,11 @@ static int get_ksymbol_mod(struct kallsym_iter *iter)
 	return 1;
 }
 
+/*
+ * ftrace_mod_get_kallsym() may also get symbols for pages allocated for ftrace
+ * purposes. In that case "__builtin__ftrace" is used as a module name, even
+ * though "__builtin__ftrace" is not a module.
+ */
 static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)
 {
 	int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end,
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index c163c3531faf..31675b209db2 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2764,6 +2764,38 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
 {
 }
 
+/* List of trace_ops that have allocated trampolines */
+static LIST_HEAD(ftrace_ops_trampoline_list);
+
+static void ftrace_add_trampoline_to_kallsyms(struct ftrace_ops *ops)
+{
+	lockdep_assert_held(&ftrace_lock);
+	list_add_rcu(&ops->list, &ftrace_ops_trampoline_list);
+}
+
+static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops)
+{
+	lockdep_assert_held(&ftrace_lock);
+	list_del_rcu(&ops->list);
+}
+
+/*
+ * "__builtin__ftrace" is used as a module name in /proc/kallsyms for symbols
+ * for pages allocated for ftrace purposes, even though "__builtin__ftrace" is
+ * not a module.
+ */
+#define FTRACE_TRAMPOLINE_MOD "__builtin__ftrace"
+#define FTRACE_TRAMPOLINE_SYM "ftrace_trampoline"
+
+static void ftrace_trampoline_free(struct ftrace_ops *ops)
+{
+	if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) &&
+	    ops->trampoline)
+		ftrace_remove_trampoline_from_kallsyms(ops);
+
+	arch_ftrace_trampoline_free(ops);
+}
+
 static void ftrace_startup_enable(int command)
 {
 	if (saved_ftrace_func != ftrace_trace_function) {
@@ -2934,7 +2966,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
 			synchronize_rcu_tasks();
 
  free_ops:
-		arch_ftrace_trampoline_free(ops);
+		ftrace_trampoline_free(ops);
 	}
 
 	return 0;
@@ -6178,6 +6210,27 @@ struct ftrace_mod_map {
 	unsigned int		num_funcs;
 };
 
+static int ftrace_get_trampoline_kallsym(unsigned int symnum,
+					 unsigned long *value, char *type,
+					 char *name, char *module_name,
+					 int *exported)
+{
+	struct ftrace_ops *op;
+
+	list_for_each_entry_rcu(op, &ftrace_ops_trampoline_list, list) {
+		if (!op->trampoline || symnum--)
+			continue;
+		*value = op->trampoline;
+		*type = 't';
+		strlcpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN);
+		strlcpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN);
+		*exported = 0;
+		return 0;
+	}
+
+	return -ERANGE;
+}
+
 #ifdef CONFIG_MODULES
 
 #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
@@ -6514,6 +6567,7 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
 {
 	struct ftrace_mod_map *mod_map;
 	struct ftrace_mod_func *mod_func;
+	int ret;
 
 	preempt_disable();
 	list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
@@ -6540,8 +6594,10 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
 		WARN_ON(1);
 		break;
 	}
+	ret = ftrace_get_trampoline_kallsym(symnum, value, type, name,
+					    module_name, exported);
 	preempt_enable();
-	return -ERANGE;
+	return ret;
 }
 
 #else
@@ -6553,6 +6609,18 @@ allocate_ftrace_mod_map(struct module *mod,
 {
 	return NULL;
 }
+int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
+			   char *type, char *name, char *module_name,
+			   int *exported)
+{
+	int ret;
+
+	preempt_disable();
+	ret = ftrace_get_trampoline_kallsym(symnum, value, type, name,
+					    module_name, exported);
+	preempt_enable();
+	return ret;
+}
 #endif /* CONFIG_MODULES */
 
 struct ftrace_init_func {
@@ -6733,7 +6801,12 @@ void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops)
 
 static void ftrace_update_trampoline(struct ftrace_ops *ops)
 {
+	unsigned long trampoline = ops->trampoline;
+
 	arch_ftrace_update_trampoline(ops);
+	if (ops->trampoline && ops->trampoline != trampoline &&
+	    (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+		ftrace_add_trampoline_to_kallsyms(ops);
 }
 
 void ftrace_init_trace_array(struct trace_array *tr)
-- 
cgit v1.2.3


From dd9ddf466ad7a5d2e247925d81ebb0b878bf3b76 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Tue, 12 May 2020 15:19:14 +0300
Subject: ftrace: Add perf ksymbol events for ftrace trampolines

Symbols are needed for tools to describe instruction addresses. Pages
allocated for ftrace's purposes need symbols to be created for them.
Add such symbols to be visible via perf ksymbol events.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200512121922.8997-8-adrian.hunter@intel.com
---
 include/uapi/linux/perf_event.h |  2 +-
 kernel/trace/ftrace.c           | 14 ++++++++++++--
 2 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index e1a4179144a1..52ca2093831c 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -1051,7 +1051,7 @@ enum perf_record_ksymbol_type {
 	PERF_RECORD_KSYMBOL_TYPE_BPF		= 1,
 	/*
 	 * Out of line code such as kprobe-replaced instructions or optimized
-	 * kprobes.
+	 * kprobes or ftrace trampolines.
 	 */
 	PERF_RECORD_KSYMBOL_TYPE_OOL		= 2,
 	PERF_RECORD_KSYMBOL_TYPE_MAX		/* non-ABI */
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 31675b209db2..2baaf7716537 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2790,8 +2790,13 @@ static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops)
 static void ftrace_trampoline_free(struct ftrace_ops *ops)
 {
 	if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) &&
-	    ops->trampoline)
+	    ops->trampoline) {
+		perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
+				   ops->trampoline, ops->trampoline_size,
+				   true, FTRACE_TRAMPOLINE_SYM);
+		/* Remove from kallsyms after the perf events */
 		ftrace_remove_trampoline_from_kallsyms(ops);
+	}
 
 	arch_ftrace_trampoline_free(ops);
 }
@@ -6805,8 +6810,13 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops)
 
 	arch_ftrace_update_trampoline(ops);
 	if (ops->trampoline && ops->trampoline != trampoline &&
-	    (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+	    (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) {
+		/* Add to kallsyms before the perf events */
 		ftrace_add_trampoline_to_kallsyms(ops);
+		perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
+				   ops->trampoline, ops->trampoline_size, false,
+				   FTRACE_TRAMPOLINE_SYM);
+	}
 }
 
 void ftrace_init_trace_array(struct trace_array *tr)
-- 
cgit v1.2.3


From 548e1f6c76e1eb80ba29edd4286b9b9f2c37f5bf Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Tue, 12 May 2020 15:19:15 +0300
Subject: ftrace: Add perf text poke events for ftrace trampolines

Add perf text poke events for ftrace trampolines when created and when
freed.

There can be 3 text_poke events for ftrace trampolines:

1. NULL -> trampoline
   By ftrace_update_trampoline() when !ops->trampoline
   Trampoline created

2. [e.g. on x86] CALL rel32 -> CALL rel32
   By arch_ftrace_update_trampoline() when ops->trampoline and
                        ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP
   [e.g. on x86] via text_poke_bp() which generates text poke events
   Trampoline-called function target updated

3. trampoline -> NULL
   By ftrace_trampoline_free() when ops->trampoline and
                 ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP
   Trampoline freed

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200512121922.8997-9-adrian.hunter@intel.com
---
 kernel/trace/ftrace.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2baaf7716537..d6bba734ab72 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2791,6 +2791,13 @@ static void ftrace_trampoline_free(struct ftrace_ops *ops)
 {
 	if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) &&
 	    ops->trampoline) {
+		/*
+		 * Record the text poke event before the ksymbol unregister
+		 * event.
+		 */
+		perf_event_text_poke((void *)ops->trampoline,
+				     (void *)ops->trampoline,
+				     ops->trampoline_size, NULL, 0);
 		perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
 				   ops->trampoline, ops->trampoline_size,
 				   true, FTRACE_TRAMPOLINE_SYM);
@@ -6816,6 +6823,13 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops)
 		perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
 				   ops->trampoline, ops->trampoline_size, false,
 				   FTRACE_TRAMPOLINE_SYM);
+		/*
+		 * Record the perf text poke event after the ksymbol register
+		 * event.
+		 */
+		perf_event_text_poke((void *)ops->trampoline, NULL, 0,
+				     (void *)ops->trampoline,
+				     ops->trampoline_size);
 	}
 }
 
-- 
cgit v1.2.3


From 3dc167ba5729ddd2d8e3fa1841653792c295d3f1 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 19 May 2020 19:25:06 +0200
Subject: sched/cputime: Improve cputime_adjust()

People report that utime and stime from /proc/<pid>/stat become very
wrong when the numbers are big enough, especially if you watch these
counters incrementally.

Specifically, the current implementation of: stime*rtime/total,
results in a saw-tooth function on top of the desired line, where the
teeth grow in size the larger the values become. IOW, it has a
relative error.

The result is that, when watching incrementally as time progresses
(for large values), we'll see periods of pure stime or utime increase,
irrespective of the actual ratio we're striving for.

Replace scale_stime() with a math64.h helper: mul_u64_u64_div_u64()
that is far more accurate. This also allows architectures to override
the implementation -- for instance they can opt for the old algorithm
if this new one turns out to be too expensive for them.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200519172506.GA317395@hirez.programming.kicks-ass.net
---
 arch/x86/include/asm/div64.h | 14 ++++++++++++--
 include/linux/math64.h       |  2 ++
 kernel/sched/cputime.c       | 46 +-------------------------------------------
 lib/math/div64.c             | 41 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 56 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
index 9b8cb50768c2..b8f1dc0761e4 100644
--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -74,16 +74,26 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
 #else
 # include <asm-generic/div64.h>
 
-static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
+/*
+ * Will generate an #DE when the result doesn't fit u64, could fix with an
+ * __ex_table[] entry when it becomes an issue.
+ */
+static inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div)
 {
 	u64 q;
 
 	asm ("mulq %2; divq %3" : "=a" (q)
-				: "a" (a), "rm" ((u64)mul), "rm" ((u64)div)
+				: "a" (a), "rm" (mul), "rm" (div)
 				: "rdx");
 
 	return q;
 }
+#define mul_u64_u64_div_u64 mul_u64_u64_div_u64
+
+static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
+{
+	return mul_u64_u64_div_u64(a, mul, div);
+}
 #define mul_u64_u32_div	mul_u64_u32_div
 
 #endif /* CONFIG_X86_32 */
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 11a267413e8e..d097119419e6 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -263,6 +263,8 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
 }
 #endif /* mul_u64_u32_div */
 
+u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div);
+
 #define DIV64_U64_ROUND_UP(ll, d)	\
 	({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); })
 
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ff9435dee1df..5a55d2300452 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -519,50 +519,6 @@ void account_idle_ticks(unsigned long ticks)
 	account_idle_time(cputime);
 }
 
-/*
- * Perform (stime * rtime) / total, but avoid multiplication overflow by
- * losing precision when the numbers are big.
- */
-static u64 scale_stime(u64 stime, u64 rtime, u64 total)
-{
-	u64 scaled;
-
-	for (;;) {
-		/* Make sure "rtime" is the bigger of stime/rtime */
-		if (stime > rtime)
-			swap(rtime, stime);
-
-		/* Make sure 'total' fits in 32 bits */
-		if (total >> 32)
-			goto drop_precision;
-
-		/* Does rtime (and thus stime) fit in 32 bits? */
-		if (!(rtime >> 32))
-			break;
-
-		/* Can we just balance rtime/stime rather than dropping bits? */
-		if (stime >> 31)
-			goto drop_precision;
-
-		/* We can grow stime and shrink rtime and try to make them both fit */
-		stime <<= 1;
-		rtime >>= 1;
-		continue;
-
-drop_precision:
-		/* We drop from rtime, it has more bits than stime */
-		rtime >>= 1;
-		total >>= 1;
-	}
-
-	/*
-	 * Make sure gcc understands that this is a 32x32->64 multiply,
-	 * followed by a 64/32->64 divide.
-	 */
-	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
-	return scaled;
-}
-
 /*
  * Adjust tick based cputime random precision against scheduler runtime
  * accounting.
@@ -622,7 +578,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
 		goto update;
 	}
 
-	stime = scale_stime(stime, rtime, stime + utime);
+	stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
 
 update:
 	/*
diff --git a/lib/math/div64.c b/lib/math/div64.c
index 368ca7fd0d82..3952a07130d8 100644
--- a/lib/math/div64.c
+++ b/lib/math/div64.c
@@ -190,3 +190,44 @@ u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
 	return __iter_div_u64_rem(dividend, divisor, remainder);
 }
 EXPORT_SYMBOL(iter_div_u64_rem);
+
+#ifndef mul_u64_u64_div_u64
+u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
+{
+	u64 res = 0, div, rem;
+	int shift;
+
+	/* can a * b overflow ? */
+	if (ilog2(a) + ilog2(b) > 62) {
+		/*
+		 * (b * a) / c is equal to
+		 *
+		 *      (b / c) * a +
+		 *      (b % c) * a / c
+		 *
+		 * if nothing overflows. Can the 1st multiplication
+		 * overflow? Yes, but we do not care: this can only
+		 * happen if the end result can't fit in u64 anyway.
+		 *
+		 * So the code below does
+		 *
+		 *      res = (b / c) * a;
+		 *      b = b % c;
+		 */
+		div = div64_u64_rem(b, c, &rem);
+		res = div * a;
+		b = rem;
+
+		shift = ilog2(a) + ilog2(b) - 62;
+		if (shift > 0) {
+			/* drop precision */
+			b >>= shift;
+			c >>= shift;
+			if (!c)
+				return res;
+		}
+	}
+
+	return res + div64_u64(a * b, c);
+}
+#endif
-- 
cgit v1.2.3


From 844eb6458facb09d4871a480d8bda06550927a80 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Wed, 3 Jun 2020 10:03:01 +0200
Subject: sched/pelt: Remove redundant cap_scale() definition

Besides in PELT cap_scale() is used in the Deadline scheduler class for
scale-invariant bandwidth enforcement.
Remove the cap_scale() definition in kernel/sched/pelt.c and keep the
one in kernel/sched/sched.h.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/20200603080304.16548-2-dietmar.eggemann@arm.com
---
 kernel/sched/pelt.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index b4b1ff96642f..dea5567e4f72 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -83,8 +83,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
 	return c1 + c2 + c3;
 }
 
-#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
-
 /*
  * Accumulate the three separate parts of the sum; d1 the remainder
  * of the last (incomplete) period, d2 the span of full periods and d3
-- 
cgit v1.2.3


From 0900acf2d8273f79432a4ded122ad5a265e85783 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Wed, 3 Jun 2020 10:03:02 +0200
Subject: sched/core: Remove redundant 'preempt' param from
 sched_class->yield_to_task()

Commit 6d1cafd8b56e ("sched: Resched proper CPU on yield_to()") moved
the code to resched the CPU from yield_to_task_fair() to yield_to()
making the preempt parameter in sched_class->yield_to_task()
unnecessary. Remove it. No other sched_class implements yield_to_task().

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200603080304.16548-3-dietmar.eggemann@arm.com
---
 kernel/sched/core.c  | 2 +-
 kernel/sched/fair.c  | 2 +-
 kernel/sched/sched.h | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8f360326861e..9c89b0eaf796 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5810,7 +5810,7 @@ again:
 	if (task_running(p_rq, p) || p->state)
 		goto out_unlock;
 
-	yielded = curr->sched_class->yield_to_task(rq, p, preempt);
+	yielded = curr->sched_class->yield_to_task(rq, p);
 	if (yielded) {
 		schedstat_inc(rq->yld_count);
 		/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cbcb2f71599b..6a4dab2f7c07 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7157,7 +7157,7 @@ static void yield_task_fair(struct rq *rq)
 	set_skip_buddy(se);
 }
 
-static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
 {
 	struct sched_entity *se = &p->se;
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1d4e94c1e5fe..8d5d06881294 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1748,7 +1748,7 @@ struct sched_class {
 	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
 	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
 	void (*yield_task)   (struct rq *rq);
-	bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);
+	bool (*yield_to_task)(struct rq *rq, struct task_struct *p);
 
 	void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
 
-- 
cgit v1.2.3


From e3e76a6a04114ec95b0969cd026e8904c67b431b Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Wed, 3 Jun 2020 10:03:03 +0200
Subject: sched/idle,stop: Remove .get_rr_interval from sched_class

The idle task and stop task sched_classes return 0 in this function.

The single call site in sched_rr_get_interval() calls
p->sched_class->get_rr_interval() only conditional in case it is
defined. Otherwise time_slice=0 will be used.

The deadline sched class does not define it. Commit a57beec5d427
("sched: Make sched_class::get_rr_interval() optional") introduced
the default time-slice=0 for sched classes which do not provide this
function.

So .get_rr_interval for idle and stop sched_class can be removed to
shrink the code a little.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200603080304.16548-4-dietmar.eggemann@arm.com
---
 kernel/sched/idle.c      | 7 -------
 kernel/sched/stop_task.c | 8 --------
 2 files changed, 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 05deb81bb3e3..8d75ca201484 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -446,11 +446,6 @@ prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
 	BUG();
 }
 
-static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
-{
-	return 0;
-}
-
 static void update_curr_idle(struct rq *rq)
 {
 }
@@ -479,8 +474,6 @@ const struct sched_class idle_sched_class = {
 
 	.task_tick		= task_tick_idle,
 
-	.get_rr_interval	= get_rr_interval_idle,
-
 	.prio_changed		= prio_changed_idle,
 	.switched_to		= switched_to_idle,
 	.update_curr		= update_curr_idle,
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 4c9e9975684f..3e50a6a8f1e5 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -102,12 +102,6 @@ prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
 	BUG(); /* how!?, what priority? */
 }
 
-static unsigned int
-get_rr_interval_stop(struct rq *rq, struct task_struct *task)
-{
-	return 0;
-}
-
 static void update_curr_stop(struct rq *rq)
 {
 }
@@ -136,8 +130,6 @@ const struct sched_class stop_sched_class = {
 
 	.task_tick		= task_tick_stop,
 
-	.get_rr_interval	= get_rr_interval_stop,
-
 	.prio_changed		= prio_changed_stop,
 	.switched_to		= switched_to_stop,
 	.update_curr		= update_curr_stop,
-- 
cgit v1.2.3


From 1ca2034ed798aea72a68d3904bd39a6cbfbdf405 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Wed, 3 Jun 2020 10:03:04 +0200
Subject: sched/fair: Remove unused 'sd' parameter from scale_rt_capacity()

Since commit 8ec59c0f5f49 ("sched/topology: Remove unused 'sd'
parameter from arch_scale_cpu_capacity()") it is no longer needed.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/20200603080304.16548-5-dietmar.eggemann@arm.com
---
 kernel/sched/fair.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6a4dab2f7c07..69da576f7f48 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8038,7 +8038,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
 	};
 }
 
-static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
+static unsigned long scale_rt_capacity(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long max = arch_scale_cpu_capacity(cpu);
@@ -8070,7 +8070,7 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
 
 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 {
-	unsigned long capacity = scale_rt_capacity(sd, cpu);
+	unsigned long capacity = scale_rt_capacity(cpu);
 	struct sched_group *sdg = sd->groups;
 
 	cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
-- 
cgit v1.2.3


From 4581bea8b4ec4de353369775dfef921191e393b3 Mon Sep 17 00:00:00 2001
From: Vincent Donnefort <vincent.donnefort@arm.com>
Date: Wed, 27 May 2020 17:39:14 +0100
Subject: sched/debug: Add new tracepoints to track util_est

The util_est signals are key elements for EAS task placement and
frequency selection. Having tracepoints to track these signals enables
load-tracking and schedutil testing and/or debugging by a toolkit.

Signed-off-by: Vincent Donnefort <vincent.donnefort@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Link: https://lkml.kernel.org/r/1590597554-370150-1-git-send-email-vincent.donnefort@arm.com
---
 include/trace/events/sched.h | 8 ++++++++
 kernel/sched/core.c          | 2 ++
 kernel/sched/fair.c          | 6 ++++++
 3 files changed, 16 insertions(+)

(limited to 'kernel')

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index ed168b0e2c53..04f9a4c7b0d9 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -634,6 +634,14 @@ DECLARE_TRACE(sched_overutilized_tp,
 	TP_PROTO(struct root_domain *rd, bool overutilized),
 	TP_ARGS(rd, overutilized));
 
+DECLARE_TRACE(sched_util_est_cfs_tp,
+	TP_PROTO(struct cfs_rq *cfs_rq),
+	TP_ARGS(cfs_rq));
+
+DECLARE_TRACE(sched_util_est_se_tp,
+	TP_PROTO(struct sched_entity *se),
+	TP_ARGS(se));
+
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9c89b0eaf796..0208b71bef80 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -36,6 +36,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 69da576f7f48..a785a9b262dd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3922,6 +3922,8 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
 	enqueued  = cfs_rq->avg.util_est.enqueued;
 	enqueued += _task_util_est(p);
 	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+
+	trace_sched_util_est_cfs_tp(cfs_rq);
 }
 
 /*
@@ -3952,6 +3954,8 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
 	ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
 	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
 
+	trace_sched_util_est_cfs_tp(cfs_rq);
+
 	/*
 	 * Skip update of task's estimated utilization when the task has not
 	 * yet completed an activation, e.g. being migrated.
@@ -4017,6 +4021,8 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
 	ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
 done:
 	WRITE_ONCE(p->se.avg.util_est, ue);
+
+	trace_sched_util_est_se_tp(&p->se);
 }
 
 static inline int task_fits_capacity(struct task_struct *p, long capacity)
-- 
cgit v1.2.3


From 461daba06bdcb9c7a3f92b9bbd110e1f7d093ffc Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Thu, 28 May 2020 12:54:42 -0700
Subject: psi: eliminate kthread_worker from psi trigger scheduling mechanism

Each psi group requires a dedicated kthread_delayed_work and
kthread_worker. Since no other work can be performed using psi_group's
kthread_worker, the same result can be obtained using a task_struct and
a timer directly. This makes psi triggering simpler by removing lists
and locks involved with kthread_worker usage and eliminates the need for
poll_scheduled atomic use in the hot path.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200528195442.190116-1-surenb@google.com
---
 include/linux/psi_types.h |   7 +--
 kernel/sched/psi.c        | 113 ++++++++++++++++++++++++++--------------------
 2 files changed, 68 insertions(+), 52 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 4b7258495a04..b95f3211566a 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -153,9 +153,10 @@ struct psi_group {
 	unsigned long avg[NR_PSI_STATES - 1][3];
 
 	/* Monitor work control */
-	atomic_t poll_scheduled;
-	struct kthread_worker __rcu *poll_kworker;
-	struct kthread_delayed_work poll_work;
+	struct task_struct __rcu *poll_task;
+	struct timer_list poll_timer;
+	wait_queue_head_t poll_wait;
+	atomic_t poll_wakeup;
 
 	/* Protects data used by the monitor */
 	struct mutex trigger_lock;
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 8f45cdb6463b..e53b711bd643 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -190,7 +190,6 @@ static void group_init(struct psi_group *group)
 	INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
 	mutex_init(&group->avgs_lock);
 	/* Init trigger-related members */
-	atomic_set(&group->poll_scheduled, 0);
 	mutex_init(&group->trigger_lock);
 	INIT_LIST_HEAD(&group->triggers);
 	memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
@@ -199,7 +198,7 @@ static void group_init(struct psi_group *group)
 	memset(group->polling_total, 0, sizeof(group->polling_total));
 	group->polling_next_update = ULLONG_MAX;
 	group->polling_until = 0;
-	rcu_assign_pointer(group->poll_kworker, NULL);
+	rcu_assign_pointer(group->poll_task, NULL);
 }
 
 void __init psi_init(void)
@@ -547,47 +546,38 @@ static u64 update_triggers(struct psi_group *group, u64 now)
 	return now + group->poll_min_period;
 }
 
-/*
- * Schedule polling if it's not already scheduled. It's safe to call even from
- * hotpath because even though kthread_queue_delayed_work takes worker->lock
- * spinlock that spinlock is never contended due to poll_scheduled atomic
- * preventing such competition.
- */
+/* Schedule polling if it's not already scheduled. */
 static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
 {
-	struct kthread_worker *kworker;
+	struct task_struct *task;
 
-	/* Do not reschedule if already scheduled */
-	if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0)
+	/*
+	 * Do not reschedule if already scheduled.
+	 * Possible race with a timer scheduled after this check but before
+	 * mod_timer below can be tolerated because group->polling_next_update
+	 * will keep updates on schedule.
+	 */
+	if (timer_pending(&group->poll_timer))
 		return;
 
 	rcu_read_lock();
 
-	kworker = rcu_dereference(group->poll_kworker);
+	task = rcu_dereference(group->poll_task);
 	/*
 	 * kworker might be NULL in case psi_trigger_destroy races with
 	 * psi_task_change (hotpath) which can't use locks
 	 */
-	if (likely(kworker))
-		kthread_queue_delayed_work(kworker, &group->poll_work, delay);
-	else
-		atomic_set(&group->poll_scheduled, 0);
+	if (likely(task))
+		mod_timer(&group->poll_timer, jiffies + delay);
 
 	rcu_read_unlock();
 }
 
-static void psi_poll_work(struct kthread_work *work)
+static void psi_poll_work(struct psi_group *group)
 {
-	struct kthread_delayed_work *dwork;
-	struct psi_group *group;
 	u32 changed_states;
 	u64 now;
 
-	dwork = container_of(work, struct kthread_delayed_work, work);
-	group = container_of(dwork, struct psi_group, poll_work);
-
-	atomic_set(&group->poll_scheduled, 0);
-
 	mutex_lock(&group->trigger_lock);
 
 	now = sched_clock();
@@ -623,6 +613,35 @@ out:
 	mutex_unlock(&group->trigger_lock);
 }
 
+static int psi_poll_worker(void *data)
+{
+	struct psi_group *group = (struct psi_group *)data;
+	struct sched_param param = {
+		.sched_priority = 1,
+	};
+
+	sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
+
+	while (true) {
+		wait_event_interruptible(group->poll_wait,
+				atomic_cmpxchg(&group->poll_wakeup, 1, 0) ||
+				kthread_should_stop());
+		if (kthread_should_stop())
+			break;
+
+		psi_poll_work(group);
+	}
+	return 0;
+}
+
+static void poll_timer_fn(struct timer_list *t)
+{
+	struct psi_group *group = from_timer(group, t, poll_timer);
+
+	atomic_set(&group->poll_wakeup, 1);
+	wake_up_interruptible(&group->poll_wait);
+}
+
 static void record_times(struct psi_group_cpu *groupc, int cpu,
 			 bool memstall_tick)
 {
@@ -1099,22 +1118,20 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
 
 	mutex_lock(&group->trigger_lock);
 
-	if (!rcu_access_pointer(group->poll_kworker)) {
-		struct sched_param param = {
-			.sched_priority = 1,
-		};
-		struct kthread_worker *kworker;
+	if (!rcu_access_pointer(group->poll_task)) {
+		struct task_struct *task;
 
-		kworker = kthread_create_worker(0, "psimon");
-		if (IS_ERR(kworker)) {
+		task = kthread_create(psi_poll_worker, group, "psimon");
+		if (IS_ERR(task)) {
 			kfree(t);
 			mutex_unlock(&group->trigger_lock);
-			return ERR_CAST(kworker);
+			return ERR_CAST(task);
 		}
-		sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
-		kthread_init_delayed_work(&group->poll_work,
-				psi_poll_work);
-		rcu_assign_pointer(group->poll_kworker, kworker);
+		atomic_set(&group->poll_wakeup, 0);
+		init_waitqueue_head(&group->poll_wait);
+		wake_up_process(task);
+		timer_setup(&group->poll_timer, poll_timer_fn, 0);
+		rcu_assign_pointer(group->poll_task, task);
 	}
 
 	list_add(&t->node, &group->triggers);
@@ -1132,7 +1149,7 @@ static void psi_trigger_destroy(struct kref *ref)
 {
 	struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
 	struct psi_group *group = t->group;
-	struct kthread_worker *kworker_to_destroy = NULL;
+	struct task_struct *task_to_destroy = NULL;
 
 	if (static_branch_likely(&psi_disabled))
 		return;
@@ -1158,13 +1175,13 @@ static void psi_trigger_destroy(struct kref *ref)
 			period = min(period, div_u64(tmp->win.size,
 					UPDATES_PER_WINDOW));
 		group->poll_min_period = period;
-		/* Destroy poll_kworker when the last trigger is destroyed */
+		/* Destroy poll_task when the last trigger is destroyed */
 		if (group->poll_states == 0) {
 			group->polling_until = 0;
-			kworker_to_destroy = rcu_dereference_protected(
-					group->poll_kworker,
+			task_to_destroy = rcu_dereference_protected(
+					group->poll_task,
 					lockdep_is_held(&group->trigger_lock));
-			rcu_assign_pointer(group->poll_kworker, NULL);
+			rcu_assign_pointer(group->poll_task, NULL);
 		}
 	}
 
@@ -1172,25 +1189,23 @@ static void psi_trigger_destroy(struct kref *ref)
 
 	/*
 	 * Wait for both *trigger_ptr from psi_trigger_replace and
-	 * poll_kworker RCUs to complete their read-side critical sections
-	 * before destroying the trigger and optionally the poll_kworker
+	 * poll_task RCUs to complete their read-side critical sections
+	 * before destroying the trigger and optionally the poll_task
 	 */
 	synchronize_rcu();
 	/*
 	 * Destroy the kworker after releasing trigger_lock to prevent a
 	 * deadlock while waiting for psi_poll_work to acquire trigger_lock
 	 */
-	if (kworker_to_destroy) {
+	if (task_to_destroy) {
 		/*
 		 * After the RCU grace period has expired, the worker
-		 * can no longer be found through group->poll_kworker.
+		 * can no longer be found through group->poll_task.
 		 * But it might have been already scheduled before
 		 * that - deschedule it cleanly before destroying it.
 		 */
-		kthread_cancel_delayed_work_sync(&group->poll_work);
-		atomic_set(&group->poll_scheduled, 0);
-
-		kthread_destroy_worker(kworker_to_destroy);
+		del_timer_sync(&group->poll_timer);
+		kthread_stop(task_to_destroy);
 	}
 	kfree(t);
 }
-- 
cgit v1.2.3


From 043eb8e1051143a24811e6f35c276e35ae8247b6 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Wed, 27 May 2020 16:29:08 +0200
Subject: kthread: Switch to cpu_possible_mask

Next patch will switch unbound kernel threads mask to
housekeeping_cpumask(), a subset of cpu_possible_mask. So in order to
ease bisection, lets first switch kthreads default affinity from
cpu_all_mask to cpu_possible_mask.

It looks safe to do so as cpu_possible_mask seem to be initialized
at setup_arch() time, way before kthreadd is created.

Suggested-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200527142909.23372-2-frederic@kernel.org
---
 kernel/kthread.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 8e3d2d7fdf5e..b86d37cda109 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -383,7 +383,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 		 * The kernel thread should not inherit these properties.
 		 */
 		sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
-		set_cpus_allowed_ptr(task, cpu_all_mask);
+		set_cpus_allowed_ptr(task, cpu_possible_mask);
 	}
 	kfree(create);
 	return task;
@@ -608,7 +608,7 @@ int kthreadd(void *unused)
 	/* Setup a clean context for our children to inherit. */
 	set_task_comm(tsk, "kthreadd");
 	ignore_signals(tsk);
-	set_cpus_allowed_ptr(tsk, cpu_all_mask);
+	set_cpus_allowed_ptr(tsk, cpu_possible_mask);
 	set_mems_allowed(node_states[N_MEMORY]);
 
 	current->flags |= PF_NOFREEZE;
-- 
cgit v1.2.3


From 9cc5b8656892a72438ee7deb5e80f5be47643b8b Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Wed, 27 May 2020 16:29:09 +0200
Subject: isolcpus: Affine unbound kernel threads to housekeeping cpus

This is a kernel enhancement that configures the cpu affinity of kernel
threads via kernel boot option nohz_full=.

When this option is specified, the cpumask is immediately applied upon
kthread launch. This does not affect kernel threads that specify cpu
and node.

This allows CPU isolation (that is not allowing certain threads
to execute on certain CPUs) without using the isolcpus=domain parameter,
making it possible to enable load balancing on such CPUs
during runtime (see kernel-parameters.txt).

Note-1: this is based off on Wind River's patch at
https://github.com/starlingx-staging/stx-integ/blob/master/kernel/kernel-std/centos/patches/affine-compute-kernel-threads.patch

Difference being that this patch is limited to modifying kernel thread
cpumask. Behaviour of other threads can be controlled via cgroups or
sched_setaffinity.

Note-2: Wind River's patch was based off Christoph Lameter's patch at
https://lwn.net/Articles/565932/ with the only difference being
the kernel parameter changed from kthread to kthread_cpus.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200527142909.23372-3-frederic@kernel.org
---
 include/linux/sched/isolation.h | 1 +
 kernel/kthread.c                | 6 ++++--
 kernel/sched/isolation.c        | 3 ++-
 3 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index 0fbcbacd1b29..cc9f393e2a70 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -14,6 +14,7 @@ enum hk_flags {
 	HK_FLAG_DOMAIN		= (1 << 5),
 	HK_FLAG_WQ		= (1 << 6),
 	HK_FLAG_MANAGED_IRQ	= (1 << 7),
+	HK_FLAG_KTHREAD		= (1 << 8),
 };
 
 #ifdef CONFIG_CPU_ISOLATION
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b86d37cda109..032b610912b0 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -27,6 +27,7 @@
 #include <linux/ptrace.h>
 #include <linux/uaccess.h>
 #include <linux/numa.h>
+#include <linux/sched/isolation.h>
 #include <trace/events/sched.h>
 
 
@@ -383,7 +384,8 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 		 * The kernel thread should not inherit these properties.
 		 */
 		sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
-		set_cpus_allowed_ptr(task, cpu_possible_mask);
+		set_cpus_allowed_ptr(task,
+				     housekeeping_cpumask(HK_FLAG_KTHREAD));
 	}
 	kfree(create);
 	return task;
@@ -608,7 +610,7 @@ int kthreadd(void *unused)
 	/* Setup a clean context for our children to inherit. */
 	set_task_comm(tsk, "kthreadd");
 	ignore_signals(tsk);
-	set_cpus_allowed_ptr(tsk, cpu_possible_mask);
+	set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_FLAG_KTHREAD));
 	set_mems_allowed(node_states[N_MEMORY]);
 
 	current->flags |= PF_NOFREEZE;
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 808244f3ddd9..5a6ea03f9882 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -140,7 +140,8 @@ static int __init housekeeping_nohz_full_setup(char *str)
 {
 	unsigned int flags;
 
-	flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
+	flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU |
+		HK_FLAG_MISC | HK_FLAG_KTHREAD;
 
 	return housekeeping_setup(str, flags);
 }
-- 
cgit v1.2.3


From b4098bfc5efb1fd7ecf40165132a1283aeea3500 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 26 Jul 2019 16:54:10 +0200
Subject: sched/deadline: Impose global limits on sched_attr::sched_period

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20190726161357.397880775@infradead.org
---
 include/linux/sched/sysctl.h |  3 +++
 kernel/sched/deadline.c      | 23 +++++++++++++++++++++--
 kernel/sysctl.c              | 14 ++++++++++++++
 3 files changed, 38 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 660ac49f2b53..24be30a40814 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -61,6 +61,9 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 extern unsigned int sysctl_sched_rt_period;
 extern int sysctl_sched_rt_runtime;
 
+extern unsigned int sysctl_sched_dl_period_max;
+extern unsigned int sysctl_sched_dl_period_min;
+
 #ifdef CONFIG_UCLAMP_TASK
 extern unsigned int sysctl_sched_uclamp_util_min;
 extern unsigned int sysctl_sched_uclamp_util_max;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 504d2f51b0d6..f31964ad9c2e 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2634,6 +2634,14 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
 	attr->sched_flags = dl_se->flags;
 }
 
+/*
+ * Default limits for DL period; on the top end we guard against small util
+ * tasks still getting rediculous long effective runtimes, on the bottom end we
+ * guard against timer DoS.
+ */
+unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
+unsigned int sysctl_sched_dl_period_min = 100;     /* 100 us */
+
 /*
  * This function validates the new parameters of a -deadline task.
  * We ask for the deadline not being zero, and greater or equal
@@ -2646,6 +2654,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
  */
 bool __checkparam_dl(const struct sched_attr *attr)
 {
+	u64 period, max, min;
+
 	/* special dl tasks don't actually use any parameter */
 	if (attr->sched_flags & SCHED_FLAG_SUGOV)
 		return true;
@@ -2669,12 +2679,21 @@ bool __checkparam_dl(const struct sched_attr *attr)
 	    attr->sched_period & (1ULL << 63))
 		return false;
 
+	period = attr->sched_period;
+	if (!period)
+		period = attr->sched_deadline;
+
 	/* runtime <= deadline <= period (if period != 0) */
-	if ((attr->sched_period != 0 &&
-	     attr->sched_period < attr->sched_deadline) ||
+	if (period < attr->sched_deadline ||
 	    attr->sched_deadline < attr->sched_runtime)
 		return false;
 
+	max = (u64)READ_ONCE(sysctl_sched_dl_period_max) * NSEC_PER_USEC;
+	min = (u64)READ_ONCE(sysctl_sched_dl_period_min) * NSEC_PER_USEC;
+
+	if (period < min || period > max)
+		return false;
+
 	return true;
 }
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index db1ce7af2563..4aea67d3d552 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1779,6 +1779,20 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= sched_rt_handler,
 	},
+	{
+		.procname	= "sched_deadline_period_max_us",
+		.data		= &sysctl_sched_dl_period_max,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "sched_deadline_period_min_us",
+		.data		= &sysctl_sched_dl_period_min,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{
 		.procname	= "sched_rr_timeslice_ms",
 		.data		= &sysctl_sched_rr_timeslice,
-- 
cgit v1.2.3


From 3ea2f097b17e13a8280f1f9386c331b326a3dbef Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Tue, 9 Jun 2020 14:37:48 +0200
Subject: sched/fair: Fix NOHZ next idle balance

With commit:
  'b7031a02ec75 ("sched/fair: Add NOHZ_STATS_KICK")'
rebalance_domains of the local cfs_rq happens before others idle cpus have
updated nohz.next_balance and its value is overwritten.

Move the update of nohz.next_balance for other idles cpus before balancing
and updating the next_balance of local cfs_rq.

Also, the nohz.next_balance is now updated only if all idle cpus got a
chance to rebalance their domains and the idle balance has not been aborted
because of new activities on the CPU. In case of need_resched, the idle
load balance will be kick the next jiffie in order to address remaining
ilb.

Fixes: b7031a02ec75 ("sched/fair: Add NOHZ_STATS_KICK")
Reported-by: Peng Liu <iwtbavbm@gmail.com>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Link: https://lkml.kernel.org/r/20200609123748.18636-1-vincent.guittot@linaro.org
---
 kernel/sched/fair.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a785a9b262dd..295c9ffa850b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10022,7 +10022,12 @@ static void kick_ilb(unsigned int flags)
 {
 	int ilb_cpu;
 
-	nohz.next_balance++;
+	/*
+	 * Increase nohz.next_balance only when if full ilb is triggered but
+	 * not if we only update stats.
+	 */
+	if (flags & NOHZ_BALANCE_KICK)
+		nohz.next_balance = jiffies+1;
 
 	ilb_cpu = find_new_ilb();
 
@@ -10343,6 +10348,14 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 		}
 	}
 
+	/*
+	 * next_balance will be updated only when there is a need.
+	 * When the CPU is attached to null domain for ex, it will not be
+	 * updated.
+	 */
+	if (likely(update_next_balance))
+		nohz.next_balance = next_balance;
+
 	/* Newly idle CPU doesn't need an update */
 	if (idle != CPU_NEWLY_IDLE) {
 		update_blocked_averages(this_cpu);
@@ -10363,14 +10376,6 @@ abort:
 	if (has_blocked_load)
 		WRITE_ONCE(nohz.has_blocked, 1);
 
-	/*
-	 * next_balance will be updated only when there is a need.
-	 * When the CPU is attached to null domain for ex, it will not be
-	 * updated.
-	 */
-	if (likely(update_next_balance))
-		nohz.next_balance = next_balance;
-
 	return ret;
 }
 
-- 
cgit v1.2.3


From 9b1b234bb86bcdcdb142e900d39b599185465dbb Mon Sep 17 00:00:00 2001
From: Peng Liu <iwtbavbm@gmail.com>
Date: Tue, 9 Jun 2020 23:09:36 +0800
Subject: sched: correct SD_flags returned by tl->sd_flags()

During sched domain init, we check whether non-topological SD_flags are
returned by tl->sd_flags(), if found, fire a waning and correct the
violation, but the code failed to correct the violation. Correct this.

Fixes: 143e1e28cb40 ("sched: Rework sched_domain topology definition")
Signed-off-by: Peng Liu <iwtbavbm@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Link: https://lkml.kernel.org/r/20200609150936.GA13060@iZj6chx1xj0e0buvshuecpZ
---
 kernel/sched/topology.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ba81187bb7af..9079d865a935 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1328,7 +1328,7 @@ sd_init(struct sched_domain_topology_level *tl,
 		sd_flags = (*tl->sd_flags)();
 	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
 			"wrong sd_flags in topology description\n"))
-		sd_flags &= ~TOPOLOGY_SD_FLAGS;
+		sd_flags &= TOPOLOGY_SD_FLAGS;
 
 	/* Apply detected topology flags */
 	sd_flags |= dflags;
-- 
cgit v1.2.3


From c81b89329933c6c0be809d4c0d2cb57c49153ee3 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Wed, 20 May 2020 15:42:39 +0200
Subject: sched/deadline: Optimize dl_bw_cpus()

Return the weight of the root domain (rd) span in case it is a subset
of the cpu_active_mask.

Continue to compute the number of CPUs over rd span and cpu_active_mask
when in hotplug.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lkml.kernel.org/r/20200520134243.19352-2-dietmar.eggemann@arm.com
---
 kernel/sched/deadline.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index f31964ad9c2e..ec90265e9d8e 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -54,10 +54,16 @@ static inline struct dl_bw *dl_bw_of(int i)
 static inline int dl_bw_cpus(int i)
 {
 	struct root_domain *rd = cpu_rq(i)->rd;
-	int cpus = 0;
+	int cpus;
 
 	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
 			 "sched RCU must be held");
+
+	if (cpumask_subset(rd->span, cpu_active_mask))
+		return cpumask_weight(rd->span);
+
+	cpus = 0;
+
 	for_each_cpu_and(i, rd->span, cpu_active_mask)
 		cpus++;
 
-- 
cgit v1.2.3


From fc9dc698472aa460a8b3b036d9b1d0b751f12f58 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Wed, 20 May 2020 15:42:40 +0200
Subject: sched/deadline: Add dl_bw_capacity()

Capacity-aware SCHED_DEADLINE Admission Control (AC) needs root domain
(rd) CPU capacity sum.

Introduce dl_bw_capacity() which for a symmetric rd w/ a CPU capacity
of SCHED_CAPACITY_SCALE simply relies on dl_bw_cpus() to return #CPUs
multiplied by SCHED_CAPACITY_SCALE.

For an asymmetric rd or a CPU capacity < SCHED_CAPACITY_SCALE it
computes the CPU capacity sum over rd span and cpu_active_mask.

A 'XXX Fix:' comment was added to highlight that if 'rq->rd ==
def_root_domain' AC should be performed against the capacity of the
CPU the task is running on rather the rd CPU capacity sum. This
issue already exists w/o capacity awareness.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lkml.kernel.org/r/20200520134243.19352-3-dietmar.eggemann@arm.com
---
 kernel/sched/deadline.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index ec90265e9d8e..01f474a5bd14 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -69,6 +69,34 @@ static inline int dl_bw_cpus(int i)
 
 	return cpus;
 }
+
+static inline unsigned long __dl_bw_capacity(int i)
+{
+	struct root_domain *rd = cpu_rq(i)->rd;
+	unsigned long cap = 0;
+
+	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+			 "sched RCU must be held");
+
+	for_each_cpu_and(i, rd->span, cpu_active_mask)
+		cap += capacity_orig_of(i);
+
+	return cap;
+}
+
+/*
+ * XXX Fix: If 'rq->rd == def_root_domain' perform AC against capacity
+ * of the CPU the task is running on rather rd's \Sum CPU capacity.
+ */
+static inline unsigned long dl_bw_capacity(int i)
+{
+	if (!static_branch_unlikely(&sched_asym_cpucapacity) &&
+	    capacity_orig_of(i) == SCHED_CAPACITY_SCALE) {
+		return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT;
+	} else {
+		return __dl_bw_capacity(i);
+	}
+}
 #else
 static inline struct dl_bw *dl_bw_of(int i)
 {
@@ -79,6 +107,11 @@ static inline int dl_bw_cpus(int i)
 {
 	return 1;
 }
+
+static inline unsigned long dl_bw_capacity(int i)
+{
+	return SCHED_CAPACITY_SCALE;
+}
 #endif
 
 static inline
-- 
cgit v1.2.3


From 60ffd5edc5e4fa69622c125c54ef8e7d5d894af8 Mon Sep 17 00:00:00 2001
From: Luca Abeni <luca.abeni@santannapisa.it>
Date: Wed, 20 May 2020 15:42:41 +0200
Subject: sched/deadline: Improve admission control for asymmetric CPU
 capacities

The current SCHED_DEADLINE (DL) admission control ensures that

    sum of reserved CPU bandwidth < x * M

where

    x = /proc/sys/kernel/sched_rt_{runtime,period}_us
    M = # CPUs in root domain.

DL admission control works well for homogeneous systems where the
capacity of all CPUs are equal (1024). I.e. bounded tardiness for DL
and non-starvation of non-DL tasks is guaranteed.

But on heterogeneous systems where capacity of CPUs are different it
could fail by over-allocating CPU time on smaller capacity CPUs.

On an Arm big.LITTLE/DynamIQ system DL tasks can easily starve other
tasks making it unusable.

Fix this by explicitly considering the CPU capacity in the DL admission
test by replacing M with the root domain CPU capacity sum.

Signed-off-by: Luca Abeni <luca.abeni@santannapisa.it>
Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lkml.kernel.org/r/20200520134243.19352-4-dietmar.eggemann@arm.com
---
 kernel/sched/deadline.c | 30 +++++++++++++++++-------------
 kernel/sched/sched.h    |  6 +++---
 2 files changed, 20 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 01f474a5bd14..9ebd0a9241ed 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2590,11 +2590,12 @@ void sched_dl_do_global(void)
 int sched_dl_overflow(struct task_struct *p, int policy,
 		      const struct sched_attr *attr)
 {
-	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 	u64 period = attr->sched_period ?: attr->sched_deadline;
 	u64 runtime = attr->sched_runtime;
 	u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
-	int cpus, err = -1;
+	int cpus, err = -1, cpu = task_cpu(p);
+	struct dl_bw *dl_b = dl_bw_of(cpu);
+	unsigned long cap;
 
 	if (attr->sched_flags & SCHED_FLAG_SUGOV)
 		return 0;
@@ -2609,15 +2610,17 @@ int sched_dl_overflow(struct task_struct *p, int policy,
 	 * allocated bandwidth of the container.
 	 */
 	raw_spin_lock(&dl_b->lock);
-	cpus = dl_bw_cpus(task_cpu(p));
+	cpus = dl_bw_cpus(cpu);
+	cap = dl_bw_capacity(cpu);
+
 	if (dl_policy(policy) && !task_has_dl_policy(p) &&
-	    !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+	    !__dl_overflow(dl_b, cap, 0, new_bw)) {
 		if (hrtimer_active(&p->dl.inactive_timer))
 			__dl_sub(dl_b, p->dl.dl_bw, cpus);
 		__dl_add(dl_b, new_bw, cpus);
 		err = 0;
 	} else if (dl_policy(policy) && task_has_dl_policy(p) &&
-		   !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+		   !__dl_overflow(dl_b, cap, p->dl.dl_bw, new_bw)) {
 		/*
 		 * XXX this is slightly incorrect: when the task
 		 * utilization decreases, we should delay the total
@@ -2772,19 +2775,19 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
 #ifdef CONFIG_SMP
 int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
 {
+	unsigned long flags, cap;
 	unsigned int dest_cpu;
 	struct dl_bw *dl_b;
 	bool overflow;
-	int cpus, ret;
-	unsigned long flags;
+	int ret;
 
 	dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
 
 	rcu_read_lock_sched();
 	dl_b = dl_bw_of(dest_cpu);
 	raw_spin_lock_irqsave(&dl_b->lock, flags);
-	cpus = dl_bw_cpus(dest_cpu);
-	overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
+	cap = dl_bw_capacity(dest_cpu);
+	overflow = __dl_overflow(dl_b, cap, 0, p->dl.dl_bw);
 	if (overflow) {
 		ret = -EBUSY;
 	} else {
@@ -2794,6 +2797,8 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo
 		 * We will free resources in the source root_domain
 		 * later on (see set_cpus_allowed_dl()).
 		 */
+		int cpus = dl_bw_cpus(dest_cpu);
+
 		__dl_add(dl_b, p->dl.dl_bw, cpus);
 		ret = 0;
 	}
@@ -2826,16 +2831,15 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
 
 bool dl_cpu_busy(unsigned int cpu)
 {
-	unsigned long flags;
+	unsigned long flags, cap;
 	struct dl_bw *dl_b;
 	bool overflow;
-	int cpus;
 
 	rcu_read_lock_sched();
 	dl_b = dl_bw_of(cpu);
 	raw_spin_lock_irqsave(&dl_b->lock, flags);
-	cpus = dl_bw_cpus(cpu);
-	overflow = __dl_overflow(dl_b, cpus, 0, 0);
+	cap = dl_bw_capacity(cpu);
+	overflow = __dl_overflow(dl_b, cap, 0, 0);
 	raw_spin_unlock_irqrestore(&dl_b->lock, flags);
 	rcu_read_unlock_sched();
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8d5d06881294..91b250f265c0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -310,11 +310,11 @@ void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
 	__dl_update(dl_b, -((s32)tsk_bw / cpus));
 }
 
-static inline
-bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+static inline bool __dl_overflow(struct dl_bw *dl_b, unsigned long cap,
+				 u64 old_bw, u64 new_bw)
 {
 	return dl_b->bw != -1 &&
-	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
+	       cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw;
 }
 
 extern void init_dl_bw(struct dl_bw *dl_b);
-- 
cgit v1.2.3


From b4118988fdcb4554ea6687dd8ff68bcab690b8ea Mon Sep 17 00:00:00 2001
From: Luca Abeni <luca.abeni@santannapisa.it>
Date: Wed, 20 May 2020 15:42:42 +0200
Subject: sched/deadline: Make DL capacity-aware

The current SCHED_DEADLINE (DL) scheduler uses a global EDF scheduling
algorithm w/o considering CPU capacity or task utilization.
This works well on homogeneous systems where DL tasks are guaranteed
to have a bounded tardiness but presents issues on heterogeneous
systems.

A DL task can migrate to a CPU which does not have enough CPU capacity
to correctly serve the task (e.g. a task w/ 70ms runtime and 100ms
period on a CPU w/ 512 capacity).

Add the DL fitness function dl_task_fits_capacity() for DL admission
control on heterogeneous systems. A task fits onto a CPU if:

    CPU original capacity / 1024 >= task runtime / task deadline

Use this function on heterogeneous systems to try to find a CPU which
meets this criterion during task wakeup, push and offline migration.

On homogeneous systems the original behavior of the DL admission
control should be retained.

Signed-off-by: Luca Abeni <luca.abeni@santannapisa.it>
Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lkml.kernel.org/r/20200520134243.19352-5-dietmar.eggemann@arm.com
---
 kernel/sched/cpudeadline.c | 14 +++++++++++++-
 kernel/sched/deadline.c    | 18 ++++++++++++++----
 kernel/sched/sched.h       | 15 +++++++++++++++
 3 files changed, 42 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5cc4012572ec..8630f2a40a3f 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -121,7 +121,19 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
 
 	if (later_mask &&
 	    cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
-		return 1;
+		int cpu;
+
+		if (!static_branch_unlikely(&sched_asym_cpucapacity))
+			return 1;
+
+		/* Ensure the capacity of the CPUs fits the task. */
+		for_each_cpu(cpu, later_mask) {
+			if (!dl_task_fits_capacity(p, cpu))
+				cpumask_clear_cpu(cpu, later_mask);
+		}
+
+		if (!cpumask_empty(later_mask))
+			return 1;
 	} else {
 		int best_cpu = cpudl_maximum(cp);
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 9ebd0a9241ed..84e84ba0b00a 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1643,6 +1643,7 @@ static int
 select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
 {
 	struct task_struct *curr;
+	bool select_rq;
 	struct rq *rq;
 
 	if (sd_flag != SD_BALANCE_WAKE)
@@ -1662,10 +1663,19 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
 	 * other hand, if it has a shorter deadline, we
 	 * try to make it stay here, it might be important.
 	 */
-	if (unlikely(dl_task(curr)) &&
-	    (curr->nr_cpus_allowed < 2 ||
-	     !dl_entity_preempt(&p->dl, &curr->dl)) &&
-	    (p->nr_cpus_allowed > 1)) {
+	select_rq = unlikely(dl_task(curr)) &&
+		    (curr->nr_cpus_allowed < 2 ||
+		     !dl_entity_preempt(&p->dl, &curr->dl)) &&
+		    p->nr_cpus_allowed > 1;
+
+	/*
+	 * Take the capacity of the CPU into account to
+	 * ensure it fits the requirement of the task.
+	 */
+	if (static_branch_unlikely(&sched_asym_cpucapacity))
+		select_rq |= !dl_task_fits_capacity(p, cpu);
+
+	if (select_rq) {
 		int target = find_later_rq(p);
 
 		if (target != -1 &&
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 91b250f265c0..336887607b3d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -317,6 +317,21 @@ static inline bool __dl_overflow(struct dl_bw *dl_b, unsigned long cap,
 	       cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw;
 }
 
+/*
+ * Verify the fitness of task @p to run on @cpu taking into account the
+ * CPU original capacity and the runtime/deadline ratio of the task.
+ *
+ * The function will return true if the CPU original capacity of the
+ * @cpu scaled by SCHED_CAPACITY_SCALE >= runtime/deadline ratio of the
+ * task and false otherwise.
+ */
+static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu)
+{
+	unsigned long cap = arch_scale_cpu_capacity(cpu);
+
+	return cap_scale(p->dl.dl_deadline, cap) >= p->dl.dl_runtime;
+}
+
 extern void init_dl_bw(struct dl_bw *dl_b);
 extern int  sched_dl_global_validate(void);
 extern void sched_dl_do_global(void);
-- 
cgit v1.2.3


From 23e71d8ba42933bff12e453858fd68c073bc5258 Mon Sep 17 00:00:00 2001
From: Luca Abeni <luca.abeni@santannapisa.it>
Date: Wed, 20 May 2020 15:42:43 +0200
Subject: sched/deadline: Implement fallback mechanism for !fit case

When a task has a runtime that cannot be served within the scheduling
deadline by any of the idle CPU (later_mask) the task is doomed to miss
its deadline.

This can happen since the SCHED_DEADLINE admission control guarantees
only bounded tardiness and not the hard respect of all deadlines.
In this case try to select the idle CPU with the largest CPU capacity
to minimize tardiness.

Favor task_cpu(p) if it has max capacity of !fitting CPUs so that
find_later_rq() can potentially still return it (most likely cache-hot)
early.

Signed-off-by: Luca Abeni <luca.abeni@santannapisa.it>
Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lkml.kernel.org/r/20200520134243.19352-6-dietmar.eggemann@arm.com
---
 kernel/sched/cpudeadline.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 8630f2a40a3f..8cb06c8c7eb1 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -121,19 +121,31 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
 
 	if (later_mask &&
 	    cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
-		int cpu;
+		unsigned long cap, max_cap = 0;
+		int cpu, max_cpu = -1;
 
 		if (!static_branch_unlikely(&sched_asym_cpucapacity))
 			return 1;
 
 		/* Ensure the capacity of the CPUs fits the task. */
 		for_each_cpu(cpu, later_mask) {
-			if (!dl_task_fits_capacity(p, cpu))
+			if (!dl_task_fits_capacity(p, cpu)) {
 				cpumask_clear_cpu(cpu, later_mask);
+
+				cap = capacity_orig_of(cpu);
+
+				if (cap > max_cap ||
+				    (cpu == task_cpu(p) && cap == max_cap)) {
+					max_cap = cap;
+					max_cpu = cpu;
+				}
+			}
 		}
 
-		if (!cpumask_empty(later_mask))
-			return 1;
+		if (cpumask_empty(later_mask))
+			cpumask_set_cpu(max_cpu, later_mask);
+
+		return 1;
 	} else {
 		int best_cpu = cpudl_maximum(cp);
 
-- 
cgit v1.2.3


From c49694173da004b1b16082f82f28bd625415fbb2 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Tue, 2 Jun 2020 21:50:02 +0200
Subject: sched/deadline: Fix a typo in a comment

s/deadine/deadline/

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200602195002.677448-1-christophe.jaillet@wanadoo.fr
---
 kernel/sched/deadline.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 84e84ba0b00a..d4708e29008f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1137,7 +1137,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
  * cannot use the runtime, and so it replenishes the task. This rule
  * works fine for implicit deadline tasks (deadline == period), and the
  * CBS was designed for implicit deadline tasks. However, a task with
- * constrained deadline (deadine < period) might be awakened after the
+ * constrained deadline (deadline < period) might be awakened after the
  * deadline, but before the next period. In this case, replenishing the
  * task would allow it to run for runtime / deadline. As in this case
  * deadline < period, CBS enables a task to run for more than the
-- 
cgit v1.2.3


From 87e867b4269f29dac8190bca13912d08163a277f Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Fri, 12 Jun 2020 17:47:03 +0200
Subject: sched/pelt: Cleanup PELT divider

Factorize in a single place the calculation of the divider to be used to
to compute *_avg from *_sum value

Suggested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200612154703.23555-1-vincent.guittot@linaro.org
---
 kernel/sched/fair.c | 32 ++++++++++++++++++--------------
 kernel/sched/pelt.c |  2 +-
 kernel/sched/pelt.h |  5 +++++
 3 files changed, 24 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 295c9ffa850b..0424a0af5f87 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3094,7 +3094,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 
 #ifdef CONFIG_SMP
 	do {
-		u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
+		u32 divider = get_pelt_divider(&se->avg);
 
 		se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
 	} while (0);
@@ -3440,16 +3440,18 @@ static inline void
 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
 	long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
-	/*
-	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
-	 * See ___update_load_avg() for details.
-	 */
-	u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+	u32 divider;
 
 	/* Nothing to update */
 	if (!delta)
 		return;
 
+	/*
+	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+	 * See ___update_load_avg() for details.
+	 */
+	divider = get_pelt_divider(&cfs_rq->avg);
+
 	/* Set new sched_entity's utilization */
 	se->avg.util_avg = gcfs_rq->avg.util_avg;
 	se->avg.util_sum = se->avg.util_avg * divider;
@@ -3463,16 +3465,18 @@ static inline void
 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
 	long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
-	/*
-	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
-	 * See ___update_load_avg() for details.
-	 */
-	u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+	u32 divider;
 
 	/* Nothing to update */
 	if (!delta)
 		return;
 
+	/*
+	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+	 * See ___update_load_avg() for details.
+	 */
+	divider = get_pelt_divider(&cfs_rq->avg);
+
 	/* Set new sched_entity's runnable */
 	se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
 	se->avg.runnable_sum = se->avg.runnable_avg * divider;
@@ -3500,7 +3504,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
 	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
 	 * See ___update_load_avg() for details.
 	 */
-	divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+	divider = get_pelt_divider(&cfs_rq->avg);
 
 	if (runnable_sum >= 0) {
 		/*
@@ -3646,7 +3650,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 
 	if (cfs_rq->removed.nr) {
 		unsigned long r;
-		u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+		u32 divider = get_pelt_divider(&cfs_rq->avg);
 
 		raw_spin_lock(&cfs_rq->removed.lock);
 		swap(cfs_rq->removed.util_avg, removed_util);
@@ -3701,7 +3705,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
 	 * See ___update_load_avg() for details.
 	 */
-	u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+	u32 divider = get_pelt_divider(&cfs_rq->avg);
 
 	/*
 	 * When we attach the @se to the @cfs_rq, we must align the decay
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index dea5567e4f72..11bea3b08115 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -262,7 +262,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
 static __always_inline void
 ___update_load_avg(struct sched_avg *sa, unsigned long load)
 {
-	u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+	u32 divider = get_pelt_divider(sa);
 
 	/*
 	 * Step 2: update *_avg.
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index eb034d9f024d..795e43e02afc 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -37,6 +37,11 @@ update_irq_load_avg(struct rq *rq, u64 running)
 }
 #endif
 
+static inline u32 get_pelt_divider(struct sched_avg *avg)
+{
+	return LOAD_AVG_MAX - 1024 + avg->period_contrib;
+}
+
 /*
  * When a task is dequeued, its estimated utilization should not be update if
  * its util_avg has not been updated at least once.
-- 
cgit v1.2.3


From 7318d4cc14c8c8a5dde2b0b72ea50fd2545f0b7a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 21 Apr 2020 12:09:13 +0200
Subject: sched: Provide sched_set_fifo()

SCHED_FIFO (or any static priority scheduler) is a broken scheduler
model; it is fundamentally incapable of resource management, the one
thing an OS is actually supposed to do.

It is impossible to compose static priority workloads. One cannot take
two well designed and functional static priority workloads and mash
them together and still expect them to work.

Therefore it doesn't make sense to expose the priority field; the
kernel is fundamentally incapable of setting a sensible value, it
needs systems knowledge that it doesn't have.

Take away sched_setschedule() / sched_setattr() from modules and
replace them with:

  - sched_set_fifo(p); create a FIFO task (at prio 50)
  - sched_set_fifo_low(p); create a task higher than NORMAL,
	which ends up being a FIFO task at prio 1.
  - sched_set_normal(p, nice); (re)set the task to normal

This stops the proliferation of randomly chosen, and irrelevant, FIFO
priorities that dont't really mean anything anyway.

The system administrator/integrator, whoever has insight into the
actual system design and requirements (userspace) can set-up
appropriate priorities if and when needed.

Cc: airlied@redhat.com
Cc: alexander.deucher@amd.com
Cc: awalls@md.metrocast.net
Cc: axboe@kernel.dk
Cc: broonie@kernel.org
Cc: daniel.lezcano@linaro.org
Cc: gregkh@linuxfoundation.org
Cc: hannes@cmpxchg.org
Cc: herbert@gondor.apana.org.au
Cc: hverkuil@xs4all.nl
Cc: john.stultz@linaro.org
Cc: nico@fluxnic.net
Cc: paulmck@kernel.org
Cc: rafael.j.wysocki@intel.com
Cc: rmk+kernel@arm.linux.org.uk
Cc: sudeep.holla@arm.com
Cc: tglx@linutronix.de
Cc: ulf.hansson@linaro.org
Cc: wim@linux-watchdog.org
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/sched.h |  3 +++
 kernel/sched/core.c   | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b62e6aaf28f0..b792b8f0f4cf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1653,6 +1653,9 @@ extern int idle_cpu(int cpu);
 extern int available_idle_cpu(int cpu);
 extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
 extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
+extern int sched_set_fifo(struct task_struct *p);
+extern int sched_set_fifo_low(struct task_struct *p);
+extern int sched_set_normal(struct task_struct *p, int nice);
 extern int sched_setattr(struct task_struct *, const struct sched_attr *);
 extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *);
 extern struct task_struct *idle_task(int cpu);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0208b71bef80..40d3939b0520 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5124,6 +5124,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
  * @policy: new policy.
  * @param: structure containing the new RT priority.
  *
+ * Use sched_set_fifo(), read its comment.
+ *
  * Return: 0 on success. An error code otherwise.
  *
  * NOTE that the task may be already dead.
@@ -5166,6 +5168,51 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
 }
 EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
 
+/*
+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
+ * incapable of resource management, which is the one thing an OS really should
+ * be doing.
+ *
+ * This is of course the reason it is limited to privileged users only.
+ *
+ * Worse still; it is fundamentally impossible to compose static priority
+ * workloads. You cannot take two correctly working static prio workloads
+ * and smash them together and still expect them to work.
+ *
+ * For this reason 'all' FIFO tasks the kernel creates are basically at:
+ *
+ *   MAX_RT_PRIO / 2
+ *
+ * The administrator _MUST_ configure the system, the kernel simply doesn't
+ * know enough information to make a sensible choice.
+ */
+int sched_set_fifo(struct task_struct *p)
+{
+	struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
+	return sched_setscheduler_nocheck(p, SCHED_FIFO, &sp);
+}
+EXPORT_SYMBOL_GPL(sched_set_fifo);
+
+/*
+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
+ */
+int sched_set_fifo_low(struct task_struct *p)
+{
+	struct sched_param sp = { .sched_priority = 1 };
+	return sched_setscheduler_nocheck(p, SCHED_FIFO, &sp);
+}
+EXPORT_SYMBOL_GPL(sched_set_fifo_low);
+
+int sched_set_normal(struct task_struct *p, int nice)
+{
+	struct sched_attr attr = {
+		.sched_policy = SCHED_NORMAL,
+		.sched_nice = nice,
+	};
+	return sched_setattr_nocheck(p, &attr);
+}
+EXPORT_SYMBOL_GPL(sched_set_normal);
+
 static int
 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 {
-- 
cgit v1.2.3


From 7a40798c714ff462863352d490b382515daba49e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 21 Apr 2020 12:09:13 +0200
Subject: sched,irq: Convert to sched_set_fifo()

Because SCHED_FIFO is a broken scheduler model (see previous patches)
take away the priority field, the kernel can't possibly make an
informed decision.

Effectively no change.

Cc: tglx@linutronix.de
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/irq/manage.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 761911168438..06b62746e1df 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1271,9 +1271,6 @@ static int
 setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
 {
 	struct task_struct *t;
-	struct sched_param param = {
-		.sched_priority = MAX_USER_RT_PRIO/2,
-	};
 
 	if (!secondary) {
 		t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
@@ -1281,13 +1278,12 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
 	} else {
 		t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq,
 				   new->name);
-		param.sched_priority -= 1;
 	}
 
 	if (IS_ERR(t))
 		return PTR_ERR(t);
 
-	sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
+	sched_set_fifo(t);
 
 	/*
 	 * We keep the reference to the task struct even if
-- 
cgit v1.2.3


From 93db9129fa4beb78e2227554d237e8db04ca514c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 21 Apr 2020 12:09:13 +0200
Subject: sched,locktorture: Convert to sched_set_fifo()

Because SCHED_FIFO is a broken scheduler model (see previous patches)
take away the priority field, the kernel can't possibly make an
informed decision.

Effectively changes prio from 99 to 50.

Cc: paulmck@kernel.org
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/locking/locktorture.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 5efbfc68ce99..fca41a7ded0c 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -436,8 +436,6 @@ static int torture_rtmutex_lock(void) __acquires(torture_rtmutex)
 
 static void torture_rtmutex_boost(struct torture_random_state *trsp)
 {
-	int policy;
-	struct sched_param param;
 	const unsigned int factor = 50000; /* yes, quite arbitrary */
 
 	if (!rt_task(current)) {
@@ -448,8 +446,7 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp)
 		 */
 		if (trsp && !(torture_random(trsp) %
 			      (cxt.nrealwriters_stress * factor))) {
-			policy = SCHED_FIFO;
-			param.sched_priority = MAX_RT_PRIO - 1;
+			sched_set_fifo(current);
 		} else /* common case, do nothing */
 			return;
 	} else {
@@ -462,13 +459,10 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp)
 		 */
 		if (!trsp || !(torture_random(trsp) %
 			       (cxt.nrealwriters_stress * factor * 2))) {
-			policy = SCHED_NORMAL;
-			param.sched_priority = 0;
+			sched_set_normal(current, 0);
 		} else /* common case, do nothing */
 			return;
 	}
-
-	sched_setscheduler_nocheck(current, policy, &param);
 }
 
 static void torture_rtmutex_delay(struct torture_random_state *trsp)
-- 
cgit v1.2.3


From b1433395c4cc078b4382429e6d7dd1721714094f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 21 Apr 2020 12:09:13 +0200
Subject: sched,rcuperf: Convert to sched_set_fifo_low()

Because SCHED_FIFO is a broken scheduler model (see previous patches)
take away the priority field, the kernel can't possibly make an
informed decision.

Effectively no change.

Cc: paulmck@kernel.org
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcuperf.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 16dd1e6b7c09..64249a045232 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -354,7 +354,6 @@ rcu_perf_writer(void *arg)
 	int i_max;
 	long me = (long)arg;
 	struct rcu_head *rhp = NULL;
-	struct sched_param sp;
 	bool started = false, done = false, alldone = false;
 	u64 t;
 	u64 *wdp;
@@ -363,8 +362,7 @@ rcu_perf_writer(void *arg)
 	VERBOSE_PERFOUT_STRING("rcu_perf_writer task started");
 	WARN_ON(!wdpp);
 	set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
-	sp.sched_priority = 1;
-	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+	sched_set_fifo_low(current);
 
 	if (holdoff)
 		schedule_timeout_uninterruptible(holdoff * HZ);
@@ -420,9 +418,7 @@ retry:
 			started = true;
 		if (!done && i >= MIN_MEAS) {
 			done = true;
-			sp.sched_priority = 0;
-			sched_setscheduler_nocheck(current,
-						   SCHED_NORMAL, &sp);
+			sched_set_normal(current, 0);
 			pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n",
 				 perf_type, PERF_FLAG, me, MIN_MEAS);
 			if (atomic_inc_return(&n_rcu_perf_writer_finished) >=
-- 
cgit v1.2.3


From ce1c8afd3f3119ddaddcdff27579f5723f55c75e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 21 Apr 2020 12:09:13 +0200
Subject: sched,rcutorture: Convert to sched_set_fifo_low()

Because SCHED_FIFO is a broken scheduler model (see previous patches)
take away the priority field, the kernel can't possibly make an
informed decision.

Effectively no change.

Cc: paulmck@kernel.org
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index efb792e13fca..bbc3c8a9e7b9 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -889,13 +889,11 @@ static int rcu_torture_boost(void *arg)
 	unsigned long endtime;
 	unsigned long oldstarttime;
 	struct rcu_boost_inflight rbi = { .inflight = 0 };
-	struct sched_param sp;
 
 	VERBOSE_TOROUT_STRING("rcu_torture_boost started");
 
 	/* Set real-time priority. */
-	sp.sched_priority = 1;
-	if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
+	if (sched_set_fifo_low(current) < 0) {
 		VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!");
 		n_rcu_torture_boost_rterror++;
 	}
-- 
cgit v1.2.3


From 2cca5426b9c108998bc03230cd6ae440f3e205ed Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 21 Apr 2020 12:09:13 +0200
Subject: sched,psi: Convert to sched_set_fifo_low()

Because SCHED_FIFO is a broken scheduler model (see previous patches)
take away the priority field, the kernel can't possibly make an
informed decision.

Effectively no change.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
---
 kernel/sched/psi.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index e53b711bd643..967732c0766c 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -616,11 +616,8 @@ out:
 static int psi_poll_worker(void *data)
 {
 	struct psi_group *group = (struct psi_group *)data;
-	struct sched_param param = {
-		.sched_priority = 1,
-	};
 
-	sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
+	sched_set_fifo_low(current);
 
 	while (true) {
 		wait_event_interruptible(group->poll_wait,
-- 
cgit v1.2.3


From 616d91b68cd56bcb1954b6a5af7d542401fde772 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 21 Apr 2020 12:09:13 +0200
Subject: sched: Remove sched_setscheduler*() EXPORTs

Now that nothing (modular) still uses sched_setscheduler(), remove the
exports.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 40d3939b0520..f882d3d74dad 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5135,13 +5135,11 @@ int sched_setscheduler(struct task_struct *p, int policy,
 {
 	return _sched_setscheduler(p, policy, param, true);
 }
-EXPORT_SYMBOL_GPL(sched_setscheduler);
 
 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
 {
 	return __sched_setscheduler(p, attr, true, true);
 }
-EXPORT_SYMBOL_GPL(sched_setattr);
 
 int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
 {
@@ -5166,7 +5164,6 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
 {
 	return _sched_setscheduler(p, policy, param, false);
 }
-EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
 
 /*
  * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
-- 
cgit v1.2.3


From 8b700983de82f79e05b2c1136d6513ea4c9b22c4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 22 Apr 2020 13:10:04 +0200
Subject: sched: Remove sched_set_*() return value

Ingo suggested that since the new sched_set_*() functions are
implemented using the 'nocheck' variants, they really shouldn't ever
fail, so remove the return value.

Cc: axboe@kernel.dk
Cc: daniel.lezcano@linaro.org
Cc: sudeep.holla@arm.com
Cc: airlied@redhat.com
Cc: broonie@kernel.org
Cc: paulmck@kernel.org
Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/block/drbd/drbd_receiver.c    |  4 +---
 drivers/firmware/psci/psci_checker.c  |  3 +--
 drivers/gpu/drm/msm/msm_drv.c         |  5 +----
 drivers/platform/chrome/cros_ec_spi.c |  7 +++----
 include/linux/sched.h                 |  6 +++---
 kernel/rcu/rcutorture.c               |  5 +----
 kernel/sched/core.c                   | 12 ++++++------
 7 files changed, 16 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 140fd98274b1..280615efef74 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -6020,9 +6020,7 @@ int drbd_ack_receiver(struct drbd_thread *thi)
 	int expect   = header_size;
 	bool ping_timeout_active = false;
 
-	rv = sched_set_fifo_low(current);
-	if (rv < 0)
-		drbd_err(connection, "drbd_ack_receiver: ERROR set priority, ret=%d\n", rv);
+	sched_set_fifo_low(current);
 
 	while (get_t_state(thi) == RUNNING) {
 		drbd_thread_current_set_cpu(thi);
diff --git a/drivers/firmware/psci/psci_checker.c b/drivers/firmware/psci/psci_checker.c
index a5279a430274..6fff482847e7 100644
--- a/drivers/firmware/psci/psci_checker.c
+++ b/drivers/firmware/psci/psci_checker.c
@@ -281,8 +281,7 @@ static int suspend_test_thread(void *arg)
 	wait_for_completion(&suspend_threads_started);
 
 	/* Set maximum priority to preempt all other threads on this CPU. */
-	if (sched_set_fifo(current))
-		pr_warn("Failed to set suspend thread scheduler on CPU %d\n", cpu);
+	sched_set_fifo(current);
 
 	dev = this_cpu_read(cpuidle_devices);
 	drv = cpuidle_get_cpu_driver(dev);
diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index 89a8b9c7e044..556cca38487c 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -509,10 +509,7 @@ static int msm_drm_init(struct device *dev, struct drm_driver *drv)
 			goto err_msm_uninit;
 		}
 
-		ret = sched_set_fifo(priv->event_thread[i].thread);
-		if (ret)
-			dev_warn(dev, "event_thread set priority failed:%d\n",
-				 ret);
+		sched_set_fifo(priv->event_thread[i].thread);
 	}
 
 	ret = drm_vblank_init(ddev, priv->num_crtcs);
diff --git a/drivers/platform/chrome/cros_ec_spi.c b/drivers/platform/chrome/cros_ec_spi.c
index c20a43a97040..d09260382550 100644
--- a/drivers/platform/chrome/cros_ec_spi.c
+++ b/drivers/platform/chrome/cros_ec_spi.c
@@ -725,10 +725,9 @@ static int cros_ec_spi_devm_high_pri_alloc(struct device *dev,
 	if (err)
 		return err;
 
-	err = sched_set_fifo(ec_spi->high_pri_worker->task);
-	if (err)
-		dev_err(dev, "Can't set cros_ec high pri priority: %d\n", err);
-	return err;
+	sched_set_fifo(ec_spi->high_pri_worker->task);
+
+	return 0;
 }
 
 static int cros_ec_spi_probe(struct spi_device *spi)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b792b8f0f4cf..ae7664492af2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1653,9 +1653,9 @@ extern int idle_cpu(int cpu);
 extern int available_idle_cpu(int cpu);
 extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
 extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
-extern int sched_set_fifo(struct task_struct *p);
-extern int sched_set_fifo_low(struct task_struct *p);
-extern int sched_set_normal(struct task_struct *p, int nice);
+extern void sched_set_fifo(struct task_struct *p);
+extern void sched_set_fifo_low(struct task_struct *p);
+extern void sched_set_normal(struct task_struct *p, int nice);
 extern int sched_setattr(struct task_struct *, const struct sched_attr *);
 extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *);
 extern struct task_struct *idle_task(int cpu);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index bbc3c8a9e7b9..b4c1146de414 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -893,10 +893,7 @@ static int rcu_torture_boost(void *arg)
 	VERBOSE_TOROUT_STRING("rcu_torture_boost started");
 
 	/* Set real-time priority. */
-	if (sched_set_fifo_low(current) < 0) {
-		VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!");
-		n_rcu_torture_boost_rterror++;
-	}
+	sched_set_fifo_low(current);
 
 	init_rcu_head_on_stack(&rbi.rcu);
 	/* Each pass through the following loop does one boost-test cycle. */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f882d3d74dad..41d3778ea80e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5183,30 +5183,30 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
  * The administrator _MUST_ configure the system, the kernel simply doesn't
  * know enough information to make a sensible choice.
  */
-int sched_set_fifo(struct task_struct *p)
+void sched_set_fifo(struct task_struct *p)
 {
 	struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
-	return sched_setscheduler_nocheck(p, SCHED_FIFO, &sp);
+	WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
 }
 EXPORT_SYMBOL_GPL(sched_set_fifo);
 
 /*
  * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
  */
-int sched_set_fifo_low(struct task_struct *p)
+void sched_set_fifo_low(struct task_struct *p)
 {
 	struct sched_param sp = { .sched_priority = 1 };
-	return sched_setscheduler_nocheck(p, SCHED_FIFO, &sp);
+	WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
 }
 EXPORT_SYMBOL_GPL(sched_set_fifo_low);
 
-int sched_set_normal(struct task_struct *p, int nice)
+void sched_set_normal(struct task_struct *p, int nice)
 {
 	struct sched_attr attr = {
 		.sched_policy = SCHED_NORMAL,
 		.sched_nice = nice,
 	};
-	return sched_setattr_nocheck(p, &attr);
+	WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
 }
 EXPORT_SYMBOL_GPL(sched_set_normal);
 
-- 
cgit v1.2.3


From 60997c3d45d9a67daf01c56d805ae4fec37e0bd8 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Wed, 3 Jun 2020 21:48:55 +0200
Subject: close_range: add CLOSE_RANGE_UNSHARE

One of the use-cases of close_range() is to drop file descriptors just before
execve(). This would usually be expressed in the sequence:

unshare(CLONE_FILES);
close_range(3, ~0U);

as pointed out by Linus it might be desirable to have this be a part of
close_range() itself under a new flag CLOSE_RANGE_UNSHARE.

This expands {dup,unshare)_fd() to take a max_fds argument that indicates the
maximum number of file descriptors to copy from the old struct files. When the
user requests that all file descriptors are supposed to be closed via
close_range(min, max) then we can cap via unshare_fd(min) and hence don't need
to do any of the heavy fput() work for everything above min.

The patch makes it so that if CLOSE_RANGE_UNSHARE is requested and we do in
fact currently share our file descriptor table we create a new private copy.
We then close all fds in the requested range and finally after we're done we
install the new fd table.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/file.c                        | 65 +++++++++++++++++++++++++++++++++++-----
 fs/open.c                        |  5 +---
 include/linux/fdtable.h          |  8 +++--
 include/uapi/linux/close_range.h |  9 ++++++
 kernel/fork.c                    | 11 +++----
 5 files changed, 79 insertions(+), 19 deletions(-)
 create mode 100644 include/uapi/linux/close_range.h

(limited to 'kernel')

diff --git a/fs/file.c b/fs/file.c
index 1b8ff05e8311..340bc9569f9d 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -19,6 +19,7 @@
 #include <linux/bitops.h>
 #include <linux/spinlock.h>
 #include <linux/rcupdate.h>
+#include <linux/close_range.h>
 
 unsigned int sysctl_nr_open __read_mostly = 1024*1024;
 unsigned int sysctl_nr_open_min = BITS_PER_LONG;
@@ -265,12 +266,22 @@ static unsigned int count_open_files(struct fdtable *fdt)
 	return i;
 }
 
+static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
+{
+	unsigned int count;
+
+	count = count_open_files(fdt);
+	if (max_fds < NR_OPEN_DEFAULT)
+		max_fds = NR_OPEN_DEFAULT;
+	return min(count, max_fds);
+}
+
 /*
  * Allocate a new files structure and copy contents from the
  * passed in files structure.
  * errorp will be valid only when the returned files_struct is NULL.
  */
-struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
+struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp)
 {
 	struct files_struct *newf;
 	struct file **old_fds, **new_fds;
@@ -297,7 +308,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 
 	spin_lock(&oldf->file_lock);
 	old_fdt = files_fdtable(oldf);
-	open_files = count_open_files(old_fdt);
+	open_files = sane_fdtable_size(old_fdt, max_fds);
 
 	/*
 	 * Check whether we need to allocate a larger fd array and fd set.
@@ -328,7 +339,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 		 */
 		spin_lock(&oldf->file_lock);
 		old_fdt = files_fdtable(oldf);
-		open_files = count_open_files(old_fdt);
+		open_files = sane_fdtable_size(old_fdt, max_fds);
 	}
 
 	copy_fd_bitmaps(new_fdt, old_fdt, open_files);
@@ -665,32 +676,72 @@ EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
  * This closes a range of file descriptors. All file descriptors
  * from @fd up to and including @max_fd are closed.
  */
-int __close_range(struct files_struct *files, unsigned fd, unsigned max_fd)
+int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
 {
 	unsigned int cur_max;
+	struct task_struct *me = current;
+	struct files_struct *cur_fds = me->files, *fds = NULL;
+
+	if (flags & ~CLOSE_RANGE_UNSHARE)
+		return -EINVAL;
 
 	if (fd > max_fd)
 		return -EINVAL;
 
 	rcu_read_lock();
-	cur_max = files_fdtable(files)->max_fds;
+	cur_max = files_fdtable(cur_fds)->max_fds;
 	rcu_read_unlock();
 
 	/* cap to last valid index into fdtable */
 	cur_max--;
 
+	if (flags & CLOSE_RANGE_UNSHARE) {
+		int ret;
+		unsigned int max_unshare_fds = NR_OPEN_MAX;
+
+		/*
+		 * If the requested range is greater than the current maximum,
+		 * we're closing everything so only copy all file descriptors
+		 * beneath the lowest file descriptor.
+		 */
+		if (max_fd >= cur_max)
+			max_unshare_fds = fd;
+
+		ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds);
+		if (ret)
+			return ret;
+
+		/*
+		 * We used to share our file descriptor table, and have now
+		 * created a private one, make sure we're using it below.
+		 */
+		if (fds)
+			swap(cur_fds, fds);
+	}
+
 	max_fd = min(max_fd, cur_max);
 	while (fd <= max_fd) {
 		struct file *file;
 
-		file = pick_file(files, fd++);
+		file = pick_file(cur_fds, fd++);
 		if (!file)
 			continue;
 
-		filp_close(file, files);
+		filp_close(file, cur_fds);
 		cond_resched();
 	}
 
+	if (fds) {
+		/*
+		 * We're done closing the files we were supposed to. Time to install
+		 * the new file descriptor table and drop the old one.
+		 */
+		task_lock(me);
+		me->files = cur_fds;
+		task_unlock(me);
+		put_files_struct(fds);
+	}
+
 	return 0;
 }
 
diff --git a/fs/open.c b/fs/open.c
index 073ea3c45347..5e62f18adc5b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1324,10 +1324,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
 SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
 		unsigned int, flags)
 {
-	if (flags)
-		return -EINVAL;
-
-	return __close_range(current->files, fd, max_fd);
+	return __close_range(fd, max_fd, flags);
 }
 
 /*
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index fcd07181a365..a32bf47c593e 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -22,6 +22,7 @@
  * as this is the granularity returned by copy_fdset().
  */
 #define NR_OPEN_DEFAULT BITS_PER_LONG
+#define NR_OPEN_MAX ~0U
 
 struct fdtable {
 	unsigned int max_fds;
@@ -109,7 +110,7 @@ struct files_struct *get_files_struct(struct task_struct *);
 void put_files_struct(struct files_struct *fs);
 void reset_files_struct(struct files_struct *);
 int unshare_files(struct files_struct **);
-struct files_struct *dup_fd(struct files_struct *, int *) __latent_entropy;
+struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy;
 void do_close_on_exec(struct files_struct *);
 int iterate_fd(struct files_struct *, unsigned,
 		int (*)(const void *, struct file *, unsigned),
@@ -121,9 +122,10 @@ extern void __fd_install(struct files_struct *files,
 		      unsigned int fd, struct file *file);
 extern int __close_fd(struct files_struct *files,
 		      unsigned int fd);
-extern int __close_range(struct files_struct *files, unsigned int fd,
-			 unsigned int max_fd);
+extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags);
 extern int __close_fd_get_file(unsigned int fd, struct file **res);
+extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
+		      struct files_struct **new_fdp);
 
 extern struct kmem_cache *files_cachep;
 
diff --git a/include/uapi/linux/close_range.h b/include/uapi/linux/close_range.h
new file mode 100644
index 000000000000..6928a9fdee3c
--- /dev/null
+++ b/include/uapi/linux/close_range.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_CLOSE_RANGE_H
+#define _UAPI_LINUX_CLOSE_RANGE_H
+
+/* Unshare the file descriptor table before closing file descriptors. */
+#define CLOSE_RANGE_UNSHARE	(1U << 1)
+
+#endif /* _UAPI_LINUX_CLOSE_RANGE_H */
+
diff --git a/kernel/fork.c b/kernel/fork.c
index 142b23645d82..8948121c8454 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1474,7 +1474,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
 		goto out;
 	}
 
-	newf = dup_fd(oldf, &error);
+	newf = dup_fd(oldf, NR_OPEN_MAX, &error);
 	if (!newf)
 		goto out;
 
@@ -2907,14 +2907,15 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
 /*
  * Unshare file descriptor table if it is being shared
  */
-static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
+int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
+	       struct files_struct **new_fdp)
 {
 	struct files_struct *fd = current->files;
 	int error = 0;
 
 	if ((unshare_flags & CLONE_FILES) &&
 	    (fd && atomic_read(&fd->count) > 1)) {
-		*new_fdp = dup_fd(fd, &error);
+		*new_fdp = dup_fd(fd, max_fds, &error);
 		if (!*new_fdp)
 			return error;
 	}
@@ -2974,7 +2975,7 @@ int ksys_unshare(unsigned long unshare_flags)
 	err = unshare_fs(unshare_flags, &new_fs);
 	if (err)
 		goto bad_unshare_out;
-	err = unshare_fd(unshare_flags, &new_fd);
+	err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
 	if (err)
 		goto bad_unshare_cleanup_fs;
 	err = unshare_userns(unshare_flags, &new_cred);
@@ -3063,7 +3064,7 @@ int unshare_files(struct files_struct **displaced)
 	struct files_struct *copy = NULL;
 	int error;
 
-	error = unshare_fd(CLONE_FILES, &copy);
+	error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
 	if (error || !copy) {
 		*displaced = NULL;
 		return error;
-- 
cgit v1.2.3


From bbccc11bc8848926065915e6193fd4c6e33c85ef Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Sun, 24 May 2020 15:52:38 -0500
Subject: audit: Use struct_size() helper in alloc_chunk

One of the more common cases of allocation size calculations is finding
the size of a structure that has a zero-sized array at the end, along
with memory for some number of elements for that array. For example:

struct audit_chunk {
	...
        struct node {
                struct list_head list;
                struct audit_tree *owner;
                unsigned index;         /* index; upper bit indicates 'will prune' */
        } owners[];
};

Make use of the struct_size() helper instead of an open-coded version
in order to avoid any potential type mistakes.

So, replace the following form:

offsetof(struct audit_chunk, owners) + count * sizeof(struct node);

with:

struct_size(chunk, owners, count)

This code was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit_tree.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e49c912f862d..1b7a2f041793 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -188,11 +188,9 @@ static struct fsnotify_mark *alloc_mark(void)
 static struct audit_chunk *alloc_chunk(int count)
 {
 	struct audit_chunk *chunk;
-	size_t size;
 	int i;
 
-	size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
-	chunk = kzalloc(size, GFP_KERNEL);
+	chunk = kzalloc(struct_size(chunk, owners, count), GFP_KERNEL);
 	if (!chunk)
 		return NULL;
 
-- 
cgit v1.2.3


From 6c6935419e2fd8ef1fcde71c4e50b9095e520900 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 18 Jun 2020 16:46:31 -0700
Subject: bpf: Avoid verifier failure for 32bit pointer arithmetic

When do experiments with llvm (disabling instcombine and
simplifyCFG), I hit the following error with test_seg6_loop.o.

  ; R1=pkt(id=0,off=0,r=48,imm=0), R7=pkt(id=0,off=40,r=48,imm=0)
  w2 = w7
  ; R2_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff))
  w2 -= w1
  R2 32-bit pointer arithmetic prohibited

The corresponding source code is:
  uint32_t srh_off
  // srh and skb->data are all packet pointers
  srh_off = (char *)srh - (char *)(long)skb->data;

The verifier does not support 32-bit pointer/scalar arithmetic.

Without my llvm change, the code looks like

  ; R3=pkt(id=0,off=40,r=48,imm=0), R8=pkt(id=0,off=0,r=48,imm=0)
  w3 -= w8
  ; R3_w=inv(id=0)

This is explicitly allowed in verifier if both registers are
pointers and the opcode is BPF_SUB.

To fix this problem, I changed the verifier to allow
32-bit pointer/scaler BPF_SUB operations.

At the source level, the issue could be workarounded with
inline asm or changing "uint32_t srh_off" to "uint64_t srh_off".
But I feel that verifier change might be the right thing to do.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20200618234631.3321118-1-yhs@fb.com
---
 kernel/bpf/verifier.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 34cde841ab68..a1857c4ffaaf 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5031,6 +5031,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 
 	if (BPF_CLASS(insn->code) != BPF_ALU64) {
 		/* 32-bit ALU ops on pointers produce (meaningless) scalars */
+		if (opcode == BPF_SUB && env->allow_ptr_leaks) {
+			__mark_reg_unknown(env, dst_reg);
+			return 0;
+		}
+
 		verbose(env,
 			"R%d 32-bit pointer arithmetic prohibited\n",
 			dst);
-- 
cgit v1.2.3


From 3af8588c77186bf08e55e7281da83d88373481d7 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Mon, 8 Jun 2020 17:28:50 +0200
Subject: fork: fold legacy_clone_args_valid() into _do_fork()

This separate helper only existed to guarantee the mutual exclusivity of
CLONE_PIDFD and CLONE_PARENT_SETTID for legacy clone since CLONE_PIDFD
abuses the parent_tid field to return the pidfd. But we can actually handle
this uniformely thus removing the helper. For legacy clone we can detect
that CLONE_PIDFD is specified in conjunction with CLONE_PARENT_SETTID
because they will share the same memory which is invalid and for clone3()
setting the separate pidfd and parent_tid fields to the same memory is
bogus as well. So fold that helper directly into _do_fork() by detecting
this case.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: linux-m68k@lists.linux-m68k.org
Cc: x86@kernel.org
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 arch/m68k/kernel/process.c |  3 ---
 arch/x86/kernel/sys_ia32.c |  3 ---
 include/linux/sched/task.h |  1 -
 kernel/fork.c              | 30 ++++++++++++++----------------
 4 files changed, 14 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index 90ae376b7ab1..0608439ba452 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -125,9 +125,6 @@ asmlinkage int m68k_clone(struct pt_regs *regs)
 		.tls		= regs->d5,
 	};
 
-	if (!legacy_clone_args_valid(&args))
-		return -EINVAL;
-
 	return _do_fork(&args);
 }
 
diff --git a/arch/x86/kernel/sys_ia32.c b/arch/x86/kernel/sys_ia32.c
index f8d65c99feb8..720cde885042 100644
--- a/arch/x86/kernel/sys_ia32.c
+++ b/arch/x86/kernel/sys_ia32.c
@@ -251,9 +251,6 @@ COMPAT_SYSCALL_DEFINE5(ia32_clone, unsigned long, clone_flags,
 		.tls		= tls_val,
 	};
 
-	if (!legacy_clone_args_valid(&args))
-		return -EINVAL;
-
 	return _do_fork(&args);
 }
 #endif /* CONFIG_IA32_EMULATION */
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 38359071236a..ddce0ea515d1 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -96,7 +96,6 @@ extern void exit_files(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
 
 extern long _do_fork(struct kernel_clone_args *kargs);
-extern bool legacy_clone_args_valid(const struct kernel_clone_args *kargs);
 extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
 struct mm_struct *copy_init_mm(void);
diff --git a/kernel/fork.c b/kernel/fork.c
index 142b23645d82..9875aeb2ba41 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2422,6 +2422,20 @@ long _do_fork(struct kernel_clone_args *args)
 	int trace = 0;
 	long nr;
 
+	/*
+	 * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
+	 * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
+	 * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
+	 * field in struct clone_args and it still doesn't make sense to have
+	 * them both point at the same memory location. Performing this check
+	 * here has the advantage that we don't need to have a separate helper
+	 * to check for legacy clone().
+	 */
+	if ((args->flags & CLONE_PIDFD) &&
+	    (args->flags & CLONE_PARENT_SETTID) &&
+	    (args->pidfd == args->parent_tid))
+		return -EINVAL;
+
 	/*
 	 * Determine whether and which event to report to ptracer.  When
 	 * called from kernel_thread or CLONE_UNTRACED is explicitly
@@ -2479,16 +2493,6 @@ long _do_fork(struct kernel_clone_args *args)
 	return nr;
 }
 
-bool legacy_clone_args_valid(const struct kernel_clone_args *kargs)
-{
-	/* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
-	if ((kargs->flags & CLONE_PIDFD) &&
-	    (kargs->flags & CLONE_PARENT_SETTID))
-		return false;
-
-	return true;
-}
-
 #ifndef CONFIG_HAVE_COPY_THREAD_TLS
 /* For compatibility with architectures that call do_fork directly rather than
  * using the syscall entry points below. */
@@ -2508,9 +2512,6 @@ long do_fork(unsigned long clone_flags,
 		.stack_size	= stack_size,
 	};
 
-	if (!legacy_clone_args_valid(&args))
-		return -EINVAL;
-
 	return _do_fork(&args);
 }
 #endif
@@ -2593,9 +2594,6 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
 		.tls		= tls,
 	};
 
-	if (!legacy_clone_args_valid(&args))
-		return -EINVAL;
-
 	return _do_fork(&args);
 }
 #endif
-- 
cgit v1.2.3


From a2d0d62f4d9e3546134ba08e102ca7decd4ed836 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 19 Jun 2020 14:11:41 -0700
Subject: bpf: Switch btf_parse_vmlinux to btf_find_by_name_kind

btf_parse_vmlinux() implements manual search for struct bpf_ctx_convert
since at the time of implementing btf_find_by_name_kind() was not
available.

Later btf_find_by_name_kind() was introduced in 27ae7997a661 ("bpf:
Introduce BPF_PROG_TYPE_STRUCT_OPS"). It provides similar search
functionality and can be leveraged in btf_parse_vmlinux(). Do it.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/6e12d5c3e8a3d552925913ef73a695dd1bb27800.1592600985.git.rdna@fb.com
---
 kernel/bpf/btf.c | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 58c9af1d4808..3eb804618a53 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3591,7 +3591,7 @@ struct btf *btf_parse_vmlinux(void)
 	struct btf_verifier_env *env = NULL;
 	struct bpf_verifier_log *log;
 	struct btf *btf = NULL;
-	int err, i;
+	int err, btf_id;
 
 	env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
 	if (!env)
@@ -3625,24 +3625,13 @@ struct btf *btf_parse_vmlinux(void)
 		goto errout;
 
 	/* find struct bpf_ctx_convert for type checking later */
-	for (i = 1; i <= btf->nr_types; i++) {
-		const struct btf_type *t;
-		const char *tname;
-
-		t = btf_type_by_id(btf, i);
-		if (!__btf_type_is_struct(t))
-			continue;
-		tname = __btf_name_by_offset(btf, t->name_off);
-		if (!strcmp(tname, "bpf_ctx_convert")) {
-			/* btf_parse_vmlinux() runs under bpf_verifier_lock */
-			bpf_ctx_convert.t = t;
-			break;
-		}
-	}
-	if (i > btf->nr_types) {
-		err = -ENOENT;
+	btf_id = btf_find_by_name_kind(btf, "bpf_ctx_convert", BTF_KIND_STRUCT);
+	if (btf_id < 0) {
+		err = btf_id;
 		goto errout;
 	}
+	/* btf_parse_vmlinux() runs under bpf_verifier_lock */
+	bpf_ctx_convert.t = btf_type_by_id(btf, btf_id);
 
 	bpf_struct_ops_init(btf, log);
 
-- 
cgit v1.2.3


From 41c48f3a98231738c5ce79f6f2aa6e40ba924d18 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 19 Jun 2020 14:11:43 -0700
Subject: bpf: Support access to bpf map fields

There are multiple use-cases when it's convenient to have access to bpf
map fields, both `struct bpf_map` and map type specific struct-s such as
`struct bpf_array`, `struct bpf_htab`, etc.

For example while working with sock arrays it can be necessary to
calculate the key based on map->max_entries (some_hash % max_entries).
Currently this is solved by communicating max_entries via "out-of-band"
channel, e.g. via additional map with known key to get info about target
map. That works, but is not very convenient and error-prone while
working with many maps.

In other cases necessary data is dynamic (i.e. unknown at loading time)
and it's impossible to get it at all. For example while working with a
hash table it can be convenient to know how much capacity is already
used (bpf_htab.count.counter for BPF_F_NO_PREALLOC case).

At the same time kernel knows this info and can provide it to bpf
program.

Fill this gap by adding support to access bpf map fields from bpf
program for both `struct bpf_map` and map type specific fields.

Support is implemented via btf_struct_access() so that a user can define
their own `struct bpf_map` or map type specific struct in their program
with only necessary fields and preserve_access_index attribute, cast a
map to this struct and use a field.

For example:

	struct bpf_map {
		__u32 max_entries;
	} __attribute__((preserve_access_index));

	struct bpf_array {
		struct bpf_map map;
		__u32 elem_size;
	} __attribute__((preserve_access_index));

	struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(max_entries, 4);
		__type(key, __u32);
		__type(value, __u32);
	} m_array SEC(".maps");

	SEC("cgroup_skb/egress")
	int cg_skb(void *ctx)
	{
		struct bpf_array *array = (struct bpf_array *)&m_array;
		struct bpf_map *map = (struct bpf_map *)&m_array;

		/* .. use map->max_entries or array->map.max_entries .. */
	}

Similarly to other btf_struct_access() use-cases (e.g. struct tcp_sock
in net/ipv4/bpf_tcp_ca.c) the patch allows access to any fields of
corresponding struct. Only reading from map fields is supported.

For btf_struct_access() to work there should be a way to know btf id of
a struct that corresponds to a map type. To get btf id there should be a
way to get a stringified name of map-specific struct, such as
"bpf_array", "bpf_htab", etc for a map type. Two new fields are added to
`struct bpf_map_ops` to handle it:
* .map_btf_name keeps a btf name of a struct returned by map_alloc();
* .map_btf_id is used to cache btf id of that struct.

To make btf ids calculation cheaper they're calculated once while
preparing btf_vmlinux and cached same way as it's done for btf_id field
of `struct bpf_func_proto`

While calculating btf ids, struct names are NOT checked for collision.
Collisions will be checked as a part of the work to prepare btf ids used
in verifier in compile time that should land soon. The only known
collision for `struct bpf_htab` (kernel/bpf/hashtab.c vs
net/core/sock_map.c) was fixed earlier.

Both new fields .map_btf_name and .map_btf_id must be set for a map type
for the feature to work. If neither is set for a map type, verifier will
return ENOTSUPP on a try to access map_ptr of corresponding type. If
just one of them set, it's verifier misconfiguration.

Only `struct bpf_array` for BPF_MAP_TYPE_ARRAY and `struct bpf_htab` for
BPF_MAP_TYPE_HASH are supported by this patch. Other map types will be
supported separately.

The feature is available only for CONFIG_DEBUG_INFO_BTF=y and gated by
perfmon_capable() so that unpriv programs won't have access to bpf map
fields.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/6479686a0cd1e9067993df57b4c3eef0e276fec9.1592600985.git.rdna@fb.com
---
 include/linux/bpf.h                                |  9 +++
 include/linux/bpf_verifier.h                       |  1 +
 kernel/bpf/arraymap.c                              |  3 +
 kernel/bpf/btf.c                                   | 40 +++++++++++
 kernel/bpf/hashtab.c                               |  3 +
 kernel/bpf/verifier.c                              | 82 +++++++++++++++++++---
 .../selftests/bpf/verifier/map_ptr_mixing.c        |  2 +-
 7 files changed, 131 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 07052d44bca1..1e1501ee53ce 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -92,6 +92,10 @@ struct bpf_map_ops {
 	int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma);
 	__poll_t (*map_poll)(struct bpf_map *map, struct file *filp,
 			     struct poll_table_struct *pts);
+
+	/* BTF name and id of struct allocated by map_alloc */
+	const char * const map_btf_name;
+	int *map_btf_id;
 };
 
 struct bpf_map_memory {
@@ -1109,6 +1113,11 @@ static inline bool bpf_allow_ptr_leaks(void)
 	return perfmon_capable();
 }
 
+static inline bool bpf_allow_ptr_to_map_access(void)
+{
+	return perfmon_capable();
+}
+
 static inline bool bpf_bypass_spec_v1(void)
 {
 	return perfmon_capable();
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index ca08db4ffb5f..53c7bd568c5d 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -379,6 +379,7 @@ struct bpf_verifier_env {
 	u32 used_map_cnt;		/* number of used maps */
 	u32 id_gen;			/* used to generate unique reg IDs */
 	bool allow_ptr_leaks;
+	bool allow_ptr_to_map_access;
 	bool bpf_capable;
 	bool bypass_spec_v1;
 	bool bypass_spec_v4;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 11584618e861..e7caa48812fb 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -494,6 +494,7 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
 				   vma->vm_pgoff + pgoff);
 }
 
+static int array_map_btf_id;
 const struct bpf_map_ops array_map_ops = {
 	.map_alloc_check = array_map_alloc_check,
 	.map_alloc = array_map_alloc,
@@ -510,6 +511,8 @@ const struct bpf_map_ops array_map_ops = {
 	.map_check_btf = array_map_check_btf,
 	.map_lookup_batch = generic_map_lookup_batch,
 	.map_update_batch = generic_map_update_batch,
+	.map_btf_name = "bpf_array",
+	.map_btf_id = &array_map_btf_id,
 };
 
 const struct bpf_map_ops percpu_array_map_ops = {
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 3eb804618a53..e377d1981730 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3571,6 +3571,41 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
 	return ctx_type;
 }
 
+static const struct bpf_map_ops * const btf_vmlinux_map_ops[] = {
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
+#define BPF_LINK_TYPE(_id, _name)
+#define BPF_MAP_TYPE(_id, _ops) \
+	[_id] = &_ops,
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+#undef BPF_LINK_TYPE
+#undef BPF_MAP_TYPE
+};
+
+static int btf_vmlinux_map_ids_init(const struct btf *btf,
+				    struct bpf_verifier_log *log)
+{
+	const struct bpf_map_ops *ops;
+	int i, btf_id;
+
+	for (i = 0; i < ARRAY_SIZE(btf_vmlinux_map_ops); ++i) {
+		ops = btf_vmlinux_map_ops[i];
+		if (!ops || (!ops->map_btf_name && !ops->map_btf_id))
+			continue;
+		if (!ops->map_btf_name || !ops->map_btf_id) {
+			bpf_log(log, "map type %d is misconfigured\n", i);
+			return -EINVAL;
+		}
+		btf_id = btf_find_by_name_kind(btf, ops->map_btf_name,
+					       BTF_KIND_STRUCT);
+		if (btf_id < 0)
+			return btf_id;
+		*ops->map_btf_id = btf_id;
+	}
+
+	return 0;
+}
+
 static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
 				     struct btf *btf,
 				     const struct btf_type *t,
@@ -3633,6 +3668,11 @@ struct btf *btf_parse_vmlinux(void)
 	/* btf_parse_vmlinux() runs under bpf_verifier_lock */
 	bpf_ctx_convert.t = btf_type_by_id(btf, btf_id);
 
+	/* find bpf map structs for map_ptr access checking */
+	err = btf_vmlinux_map_ids_init(btf, log);
+	if (err < 0)
+		goto errout;
+
 	bpf_struct_ops_init(btf, log);
 
 	btf_verifier_env_free(env);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index b4b288a3c3c9..2c5999e02060 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1614,6 +1614,7 @@ htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
 						  true, false);
 }
 
+static int htab_map_btf_id;
 const struct bpf_map_ops htab_map_ops = {
 	.map_alloc_check = htab_map_alloc_check,
 	.map_alloc = htab_map_alloc,
@@ -1625,6 +1626,8 @@ const struct bpf_map_ops htab_map_ops = {
 	.map_gen_lookup = htab_map_gen_lookup,
 	.map_seq_show_elem = htab_map_seq_show_elem,
 	BATCH_OPS(htab),
+	.map_btf_name = "bpf_htab",
+	.map_btf_id = &htab_map_btf_id,
 };
 
 const struct bpf_map_ops htab_lru_map_ops = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a1857c4ffaaf..7460f967cb75 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1351,6 +1351,19 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
 	__mark_reg_not_init(env, regs + regno);
 }
 
+static void mark_btf_ld_reg(struct bpf_verifier_env *env,
+			    struct bpf_reg_state *regs, u32 regno,
+			    enum bpf_reg_type reg_type, u32 btf_id)
+{
+	if (reg_type == SCALAR_VALUE) {
+		mark_reg_unknown(env, regs, regno);
+		return;
+	}
+	mark_reg_known_zero(env, regs, regno);
+	regs[regno].type = PTR_TO_BTF_ID;
+	regs[regno].btf_id = btf_id;
+}
+
 #define DEF_NOT_SUBREG	(0)
 static void init_reg_state(struct bpf_verifier_env *env,
 			   struct bpf_func_state *state)
@@ -3182,19 +3195,68 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 	if (ret < 0)
 		return ret;
 
-	if (atype == BPF_READ && value_regno >= 0) {
-		if (ret == SCALAR_VALUE) {
-			mark_reg_unknown(env, regs, value_regno);
-			return 0;
-		}
-		mark_reg_known_zero(env, regs, value_regno);
-		regs[value_regno].type = PTR_TO_BTF_ID;
-		regs[value_regno].btf_id = btf_id;
+	if (atype == BPF_READ && value_regno >= 0)
+		mark_btf_ld_reg(env, regs, value_regno, ret, btf_id);
+
+	return 0;
+}
+
+static int check_ptr_to_map_access(struct bpf_verifier_env *env,
+				   struct bpf_reg_state *regs,
+				   int regno, int off, int size,
+				   enum bpf_access_type atype,
+				   int value_regno)
+{
+	struct bpf_reg_state *reg = regs + regno;
+	struct bpf_map *map = reg->map_ptr;
+	const struct btf_type *t;
+	const char *tname;
+	u32 btf_id;
+	int ret;
+
+	if (!btf_vmlinux) {
+		verbose(env, "map_ptr access not supported without CONFIG_DEBUG_INFO_BTF\n");
+		return -ENOTSUPP;
+	}
+
+	if (!map->ops->map_btf_id || !*map->ops->map_btf_id) {
+		verbose(env, "map_ptr access not supported for map type %d\n",
+			map->map_type);
+		return -ENOTSUPP;
+	}
+
+	t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id);
+	tname = btf_name_by_offset(btf_vmlinux, t->name_off);
+
+	if (!env->allow_ptr_to_map_access) {
+		verbose(env,
+			"%s access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
+			tname);
+		return -EPERM;
 	}
 
+	if (off < 0) {
+		verbose(env, "R%d is %s invalid negative access: off=%d\n",
+			regno, tname, off);
+		return -EACCES;
+	}
+
+	if (atype != BPF_READ) {
+		verbose(env, "only read from %s is supported\n", tname);
+		return -EACCES;
+	}
+
+	ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id);
+	if (ret < 0)
+		return ret;
+
+	if (value_regno >= 0)
+		mark_btf_ld_reg(env, regs, value_regno, ret, btf_id);
+
 	return 0;
 }
 
+
 /* check whether memory at (regno + off) is accessible for t = (read | write)
  * if t==write, value_regno is a register which value is stored into memory
  * if t==read, value_regno is a register which will receive the value from memory
@@ -3363,6 +3425,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 	} else if (reg->type == PTR_TO_BTF_ID) {
 		err = check_ptr_to_btf_access(env, regs, regno, off, size, t,
 					      value_regno);
+	} else if (reg->type == CONST_PTR_TO_MAP) {
+		err = check_ptr_to_map_access(env, regs, regno, off, size, t,
+					      value_regno);
 	} else {
 		verbose(env, "R%d invalid mem access '%s'\n", regno,
 			reg_type_str[reg->type]);
@@ -10951,6 +11016,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 		env->strict_alignment = false;
 
 	env->allow_ptr_leaks = bpf_allow_ptr_leaks();
+	env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access();
 	env->bypass_spec_v1 = bpf_bypass_spec_v1();
 	env->bypass_spec_v4 = bpf_bypass_spec_v4();
 	env->bpf_capable = bpf_capable();
diff --git a/tools/testing/selftests/bpf/verifier/map_ptr_mixing.c b/tools/testing/selftests/bpf/verifier/map_ptr_mixing.c
index cd26ee6b7b1d..1f2b8c4cb26d 100644
--- a/tools/testing/selftests/bpf/verifier/map_ptr_mixing.c
+++ b/tools/testing/selftests/bpf/verifier/map_ptr_mixing.c
@@ -56,7 +56,7 @@
 	.fixup_map_in_map = { 16 },
 	.fixup_map_array_48b = { 13 },
 	.result = REJECT,
-	.errstr = "R0 invalid mem access 'map_ptr'",
+	.errstr = "only read from bpf_array is supported",
 },
 {
 	"cond: two branches returning different map pointers for lookup (tail, tail)",
-- 
cgit v1.2.3


From 2872e9ac33a4440173418147351ed4f93177e763 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 19 Jun 2020 14:11:44 -0700
Subject: bpf: Set map_btf_{name, id} for all map types

Set map_btf_name and map_btf_id for all map types so that map fields can
be accessed by bpf programs.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/a825f808f22af52b018dbe82f1c7d29dab5fc978.1592600985.git.rdna@fb.com
---
 kernel/bpf/arraymap.c         | 15 +++++++++++++++
 kernel/bpf/bpf_struct_ops.c   |  3 +++
 kernel/bpf/cpumap.c           |  3 +++
 kernel/bpf/devmap.c           |  6 ++++++
 kernel/bpf/hashtab.c          | 12 ++++++++++++
 kernel/bpf/local_storage.c    |  3 +++
 kernel/bpf/lpm_trie.c         |  3 +++
 kernel/bpf/queue_stack_maps.c |  6 ++++++
 kernel/bpf/reuseport_array.c  |  3 +++
 kernel/bpf/ringbuf.c          |  3 +++
 kernel/bpf/stackmap.c         |  3 +++
 net/core/bpf_sk_storage.c     |  3 +++
 net/core/sock_map.c           |  6 ++++++
 net/xdp/xskmap.c              |  3 +++
 14 files changed, 72 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index e7caa48812fb..ec5cd11032aa 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -515,6 +515,7 @@ const struct bpf_map_ops array_map_ops = {
 	.map_btf_id = &array_map_btf_id,
 };
 
+static int percpu_array_map_btf_id;
 const struct bpf_map_ops percpu_array_map_ops = {
 	.map_alloc_check = array_map_alloc_check,
 	.map_alloc = array_map_alloc,
@@ -525,6 +526,8 @@ const struct bpf_map_ops percpu_array_map_ops = {
 	.map_delete_elem = array_map_delete_elem,
 	.map_seq_show_elem = percpu_array_map_seq_show_elem,
 	.map_check_btf = array_map_check_btf,
+	.map_btf_name = "bpf_array",
+	.map_btf_id = &percpu_array_map_btf_id,
 };
 
 static int fd_array_map_alloc_check(union bpf_attr *attr)
@@ -871,6 +874,7 @@ static void prog_array_map_free(struct bpf_map *map)
 	fd_array_map_free(map);
 }
 
+static int prog_array_map_btf_id;
 const struct bpf_map_ops prog_array_map_ops = {
 	.map_alloc_check = fd_array_map_alloc_check,
 	.map_alloc = prog_array_map_alloc,
@@ -886,6 +890,8 @@ const struct bpf_map_ops prog_array_map_ops = {
 	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
 	.map_release_uref = prog_array_map_clear,
 	.map_seq_show_elem = prog_array_map_seq_show_elem,
+	.map_btf_name = "bpf_array",
+	.map_btf_id = &prog_array_map_btf_id,
 };
 
 static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
@@ -964,6 +970,7 @@ static void perf_event_fd_array_release(struct bpf_map *map,
 	rcu_read_unlock();
 }
 
+static int perf_event_array_map_btf_id;
 const struct bpf_map_ops perf_event_array_map_ops = {
 	.map_alloc_check = fd_array_map_alloc_check,
 	.map_alloc = array_map_alloc,
@@ -975,6 +982,8 @@ const struct bpf_map_ops perf_event_array_map_ops = {
 	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
 	.map_release = perf_event_fd_array_release,
 	.map_check_btf = map_check_no_btf,
+	.map_btf_name = "bpf_array",
+	.map_btf_id = &perf_event_array_map_btf_id,
 };
 
 #ifdef CONFIG_CGROUPS
@@ -997,6 +1006,7 @@ static void cgroup_fd_array_free(struct bpf_map *map)
 	fd_array_map_free(map);
 }
 
+static int cgroup_array_map_btf_id;
 const struct bpf_map_ops cgroup_array_map_ops = {
 	.map_alloc_check = fd_array_map_alloc_check,
 	.map_alloc = array_map_alloc,
@@ -1007,6 +1017,8 @@ const struct bpf_map_ops cgroup_array_map_ops = {
 	.map_fd_get_ptr = cgroup_fd_array_get_ptr,
 	.map_fd_put_ptr = cgroup_fd_array_put_ptr,
 	.map_check_btf = map_check_no_btf,
+	.map_btf_name = "bpf_array",
+	.map_btf_id = &cgroup_array_map_btf_id,
 };
 #endif
 
@@ -1080,6 +1092,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map,
 	return insn - insn_buf;
 }
 
+static int array_of_maps_map_btf_id;
 const struct bpf_map_ops array_of_maps_map_ops = {
 	.map_alloc_check = fd_array_map_alloc_check,
 	.map_alloc = array_of_map_alloc,
@@ -1092,4 +1105,6 @@ const struct bpf_map_ops array_of_maps_map_ops = {
 	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
 	.map_gen_lookup = array_of_map_gen_lookup,
 	.map_check_btf = map_check_no_btf,
+	.map_btf_name = "bpf_array",
+	.map_btf_id = &array_of_maps_map_btf_id,
 };
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index c6b0decaa46a..969c5d47f81f 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -611,6 +611,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	return map;
 }
 
+static int bpf_struct_ops_map_btf_id;
 const struct bpf_map_ops bpf_struct_ops_map_ops = {
 	.map_alloc_check = bpf_struct_ops_map_alloc_check,
 	.map_alloc = bpf_struct_ops_map_alloc,
@@ -620,6 +621,8 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
 	.map_delete_elem = bpf_struct_ops_map_delete_elem,
 	.map_update_elem = bpf_struct_ops_map_update_elem,
 	.map_seq_show_elem = bpf_struct_ops_map_seq_show_elem,
+	.map_btf_name = "bpf_struct_ops_map",
+	.map_btf_id = &bpf_struct_ops_map_btf_id,
 };
 
 /* "const void *" because some subsystem is
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 27595fc6da56..bd8658055c16 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -543,6 +543,7 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	return 0;
 }
 
+static int cpu_map_btf_id;
 const struct bpf_map_ops cpu_map_ops = {
 	.map_alloc		= cpu_map_alloc,
 	.map_free		= cpu_map_free,
@@ -551,6 +552,8 @@ const struct bpf_map_ops cpu_map_ops = {
 	.map_lookup_elem	= cpu_map_lookup_elem,
 	.map_get_next_key	= cpu_map_get_next_key,
 	.map_check_btf		= map_check_no_btf,
+	.map_btf_name		= "bpf_cpu_map",
+	.map_btf_id		= &cpu_map_btf_id,
 };
 
 static int bq_flush_to_queue(struct xdp_bulk_queue *bq)
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 0cbb72cdaf63..58acc46861ef 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -747,6 +747,7 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
 					 map, key, value, map_flags);
 }
 
+static int dev_map_btf_id;
 const struct bpf_map_ops dev_map_ops = {
 	.map_alloc = dev_map_alloc,
 	.map_free = dev_map_free,
@@ -755,8 +756,11 @@ const struct bpf_map_ops dev_map_ops = {
 	.map_update_elem = dev_map_update_elem,
 	.map_delete_elem = dev_map_delete_elem,
 	.map_check_btf = map_check_no_btf,
+	.map_btf_name = "bpf_dtab",
+	.map_btf_id = &dev_map_btf_id,
 };
 
+static int dev_map_hash_map_btf_id;
 const struct bpf_map_ops dev_map_hash_ops = {
 	.map_alloc = dev_map_alloc,
 	.map_free = dev_map_free,
@@ -765,6 +769,8 @@ const struct bpf_map_ops dev_map_hash_ops = {
 	.map_update_elem = dev_map_hash_update_elem,
 	.map_delete_elem = dev_map_hash_delete_elem,
 	.map_check_btf = map_check_no_btf,
+	.map_btf_name = "bpf_dtab",
+	.map_btf_id = &dev_map_hash_map_btf_id,
 };
 
 static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 2c5999e02060..acd06081d81d 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1630,6 +1630,7 @@ const struct bpf_map_ops htab_map_ops = {
 	.map_btf_id = &htab_map_btf_id,
 };
 
+static int htab_lru_map_btf_id;
 const struct bpf_map_ops htab_lru_map_ops = {
 	.map_alloc_check = htab_map_alloc_check,
 	.map_alloc = htab_map_alloc,
@@ -1642,6 +1643,8 @@ const struct bpf_map_ops htab_lru_map_ops = {
 	.map_gen_lookup = htab_lru_map_gen_lookup,
 	.map_seq_show_elem = htab_map_seq_show_elem,
 	BATCH_OPS(htab_lru),
+	.map_btf_name = "bpf_htab",
+	.map_btf_id = &htab_lru_map_btf_id,
 };
 
 /* Called from eBPF program */
@@ -1746,6 +1749,7 @@ static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
 	rcu_read_unlock();
 }
 
+static int htab_percpu_map_btf_id;
 const struct bpf_map_ops htab_percpu_map_ops = {
 	.map_alloc_check = htab_map_alloc_check,
 	.map_alloc = htab_map_alloc,
@@ -1756,8 +1760,11 @@ const struct bpf_map_ops htab_percpu_map_ops = {
 	.map_delete_elem = htab_map_delete_elem,
 	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
 	BATCH_OPS(htab_percpu),
+	.map_btf_name = "bpf_htab",
+	.map_btf_id = &htab_percpu_map_btf_id,
 };
 
+static int htab_lru_percpu_map_btf_id;
 const struct bpf_map_ops htab_lru_percpu_map_ops = {
 	.map_alloc_check = htab_map_alloc_check,
 	.map_alloc = htab_map_alloc,
@@ -1768,6 +1775,8 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
 	.map_delete_elem = htab_lru_map_delete_elem,
 	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
 	BATCH_OPS(htab_lru_percpu),
+	.map_btf_name = "bpf_htab",
+	.map_btf_id = &htab_lru_percpu_map_btf_id,
 };
 
 static int fd_htab_map_alloc_check(union bpf_attr *attr)
@@ -1890,6 +1899,7 @@ static void htab_of_map_free(struct bpf_map *map)
 	fd_htab_map_free(map);
 }
 
+static int htab_of_maps_map_btf_id;
 const struct bpf_map_ops htab_of_maps_map_ops = {
 	.map_alloc_check = fd_htab_map_alloc_check,
 	.map_alloc = htab_of_map_alloc,
@@ -1902,4 +1912,6 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
 	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
 	.map_gen_lookup = htab_of_map_gen_lookup,
 	.map_check_btf = map_check_no_btf,
+	.map_btf_name = "bpf_htab",
+	.map_btf_id = &htab_of_maps_map_btf_id,
 };
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 33d01866bcc2..51bd5a8cb01b 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -409,6 +409,7 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *_key,
 	rcu_read_unlock();
 }
 
+static int cgroup_storage_map_btf_id;
 const struct bpf_map_ops cgroup_storage_map_ops = {
 	.map_alloc = cgroup_storage_map_alloc,
 	.map_free = cgroup_storage_map_free,
@@ -418,6 +419,8 @@ const struct bpf_map_ops cgroup_storage_map_ops = {
 	.map_delete_elem = cgroup_storage_delete_elem,
 	.map_check_btf = cgroup_storage_check_btf,
 	.map_seq_show_elem = cgroup_storage_seq_show_elem,
+	.map_btf_name = "bpf_cgroup_storage_map",
+	.map_btf_id = &cgroup_storage_map_btf_id,
 };
 
 int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map)
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index c8cc4e4cf98d..1abd4e3f906d 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -735,6 +735,7 @@ static int trie_check_btf(const struct bpf_map *map,
 	       -EINVAL : 0;
 }
 
+static int trie_map_btf_id;
 const struct bpf_map_ops trie_map_ops = {
 	.map_alloc = trie_alloc,
 	.map_free = trie_free,
@@ -743,4 +744,6 @@ const struct bpf_map_ops trie_map_ops = {
 	.map_update_elem = trie_update_elem,
 	.map_delete_elem = trie_delete_elem,
 	.map_check_btf = trie_check_btf,
+	.map_btf_name = "lpm_trie",
+	.map_btf_id = &trie_map_btf_id,
 };
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 05c8e043b9d2..80c66a6d7c54 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -262,6 +262,7 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key,
 	return -EINVAL;
 }
 
+static int queue_map_btf_id;
 const struct bpf_map_ops queue_map_ops = {
 	.map_alloc_check = queue_stack_map_alloc_check,
 	.map_alloc = queue_stack_map_alloc,
@@ -273,8 +274,11 @@ const struct bpf_map_ops queue_map_ops = {
 	.map_pop_elem = queue_map_pop_elem,
 	.map_peek_elem = queue_map_peek_elem,
 	.map_get_next_key = queue_stack_map_get_next_key,
+	.map_btf_name = "bpf_queue_stack",
+	.map_btf_id = &queue_map_btf_id,
 };
 
+static int stack_map_btf_id;
 const struct bpf_map_ops stack_map_ops = {
 	.map_alloc_check = queue_stack_map_alloc_check,
 	.map_alloc = queue_stack_map_alloc,
@@ -286,4 +290,6 @@ const struct bpf_map_ops stack_map_ops = {
 	.map_pop_elem = stack_map_pop_elem,
 	.map_peek_elem = stack_map_peek_elem,
 	.map_get_next_key = queue_stack_map_get_next_key,
+	.map_btf_name = "bpf_queue_stack",
+	.map_btf_id = &stack_map_btf_id,
 };
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 21cde24386db..a09922f656e4 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -345,6 +345,7 @@ static int reuseport_array_get_next_key(struct bpf_map *map, void *key,
 	return 0;
 }
 
+static int reuseport_array_map_btf_id;
 const struct bpf_map_ops reuseport_array_ops = {
 	.map_alloc_check = reuseport_array_alloc_check,
 	.map_alloc = reuseport_array_alloc,
@@ -352,4 +353,6 @@ const struct bpf_map_ops reuseport_array_ops = {
 	.map_lookup_elem = reuseport_array_lookup_elem,
 	.map_get_next_key = reuseport_array_get_next_key,
 	.map_delete_elem = reuseport_array_delete_elem,
+	.map_btf_name = "reuseport_array",
+	.map_btf_id = &reuseport_array_map_btf_id,
 };
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 180414bb0d3e..dbf37aff4827 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -294,6 +294,7 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
 	return 0;
 }
 
+static int ringbuf_map_btf_id;
 const struct bpf_map_ops ringbuf_map_ops = {
 	.map_alloc = ringbuf_map_alloc,
 	.map_free = ringbuf_map_free,
@@ -303,6 +304,8 @@ const struct bpf_map_ops ringbuf_map_ops = {
 	.map_update_elem = ringbuf_map_update_elem,
 	.map_delete_elem = ringbuf_map_delete_elem,
 	.map_get_next_key = ringbuf_map_get_next_key,
+	.map_btf_name = "bpf_ringbuf_map",
+	.map_btf_id = &ringbuf_map_btf_id,
 };
 
 /* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself,
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 599488f25e40..27dc9b1b08a5 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -613,6 +613,7 @@ static void stack_map_free(struct bpf_map *map)
 	put_callchain_buffers();
 }
 
+static int stack_trace_map_btf_id;
 const struct bpf_map_ops stack_trace_map_ops = {
 	.map_alloc = stack_map_alloc,
 	.map_free = stack_map_free,
@@ -621,6 +622,8 @@ const struct bpf_map_ops stack_trace_map_ops = {
 	.map_update_elem = stack_map_update_elem,
 	.map_delete_elem = stack_map_delete_elem,
 	.map_check_btf = map_check_no_btf,
+	.map_btf_name = "bpf_stack_map",
+	.map_btf_id = &stack_trace_map_btf_id,
 };
 
 static int __init stack_map_init(void)
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 1dae4b543243..6f921c4ddc2c 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -919,6 +919,7 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk)
 	return -ENOENT;
 }
 
+static int sk_storage_map_btf_id;
 const struct bpf_map_ops sk_storage_map_ops = {
 	.map_alloc_check = bpf_sk_storage_map_alloc_check,
 	.map_alloc = bpf_sk_storage_map_alloc,
@@ -928,6 +929,8 @@ const struct bpf_map_ops sk_storage_map_ops = {
 	.map_update_elem = bpf_fd_sk_storage_update_elem,
 	.map_delete_elem = bpf_fd_sk_storage_delete_elem,
 	.map_check_btf = bpf_sk_storage_map_check_btf,
+	.map_btf_name = "bpf_sk_storage_map",
+	.map_btf_id = &sk_storage_map_btf_id,
 };
 
 const struct bpf_func_proto bpf_sk_storage_get_proto = {
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 2b884f2d562a..4c1123c749bb 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -643,6 +643,7 @@ const struct bpf_func_proto bpf_msg_redirect_map_proto = {
 	.arg4_type      = ARG_ANYTHING,
 };
 
+static int sock_map_btf_id;
 const struct bpf_map_ops sock_map_ops = {
 	.map_alloc		= sock_map_alloc,
 	.map_free		= sock_map_free,
@@ -653,6 +654,8 @@ const struct bpf_map_ops sock_map_ops = {
 	.map_lookup_elem	= sock_map_lookup,
 	.map_release_uref	= sock_map_release_progs,
 	.map_check_btf		= map_check_no_btf,
+	.map_btf_name		= "bpf_stab",
+	.map_btf_id		= &sock_map_btf_id,
 };
 
 struct bpf_shtab_elem {
@@ -1176,6 +1179,7 @@ const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
 	.arg4_type      = ARG_ANYTHING,
 };
 
+static int sock_hash_map_btf_id;
 const struct bpf_map_ops sock_hash_ops = {
 	.map_alloc		= sock_hash_alloc,
 	.map_free		= sock_hash_free,
@@ -1186,6 +1190,8 @@ const struct bpf_map_ops sock_hash_ops = {
 	.map_lookup_elem_sys_only = sock_hash_lookup_sys,
 	.map_release_uref	= sock_hash_release_progs,
 	.map_check_btf		= map_check_no_btf,
+	.map_btf_name		= "bpf_shtab",
+	.map_btf_id		= &sock_hash_map_btf_id,
 };
 
 static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
index 1dc7208c71ba..8367adbbe9df 100644
--- a/net/xdp/xskmap.c
+++ b/net/xdp/xskmap.c
@@ -254,6 +254,7 @@ void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
 	spin_unlock_bh(&map->lock);
 }
 
+static int xsk_map_btf_id;
 const struct bpf_map_ops xsk_map_ops = {
 	.map_alloc = xsk_map_alloc,
 	.map_free = xsk_map_free,
@@ -264,4 +265,6 @@ const struct bpf_map_ops xsk_map_ops = {
 	.map_update_elem = xsk_map_update_elem,
 	.map_delete_elem = xsk_map_delete_elem,
 	.map_check_btf = map_check_no_btf,
+	.map_btf_name = "xsk_map",
+	.map_btf_id = &xsk_map_btf_id,
 };
-- 
cgit v1.2.3


From 8e6cf365e1d5c70e275a77a3c5ad7e3dc685474c Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Thu, 4 Jun 2020 09:20:49 -0400
Subject: audit: log nftables configuration change events

iptables, ip6tables, arptables and ebtables table registration,
replacement and unregistration configuration events are logged for the
native (legacy) iptables setsockopt api, but not for the
nftables netlink api which is used by the nft-variant of iptables in
addition to nftables itself.

Add calls to log the configuration actions in the nftables netlink api.

This uses the same NETFILTER_CFG record format but overloads the table
field.

  type=NETFILTER_CFG msg=audit(2020-05-28 17:46:41.878:162) : table=?:0;?:0 family=unspecified entries=2 op=nft_register_gen pid=396 subj=system_u:system_r:firewalld_t:s0 comm=firewalld
  ...
  type=NETFILTER_CFG msg=audit(2020-05-28 17:46:41.878:162) : table=firewalld:1;?:0 family=inet entries=0 op=nft_register_table pid=396 subj=system_u:system_r:firewalld_t:s0 comm=firewalld
  ...
  type=NETFILTER_CFG msg=audit(2020-05-28 17:46:41.911:163) : table=firewalld:1;filter_FORWARD:85 family=inet entries=8 op=nft_register_chain pid=396 subj=system_u:system_r:firewalld_t:s0 comm=firewalld
  ...
  type=NETFILTER_CFG msg=audit(2020-05-28 17:46:41.911:163) : table=firewalld:1;filter_FORWARD:85 family=inet entries=101 op=nft_register_rule pid=396 subj=system_u:system_r:firewalld_t:s0 comm=firewalld
  ...
  type=NETFILTER_CFG msg=audit(2020-05-28 17:46:41.911:163) : table=firewalld:1;__set0:87 family=inet entries=87 op=nft_register_setelem pid=396 subj=system_u:system_r:firewalld_t:s0 comm=firewalld
  ...
  type=NETFILTER_CFG msg=audit(2020-05-28 17:46:41.911:163) : table=firewalld:1;__set0:87 family=inet entries=0 op=nft_register_set pid=396 subj=system_u:system_r:firewalld_t:s0 comm=firewalld

For further information please see issue
https://github.com/linux-audit/audit-kernel/issues/124

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/audit.h         |  18 ++++++++
 kernel/auditsc.c              |  24 ++++++++--
 net/netfilter/nf_tables_api.c | 103 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 142 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 3fcd9ee49734..604ede630580 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -12,6 +12,7 @@
 #include <linux/sched.h>
 #include <linux/ptrace.h>
 #include <uapi/linux/audit.h>
+#include <uapi/linux/netfilter/nf_tables.h>
 
 #define AUDIT_INO_UNSET ((unsigned long)-1)
 #define AUDIT_DEV_UNSET ((dev_t)-1)
@@ -98,6 +99,23 @@ enum audit_nfcfgop {
 	AUDIT_XT_OP_REGISTER,
 	AUDIT_XT_OP_REPLACE,
 	AUDIT_XT_OP_UNREGISTER,
+	AUDIT_NFT_OP_TABLE_REGISTER,
+	AUDIT_NFT_OP_TABLE_UNREGISTER,
+	AUDIT_NFT_OP_CHAIN_REGISTER,
+	AUDIT_NFT_OP_CHAIN_UNREGISTER,
+	AUDIT_NFT_OP_RULE_REGISTER,
+	AUDIT_NFT_OP_RULE_UNREGISTER,
+	AUDIT_NFT_OP_SET_REGISTER,
+	AUDIT_NFT_OP_SET_UNREGISTER,
+	AUDIT_NFT_OP_SETELEM_REGISTER,
+	AUDIT_NFT_OP_SETELEM_UNREGISTER,
+	AUDIT_NFT_OP_GEN_REGISTER,
+	AUDIT_NFT_OP_OBJ_REGISTER,
+	AUDIT_NFT_OP_OBJ_UNREGISTER,
+	AUDIT_NFT_OP_OBJ_RESET,
+	AUDIT_NFT_OP_FLOWTABLE_REGISTER,
+	AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,
+	AUDIT_NFT_OP_INVALID,
 };
 
 extern int is_audit_feature_set(int which);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 468a23390457..3a9100e95fda 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -75,6 +75,7 @@
 #include <linux/uaccess.h>
 #include <linux/fsnotify_backend.h>
 #include <uapi/linux/limits.h>
+#include <uapi/linux/netfilter/nf_tables.h>
 
 #include "audit.h"
 
@@ -136,9 +137,26 @@ struct audit_nfcfgop_tab {
 };
 
 static const struct audit_nfcfgop_tab audit_nfcfgs[] = {
-	{ AUDIT_XT_OP_REGISTER,		"register"	},
-	{ AUDIT_XT_OP_REPLACE,		"replace"	},
-	{ AUDIT_XT_OP_UNREGISTER,	"unregister"	},
+	{ AUDIT_XT_OP_REGISTER,			"xt_register"		   },
+	{ AUDIT_XT_OP_REPLACE,			"xt_replace"		   },
+	{ AUDIT_XT_OP_UNREGISTER,		"xt_unregister"		   },
+	{ AUDIT_NFT_OP_TABLE_REGISTER,		"nft_register_table"	   },
+	{ AUDIT_NFT_OP_TABLE_UNREGISTER,	"nft_unregister_table"	   },
+	{ AUDIT_NFT_OP_CHAIN_REGISTER,		"nft_register_chain"	   },
+	{ AUDIT_NFT_OP_CHAIN_UNREGISTER,	"nft_unregister_chain"	   },
+	{ AUDIT_NFT_OP_RULE_REGISTER,		"nft_register_rule"	   },
+	{ AUDIT_NFT_OP_RULE_UNREGISTER,		"nft_unregister_rule"	   },
+	{ AUDIT_NFT_OP_SET_REGISTER,		"nft_register_set"	   },
+	{ AUDIT_NFT_OP_SET_UNREGISTER,		"nft_unregister_set"	   },
+	{ AUDIT_NFT_OP_SETELEM_REGISTER,	"nft_register_setelem"	   },
+	{ AUDIT_NFT_OP_SETELEM_UNREGISTER,	"nft_unregister_setelem"   },
+	{ AUDIT_NFT_OP_GEN_REGISTER,		"nft_register_gen"	   },
+	{ AUDIT_NFT_OP_OBJ_REGISTER,		"nft_register_obj"	   },
+	{ AUDIT_NFT_OP_OBJ_UNREGISTER,		"nft_unregister_obj"	   },
+	{ AUDIT_NFT_OP_OBJ_RESET,		"nft_reset_obj"		   },
+	{ AUDIT_NFT_OP_FLOWTABLE_REGISTER,	"nft_register_flowtable"   },
+	{ AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,	"nft_unregister_flowtable" },
+	{ AUDIT_NFT_OP_INVALID,			"nft_invalid"		   },
 };
 
 static int audit_match_perm(struct audit_context *ctx, int mask)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 073aa1051d43..164700273947 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -12,6 +12,7 @@
 #include <linux/netlink.h>
 #include <linux/vmalloc.h>
 #include <linux/rhashtable.h>
+#include <linux/audit.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nf_tables.h>
@@ -693,6 +694,16 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event)
 {
 	struct sk_buff *skb;
 	int err;
+	char *buf = kasprintf(GFP_KERNEL, "%s:%llu;?:0",
+			      ctx->table->name, ctx->table->handle);
+
+	audit_log_nfcfg(buf,
+			ctx->family,
+			ctx->table->use,
+			event == NFT_MSG_NEWTABLE ?
+				AUDIT_NFT_OP_TABLE_REGISTER :
+				AUDIT_NFT_OP_TABLE_UNREGISTER);
+	kfree(buf);
 
 	if (!ctx->report &&
 	    !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
@@ -1428,6 +1439,17 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
 {
 	struct sk_buff *skb;
 	int err;
+	char *buf = kasprintf(GFP_KERNEL, "%s:%llu;%s:%llu",
+			      ctx->table->name, ctx->table->handle,
+			      ctx->chain->name, ctx->chain->handle);
+
+	audit_log_nfcfg(buf,
+			ctx->family,
+			ctx->chain->use,
+			event == NFT_MSG_NEWCHAIN ?
+				AUDIT_NFT_OP_CHAIN_REGISTER :
+				AUDIT_NFT_OP_CHAIN_UNREGISTER);
+	kfree(buf);
 
 	if (!ctx->report &&
 	    !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
@@ -2693,6 +2715,17 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx,
 {
 	struct sk_buff *skb;
 	int err;
+	char *buf = kasprintf(GFP_KERNEL, "%s:%llu;%s:%llu",
+			      ctx->table->name, ctx->table->handle,
+			      ctx->chain->name, ctx->chain->handle);
+
+	audit_log_nfcfg(buf,
+			ctx->family,
+			rule->handle,
+			event == NFT_MSG_NEWRULE ?
+				AUDIT_NFT_OP_RULE_REGISTER :
+				AUDIT_NFT_OP_RULE_UNREGISTER);
+	kfree(buf);
 
 	if (!ctx->report &&
 	    !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
@@ -3695,6 +3728,17 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx,
 	struct sk_buff *skb;
 	u32 portid = ctx->portid;
 	int err;
+	char *buf = kasprintf(gfp_flags, "%s:%llu;%s:%llu",
+			      ctx->table->name, ctx->table->handle,
+			      set->name, set->handle);
+
+	audit_log_nfcfg(buf,
+			ctx->family,
+			set->field_count,
+			event == NFT_MSG_NEWSET ?
+				AUDIT_NFT_OP_SET_REGISTER :
+				AUDIT_NFT_OP_SET_UNREGISTER);
+	kfree(buf);
 
 	if (!ctx->report &&
 	    !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
@@ -4811,6 +4855,17 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx,
 	u32 portid = ctx->portid;
 	struct sk_buff *skb;
 	int err;
+	char *buf = kasprintf(GFP_KERNEL, "%s:%llu;%s:%llu",
+			      ctx->table->name, ctx->table->handle,
+			      set->name, set->handle);
+
+	audit_log_nfcfg(buf,
+			ctx->family,
+			set->handle,
+			event == NFT_MSG_NEWSETELEM ?
+				AUDIT_NFT_OP_SETELEM_REGISTER :
+				AUDIT_NFT_OP_SETELEM_UNREGISTER);
+	kfree(buf);
 
 	if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
 		return;
@@ -5892,6 +5947,19 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
 			    obj->ops->type->type != filter->type)
 				goto cont;
 
+			if (reset) {
+				char *buf = kasprintf(GFP_KERNEL,
+						      "%s:%llu;?:0",
+						      table->name,
+						      table->handle);
+
+				audit_log_nfcfg(buf,
+						family,
+						obj->handle,
+						AUDIT_NFT_OP_OBJ_RESET);
+				kfree(buf);
+			}
+
 			if (nf_tables_fill_obj_info(skb, net, NETLINK_CB(cb->skb).portid,
 						    cb->nlh->nlmsg_seq,
 						    NFT_MSG_NEWOBJ,
@@ -6002,6 +6070,17 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
 	if (NFNL_MSG_TYPE(nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
 		reset = true;
 
+	if (reset) {
+		char *buf = kasprintf(GFP_KERNEL, "%s:%llu;?:0",
+				      table->name, table->handle);
+
+		audit_log_nfcfg(buf,
+				family,
+				obj->handle,
+				AUDIT_NFT_OP_OBJ_RESET);
+		kfree(buf);
+	}
+
 	err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid,
 				      nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
 				      family, table, obj, reset);
@@ -6077,6 +6156,16 @@ void nft_obj_notify(struct net *net, const struct nft_table *table,
 {
 	struct sk_buff *skb;
 	int err;
+	char *buf = kasprintf(GFP_KERNEL, "%s:%llu;?:0",
+			      table->name, table->handle);
+
+	audit_log_nfcfg(buf,
+			family,
+			obj->handle,
+			event == NFT_MSG_NEWOBJ ?
+				AUDIT_NFT_OP_OBJ_REGISTER :
+				AUDIT_NFT_OP_OBJ_UNREGISTER);
+	kfree(buf);
 
 	if (!report &&
 	    !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
@@ -6856,6 +6945,17 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
 {
 	struct sk_buff *skb;
 	int err;
+	char *buf = kasprintf(GFP_KERNEL, "%s:%llu;%s:%llu",
+			      flowtable->table->name, flowtable->table->handle,
+			      flowtable->name, flowtable->handle);
+
+	audit_log_nfcfg(buf,
+			ctx->family,
+			flowtable->hooknum,
+			event == NFT_MSG_NEWFLOWTABLE ?
+				AUDIT_NFT_OP_FLOWTABLE_REGISTER :
+				AUDIT_NFT_OP_FLOWTABLE_UNREGISTER);
+	kfree(buf);
 
 	if (ctx->report &&
 	    !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
@@ -6977,6 +7077,9 @@ static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb,
 	struct sk_buff *skb2;
 	int err;
 
+	audit_log_nfcfg("?:0;?:0", 0, net->nft.base_seq,
+			AUDIT_NFT_OP_GEN_REGISTER);
+
 	if (nlmsg_report(nlh) &&
 	    !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
 		return;
-- 
cgit v1.2.3


From 005e696df65d0ff90468ecf38a50aa584dc82421 Mon Sep 17 00:00:00 2001
From: Alexander Popov <alex.popov@linux.com>
Date: Wed, 24 Jun 2020 15:33:26 +0300
Subject: gcc-plugins/stackleak: Don't instrument itself

There is no need to try instrumenting functions in kernel/stackleak.c.
Otherwise that can cause issues if the cleanup pass of stackleak gcc plugin
is disabled.

Signed-off-by: Alexander Popov <alex.popov@linux.com>
Link: https://lore.kernel.org/r/20200624123330.83226-2-alex.popov@linux.com
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 kernel/Makefile | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index f3218bc5ec69..155b5380500a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -125,6 +125,7 @@ obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o
 
 obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o
 
+CFLAGS_stackleak.o += $(DISABLE_STACKLEAK_PLUGIN)
 obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
 KASAN_SANITIZE_stackleak.o := n
 KCSAN_SANITIZE_stackleak.o := n
-- 
cgit v1.2.3


From feee1b8c490821f29aae416a5422795f5a29263d Mon Sep 17 00:00:00 2001
From: Alexander Popov <alex.popov@linux.com>
Date: Wed, 24 Jun 2020 15:33:29 +0300
Subject: gcc-plugins/stackleak: Use asm instrumentation to avoid useless
 register saving

The kernel code instrumentation in stackleak gcc plugin works in two stages.
At first, stack tracking is added to GIMPLE representation of every function
(except some special cases). And later, when stack frame size info is
available, stack tracking is removed from the RTL representation of the
functions with small stack frame. There is an unwanted side-effect for these
functions: some of them do useless work with caller-saved registers.

As an example of such case, proc_sys_write without() instrumentation:
    55                      push   %rbp
    41 b8 01 00 00 00       mov    $0x1,%r8d
    48 89 e5                mov    %rsp,%rbp
    e8 11 ff ff ff          callq  ffffffff81284610 <proc_sys_call_handler>
    5d                      pop    %rbp
    c3                      retq
    0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
    66 2e 0f 1f 84 00 00    nopw   %cs:0x0(%rax,%rax,1)
    00 00 00

proc_sys_write() with instrumentation:
    55                      push   %rbp
    48 89 e5                mov    %rsp,%rbp
    41 56                   push   %r14
    41 55                   push   %r13
    41 54                   push   %r12
    53                      push   %rbx
    49 89 f4                mov    %rsi,%r12
    48 89 fb                mov    %rdi,%rbx
    49 89 d5                mov    %rdx,%r13
    49 89 ce                mov    %rcx,%r14
    4c 89 f1                mov    %r14,%rcx
    4c 89 ea                mov    %r13,%rdx
    4c 89 e6                mov    %r12,%rsi
    48 89 df                mov    %rbx,%rdi
    41 b8 01 00 00 00       mov    $0x1,%r8d
    e8 f2 fe ff ff          callq  ffffffff81298e80 <proc_sys_call_handler>
    5b                      pop    %rbx
    41 5c                   pop    %r12
    41 5d                   pop    %r13
    41 5e                   pop    %r14
    5d                      pop    %rbp
    c3                      retq
    66 0f 1f 84 00 00 00    nopw   0x0(%rax,%rax,1)
    00 00

Let's improve the instrumentation to avoid this:

1. Make stackleak_track_stack() save all register that it works with.
Use no_caller_saved_registers attribute for that function. This attribute
is available for x86_64 and i386 starting from gcc-7.

2. Insert calling stackleak_track_stack() in asm:
  asm volatile("call stackleak_track_stack" :: "r" (current_stack_pointer))
Here we use ASM_CALL_CONSTRAINT trick from arch/x86/include/asm/asm.h.
The input constraint is taken into account during gcc shrink-wrapping
optimization. It is needed to be sure that stackleak_track_stack() call is
inserted after the prologue of the containing function, when the stack
frame is prepared.

This work is a deep reengineering of the idea described on grsecurity blog
  https://grsecurity.net/resolving_an_unfortunate_stackleak_interaction

Signed-off-by: Alexander Popov <alex.popov@linux.com>
Acked-by: Miguel Ojeda <miguel.ojeda.sandonis@gmail.com>
Link: https://lore.kernel.org/r/20200624123330.83226-5-alex.popov@linux.com
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/compiler_attributes.h    |  13 +++
 kernel/stackleak.c                     |  16 +--
 scripts/Makefile.gcc-plugins           |   2 +
 scripts/gcc-plugins/stackleak_plugin.c | 205 ++++++++++++++++++++++++++++-----
 4 files changed, 196 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
index cdf016596659..551ea8cb70b1 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -37,6 +37,7 @@
 # define __GCC4_has_attribute___copy__                0
 # define __GCC4_has_attribute___designated_init__     0
 # define __GCC4_has_attribute___externally_visible__  1
+# define __GCC4_has_attribute___no_caller_saved_registers__ 0
 # define __GCC4_has_attribute___noclone__             1
 # define __GCC4_has_attribute___nonstring__           0
 # define __GCC4_has_attribute___no_sanitize_address__ (__GNUC_MINOR__ >= 8)
@@ -175,6 +176,18 @@
  */
 #define __mode(x)                       __attribute__((__mode__(x)))
 
+/*
+ * Optional: only supported since gcc >= 7
+ *
+ *   gcc: https://gcc.gnu.org/onlinedocs/gcc/x86-Function-Attributes.html#index-no_005fcaller_005fsaved_005fregisters-function-attribute_002c-x86
+ * clang: https://clang.llvm.org/docs/AttributeReference.html#no-caller-saved-registers
+ */
+#if __has_attribute(__no_caller_saved_registers__)
+# define __no_caller_saved_registers	__attribute__((__no_caller_saved_registers__))
+#else
+# define __no_caller_saved_registers
+#endif
+
 /*
  * Optional: not supported by clang
  *
diff --git a/kernel/stackleak.c b/kernel/stackleak.c
index b193a59fc05b..a8fc9ae1d03d 100644
--- a/kernel/stackleak.c
+++ b/kernel/stackleak.c
@@ -104,19 +104,9 @@ asmlinkage void notrace stackleak_erase(void)
 }
 NOKPROBE_SYMBOL(stackleak_erase);
 
-void __used notrace stackleak_track_stack(void)
+void __used __no_caller_saved_registers notrace stackleak_track_stack(void)
 {
-	/*
-	 * N.B. stackleak_erase() fills the kernel stack with the poison value,
-	 * which has the register width. That code assumes that the value
-	 * of 'lowest_stack' is aligned on the register width boundary.
-	 *
-	 * That is true for x86 and x86_64 because of the kernel stack
-	 * alignment on these platforms (for details, see 'cc_stack_align' in
-	 * arch/x86/Makefile). Take care of that when you port STACKLEAK to
-	 * new platforms.
-	 */
-	unsigned long sp = (unsigned long)&sp;
+	unsigned long sp = current_stack_pointer;
 
 	/*
 	 * Having CONFIG_STACKLEAK_TRACK_MIN_SIZE larger than
@@ -125,6 +115,8 @@ void __used notrace stackleak_track_stack(void)
 	 */
 	BUILD_BUG_ON(CONFIG_STACKLEAK_TRACK_MIN_SIZE > STACKLEAK_SEARCH_DEPTH);
 
+	/* 'lowest_stack' should be aligned on the register width boundary */
+	sp = ALIGN(sp, sizeof(unsigned long));
 	if (sp < current->lowest_stack &&
 	    sp >= (unsigned long)task_stack_page(current) +
 						sizeof(unsigned long)) {
diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins
index 5f7df50cfe7a..952e46876329 100644
--- a/scripts/Makefile.gcc-plugins
+++ b/scripts/Makefile.gcc-plugins
@@ -33,6 +33,8 @@ gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STACKLEAK)		\
 		+= -DSTACKLEAK_PLUGIN
 gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STACKLEAK)		\
 		+= -fplugin-arg-stackleak_plugin-track-min-size=$(CONFIG_STACKLEAK_TRACK_MIN_SIZE)
+gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STACKLEAK)		\
+		+= -fplugin-arg-stackleak_plugin-arch=$(SRCARCH)
 ifdef CONFIG_GCC_PLUGIN_STACKLEAK
     DISABLE_STACKLEAK_PLUGIN += -fplugin-arg-stackleak_plugin-disable
 endif
diff --git a/scripts/gcc-plugins/stackleak_plugin.c b/scripts/gcc-plugins/stackleak_plugin.c
index cc75eeba0be1..a18b0d4af456 100644
--- a/scripts/gcc-plugins/stackleak_plugin.c
+++ b/scripts/gcc-plugins/stackleak_plugin.c
@@ -20,7 +20,7 @@
  *
  * Debugging:
  *  - use fprintf() to stderr, debug_generic_expr(), debug_gimple_stmt(),
- *     print_rtl() and print_simple_rtl();
+ *     print_rtl_single() and debug_rtx();
  *  - add "-fdump-tree-all -fdump-rtl-all" to the plugin CFLAGS in
  *     Makefile.gcc-plugins to see the verbose dumps of the gcc passes;
  *  - use gcc -E to understand the preprocessing shenanigans;
@@ -32,6 +32,7 @@
 __visible int plugin_is_GPL_compatible;
 
 static int track_frame_size = -1;
+static bool build_for_x86 = false;
 static const char track_function[] = "stackleak_track_stack";
 
 /*
@@ -43,32 +44,31 @@ static GTY(()) tree track_function_decl;
 static struct plugin_info stackleak_plugin_info = {
 	.version = "201707101337",
 	.help = "track-min-size=nn\ttrack stack for functions with a stack frame size >= nn bytes\n"
+		"arch=target_arch\tspecify target build arch\n"
 		"disable\t\tdo not activate the plugin\n"
 };
 
-static void stackleak_add_track_stack(gimple_stmt_iterator *gsi, bool after)
+static void add_stack_tracking_gcall(gimple_stmt_iterator *gsi, bool after)
 {
 	gimple stmt;
-	gcall *stackleak_track_stack;
+	gcall *gimple_call;
 	cgraph_node_ptr node;
 	basic_block bb;
 
-	/* Insert call to void stackleak_track_stack(void) */
+	/* Insert calling stackleak_track_stack() */
 	stmt = gimple_build_call(track_function_decl, 0);
-	stackleak_track_stack = as_a_gcall(stmt);
-	if (after) {
-		gsi_insert_after(gsi, stackleak_track_stack,
-						GSI_CONTINUE_LINKING);
-	} else {
-		gsi_insert_before(gsi, stackleak_track_stack, GSI_SAME_STMT);
-	}
+	gimple_call = as_a_gcall(stmt);
+	if (after)
+		gsi_insert_after(gsi, gimple_call, GSI_CONTINUE_LINKING);
+	else
+		gsi_insert_before(gsi, gimple_call, GSI_SAME_STMT);
 
 	/* Update the cgraph */
-	bb = gimple_bb(stackleak_track_stack);
+	bb = gimple_bb(gimple_call);
 	node = cgraph_get_create_node(track_function_decl);
 	gcc_assert(node);
 	cgraph_create_edge(cgraph_get_node(current_function_decl), node,
-			stackleak_track_stack, bb->count,
+			gimple_call, bb->count,
 			compute_call_stmt_bb_frequency(current_function_decl, bb));
 }
 
@@ -85,6 +85,79 @@ static bool is_alloca(gimple stmt)
 	return false;
 }
 
+static tree get_current_stack_pointer_decl(void)
+{
+	varpool_node_ptr node;
+
+	FOR_EACH_VARIABLE(node) {
+		tree var = NODE_DECL(node);
+		tree name = DECL_NAME(var);
+
+		if (DECL_NAME_LENGTH(var) != sizeof("current_stack_pointer") - 1)
+			continue;
+
+		if (strcmp(IDENTIFIER_POINTER(name), "current_stack_pointer"))
+			continue;
+
+		return var;
+	}
+
+	return NULL_TREE;
+}
+
+static void add_stack_tracking_gasm(gimple_stmt_iterator *gsi, bool after)
+{
+	gasm *asm_call = NULL;
+	tree sp_decl, input;
+	vec<tree, va_gc> *inputs = NULL;
+
+	/* 'no_caller_saved_registers' is currently supported only for x86 */
+	gcc_assert(build_for_x86);
+
+	/*
+	 * Insert calling stackleak_track_stack() in asm:
+	 *   asm volatile("call stackleak_track_stack"
+	 *		  :: "r" (current_stack_pointer))
+	 * Use ASM_CALL_CONSTRAINT trick from arch/x86/include/asm/asm.h.
+	 * This constraint is taken into account during gcc shrink-wrapping
+	 * optimization. It is needed to be sure that stackleak_track_stack()
+	 * call is inserted after the prologue of the containing function,
+	 * when the stack frame is prepared.
+	 */
+	sp_decl = get_current_stack_pointer_decl();
+	if (sp_decl == NULL_TREE) {
+		add_stack_tracking_gcall(gsi, after);
+		return;
+	}
+	input = build_tree_list(NULL_TREE, build_const_char_string(2, "r"));
+	input = chainon(NULL_TREE, build_tree_list(input, sp_decl));
+	vec_safe_push(inputs, input);
+	asm_call = gimple_build_asm_vec("call stackleak_track_stack",
+					inputs, NULL, NULL, NULL);
+	gimple_asm_set_volatile(asm_call, true);
+	if (after)
+		gsi_insert_after(gsi, asm_call, GSI_CONTINUE_LINKING);
+	else
+		gsi_insert_before(gsi, asm_call, GSI_SAME_STMT);
+	update_stmt(asm_call);
+}
+
+static void add_stack_tracking(gimple_stmt_iterator *gsi, bool after)
+{
+	/*
+	 * The 'no_caller_saved_registers' attribute is used for
+	 * stackleak_track_stack(). If the compiler supports this attribute for
+	 * the target arch, we can add calling stackleak_track_stack() in asm.
+	 * That improves performance: we avoid useless operations with the
+	 * caller-saved registers in the functions from which we will remove
+	 * stackleak_track_stack() call during the stackleak_cleanup pass.
+	 */
+	if (lookup_attribute_spec(get_identifier("no_caller_saved_registers")))
+		add_stack_tracking_gasm(gsi, after);
+	else
+		add_stack_tracking_gcall(gsi, after);
+}
+
 /*
  * Work with the GIMPLE representation of the code. Insert the
  * stackleak_track_stack() call after alloca() and into the beginning
@@ -94,7 +167,7 @@ static unsigned int stackleak_instrument_execute(void)
 {
 	basic_block bb, entry_bb;
 	bool prologue_instrumented = false, is_leaf = true;
-	gimple_stmt_iterator gsi;
+	gimple_stmt_iterator gsi = { 0 };
 
 	/*
 	 * ENTRY_BLOCK_PTR is a basic block which represents possible entry
@@ -123,7 +196,7 @@ static unsigned int stackleak_instrument_execute(void)
 				continue;
 
 			/* Insert stackleak_track_stack() call after alloca() */
-			stackleak_add_track_stack(&gsi, true);
+			add_stack_tracking(&gsi, true);
 			if (bb == entry_bb)
 				prologue_instrumented = true;
 		}
@@ -168,7 +241,7 @@ static unsigned int stackleak_instrument_execute(void)
 		bb = single_succ(ENTRY_BLOCK_PTR_FOR_FN(cfun));
 	}
 	gsi = gsi_after_labels(bb);
-	stackleak_add_track_stack(&gsi, false);
+	add_stack_tracking(&gsi, false);
 
 	return 0;
 }
@@ -182,21 +255,10 @@ static bool large_stack_frame(void)
 #endif
 }
 
-/*
- * Work with the RTL representation of the code.
- * Remove the unneeded stackleak_track_stack() calls from the functions
- * which don't call alloca() and don't have a large enough stack frame size.
- */
-static unsigned int stackleak_cleanup_execute(void)
+static void remove_stack_tracking_gcall(void)
 {
 	rtx_insn *insn, *next;
 
-	if (cfun->calls_alloca)
-		return 0;
-
-	if (large_stack_frame())
-		return 0;
-
 	/*
 	 * Find stackleak_track_stack() calls. Loop through the chain of insns,
 	 * which is an RTL representation of the code for a function.
@@ -257,6 +319,84 @@ static unsigned int stackleak_cleanup_execute(void)
 		}
 #endif
 	}
+}
+
+static bool remove_stack_tracking_gasm(void)
+{
+	bool removed = false;
+	rtx_insn *insn, *next;
+
+	/* 'no_caller_saved_registers' is currently supported only for x86 */
+	gcc_assert(build_for_x86);
+
+	/*
+	 * Find stackleak_track_stack() asm calls. Loop through the chain of
+	 * insns, which is an RTL representation of the code for a function.
+	 *
+	 * The example of a matching insn:
+	 *  (insn 11 5 12 2 (parallel [ (asm_operands/v
+	 *  ("call stackleak_track_stack") ("") 0
+	 *  [ (reg/v:DI 7 sp [ current_stack_pointer ]) ]
+	 *  [ (asm_input:DI ("r")) ] [])
+	 *  (clobber (reg:CC 17 flags)) ]) -1 (nil))
+	 */
+	for (insn = get_insns(); insn; insn = next) {
+		rtx body;
+
+		next = NEXT_INSN(insn);
+
+		/* Check the expression code of the insn */
+		if (!NONJUMP_INSN_P(insn))
+			continue;
+
+		/*
+		 * Check the expression code of the insn body, which is an RTL
+		 * Expression (RTX) describing the side effect performed by
+		 * that insn.
+		 */
+		body = PATTERN(insn);
+
+		if (GET_CODE(body) != PARALLEL)
+			continue;
+
+		body = XVECEXP(body, 0, 0);
+
+		if (GET_CODE(body) != ASM_OPERANDS)
+			continue;
+
+		if (strcmp(ASM_OPERANDS_TEMPLATE(body),
+						"call stackleak_track_stack")) {
+			continue;
+		}
+
+		delete_insn_and_edges(insn);
+		gcc_assert(!removed);
+		removed = true;
+	}
+
+	return removed;
+}
+
+/*
+ * Work with the RTL representation of the code.
+ * Remove the unneeded stackleak_track_stack() calls from the functions
+ * which don't call alloca() and don't have a large enough stack frame size.
+ */
+static unsigned int stackleak_cleanup_execute(void)
+{
+	bool removed = false;
+
+	if (cfun->calls_alloca)
+		return 0;
+
+	if (large_stack_frame())
+		return 0;
+
+	if (lookup_attribute_spec(get_identifier("no_caller_saved_registers")))
+		removed = remove_stack_tracking_gasm();
+
+	if (!removed)
+		remove_stack_tracking_gcall();
 
 	return 0;
 }
@@ -392,6 +532,15 @@ __visible int plugin_init(struct plugin_name_args *plugin_info,
 					plugin_name, argv[i].key, argv[i].value);
 				return 1;
 			}
+		} else if (!strcmp(argv[i].key, "arch")) {
+			if (!argv[i].value) {
+				error(G_("no value supplied for option '-fplugin-arg-%s-%s'"),
+					plugin_name, argv[i].key);
+				return 1;
+			}
+
+			if (!strcmp(argv[i].value, "x86"))
+				build_for_x86 = true;
 		} else {
 			error(G_("unknown option '-fplugin-arg-%s-%s'"),
 					plugin_name, argv[i].key);
-- 
cgit v1.2.3


From 521b512b157a1315ff2bf11c11ab184c79515aea Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Wed, 27 May 2020 10:58:47 +0100
Subject: PM / EM: change naming convention from 'capacity' to 'performance'

The Energy Model uses concept of performance domain and capacity states in
order to calculate power used by CPUs. Change naming convention from
capacity to performance state would enable wider usage in future, e.g.
upcoming support for other devices other than CPUs.

Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Acked-by: Quentin Perret <qperret@google.com>
Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/cpufreq_cooling.c | 12 +++---
 include/linux/energy_model.h      | 86 +++++++++++++++++++++------------------
 kernel/power/energy_model.c       | 44 ++++++++++----------
 kernel/sched/topology.c           | 20 ++++-----
 4 files changed, 84 insertions(+), 78 deletions(-)

(limited to 'kernel')

diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
index 9e124020519f..641995ebc107 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -333,18 +333,18 @@ static inline bool em_is_sane(struct cpufreq_cooling_device *cpufreq_cdev,
 		return false;
 
 	policy = cpufreq_cdev->policy;
-	if (!cpumask_equal(policy->related_cpus, to_cpumask(em->cpus))) {
+	if (!cpumask_equal(policy->related_cpus, em_span_cpus(em))) {
 		pr_err("The span of pd %*pbl is misaligned with cpufreq policy %*pbl\n",
-			cpumask_pr_args(to_cpumask(em->cpus)),
+			cpumask_pr_args(em_span_cpus(em)),
 			cpumask_pr_args(policy->related_cpus));
 		return false;
 	}
 
 	nr_levels = cpufreq_cdev->max_level + 1;
-	if (em->nr_cap_states != nr_levels) {
-		pr_err("The number of cap states in pd %*pbl (%u) doesn't match the number of cooling levels (%u)\n",
-			cpumask_pr_args(to_cpumask(em->cpus)),
-			em->nr_cap_states, nr_levels);
+	if (em_pd_nr_perf_states(em) != nr_levels) {
+		pr_err("The number of performance states in pd %*pbl (%u) doesn't match the number of cooling levels (%u)\n",
+			cpumask_pr_args(em_span_cpus(em)),
+			em_pd_nr_perf_states(em), nr_levels);
 		return false;
 	}
 
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index ade6486a3382..fe336a9eb5d4 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -10,13 +10,13 @@
 #include <linux/types.h>
 
 /**
- * em_cap_state - Capacity state of a performance domain
+ * em_perf_state - Performance state of a performance domain
  * @frequency:	The CPU frequency in KHz, for consistency with CPUFreq
  * @power:	The power consumed by 1 CPU at this level, in milli-watts
  * @cost:	The cost coefficient associated with this level, used during
  *		energy calculation. Equal to: power * max_frequency / frequency
  */
-struct em_cap_state {
+struct em_perf_state {
 	unsigned long frequency;
 	unsigned long power;
 	unsigned long cost;
@@ -24,8 +24,8 @@ struct em_cap_state {
 
 /**
  * em_perf_domain - Performance domain
- * @table:		List of capacity states, in ascending order
- * @nr_cap_states:	Number of capacity states
+ * @table:		List of performance states, in ascending order
+ * @nr_perf_states:	Number of performance states
  * @cpus:		Cpumask covering the CPUs of the domain
  *
  * A "performance domain" represents a group of CPUs whose performance is
@@ -34,22 +34,27 @@ struct em_cap_state {
  * CPUFreq policies.
  */
 struct em_perf_domain {
-	struct em_cap_state *table;
-	int nr_cap_states;
+	struct em_perf_state *table;
+	int nr_perf_states;
 	unsigned long cpus[];
 };
 
+#define em_span_cpus(em) (to_cpumask((em)->cpus))
+
 #ifdef CONFIG_ENERGY_MODEL
 #define EM_CPU_MAX_POWER 0xFFFF
 
 struct em_data_callback {
 	/**
-	 * active_power() - Provide power at the next capacity state of a CPU
-	 * @power	: Active power at the capacity state in mW (modified)
-	 * @freq	: Frequency at the capacity state in kHz (modified)
+	 * active_power() - Provide power at the next performance state of
+	 *		a CPU
+	 * @power	: Active power at the performance state in mW
+	 *		(modified)
+	 * @freq	: Frequency at the performance state in kHz
+	 *		(modified)
 	 * @cpu		: CPU for which we do this operation
 	 *
-	 * active_power() must find the lowest capacity state of 'cpu' above
+	 * active_power() must find the lowest performance state of 'cpu' above
 	 * 'freq' and update 'power' and 'freq' to the matching active power
 	 * and frequency.
 	 *
@@ -80,46 +85,46 @@ static inline unsigned long em_pd_energy(struct em_perf_domain *pd,
 				unsigned long max_util, unsigned long sum_util)
 {
 	unsigned long freq, scale_cpu;
-	struct em_cap_state *cs;
+	struct em_perf_state *ps;
 	int i, cpu;
 
 	/*
-	 * In order to predict the capacity state, map the utilization of the
-	 * most utilized CPU of the performance domain to a requested frequency,
-	 * like schedutil.
+	 * In order to predict the performance state, map the utilization of
+	 * the most utilized CPU of the performance domain to a requested
+	 * frequency, like schedutil.
 	 */
 	cpu = cpumask_first(to_cpumask(pd->cpus));
 	scale_cpu = arch_scale_cpu_capacity(cpu);
-	cs = &pd->table[pd->nr_cap_states - 1];
-	freq = map_util_freq(max_util, cs->frequency, scale_cpu);
+	ps = &pd->table[pd->nr_perf_states - 1];
+	freq = map_util_freq(max_util, ps->frequency, scale_cpu);
 
 	/*
-	 * Find the lowest capacity state of the Energy Model above the
+	 * Find the lowest performance state of the Energy Model above the
 	 * requested frequency.
 	 */
-	for (i = 0; i < pd->nr_cap_states; i++) {
-		cs = &pd->table[i];
-		if (cs->frequency >= freq)
+	for (i = 0; i < pd->nr_perf_states; i++) {
+		ps = &pd->table[i];
+		if (ps->frequency >= freq)
 			break;
 	}
 
 	/*
-	 * The capacity of a CPU in the domain at that capacity state (cs)
+	 * The capacity of a CPU in the domain at the performance state (ps)
 	 * can be computed as:
 	 *
-	 *             cs->freq * scale_cpu
-	 *   cs->cap = --------------------                          (1)
+	 *             ps->freq * scale_cpu
+	 *   ps->cap = --------------------                          (1)
 	 *                 cpu_max_freq
 	 *
 	 * So, ignoring the costs of idle states (which are not available in
-	 * the EM), the energy consumed by this CPU at that capacity state is
-	 * estimated as:
+	 * the EM), the energy consumed by this CPU at that performance state
+	 * is estimated as:
 	 *
-	 *             cs->power * cpu_util
+	 *             ps->power * cpu_util
 	 *   cpu_nrg = --------------------                          (2)
-	 *                   cs->cap
+	 *                   ps->cap
 	 *
-	 * since 'cpu_util / cs->cap' represents its percentage of busy time.
+	 * since 'cpu_util / ps->cap' represents its percentage of busy time.
 	 *
 	 *   NOTE: Although the result of this computation actually is in
 	 *         units of power, it can be manipulated as an energy value
@@ -129,34 +134,35 @@ static inline unsigned long em_pd_energy(struct em_perf_domain *pd,
 	 * By injecting (1) in (2), 'cpu_nrg' can be re-expressed as a product
 	 * of two terms:
 	 *
-	 *             cs->power * cpu_max_freq   cpu_util
+	 *             ps->power * cpu_max_freq   cpu_util
 	 *   cpu_nrg = ------------------------ * ---------          (3)
-	 *                    cs->freq            scale_cpu
+	 *                    ps->freq            scale_cpu
 	 *
-	 * The first term is static, and is stored in the em_cap_state struct
-	 * as 'cs->cost'.
+	 * The first term is static, and is stored in the em_perf_state struct
+	 * as 'ps->cost'.
 	 *
 	 * Since all CPUs of the domain have the same micro-architecture, they
-	 * share the same 'cs->cost', and the same CPU capacity. Hence, the
+	 * share the same 'ps->cost', and the same CPU capacity. Hence, the
 	 * total energy of the domain (which is the simple sum of the energy of
 	 * all of its CPUs) can be factorized as:
 	 *
-	 *            cs->cost * \Sum cpu_util
+	 *            ps->cost * \Sum cpu_util
 	 *   pd_nrg = ------------------------                       (4)
 	 *                  scale_cpu
 	 */
-	return cs->cost * sum_util / scale_cpu;
+	return ps->cost * sum_util / scale_cpu;
 }
 
 /**
- * em_pd_nr_cap_states() - Get the number of capacity states of a perf. domain
+ * em_pd_nr_perf_states() - Get the number of performance states of a perf.
+ *				domain
  * @pd		: performance domain for which this must be done
  *
- * Return: the number of capacity states in the performance domain table
+ * Return: the number of performance states in the performance domain table
  */
-static inline int em_pd_nr_cap_states(struct em_perf_domain *pd)
+static inline int em_pd_nr_perf_states(struct em_perf_domain *pd)
 {
-	return pd->nr_cap_states;
+	return pd->nr_perf_states;
 }
 
 #else
@@ -177,7 +183,7 @@ static inline unsigned long em_pd_energy(struct em_perf_domain *pd,
 {
 	return 0;
 }
-static inline int em_pd_nr_cap_states(struct em_perf_domain *pd)
+static inline int em_pd_nr_perf_states(struct em_perf_domain *pd)
 {
 	return 0;
 }
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 0a9326f5f421..9892d548a0fa 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -27,18 +27,18 @@ static DEFINE_MUTEX(em_pd_mutex);
 #ifdef CONFIG_DEBUG_FS
 static struct dentry *rootdir;
 
-static void em_debug_create_cs(struct em_cap_state *cs, struct dentry *pd)
+static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd)
 {
 	struct dentry *d;
 	char name[24];
 
-	snprintf(name, sizeof(name), "cs:%lu", cs->frequency);
+	snprintf(name, sizeof(name), "ps:%lu", ps->frequency);
 
-	/* Create per-cs directory */
+	/* Create per-ps directory */
 	d = debugfs_create_dir(name, pd);
-	debugfs_create_ulong("frequency", 0444, d, &cs->frequency);
-	debugfs_create_ulong("power", 0444, d, &cs->power);
-	debugfs_create_ulong("cost", 0444, d, &cs->cost);
+	debugfs_create_ulong("frequency", 0444, d, &ps->frequency);
+	debugfs_create_ulong("power", 0444, d, &ps->power);
+	debugfs_create_ulong("cost", 0444, d, &ps->cost);
 }
 
 static int em_debug_cpus_show(struct seq_file *s, void *unused)
@@ -62,9 +62,9 @@ static void em_debug_create_pd(struct em_perf_domain *pd, int cpu)
 
 	debugfs_create_file("cpus", 0444, d, pd->cpus, &em_debug_cpus_fops);
 
-	/* Create a sub-directory for each capacity state */
-	for (i = 0; i < pd->nr_cap_states; i++)
-		em_debug_create_cs(&pd->table[i], d);
+	/* Create a sub-directory for each performance state */
+	for (i = 0; i < pd->nr_perf_states; i++)
+		em_debug_create_ps(&pd->table[i], d);
 }
 
 static int __init em_debug_init(void)
@@ -84,7 +84,7 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
 	unsigned long opp_eff, prev_opp_eff = ULONG_MAX;
 	unsigned long power, freq, prev_freq = 0;
 	int i, ret, cpu = cpumask_first(span);
-	struct em_cap_state *table;
+	struct em_perf_state *table;
 	struct em_perf_domain *pd;
 	u64 fmax;
 
@@ -99,26 +99,26 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
 	if (!table)
 		goto free_pd;
 
-	/* Build the list of capacity states for this performance domain */
+	/* Build the list of performance states for this performance domain */
 	for (i = 0, freq = 0; i < nr_states; i++, freq++) {
 		/*
 		 * active_power() is a driver callback which ceils 'freq' to
-		 * lowest capacity state of 'cpu' above 'freq' and updates
+		 * lowest performance state of 'cpu' above 'freq' and updates
 		 * 'power' and 'freq' accordingly.
 		 */
 		ret = cb->active_power(&power, &freq, cpu);
 		if (ret) {
-			pr_err("pd%d: invalid cap. state: %d\n", cpu, ret);
-			goto free_cs_table;
+			pr_err("pd%d: invalid perf. state: %d\n", cpu, ret);
+			goto free_ps_table;
 		}
 
 		/*
 		 * We expect the driver callback to increase the frequency for
-		 * higher capacity states.
+		 * higher performance states.
 		 */
 		if (freq <= prev_freq) {
 			pr_err("pd%d: non-increasing freq: %lu\n", cpu, freq);
-			goto free_cs_table;
+			goto free_ps_table;
 		}
 
 		/*
@@ -127,7 +127,7 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
 		 */
 		if (!power || power > EM_CPU_MAX_POWER) {
 			pr_err("pd%d: invalid power: %lu\n", cpu, power);
-			goto free_cs_table;
+			goto free_ps_table;
 		}
 
 		table[i].power = power;
@@ -141,12 +141,12 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
 		 */
 		opp_eff = freq / power;
 		if (opp_eff >= prev_opp_eff)
-			pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_cap_state %d >= em_cap_state%d\n",
+			pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state%d\n",
 					cpu, i, i - 1);
 		prev_opp_eff = opp_eff;
 	}
 
-	/* Compute the cost of each capacity_state. */
+	/* Compute the cost of each performance state. */
 	fmax = (u64) table[nr_states - 1].frequency;
 	for (i = 0; i < nr_states; i++) {
 		table[i].cost = div64_u64(fmax * table[i].power,
@@ -154,14 +154,14 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
 	}
 
 	pd->table = table;
-	pd->nr_cap_states = nr_states;
+	pd->nr_perf_states = nr_states;
 	cpumask_copy(to_cpumask(pd->cpus), span);
 
 	em_debug_create_pd(pd, cpu);
 
 	return pd;
 
-free_cs_table:
+free_ps_table:
 	kfree(table);
 free_pd:
 	kfree(pd);
@@ -185,7 +185,7 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
 /**
  * em_register_perf_domain() - Register the Energy Model of a performance domain
  * @span	: Mask of CPUs in the performance domain
- * @nr_states	: Number of capacity states to register
+ * @nr_states	: Number of performance states to register
  * @cb		: Callback functions providing the data of the Energy Model
  *
  * Create Energy Model tables for a performance domain using the callbacks
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ba81187bb7af..2f91d3126365 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -272,10 +272,10 @@ static void perf_domain_debug(const struct cpumask *cpu_map,
 	printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
 
 	while (pd) {
-		printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }",
+		printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }",
 				cpumask_first(perf_domain_span(pd)),
 				cpumask_pr_args(perf_domain_span(pd)),
-				em_pd_nr_cap_states(pd->em_pd));
+				em_pd_nr_perf_states(pd->em_pd));
 		pd = pd->next;
 	}
 
@@ -313,26 +313,26 @@ static void sched_energy_set(bool has_eas)
  *
  * The complexity of the Energy Model is defined as:
  *
- *              C = nr_pd * (nr_cpus + nr_cs)
+ *              C = nr_pd * (nr_cpus + nr_ps)
  *
  * with parameters defined as:
  *  - nr_pd:    the number of performance domains
  *  - nr_cpus:  the number of CPUs
- *  - nr_cs:    the sum of the number of capacity states of all performance
+ *  - nr_ps:    the sum of the number of performance states of all performance
  *              domains (for example, on a system with 2 performance domains,
- *              with 10 capacity states each, nr_cs = 2 * 10 = 20).
+ *              with 10 performance states each, nr_ps = 2 * 10 = 20).
  *
  * It is generally not a good idea to use such a model in the wake-up path on
  * very complex platforms because of the associated scheduling overheads. The
  * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
- * with per-CPU DVFS and less than 8 capacity states each, for example.
+ * with per-CPU DVFS and less than 8 performance states each, for example.
  */
 #define EM_MAX_COMPLEXITY 2048
 
 extern struct cpufreq_governor schedutil_gov;
 static bool build_perf_domains(const struct cpumask *cpu_map)
 {
-	int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map);
+	int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
 	struct perf_domain *pd = NULL, *tmp;
 	int cpu = cpumask_first(cpu_map);
 	struct root_domain *rd = cpu_rq(cpu)->rd;
@@ -384,15 +384,15 @@ static bool build_perf_domains(const struct cpumask *cpu_map)
 		pd = tmp;
 
 		/*
-		 * Count performance domains and capacity states for the
+		 * Count performance domains and performance states for the
 		 * complexity check.
 		 */
 		nr_pd++;
-		nr_cs += em_pd_nr_cap_states(pd->em_pd);
+		nr_ps += em_pd_nr_perf_states(pd->em_pd);
 	}
 
 	/* Bail out if the Energy Model complexity is too high. */
-	if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) {
+	if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
 		WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
 						cpumask_pr_args(cpu_map));
 		goto free;
-- 
cgit v1.2.3


From 7d9895c7fbfc9c70afce7029b7de0f3f974adb88 Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Wed, 27 May 2020 10:58:48 +0100
Subject: PM / EM: introduce em_dev_register_perf_domain function

Add now function in the Energy Model framework which is going to support
new devices. This function will help in transition and make it smoother.
For now it still checks if the cpumask is a valid pointer, which will be
removed later when the new structures and infrastructure will be ready.

Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Acked-by: Quentin Perret <qperret@google.com>
Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/energy_model.h | 13 +++++++++++--
 kernel/power/energy_model.c  | 40 ++++++++++++++++++++++++++++++++++------
 2 files changed, 45 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index fe336a9eb5d4..7c048df98447 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_ENERGY_MODEL_H
 #define _LINUX_ENERGY_MODEL_H
 #include <linux/cpumask.h>
+#include <linux/device.h>
 #include <linux/jump_label.h>
 #include <linux/kobject.h>
 #include <linux/rcupdate.h>
@@ -42,7 +43,7 @@ struct em_perf_domain {
 #define em_span_cpus(em) (to_cpumask((em)->cpus))
 
 #ifdef CONFIG_ENERGY_MODEL
-#define EM_CPU_MAX_POWER 0xFFFF
+#define EM_MAX_POWER 0xFFFF
 
 struct em_data_callback {
 	/**
@@ -59,7 +60,7 @@ struct em_data_callback {
 	 * and frequency.
 	 *
 	 * The power is the one of a single CPU in the domain, expressed in
-	 * milli-watts. It is expected to fit in the [0, EM_CPU_MAX_POWER]
+	 * milli-watts. It is expected to fit in the [0, EM_MAX_POWER]
 	 * range.
 	 *
 	 * Return 0 on success.
@@ -71,6 +72,8 @@ struct em_data_callback {
 struct em_perf_domain *em_cpu_get(int cpu);
 int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
 						struct em_data_callback *cb);
+int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
+				struct em_data_callback *cb, cpumask_t *span);
 
 /**
  * em_pd_energy() - Estimates the energy consumed by the CPUs of a perf. domain
@@ -174,6 +177,12 @@ static inline int em_register_perf_domain(cpumask_t *span,
 {
 	return -EINVAL;
 }
+static inline
+int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
+				struct em_data_callback *cb, cpumask_t *span)
+{
+	return -EINVAL;
+}
 static inline struct em_perf_domain *em_cpu_get(int cpu)
 {
 	return NULL;
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 9892d548a0fa..875b163e54ab 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -125,7 +125,7 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
 		 * The power returned by active_state() is expected to be
 		 * positive, in milli-watts and to fit into 16 bits.
 		 */
-		if (!power || power > EM_CPU_MAX_POWER) {
+		if (!power || power > EM_MAX_POWER) {
 			pr_err("pd%d: invalid power: %lu\n", cpu, power);
 			goto free_ps_table;
 		}
@@ -183,10 +183,13 @@ struct em_perf_domain *em_cpu_get(int cpu)
 EXPORT_SYMBOL_GPL(em_cpu_get);
 
 /**
- * em_register_perf_domain() - Register the Energy Model of a performance domain
- * @span	: Mask of CPUs in the performance domain
+ * em_dev_register_perf_domain() - Register the Energy Model (EM) for a device
+ * @dev		: Device for which the EM is to register
  * @nr_states	: Number of performance states to register
  * @cb		: Callback functions providing the data of the Energy Model
+ * @span	: Pointer to cpumask_t, which in case of a CPU device is
+ *		obligatory. It can be taken from i.e. 'policy->cpus'. For other
+ *		type of devices this should be set to NULL.
  *
  * Create Energy Model tables for a performance domain using the callbacks
  * defined in cb.
@@ -196,14 +199,14 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
  *
  * Return 0 on success
  */
-int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
-						struct em_data_callback *cb)
+int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
+				struct em_data_callback *cb, cpumask_t *span)
 {
 	unsigned long cap, prev_cap = 0;
 	struct em_perf_domain *pd;
 	int cpu, ret = 0;
 
-	if (!span || !nr_states || !cb)
+	if (!dev || !span || !nr_states || !cb)
 		return -EINVAL;
 
 	/*
@@ -255,4 +258,29 @@ unlock:
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
+
+/**
+ * em_register_perf_domain() - Register the Energy Model of a performance domain
+ * @span	: Mask of CPUs in the performance domain
+ * @nr_states	: Number of capacity states to register
+ * @cb		: Callback functions providing the data of the Energy Model
+ *
+ * Create Energy Model tables for a performance domain using the callbacks
+ * defined in cb.
+ *
+ * If multiple clients register the same performance domain, all but the first
+ * registration will be ignored.
+ *
+ * Return 0 on success
+ */
+int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
+						struct em_data_callback *cb)
+{
+	struct device *cpu_dev;
+
+	cpu_dev = get_cpu_device(cpumask_first(span));
+
+	return em_dev_register_perf_domain(cpu_dev, nr_states, cb, span);
+}
 EXPORT_SYMBOL_GPL(em_register_perf_domain);
-- 
cgit v1.2.3


From d0351cc3b0f57214d157e4d589564730af2aedae Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Wed, 27 May 2020 10:58:49 +0100
Subject: PM / EM: update callback structure and add device pointer

The Energy Model framework is going to support devices other that CPUs. In
order to make this happen change the callback function and add pointer to
a device as an argument.

Update the related users to use new function and new callback from the
Energy Model.

Acked-by: Quentin Perret <qperret@google.com>
Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/scmi-cpufreq.c | 11 +++--------
 drivers/opp/of.c               |  9 ++-------
 include/linux/energy_model.h   | 15 ++++++++-------
 kernel/power/energy_model.c    |  9 +++++----
 4 files changed, 18 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c
index 61623e2ff149..11ee24e06d12 100644
--- a/drivers/cpufreq/scmi-cpufreq.c
+++ b/drivers/cpufreq/scmi-cpufreq.c
@@ -103,17 +103,12 @@ scmi_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask)
 }
 
 static int __maybe_unused
-scmi_get_cpu_power(unsigned long *power, unsigned long *KHz, int cpu)
+scmi_get_cpu_power(unsigned long *power, unsigned long *KHz,
+		   struct device *cpu_dev)
 {
-	struct device *cpu_dev = get_cpu_device(cpu);
 	unsigned long Hz;
 	int ret, domain;
 
-	if (!cpu_dev) {
-		pr_err("failed to get cpu%d device\n", cpu);
-		return -ENODEV;
-	}
-
 	domain = handle->perf_ops->device_domain_id(cpu_dev);
 	if (domain < 0)
 		return domain;
@@ -200,7 +195,7 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy)
 
 	policy->fast_switch_possible = true;
 
-	em_register_perf_domain(policy->cpus, nr_opp, &em_cb);
+	em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, policy->cpus);
 
 	return 0;
 
diff --git a/drivers/opp/of.c b/drivers/opp/of.c
index 9a5873591a40..e273f419a4bf 100644
--- a/drivers/opp/of.c
+++ b/drivers/opp/of.c
@@ -1216,9 +1216,8 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_of_node);
  * calculation failed because of missing parameters, 0 otherwise.
  */
 static int __maybe_unused _get_cpu_power(unsigned long *mW, unsigned long *kHz,
-					 int cpu)
+					 struct device *cpu_dev)
 {
-	struct device *cpu_dev;
 	struct dev_pm_opp *opp;
 	struct device_node *np;
 	unsigned long mV, Hz;
@@ -1226,10 +1225,6 @@ static int __maybe_unused _get_cpu_power(unsigned long *mW, unsigned long *kHz,
 	u64 tmp;
 	int ret;
 
-	cpu_dev = get_cpu_device(cpu);
-	if (!cpu_dev)
-		return -ENODEV;
-
 	np = of_node_get(cpu_dev->of_node);
 	if (!np)
 		return -EINVAL;
@@ -1297,6 +1292,6 @@ void dev_pm_opp_of_register_em(struct cpumask *cpus)
 	if (ret || !cap)
 		return;
 
-	em_register_perf_domain(cpus, nr_opp, &em_cb);
+	em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, cpus);
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_of_register_em);
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 7c048df98447..7076cb22b247 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -48,24 +48,25 @@ struct em_perf_domain {
 struct em_data_callback {
 	/**
 	 * active_power() - Provide power at the next performance state of
-	 *		a CPU
+	 *		a device
 	 * @power	: Active power at the performance state in mW
 	 *		(modified)
 	 * @freq	: Frequency at the performance state in kHz
 	 *		(modified)
-	 * @cpu		: CPU for which we do this operation
+	 * @dev		: Device for which we do this operation (can be a CPU)
 	 *
-	 * active_power() must find the lowest performance state of 'cpu' above
+	 * active_power() must find the lowest performance state of 'dev' above
 	 * 'freq' and update 'power' and 'freq' to the matching active power
 	 * and frequency.
 	 *
-	 * The power is the one of a single CPU in the domain, expressed in
-	 * milli-watts. It is expected to fit in the [0, EM_MAX_POWER]
-	 * range.
+	 * In case of CPUs, the power is the one of a single CPU in the domain,
+	 * expressed in milli-watts. It is expected to fit in the
+	 * [0, EM_MAX_POWER] range.
 	 *
 	 * Return 0 on success.
 	 */
-	int (*active_power)(unsigned long *power, unsigned long *freq, int cpu);
+	int (*active_power)(unsigned long *power, unsigned long *freq,
+			    struct device *dev);
 };
 #define EM_DATA_CB(_active_power_cb) { .active_power = &_active_power_cb }
 
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 875b163e54ab..5b8a1566526a 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -78,8 +78,9 @@ core_initcall(em_debug_init);
 #else /* CONFIG_DEBUG_FS */
 static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) {}
 #endif
-static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
-						struct em_data_callback *cb)
+static struct em_perf_domain *
+em_create_pd(struct device *dev, int nr_states, struct em_data_callback *cb,
+	     cpumask_t *span)
 {
 	unsigned long opp_eff, prev_opp_eff = ULONG_MAX;
 	unsigned long power, freq, prev_freq = 0;
@@ -106,7 +107,7 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
 		 * lowest performance state of 'cpu' above 'freq' and updates
 		 * 'power' and 'freq' accordingly.
 		 */
-		ret = cb->active_power(&power, &freq, cpu);
+		ret = cb->active_power(&power, &freq, dev);
 		if (ret) {
 			pr_err("pd%d: invalid perf. state: %d\n", cpu, ret);
 			goto free_ps_table;
@@ -237,7 +238,7 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 	}
 
 	/* Create the performance domain and add it to the Energy Model. */
-	pd = em_create_pd(span, nr_states, cb);
+	pd = em_create_pd(dev, nr_states, cb, span);
 	if (!pd) {
 		ret = -EINVAL;
 		goto unlock;
-- 
cgit v1.2.3


From a67549c8e568627290234e9fbe833cb9dfd36b55 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Fri, 19 Jun 2020 20:47:26 +0000
Subject: blktrace: annotate required lock on do_blk_trace_setup()

Ensure it is clear which lock is required on do_blk_trace_setup().

Suggested-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/trace/blktrace.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 5ef0484513ec..5a88a6b55933 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -483,6 +483,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	struct dentry *dir = NULL;
 	int ret;
 
+	lockdep_assert_held(&q->blk_trace_mutex);
+
 	if (!buts->buf_size || !buts->buf_nr)
 		return -EINVAL;
 
-- 
cgit v1.2.3


From bad8e64fb19d3a0de5e564d9a7271c31bd684369 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Fri, 19 Jun 2020 20:47:28 +0000
Subject: blktrace: fix debugfs use after free

On commit 6ac93117ab00 ("blktrace: use existing disk debugfs directory")
merged on v4.12 Omar fixed the original blktrace code for request-based
drivers (multiqueue). This however left in place a possible crash, if you
happen to abuse blktrace while racing to remove / add a device.

We used to use asynchronous removal of the request_queue, and with that
the issue was easier to reproduce. Now that we have reverted to
synchronous removal of the request_queue, the issue is still possible to
reproduce, its however just a bit more difficult.

We essentially run two instances of break-blktrace which add/remove
a loop device, and setup a blktrace and just never tear the blktrace
down. We do this twice in parallel. This is easily reproduced with the
script run_0004.sh from break-blktrace [0].

We can end up with two types of panics each reflecting where we
race, one a failed blktrace setup:

[  252.426751] debugfs: Directory 'loop0' with parent 'block' already present!
[  252.432265] BUG: kernel NULL pointer dereference, address: 00000000000000a0
[  252.436592] #PF: supervisor write access in kernel mode
[  252.439822] #PF: error_code(0x0002) - not-present page
[  252.442967] PGD 0 P4D 0
[  252.444656] Oops: 0002 [#1] SMP NOPTI
[  252.446972] CPU: 10 PID: 1153 Comm: break-blktrace Tainted: G            E     5.7.0-rc2-next-20200420+ #164
[  252.452673] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014
[  252.456343] RIP: 0010:down_write+0x15/0x40
[  252.458146] Code: eb ca e8 ae 22 8d ff cc cc cc cc cc cc cc cc cc cc cc cc
               cc cc 0f 1f 44 00 00 55 48 89 fd e8 52 db ff ff 31 c0 ba 01 00
               00 00 <f0> 48 0f b1 55 00 75 0f 48 8b 04 25 c0 8b 01 00 48 89
               45 08 5d
[  252.463638] RSP: 0018:ffffa626415abcc8 EFLAGS: 00010246
[  252.464950] RAX: 0000000000000000 RBX: ffff958c25f0f5c0 RCX: ffffff8100000000
[  252.466727] RDX: 0000000000000001 RSI: ffffff8100000000 RDI: 00000000000000a0
[  252.468482] RBP: 00000000000000a0 R08: 0000000000000000 R09: 0000000000000001
[  252.470014] R10: 0000000000000000 R11: ffff958d1f9227ff R12: 0000000000000000
[  252.471473] R13: ffff958c25ea5380 R14: ffffffff8cce15f1 R15: 00000000000000a0
[  252.473346] FS:  00007f2e69dee540(0000) GS:ffff958c2fc80000(0000) knlGS:0000000000000000
[  252.475225] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  252.476267] CR2: 00000000000000a0 CR3: 0000000427d10004 CR4: 0000000000360ee0
[  252.477526] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  252.478776] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  252.479866] Call Trace:
[  252.480322]  simple_recursive_removal+0x4e/0x2e0
[  252.481078]  ? debugfs_remove+0x60/0x60
[  252.481725]  ? relay_destroy_buf+0x77/0xb0
[  252.482662]  debugfs_remove+0x40/0x60
[  252.483518]  blk_remove_buf_file_callback+0x5/0x10
[  252.484328]  relay_close_buf+0x2e/0x60
[  252.484930]  relay_open+0x1ce/0x2c0
[  252.485520]  do_blk_trace_setup+0x14f/0x2b0
[  252.486187]  __blk_trace_setup+0x54/0xb0
[  252.486803]  blk_trace_ioctl+0x90/0x140
[  252.487423]  ? do_sys_openat2+0x1ab/0x2d0
[  252.488053]  blkdev_ioctl+0x4d/0x260
[  252.488636]  block_ioctl+0x39/0x40
[  252.489139]  ksys_ioctl+0x87/0xc0
[  252.489675]  __x64_sys_ioctl+0x16/0x20
[  252.490380]  do_syscall_64+0x52/0x180
[  252.491032]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

And the other on the device removal:

[  128.528940] debugfs: Directory 'loop0' with parent 'block' already present!
[  128.615325] BUG: kernel NULL pointer dereference, address: 00000000000000a0
[  128.619537] #PF: supervisor write access in kernel mode
[  128.622700] #PF: error_code(0x0002) - not-present page
[  128.625842] PGD 0 P4D 0
[  128.627585] Oops: 0002 [#1] SMP NOPTI
[  128.629871] CPU: 12 PID: 544 Comm: break-blktrace Tainted: G            E     5.7.0-rc2-next-20200420+ #164
[  128.635595] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014
[  128.640471] RIP: 0010:down_write+0x15/0x40
[  128.643041] Code: eb ca e8 ae 22 8d ff cc cc cc cc cc cc cc cc cc cc cc cc
               cc cc 0f 1f 44 00 00 55 48 89 fd e8 52 db ff ff 31 c0 ba 01 00
               00 00 <f0> 48 0f b1 55 00 75 0f 65 48 8b 04 25 c0 8b 01 00 48 89
               45 08 5d
[  128.650180] RSP: 0018:ffffa9c3c05ebd78 EFLAGS: 00010246
[  128.651820] RAX: 0000000000000000 RBX: ffff8ae9a6370240 RCX: ffffff8100000000
[  128.653942] RDX: 0000000000000001 RSI: ffffff8100000000 RDI: 00000000000000a0
[  128.655720] RBP: 00000000000000a0 R08: 0000000000000002 R09: ffff8ae9afd2d3d0
[  128.657400] R10: 0000000000000056 R11: 0000000000000000 R12: 0000000000000000
[  128.659099] R13: 0000000000000000 R14: 0000000000000003 R15: 00000000000000a0
[  128.660500] FS:  00007febfd995540(0000) GS:ffff8ae9afd00000(0000) knlGS:0000000000000000
[  128.662204] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  128.663426] CR2: 00000000000000a0 CR3: 0000000420042003 CR4: 0000000000360ee0
[  128.664776] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  128.666022] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  128.667282] Call Trace:
[  128.667801]  simple_recursive_removal+0x4e/0x2e0
[  128.668663]  ? debugfs_remove+0x60/0x60
[  128.669368]  debugfs_remove+0x40/0x60
[  128.669985]  blk_trace_free+0xd/0x50
[  128.670593]  __blk_trace_remove+0x27/0x40
[  128.671274]  blk_trace_shutdown+0x30/0x40
[  128.671935]  blk_release_queue+0x95/0xf0
[  128.672589]  kobject_put+0xa5/0x1b0
[  128.673188]  disk_release+0xa2/0xc0
[  128.673786]  device_release+0x28/0x80
[  128.674376]  kobject_put+0xa5/0x1b0
[  128.674915]  loop_remove+0x39/0x50 [loop]
[  128.675511]  loop_control_ioctl+0x113/0x130 [loop]
[  128.676199]  ksys_ioctl+0x87/0xc0
[  128.676708]  __x64_sys_ioctl+0x16/0x20
[  128.677274]  do_syscall_64+0x52/0x180
[  128.677823]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

The common theme here is:

debugfs: Directory 'loop0' with parent 'block' already present

This crash happens because of how blktrace uses the debugfs directory
where it places its files. Upon init we always create the same directory
which would be needed by blktrace but we only do this for make_request
drivers (multiqueue) block drivers. When you race a removal of these
devices with a blktrace setup you end up in a situation where the
make_request recursive debugfs removal will sweep away the blktrace
files and then later blktrace will also try to remove individual
dentries which are already NULL. The inverse is also possible and hence
the two types of use after frees.

We don't create the block debugfs directory on init for these types of
block devices:

  * request-based block driver block devices
  * every possible partition
  * scsi-generic

And so, this race should in theory only be possible with make_request
drivers.

We can fix the UAF by simply re-using the debugfs directory for
make_request drivers (multiqueue) and only creating the ephemeral
directory for the other type of block devices. The new clarifications
on relying on the q->blk_trace_mutex *and* also checking for q->blk_trace
*prior* to processing a blktrace ensures the debugfs directories are
only created if no possible directory name clashes are possible.

This goes tested with:

  o nvme partitions
  o ISCSI with tgt, and blktracing against scsi-generic with:
    o block
    o tape
    o cdrom
    o media changer
  o blktests

This patch is part of the work which disputes the severity of
CVE-2019-19770 which shows this issue is not a core debugfs issue, but
a misuse of debugfs within blktace.

Fixes: 6ac93117ab00 ("blktrace: use existing disk debugfs directory")
Reported-by: syzbot+603294af2d01acfdd6da@syzkaller.appspotmail.com
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Omar Sandoval <osandov@fb.com>
Cc: Hannes Reinecke <hare@suse.com>
Cc: Nicolai Stange <nstange@suse.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Cc: "James E.J. Bottomley" <jejb@linux.ibm.com>
Cc: yu kuai <yukuai3@huawei.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/trace/blktrace.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 5a88a6b55933..e27dee345d81 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -524,10 +524,18 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (!bt->msg_data)
 		goto err;
 
-	ret = -ENOENT;
-
-	dir = debugfs_lookup(buts->name, blk_debugfs_root);
-	if (!dir)
+#ifdef CONFIG_BLK_DEBUG_FS
+	/*
+	 * When tracing whole make_request drivers (multiqueue) block devices,
+	 * reuse the existing debugfs directory created by the block layer on
+	 * init. For request-based block devices, all partitions block devices,
+	 * and scsi-generic block devices we create a temporary new debugfs
+	 * directory that will be removed once the trace ends.
+	 */
+	if (queue_is_mq(q) && bdev && bdev == bdev->bd_contains)
+		dir = q->debugfs_dir;
+	else
+#endif
 		bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
 
 	bt->dev = dev;
@@ -565,8 +573,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 
 	ret = 0;
 err:
-	if (dir && !bt->dir)
-		dput(dir);
 	if (ret)
 		blk_trace_free(bt);
 	return ret;
-- 
cgit v1.2.3


From b431ef837e3374da0db8ff6683170359aaa0859c Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Fri, 19 Jun 2020 20:47:29 +0000
Subject: blktrace: ensure our debugfs dir exists

We make an assumption that a debugfs directory exists, but since
this can fail ensure it exists before allowing blktrace setup to
complete. Otherwise we end up stuffing blktrace files on the debugfs
root directory. In the worst case scenario this *in theory* can create
an eventual panic *iff* in the future a similarly named file is created
prior on the debugfs root directory. This theoretical crash can happen
due to a recursive removal followed by a specific dentry removal.

This doesn't fix any known crash, however I have seen the files
go into the main debugfs root directory in cases where the debugfs
directory was not created due to other internal bugs with blktrace
now fixed.

blktrace is also completely useless without this directory, so
this ensures to userspace we only setup blktrace if the kernel
can stuff files where they are supposed to go into.

debugfs directory creations typically aren't checked for, and we have
maintainers doing sweep removals of these checks, but since we need this
check to ensure proper userspace blktrace functionality we make sure
to annotate the justification for the check.

Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/trace/blktrace.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e27dee345d81..46c273f4dec5 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -538,6 +538,18 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 #endif
 		bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
 
+	/*
+	 * As blktrace relies on debugfs for its interface the debugfs directory
+	 * is required, contrary to the usual mantra of not checking for debugfs
+	 * files or directories.
+	 */
+	if (IS_ERR_OR_NULL(dir)) {
+		pr_warn("debugfs_dir not present for %s so skipping\n",
+			buts->name);
+		ret = -ENOENT;
+		goto err;
+	}
+
 	bt->dev = dev;
 	atomic_set(&bt->dropped, 0);
 	INIT_LIST_HEAD(&bt->running_list);
-- 
cgit v1.2.3


From 85e0cbbb8a79537dbc465e9deb449a08b2b092a6 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Fri, 19 Jun 2020 20:47:30 +0000
Subject: block: create the request_queue debugfs_dir on registration

We were only creating the request_queue debugfs_dir only
for make_request block drivers (multiqueue), but never for
request-based block drivers. We did this as we were only
creating non-blktrace additional debugfs files on that directory
for make_request drivers. However, since blktrace *always* creates
that directory anyway, we special-case the use of that directory
on blktrace. Other than this being an eye-sore, this exposes
request-based block drivers to the same debugfs fragile
race that used to exist with make_request block drivers
where if we start adding files onto that directory we can later
run a race with a double removal of dentries on the directory
if we don't deal with this carefully on blktrace.

Instead, just simplify things by always creating the request_queue
debugfs_dir on request_queue registration. Rename the mutex also to
reflect the fact that this is used outside of the blktrace context.

Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c        |  8 +------
 block/blk-mq-debugfs.c  |  5 -----
 block/blk-sysfs.c       |  9 ++++++++
 block/blk.h             |  2 --
 include/linux/blkdev.h  |  5 +++--
 kernel/trace/blktrace.c | 58 ++++++++++++++++++++++---------------------------
 6 files changed, 39 insertions(+), 48 deletions(-)

(limited to 'kernel')

diff --git a/block/blk-core.c b/block/blk-core.c
index a99b22fac38a..a9769c1a2875 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -51,9 +51,7 @@
 #include "blk-pm.h"
 #include "blk-rq-qos.h"
 
-#ifdef CONFIG_DEBUG_FS
 struct dentry *blk_debugfs_root;
-#endif
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
@@ -555,9 +553,7 @@ struct request_queue *__blk_alloc_queue(int node_id)
 
 	kobject_init(&q->kobj, &blk_queue_ktype);
 
-#ifdef CONFIG_BLK_DEV_IO_TRACE
-	mutex_init(&q->blk_trace_mutex);
-#endif
+	mutex_init(&q->debugfs_mutex);
 	mutex_init(&q->sysfs_lock);
 	mutex_init(&q->sysfs_dir_lock);
 	spin_lock_init(&q->queue_lock);
@@ -1931,9 +1927,7 @@ int __init blk_dev_init(void)
 	blk_requestq_cachep = kmem_cache_create("request_queue",
 			sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
 
-#ifdef CONFIG_DEBUG_FS
 	blk_debugfs_root = debugfs_create_dir("block", NULL);
-#endif
 
 	return 0;
 }
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 15df3a36e9fa..a2800bc56fb4 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -824,9 +824,6 @@ void blk_mq_debugfs_register(struct request_queue *q)
 	struct blk_mq_hw_ctx *hctx;
 	int i;
 
-	q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent),
-					    blk_debugfs_root);
-
 	debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);
 
 	/*
@@ -857,9 +854,7 @@ void blk_mq_debugfs_register(struct request_queue *q)
 
 void blk_mq_debugfs_unregister(struct request_queue *q)
 {
-	debugfs_remove_recursive(q->debugfs_dir);
 	q->sched_debugfs_dir = NULL;
-	q->debugfs_dir = NULL;
 }
 
 static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx,
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 561624d4cc4e..be67952e7be2 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -11,6 +11,7 @@
 #include <linux/blktrace_api.h>
 #include <linux/blk-mq.h>
 #include <linux/blk-cgroup.h>
+#include <linux/debugfs.h>
 
 #include "blk.h"
 #include "blk-mq.h"
@@ -917,6 +918,9 @@ static void blk_release_queue(struct kobject *kobj)
 		blk_mq_release(q);
 
 	blk_trace_shutdown(q);
+	mutex_lock(&q->debugfs_mutex);
+	debugfs_remove_recursive(q->debugfs_dir);
+	mutex_unlock(&q->debugfs_mutex);
 
 	if (queue_is_mq(q))
 		blk_mq_debugfs_unregister(q);
@@ -989,6 +993,11 @@ int blk_register_queue(struct gendisk *disk)
 		goto unlock;
 	}
 
+	mutex_lock(&q->debugfs_mutex);
+	q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent),
+					    blk_debugfs_root);
+	mutex_unlock(&q->debugfs_mutex);
+
 	if (queue_is_mq(q)) {
 		__blk_mq_register_dev(dev, q);
 		blk_mq_debugfs_register(q);
diff --git a/block/blk.h b/block/blk.h
index 8ba4a5e4fe07..3a120a070dac 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -14,9 +14,7 @@
 /* Max future timer expiry for timeouts */
 #define BLK_MAX_TIMEOUT		(5 * HZ)
 
-#ifdef CONFIG_DEBUG_FS
 extern struct dentry *blk_debugfs_root;
-#endif
 
 struct blk_flush_queue {
 	unsigned int		flush_pending_idx:1;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e214e0e9f868..c0701237116d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -528,9 +528,9 @@ struct request_queue {
 	unsigned int		sg_timeout;
 	unsigned int		sg_reserved_size;
 	int			node;
+	struct mutex		debugfs_mutex;
 #ifdef CONFIG_BLK_DEV_IO_TRACE
 	struct blk_trace __rcu	*blk_trace;
-	struct mutex		blk_trace_mutex;
 #endif
 	/*
 	 * for flush operations
@@ -574,8 +574,9 @@ struct request_queue {
 	struct list_head	tag_set_list;
 	struct bio_set		bio_split;
 
-#ifdef CONFIG_BLK_DEBUG_FS
 	struct dentry		*debugfs_dir;
+
+#ifdef CONFIG_BLK_DEBUG_FS
 	struct dentry		*sched_debugfs_dir;
 	struct dentry		*rqos_debugfs_dir;
 #endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 46c273f4dec5..c086c38f4954 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -348,7 +348,7 @@ static int __blk_trace_remove(struct request_queue *q)
 	struct blk_trace *bt;
 
 	bt = rcu_replace_pointer(q->blk_trace, NULL,
-				 lockdep_is_held(&q->blk_trace_mutex));
+				 lockdep_is_held(&q->debugfs_mutex));
 	if (!bt)
 		return -EINVAL;
 
@@ -362,9 +362,9 @@ int blk_trace_remove(struct request_queue *q)
 {
 	int ret;
 
-	mutex_lock(&q->blk_trace_mutex);
+	mutex_lock(&q->debugfs_mutex);
 	ret = __blk_trace_remove(q);
-	mutex_unlock(&q->blk_trace_mutex);
+	mutex_unlock(&q->debugfs_mutex);
 
 	return ret;
 }
@@ -483,14 +483,11 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	struct dentry *dir = NULL;
 	int ret;
 
-	lockdep_assert_held(&q->blk_trace_mutex);
+	lockdep_assert_held(&q->debugfs_mutex);
 
 	if (!buts->buf_size || !buts->buf_nr)
 		return -EINVAL;
 
-	if (!blk_debugfs_root)
-		return -ENOENT;
-
 	strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
 	buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
 
@@ -505,7 +502,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	 * we can be.
 	 */
 	if (rcu_dereference_protected(q->blk_trace,
-				      lockdep_is_held(&q->blk_trace_mutex))) {
+				      lockdep_is_held(&q->debugfs_mutex))) {
 		pr_warn("Concurrent blktraces are not allowed on %s\n",
 			buts->name);
 		return -EBUSY;
@@ -524,18 +521,15 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (!bt->msg_data)
 		goto err;
 
-#ifdef CONFIG_BLK_DEBUG_FS
 	/*
-	 * When tracing whole make_request drivers (multiqueue) block devices,
-	 * reuse the existing debugfs directory created by the block layer on
-	 * init. For request-based block devices, all partitions block devices,
+	 * When tracing the whole disk reuse the existing debugfs directory
+	 * created by the block layer on init. For partitions block devices,
 	 * and scsi-generic block devices we create a temporary new debugfs
 	 * directory that will be removed once the trace ends.
 	 */
-	if (queue_is_mq(q) && bdev && bdev == bdev->bd_contains)
+	if (bdev && bdev == bdev->bd_contains)
 		dir = q->debugfs_dir;
 	else
-#endif
 		bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
 
 	/*
@@ -617,9 +611,9 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 {
 	int ret;
 
-	mutex_lock(&q->blk_trace_mutex);
+	mutex_lock(&q->debugfs_mutex);
 	ret = __blk_trace_setup(q, name, dev, bdev, arg);
-	mutex_unlock(&q->blk_trace_mutex);
+	mutex_unlock(&q->debugfs_mutex);
 
 	return ret;
 }
@@ -665,7 +659,7 @@ static int __blk_trace_startstop(struct request_queue *q, int start)
 	struct blk_trace *bt;
 
 	bt = rcu_dereference_protected(q->blk_trace,
-				       lockdep_is_held(&q->blk_trace_mutex));
+				       lockdep_is_held(&q->debugfs_mutex));
 	if (bt == NULL)
 		return -EINVAL;
 
@@ -705,9 +699,9 @@ int blk_trace_startstop(struct request_queue *q, int start)
 {
 	int ret;
 
-	mutex_lock(&q->blk_trace_mutex);
+	mutex_lock(&q->debugfs_mutex);
 	ret = __blk_trace_startstop(q, start);
-	mutex_unlock(&q->blk_trace_mutex);
+	mutex_unlock(&q->debugfs_mutex);
 
 	return ret;
 }
@@ -736,7 +730,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	if (!q)
 		return -ENXIO;
 
-	mutex_lock(&q->blk_trace_mutex);
+	mutex_lock(&q->debugfs_mutex);
 
 	switch (cmd) {
 	case BLKTRACESETUP:
@@ -763,7 +757,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 		break;
 	}
 
-	mutex_unlock(&q->blk_trace_mutex);
+	mutex_unlock(&q->debugfs_mutex);
 	return ret;
 }
 
@@ -774,14 +768,14 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
  **/
 void blk_trace_shutdown(struct request_queue *q)
 {
-	mutex_lock(&q->blk_trace_mutex);
+	mutex_lock(&q->debugfs_mutex);
 	if (rcu_dereference_protected(q->blk_trace,
-				      lockdep_is_held(&q->blk_trace_mutex))) {
+				      lockdep_is_held(&q->debugfs_mutex))) {
 		__blk_trace_startstop(q, 0);
 		__blk_trace_remove(q);
 	}
 
-	mutex_unlock(&q->blk_trace_mutex);
+	mutex_unlock(&q->debugfs_mutex);
 }
 
 #ifdef CONFIG_BLK_CGROUP
@@ -1662,7 +1656,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
 	struct blk_trace *bt;
 
 	bt = rcu_replace_pointer(q->blk_trace, NULL,
-				 lockdep_is_held(&q->blk_trace_mutex));
+				 lockdep_is_held(&q->debugfs_mutex));
 	if (bt == NULL)
 		return -EINVAL;
 
@@ -1837,10 +1831,10 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 	if (q == NULL)
 		goto out_bdput;
 
-	mutex_lock(&q->blk_trace_mutex);
+	mutex_lock(&q->debugfs_mutex);
 
 	bt = rcu_dereference_protected(q->blk_trace,
-				       lockdep_is_held(&q->blk_trace_mutex));
+				       lockdep_is_held(&q->debugfs_mutex));
 	if (attr == &dev_attr_enable) {
 		ret = sprintf(buf, "%u\n", !!bt);
 		goto out_unlock_bdev;
@@ -1858,7 +1852,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 		ret = sprintf(buf, "%llu\n", bt->end_lba);
 
 out_unlock_bdev:
-	mutex_unlock(&q->blk_trace_mutex);
+	mutex_unlock(&q->debugfs_mutex);
 out_bdput:
 	bdput(bdev);
 out:
@@ -1901,10 +1895,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 	if (q == NULL)
 		goto out_bdput;
 
-	mutex_lock(&q->blk_trace_mutex);
+	mutex_lock(&q->debugfs_mutex);
 
 	bt = rcu_dereference_protected(q->blk_trace,
-				       lockdep_is_held(&q->blk_trace_mutex));
+				       lockdep_is_held(&q->debugfs_mutex));
 	if (attr == &dev_attr_enable) {
 		if (!!value == !!bt) {
 			ret = 0;
@@ -1921,7 +1915,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 	if (bt == NULL) {
 		ret = blk_trace_setup_queue(q, bdev);
 		bt = rcu_dereference_protected(q->blk_trace,
-				lockdep_is_held(&q->blk_trace_mutex));
+				lockdep_is_held(&q->debugfs_mutex));
 	}
 
 	if (ret == 0) {
@@ -1936,7 +1930,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 	}
 
 out_unlock_bdev:
-	mutex_unlock(&q->blk_trace_mutex);
+	mutex_unlock(&q->debugfs_mutex);
 out_bdput:
 	bdput(bdev);
 out:
-- 
cgit v1.2.3


From 1bc138c622959979eb547be2d3bbc6442a5c80b0 Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Wed, 10 Jun 2020 11:12:23 +0100
Subject: PM / EM: add support for other devices than CPUs in Energy Model

Add support for other devices than CPUs. The registration function
does not require a valid cpumask pointer and is ready to handle new
devices. Some of the internal structures has been reorganized in order to
keep consistent view (like removing per_cpu pd pointers).

Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/device.h       |   5 +
 include/linux/energy_model.h |  29 +++--
 kernel/power/energy_model.c  | 244 +++++++++++++++++++++++++++++--------------
 3 files changed, 194 insertions(+), 84 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/device.h b/include/linux/device.h
index 15460a5ac024..b72e6f9ad845 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -13,6 +13,7 @@
 #define _DEVICE_H_
 
 #include <linux/dev_printk.h>
+#include <linux/energy_model.h>
 #include <linux/ioport.h>
 #include <linux/kobject.h>
 #include <linux/klist.h>
@@ -559,6 +560,10 @@ struct device {
 	struct dev_pm_info	power;
 	struct dev_pm_domain	*pm_domain;
 
+#ifdef CONFIG_ENERGY_MODEL
+	struct em_perf_domain	*em_pd;
+#endif
+
 #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
 	struct irq_domain	*msi_domain;
 #endif
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 7076cb22b247..2d4689964029 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -12,8 +12,10 @@
 
 /**
  * em_perf_state - Performance state of a performance domain
- * @frequency:	The CPU frequency in KHz, for consistency with CPUFreq
- * @power:	The power consumed by 1 CPU at this level, in milli-watts
+ * @frequency:	The frequency in KHz, for consistency with CPUFreq
+ * @power:	The power consumed at this level, in milli-watts (by 1 CPU or
+		by a registered device). It can be a total power: static and
+		dynamic.
  * @cost:	The cost coefficient associated with this level, used during
  *		energy calculation. Equal to: power * max_frequency / frequency
  */
@@ -27,12 +29,16 @@ struct em_perf_state {
  * em_perf_domain - Performance domain
  * @table:		List of performance states, in ascending order
  * @nr_perf_states:	Number of performance states
- * @cpus:		Cpumask covering the CPUs of the domain
+ * @cpus:		Cpumask covering the CPUs of the domain. It's here
+ *			for performance reasons to avoid potential cache
+ *			misses during energy calculations in the scheduler
+ *			and simplifies allocating/freeing that memory region.
  *
- * A "performance domain" represents a group of CPUs whose performance is
- * scaled together. All CPUs of a performance domain must have the same
- * micro-architecture. Performance domains often have a 1-to-1 mapping with
- * CPUFreq policies.
+ * In case of CPU device, a "performance domain" represents a group of CPUs
+ * whose performance is scaled together. All CPUs of a performance domain
+ * must have the same micro-architecture. Performance domains often have
+ * a 1-to-1 mapping with CPUFreq policies. In case of other devices the @cpus
+ * field is unused.
  */
 struct em_perf_domain {
 	struct em_perf_state *table;
@@ -71,10 +77,12 @@ struct em_data_callback {
 #define EM_DATA_CB(_active_power_cb) { .active_power = &_active_power_cb }
 
 struct em_perf_domain *em_cpu_get(int cpu);
+struct em_perf_domain *em_pd_get(struct device *dev);
 int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
 						struct em_data_callback *cb);
 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 				struct em_data_callback *cb, cpumask_t *span);
+void em_dev_unregister_perf_domain(struct device *dev);
 
 /**
  * em_pd_energy() - Estimates the energy consumed by the CPUs of a perf. domain
@@ -184,10 +192,17 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 {
 	return -EINVAL;
 }
+static inline void em_dev_unregister_perf_domain(struct device *dev)
+{
+}
 static inline struct em_perf_domain *em_cpu_get(int cpu)
 {
 	return NULL;
 }
+static inline struct em_perf_domain *em_pd_get(struct device *dev)
+{
+	return NULL;
+}
 static inline unsigned long em_pd_energy(struct em_perf_domain *pd,
 			unsigned long max_util, unsigned long sum_util)
 {
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 5b8a1566526a..32d76e78f992 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -1,9 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Energy Model of CPUs
+ * Energy Model of devices
  *
- * Copyright (c) 2018, Arm ltd.
+ * Copyright (c) 2018-2020, Arm ltd.
  * Written by: Quentin Perret, Arm ltd.
+ * Improvements provided by: Lukasz Luba, Arm ltd.
  */
 
 #define pr_fmt(fmt) "energy_model: " fmt
@@ -15,15 +16,17 @@
 #include <linux/sched/topology.h>
 #include <linux/slab.h>
 
-/* Mapping of each CPU to the performance domain to which it belongs. */
-static DEFINE_PER_CPU(struct em_perf_domain *, em_data);
-
 /*
  * Mutex serializing the registrations of performance domains and letting
  * callbacks defined by drivers sleep.
  */
 static DEFINE_MUTEX(em_pd_mutex);
 
+static bool _is_cpu_device(struct device *dev)
+{
+	return (dev->bus == &cpu_subsys);
+}
+
 #ifdef CONFIG_DEBUG_FS
 static struct dentry *rootdir;
 
@@ -49,22 +52,30 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused)
 }
 DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
 
-static void em_debug_create_pd(struct em_perf_domain *pd, int cpu)
+static void em_debug_create_pd(struct device *dev)
 {
 	struct dentry *d;
-	char name[8];
 	int i;
 
-	snprintf(name, sizeof(name), "pd%d", cpu);
-
 	/* Create the directory of the performance domain */
-	d = debugfs_create_dir(name, rootdir);
+	d = debugfs_create_dir(dev_name(dev), rootdir);
 
-	debugfs_create_file("cpus", 0444, d, pd->cpus, &em_debug_cpus_fops);
+	if (_is_cpu_device(dev))
+		debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus,
+				    &em_debug_cpus_fops);
 
 	/* Create a sub-directory for each performance state */
-	for (i = 0; i < pd->nr_perf_states; i++)
-		em_debug_create_ps(&pd->table[i], d);
+	for (i = 0; i < dev->em_pd->nr_perf_states; i++)
+		em_debug_create_ps(&dev->em_pd->table[i], d);
+
+}
+
+static void em_debug_remove_pd(struct device *dev)
+{
+	struct dentry *debug_dir;
+
+	debug_dir = debugfs_lookup(dev_name(dev), rootdir);
+	debugfs_remove_recursive(debug_dir);
 }
 
 static int __init em_debug_init(void)
@@ -76,40 +87,34 @@ static int __init em_debug_init(void)
 }
 core_initcall(em_debug_init);
 #else /* CONFIG_DEBUG_FS */
-static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) {}
+static void em_debug_create_pd(struct device *dev) {}
+static void em_debug_remove_pd(struct device *dev) {}
 #endif
-static struct em_perf_domain *
-em_create_pd(struct device *dev, int nr_states, struct em_data_callback *cb,
-	     cpumask_t *span)
+
+static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
+				int nr_states, struct em_data_callback *cb)
 {
 	unsigned long opp_eff, prev_opp_eff = ULONG_MAX;
 	unsigned long power, freq, prev_freq = 0;
-	int i, ret, cpu = cpumask_first(span);
 	struct em_perf_state *table;
-	struct em_perf_domain *pd;
+	int i, ret;
 	u64 fmax;
 
-	if (!cb->active_power)
-		return NULL;
-
-	pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
-	if (!pd)
-		return NULL;
-
 	table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL);
 	if (!table)
-		goto free_pd;
+		return -ENOMEM;
 
 	/* Build the list of performance states for this performance domain */
 	for (i = 0, freq = 0; i < nr_states; i++, freq++) {
 		/*
 		 * active_power() is a driver callback which ceils 'freq' to
-		 * lowest performance state of 'cpu' above 'freq' and updates
+		 * lowest performance state of 'dev' above 'freq' and updates
 		 * 'power' and 'freq' accordingly.
 		 */
 		ret = cb->active_power(&power, &freq, dev);
 		if (ret) {
-			pr_err("pd%d: invalid perf. state: %d\n", cpu, ret);
+			dev_err(dev, "EM: invalid perf. state: %d\n",
+				ret);
 			goto free_ps_table;
 		}
 
@@ -118,7 +123,8 @@ em_create_pd(struct device *dev, int nr_states, struct em_data_callback *cb,
 		 * higher performance states.
 		 */
 		if (freq <= prev_freq) {
-			pr_err("pd%d: non-increasing freq: %lu\n", cpu, freq);
+			dev_err(dev, "EM: non-increasing freq: %lu\n",
+				freq);
 			goto free_ps_table;
 		}
 
@@ -127,7 +133,8 @@ em_create_pd(struct device *dev, int nr_states, struct em_data_callback *cb,
 		 * positive, in milli-watts and to fit into 16 bits.
 		 */
 		if (!power || power > EM_MAX_POWER) {
-			pr_err("pd%d: invalid power: %lu\n", cpu, power);
+			dev_err(dev, "EM: invalid power: %lu\n",
+				power);
 			goto free_ps_table;
 		}
 
@@ -142,8 +149,8 @@ em_create_pd(struct device *dev, int nr_states, struct em_data_callback *cb,
 		 */
 		opp_eff = freq / power;
 		if (opp_eff >= prev_opp_eff)
-			pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state%d\n",
-					cpu, i, i - 1);
+			dev_dbg(dev, "EM: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state%d\n",
+					i, i - 1);
 		prev_opp_eff = opp_eff;
 	}
 
@@ -156,30 +163,82 @@ em_create_pd(struct device *dev, int nr_states, struct em_data_callback *cb,
 
 	pd->table = table;
 	pd->nr_perf_states = nr_states;
-	cpumask_copy(to_cpumask(pd->cpus), span);
 
-	em_debug_create_pd(pd, cpu);
-
-	return pd;
+	return 0;
 
 free_ps_table:
 	kfree(table);
-free_pd:
-	kfree(pd);
+	return -EINVAL;
+}
+
+static int em_create_pd(struct device *dev, int nr_states,
+			struct em_data_callback *cb, cpumask_t *cpus)
+{
+	struct em_perf_domain *pd;
+	struct device *cpu_dev;
+	int cpu, ret;
+
+	if (_is_cpu_device(dev)) {
+		pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
+		if (!pd)
+			return -ENOMEM;
+
+		cpumask_copy(em_span_cpus(pd), cpus);
+	} else {
+		pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+		if (!pd)
+			return -ENOMEM;
+	}
+
+	ret = em_create_perf_table(dev, pd, nr_states, cb);
+	if (ret) {
+		kfree(pd);
+		return ret;
+	}
+
+	if (_is_cpu_device(dev))
+		for_each_cpu(cpu, cpus) {
+			cpu_dev = get_cpu_device(cpu);
+			cpu_dev->em_pd = pd;
+		}
+
+	dev->em_pd = pd;
+
+	return 0;
+}
+
+/**
+ * em_pd_get() - Return the performance domain for a device
+ * @dev : Device to find the performance domain for
+ *
+ * Returns the performance domain to which @dev belongs, or NULL if it doesn't
+ * exist.
+ */
+struct em_perf_domain *em_pd_get(struct device *dev)
+{
+	if (IS_ERR_OR_NULL(dev))
+		return NULL;
 
-	return NULL;
+	return dev->em_pd;
 }
+EXPORT_SYMBOL_GPL(em_pd_get);
 
 /**
  * em_cpu_get() - Return the performance domain for a CPU
  * @cpu : CPU to find the performance domain for
  *
- * Return: the performance domain to which 'cpu' belongs, or NULL if it doesn't
+ * Returns the performance domain to which @cpu belongs, or NULL if it doesn't
  * exist.
  */
 struct em_perf_domain *em_cpu_get(int cpu)
 {
-	return READ_ONCE(per_cpu(em_data, cpu));
+	struct device *cpu_dev;
+
+	cpu_dev = get_cpu_device(cpu);
+	if (!cpu_dev)
+		return NULL;
+
+	return em_pd_get(cpu_dev);
 }
 EXPORT_SYMBOL_GPL(em_cpu_get);
 
@@ -188,7 +247,7 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
  * @dev		: Device for which the EM is to register
  * @nr_states	: Number of performance states to register
  * @cb		: Callback functions providing the data of the Energy Model
- * @span	: Pointer to cpumask_t, which in case of a CPU device is
+ * @cpus	: Pointer to cpumask_t, which in case of a CPU device is
  *		obligatory. It can be taken from i.e. 'policy->cpus'. For other
  *		type of devices this should be set to NULL.
  *
@@ -201,13 +260,12 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
  * Return 0 on success
  */
 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
-				struct em_data_callback *cb, cpumask_t *span)
+				struct em_data_callback *cb, cpumask_t *cpus)
 {
 	unsigned long cap, prev_cap = 0;
-	struct em_perf_domain *pd;
-	int cpu, ret = 0;
+	int cpu, ret;
 
-	if (!dev || !span || !nr_states || !cb)
+	if (!dev || !nr_states || !cb)
 		return -EINVAL;
 
 	/*
@@ -216,47 +274,50 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 	 */
 	mutex_lock(&em_pd_mutex);
 
-	for_each_cpu(cpu, span) {
-		/* Make sure we don't register again an existing domain. */
-		if (READ_ONCE(per_cpu(em_data, cpu))) {
-			ret = -EEXIST;
-			goto unlock;
-		}
+	if (dev->em_pd) {
+		ret = -EEXIST;
+		goto unlock;
+	}
 
-		/*
-		 * All CPUs of a domain must have the same micro-architecture
-		 * since they all share the same table.
-		 */
-		cap = arch_scale_cpu_capacity(cpu);
-		if (prev_cap && prev_cap != cap) {
-			pr_err("CPUs of %*pbl must have the same capacity\n",
-							cpumask_pr_args(span));
+	if (_is_cpu_device(dev)) {
+		if (!cpus) {
+			dev_err(dev, "EM: invalid CPU mask\n");
 			ret = -EINVAL;
 			goto unlock;
 		}
-		prev_cap = cap;
+
+		for_each_cpu(cpu, cpus) {
+			if (em_cpu_get(cpu)) {
+				dev_err(dev, "EM: exists for CPU%d\n", cpu);
+				ret = -EEXIST;
+				goto unlock;
+			}
+			/*
+			 * All CPUs of a domain must have the same
+			 * micro-architecture since they all share the same
+			 * table.
+			 */
+			cap = arch_scale_cpu_capacity(cpu);
+			if (prev_cap && prev_cap != cap) {
+				dev_err(dev, "EM: CPUs of %*pbl must have the same capacity\n",
+					cpumask_pr_args(cpus));
+
+				ret = -EINVAL;
+				goto unlock;
+			}
+			prev_cap = cap;
+		}
 	}
 
-	/* Create the performance domain and add it to the Energy Model. */
-	pd = em_create_pd(dev, nr_states, cb, span);
-	if (!pd) {
-		ret = -EINVAL;
+	ret = em_create_pd(dev, nr_states, cb, cpus);
+	if (ret)
 		goto unlock;
-	}
 
-	for_each_cpu(cpu, span) {
-		/*
-		 * The per-cpu array can be read concurrently from em_cpu_get().
-		 * The barrier enforces the ordering needed to make sure readers
-		 * can only access well formed em_perf_domain structs.
-		 */
-		smp_store_release(per_cpu_ptr(&em_data, cpu), pd);
-	}
+	em_debug_create_pd(dev);
+	dev_info(dev, "EM: created perf domain\n");
 
-	pr_debug("Created perf domain %*pbl\n", cpumask_pr_args(span));
 unlock:
 	mutex_unlock(&em_pd_mutex);
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
@@ -285,3 +346,32 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
 	return em_dev_register_perf_domain(cpu_dev, nr_states, cb, span);
 }
 EXPORT_SYMBOL_GPL(em_register_perf_domain);
+
+/**
+ * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device
+ * @dev		: Device for which the EM is registered
+ *
+ * Unregister the EM for the specified @dev (but not a CPU device).
+ */
+void em_dev_unregister_perf_domain(struct device *dev)
+{
+	if (IS_ERR_OR_NULL(dev) || !dev->em_pd)
+		return;
+
+	if (_is_cpu_device(dev))
+		return;
+
+	/*
+	 * The mutex separates all register/unregister requests and protects
+	 * from potential clean-up/setup issues in the debugfs directories.
+	 * The debugfs directory name is the same as device's name.
+	 */
+	mutex_lock(&em_pd_mutex);
+	em_debug_remove_pd(dev);
+
+	kfree(dev->em_pd->table);
+	kfree(dev->em_pd);
+	dev->em_pd = NULL;
+	mutex_unlock(&em_pd_mutex);
+}
+EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain);
-- 
cgit v1.2.3


From 07891f15d91317b2220a0b610a2d7e324a88105d Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Wed, 27 May 2020 10:58:51 +0100
Subject: PM / EM: remove em_register_perf_domain

Remove old function em_register_perf_domain which is no longer needed.
There is em_dev_register_perf_domain that covers old use cases and new as
well.

Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Acked-by: Quentin Perret <qperret@google.com>
Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/energy_model.h |  7 -------
 kernel/power/energy_model.c  | 25 -------------------------
 2 files changed, 32 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 2d4689964029..0f94e871a202 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -78,8 +78,6 @@ struct em_data_callback {
 
 struct em_perf_domain *em_cpu_get(int cpu);
 struct em_perf_domain *em_pd_get(struct device *dev);
-int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
-						struct em_data_callback *cb);
 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 				struct em_data_callback *cb, cpumask_t *span);
 void em_dev_unregister_perf_domain(struct device *dev);
@@ -181,11 +179,6 @@ static inline int em_pd_nr_perf_states(struct em_perf_domain *pd)
 struct em_data_callback {};
 #define EM_DATA_CB(_active_power_cb) { }
 
-static inline int em_register_perf_domain(cpumask_t *span,
-			unsigned int nr_states, struct em_data_callback *cb)
-{
-	return -EINVAL;
-}
 static inline
 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 				struct em_data_callback *cb, cpumask_t *span)
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 32d76e78f992..c1ff7fa030ab 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -322,31 +322,6 @@ unlock:
 }
 EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
 
-/**
- * em_register_perf_domain() - Register the Energy Model of a performance domain
- * @span	: Mask of CPUs in the performance domain
- * @nr_states	: Number of capacity states to register
- * @cb		: Callback functions providing the data of the Energy Model
- *
- * Create Energy Model tables for a performance domain using the callbacks
- * defined in cb.
- *
- * If multiple clients register the same performance domain, all but the first
- * registration will be ignored.
- *
- * Return 0 on success
- */
-int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
-						struct em_data_callback *cb)
-{
-	struct device *cpu_dev;
-
-	cpu_dev = get_cpu_device(cpumask_first(span));
-
-	return em_dev_register_perf_domain(cpu_dev, nr_states, cb, span);
-}
-EXPORT_SYMBOL_GPL(em_register_perf_domain);
-
 /**
  * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device
  * @dev		: Device for which the EM is registered
-- 
cgit v1.2.3


From f0b5694791ce70dba16758c3b838d5ddc7731b02 Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Wed, 27 May 2020 10:58:52 +0100
Subject: PM / EM: change name of em_pd_energy to em_cpu_energy

Energy Model framework now supports other devices than CPUs. Refactor some
of the functions in order to prevent wrong usage. The old function
em_pd_energy has to generic name. It must not be used without proper
cpumask pointer, which is possible only for CPU devices. Thus, rename it
and add proper description to warn of potential wrong usage for other
devices.

Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Acked-by: Quentin Perret <qperret@google.com>
Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/energy_model.h | 11 ++++++++---
 kernel/sched/fair.c          |  2 +-
 2 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 0f94e871a202..b67a51c574b9 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -83,15 +83,20 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 void em_dev_unregister_perf_domain(struct device *dev);
 
 /**
- * em_pd_energy() - Estimates the energy consumed by the CPUs of a perf. domain
+ * em_cpu_energy() - Estimates the energy consumed by the CPUs of a
+		performance domain
  * @pd		: performance domain for which energy has to be estimated
  * @max_util	: highest utilization among CPUs of the domain
  * @sum_util	: sum of the utilization of all CPUs in the domain
  *
+ * This function must be used only for CPU devices. There is no validation,
+ * i.e. if the EM is a CPU type and has cpumask allocated. It is called from
+ * the scheduler code quite frequently and that is why there is not checks.
+ *
  * Return: the sum of the energy consumed by the CPUs of the domain assuming
  * a capacity state satisfying the max utilization of the domain.
  */
-static inline unsigned long em_pd_energy(struct em_perf_domain *pd,
+static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 				unsigned long max_util, unsigned long sum_util)
 {
 	unsigned long freq, scale_cpu;
@@ -196,7 +201,7 @@ static inline struct em_perf_domain *em_pd_get(struct device *dev)
 {
 	return NULL;
 }
-static inline unsigned long em_pd_energy(struct em_perf_domain *pd,
+static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 			unsigned long max_util, unsigned long sum_util)
 {
 	return 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cbcb2f71599b..6da601c8d383 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6497,7 +6497,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
 		max_util = max(max_util, cpu_util);
 	}
 
-	return em_pd_energy(pd->em_pd, max_util, sum_util);
+	return em_cpu_energy(pd->em_pd, max_util, sum_util);
 }
 
 /*
-- 
cgit v1.2.3


From c06b0229579806f5b5e14af64cf9c5dc771445b3 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 23 Jun 2020 16:08:07 -0700
Subject: bpf: Support 'X' in bpf_seq_printf() helper

'X' tells kernel to print hex with upper case letters.
/proc/net/tcp{4,6} seq_file show() used this, and
supports it in bpf_seq_printf() helper too.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20200623230807.3988014-1-yhs@fb.com
---
 kernel/trace/bpf_trace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a3ac7de98baa..b44505f56b6d 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -681,7 +681,8 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
 		}
 
 		if (fmt[i] != 'i' && fmt[i] != 'd' &&
-		    fmt[i] != 'u' && fmt[i] != 'x') {
+		    fmt[i] != 'u' && fmt[i] != 'x' &&
+		    fmt[i] != 'X') {
 			err = -EINVAL;
 			goto out;
 		}
-- 
cgit v1.2.3


From 72e2b2b66f9c1225e51fc4a1c1e8512959195b76 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 23 Jun 2020 16:08:08 -0700
Subject: bpf: Allow tracing programs to use bpf_jiffies64() helper

/proc/net/tcp{4,6} uses jiffies for various computations.
Let us add bpf_jiffies64() helper to tracing program
so bpf_iter and other programs can use it.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20200623230808.3988073-1-yhs@fb.com
---
 kernel/trace/bpf_trace.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b44505f56b6d..0159f12d2af5 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1135,6 +1135,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_ringbuf_discard_proto;
 	case BPF_FUNC_ringbuf_query:
 		return &bpf_ringbuf_query_proto;
+	case BPF_FUNC_jiffies64:
+		return &bpf_jiffies64_proto;
 	default:
 		return NULL;
 	}
-- 
cgit v1.2.3


From af7ec13833619e17f03aa73a785a2f871da6d66b Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 23 Jun 2020 16:08:09 -0700
Subject: bpf: Add bpf_skc_to_tcp6_sock() helper

The helper is used in tracing programs to cast a socket
pointer to a tcp6_sock pointer.
The return value could be NULL if the casting is illegal.

A new helper return type RET_PTR_TO_BTF_ID_OR_NULL is added
so the verifier is able to deduce proper return types for the helper.

Different from the previous BTF_ID based helpers,
the bpf_skc_to_tcp6_sock() argument can be several possible
btf_ids. More specifically, all possible socket data structures
with sock_common appearing in the first in the memory layout.
This patch only added socket types related to tcp and udp.

All possible argument btf_id and return value btf_id
for helper bpf_skc_to_tcp6_sock() are pre-calculcated and
cached. In the future, it is even possible to precompute
these btf_id's at kernel build time.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20200623230809.3988195-1-yhs@fb.com
---
 include/linux/bpf.h            | 12 +++++++
 include/uapi/linux/bpf.h       |  9 ++++-
 kernel/bpf/btf.c               |  1 +
 kernel/bpf/verifier.c          | 43 ++++++++++++++++------
 kernel/trace/bpf_trace.c       |  2 ++
 net/core/filter.c              | 82 ++++++++++++++++++++++++++++++++++++++++++
 scripts/bpf_helpers_doc.py     |  2 ++
 tools/include/uapi/linux/bpf.h |  9 ++++-
 8 files changed, 148 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1e1501ee53ce..c0e38ad07848 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -265,6 +265,7 @@ enum bpf_return_type {
 	RET_PTR_TO_TCP_SOCK_OR_NULL,	/* returns a pointer to a tcp_sock or NULL */
 	RET_PTR_TO_SOCK_COMMON_OR_NULL,	/* returns a pointer to a sock_common or NULL */
 	RET_PTR_TO_ALLOC_MEM_OR_NULL,	/* returns a pointer to dynamically allocated memory or NULL */
+	RET_PTR_TO_BTF_ID_OR_NULL,	/* returns a pointer to a btf_id or NULL */
 };
 
 /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
@@ -287,6 +288,12 @@ struct bpf_func_proto {
 		enum bpf_arg_type arg_type[5];
 	};
 	int *btf_id; /* BTF ids of arguments */
+	bool (*check_btf_id)(u32 btf_id, u32 arg); /* if the argument btf_id is
+						    * valid. Often used if more
+						    * than one btf id is permitted
+						    * for this argument.
+						    */
+	int *ret_btf_id; /* return value btf_id */
 };
 
 /* bpf_context is intentionally undefined structure. Pointer to bpf_context is
@@ -1524,6 +1531,7 @@ static inline bool bpf_map_is_dev_bound(struct bpf_map *map)
 
 struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr);
 void bpf_map_offload_map_free(struct bpf_map *map);
+void init_btf_sock_ids(struct btf *btf);
 #else
 static inline int bpf_prog_offload_init(struct bpf_prog *prog,
 					union bpf_attr *attr)
@@ -1549,6 +1557,9 @@ static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
 static inline void bpf_map_offload_map_free(struct bpf_map *map)
 {
 }
+static inline void init_btf_sock_ids(struct btf *btf)
+{
+}
 #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
 
 #if defined(CONFIG_BPF_STREAM_PARSER)
@@ -1638,6 +1649,7 @@ extern const struct bpf_func_proto bpf_ringbuf_reserve_proto;
 extern const struct bpf_func_proto bpf_ringbuf_submit_proto;
 extern const struct bpf_func_proto bpf_ringbuf_discard_proto;
 extern const struct bpf_func_proto bpf_ringbuf_query_proto;
+extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto;
 
 const struct bpf_func_proto *bpf_tracing_func_proto(
 	enum bpf_func_id func_id, const struct bpf_prog *prog);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d9737d51dd19..e90ad07b291a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3255,6 +3255,12 @@ union bpf_attr {
  * 		case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level
  * 		is returned or the error code -EACCES in case the skb is not
  * 		subject to CHECKSUM_UNNECESSARY.
+ *
+ * struct tcp6_sock *bpf_skc_to_tcp6_sock(void *sk)
+ *	Description
+ *		Dynamically cast a *sk* pointer to a *tcp6_sock* pointer.
+ *	Return
+ *		*sk* if casting is valid, or NULL otherwise.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3392,7 +3398,8 @@ union bpf_attr {
 	FN(ringbuf_submit),		\
 	FN(ringbuf_discard),		\
 	FN(ringbuf_query),		\
-	FN(csum_level),
+	FN(csum_level),			\
+	FN(skc_to_tcp6_sock),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index e377d1981730..4c3007f428b1 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3674,6 +3674,7 @@ struct btf *btf_parse_vmlinux(void)
 		goto errout;
 
 	bpf_struct_ops_init(btf, log);
+	init_btf_sock_ids(btf);
 
 	btf_verifier_env_free(env);
 	refcount_set(&btf->refcnt, 1);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7460f967cb75..7de98906ddf4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3800,12 +3800,14 @@ static int int_ptr_type_to_size(enum bpf_arg_type type)
 	return -EINVAL;
 }
 
-static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
-			  enum bpf_arg_type arg_type,
-			  struct bpf_call_arg_meta *meta)
+static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
+			  struct bpf_call_arg_meta *meta,
+			  const struct bpf_func_proto *fn)
 {
+	u32 regno = BPF_REG_1 + arg;
 	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
 	enum bpf_reg_type expected_type, type = reg->type;
+	enum bpf_arg_type arg_type = fn->arg_type[arg];
 	int err = 0;
 
 	if (arg_type == ARG_DONTCARE)
@@ -3885,9 +3887,16 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		expected_type = PTR_TO_BTF_ID;
 		if (type != expected_type)
 			goto err_type;
-		if (reg->btf_id != meta->btf_id) {
-			verbose(env, "Helper has type %s got %s in R%d\n",
-				kernel_type_name(meta->btf_id),
+		if (!fn->check_btf_id) {
+			if (reg->btf_id != meta->btf_id) {
+				verbose(env, "Helper has type %s got %s in R%d\n",
+					kernel_type_name(meta->btf_id),
+					kernel_type_name(reg->btf_id), regno);
+
+				return -EACCES;
+			}
+		} else if (!fn->check_btf_id(reg->btf_id, arg)) {
+			verbose(env, "Helper does not support %s in R%d\n",
 				kernel_type_name(reg->btf_id), regno);
 
 			return -EACCES;
@@ -4709,10 +4718,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	meta.func_id = func_id;
 	/* check args */
 	for (i = 0; i < 5; i++) {
-		err = btf_resolve_helper_id(&env->log, fn, i);
-		if (err > 0)
-			meta.btf_id = err;
-		err = check_func_arg(env, BPF_REG_1 + i, fn->arg_type[i], &meta);
+		if (!fn->check_btf_id) {
+			err = btf_resolve_helper_id(&env->log, fn, i);
+			if (err > 0)
+				meta.btf_id = err;
+		}
+		err = check_func_arg(env, i, &meta, fn);
 		if (err)
 			return err;
 	}
@@ -4815,6 +4826,18 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL;
 		regs[BPF_REG_0].id = ++env->id_gen;
 		regs[BPF_REG_0].mem_size = meta.mem_size;
+	} else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) {
+		int ret_btf_id;
+
+		mark_reg_known_zero(env, regs, BPF_REG_0);
+		regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL;
+		ret_btf_id = *fn->ret_btf_id;
+		if (ret_btf_id == 0) {
+			verbose(env, "invalid return type %d of func %s#%d\n",
+				fn->ret_type, func_id_name(func_id), func_id);
+			return -EINVAL;
+		}
+		regs[BPF_REG_0].btf_id = ret_btf_id;
 	} else {
 		verbose(env, "unknown return type %d of func %s#%d\n",
 			fn->ret_type, func_id_name(func_id), func_id);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0159f12d2af5..2a97a268f533 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1515,6 +1515,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_skb_output_proto;
 	case BPF_FUNC_xdp_output:
 		return &bpf_xdp_output_proto;
+	case BPF_FUNC_skc_to_tcp6_sock:
+		return &bpf_skc_to_tcp6_sock_proto;
 #endif
 	case BPF_FUNC_seq_printf:
 		return prog->expected_attach_type == BPF_TRACE_ITER ?
diff --git a/net/core/filter.c b/net/core/filter.c
index c713b6b8938f..176e27d75c51 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -47,6 +47,7 @@
 #include <linux/seccomp.h>
 #include <linux/if_vlan.h>
 #include <linux/bpf.h>
+#include <linux/btf.h>
 #include <net/sch_generic.h>
 #include <net/cls_cgroup.h>
 #include <net/dst_metadata.h>
@@ -9225,3 +9226,84 @@ void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog)
 {
 	bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog);
 }
+
+/* Define a list of socket types which can be the argument for
+ * skc_to_*_sock() helpers. All these sockets should have
+ * sock_common as the first argument in its memory layout.
+ */
+#define BTF_SOCK_TYPE_xxx \
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET, "inet_sock")			\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_CONN, "inet_connection_sock")	\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_REQ, "inet_request_sock")	\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_TW, "inet_timewait_sock")	\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_REQ, "request_sock")		\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK, "sock")			\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK_COMMON, "sock_common")		\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP, "tcp_sock")			\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_REQ, "tcp_request_sock")	\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_TW, "tcp_timewait_sock")	\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, "tcp6_sock")			\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, "udp_sock")			\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, "udp6_sock")
+
+enum {
+#define BTF_SOCK_TYPE(name, str) name,
+BTF_SOCK_TYPE_xxx
+#undef BTF_SOCK_TYPE
+MAX_BTF_SOCK_TYPE,
+};
+
+static int btf_sock_ids[MAX_BTF_SOCK_TYPE];
+
+#ifdef CONFIG_BPF_SYSCALL
+static const char *bpf_sock_types[] = {
+#define BTF_SOCK_TYPE(name, str) str,
+BTF_SOCK_TYPE_xxx
+#undef BTF_SOCK_TYPE
+};
+
+void init_btf_sock_ids(struct btf *btf)
+{
+	int i, btf_id;
+
+	for (i = 0; i < MAX_BTF_SOCK_TYPE; i++) {
+		btf_id = btf_find_by_name_kind(btf, bpf_sock_types[i],
+					       BTF_KIND_STRUCT);
+		if (btf_id > 0)
+			btf_sock_ids[i] = btf_id;
+	}
+}
+#endif
+
+static bool check_arg_btf_id(u32 btf_id, u32 arg)
+{
+	int i;
+
+	/* only one argument, no need to check arg */
+	for (i = 0; i < MAX_BTF_SOCK_TYPE; i++)
+		if (btf_sock_ids[i] == btf_id)
+			return true;
+	return false;
+}
+
+BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk)
+{
+	/* tcp6_sock type is not generated in dwarf and hence btf,
+	 * trigger an explicit type generation here.
+	 */
+	BTF_TYPE_EMIT(struct tcp6_sock);
+	if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP &&
+	    sk->sk_family == AF_INET6)
+		return (unsigned long)sk;
+
+	return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = {
+	.func			= bpf_skc_to_tcp6_sock,
+	.gpl_only		= false,
+	.ret_type		= RET_PTR_TO_BTF_ID_OR_NULL,
+	.arg1_type		= ARG_PTR_TO_BTF_ID,
+	.check_btf_id		= check_arg_btf_id,
+	.ret_btf_id		= &btf_sock_ids[BTF_SOCK_TYPE_TCP6],
+};
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index 91fa668fa860..6c2f64118651 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -421,6 +421,7 @@ class PrinterHelpers(Printer):
             'struct sockaddr',
             'struct tcphdr',
             'struct seq_file',
+            'struct tcp6_sock',
 
             'struct __sk_buff',
             'struct sk_msg_md',
@@ -458,6 +459,7 @@ class PrinterHelpers(Printer):
             'struct sockaddr',
             'struct tcphdr',
             'struct seq_file',
+            'struct tcp6_sock',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d9737d51dd19..e90ad07b291a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3255,6 +3255,12 @@ union bpf_attr {
  * 		case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level
  * 		is returned or the error code -EACCES in case the skb is not
  * 		subject to CHECKSUM_UNNECESSARY.
+ *
+ * struct tcp6_sock *bpf_skc_to_tcp6_sock(void *sk)
+ *	Description
+ *		Dynamically cast a *sk* pointer to a *tcp6_sock* pointer.
+ *	Return
+ *		*sk* if casting is valid, or NULL otherwise.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3392,7 +3398,8 @@ union bpf_attr {
 	FN(ringbuf_submit),		\
 	FN(ringbuf_discard),		\
 	FN(ringbuf_query),		\
-	FN(csum_level),
+	FN(csum_level),			\
+	FN(skc_to_tcp6_sock),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
-- 
cgit v1.2.3


From 478cfbdf5f13dfe09cfd0b1cbac821f5e27f6108 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 23 Jun 2020 16:08:11 -0700
Subject: bpf: Add bpf_skc_to_{tcp, tcp_timewait, tcp_request}_sock() helpers

Three more helpers are added to cast a sock_common pointer to
an tcp_sock, tcp_timewait_sock or a tcp_request_sock for
tracing programs.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20200623230811.3988277-1-yhs@fb.com
---
 include/linux/bpf.h            |  3 ++
 include/uapi/linux/bpf.h       | 23 +++++++++++++++-
 kernel/trace/bpf_trace.c       |  6 ++++
 net/core/filter.c              | 62 ++++++++++++++++++++++++++++++++++++++++++
 scripts/bpf_helpers_doc.py     |  6 ++++
 tools/include/uapi/linux/bpf.h | 23 +++++++++++++++-
 6 files changed, 121 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c0e38ad07848..c23998cf6699 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1650,6 +1650,9 @@ extern const struct bpf_func_proto bpf_ringbuf_submit_proto;
 extern const struct bpf_func_proto bpf_ringbuf_discard_proto;
 extern const struct bpf_func_proto bpf_ringbuf_query_proto;
 extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto;
+extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto;
+extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto;
+extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto;
 
 const struct bpf_func_proto *bpf_tracing_func_proto(
 	enum bpf_func_id func_id, const struct bpf_prog *prog);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e90ad07b291a..b9412ab275f3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3261,6 +3261,24 @@ union bpf_attr {
  *		Dynamically cast a *sk* pointer to a *tcp6_sock* pointer.
  *	Return
  *		*sk* if casting is valid, or NULL otherwise.
+ *
+ * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk)
+ *	Description
+ *		Dynamically cast a *sk* pointer to a *tcp_sock* pointer.
+ *	Return
+ *		*sk* if casting is valid, or NULL otherwise.
+ *
+ * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk)
+ * 	Description
+ *		Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer.
+ *	Return
+ *		*sk* if casting is valid, or NULL otherwise.
+ *
+ * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk)
+ * 	Description
+ *		Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer.
+ *	Return
+ *		*sk* if casting is valid, or NULL otherwise.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3399,7 +3417,10 @@ union bpf_attr {
 	FN(ringbuf_discard),		\
 	FN(ringbuf_query),		\
 	FN(csum_level),			\
-	FN(skc_to_tcp6_sock),
+	FN(skc_to_tcp6_sock),		\
+	FN(skc_to_tcp_sock),		\
+	FN(skc_to_tcp_timewait_sock),	\
+	FN(skc_to_tcp_request_sock),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 2a97a268f533..48d935b0d87c 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1517,6 +1517,12 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_xdp_output_proto;
 	case BPF_FUNC_skc_to_tcp6_sock:
 		return &bpf_skc_to_tcp6_sock_proto;
+	case BPF_FUNC_skc_to_tcp_sock:
+		return &bpf_skc_to_tcp_sock_proto;
+	case BPF_FUNC_skc_to_tcp_timewait_sock:
+		return &bpf_skc_to_tcp_timewait_sock_proto;
+	case BPF_FUNC_skc_to_tcp_request_sock:
+		return &bpf_skc_to_tcp_request_sock_proto;
 #endif
 	case BPF_FUNC_seq_printf:
 		return prog->expected_attach_type == BPF_TRACE_ITER ?
diff --git a/net/core/filter.c b/net/core/filter.c
index 176e27d75c51..0b4e5aed7e20 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -74,6 +74,7 @@
 #include <net/lwtunnel.h>
 #include <net/ipv6_stubs.h>
 #include <net/bpf_sk_storage.h>
+#include <net/transp_v6.h>
 
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
@@ -9307,3 +9308,64 @@ const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = {
 	.check_btf_id		= check_arg_btf_id,
 	.ret_btf_id		= &btf_sock_ids[BTF_SOCK_TYPE_TCP6],
 };
+
+BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk)
+{
+	if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP)
+		return (unsigned long)sk;
+
+	return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = {
+	.func			= bpf_skc_to_tcp_sock,
+	.gpl_only		= false,
+	.ret_type		= RET_PTR_TO_BTF_ID_OR_NULL,
+	.arg1_type		= ARG_PTR_TO_BTF_ID,
+	.check_btf_id		= check_arg_btf_id,
+	.ret_btf_id		= &btf_sock_ids[BTF_SOCK_TYPE_TCP],
+};
+
+BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk)
+{
+	if (sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT)
+		return (unsigned long)sk;
+
+#if IS_BUILTIN(CONFIG_IPV6)
+	if (sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT)
+		return (unsigned long)sk;
+#endif
+
+	return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = {
+	.func			= bpf_skc_to_tcp_timewait_sock,
+	.gpl_only		= false,
+	.ret_type		= RET_PTR_TO_BTF_ID_OR_NULL,
+	.arg1_type		= ARG_PTR_TO_BTF_ID,
+	.check_btf_id		= check_arg_btf_id,
+	.ret_btf_id		= &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW],
+};
+
+BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk)
+{
+	if (sk->sk_prot == &tcp_prot  && sk->sk_state == TCP_NEW_SYN_RECV)
+		return (unsigned long)sk;
+
+#if IS_BUILTIN(CONFIG_IPV6)
+	if (sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV)
+		return (unsigned long)sk;
+#endif
+
+	return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = {
+	.func			= bpf_skc_to_tcp_request_sock,
+	.gpl_only		= false,
+	.ret_type		= RET_PTR_TO_BTF_ID_OR_NULL,
+	.arg1_type		= ARG_PTR_TO_BTF_ID,
+	.check_btf_id		= check_arg_btf_id,
+	.ret_btf_id		= &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ],
+};
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index 6c2f64118651..d886657c6aaa 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -422,6 +422,9 @@ class PrinterHelpers(Printer):
             'struct tcphdr',
             'struct seq_file',
             'struct tcp6_sock',
+            'struct tcp_sock',
+            'struct tcp_timewait_sock',
+            'struct tcp_request_sock',
 
             'struct __sk_buff',
             'struct sk_msg_md',
@@ -460,6 +463,9 @@ class PrinterHelpers(Printer):
             'struct tcphdr',
             'struct seq_file',
             'struct tcp6_sock',
+            'struct tcp_sock',
+            'struct tcp_timewait_sock',
+            'struct tcp_request_sock',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e90ad07b291a..b9412ab275f3 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3261,6 +3261,24 @@ union bpf_attr {
  *		Dynamically cast a *sk* pointer to a *tcp6_sock* pointer.
  *	Return
  *		*sk* if casting is valid, or NULL otherwise.
+ *
+ * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk)
+ *	Description
+ *		Dynamically cast a *sk* pointer to a *tcp_sock* pointer.
+ *	Return
+ *		*sk* if casting is valid, or NULL otherwise.
+ *
+ * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk)
+ * 	Description
+ *		Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer.
+ *	Return
+ *		*sk* if casting is valid, or NULL otherwise.
+ *
+ * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk)
+ * 	Description
+ *		Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer.
+ *	Return
+ *		*sk* if casting is valid, or NULL otherwise.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3399,7 +3417,10 @@ union bpf_attr {
 	FN(ringbuf_discard),		\
 	FN(ringbuf_query),		\
 	FN(csum_level),			\
-	FN(skc_to_tcp6_sock),
+	FN(skc_to_tcp6_sock),		\
+	FN(skc_to_tcp_sock),		\
+	FN(skc_to_tcp_timewait_sock),	\
+	FN(skc_to_tcp_request_sock),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
-- 
cgit v1.2.3


From 0d4fad3e57df2bf61e8ffc8d12a34b1caf9b8835 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 23 Jun 2020 16:08:15 -0700
Subject: bpf: Add bpf_skc_to_udp6_sock() helper

The helper is used in tracing programs to cast a socket
pointer to a udp6_sock pointer.
The return value could be NULL if the casting is illegal.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/bpf/20200623230815.3988481-1-yhs@fb.com
---
 include/linux/bpf.h            |  1 +
 include/uapi/linux/bpf.h       |  9 ++++++++-
 kernel/trace/bpf_trace.c       |  2 ++
 net/core/filter.c              | 22 ++++++++++++++++++++++
 scripts/bpf_helpers_doc.py     |  2 ++
 tools/include/uapi/linux/bpf.h |  9 ++++++++-
 6 files changed, 43 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c23998cf6699..3d2ade703a35 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1653,6 +1653,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto;
 extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto;
 extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto;
 extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto;
+extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto;
 
 const struct bpf_func_proto *bpf_tracing_func_proto(
 	enum bpf_func_id func_id, const struct bpf_prog *prog);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b9412ab275f3..0cb8ec948816 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3279,6 +3279,12 @@ union bpf_attr {
  *		Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer.
  *	Return
  *		*sk* if casting is valid, or NULL otherwise.
+ *
+ * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk)
+ * 	Description
+ *		Dynamically cast a *sk* pointer to a *udp6_sock* pointer.
+ *	Return
+ *		*sk* if casting is valid, or NULL otherwise.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3420,7 +3426,8 @@ union bpf_attr {
 	FN(skc_to_tcp6_sock),		\
 	FN(skc_to_tcp_sock),		\
 	FN(skc_to_tcp_timewait_sock),	\
-	FN(skc_to_tcp_request_sock),
+	FN(skc_to_tcp_request_sock),	\
+	FN(skc_to_udp6_sock),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 48d935b0d87c..5d59dda5f661 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1523,6 +1523,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_skc_to_tcp_timewait_sock_proto;
 	case BPF_FUNC_skc_to_tcp_request_sock:
 		return &bpf_skc_to_tcp_request_sock_proto;
+	case BPF_FUNC_skc_to_udp6_sock:
+		return &bpf_skc_to_udp6_sock_proto;
 #endif
 	case BPF_FUNC_seq_printf:
 		return prog->expected_attach_type == BPF_TRACE_ITER ?
diff --git a/net/core/filter.c b/net/core/filter.c
index 0b4e5aed7e20..c796e141ea8e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -9369,3 +9369,25 @@ const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = {
 	.check_btf_id		= check_arg_btf_id,
 	.ret_btf_id		= &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ],
 };
+
+BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk)
+{
+	/* udp6_sock type is not generated in dwarf and hence btf,
+	 * trigger an explicit type generation here.
+	 */
+	BTF_TYPE_EMIT(struct udp6_sock);
+	if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_UDP &&
+	    sk->sk_type == SOCK_DGRAM && sk->sk_family == AF_INET6)
+		return (unsigned long)sk;
+
+	return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = {
+	.func			= bpf_skc_to_udp6_sock,
+	.gpl_only		= false,
+	.ret_type		= RET_PTR_TO_BTF_ID_OR_NULL,
+	.arg1_type		= ARG_PTR_TO_BTF_ID,
+	.check_btf_id		= check_arg_btf_id,
+	.ret_btf_id		= &btf_sock_ids[BTF_SOCK_TYPE_UDP6],
+};
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index d886657c6aaa..6bab40ff442e 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -425,6 +425,7 @@ class PrinterHelpers(Printer):
             'struct tcp_sock',
             'struct tcp_timewait_sock',
             'struct tcp_request_sock',
+            'struct udp6_sock',
 
             'struct __sk_buff',
             'struct sk_msg_md',
@@ -466,6 +467,7 @@ class PrinterHelpers(Printer):
             'struct tcp_sock',
             'struct tcp_timewait_sock',
             'struct tcp_request_sock',
+            'struct udp6_sock',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index b9412ab275f3..0cb8ec948816 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3279,6 +3279,12 @@ union bpf_attr {
  *		Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer.
  *	Return
  *		*sk* if casting is valid, or NULL otherwise.
+ *
+ * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk)
+ * 	Description
+ *		Dynamically cast a *sk* pointer to a *udp6_sock* pointer.
+ *	Return
+ *		*sk* if casting is valid, or NULL otherwise.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3420,7 +3426,8 @@ union bpf_attr {
 	FN(skc_to_tcp6_sock),		\
 	FN(skc_to_tcp_sock),		\
 	FN(skc_to_tcp_timewait_sock),	\
-	FN(skc_to_tcp_request_sock),
+	FN(skc_to_tcp_request_sock),	\
+	FN(skc_to_udp6_sock),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
-- 
cgit v1.2.3


From 590d69796346353878b275c5512c664e3f875f24 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 19 Dec 2019 16:44:52 -0500
Subject: sched: Force the address order of each sched class descriptor

In order to make a micro optimization in pick_next_task(), the order of the
sched class descriptor address must be in the same order as their priority
to each other. That is:

 &idle_sched_class < &fair_sched_class < &rt_sched_class <
 &dl_sched_class < &stop_sched_class

In order to guarantee this order of the sched class descriptors, add each
one into their own data section and force the order in the linker script.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/157675913272.349305.8936736338884044103.stgit@localhost.localdomain
---
 include/asm-generic/vmlinux.lds.h | 13 +++++++++++++
 kernel/sched/deadline.c           |  3 ++-
 kernel/sched/fair.c               |  3 ++-
 kernel/sched/idle.c               |  3 ++-
 kernel/sched/rt.c                 |  3 ++-
 kernel/sched/stop_task.c          |  3 ++-
 6 files changed, 23 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index db600ef218d7..2186d7b01af6 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -108,6 +108,18 @@
 #define SBSS_MAIN .sbss
 #endif
 
+/*
+ * The order of the sched class addresses are important, as they are
+ * used to determine the order of the priority of each sched class in
+ * relation to each other.
+ */
+#define SCHED_DATA				\
+	*(__idle_sched_class)			\
+	*(__fair_sched_class)			\
+	*(__rt_sched_class)			\
+	*(__dl_sched_class)			\
+	*(__stop_sched_class)
+
 /*
  * Align to a 32 byte boundary equal to the
  * alignment gcc 4.5 uses for a struct
@@ -388,6 +400,7 @@
 	.rodata           : AT(ADDR(.rodata) - LOAD_OFFSET) {		\
 		__start_rodata = .;					\
 		*(.rodata) *(.rodata.*)					\
+		SCHED_DATA						\
 		RO_AFTER_INIT_DATA	/* Read only after init */	\
 		. = ALIGN(8);						\
 		__start___tracepoints_ptrs = .;				\
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d4708e29008f..d9e79462993b 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2479,7 +2479,8 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
 	}
 }
 
-const struct sched_class dl_sched_class = {
+const struct sched_class dl_sched_class
+	__attribute__((section("__dl_sched_class"))) = {
 	.next			= &rt_sched_class,
 	.enqueue_task		= enqueue_task_dl,
 	.dequeue_task		= dequeue_task_dl,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0424a0af5f87..3365f6b07c36 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11122,7 +11122,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
 /*
  * All the scheduling class methods:
  */
-const struct sched_class fair_sched_class = {
+const struct sched_class fair_sched_class
+	__attribute__((section("__fair_sched_class"))) = {
 	.next			= &idle_sched_class,
 	.enqueue_task		= enqueue_task_fair,
 	.dequeue_task		= dequeue_task_fair,
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8d75ca201484..f5806295356b 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -453,7 +453,8 @@ static void update_curr_idle(struct rq *rq)
 /*
  * Simple, special scheduling class for the per-CPU idle tasks:
  */
-const struct sched_class idle_sched_class = {
+const struct sched_class idle_sched_class
+	__attribute__((section("__idle_sched_class"))) = {
 	/* .next is NULL */
 	/* no enqueue/yield_task for idle tasks */
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f395ddb75f38..6543d4430331 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2429,7 +2429,8 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
 		return 0;
 }
 
-const struct sched_class rt_sched_class = {
+const struct sched_class rt_sched_class
+	__attribute__((section("__rt_sched_class"))) = {
 	.next			= &fair_sched_class,
 	.enqueue_task		= enqueue_task_rt,
 	.dequeue_task		= dequeue_task_rt,
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 3e50a6a8f1e5..f4bbd54caae0 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -109,7 +109,8 @@ static void update_curr_stop(struct rq *rq)
 /*
  * Simple, special scheduling class for the per-CPU stop tasks:
  */
-const struct sched_class stop_sched_class = {
+const struct sched_class stop_sched_class
+	__attribute__((section("__stop_sched_class"))) = {
 	.next			= &dl_sched_class,
 
 	.enqueue_task		= enqueue_task_stop,
-- 
cgit v1.2.3


From c3a340f7e7eadac7662ab104ceb16432e5a4c6b2 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 19 Dec 2019 16:44:53 -0500
Subject: sched: Have sched_class_highest define by vmlinux.lds.h

Now that the sched_class descriptors are defined by the linker script, and
this needs to be aware of the existance of stop_sched_class when SMP is
enabled or not, as it is used as the "highest" priority when defined. Move
the declaration of sched_class_highest to the same location in the linker
script that inserts stop_sched_class, and this will also make it easier to
see what should be defined as the highest class, as this linker script
location defines the priorities as well.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20191219214558.682913590@goodmis.org
---
 include/asm-generic/vmlinux.lds.h |  5 ++++-
 kernel/sched/core.c               |  8 ++++++++
 kernel/sched/sched.h              | 17 +++++++++--------
 3 files changed, 21 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 2186d7b01af6..66fb84c3dc7e 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -114,11 +114,14 @@
  * relation to each other.
  */
 #define SCHED_DATA				\
+	STRUCT_ALIGN();				\
+	__begin_sched_classes = .;		\
 	*(__idle_sched_class)			\
 	*(__fair_sched_class)			\
 	*(__rt_sched_class)			\
 	*(__dl_sched_class)			\
-	*(__stop_sched_class)
+	*(__stop_sched_class)			\
+	__end_sched_classes = .;
 
 /*
  * Align to a 32 byte boundary equal to the
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0208b71bef80..81640fe0eae8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6646,6 +6646,14 @@ void __init sched_init(void)
 	unsigned long ptr = 0;
 	int i;
 
+	/* Make sure the linker didn't screw up */
+	BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
+	       &fair_sched_class + 1 != &rt_sched_class ||
+	       &rt_sched_class + 1   != &dl_sched_class);
+#ifdef CONFIG_SMP
+	BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
+#endif
+
 	wait_bit_init();
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 336887607b3d..4165c06d1d7b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1811,7 +1811,7 @@ struct sched_class {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	void (*task_change_group)(struct task_struct *p, int type);
 #endif
-};
+} __aligned(32); /* STRUCT_ALIGN(), vmlinux.lds.h */
 
 static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
@@ -1825,17 +1825,18 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next)
 	next->sched_class->set_next_task(rq, next, false);
 }
 
-#ifdef CONFIG_SMP
-#define sched_class_highest (&stop_sched_class)
-#else
-#define sched_class_highest (&dl_sched_class)
-#endif
+/* Defined in include/asm-generic/vmlinux.lds.h */
+extern struct sched_class __begin_sched_classes[];
+extern struct sched_class __end_sched_classes[];
+
+#define sched_class_highest (__end_sched_classes - 1)
+#define sched_class_lowest  (__begin_sched_classes - 1)
 
 #define for_class_range(class, _from, _to) \
-	for (class = (_from); class != (_to); class = class->next)
+	for (class = (_from); class != (_to); class--)
 
 #define for_each_class(class) \
-	for_class_range(class, sched_class_highest, NULL)
+	for_class_range(class, sched_class_highest, sched_class_lowest)
 
 extern const struct sched_class stop_sched_class;
 extern const struct sched_class dl_sched_class;
-- 
cgit v1.2.3


From a87e749e8fa1aaef9b4db32e21c2795e69ce67bf Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 19 Dec 2019 16:44:54 -0500
Subject: sched: Remove struct sched_class::next field

Now that the sched_class descriptors are defined in order via the linker
script vmlinux.lds.h, there's no reason to have a "next" pointer to the
previous priroity structure. The order of the sturctures can be aligned as
an array, and used to index and find the next sched_class descriptor.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20191219214558.845353593@goodmis.org
---
 kernel/sched/deadline.c  | 1 -
 kernel/sched/fair.c      | 1 -
 kernel/sched/idle.c      | 1 -
 kernel/sched/rt.c        | 1 -
 kernel/sched/sched.h     | 1 -
 kernel/sched/stop_task.c | 1 -
 6 files changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d9e79462993b..c9cc1d6fa363 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2481,7 +2481,6 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
 
 const struct sched_class dl_sched_class
 	__attribute__((section("__dl_sched_class"))) = {
-	.next			= &rt_sched_class,
 	.enqueue_task		= enqueue_task_dl,
 	.dequeue_task		= dequeue_task_dl,
 	.yield_task		= yield_task_dl,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3365f6b07c36..a63f400013de 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11124,7 +11124,6 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
  */
 const struct sched_class fair_sched_class
 	__attribute__((section("__fair_sched_class"))) = {
-	.next			= &idle_sched_class,
 	.enqueue_task		= enqueue_task_fair,
 	.dequeue_task		= dequeue_task_fair,
 	.yield_task		= yield_task_fair,
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index f5806295356b..336d478bddc8 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -455,7 +455,6 @@ static void update_curr_idle(struct rq *rq)
  */
 const struct sched_class idle_sched_class
 	__attribute__((section("__idle_sched_class"))) = {
-	/* .next is NULL */
 	/* no enqueue/yield_task for idle tasks */
 
 	/* dequeue is not valid, we print a debug message there: */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 6543d4430331..f215eea6a966 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2431,7 +2431,6 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
 
 const struct sched_class rt_sched_class
 	__attribute__((section("__rt_sched_class"))) = {
-	.next			= &fair_sched_class,
 	.enqueue_task		= enqueue_task_rt,
 	.dequeue_task		= dequeue_task_rt,
 	.yield_task		= yield_task_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4165c06d1d7b..549e7e6e0a66 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1754,7 +1754,6 @@ extern const u32		sched_prio_to_wmult[40];
 #define RETRY_TASK		((void *)-1UL)
 
 struct sched_class {
-	const struct sched_class *next;
 
 #ifdef CONFIG_UCLAMP_TASK
 	int uclamp_enabled;
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index f4bbd54caae0..394bc8126a1e 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -111,7 +111,6 @@ static void update_curr_stop(struct rq *rq)
  */
 const struct sched_class stop_sched_class
 	__attribute__((section("__stop_sched_class"))) = {
-	.next			= &dl_sched_class,
 
 	.enqueue_task		= enqueue_task_stop,
 	.dequeue_task		= dequeue_task_stop,
-- 
cgit v1.2.3


From aa93cd53bc1b91b5f99c7b55e3dcc1ac98e99558 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <tkhai@yandex.ru>
Date: Thu, 19 Dec 2019 16:44:55 -0500
Subject: sched: Micro optimization in pick_next_task() and in
 check_preempt_curr()

This introduces an optimization based on xxx_sched_class addresses
in two hot scheduler functions: pick_next_task() and check_preempt_curr().

It is possible to compare pointers to sched classes to check, which
of them has a higher priority, instead of current iterations using
for_each_class().

One more result of the patch is that size of object file becomes a little
less (excluding added BUG_ON(), which goes in __init section):

$size kernel/sched/core.o
         text     data      bss	    dec	    hex	filename
before:  66446    18957	    676	  86079	  1503f	kernel/sched/core.o
after:   66398    18957	    676	  86031	  1500f	kernel/sched/core.o

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/711a9c4b-ff32-1136-b848-17c622d548f3@yandex.ru
---
 kernel/sched/core.c | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 81640fe0eae8..0a08d0c235d4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1412,20 +1412,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 
 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 {
-	const struct sched_class *class;
-
-	if (p->sched_class == rq->curr->sched_class) {
+	if (p->sched_class == rq->curr->sched_class)
 		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
-	} else {
-		for_each_class(class) {
-			if (class == rq->curr->sched_class)
-				break;
-			if (class == p->sched_class) {
-				resched_curr(rq);
-				break;
-			}
-		}
-	}
+	else if (p->sched_class > rq->curr->sched_class)
+		resched_curr(rq);
 
 	/*
 	 * A queue event has occurred, and we're going to schedule.  In
@@ -4003,8 +3993,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	 * higher scheduling class, because otherwise those loose the
 	 * opportunity to pull in more work from other CPUs.
 	 */
-	if (likely((prev->sched_class == &idle_sched_class ||
-		    prev->sched_class == &fair_sched_class) &&
+	if (likely(prev->sched_class <= &fair_sched_class &&
 		   rq->nr_running == rq->cfs.h_nr_running)) {
 
 		p = pick_next_task_fair(rq, prev, rf);
-- 
cgit v1.2.3


From 423d02e1463b21109106f52d94f7396b63731f3b Mon Sep 17 00:00:00 2001
From: Peng Wang <rocking@linux.alibaba.com>
Date: Tue, 16 Jun 2020 14:04:07 +0800
Subject: sched/fair: Optimize dequeue_task_fair()

While looking at enqueue_task_fair and dequeue_task_fair, it occurred
to me that dequeue_task_fair can also be optimized as Vincent described
in commit 7d148be69e3a ("sched/fair: Optimize enqueue_task_fair()").

When encountering throttled cfs_rq, dequeue_throttle label can ensure
se not to be NULL, and rq->nr_running remains unchanged, so we can also
skip the early balance check.

Signed-off-by: Peng Wang <rocking@linux.alibaba.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/701eef9a40de93dcf5fe7063fd607bca5db38e05.1592287263.git.rocking@linux.alibaba.com
---
 kernel/sched/fair.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a63f400013de..b9b9f19e80c1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5624,14 +5624,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 	}
 
-dequeue_throttle:
-	if (!se)
-		sub_nr_running(rq, 1);
+	/* At this point se is NULL and we are at root level*/
+	sub_nr_running(rq, 1);
 
 	/* balance early to pull high priority tasks */
 	if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
 		rq->next_balance = jiffies;
 
+dequeue_throttle:
 	util_est_dequeue(&rq->cfs, p, task_sleep);
 	hrtick_update(rq);
 }
-- 
cgit v1.2.3


From bba18a1af33e0f0a1ad7d028208b306ef3f3df12 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 18 Jun 2020 19:47:50 +0300
Subject: console: Propagate error code from console ->setup()

Since console ->setup() hook returns meaningful error codes,
propagate it to the caller of try_enable_new_console().

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20200618164751.56828-6-andriy.shevchenko@linux.intel.com
---
 kernel/printk/printk.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 8c14835be46c..aaea3ad182e1 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2668,7 +2668,7 @@ early_param("keep_bootcon", keep_bootcon_setup);
 static int try_enable_new_console(struct console *newcon, bool user_specified)
 {
 	struct console_cmdline *c;
-	int i;
+	int i, err;
 
 	for (i = 0, c = console_cmdline;
 	     i < MAX_CMDLINECONSOLES && c->name[0];
@@ -2691,8 +2691,8 @@ static int try_enable_new_console(struct console *newcon, bool user_specified)
 				return 0;
 
 			if (newcon->setup &&
-			    newcon->setup(newcon, c->options) != 0)
-				return -EIO;
+			    (err = newcon->setup(newcon, c->options)) != 0)
+				return err;
 		}
 		newcon->flags |= CON_ENABLED;
 		if (i == preferred_console) {
-- 
cgit v1.2.3


From 504603767ab639c47912be08d62e02b43125e82e Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 18 Jun 2020 19:47:51 +0300
Subject: console: Fix trivia typo 'change' -> 'chance'

I bet the word 'chance' has to be used in 'had a chance to be called',
but, alas, I'm not native speaker...

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20200618164751.56828-7-andriy.shevchenko@linux.intel.com
---
 kernel/printk/printk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index aaea3ad182e1..6623e975675a 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2705,7 +2705,7 @@ static int try_enable_new_console(struct console *newcon, bool user_specified)
 	/*
 	 * Some consoles, such as pstore and netconsole, can be enabled even
 	 * without matching. Accept the pre-enabled consoles only when match()
-	 * and setup() had a change to be called.
+	 * and setup() had a chance to be called.
 	 */
 	if (newcon->flags & CON_ENABLED && c->user_specified ==	user_specified)
 		return 0;
-- 
cgit v1.2.3


From f3bdc62fd82ed93dbe4d049eacba310de7eb2a6a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 17 Jun 2020 15:58:23 +0200
Subject: blktrace: Provide event for request merging

Currently blk-mq does not report any event when two requests get merged
in the elevator. This then results in difficult to understand sequence
of events like:

...
  8,0   34     1579     0.608765271  2718  I  WS 215023504 + 40 [dbench]
  8,0   34     1584     0.609184613  2719  A  WS 215023544 + 56 <- (8,4) 2160568
  8,0   34     1585     0.609184850  2719  Q  WS 215023544 + 56 [dbench]
  8,0   34     1586     0.609188524  2719  G  WS 215023544 + 56 [dbench]
  8,0    3      602     0.609684162   773  D  WS 215023504 + 96 [kworker/3:1H]
  8,0   34     1591     0.609843593     0  C  WS 215023504 + 96 [0]

and you can only guess (after quite some headscratching since the above
excerpt is intermixed with a lot of other IO) that request 215023544+56
got merged to request 215023504+40. Provide proper event for request
merging like we used to do in the legacy block layer.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-merge.c            |  2 ++
 include/trace/events/block.h | 15 +++++++++++++++
 kernel/trace/blktrace.c      | 10 ++++++++++
 3 files changed, 27 insertions(+)

(limited to 'kernel')

diff --git a/block/blk-merge.c b/block/blk-merge.c
index f0b0bae075a0..9c9fb21584b6 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -793,6 +793,8 @@ static struct request *attempt_merge(struct request_queue *q,
 	 */
 	blk_account_io_merge_request(next);
 
+	trace_block_rq_merge(q, next);
+
 	/*
 	 * ownership of bio passed from next to req, return 'next' for
 	 * the caller to free
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 93b114226af8..34d64ca306b1 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -211,6 +211,21 @@ DEFINE_EVENT(block_rq, block_rq_issue,
 	TP_ARGS(q, rq)
 );
 
+/**
+ * block_rq_merge - merge request with another one in the elevator
+ * @q: queue holding operation
+ * @rq: block IO operation operation request
+ *
+ * Called when block operation request @rq from queue @q is merged to another
+ * request queued in the elevator.
+ */
+DEFINE_EVENT(block_rq, block_rq_merge,
+
+	TP_PROTO(struct request_queue *q, struct request *rq),
+
+	TP_ARGS(q, rq)
+);
+
 /**
  * block_bio_bounce - used bounce buffer when processing block operation
  * @q: queue holding the block operation
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c086c38f4954..7ba62d68885a 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -860,6 +860,13 @@ static void blk_add_trace_rq_issue(void *ignore,
 			 blk_trace_request_get_cgid(q, rq));
 }
 
+static void blk_add_trace_rq_merge(void *ignore,
+				   struct request_queue *q, struct request *rq)
+{
+	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE,
+			 blk_trace_request_get_cgid(q, rq));
+}
+
 static void blk_add_trace_rq_requeue(void *ignore,
 				     struct request_queue *q,
 				     struct request *rq)
@@ -1144,6 +1151,8 @@ static void blk_register_tracepoints(void)
 	WARN_ON(ret);
 	ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
 	WARN_ON(ret);
+	ret = register_trace_block_rq_merge(blk_add_trace_rq_merge, NULL);
+	WARN_ON(ret);
 	ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
 	WARN_ON(ret);
 	ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
@@ -1190,6 +1199,7 @@ static void blk_unregister_tracepoints(void)
 	unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
 	unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
 	unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
+	unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL);
 	unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
 	unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
 
-- 
cgit v1.2.3


From 985098a05eee6bf5caca7e997e02a5b15242cfa0 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Tue, 23 Jun 2020 09:09:10 +0200
Subject: docs: fix references for DMA*.txt files

As we moved those files to core-api, fix references to point
to their newer locations.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Link: https://lore.kernel.org/r/37b2fd159fbc7655dbf33b3eb1215396a25f6344.1592895969.git.mchehab+huawei@kernel.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/PCI/pci.rst                            |  6 +++---
 Documentation/block/biodoc.rst                       |  2 +-
 Documentation/bus-virt-phys-mapping.txt              |  2 +-
 Documentation/core-api/dma-api.rst                   |  6 +++---
 Documentation/core-api/dma-isa-lpc.rst               |  2 +-
 Documentation/driver-api/usb/dma.rst                 |  6 +++---
 Documentation/translations/ko_KR/memory-barriers.txt |  6 +++---
 arch/ia64/hp/common/sba_iommu.c                      | 12 ++++++------
 arch/parisc/kernel/pci-dma.c                         |  2 +-
 arch/x86/include/asm/dma-mapping.h                   |  4 ++--
 arch/x86/kernel/amd_gart_64.c                        |  2 +-
 drivers/parisc/sba_iommu.c                           | 14 +++++++-------
 include/linux/dma-mapping.h                          |  2 +-
 include/media/videobuf-dma-sg.h                      |  2 +-
 kernel/dma/debug.c                                   |  2 +-
 15 files changed, 35 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/PCI/pci.rst b/Documentation/PCI/pci.rst
index 8c016d8c9862..d10d3fe604c5 100644
--- a/Documentation/PCI/pci.rst
+++ b/Documentation/PCI/pci.rst
@@ -265,7 +265,7 @@ Set the DMA mask size
 ---------------------
 .. note::
    If anything below doesn't make sense, please refer to
-   Documentation/DMA-API.txt. This section is just a reminder that
+   :doc:`/core-api/dma-api`. This section is just a reminder that
    drivers need to indicate DMA capabilities of the device and is not
    an authoritative source for DMA interfaces.
 
@@ -291,7 +291,7 @@ Many 64-bit "PCI" devices (before PCI-X) and some PCI-X devices are
 Setup shared control data
 -------------------------
 Once the DMA masks are set, the driver can allocate "consistent" (a.k.a. shared)
-memory.  See Documentation/DMA-API.txt for a full description of
+memory.  See :doc:`/core-api/dma-api` for a full description of
 the DMA APIs. This section is just a reminder that it needs to be done
 before enabling DMA on the device.
 
@@ -421,7 +421,7 @@ owners if there is one.
 
 Then clean up "consistent" buffers which contain the control data.
 
-See Documentation/DMA-API.txt for details on unmapping interfaces.
+See :doc:`/core-api/dma-api` for details on unmapping interfaces.
 
 
 Unregister from other subsystems
diff --git a/Documentation/block/biodoc.rst b/Documentation/block/biodoc.rst
index b964796ec9c7..ba7f45d0271c 100644
--- a/Documentation/block/biodoc.rst
+++ b/Documentation/block/biodoc.rst
@@ -196,7 +196,7 @@ a virtual address mapping (unlike the earlier scheme of virtual address
 do not have a corresponding kernel virtual address space mapping) and
 low-memory pages.
 
-Note: Please refer to Documentation/DMA-API-HOWTO.txt for a discussion
+Note: Please refer to :doc:`/core-api/dma-api-howto` for a discussion
 on PCI high mem DMA aspects and mapping of scatter gather lists, and support
 for 64 bit PCI.
 
diff --git a/Documentation/bus-virt-phys-mapping.txt b/Documentation/bus-virt-phys-mapping.txt
index 4bb07c2f3e7d..c7bc99cd2e21 100644
--- a/Documentation/bus-virt-phys-mapping.txt
+++ b/Documentation/bus-virt-phys-mapping.txt
@@ -8,7 +8,7 @@ How to access I/O mapped memory from within device drivers
 
 	The virt_to_bus() and bus_to_virt() functions have been
 	superseded by the functionality provided by the PCI DMA interface
-	(see Documentation/DMA-API-HOWTO.txt).  They continue
+	(see :doc:`/core-api/dma-api-howto`).  They continue
 	to be documented below for historical purposes, but new code
 	must not use them. --davidm 00/12/12
 
diff --git a/Documentation/core-api/dma-api.rst b/Documentation/core-api/dma-api.rst
index 2d8d2fed7317..63b4a2f20867 100644
--- a/Documentation/core-api/dma-api.rst
+++ b/Documentation/core-api/dma-api.rst
@@ -5,7 +5,7 @@ Dynamic DMA mapping using the generic device
 :Author: James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
 
 This document describes the DMA API.  For a more gentle introduction
-of the API (and actual examples), see Documentation/DMA-API-HOWTO.txt.
+of the API (and actual examples), see :doc:`/core-api/dma-api-howto`.
 
 This API is split into two pieces.  Part I describes the basic API.
 Part II describes extensions for supporting non-consistent memory
@@ -471,7 +471,7 @@ without the _attrs suffixes, except that they pass an optional
 dma_attrs.
 
 The interpretation of DMA attributes is architecture-specific, and
-each attribute should be documented in Documentation/DMA-attributes.txt.
+each attribute should be documented in :doc:`/core-api/dma-attributes`.
 
 If dma_attrs are 0, the semantics of each of these functions
 is identical to those of the corresponding function
@@ -484,7 +484,7 @@ for DMA::
 
 	#include <linux/dma-mapping.h>
 	/* DMA_ATTR_FOO should be defined in linux/dma-mapping.h and
-	* documented in Documentation/DMA-attributes.txt */
+	* documented in Documentation/core-api/dma-attributes.rst */
 	...
 
 		unsigned long attr;
diff --git a/Documentation/core-api/dma-isa-lpc.rst b/Documentation/core-api/dma-isa-lpc.rst
index b1ec7b16c21f..e59a3d35a93d 100644
--- a/Documentation/core-api/dma-isa-lpc.rst
+++ b/Documentation/core-api/dma-isa-lpc.rst
@@ -17,7 +17,7 @@ To do ISA style DMA you need to include two headers::
 	#include <asm/dma.h>
 
 The first is the generic DMA API used to convert virtual addresses to
-bus addresses (see Documentation/DMA-API.txt for details).
+bus addresses (see :doc:`/core-api/dma-api` for details).
 
 The second contains the routines specific to ISA DMA transfers. Since
 this is not present on all platforms make sure you construct your
diff --git a/Documentation/driver-api/usb/dma.rst b/Documentation/driver-api/usb/dma.rst
index 59d5aee89e37..2b3dbd3265b4 100644
--- a/Documentation/driver-api/usb/dma.rst
+++ b/Documentation/driver-api/usb/dma.rst
@@ -10,7 +10,7 @@ API overview
 
 The big picture is that USB drivers can continue to ignore most DMA issues,
 though they still must provide DMA-ready buffers (see
-``Documentation/DMA-API-HOWTO.txt``).  That's how they've worked through
+:doc:`/core-api/dma-api-howto`).  That's how they've worked through
 the 2.4 (and earlier) kernels, or they can now be DMA-aware.
 
 DMA-aware usb drivers:
@@ -60,7 +60,7 @@ and effects like cache-trashing can impose subtle penalties.
   force a consistent memory access ordering by using memory barriers.  It's
   not using a streaming DMA mapping, so it's good for small transfers on
   systems where the I/O would otherwise thrash an IOMMU mapping.  (See
-  ``Documentation/DMA-API-HOWTO.txt`` for definitions of "coherent" and
+  :doc:`/core-api/dma-api-howto` for definitions of "coherent" and
   "streaming" DMA mappings.)
 
   Asking for 1/Nth of a page (as well as asking for N pages) is reasonably
@@ -91,7 +91,7 @@ Working with existing buffers
 Existing buffers aren't usable for DMA without first being mapped into the
 DMA address space of the device.  However, most buffers passed to your
 driver can safely be used with such DMA mapping.  (See the first section
-of Documentation/DMA-API-HOWTO.txt, titled "What memory is DMA-able?")
+of :doc:`/core-api/dma-api-howto`, titled "What memory is DMA-able?")
 
 - When you're using scatterlists, you can map everything at once.  On some
   systems, this kicks in an IOMMU and turns the scatterlists into single
diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt
index 34d041d68f78..604cee350e53 100644
--- a/Documentation/translations/ko_KR/memory-barriers.txt
+++ b/Documentation/translations/ko_KR/memory-barriers.txt
@@ -570,8 +570,8 @@ ACQUIRE 는 해당 오퍼레이션의 로드 부분에만 적용되고 RELEASE 
 	[*] 버스 마스터링 DMA 와 일관성에 대해서는 다음을 참고하시기 바랍니다:
 
 	    Documentation/driver-api/pci/pci.rst
-	    Documentation/DMA-API-HOWTO.txt
-	    Documentation/DMA-API.txt
+	    Documentation/core-api/dma-api-howto.rst
+	    Documentation/core-api/dma-api.rst
 
 
 데이터 의존성 배리어 (역사적)
@@ -1907,7 +1907,7 @@ Mandatory 배리어들은 SMP 시스템에서도 UP 시스템에서도 SMP 효
 
      writel_relaxed() 와 같은 완화된 I/O 접근자들에 대한 자세한 내용을 위해서는
      "커널 I/O 배리어의 효과" 섹션을, consistent memory 에 대한 자세한 내용을
-     위해선 Documentation/DMA-API.txt 문서를 참고하세요.
+     위해선 Documentation/core-api/dma-api.rst 문서를 참고하세요.
 
 
 =========================
diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index a806227c1fad..656a4888c300 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -907,7 +907,7 @@ sba_mark_invalid(struct ioc *ioc, dma_addr_t iova, size_t byte_cnt)
  * @dir: dma direction
  * @attrs: optional dma attributes
  *
- * See Documentation/DMA-API-HOWTO.txt
+ * See Documentation/core-api/dma-api-howto.rst
  */
 static dma_addr_t sba_map_page(struct device *dev, struct page *page,
 			       unsigned long poff, size_t size,
@@ -1028,7 +1028,7 @@ sba_mark_clean(struct ioc *ioc, dma_addr_t iova, size_t size)
  * @dir:  R/W or both.
  * @attrs: optional dma attributes
  *
- * See Documentation/DMA-API-HOWTO.txt
+ * See Documentation/core-api/dma-api-howto.rst
  */
 static void sba_unmap_page(struct device *dev, dma_addr_t iova, size_t size,
 			   enum dma_data_direction dir, unsigned long attrs)
@@ -1105,7 +1105,7 @@ static void sba_unmap_page(struct device *dev, dma_addr_t iova, size_t size,
  * @size:  number of bytes mapped in driver buffer.
  * @dma_handle:  IOVA of new buffer.
  *
- * See Documentation/DMA-API-HOWTO.txt
+ * See Documentation/core-api/dma-api-howto.rst
  */
 static void *
 sba_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
@@ -1162,7 +1162,7 @@ sba_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
  * @vaddr:  virtual address IOVA of "consistent" buffer.
  * @dma_handler:  IO virtual address of "consistent" buffer.
  *
- * See Documentation/DMA-API-HOWTO.txt
+ * See Documentation/core-api/dma-api-howto.rst
  */
 static void sba_free_coherent(struct device *dev, size_t size, void *vaddr,
 			      dma_addr_t dma_handle, unsigned long attrs)
@@ -1425,7 +1425,7 @@ static void sba_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist,
  * @dir:  R/W or both.
  * @attrs: optional dma attributes
  *
- * See Documentation/DMA-API-HOWTO.txt
+ * See Documentation/core-api/dma-api-howto.rst
  */
 static int sba_map_sg_attrs(struct device *dev, struct scatterlist *sglist,
 			    int nents, enum dma_data_direction dir,
@@ -1524,7 +1524,7 @@ static int sba_map_sg_attrs(struct device *dev, struct scatterlist *sglist,
  * @dir:  R/W or both.
  * @attrs: optional dma attributes
  *
- * See Documentation/DMA-API-HOWTO.txt
+ * See Documentation/core-api/dma-api-howto.rst
  */
 static void sba_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist,
 			       int nents, enum dma_data_direction dir,
diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c
index 70cd24bdcfec..4f1596bb1936 100644
--- a/arch/parisc/kernel/pci-dma.c
+++ b/arch/parisc/kernel/pci-dma.c
@@ -3,7 +3,7 @@
 ** PARISC 1.1 Dynamic DMA mapping support.
 ** This implementation is for PA-RISC platforms that do not support
 ** I/O TLBs (aka DMA address translation hardware).
-** See Documentation/DMA-API-HOWTO.txt for interface definitions.
+** See Documentation/core-api/dma-api-howto.rst for interface definitions.
 **
 **      (c) Copyright 1999,2000 Hewlett-Packard Company
 **      (c) Copyright 2000 Grant Grundler
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 6b15a24930e0..fed67eafcacc 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -3,8 +3,8 @@
 #define _ASM_X86_DMA_MAPPING_H
 
 /*
- * IOMMU interface. See Documentation/DMA-API-HOWTO.txt and
- * Documentation/DMA-API.txt for documentation.
+ * IOMMU interface. See Documentation/core-api/dma-api-howto.rst and
+ * Documentation/core-api/dma-api.rst for documentation.
  */
 
 #include <linux/scatterlist.h>
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 17cb5b933dcf..e89031e9c847 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -6,7 +6,7 @@
  * This allows to use PCI devices that only support 32bit addresses on systems
  * with more than 4GB.
  *
- * See Documentation/DMA-API-HOWTO.txt for the interface specification.
+ * See Documentation/core-api/dma-api-howto.rst for the interface specification.
  *
  * Copyright 2002 Andi Kleen, SuSE Labs.
  */
diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c
index 7e112829d250..5368452eb5a6 100644
--- a/drivers/parisc/sba_iommu.c
+++ b/drivers/parisc/sba_iommu.c
@@ -666,7 +666,7 @@ sba_mark_invalid(struct ioc *ioc, dma_addr_t iova, size_t byte_cnt)
  * @dev: instance of PCI owned by the driver that's asking
  * @mask:  number of address bits this PCI device can handle
  *
- * See Documentation/DMA-API-HOWTO.txt
+ * See Documentation/core-api/dma-api-howto.rst
  */
 static int sba_dma_supported( struct device *dev, u64 mask)
 {
@@ -698,7 +698,7 @@ static int sba_dma_supported( struct device *dev, u64 mask)
  * @size:  number of bytes to map in driver buffer.
  * @direction:  R/W or both.
  *
- * See Documentation/DMA-API-HOWTO.txt
+ * See Documentation/core-api/dma-api-howto.rst
  */
 static dma_addr_t
 sba_map_single(struct device *dev, void *addr, size_t size,
@@ -788,7 +788,7 @@ sba_map_page(struct device *dev, struct page *page, unsigned long offset,
  * @size:  number of bytes mapped in driver buffer.
  * @direction:  R/W or both.
  *
- * See Documentation/DMA-API-HOWTO.txt
+ * See Documentation/core-api/dma-api-howto.rst
  */
 static void
 sba_unmap_page(struct device *dev, dma_addr_t iova, size_t size,
@@ -867,7 +867,7 @@ sba_unmap_page(struct device *dev, dma_addr_t iova, size_t size,
  * @size:  number of bytes mapped in driver buffer.
  * @dma_handle:  IOVA of new buffer.
  *
- * See Documentation/DMA-API-HOWTO.txt
+ * See Documentation/core-api/dma-api-howto.rst
  */
 static void *sba_alloc(struct device *hwdev, size_t size, dma_addr_t *dma_handle,
 		gfp_t gfp, unsigned long attrs)
@@ -898,7 +898,7 @@ static void *sba_alloc(struct device *hwdev, size_t size, dma_addr_t *dma_handle
  * @vaddr:  virtual address IOVA of "consistent" buffer.
  * @dma_handler:  IO virtual address of "consistent" buffer.
  *
- * See Documentation/DMA-API-HOWTO.txt
+ * See Documentation/core-api/dma-api-howto.rst
  */
 static void
 sba_free(struct device *hwdev, size_t size, void *vaddr,
@@ -933,7 +933,7 @@ int dump_run_sg = 0;
  * @nents:  number of entries in list
  * @direction:  R/W or both.
  *
- * See Documentation/DMA-API-HOWTO.txt
+ * See Documentation/core-api/dma-api-howto.rst
  */
 static int
 sba_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
@@ -1017,7 +1017,7 @@ sba_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
  * @nents:  number of entries in list
  * @direction:  R/W or both.
  *
- * See Documentation/DMA-API-HOWTO.txt
+ * See Documentation/core-api/dma-api-howto.rst
  */
 static void 
 sba_unmap_sg(struct device *dev, struct scatterlist *sglist, int nents,
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 78f677cf45ab..ef2b153ddbd9 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -14,7 +14,7 @@
 
 /**
  * List of possible attributes associated with a DMA mapping. The semantics
- * of each attribute should be defined in Documentation/DMA-attributes.txt.
+ * of each attribute should be defined in Documentation/core-api/dma-attributes.rst.
  */
 
 /*
diff --git a/include/media/videobuf-dma-sg.h b/include/media/videobuf-dma-sg.h
index b89d5e31f172..34450f7ad510 100644
--- a/include/media/videobuf-dma-sg.h
+++ b/include/media/videobuf-dma-sg.h
@@ -31,7 +31,7 @@
  *	does memory allocation too using vmalloc_32().
  *
  * videobuf_dma_*()
- *	see Documentation/DMA-API-HOWTO.txt, these functions to
+ *	see Documentation/core-api/dma-api-howto.rst, these functions to
  *	basically the same.  The map function does also build a
  *	scatterlist for the buffer (and unmap frees it ...)
  *
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 36c962a86bf2..f97f088ace7e 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -1071,7 +1071,7 @@ static void check_unmap(struct dma_debug_entry *ref)
 	/*
 	 * Drivers should use dma_mapping_error() to check the returned
 	 * addresses of dma_map_single() and dma_map_page().
-	 * If not, print this warning message. See Documentation/DMA-API.txt.
+	 * If not, print this warning message. See Documentation/core-api/dma-api.rst.
 	 */
 	if (entry->map_err_type == MAP_ERR_NOT_CHECKED) {
 		err_printk(ref->dev, entry,
-- 
cgit v1.2.3


From c791cc4b1feb881b3644c059dba5464b076bf592 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 16 Jun 2020 14:53:55 -0400
Subject: tracing: Only allow trace_array_printk() to be used by instances

To prevent default "trace_printks()" from spamming the top level tracing
ring buffer, only allow trace instances to use trace_array_printk() (which
can be used without the trace_printk() start up warning).

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bb62269724d5..8241d1448d70 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3346,12 +3346,16 @@ int trace_array_printk(struct trace_array *tr,
 	int ret;
 	va_list ap;
 
-	if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
-		return 0;
-
 	if (!tr)
 		return -ENOENT;
 
+	/* This is only allowed for created instances */
+	if (tr == &global_trace)
+		return 0;
+
+	if (!(tr->trace_flags & TRACE_ITER_PRINTK))
+		return 0;
+
 	va_start(ap, fmt);
 	ret = trace_array_vprintk(tr, ip, fmt, ap);
 	va_end(ap);
-- 
cgit v1.2.3


From 7582f30cc9fbcdcd630398bea010e5f6a2e2fcab Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 27 Jun 2020 09:31:56 +0200
Subject: cgroup: unexport cgroup_rstat_updated

cgroup_rstat_updated is only used by core block code, no need to
export it.

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/cgroup/rstat.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index b6397a186ce9..d51175cedfca 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -64,7 +64,6 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
 
 	raw_spin_unlock_irqrestore(cpu_lock, flags);
 }
-EXPORT_SYMBOL_GPL(cgroup_rstat_updated);
 
 /**
  * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
-- 
cgit v1.2.3


From 5da7cd11d0811c35a6988d416053b5421bc61521 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 22 Apr 2020 12:25:41 -0400
Subject: x86/ftrace: Only have the builtin ftrace_regs_caller call direct
 hooks

If a direct hook is attached to a function that ftrace also has a function
attached to it, then it is required that the ftrace_ops_list_func() is used
to iterate over the registered ftrace callbacks. This will also include the
direct ftrace_ops helper, that tells ftrace_regs_caller where to return to
(the direct callback and not the function that called it).

As this direct helper is only to handle the case of ftrace callbacks
attached to the same function as the direct callback, the ftrace callback
allocated trampolines (used to only call them), should never be used to
return back to a direct callback.

Only copy the portion of the ftrace_regs_caller that will return back to
what called it, and not the portion that returns back to the direct caller.

The direct ftrace_ops must then pick the ftrace_regs_caller builtin function
as its own trampoline to ensure that it will never have one allocated for
it (which would not include the handling of direct callbacks).

Link: http://lkml.kernel.org/r/20200422162750.495903799@goodmis.org

Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 arch/x86/kernel/ftrace.c    | 7 -------
 arch/x86/kernel/ftrace_64.S | 3 +--
 kernel/trace/ftrace.c       | 8 ++++++++
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 51504566b3a6..d1a0190fef5b 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -367,13 +367,6 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
 	if (WARN_ON(ret < 0))
 		goto fail;
 
-	if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
-		ip = trampoline + (ftrace_regs_caller_ret - ftrace_regs_caller);
-		ret = copy_from_kernel_nofault(ip, (void *)retq, RET_SIZE);
-		if (WARN_ON(ret < 0))
-			goto fail;
-	}
-
 	/*
 	 * The address of the ftrace_ops that is used for this trampoline
 	 * is stored at the end of the trampoline. This will be used to
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 3ba32cc58f01..43cf9a2b52c7 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -253,7 +253,7 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
 	 * The trampoline will add the code to jump
 	 * to the return.
 	 */
-SYM_INNER_LABEL(ftrace_regs_caller_ret, SYM_L_GLOBAL)
+SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL)
 	jmp ftrace_epilogue
 
 	/* Swap the flags with orig_rax */
@@ -264,7 +264,6 @@ SYM_INNER_LABEL(ftrace_regs_caller_ret, SYM_L_GLOBAL)
 	restore_mcount_regs 8
 	/* Restore flags */
 	popfq
-SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL)
 	UNWIND_HINT_RET_OFFSET
 	jmp	ftrace_epilogue
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1903b80db6eb..c141d347f71a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2388,6 +2388,14 @@ struct ftrace_ops direct_ops = {
 	.flags		= FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE
 			  | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS
 			  | FTRACE_OPS_FL_PERMANENT,
+	/*
+	 * By declaring the main trampoline as this trampoline
+	 * it will never have one allocated for it. Allocated
+	 * trampolines should not call direct functions.
+	 * The direct_ops should only be called by the builtin
+	 * ftrace_regs_caller trampoline.
+	 */
+	.trampoline	= FTRACE_REGS_ADDR,
 };
 #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
 
-- 
cgit v1.2.3


From 43cb5451dffe0bc5d59688d4898c9a1f7c40d3b4 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Tue, 21 Apr 2020 19:04:06 +0200
Subject: docs: RCU: Convert torture.txt to ReST

- Add a SPDX header;
- Adjust document and section titles;
- Some whitespace fixes and new line breaks;
- Mark literal blocks as such;
- Add it to RCU/index.rst.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/index.rst           |   1 +
 Documentation/RCU/torture.rst         | 293 ++++++++++++++++++++++++++++++++++
 Documentation/RCU/torture.txt         | 282 --------------------------------
 Documentation/locking/locktorture.rst |   2 +-
 MAINTAINERS                           |   4 +-
 kernel/rcu/rcutorture.c               |   2 +-
 6 files changed, 298 insertions(+), 286 deletions(-)
 create mode 100644 Documentation/RCU/torture.rst
 delete mode 100644 Documentation/RCU/torture.txt

(limited to 'kernel')

diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst
index 577a47e27f5d..5d5f9a1ab8f9 100644
--- a/Documentation/RCU/index.rst
+++ b/Documentation/RCU/index.rst
@@ -18,6 +18,7 @@ RCU concepts
    whatisRCU
    rcu
    rculist_nulls
+   torture
    listRCU
    NMI-RCU
    UP
diff --git a/Documentation/RCU/torture.rst b/Documentation/RCU/torture.rst
new file mode 100644
index 000000000000..a90147713062
--- /dev/null
+++ b/Documentation/RCU/torture.rst
@@ -0,0 +1,293 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================
+RCU Torture Test Operation
+==========================
+
+
+CONFIG_RCU_TORTURE_TEST
+=======================
+
+The CONFIG_RCU_TORTURE_TEST config option is available for all RCU
+implementations.  It creates an rcutorture kernel module that can
+be loaded to run a torture test.  The test periodically outputs
+status messages via printk(), which can be examined via the dmesg
+command (perhaps grepping for "torture").  The test is started
+when the module is loaded, and stops when the module is unloaded.
+
+Module parameters are prefixed by "rcutorture." in
+Documentation/admin-guide/kernel-parameters.txt.
+
+Output
+======
+
+The statistics output is as follows::
+
+	rcu-torture:--- Start of test: nreaders=16 nfakewriters=4 stat_interval=30 verbose=0 test_no_idle_hz=1 shuffle_interval=3 stutter=5 irqreader=1 fqs_duration=0 fqs_holdoff=0 fqs_stutter=3 test_boost=1/0 test_boost_interval=7 test_boost_duration=4
+	rcu-torture: rtc:           (null) ver: 155441 tfle: 0 rta: 155441 rtaf: 8884 rtf: 155440 rtmbe: 0 rtbe: 0 rtbke: 0 rtbre: 0 rtbf: 0 rtb: 0 nt: 3055767
+	rcu-torture: Reader Pipe:  727860534 34213 0 0 0 0 0 0 0 0 0
+	rcu-torture: Reader Batch:  727877838 17003 0 0 0 0 0 0 0 0 0
+	rcu-torture: Free-Block Circulation:  155440 155440 155440 155440 155440 155440 155440 155440 155440 155440 0
+	rcu-torture:--- End of test: SUCCESS: nreaders=16 nfakewriters=4 stat_interval=30 verbose=0 test_no_idle_hz=1 shuffle_interval=3 stutter=5 irqreader=1 fqs_duration=0 fqs_holdoff=0 fqs_stutter=3 test_boost=1/0 test_boost_interval=7 test_boost_duration=4
+
+The command "dmesg | grep torture:" will extract this information on
+most systems.  On more esoteric configurations, it may be necessary to
+use other commands to access the output of the printk()s used by
+the RCU torture test.  The printk()s use KERN_ALERT, so they should
+be evident.  ;-)
+
+The first and last lines show the rcutorture module parameters, and the
+last line shows either "SUCCESS" or "FAILURE", based on rcutorture's
+automatic determination as to whether RCU operated correctly.
+
+The entries are as follows:
+
+*	"rtc": The hexadecimal address of the structure currently visible
+	to readers.
+
+*	"ver": The number of times since boot that the RCU writer task
+	has changed the structure visible to readers.
+
+*	"tfle": If non-zero, indicates that the "torture freelist"
+	containing structures to be placed into the "rtc" area is empty.
+	This condition is important, since it can fool you into thinking
+	that RCU is working when it is not.  :-/
+
+*	"rta": Number of structures allocated from the torture freelist.
+
+*	"rtaf": Number of allocations from the torture freelist that have
+	failed due to the list being empty.  It is not unusual for this
+	to be non-zero, but it is bad for it to be a large fraction of
+	the value indicated by "rta".
+
+*	"rtf": Number of frees into the torture freelist.
+
+*	"rtmbe": A non-zero value indicates that rcutorture believes that
+	rcu_assign_pointer() and rcu_dereference() are not working
+	correctly.  This value should be zero.
+
+*	"rtbe": A non-zero value indicates that one of the rcu_barrier()
+	family of functions is not working correctly.
+
+*	"rtbke": rcutorture was unable to create the real-time kthreads
+	used to force RCU priority inversion.  This value should be zero.
+
+*	"rtbre": Although rcutorture successfully created the kthreads
+	used to force RCU priority inversion, it was unable to set them
+	to the real-time priority level of 1.  This value should be zero.
+
+*	"rtbf": The number of times that RCU priority boosting failed
+	to resolve RCU priority inversion.
+
+*	"rtb": The number of times that rcutorture attempted to force
+	an RCU priority inversion condition.  If you are testing RCU
+	priority boosting via the "test_boost" module parameter, this
+	value should be non-zero.
+
+*	"nt": The number of times rcutorture ran RCU read-side code from
+	within a timer handler.  This value should be non-zero only
+	if you specified the "irqreader" module parameter.
+
+*	"Reader Pipe": Histogram of "ages" of structures seen by readers.
+	If any entries past the first two are non-zero, RCU is broken.
+	And rcutorture prints the error flag string "!!!" to make sure
+	you notice.  The age of a newly allocated structure is zero,
+	it becomes one when removed from reader visibility, and is
+	incremented once per grace period subsequently -- and is freed
+	after passing through (RCU_TORTURE_PIPE_LEN-2) grace periods.
+
+	The output displayed above was taken from a correctly working
+	RCU.  If you want to see what it looks like when broken, break
+	it yourself.  ;-)
+
+*	"Reader Batch": Another histogram of "ages" of structures seen
+	by readers, but in terms of counter flips (or batches) rather
+	than in terms of grace periods.  The legal number of non-zero
+	entries is again two.  The reason for this separate view is that
+	it is sometimes easier to get the third entry to show up in the
+	"Reader Batch" list than in the "Reader Pipe" list.
+
+*	"Free-Block Circulation": Shows the number of torture structures
+	that have reached a given point in the pipeline.  The first element
+	should closely correspond to the number of structures allocated,
+	the second to the number that have been removed from reader view,
+	and all but the last remaining to the corresponding number of
+	passes through a grace period.  The last entry should be zero,
+	as it is only incremented if a torture structure's counter
+	somehow gets incremented farther than it should.
+
+Different implementations of RCU can provide implementation-specific
+additional information.  For example, Tree SRCU provides the following
+additional line::
+
+	srcud-torture: Tree SRCU per-CPU(idx=0): 0(35,-21) 1(-4,24) 2(1,1) 3(-26,20) 4(28,-47) 5(-9,4) 6(-10,14) 7(-14,11) T(1,6)
+
+This line shows the per-CPU counter state, in this case for Tree SRCU
+using a dynamically allocated srcu_struct (hence "srcud-" rather than
+"srcu-").  The numbers in parentheses are the values of the "old" and
+"current" counters for the corresponding CPU.  The "idx" value maps the
+"old" and "current" values to the underlying array, and is useful for
+debugging.  The final "T" entry contains the totals of the counters.
+
+Usage on Specific Kernel Builds
+===============================
+
+It is sometimes desirable to torture RCU on a specific kernel build,
+for example, when preparing to put that kernel build into production.
+In that case, the kernel should be built with CONFIG_RCU_TORTURE_TEST=m
+so that the test can be started using modprobe and terminated using rmmod.
+
+For example, the following script may be used to torture RCU::
+
+	#!/bin/sh
+
+	modprobe rcutorture
+	sleep 3600
+	rmmod rcutorture
+	dmesg | grep torture:
+
+The output can be manually inspected for the error flag of "!!!".
+One could of course create a more elaborate script that automatically
+checked for such errors.  The "rmmod" command forces a "SUCCESS",
+"FAILURE", or "RCU_HOTPLUG" indication to be printk()ed.  The first
+two are self-explanatory, while the last indicates that while there
+were no RCU failures, CPU-hotplug problems were detected.
+
+
+Usage on Mainline Kernels
+=========================
+
+When using rcutorture to test changes to RCU itself, it is often
+necessary to build a number of kernels in order to test that change
+across a broad range of combinations of the relevant Kconfig options
+and of the relevant kernel boot parameters.  In this situation, use
+of modprobe and rmmod can be quite time-consuming and error-prone.
+
+Therefore, the tools/testing/selftests/rcutorture/bin/kvm.sh
+script is available for mainline testing for x86, arm64, and
+powerpc.  By default, it will run the series of tests specified by
+tools/testing/selftests/rcutorture/configs/rcu/CFLIST, with each test
+running for 30 minutes within a guest OS using a minimal userspace
+supplied by an automatically generated initrd.  After the tests are
+complete, the resulting build products and console output are analyzed
+for errors and the results of the runs are summarized.
+
+On larger systems, rcutorture testing can be accelerated by passing the
+--cpus argument to kvm.sh.  For example, on a 64-CPU system, "--cpus 43"
+would use up to 43 CPUs to run tests concurrently, which as of v5.4 would
+complete all the scenarios in two batches, reducing the time to complete
+from about eight hours to about one hour (not counting the time to build
+the sixteen kernels).  The "--dryrun sched" argument will not run tests,
+but rather tell you how the tests would be scheduled into batches.  This
+can be useful when working out how many CPUs to specify in the --cpus
+argument.
+
+Not all changes require that all scenarios be run.  For example, a change
+to Tree SRCU might run only the SRCU-N and SRCU-P scenarios using the
+--configs argument to kvm.sh as follows:  "--configs 'SRCU-N SRCU-P'".
+Large systems can run multiple copies of of the full set of scenarios,
+for example, a system with 448 hardware threads can run five instances
+of the full set concurrently.  To make this happen::
+
+	kvm.sh --cpus 448 --configs '5*CFLIST'
+
+Alternatively, such a system can run 56 concurrent instances of a single
+eight-CPU scenario::
+
+	kvm.sh --cpus 448 --configs '56*TREE04'
+
+Or 28 concurrent instances of each of two eight-CPU scenarios::
+
+	kvm.sh --cpus 448 --configs '28*TREE03 28*TREE04'
+
+Of course, each concurrent instance will use memory, which can be
+limited using the --memory argument, which defaults to 512M.  Small
+values for memory may require disabling the callback-flooding tests
+using the --bootargs parameter discussed below.
+
+Sometimes additional debugging is useful, and in such cases the --kconfig
+parameter to kvm.sh may be used, for example, ``--kconfig 'CONFIG_KASAN=y'``.
+
+Kernel boot arguments can also be supplied, for example, to control
+rcutorture's module parameters.  For example, to test a change to RCU's
+CPU stall-warning code, use "--bootargs 'rcutorture.stall_cpu=30'".
+This will of course result in the scripting reporting a failure, namely
+the resuling RCU CPU stall warning.  As noted above, reducing memory may
+require disabling rcutorture's callback-flooding tests::
+
+	kvm.sh --cpus 448 --configs '56*TREE04' --memory 128M \
+		--bootargs 'rcutorture.fwd_progress=0'
+
+Sometimes all that is needed is a full set of kernel builds.  This is
+what the --buildonly argument does.
+
+Finally, the --trust-make argument allows each kernel build to reuse what
+it can from the previous kernel build.
+
+There are additional more arcane arguments that are documented in the
+source code of the kvm.sh script.
+
+If a run contains failures, the number of buildtime and runtime failures
+is listed at the end of the kvm.sh output, which you really should redirect
+to a file.  The build products and console output of each run is kept in
+tools/testing/selftests/rcutorture/res in timestamped directories.  A
+given directory can be supplied to kvm-find-errors.sh in order to have
+it cycle you through summaries of errors and full error logs.  For example::
+
+	tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh \
+		tools/testing/selftests/rcutorture/res/2020.01.20-15.54.23
+
+However, it is often more convenient to access the files directly.
+Files pertaining to all scenarios in a run reside in the top-level
+directory (2020.01.20-15.54.23 in the example above), while per-scenario
+files reside in a subdirectory named after the scenario (for example,
+"TREE04").  If a given scenario ran more than once (as in "--configs
+'56*TREE04'" above), the directories corresponding to the second and
+subsequent runs of that scenario include a sequence number, for example,
+"TREE04.2", "TREE04.3", and so on.
+
+The most frequently used file in the top-level directory is testid.txt.
+If the test ran in a git repository, then this file contains the commit
+that was tested and any uncommitted changes in diff format.
+
+The most frequently used files in each per-scenario-run directory are:
+
+.config:
+	This file contains the Kconfig options.
+
+Make.out:
+	This contains build output for a specific scenario.
+
+console.log:
+	This contains the console output for a specific scenario.
+	This file may be examined once the kernel has booted, but
+	it might not exist if the build failed.
+
+vmlinux:
+	This contains the kernel, which can be useful with tools like
+	objdump and gdb.
+
+A number of additional files are available, but are less frequently used.
+Many are intended for debugging of rcutorture itself or of its scripting.
+
+As of v5.4, a successful run with the default set of scenarios produces
+the following summary at the end of the run on a 12-CPU system::
+
+    SRCU-N ------- 804233 GPs (148.932/s) [srcu: g10008272 f0x0 ]
+    SRCU-P ------- 202320 GPs (37.4667/s) [srcud: g1809476 f0x0 ]
+    SRCU-t ------- 1122086 GPs (207.794/s) [srcu: g0 f0x0 ]
+    SRCU-u ------- 1111285 GPs (205.794/s) [srcud: g1 f0x0 ]
+    TASKS01 ------- 19666 GPs (3.64185/s) [tasks: g0 f0x0 ]
+    TASKS02 ------- 20541 GPs (3.80389/s) [tasks: g0 f0x0 ]
+    TASKS03 ------- 19416 GPs (3.59556/s) [tasks: g0 f0x0 ]
+    TINY01 ------- 836134 GPs (154.84/s) [rcu: g0 f0x0 ] n_max_cbs: 34198
+    TINY02 ------- 850371 GPs (157.476/s) [rcu: g0 f0x0 ] n_max_cbs: 2631
+    TREE01 ------- 162625 GPs (30.1157/s) [rcu: g1124169 f0x0 ]
+    TREE02 ------- 333003 GPs (61.6672/s) [rcu: g2647753 f0x0 ] n_max_cbs: 35844
+    TREE03 ------- 306623 GPs (56.782/s) [rcu: g2975325 f0x0 ] n_max_cbs: 1496497
+    CPU count limited from 16 to 12
+    TREE04 ------- 246149 GPs (45.5831/s) [rcu: g1695737 f0x0 ] n_max_cbs: 434961
+    TREE05 ------- 314603 GPs (58.2598/s) [rcu: g2257741 f0x2 ] n_max_cbs: 193997
+    TREE07 ------- 167347 GPs (30.9902/s) [rcu: g1079021 f0x0 ] n_max_cbs: 478732
+    CPU count limited from 16 to 12
+    TREE09 ------- 752238 GPs (139.303/s) [rcu: g13075057 f0x0 ] n_max_cbs: 99011
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
deleted file mode 100644
index af712a3c5b6a..000000000000
--- a/Documentation/RCU/torture.txt
+++ /dev/null
@@ -1,282 +0,0 @@
-RCU Torture Test Operation
-
-
-CONFIG_RCU_TORTURE_TEST
-
-The CONFIG_RCU_TORTURE_TEST config option is available for all RCU
-implementations.  It creates an rcutorture kernel module that can
-be loaded to run a torture test.  The test periodically outputs
-status messages via printk(), which can be examined via the dmesg
-command (perhaps grepping for "torture").  The test is started
-when the module is loaded, and stops when the module is unloaded.
-
-Module parameters are prefixed by "rcutorture." in
-Documentation/admin-guide/kernel-parameters.txt.
-
-OUTPUT
-
-The statistics output is as follows:
-
-	rcu-torture:--- Start of test: nreaders=16 nfakewriters=4 stat_interval=30 verbose=0 test_no_idle_hz=1 shuffle_interval=3 stutter=5 irqreader=1 fqs_duration=0 fqs_holdoff=0 fqs_stutter=3 test_boost=1/0 test_boost_interval=7 test_boost_duration=4
-	rcu-torture: rtc:           (null) ver: 155441 tfle: 0 rta: 155441 rtaf: 8884 rtf: 155440 rtmbe: 0 rtbe: 0 rtbke: 0 rtbre: 0 rtbf: 0 rtb: 0 nt: 3055767
-	rcu-torture: Reader Pipe:  727860534 34213 0 0 0 0 0 0 0 0 0
-	rcu-torture: Reader Batch:  727877838 17003 0 0 0 0 0 0 0 0 0
-	rcu-torture: Free-Block Circulation:  155440 155440 155440 155440 155440 155440 155440 155440 155440 155440 0
-	rcu-torture:--- End of test: SUCCESS: nreaders=16 nfakewriters=4 stat_interval=30 verbose=0 test_no_idle_hz=1 shuffle_interval=3 stutter=5 irqreader=1 fqs_duration=0 fqs_holdoff=0 fqs_stutter=3 test_boost=1/0 test_boost_interval=7 test_boost_duration=4
-
-The command "dmesg | grep torture:" will extract this information on
-most systems.  On more esoteric configurations, it may be necessary to
-use other commands to access the output of the printk()s used by
-the RCU torture test.  The printk()s use KERN_ALERT, so they should
-be evident.  ;-)
-
-The first and last lines show the rcutorture module parameters, and the
-last line shows either "SUCCESS" or "FAILURE", based on rcutorture's
-automatic determination as to whether RCU operated correctly.
-
-The entries are as follows:
-
-o	"rtc": The hexadecimal address of the structure currently visible
-	to readers.
-
-o	"ver": The number of times since boot that the RCU writer task
-	has changed the structure visible to readers.
-
-o	"tfle": If non-zero, indicates that the "torture freelist"
-	containing structures to be placed into the "rtc" area is empty.
-	This condition is important, since it can fool you into thinking
-	that RCU is working when it is not.  :-/
-
-o	"rta": Number of structures allocated from the torture freelist.
-
-o	"rtaf": Number of allocations from the torture freelist that have
-	failed due to the list being empty.  It is not unusual for this
-	to be non-zero, but it is bad for it to be a large fraction of
-	the value indicated by "rta".
-
-o	"rtf": Number of frees into the torture freelist.
-
-o	"rtmbe": A non-zero value indicates that rcutorture believes that
-	rcu_assign_pointer() and rcu_dereference() are not working
-	correctly.  This value should be zero.
-
-o	"rtbe": A non-zero value indicates that one of the rcu_barrier()
-	family of functions is not working correctly.
-
-o	"rtbke": rcutorture was unable to create the real-time kthreads
-	used to force RCU priority inversion.  This value should be zero.
-
-o	"rtbre": Although rcutorture successfully created the kthreads
-	used to force RCU priority inversion, it was unable to set them
-	to the real-time priority level of 1.  This value should be zero.
-
-o	"rtbf": The number of times that RCU priority boosting failed
-	to resolve RCU priority inversion.
-
-o	"rtb": The number of times that rcutorture attempted to force
-	an RCU priority inversion condition.  If you are testing RCU
-	priority boosting via the "test_boost" module parameter, this
-	value should be non-zero.
-
-o	"nt": The number of times rcutorture ran RCU read-side code from
-	within a timer handler.  This value should be non-zero only
-	if you specified the "irqreader" module parameter.
-
-o	"Reader Pipe": Histogram of "ages" of structures seen by readers.
-	If any entries past the first two are non-zero, RCU is broken.
-	And rcutorture prints the error flag string "!!!" to make sure
-	you notice.  The age of a newly allocated structure is zero,
-	it becomes one when removed from reader visibility, and is
-	incremented once per grace period subsequently -- and is freed
-	after passing through (RCU_TORTURE_PIPE_LEN-2) grace periods.
-
-	The output displayed above was taken from a correctly working
-	RCU.  If you want to see what it looks like when broken, break
-	it yourself.  ;-)
-
-o	"Reader Batch": Another histogram of "ages" of structures seen
-	by readers, but in terms of counter flips (or batches) rather
-	than in terms of grace periods.  The legal number of non-zero
-	entries is again two.  The reason for this separate view is that
-	it is sometimes easier to get the third entry to show up in the
-	"Reader Batch" list than in the "Reader Pipe" list.
-
-o	"Free-Block Circulation": Shows the number of torture structures
-	that have reached a given point in the pipeline.  The first element
-	should closely correspond to the number of structures allocated,
-	the second to the number that have been removed from reader view,
-	and all but the last remaining to the corresponding number of
-	passes through a grace period.  The last entry should be zero,
-	as it is only incremented if a torture structure's counter
-	somehow gets incremented farther than it should.
-
-Different implementations of RCU can provide implementation-specific
-additional information.  For example, Tree SRCU provides the following
-additional line:
-
-	srcud-torture: Tree SRCU per-CPU(idx=0): 0(35,-21) 1(-4,24) 2(1,1) 3(-26,20) 4(28,-47) 5(-9,4) 6(-10,14) 7(-14,11) T(1,6)
-
-This line shows the per-CPU counter state, in this case for Tree SRCU
-using a dynamically allocated srcu_struct (hence "srcud-" rather than
-"srcu-").  The numbers in parentheses are the values of the "old" and
-"current" counters for the corresponding CPU.  The "idx" value maps the
-"old" and "current" values to the underlying array, and is useful for
-debugging.  The final "T" entry contains the totals of the counters.
-
-
-USAGE ON SPECIFIC KERNEL BUILDS
-
-It is sometimes desirable to torture RCU on a specific kernel build,
-for example, when preparing to put that kernel build into production.
-In that case, the kernel should be built with CONFIG_RCU_TORTURE_TEST=m
-so that the test can be started using modprobe and terminated using rmmod.
-
-For example, the following script may be used to torture RCU:
-
-	#!/bin/sh
-
-	modprobe rcutorture
-	sleep 3600
-	rmmod rcutorture
-	dmesg | grep torture:
-
-The output can be manually inspected for the error flag of "!!!".
-One could of course create a more elaborate script that automatically
-checked for such errors.  The "rmmod" command forces a "SUCCESS",
-"FAILURE", or "RCU_HOTPLUG" indication to be printk()ed.  The first
-two are self-explanatory, while the last indicates that while there
-were no RCU failures, CPU-hotplug problems were detected.
-
-
-USAGE ON MAINLINE KERNELS
-
-When using rcutorture to test changes to RCU itself, it is often
-necessary to build a number of kernels in order to test that change
-across a broad range of combinations of the relevant Kconfig options
-and of the relevant kernel boot parameters.  In this situation, use
-of modprobe and rmmod can be quite time-consuming and error-prone.
-
-Therefore, the tools/testing/selftests/rcutorture/bin/kvm.sh
-script is available for mainline testing for x86, arm64, and
-powerpc.  By default, it will run the series of tests specified by
-tools/testing/selftests/rcutorture/configs/rcu/CFLIST, with each test
-running for 30 minutes within a guest OS using a minimal userspace
-supplied by an automatically generated initrd.  After the tests are
-complete, the resulting build products and console output are analyzed
-for errors and the results of the runs are summarized.
-
-On larger systems, rcutorture testing can be accelerated by passing the
---cpus argument to kvm.sh.  For example, on a 64-CPU system, "--cpus 43"
-would use up to 43 CPUs to run tests concurrently, which as of v5.4 would
-complete all the scenarios in two batches, reducing the time to complete
-from about eight hours to about one hour (not counting the time to build
-the sixteen kernels).  The "--dryrun sched" argument will not run tests,
-but rather tell you how the tests would be scheduled into batches.  This
-can be useful when working out how many CPUs to specify in the --cpus
-argument.
-
-Not all changes require that all scenarios be run.  For example, a change
-to Tree SRCU might run only the SRCU-N and SRCU-P scenarios using the
---configs argument to kvm.sh as follows:  "--configs 'SRCU-N SRCU-P'".
-Large systems can run multiple copies of of the full set of scenarios,
-for example, a system with 448 hardware threads can run five instances
-of the full set concurrently.  To make this happen:
-
-	kvm.sh --cpus 448 --configs '5*CFLIST'
-
-Alternatively, such a system can run 56 concurrent instances of a single
-eight-CPU scenario:
-
-	kvm.sh --cpus 448 --configs '56*TREE04'
-
-Or 28 concurrent instances of each of two eight-CPU scenarios:
-
-	kvm.sh --cpus 448 --configs '28*TREE03 28*TREE04'
-
-Of course, each concurrent instance will use memory, which can be
-limited using the --memory argument, which defaults to 512M.  Small
-values for memory may require disabling the callback-flooding tests
-using the --bootargs parameter discussed below.
-
-Sometimes additional debugging is useful, and in such cases the --kconfig
-parameter to kvm.sh may be used, for example, "--kconfig 'CONFIG_KASAN=y'".
-
-Kernel boot arguments can also be supplied, for example, to control
-rcutorture's module parameters.  For example, to test a change to RCU's
-CPU stall-warning code, use "--bootargs 'rcutorture.stall_cpu=30'".
-This will of course result in the scripting reporting a failure, namely
-the resuling RCU CPU stall warning.  As noted above, reducing memory may
-require disabling rcutorture's callback-flooding tests:
-
-	kvm.sh --cpus 448 --configs '56*TREE04' --memory 128M \
-		--bootargs 'rcutorture.fwd_progress=0'
-
-Sometimes all that is needed is a full set of kernel builds.  This is
-what the --buildonly argument does.
-
-Finally, the --trust-make argument allows each kernel build to reuse what
-it can from the previous kernel build.
-
-There are additional more arcane arguments that are documented in the
-source code of the kvm.sh script.
-
-If a run contains failures, the number of buildtime and runtime failures
-is listed at the end of the kvm.sh output, which you really should redirect
-to a file.  The build products and console output of each run is kept in
-tools/testing/selftests/rcutorture/res in timestamped directories.  A
-given directory can be supplied to kvm-find-errors.sh in order to have
-it cycle you through summaries of errors and full error logs.  For example:
-
-	tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh \
-		tools/testing/selftests/rcutorture/res/2020.01.20-15.54.23
-
-However, it is often more convenient to access the files directly.
-Files pertaining to all scenarios in a run reside in the top-level
-directory (2020.01.20-15.54.23 in the example above), while per-scenario
-files reside in a subdirectory named after the scenario (for example,
-"TREE04").  If a given scenario ran more than once (as in "--configs
-'56*TREE04'" above), the directories corresponding to the second and
-subsequent runs of that scenario include a sequence number, for example,
-"TREE04.2", "TREE04.3", and so on.
-
-The most frequently used file in the top-level directory is testid.txt.
-If the test ran in a git repository, then this file contains the commit
-that was tested and any uncommitted changes in diff format.
-
-The most frequently used files in each per-scenario-run directory are:
-
-.config: This file contains the Kconfig options.
-
-Make.out: This contains build output for a specific scenario.
-
-console.log: This contains the console output for a specific scenario.
-	This file may be examined once the kernel has booted, but
-	it might not exist if the build failed.
-
-vmlinux: This contains the kernel, which can be useful with tools like
-	objdump and gdb.
-
-A number of additional files are available, but are less frequently used.
-Many are intended for debugging of rcutorture itself or of its scripting.
-
-As of v5.4, a successful run with the default set of scenarios produces
-the following summary at the end of the run on a 12-CPU system:
-
-SRCU-N ------- 804233 GPs (148.932/s) [srcu: g10008272 f0x0 ]
-SRCU-P ------- 202320 GPs (37.4667/s) [srcud: g1809476 f0x0 ]
-SRCU-t ------- 1122086 GPs (207.794/s) [srcu: g0 f0x0 ]
-SRCU-u ------- 1111285 GPs (205.794/s) [srcud: g1 f0x0 ]
-TASKS01 ------- 19666 GPs (3.64185/s) [tasks: g0 f0x0 ]
-TASKS02 ------- 20541 GPs (3.80389/s) [tasks: g0 f0x0 ]
-TASKS03 ------- 19416 GPs (3.59556/s) [tasks: g0 f0x0 ]
-TINY01 ------- 836134 GPs (154.84/s) [rcu: g0 f0x0 ] n_max_cbs: 34198
-TINY02 ------- 850371 GPs (157.476/s) [rcu: g0 f0x0 ] n_max_cbs: 2631
-TREE01 ------- 162625 GPs (30.1157/s) [rcu: g1124169 f0x0 ]
-TREE02 ------- 333003 GPs (61.6672/s) [rcu: g2647753 f0x0 ] n_max_cbs: 35844
-TREE03 ------- 306623 GPs (56.782/s) [rcu: g2975325 f0x0 ] n_max_cbs: 1496497
-CPU count limited from 16 to 12
-TREE04 ------- 246149 GPs (45.5831/s) [rcu: g1695737 f0x0 ] n_max_cbs: 434961
-TREE05 ------- 314603 GPs (58.2598/s) [rcu: g2257741 f0x2 ] n_max_cbs: 193997
-TREE07 ------- 167347 GPs (30.9902/s) [rcu: g1079021 f0x0 ] n_max_cbs: 478732
-CPU count limited from 16 to 12
-TREE09 ------- 752238 GPs (139.303/s) [rcu: g13075057 f0x0 ] n_max_cbs: 99011
diff --git a/Documentation/locking/locktorture.rst b/Documentation/locking/locktorture.rst
index 8012a74555e7..dfaf9fc883f4 100644
--- a/Documentation/locking/locktorture.rst
+++ b/Documentation/locking/locktorture.rst
@@ -166,4 +166,4 @@ checked for such errors.  The "rmmod" command forces a "SUCCESS",
 two are self-explanatory, while the last indicates that while there
 were no locking failures, CPU-hotplug problems were detected.
 
-Also see: Documentation/RCU/torture.txt
+Also see: Documentation/RCU/torture.rst
diff --git a/MAINTAINERS b/MAINTAINERS
index 496fd4eafb68..4429ce965b3a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14437,7 +14437,7 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git dev
 F:	Documentation/RCU/
 F:	include/linux/rcu*
 F:	kernel/rcu/
-X:	Documentation/RCU/torture.txt
+X:	Documentation/RCU/torture.rst
 X:	include/linux/srcu*.h
 X:	kernel/rcu/srcu*.c
 
@@ -17288,7 +17288,7 @@ M:	Josh Triplett <josh@joshtriplett.org>
 L:	linux-kernel@vger.kernel.org
 S:	Supported
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git dev
-F:	Documentation/RCU/torture.txt
+F:	Documentation/RCU/torture.rst
 F:	kernel/locking/locktorture.c
 F:	kernel/rcu/rcuperf.c
 F:	kernel/rcu/rcutorture.c
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index efb792e13fca..8205295fc33e 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -7,7 +7,7 @@
  * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
  *	  Josh Triplett <josh@joshtriplett.org>
  *
- * See also:  Documentation/RCU/torture.txt
+ * See also:  Documentation/RCU/torture.rst
  */
 
 #define pr_fmt(fmt) fmt
-- 
cgit v1.2.3


From f2286ab99549271f3cec73e305b9ecca95d91394 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Tue, 21 Apr 2020 19:04:10 +0200
Subject: docs: RCU: Convert stallwarn.txt to ReST

- Add a SPDX header;
- Adjust document and section titles;
- Fix list markups;
- Some whitespace fixes and new line breaks;
- Mark literal blocks as such;
- Add it to RCU/index.rst.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/index.rst     |   1 +
 Documentation/RCU/stallwarn.rst | 329 ++++++++++++++++++++++++++++++++++++++++
 Documentation/RCU/stallwarn.txt | 316 --------------------------------------
 kernel/rcu/tree_stall.h         |   4 +-
 4 files changed, 332 insertions(+), 318 deletions(-)
 create mode 100644 Documentation/RCU/stallwarn.rst
 delete mode 100644 Documentation/RCU/stallwarn.txt

(limited to 'kernel')

diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst
index 9a1d51f394dc..e703d3dbe60c 100644
--- a/Documentation/RCU/index.rst
+++ b/Documentation/RCU/index.rst
@@ -20,6 +20,7 @@ RCU concepts
    rculist_nulls
    rcuref
    torture
+   stallwarn
    listRCU
    NMI-RCU
    UP
diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst
new file mode 100644
index 000000000000..08bc9aec4606
--- /dev/null
+++ b/Documentation/RCU/stallwarn.rst
@@ -0,0 +1,329 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============================
+Using RCU's CPU Stall Detector
+==============================
+
+This document first discusses what sorts of issues RCU's CPU stall
+detector can locate, and then discusses kernel parameters and Kconfig
+options that can be used to fine-tune the detector's operation.  Finally,
+this document explains the stall detector's "splat" format.
+
+
+What Causes RCU CPU Stall Warnings?
+===================================
+
+So your kernel printed an RCU CPU stall warning.  The next question is
+"What caused it?"  The following problems can result in RCU CPU stall
+warnings:
+
+-	A CPU looping in an RCU read-side critical section.
+
+-	A CPU looping with interrupts disabled.
+
+-	A CPU looping with preemption disabled.
+
+-	A CPU looping with bottom halves disabled.
+
+-	For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
+	without invoking schedule().  If the looping in the kernel is
+	really expected and desirable behavior, you might need to add
+	some calls to cond_resched().
+
+-	Booting Linux using a console connection that is too slow to
+	keep up with the boot-time console-message rate.  For example,
+	a 115Kbaud serial console can be -way- too slow to keep up
+	with boot-time message rates, and will frequently result in
+	RCU CPU stall warning messages.  Especially if you have added
+	debug printk()s.
+
+-	Anything that prevents RCU's grace-period kthreads from running.
+	This can result in the "All QSes seen" console-log message.
+	This message will include information on when the kthread last
+	ran and how often it should be expected to run.  It can also
+	result in the ``rcu_.*kthread starved for`` console-log message,
+	which will include additional debugging information.
+
+-	A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
+	happen to preempt a low-priority task in the middle of an RCU
+	read-side critical section.   This is especially damaging if
+	that low-priority task is not permitted to run on any other CPU,
+	in which case the next RCU grace period can never complete, which
+	will eventually cause the system to run out of memory and hang.
+	While the system is in the process of running itself out of
+	memory, you might see stall-warning messages.
+
+-	A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
+	is running at a higher priority than the RCU softirq threads.
+	This will prevent RCU callbacks from ever being invoked,
+	and in a CONFIG_PREEMPT_RCU kernel will further prevent
+	RCU grace periods from ever completing.  Either way, the
+	system will eventually run out of memory and hang.  In the
+	CONFIG_PREEMPT_RCU case, you might see stall-warning
+	messages.
+
+	You can use the rcutree.kthread_prio kernel boot parameter to
+	increase the scheduling priority of RCU's kthreads, which can
+	help avoid this problem.  However, please note that doing this
+	can increase your system's context-switch rate and thus degrade
+	performance.
+
+-	A periodic interrupt whose handler takes longer than the time
+	interval between successive pairs of interrupts.  This can
+	prevent RCU's kthreads and softirq handlers from running.
+	Note that certain high-overhead debugging options, for example
+	the function_graph tracer, can result in interrupt handler taking
+	considerably longer than normal, which can in turn result in
+	RCU CPU stall warnings.
+
+-	Testing a workload on a fast system, tuning the stall-warning
+	timeout down to just barely avoid RCU CPU stall warnings, and then
+	running the same workload with the same stall-warning timeout on a
+	slow system.  Note that thermal throttling and on-demand governors
+	can cause a single system to be sometimes fast and sometimes slow!
+
+-	A hardware or software issue shuts off the scheduler-clock
+	interrupt on a CPU that is not in dyntick-idle mode.  This
+	problem really has happened, and seems to be most likely to
+	result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels.
+
+-	A bug in the RCU implementation.
+
+-	A hardware failure.  This is quite unlikely, but has occurred
+	at least once in real life.  A CPU failed in a running system,
+	becoming unresponsive, but not causing an immediate crash.
+	This resulted in a series of RCU CPU stall warnings, eventually
+	leading the realization that the CPU had failed.
+
+The RCU, RCU-sched, and RCU-tasks implementations have CPU stall warning.
+Note that SRCU does -not- have CPU stall warnings.  Please note that
+RCU only detects CPU stalls when there is a grace period in progress.
+No grace period, no CPU stall warnings.
+
+To diagnose the cause of the stall, inspect the stack traces.
+The offending function will usually be near the top of the stack.
+If you have a series of stall warnings from a single extended stall,
+comparing the stack traces can often help determine where the stall
+is occurring, which will usually be in the function nearest the top of
+that portion of the stack which remains the same from trace to trace.
+If you can reliably trigger the stall, ftrace can be quite helpful.
+
+RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE
+and with RCU's event tracing.  For information on RCU's event tracing,
+see include/trace/events/rcu.h.
+
+
+Fine-Tuning the RCU CPU Stall Detector
+======================================
+
+The rcuupdate.rcu_cpu_stall_suppress module parameter disables RCU's
+CPU stall detector, which detects conditions that unduly delay RCU grace
+periods.  This module parameter enables CPU stall detection by default,
+but may be overridden via boot-time parameter or at runtime via sysfs.
+The stall detector's idea of what constitutes "unduly delayed" is
+controlled by a set of kernel configuration variables and cpp macros:
+
+CONFIG_RCU_CPU_STALL_TIMEOUT
+----------------------------
+
+	This kernel configuration parameter defines the period of time
+	that RCU will wait from the beginning of a grace period until it
+	issues an RCU CPU stall warning.  This time period is normally
+	21 seconds.
+
+	This configuration parameter may be changed at runtime via the
+	/sys/module/rcupdate/parameters/rcu_cpu_stall_timeout, however
+	this parameter is checked only at the beginning of a cycle.
+	So if you are 10 seconds into a 40-second stall, setting this
+	sysfs parameter to (say) five will shorten the timeout for the
+	-next- stall, or the following warning for the current stall
+	(assuming the stall lasts long enough).  It will not affect the
+	timing of the next warning for the current stall.
+
+	Stall-warning messages may be enabled and disabled completely via
+	/sys/module/rcupdate/parameters/rcu_cpu_stall_suppress.
+
+RCU_STALL_DELAY_DELTA
+---------------------
+
+	Although the lockdep facility is extremely useful, it does add
+	some overhead.  Therefore, under CONFIG_PROVE_RCU, the
+	RCU_STALL_DELAY_DELTA macro allows five extra seconds before
+	giving an RCU CPU stall warning message.  (This is a cpp
+	macro, not a kernel configuration parameter.)
+
+RCU_STALL_RAT_DELAY
+-------------------
+
+	The CPU stall detector tries to make the offending CPU print its
+	own warnings, as this often gives better-quality stack traces.
+	However, if the offending CPU does not detect its own stall in
+	the number of jiffies specified by RCU_STALL_RAT_DELAY, then
+	some other CPU will complain.  This delay is normally set to
+	two jiffies.  (This is a cpp macro, not a kernel configuration
+	parameter.)
+
+rcupdate.rcu_task_stall_timeout
+-------------------------------
+
+	This boot/sysfs parameter controls the RCU-tasks stall warning
+	interval.  A value of zero or less suppresses RCU-tasks stall
+	warnings.  A positive value sets the stall-warning interval
+	in seconds.  An RCU-tasks stall warning starts with the line:
+
+		INFO: rcu_tasks detected stalls on tasks:
+
+	And continues with the output of sched_show_task() for each
+	task stalling the current RCU-tasks grace period.
+
+
+Interpreting RCU's CPU Stall-Detector "Splats"
+==============================================
+
+For non-RCU-tasks flavors of RCU, when a CPU detects that it is stalling,
+it will print a message similar to the following::
+
+	INFO: rcu_sched detected stalls on CPUs/tasks:
+	2-...: (3 GPs behind) idle=06c/0/0 softirq=1453/1455 fqs=0
+	16-...: (0 ticks this GP) idle=81c/0/0 softirq=764/764 fqs=0
+	(detected by 32, t=2603 jiffies, g=7075, q=625)
+
+This message indicates that CPU 32 detected that CPUs 2 and 16 were both
+causing stalls, and that the stall was affecting RCU-sched.  This message
+will normally be followed by stack dumps for each CPU.  Please note that
+PREEMPT_RCU builds can be stalled by tasks as well as by CPUs, and that
+the tasks will be indicated by PID, for example, "P3421".  It is even
+possible for an rcu_state stall to be caused by both CPUs -and- tasks,
+in which case the offending CPUs and tasks will all be called out in the list.
+
+CPU 2's "(3 GPs behind)" indicates that this CPU has not interacted with
+the RCU core for the past three grace periods.  In contrast, CPU 16's "(0
+ticks this GP)" indicates that this CPU has not taken any scheduling-clock
+interrupts during the current stalled grace period.
+
+The "idle=" portion of the message prints the dyntick-idle state.
+The hex number before the first "/" is the low-order 12 bits of the
+dynticks counter, which will have an even-numbered value if the CPU
+is in dyntick-idle mode and an odd-numbered value otherwise.  The hex
+number between the two "/"s is the value of the nesting, which will be
+a small non-negative number if in the idle loop (as shown above) and a
+very large positive number otherwise.
+
+The "softirq=" portion of the message tracks the number of RCU softirq
+handlers that the stalled CPU has executed.  The number before the "/"
+is the number that had executed since boot at the time that this CPU
+last noted the beginning of a grace period, which might be the current
+(stalled) grace period, or it might be some earlier grace period (for
+example, if the CPU might have been in dyntick-idle mode for an extended
+time period.  The number after the "/" is the number that have executed
+since boot until the current time.  If this latter number stays constant
+across repeated stall-warning messages, it is possible that RCU's softirq
+handlers are no longer able to execute on this CPU.  This can happen if
+the stalled CPU is spinning with interrupts are disabled, or, in -rt
+kernels, if a high-priority process is starving RCU's softirq handler.
+
+The "fqs=" shows the number of force-quiescent-state idle/offline
+detection passes that the grace-period kthread has made across this
+CPU since the last time that this CPU noted the beginning of a grace
+period.
+
+The "detected by" line indicates which CPU detected the stall (in this
+case, CPU 32), how many jiffies have elapsed since the start of the grace
+period (in this case 2603), the grace-period sequence number (7075), and
+an estimate of the total number of RCU callbacks queued across all CPUs
+(625 in this case).
+
+In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed
+for each CPU::
+
+	0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 dyntick_enabled: 1
+
+The "last_accelerate:" prints the low-order 16 bits (in hex) of the
+jiffies counter when this CPU last invoked rcu_try_advance_all_cbs()
+from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from
+rcu_prepare_for_idle(). "dyntick_enabled: 1" indicates that dyntick-idle
+processing is enabled.
+
+If the grace period ends just as the stall warning starts printing,
+there will be a spurious stall-warning message, which will include
+the following::
+
+	INFO: Stall ended before state dump start
+
+This is rare, but does happen from time to time in real life.  It is also
+possible for a zero-jiffy stall to be flagged in this case, depending
+on how the stall warning and the grace-period initialization happen to
+interact.  Please note that it is not possible to entirely eliminate this
+sort of false positive without resorting to things like stop_machine(),
+which is overkill for this sort of problem.
+
+If all CPUs and tasks have passed through quiescent states, but the
+grace period has nevertheless failed to end, the stall-warning splat
+will include something like the following::
+
+	All QSes seen, last rcu_preempt kthread activity 23807 (4297905177-4297881370), jiffies_till_next_fqs=3, root ->qsmask 0x0
+
+The "23807" indicates that it has been more than 23 thousand jiffies
+since the grace-period kthread ran.  The "jiffies_till_next_fqs"
+indicates how frequently that kthread should run, giving the number
+of jiffies between force-quiescent-state scans, in this case three,
+which is way less than 23807.  Finally, the root rcu_node structure's
+->qsmask field is printed, which will normally be zero.
+
+If the relevant grace-period kthread has been unable to run prior to
+the stall warning, as was the case in the "All QSes seen" line above,
+the following additional line is printed::
+
+	kthread starved for 23807 jiffies! g7075 f0x0 RCU_GP_WAIT_FQS(3) ->state=0x1 ->cpu=5
+
+Starving the grace-period kthreads of CPU time can of course result
+in RCU CPU stall warnings even when all CPUs and tasks have passed
+through the required quiescent states.  The "g" number shows the current
+grace-period sequence number, the "f" precedes the ->gp_flags command
+to the grace-period kthread, the "RCU_GP_WAIT_FQS" indicates that the
+kthread is waiting for a short timeout, the "state" precedes value of the
+task_struct ->state field, and the "cpu" indicates that the grace-period
+kthread last ran on CPU 5.
+
+
+Multiple Warnings From One Stall
+================================
+
+If a stall lasts long enough, multiple stall-warning messages will be
+printed for it.  The second and subsequent messages are printed at
+longer intervals, so that the time between (say) the first and second
+message will be about three times the interval between the beginning
+of the stall and the first message.
+
+
+Stall Warnings for Expedited Grace Periods
+==========================================
+
+If an expedited grace period detects a stall, it will place a message
+like the following in dmesg::
+
+	INFO: rcu_sched detected expedited stalls on CPUs/tasks: { 7-... } 21119 jiffies s: 73 root: 0x2/.
+
+This indicates that CPU 7 has failed to respond to a reschedule IPI.
+The three periods (".") following the CPU number indicate that the CPU
+is online (otherwise the first period would instead have been "O"),
+that the CPU was online at the beginning of the expedited grace period
+(otherwise the second period would have instead been "o"), and that
+the CPU has been online at least once since boot (otherwise, the third
+period would instead have been "N").  The number before the "jiffies"
+indicates that the expedited grace period has been going on for 21,119
+jiffies.  The number following the "s:" indicates that the expedited
+grace-period sequence counter is 73.  The fact that this last value is
+odd indicates that an expedited grace period is in flight.  The number
+following "root:" is a bitmask that indicates which children of the root
+rcu_node structure correspond to CPUs and/or tasks that are blocking the
+current expedited grace period.  If the tree had more than one level,
+additional hex numbers would be printed for the states of the other
+rcu_node structures in the tree.
+
+As with normal grace periods, PREEMPT_RCU builds can be stalled by
+tasks as well as by CPUs, and that the tasks will be indicated by PID,
+for example, "P3421".
+
+It is entirely possible to see stall warnings from normal and from
+expedited grace periods at about the same time during the same run.
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
deleted file mode 100644
index a360a8796710..000000000000
--- a/Documentation/RCU/stallwarn.txt
+++ /dev/null
@@ -1,316 +0,0 @@
-Using RCU's CPU Stall Detector
-
-This document first discusses what sorts of issues RCU's CPU stall
-detector can locate, and then discusses kernel parameters and Kconfig
-options that can be used to fine-tune the detector's operation.  Finally,
-this document explains the stall detector's "splat" format.
-
-
-What Causes RCU CPU Stall Warnings?
-
-So your kernel printed an RCU CPU stall warning.  The next question is
-"What caused it?"  The following problems can result in RCU CPU stall
-warnings:
-
-o	A CPU looping in an RCU read-side critical section.
-
-o	A CPU looping with interrupts disabled.
-
-o	A CPU looping with preemption disabled.
-
-o	A CPU looping with bottom halves disabled.
-
-o	For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
-	without invoking schedule().  If the looping in the kernel is
-	really expected and desirable behavior, you might need to add
-	some calls to cond_resched().
-
-o	Booting Linux using a console connection that is too slow to
-	keep up with the boot-time console-message rate.  For example,
-	a 115Kbaud serial console can be -way- too slow to keep up
-	with boot-time message rates, and will frequently result in
-	RCU CPU stall warning messages.  Especially if you have added
-	debug printk()s.
-
-o	Anything that prevents RCU's grace-period kthreads from running.
-	This can result in the "All QSes seen" console-log message.
-	This message will include information on when the kthread last
-	ran and how often it should be expected to run.  It can also
-	result in the "rcu_.*kthread starved for" console-log message,
-	which will include additional debugging information.
-
-o	A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
-	happen to preempt a low-priority task in the middle of an RCU
-	read-side critical section.   This is especially damaging if
-	that low-priority task is not permitted to run on any other CPU,
-	in which case the next RCU grace period can never complete, which
-	will eventually cause the system to run out of memory and hang.
-	While the system is in the process of running itself out of
-	memory, you might see stall-warning messages.
-
-o	A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
-	is running at a higher priority than the RCU softirq threads.
-	This will prevent RCU callbacks from ever being invoked,
-	and in a CONFIG_PREEMPT_RCU kernel will further prevent
-	RCU grace periods from ever completing.  Either way, the
-	system will eventually run out of memory and hang.  In the
-	CONFIG_PREEMPT_RCU case, you might see stall-warning
-	messages.
-
-	You can use the rcutree.kthread_prio kernel boot parameter to
-	increase the scheduling priority of RCU's kthreads, which can
-	help avoid this problem.  However, please note that doing this
-	can increase your system's context-switch rate and thus degrade
-	performance.
-
-o	A periodic interrupt whose handler takes longer than the time
-	interval between successive pairs of interrupts.  This can
-	prevent RCU's kthreads and softirq handlers from running.
-	Note that certain high-overhead debugging options, for example
-	the function_graph tracer, can result in interrupt handler taking
-	considerably longer than normal, which can in turn result in
-	RCU CPU stall warnings.
-
-o	Testing a workload on a fast system, tuning the stall-warning
-	timeout down to just barely avoid RCU CPU stall warnings, and then
-	running the same workload with the same stall-warning timeout on a
-	slow system.  Note that thermal throttling and on-demand governors
-	can cause a single system to be sometimes fast and sometimes slow!
-
-o	A hardware or software issue shuts off the scheduler-clock
-	interrupt on a CPU that is not in dyntick-idle mode.  This
-	problem really has happened, and seems to be most likely to
-	result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels.
-
-o	A bug in the RCU implementation.
-
-o	A hardware failure.  This is quite unlikely, but has occurred
-	at least once in real life.  A CPU failed in a running system,
-	becoming unresponsive, but not causing an immediate crash.
-	This resulted in a series of RCU CPU stall warnings, eventually
-	leading the realization that the CPU had failed.
-
-The RCU, RCU-sched, and RCU-tasks implementations have CPU stall warning.
-Note that SRCU does -not- have CPU stall warnings.  Please note that
-RCU only detects CPU stalls when there is a grace period in progress.
-No grace period, no CPU stall warnings.
-
-To diagnose the cause of the stall, inspect the stack traces.
-The offending function will usually be near the top of the stack.
-If you have a series of stall warnings from a single extended stall,
-comparing the stack traces can often help determine where the stall
-is occurring, which will usually be in the function nearest the top of
-that portion of the stack which remains the same from trace to trace.
-If you can reliably trigger the stall, ftrace can be quite helpful.
-
-RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE
-and with RCU's event tracing.  For information on RCU's event tracing,
-see include/trace/events/rcu.h.
-
-
-Fine-Tuning the RCU CPU Stall Detector
-
-The rcuupdate.rcu_cpu_stall_suppress module parameter disables RCU's
-CPU stall detector, which detects conditions that unduly delay RCU grace
-periods.  This module parameter enables CPU stall detection by default,
-but may be overridden via boot-time parameter or at runtime via sysfs.
-The stall detector's idea of what constitutes "unduly delayed" is
-controlled by a set of kernel configuration variables and cpp macros:
-
-CONFIG_RCU_CPU_STALL_TIMEOUT
-
-	This kernel configuration parameter defines the period of time
-	that RCU will wait from the beginning of a grace period until it
-	issues an RCU CPU stall warning.  This time period is normally
-	21 seconds.
-
-	This configuration parameter may be changed at runtime via the
-	/sys/module/rcupdate/parameters/rcu_cpu_stall_timeout, however
-	this parameter is checked only at the beginning of a cycle.
-	So if you are 10 seconds into a 40-second stall, setting this
-	sysfs parameter to (say) five will shorten the timeout for the
-	-next- stall, or the following warning for the current stall
-	(assuming the stall lasts long enough).  It will not affect the
-	timing of the next warning for the current stall.
-
-	Stall-warning messages may be enabled and disabled completely via
-	/sys/module/rcupdate/parameters/rcu_cpu_stall_suppress.
-
-RCU_STALL_DELAY_DELTA
-
-	Although the lockdep facility is extremely useful, it does add
-	some overhead.  Therefore, under CONFIG_PROVE_RCU, the
-	RCU_STALL_DELAY_DELTA macro allows five extra seconds before
-	giving an RCU CPU stall warning message.  (This is a cpp
-	macro, not a kernel configuration parameter.)
-
-RCU_STALL_RAT_DELAY
-
-	The CPU stall detector tries to make the offending CPU print its
-	own warnings, as this often gives better-quality stack traces.
-	However, if the offending CPU does not detect its own stall in
-	the number of jiffies specified by RCU_STALL_RAT_DELAY, then
-	some other CPU will complain.  This delay is normally set to
-	two jiffies.  (This is a cpp macro, not a kernel configuration
-	parameter.)
-
-rcupdate.rcu_task_stall_timeout
-
-	This boot/sysfs parameter controls the RCU-tasks stall warning
-	interval.  A value of zero or less suppresses RCU-tasks stall
-	warnings.  A positive value sets the stall-warning interval
-	in seconds.  An RCU-tasks stall warning starts with the line:
-
-		INFO: rcu_tasks detected stalls on tasks:
-
-	And continues with the output of sched_show_task() for each
-	task stalling the current RCU-tasks grace period.
-
-
-Interpreting RCU's CPU Stall-Detector "Splats"
-
-For non-RCU-tasks flavors of RCU, when a CPU detects that it is stalling,
-it will print a message similar to the following:
-
-	INFO: rcu_sched detected stalls on CPUs/tasks:
-	2-...: (3 GPs behind) idle=06c/0/0 softirq=1453/1455 fqs=0
-	16-...: (0 ticks this GP) idle=81c/0/0 softirq=764/764 fqs=0
-	(detected by 32, t=2603 jiffies, g=7075, q=625)
-
-This message indicates that CPU 32 detected that CPUs 2 and 16 were both
-causing stalls, and that the stall was affecting RCU-sched.  This message
-will normally be followed by stack dumps for each CPU.  Please note that
-PREEMPT_RCU builds can be stalled by tasks as well as by CPUs, and that
-the tasks will be indicated by PID, for example, "P3421".  It is even
-possible for an rcu_state stall to be caused by both CPUs -and- tasks,
-in which case the offending CPUs and tasks will all be called out in the list.
-
-CPU 2's "(3 GPs behind)" indicates that this CPU has not interacted with
-the RCU core for the past three grace periods.  In contrast, CPU 16's "(0
-ticks this GP)" indicates that this CPU has not taken any scheduling-clock
-interrupts during the current stalled grace period.
-
-The "idle=" portion of the message prints the dyntick-idle state.
-The hex number before the first "/" is the low-order 12 bits of the
-dynticks counter, which will have an even-numbered value if the CPU
-is in dyntick-idle mode and an odd-numbered value otherwise.  The hex
-number between the two "/"s is the value of the nesting, which will be
-a small non-negative number if in the idle loop (as shown above) and a
-very large positive number otherwise.
-
-The "softirq=" portion of the message tracks the number of RCU softirq
-handlers that the stalled CPU has executed.  The number before the "/"
-is the number that had executed since boot at the time that this CPU
-last noted the beginning of a grace period, which might be the current
-(stalled) grace period, or it might be some earlier grace period (for
-example, if the CPU might have been in dyntick-idle mode for an extended
-time period.  The number after the "/" is the number that have executed
-since boot until the current time.  If this latter number stays constant
-across repeated stall-warning messages, it is possible that RCU's softirq
-handlers are no longer able to execute on this CPU.  This can happen if
-the stalled CPU is spinning with interrupts are disabled, or, in -rt
-kernels, if a high-priority process is starving RCU's softirq handler.
-
-The "fqs=" shows the number of force-quiescent-state idle/offline
-detection passes that the grace-period kthread has made across this
-CPU since the last time that this CPU noted the beginning of a grace
-period.
-
-The "detected by" line indicates which CPU detected the stall (in this
-case, CPU 32), how many jiffies have elapsed since the start of the grace
-period (in this case 2603), the grace-period sequence number (7075), and
-an estimate of the total number of RCU callbacks queued across all CPUs
-(625 in this case).
-
-In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed
-for each CPU:
-
-	0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 dyntick_enabled: 1
-
-The "last_accelerate:" prints the low-order 16 bits (in hex) of the
-jiffies counter when this CPU last invoked rcu_try_advance_all_cbs()
-from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from
-rcu_prepare_for_idle(). "dyntick_enabled: 1" indicates that dyntick-idle
-processing is enabled.
-
-If the grace period ends just as the stall warning starts printing,
-there will be a spurious stall-warning message, which will include
-the following:
-
-	INFO: Stall ended before state dump start
-
-This is rare, but does happen from time to time in real life.  It is also
-possible for a zero-jiffy stall to be flagged in this case, depending
-on how the stall warning and the grace-period initialization happen to
-interact.  Please note that it is not possible to entirely eliminate this
-sort of false positive without resorting to things like stop_machine(),
-which is overkill for this sort of problem.
-
-If all CPUs and tasks have passed through quiescent states, but the
-grace period has nevertheless failed to end, the stall-warning splat
-will include something like the following:
-
-	All QSes seen, last rcu_preempt kthread activity 23807 (4297905177-4297881370), jiffies_till_next_fqs=3, root ->qsmask 0x0
-
-The "23807" indicates that it has been more than 23 thousand jiffies
-since the grace-period kthread ran.  The "jiffies_till_next_fqs"
-indicates how frequently that kthread should run, giving the number
-of jiffies between force-quiescent-state scans, in this case three,
-which is way less than 23807.  Finally, the root rcu_node structure's
-->qsmask field is printed, which will normally be zero.
-
-If the relevant grace-period kthread has been unable to run prior to
-the stall warning, as was the case in the "All QSes seen" line above,
-the following additional line is printed:
-
-	kthread starved for 23807 jiffies! g7075 f0x0 RCU_GP_WAIT_FQS(3) ->state=0x1 ->cpu=5
-
-Starving the grace-period kthreads of CPU time can of course result
-in RCU CPU stall warnings even when all CPUs and tasks have passed
-through the required quiescent states.  The "g" number shows the current
-grace-period sequence number, the "f" precedes the ->gp_flags command
-to the grace-period kthread, the "RCU_GP_WAIT_FQS" indicates that the
-kthread is waiting for a short timeout, the "state" precedes value of the
-task_struct ->state field, and the "cpu" indicates that the grace-period
-kthread last ran on CPU 5.
-
-
-Multiple Warnings From One Stall
-
-If a stall lasts long enough, multiple stall-warning messages will be
-printed for it.  The second and subsequent messages are printed at
-longer intervals, so that the time between (say) the first and second
-message will be about three times the interval between the beginning
-of the stall and the first message.
-
-
-Stall Warnings for Expedited Grace Periods
-
-If an expedited grace period detects a stall, it will place a message
-like the following in dmesg:
-
-	INFO: rcu_sched detected expedited stalls on CPUs/tasks: { 7-... } 21119 jiffies s: 73 root: 0x2/.
-
-This indicates that CPU 7 has failed to respond to a reschedule IPI.
-The three periods (".") following the CPU number indicate that the CPU
-is online (otherwise the first period would instead have been "O"),
-that the CPU was online at the beginning of the expedited grace period
-(otherwise the second period would have instead been "o"), and that
-the CPU has been online at least once since boot (otherwise, the third
-period would instead have been "N").  The number before the "jiffies"
-indicates that the expedited grace period has been going on for 21,119
-jiffies.  The number following the "s:" indicates that the expedited
-grace-period sequence counter is 73.  The fact that this last value is
-odd indicates that an expedited grace period is in flight.  The number
-following "root:" is a bitmask that indicates which children of the root
-rcu_node structure correspond to CPUs and/or tasks that are blocking the
-current expedited grace period.  If the tree had more than one level,
-additional hex numbers would be printed for the states of the other
-rcu_node structures in the tree.
-
-As with normal grace periods, PREEMPT_RCU builds can be stalled by
-tasks as well as by CPUs, and that the tasks will be indicated by PID,
-for example, "P3421".
-
-It is entirely possible to see stall warnings from normal and from
-expedited grace periods at about the same time during the same run.
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 54a6dba0280d..b04256cd7e12 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -468,7 +468,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
 
 	/*
 	 * OK, time to rat on our buddy...
-	 * See Documentation/RCU/stallwarn.txt for info on how to debug
+	 * See Documentation/RCU/stallwarn.rst for info on how to debug
 	 * RCU CPU stall warnings.
 	 */
 	pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name);
@@ -535,7 +535,7 @@ static void print_cpu_stall(unsigned long gps)
 
 	/*
 	 * OK, time to rat on ourselves...
-	 * See Documentation/RCU/stallwarn.txt for info on how to debug
+	 * See Documentation/RCU/stallwarn.rst for info on how to debug
 	 * RCU CPU stall warnings.
 	 */
 	pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name);
-- 
cgit v1.2.3


From 7ee880b7bf1dea88d0a472b775aebdb4fb6bf860 Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@gmail.com>
Date: Wed, 15 Apr 2020 22:26:55 +0000
Subject: rcu: Initialize and destroy rcu_synchronize only when necessary

The __wait_rcu_gp() function unconditionally initializes and cleans up
each element of rs_array[], whether used or not.  This is slightly
wasteful and rather confusing, so this commit skips both initialization
and cleanup for duplicate callback functions.

Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/update.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 84843adfd939..f5a82e107bcb 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -390,13 +390,14 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
 			might_sleep();
 			continue;
 		}
-		init_rcu_head_on_stack(&rs_array[i].head);
-		init_completion(&rs_array[i].completion);
 		for (j = 0; j < i; j++)
 			if (crcu_array[j] == crcu_array[i])
 				break;
-		if (j == i)
+		if (j == i) {
+			init_rcu_head_on_stack(&rs_array[i].head);
+			init_completion(&rs_array[i].completion);
 			(crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
+		}
 	}
 
 	/* Wait for all callbacks to be invoked. */
@@ -407,9 +408,10 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
 		for (j = 0; j < i; j++)
 			if (crcu_array[j] == crcu_array[i])
 				break;
-		if (j == i)
+		if (j == i) {
 			wait_for_completion(&rs_array[i].completion);
-		destroy_rcu_head_on_stack(&rs_array[i].head);
+			destroy_rcu_head_on_stack(&rs_array[i].head);
+		}
 	}
 }
 EXPORT_SYMBOL_GPL(__wait_rcu_gp);
-- 
cgit v1.2.3


From abfce0414814149f716e1d30da1fb3140d1b3473 Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@gmail.com>
Date: Sun, 19 Apr 2020 21:57:15 +0000
Subject: rcu: Simplify the calculation of rcu_state.ncpus

There is only 1 bit set in mask, which means that the only difference
between oldmask and the new one will be at the position where the bit is
set in mask.  This commit therefore updates rcu_state.ncpus by checking
whether the bit in mask is already set in rnp->expmaskinitnext.

Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 6c6569e0586c..bef1dc91bfbe 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3842,10 +3842,9 @@ void rcu_cpu_starting(unsigned int cpu)
 {
 	unsigned long flags;
 	unsigned long mask;
-	int nbits;
-	unsigned long oldmask;
 	struct rcu_data *rdp;
 	struct rcu_node *rnp;
+	bool newcpu;
 
 	if (per_cpu(rcu_cpu_started, cpu))
 		return;
@@ -3857,12 +3856,10 @@ void rcu_cpu_starting(unsigned int cpu)
 	mask = rdp->grpmask;
 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
 	WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
-	oldmask = rnp->expmaskinitnext;
+	newcpu = !(rnp->expmaskinitnext & mask);
 	rnp->expmaskinitnext |= mask;
-	oldmask ^= rnp->expmaskinitnext;
-	nbits = bitmap_weight(&oldmask, BITS_PER_LONG);
 	/* Allow lockless access for expedited grace periods. */
-	smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */
+	smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + newcpu); /* ^^^ */
 	ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus);
 	rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */
 	rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);
-- 
cgit v1.2.3


From e816d56fad57ba9817cef6606b12f5e14647c3bf Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 1 May 2020 16:49:48 -0700
Subject: rcu: Add callbacks-invoked counters

This commit adds a count of the callbacks invoked to the per-CPU rcu_data
structure.  This count is printed by the show_rcu_gp_kthreads() that
is invoked by rcutorture and the RCU CPU stall-warning code.  It is also
intended for use by drgn.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c       | 1 +
 kernel/rcu/tree.h       | 1 +
 kernel/rcu/tree_stall.h | 3 +++
 3 files changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index bef1dc91bfbe..874c831bcc45 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2443,6 +2443,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
 	local_irq_save(flags);
 	rcu_nocb_lock(rdp);
 	count = -rcl.len;
+	rdp->n_cbs_invoked += count;
 	trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
 			    is_idle_task(current), rcu_is_callbacks_kthread());
 
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 43991a40b084..9c6f7343bec0 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -171,6 +171,7 @@ struct rcu_data {
 					/* different grace periods. */
 	long		qlen_last_fqs_check;
 					/* qlen at last check for QS forcing */
+	unsigned long	n_cbs_invoked;	/* # callbacks invoked since boot. */
 	unsigned long	n_force_qs_snap;
 					/* did other CPU force QS recently? */
 	long		blimit;		/* Upper limit on a processed batch */
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 54a6dba0280d..2768ce6bf657 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -649,6 +649,7 @@ static void check_cpu_stall(struct rcu_data *rdp)
  */
 void show_rcu_gp_kthreads(void)
 {
+	unsigned long cbs = 0;
 	int cpu;
 	unsigned long j;
 	unsigned long ja;
@@ -690,9 +691,11 @@ void show_rcu_gp_kthreads(void)
 	}
 	for_each_possible_cpu(cpu) {
 		rdp = per_cpu_ptr(&rcu_data, cpu);
+		cbs += data_race(rdp->n_cbs_invoked);
 		if (rcu_segcblist_is_offloaded(&rdp->cblist))
 			show_rcu_nocb_state(rdp);
 	}
+	pr_info("RCU callbacks invoked since boot: %lu\n", cbs);
 	show_rcu_tasks_gp_kthreads();
 }
 EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
-- 
cgit v1.2.3


From f8466f94685b5bd931384526cf51e090fd2ac706 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Sun, 3 May 2020 19:16:09 -0700
Subject: rcu: Add comment documenting rcu_callback_map's purpose

The rcu_callback_map lockdep_map structure was added back in 2013, but
its purpose has become obscure.  This commit therefore documments that the
purpose of rcu_callback map is, in the words of commit 24ef659a857 ("rcu:
Provide better diagnostics for blocking in RCU callback functions"),
to help lockdep to tie an "inappropriate voluntary context switch back
to the fact that the function is being invoked from within a callback."

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/update.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index f5a82e107bcb..ca17b771ad60 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -279,6 +279,7 @@ struct lockdep_map rcu_sched_lock_map = {
 };
 EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
 
+// Tell lockdep when RCU callbacks are being invoked.
 static struct lock_class_key rcu_callback_key;
 struct lockdep_map rcu_callback_map =
 	STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key);
-- 
cgit v1.2.3


From 77865dea25c4f45ce0c5bf61a8470af01fccd944 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 7 May 2020 15:44:46 -0700
Subject: rcu: Grace-period-kthread related sleeps to idle priority

This commit converts the long-standing schedule_timeout_interruptible()
and schedule_timeout_uninterruptible() calls used by RCU's grace-period
kthread to schedule_timeout_idle().  This conversion avoids polluting
the load-average with RCU-related sleeping.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 874c831bcc45..feb31c201dee 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1638,7 +1638,7 @@ static void rcu_gp_slow(int delay)
 	if (delay > 0 &&
 	    !(rcu_seq_ctr(rcu_state.gp_seq) %
 	      (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
-		schedule_timeout_uninterruptible(delay);
+		schedule_timeout_idle(delay);
 }
 
 static unsigned long sleep_duration;
@@ -1661,7 +1661,7 @@ static void rcu_gp_torture_wait(void)
 	duration = xchg(&sleep_duration, 0UL);
 	if (duration > 0) {
 		pr_alert("%s: Waiting %lu jiffies\n", __func__, duration);
-		schedule_timeout_uninterruptible(duration);
+		schedule_timeout_idle(duration);
 		pr_alert("%s: Wait complete\n", __func__);
 	}
 }
@@ -2727,7 +2727,7 @@ static void rcu_cpu_kthread(unsigned int cpu)
 	}
 	*statusp = RCU_KTHREAD_YIELDING;
 	trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
-	schedule_timeout_interruptible(2);
+	schedule_timeout_idle(2);
 	trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
 	*statusp = RCU_KTHREAD_WAITING;
 }
-- 
cgit v1.2.3


From a9352f72d6a9e8fe4840b9f0d97af8f5a6c52c79 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 7 May 2020 16:34:38 -0700
Subject: rcu: Priority-boost-related sleeps to idle priority

This commit converts the long-standing schedule_timeout_interruptible()
call used by RCU's priority-boosting kthreads to schedule_timeout_idle().
This conversion avoids polluting the load-average with RCU-related
sleeping.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_plugin.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 352223664ebd..25296c17a30d 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1033,7 +1033,7 @@ static int rcu_boost_kthread(void *arg)
 		if (spincnt > 10) {
 			WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING);
 			trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
-			schedule_timeout_interruptible(2);
+			schedule_timeout_idle(2);
 			trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
 			spincnt = 0;
 		}
-- 
cgit v1.2.3


From f5ca34643bbd84f514bdeee194c45dd1fb066ef2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 7 May 2020 16:36:10 -0700
Subject: rcu: No-CBs-related sleeps to idle priority

This commit converts the schedule_timeout_interruptible() call used by
RCU's no-CBs grace-period kthreads to schedule_timeout_idle().  This
conversion avoids polluting the load-average with RCU-related sleeping.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_plugin.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 25296c17a30d..982fc5be5269 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2005,7 +2005,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
 		/* Polling, so trace if first poll in the series. */
 		if (gotcbs)
 			trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
-		schedule_timeout_interruptible(1);
+		schedule_timeout_idle(1);
 	} else if (!needwait_gp) {
 		/* Wait for callbacks to appear. */
 		trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
-- 
cgit v1.2.3


From 68c2f27e01f61760e6ae76fff9682e1ffe9bacb6 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 7 May 2020 16:38:29 -0700
Subject: rcu: Expedited grace-period sleeps to idle priority

This commit converts the schedule_timeout_uninterruptible() call used
by RCU's expedited grace-period processing to schedule_timeout_idle().
This conversion avoids polluting the load-average with RCU-related
sleeping.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_exp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 72952edad1e4..1888c0eb1216 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -403,7 +403,7 @@ retry_ipi:
 			/* Online, so delay for a bit and try again. */
 			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 			trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("selectofl"));
-			schedule_timeout_uninterruptible(1);
+			schedule_timeout_idle(1);
 			goto retry_ipi;
 		}
 		/* CPU really is offline, so we must report its QS. */
-- 
cgit v1.2.3


From 360fbbb4897c98971e8955b063c01250817a2191 Mon Sep 17 00:00:00 2001
From: Lihao Liang <lihaoliang@google.com>
Date: Thu, 14 May 2020 21:34:34 +0100
Subject: rcu: Update comment from rsp->rcu_gp_seq to rsp->gp_seq

Signed-off-by: Lihao Liang <lihaoliang@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 9c6f7343bec0..575745f0a464 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -41,7 +41,7 @@ struct rcu_node {
 	raw_spinlock_t __private lock;	/* Root rcu_node's lock protects */
 					/*  some rcu_state fields as well as */
 					/*  following. */
-	unsigned long gp_seq;	/* Track rsp->rcu_gp_seq. */
+	unsigned long gp_seq;	/* Track rsp->gp_seq. */
 	unsigned long gp_seq_needed; /* Track furthest future GP request. */
 	unsigned long completedqs; /* All QSes done for this node. */
 	unsigned long qsmask;	/* CPUs or groups that need to switch in */
@@ -149,7 +149,7 @@ union rcu_noqs {
 /* Per-CPU data for read-copy update. */
 struct rcu_data {
 	/* 1) quiescent-state and grace-period handling : */
-	unsigned long	gp_seq;		/* Track rsp->rcu_gp_seq counter. */
+	unsigned long	gp_seq;		/* Track rsp->gp_seq counter. */
 	unsigned long	gp_seq_needed;	/* Track furthest future GP request. */
 	union rcu_noqs	cpu_no_qs;	/* No QSes yet for this CPU. */
 	bool		core_needs_qs;	/* Core waits for quiesc state. */
-- 
cgit v1.2.3


From 3c8920e2dbd1a55f72dc14d656df9d0097cf5c72 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 15 May 2020 02:34:29 +0200
Subject: tick/nohz: Narrow down noise while setting current task's tick
 dependency

Setting a tick dependency on any task, including the case where a task
sets that dependency on itself, triggers an IPI to all CPUs.  That is
of course suboptimal but it had previously not been an issue because it
was only used by POSIX CPU timers on nohz_full, which apparently never
occurs in latency-sensitive workloads in production.  (Or users of such
systems are suffering in silence on the one hand or venting their ire
on the wrong people on the other.)

But RCU now sets a task tick dependency on the current task in order
to fix stall issues that can occur during RCU callback processing.
Thus, RCU callback processing triggers frequent system-wide IPIs from
nohz_full CPUs.  This is quite counter-productive, after all, avoiding
IPIs is what nohz_full is supposed to be all about.

This commit therefore optimizes tasks' self-setting of a task tick
dependency by using tick_nohz_full_kick() to avoid the system-wide IPI.
Instead, only the execution of the one task is disturbed, which is
acceptable given that this disturbance is well down into the noise
compared to the degree to which the RCU callback processing itself
disturbs execution.

Fixes: 6a949b7af82d (rcu: Force on tick when invoking lots of callbacks)
Reported-by: Matt Fleming <matt@codeblueprint.co.uk>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: stable@kernel.org
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/time/tick-sched.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3e2dc9b8858c..f0199a4ba1ad 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -351,16 +351,24 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
 EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);
 
 /*
- * Set a per-task tick dependency. Posix CPU timers need this in order to elapse
- * per task timers.
+ * Set a per-task tick dependency. RCU need this. Also posix CPU timers
+ * in order to elapse per task timers.
  */
 void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
 {
-	/*
-	 * We could optimize this with just kicking the target running the task
-	 * if that noise matters for nohz full users.
-	 */
-	tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit);
+	if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) {
+		if (tsk == current) {
+			preempt_disable();
+			tick_nohz_full_kick();
+			preempt_enable();
+		} else {
+			/*
+			 * Some future tick_nohz_full_kick_task()
+			 * should optimize this.
+			 */
+			tick_nohz_full_kick_all();
+		}
+	}
 }
 EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);
 
-- 
cgit v1.2.3


From 55fbe86ef303bc8ab040e579fba34a750c08200e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 19 May 2020 15:02:02 -0700
Subject: rcu: Remove initialized but unused rnp from check_slow_task()

This commit removes the variable rnp from check_slow_task(), which
is defined, assigned to, but not otherwise used.

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_stall.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 2768ce6bf657..d203f82a380a 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -237,14 +237,12 @@ struct rcu_stall_chk_rdr {
  */
 static bool check_slow_task(struct task_struct *t, void *arg)
 {
-	struct rcu_node *rnp;
 	struct rcu_stall_chk_rdr *rscrp = arg;
 
 	if (task_curr(t))
 		return false; // It is running, so decline to inspect it.
 	rscrp->nesting = t->rcu_read_lock_nesting;
 	rscrp->rs = t->rcu_read_unlock_special;
-	rnp = t->rcu_blocked_node;
 	rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry);
 	return true;
 }
-- 
cgit v1.2.3


From 04b25a495bd68c1dad07263fb91e8b5a31c00a9e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 19 May 2020 17:00:54 -0700
Subject: rcu: Mark rcu_nmi_enter() call to rcu_cleanup_after_idle() noinstr

The objtool complains about the call to rcu_cleanup_after_idle() from
rcu_nmi_enter(), so this commit adds instrumentation_begin() before that
call and instrumentation_end() after it.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index feb31c201dee..d17e5a08bf43 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -990,8 +990,11 @@ noinstr void rcu_nmi_enter(void)
 		rcu_dynticks_eqs_exit();
 		// ... but is watching here.
 
-		if (!in_nmi())
+		if (!in_nmi()) {
+			instrumentation_begin();
 			rcu_cleanup_after_idle();
+			instrumentation_end();
+		}
 
 		instrumentation_begin();
 		// instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
-- 
cgit v1.2.3


From d29e0b26b020422cc51b5b51733cc50fcf443965 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 28 May 2020 08:49:29 -0700
Subject: lockdep: Complain only once about RCU in extended quiescent state

Currently, lockdep_rcu_suspicious() complains twice about RCU read-side
critical sections being invoked from within extended quiescent states,
for example:

	RCU used illegally from idle CPU!
	rcu_scheduler_active = 2, debug_locks = 1
	RCU used illegally from extended quiescent state!

This commit therefore saves a couple lines of code and one line of
console-log output by eliminating the first of these two complaints.

Link: https://lore.kernel.org/lkml/87wo4wnpzb.fsf@nanos.tec.linutronix.de
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/locking/lockdep.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 29a8de4c50b9..0a7549d159ed 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -5851,9 +5851,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
 	pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
 	       !rcu_lockdep_current_cpu_online()
 			? "RCU used illegally from offline CPU!\n"
-			: !rcu_is_watching()
-				? "RCU used illegally from idle CPU!\n"
-				: "",
+			: "",
 	       rcu_scheduler_active, debug_locks);
 
 	/*
-- 
cgit v1.2.3


From e40bb921119814c6f746891af9cd37eccda616a4 Mon Sep 17 00:00:00 2001
From: Jules Irenge <jbi.octave@gmail.com>
Date: Mon, 1 Jun 2020 19:45:49 +0100
Subject: rcu: Replace 1 with true

Coccinelle reports a warning

WARNING: Assignment of 0/1 to bool variable

The root cause is that the variable lastphase is a bool, but is
initialised with integer 1.  This commit therefore replaces the 1 with
a true.

Signed-off-by: Jules Irenge <jbi.octave@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/update.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index ca17b771ad60..a0ba8858dd35 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -207,7 +207,7 @@ void rcu_end_inkernel_boot(void)
 	rcu_unexpedite_gp();
 	if (rcu_normal_after_boot)
 		WRITE_ONCE(rcu_normal, 1);
-	rcu_boot_ended = 1;
+	rcu_boot_ended = true;
 }
 
 /*
-- 
cgit v1.2.3


From c6dfd72b7a3b70a2054db0f73245ea2f762a8452 Mon Sep 17 00:00:00 2001
From: Peter Enderborg <peter.enderborg@sony.com>
Date: Thu, 4 Jun 2020 12:23:20 +0200
Subject: rcu: Stop shrinker loop

The count and scan can be separated in time, and there is a fair chance
that all work is already done when the scan starts, which might in turn
result in a needless retry.  This commit therefore avoids this retry by
returning SHRINK_STOP.

Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Peter Enderborg <peter.enderborg@sony.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d17e5a08bf43..c8196fab563c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3332,7 +3332,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 			break;
 	}
 
-	return freed;
+	return freed == 0 ? SHRINK_STOP : freed;
 }
 
 static struct shrinker kfree_rcu_shrinker = {
-- 
cgit v1.2.3


From 00943a609d7ad0f08e58bc9c214f38b0ba163c88 Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@linux.alibaba.com>
Date: Fri, 12 Jun 2020 10:07:52 +0800
Subject: rcu: gp_max is protected by root rcu_node's lock

Because gp_max is protected by root rcu_node's lock, this commit moves
the gp_max definition to the region of the rcu_node structure containing
fields protected by this lock.

Signed-off-by: Wei Yang <richard.weiyang@linux.alibaba.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 575745f0a464..09ec93b16f28 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -302,6 +302,8 @@ struct rcu_state {
 	u8	boost ____cacheline_internodealigned_in_smp;
 						/* Subject to priority boost. */
 	unsigned long gp_seq;			/* Grace-period sequence #. */
+	unsigned long gp_max;			/* Maximum GP duration in */
+						/*  jiffies. */
 	struct task_struct *gp_kthread;		/* Task for grace periods. */
 	struct swait_queue_head gp_wq;		/* Where GP task waits. */
 	short gp_flags;				/* Commands for GP task. */
@@ -347,8 +349,6 @@ struct rcu_state {
 						/*  a reluctant CPU. */
 	unsigned long n_force_qs_gpstart;	/* Snapshot of n_force_qs at */
 						/*  GP start. */
-	unsigned long gp_max;			/* Maximum GP duration in */
-						/*  jiffies. */
 	const char *name;			/* Name of structure. */
 	char abbr;				/* Abbreviated name. */
 
-- 
cgit v1.2.3


From a2dae43088d51c4869e7fa91ca09bcc890e277fc Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@linux.alibaba.com>
Date: Fri, 12 Jun 2020 10:07:53 +0800
Subject: rcu: grplo/grphi just records CPU number

The ->grplo and ->grphi fields store the lowest and highest CPU number
covered by to a rcu_node structure, which is not the group number.
This commit therefore adjusts these fields' comments to match reality.

Signed-off-by: Wei Yang <richard.weiyang@linux.alibaba.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 09ec93b16f28..9f903f5c9fa1 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -73,8 +73,8 @@ struct rcu_node {
 	unsigned long ffmask;	/* Fully functional CPUs. */
 	unsigned long grpmask;	/* Mask to apply to parent qsmask. */
 				/*  Only one bit will be set in this mask. */
-	int	grplo;		/* lowest-numbered CPU or group here. */
-	int	grphi;		/* highest-numbered CPU or group here. */
+	int	grplo;		/* lowest-numbered CPU here. */
+	int	grphi;		/* highest-numbered CPU here. */
 	u8	grpnum;		/* CPU/group number for next level up. */
 	u8	level;		/* root is at level 0. */
 	bool	wait_blkd_tasks;/* Necessary to wait for blocked tasks to */
-- 
cgit v1.2.3


From 7a0c2b0940c13a06573320ab7118375b35feef8b Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@linux.alibaba.com>
Date: Fri, 12 Jun 2020 10:07:54 +0800
Subject: rcu: grpnum just records group number

The ->grpnum field in the rcu_node structure contains the bit position
in this structure's parent's bitmasks, which is not the CPU number.
This commit therefore adjusts this field's comment accordingly.

Signed-off-by: Wei Yang <richard.weiyang@linux.alibaba.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 9f903f5c9fa1..c96ae351688b 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -75,7 +75,7 @@ struct rcu_node {
 				/*  Only one bit will be set in this mask. */
 	int	grplo;		/* lowest-numbered CPU here. */
 	int	grphi;		/* highest-numbered CPU here. */
-	u8	grpnum;		/* CPU/group number for next level up. */
+	u8	grpnum;		/* group number for next level up. */
 	u8	level;		/* root is at level 0. */
 	bool	wait_blkd_tasks;/* Necessary to wait for blocked tasks to */
 				/*  exit RCU read-side critical sections */
-- 
cgit v1.2.3


From c3cb47a6cc74af0b79579ba167d7124eb669fbaa Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 15 Jun 2020 12:28:05 -0700
Subject: kernel/rcu/tree.c: Fix kernel-doc warnings

Fix kernel-doc warning:

../kernel/rcu/tree.c:959: warning: Excess function parameter 'irq' description in 'rcu_nmi_enter'

Fixes: cf7614e13c8f ("rcu: Refactor rcu_{nmi,irq}_{enter,exit}()")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c8196fab563c..ef05aac7f9d3 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -954,7 +954,6 @@ void __rcu_irq_enter_check_tick(void)
 
 /**
  * rcu_nmi_enter - inform RCU of entry to NMI context
- * @irq: Is this call from rcu_irq_enter?
  *
  * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and
  * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know
-- 
cgit v1.2.3


From 8e11690d2f5a9823d66f68918c3986b4e9e160ab Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Mon, 4 May 2020 14:35:00 +0200
Subject: rcu: Fix a kernel-doc warnings for "count"

There are some kernel-doc warnings:

	./kernel/rcu/tree.c:2915: warning: Function parameter or member 'count' not described in 'kfree_rcu_cpu'

This commit therefore moves the comment for "count" to the kernel-doc
markup.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 6c6569e0586c..ba4c477495b5 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3004,6 +3004,7 @@ struct kfree_rcu_cpu_work {
  * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
  * @monitor_todo: Tracks whether a @monitor_work delayed work is pending
  * @initialized: The @lock and @rcu_work fields have been initialized
+ * @count: Number of objects for which GP not started
  *
  * This is a per-CPU structure.  The reason that it is not included in
  * the rcu_data structure is to permit this code to be extracted from
@@ -3019,7 +3020,6 @@ struct kfree_rcu_cpu {
 	struct delayed_work monitor_work;
 	bool monitor_todo;
 	bool initialized;
-	// Number of objects for which GP not started
 	int count;
 };
 
-- 
cgit v1.2.3


From 8ac88f7177c75bf9b7b8c29a8054115e1c712baf Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Mon, 25 May 2020 23:47:45 +0200
Subject: rcu/tree: Keep kfree_rcu() awake during lock contention

On PREEMPT_RT kernels, the krcp spinlock gets converted to an rt-mutex
and causes kfree_rcu() callers to sleep. This makes it unusable for
callers in purely atomic sections such as non-threaded IRQ handlers and
raw spinlock sections. Fix it by converting the spinlock to a raw
spinlock.

Vetting all code paths, there is no reason to believe that the raw
spinlock will hurt RT latencies as it is not held for a long time.

Cc: bigeasy@linutronix.de
Cc: Uladzislau Rezki <urezki@gmail.com>
Reviewed-by: Uladzislau Rezki <urezki@gmail.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index ba4c477495b5..c5de5adca0dd 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3016,7 +3016,7 @@ struct kfree_rcu_cpu {
 	struct kfree_rcu_bulk_data *bhead;
 	struct kfree_rcu_bulk_data *bcached;
 	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
-	spinlock_t lock;
+	raw_spinlock_t lock;
 	struct delayed_work monitor_work;
 	bool monitor_todo;
 	bool initialized;
@@ -3049,12 +3049,12 @@ static void kfree_rcu_work(struct work_struct *work)
 	krwp = container_of(to_rcu_work(work),
 			    struct kfree_rcu_cpu_work, rcu_work);
 	krcp = krwp->krcp;
-	spin_lock_irqsave(&krcp->lock, flags);
+	raw_spin_lock_irqsave(&krcp->lock, flags);
 	head = krwp->head_free;
 	krwp->head_free = NULL;
 	bhead = krwp->bhead_free;
 	krwp->bhead_free = NULL;
-	spin_unlock_irqrestore(&krcp->lock, flags);
+	raw_spin_unlock_irqrestore(&krcp->lock, flags);
 
 	/* "bhead" is now private, so traverse locklessly. */
 	for (; bhead; bhead = bnext) {
@@ -3157,14 +3157,14 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
 	krcp->monitor_todo = false;
 	if (queue_kfree_rcu_work(krcp)) {
 		// Success! Our job is done here.
-		spin_unlock_irqrestore(&krcp->lock, flags);
+		raw_spin_unlock_irqrestore(&krcp->lock, flags);
 		return;
 	}
 
 	// Previous RCU batch still in progress, try again later.
 	krcp->monitor_todo = true;
 	schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
-	spin_unlock_irqrestore(&krcp->lock, flags);
+	raw_spin_unlock_irqrestore(&krcp->lock, flags);
 }
 
 /*
@@ -3177,11 +3177,11 @@ static void kfree_rcu_monitor(struct work_struct *work)
 	struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
 						 monitor_work.work);
 
-	spin_lock_irqsave(&krcp->lock, flags);
+	raw_spin_lock_irqsave(&krcp->lock, flags);
 	if (krcp->monitor_todo)
 		kfree_rcu_drain_unlock(krcp, flags);
 	else
-		spin_unlock_irqrestore(&krcp->lock, flags);
+		raw_spin_unlock_irqrestore(&krcp->lock, flags);
 }
 
 static inline bool
@@ -3252,7 +3252,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 	local_irq_save(flags);	// For safely calling this_cpu_ptr().
 	krcp = this_cpu_ptr(&krc);
 	if (krcp->initialized)
-		spin_lock(&krcp->lock);
+		raw_spin_lock(&krcp->lock);
 
 	// Queue the object but don't yet schedule the batch.
 	if (debug_rcu_head_queue(head)) {
@@ -3283,7 +3283,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 
 unlock_return:
 	if (krcp->initialized)
-		spin_unlock(&krcp->lock);
+		raw_spin_unlock(&krcp->lock);
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(kfree_call_rcu);
@@ -3315,11 +3315,11 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
 
 		count = krcp->count;
-		spin_lock_irqsave(&krcp->lock, flags);
+		raw_spin_lock_irqsave(&krcp->lock, flags);
 		if (krcp->monitor_todo)
 			kfree_rcu_drain_unlock(krcp, flags);
 		else
-			spin_unlock_irqrestore(&krcp->lock, flags);
+			raw_spin_unlock_irqrestore(&krcp->lock, flags);
 
 		sc->nr_to_scan -= count;
 		freed += count;
@@ -3346,15 +3346,15 @@ void __init kfree_rcu_scheduler_running(void)
 	for_each_online_cpu(cpu) {
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
 
-		spin_lock_irqsave(&krcp->lock, flags);
+		raw_spin_lock_irqsave(&krcp->lock, flags);
 		if (!krcp->head || krcp->monitor_todo) {
-			spin_unlock_irqrestore(&krcp->lock, flags);
+			raw_spin_unlock_irqrestore(&krcp->lock, flags);
 			continue;
 		}
 		krcp->monitor_todo = true;
 		schedule_delayed_work_on(cpu, &krcp->monitor_work,
 					 KFREE_DRAIN_JIFFIES);
-		spin_unlock_irqrestore(&krcp->lock, flags);
+		raw_spin_unlock_irqrestore(&krcp->lock, flags);
 	}
 }
 
@@ -4250,7 +4250,7 @@ static void __init kfree_rcu_batch_init(void)
 	for_each_possible_cpu(cpu) {
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
 
-		spin_lock_init(&krcp->lock);
+		raw_spin_lock_init(&krcp->lock);
 		for (i = 0; i < KFREE_N_BATCHES; i++) {
 			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
 			krcp->krw_arr[i].krcp = krcp;
-- 
cgit v1.2.3


From 4d2919411867848fab78c7cb13139e17ad8b85bc Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Mon, 25 May 2020 23:47:46 +0200
Subject: rcu/tree: Skip entry into the page allocator for PREEMPT_RT

To keep the kfree_rcu() code working in purely atomic sections on RT,
such as non-threaded IRQ handlers and raw spinlock sections, avoid
calling into the page allocator which uses sleeping locks on RT.

In fact, even if the  caller is preemptible, the kfree_rcu() code is
not, as the krcp->lock is a raw spinlock.

Calling into the page allocator is optional and avoiding it should be
Ok, especially with the page pre-allocation support in future patches.
Such pre-allocation would further avoid the a need for a dynamically
allocated page in the first place.

Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Uladzislau Rezki <urezki@gmail.com>
Co-developed-by: Uladzislau Rezki <urezki@gmail.com>
Signed-off-by: Uladzislau Rezki <urezki@gmail.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c5de5adca0dd..e0425faf3b3b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3202,6 +3202,18 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
 		if (!bnode) {
 			WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE);
 
+			/*
+			 * To keep this path working on raw non-preemptible
+			 * sections, prevent the optional entry into the
+			 * allocator as it uses sleeping locks. In fact, even
+			 * if the caller of kfree_rcu() is preemptible, this
+			 * path still is not, as krcp->lock is a raw spinlock.
+			 * With additional page pre-allocation in the works,
+			 * hitting this return is going to be much less likely.
+			 */
+			if (IS_ENABLED(CONFIG_PREEMPT_RT))
+				return false;
+
 			bnode = (struct kfree_rcu_bulk_data *)
 				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 		}
-- 
cgit v1.2.3


From 594aa5975b9b5cfe9edaec06170e43b8c0607377 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Mon, 25 May 2020 23:47:47 +0200
Subject: rcu/tree: Repeat the monitor if any free channel is busy

It is possible that one of the channels cannot be detached
because its free channel is busy and previously queued data
has not been processed yet. On the other hand, another
channel can be successfully detached causing the monitor
work to stop.

Prevent that by rescheduling the monitor work if there are
any channels in the pending state after a detach attempt.

Fixes: 34c881745549e ("rcu: Support kfree_bulk() interface in kfree_rcu()")
Acked-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e0425faf3b3b..5151fe4e1429 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3105,7 +3105,7 @@ static void kfree_rcu_work(struct work_struct *work)
 static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
 {
 	struct kfree_rcu_cpu_work *krwp;
-	bool queued = false;
+	bool repeat = false;
 	int i;
 
 	lockdep_assert_held(&krcp->lock);
@@ -3143,11 +3143,14 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
 			 * been detached following each other, one by one.
 			 */
 			queue_rcu_work(system_wq, &krwp->rcu_work);
-			queued = true;
 		}
+
+		/* Repeat if any "free" corresponding channel is still busy. */
+		if (krcp->bhead || krcp->head)
+			repeat = true;
 	}
 
-	return queued;
+	return !repeat;
 }
 
 static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
-- 
cgit v1.2.3


From 446044eb9c9c335d3ae1be4665193ab43ebb284e Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Mon, 25 May 2020 23:47:48 +0200
Subject: rcu/tree: Make debug_objects logic independent of rcu_head

kfree_rcu()'s debug_objects logic uses the address of the object's
embedded rcu_head to queue/unqueue. Instead of this, make use of the
object's address itself as preparation for future headless kfree_rcu()
support.

Reviewed-by: Uladzislau Rezki <urezki@gmail.com>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 5151fe4e1429..143c1e9265b6 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2970,13 +2970,11 @@ EXPORT_SYMBOL_GPL(call_rcu);
  * @nr_records: Number of active pointers in the array
  * @records: Array of the kfree_rcu() pointers
  * @next: Next bulk object in the block chain
- * @head_free_debug: For debug, when CONFIG_DEBUG_OBJECTS_RCU_HEAD is set
  */
 struct kfree_rcu_bulk_data {
 	unsigned long nr_records;
 	void *records[KFREE_BULK_MAX_ENTR];
 	struct kfree_rcu_bulk_data *next;
-	struct rcu_head *head_free_debug;
 };
 
 /**
@@ -3026,11 +3024,13 @@ struct kfree_rcu_cpu {
 static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
 
 static __always_inline void
-debug_rcu_head_unqueue_bulk(struct rcu_head *head)
+debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead)
 {
 #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
-	for (; head; head = head->next)
-		debug_rcu_head_unqueue(head);
+	int i;
+
+	for (i = 0; i < bhead->nr_records; i++)
+		debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
 #endif
 }
 
@@ -3060,7 +3060,7 @@ static void kfree_rcu_work(struct work_struct *work)
 	for (; bhead; bhead = bnext) {
 		bnext = bhead->next;
 
-		debug_rcu_head_unqueue_bulk(bhead->head_free_debug);
+		debug_rcu_bhead_unqueue(bhead);
 
 		rcu_lock_acquire(&rcu_callback_map);
 		trace_rcu_invoke_kfree_bulk_callback(rcu_state.name,
@@ -3082,14 +3082,15 @@ static void kfree_rcu_work(struct work_struct *work)
 	 */
 	for (; head; head = next) {
 		unsigned long offset = (unsigned long)head->func;
+		void *ptr = (void *)head - offset;
 
 		next = head->next;
-		debug_rcu_head_unqueue(head);
+		debug_rcu_head_unqueue((struct rcu_head *)ptr);
 		rcu_lock_acquire(&rcu_callback_map);
 		trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
 
 		if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset)))
-			kfree((void *)head - offset);
+			kfree(ptr);
 
 		rcu_lock_release(&rcu_callback_map);
 		cond_resched_tasks_rcu_qs();
@@ -3228,18 +3229,11 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
 		/* Initialize the new block. */
 		bnode->nr_records = 0;
 		bnode->next = krcp->bhead;
-		bnode->head_free_debug = NULL;
 
 		/* Attach it to the head. */
 		krcp->bhead = bnode;
 	}
 
-#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
-	head->func = func;
-	head->next = krcp->bhead->head_free_debug;
-	krcp->bhead->head_free_debug = head;
-#endif
-
 	/* Finally insert. */
 	krcp->bhead->records[krcp->bhead->nr_records++] =
 		(void *) head - (unsigned long) func;
@@ -3263,14 +3257,17 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
 	unsigned long flags;
 	struct kfree_rcu_cpu *krcp;
+	void *ptr;
 
 	local_irq_save(flags);	// For safely calling this_cpu_ptr().
 	krcp = this_cpu_ptr(&krc);
 	if (krcp->initialized)
 		raw_spin_lock(&krcp->lock);
 
+	ptr = (void *)head - (unsigned long)func;
+
 	// Queue the object but don't yet schedule the batch.
-	if (debug_rcu_head_queue(head)) {
+	if (debug_rcu_head_queue(ptr)) {
 		// Probable double kfree_rcu(), just leak.
 		WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
 			  __func__, head);
-- 
cgit v1.2.3


From 3af84862817403d317dc33312e7a88d76e79401a Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Mon, 25 May 2020 23:47:49 +0200
Subject: rcu/tree: Simplify KFREE_BULK_MAX_ENTR macro

We can simplify KFREE_BULK_MAX_ENTR macro and get rid of
magic numbers which were used to make the structure to be
exactly one page.

Suggested-by: Boqun Feng <boqun.feng@gmail.com>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 143c1e9265b6..bcdc06364426 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2958,13 +2958,6 @@ EXPORT_SYMBOL_GPL(call_rcu);
 #define KFREE_DRAIN_JIFFIES (HZ / 50)
 #define KFREE_N_BATCHES 2
 
-/*
- * This macro defines how many entries the "records" array
- * will contain. It is based on the fact that the size of
- * kfree_rcu_bulk_data structure becomes exactly one page.
- */
-#define KFREE_BULK_MAX_ENTR ((PAGE_SIZE / sizeof(void *)) - 3)
-
 /**
  * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers
  * @nr_records: Number of active pointers in the array
@@ -2973,10 +2966,18 @@ EXPORT_SYMBOL_GPL(call_rcu);
  */
 struct kfree_rcu_bulk_data {
 	unsigned long nr_records;
-	void *records[KFREE_BULK_MAX_ENTR];
 	struct kfree_rcu_bulk_data *next;
+	void *records[];
 };
 
+/*
+ * This macro defines how many entries the "records" array
+ * will contain. It is based on the fact that the size of
+ * kfree_rcu_bulk_data structure becomes exactly one page.
+ */
+#define KFREE_BULK_MAX_ENTR \
+	((PAGE_SIZE - sizeof(struct kfree_rcu_bulk_data)) / sizeof(void *))
+
 /**
  * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
  * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
-- 
cgit v1.2.3


From 952371d6fc0bc360d1d5780f86bb355836117ca2 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Mon, 25 May 2020 23:47:50 +0200
Subject: rcu/tree: Move kfree_rcu_cpu locking/unlocking to separate functions

Introduce helpers to lock and unlock per-cpu "kfree_rcu_cpu"
structures. That will make kfree_call_rcu() more readable
and prevent programming errors.

Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index bcdc06364426..368bdc441ffb 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3035,6 +3035,27 @@ debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead)
 #endif
 }
 
+static inline struct kfree_rcu_cpu *
+krc_this_cpu_lock(unsigned long *flags)
+{
+	struct kfree_rcu_cpu *krcp;
+
+	local_irq_save(*flags);	// For safely calling this_cpu_ptr().
+	krcp = this_cpu_ptr(&krc);
+	if (likely(krcp->initialized))
+		raw_spin_lock(&krcp->lock);
+
+	return krcp;
+}
+
+static inline void
+krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
+{
+	if (likely(krcp->initialized))
+		raw_spin_unlock(&krcp->lock);
+	local_irq_restore(flags);
+}
+
 /*
  * This function is invoked in workqueue context after a grace period.
  * It frees all the objects queued on ->bhead_free or ->head_free.
@@ -3260,11 +3281,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 	struct kfree_rcu_cpu *krcp;
 	void *ptr;
 
-	local_irq_save(flags);	// For safely calling this_cpu_ptr().
-	krcp = this_cpu_ptr(&krc);
-	if (krcp->initialized)
-		raw_spin_lock(&krcp->lock);
-
+	krcp = krc_this_cpu_lock(&flags);
 	ptr = (void *)head - (unsigned long)func;
 
 	// Queue the object but don't yet schedule the batch.
@@ -3295,9 +3312,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 	}
 
 unlock_return:
-	if (krcp->initialized)
-		raw_spin_unlock(&krcp->lock);
-	local_irq_restore(flags);
+	krc_this_cpu_unlock(krcp, flags);
 }
 EXPORT_SYMBOL_GPL(kfree_call_rcu);
 
-- 
cgit v1.2.3


From 69f08d3999dbef1553a3332b8055282dd3893b6c Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 25 May 2020 23:47:51 +0200
Subject: rcu/tree: Use static initializer for krc.lock

The per-CPU variable is initialized at runtime in
kfree_rcu_batch_init(). This function is invoked before
'rcu_scheduler_active' is set to 'RCU_SCHEDULER_RUNNING'.
After the initialisation, '->initialized' is to true.

The raw_spin_lock is only acquired if '->initialized' is
set to true. The worqueue item is only used if 'rcu_scheduler_active'
set to RCU_SCHEDULER_RUNNING which happens after initialisation.

Use a static initializer for krc.lock and remove the runtime
initialisation of the lock. Since the lock can now be always
acquired, remove the '->initialized' check.

Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 368bdc441ffb..a42a4693f161 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3002,7 +3002,7 @@ struct kfree_rcu_cpu_work {
  * @lock: Synchronize access to this structure
  * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
  * @monitor_todo: Tracks whether a @monitor_work delayed work is pending
- * @initialized: The @lock and @rcu_work fields have been initialized
+ * @initialized: The @rcu_work fields have been initialized
  * @count: Number of objects for which GP not started
  *
  * This is a per-CPU structure.  The reason that it is not included in
@@ -3022,7 +3022,9 @@ struct kfree_rcu_cpu {
 	int count;
 };
 
-static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
+static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
+	.lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
+};
 
 static __always_inline void
 debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead)
@@ -3042,8 +3044,7 @@ krc_this_cpu_lock(unsigned long *flags)
 
 	local_irq_save(*flags);	// For safely calling this_cpu_ptr().
 	krcp = this_cpu_ptr(&krc);
-	if (likely(krcp->initialized))
-		raw_spin_lock(&krcp->lock);
+	raw_spin_lock(&krcp->lock);
 
 	return krcp;
 }
@@ -3051,8 +3052,7 @@ krc_this_cpu_lock(unsigned long *flags)
 static inline void
 krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
 {
-	if (likely(krcp->initialized))
-		raw_spin_unlock(&krcp->lock);
+	raw_spin_unlock(&krcp->lock);
 	local_irq_restore(flags);
 }
 
@@ -4278,7 +4278,6 @@ static void __init kfree_rcu_batch_init(void)
 	for_each_possible_cpu(cpu) {
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
 
-		raw_spin_lock_init(&krcp->lock);
 		for (i = 0; i < KFREE_N_BATCHES; i++) {
 			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
 			krcp->krw_arr[i].krcp = krcp;
-- 
cgit v1.2.3


From 53c72b590b3a0afd6747d6f7957e6838003e90a4 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Mon, 25 May 2020 23:47:52 +0200
Subject: rcu/tree: cache specified number of objects

In order to reduce the dynamic need for pages in kfree_rcu(),
pre-allocate a configurable number of pages per CPU and link
them in a list. When kfree_rcu() reclaims objects, the object's
container page is cached into a list instead of being released
to the low-level page allocator.

Such an approach provides O(1) access to free pages while also
reducing the number of requests to the page allocator. It also
makes the kfree_rcu() code to have free pages available during
a low memory condition.

A read-only sysfs parameter (rcu_min_cached_objs) reflects the
minimum number of allowed cached pages per CPU.

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  8 +++
 kernel/rcu/tree.c                               | 66 +++++++++++++++++++++++--
 2 files changed, 70 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index fb95fad81c79..befaa63652ff 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4038,6 +4038,14 @@
 			latencies, which will choose a value aligned
 			with the appropriate hardware boundaries.
 
+	rcutree.rcu_min_cached_objs= [KNL]
+			Minimum number of objects which are cached and
+			maintained per one CPU. Object size is equal
+			to PAGE_SIZE. The cache allows to reduce the
+			pressure to page allocator, also it makes the
+			whole algorithm to behave better in low memory
+			condition.
+
 	rcutree.jiffies_till_first_fqs= [KNL]
 			Set delay from grace-period initialization to
 			first attempt to force quiescent states.
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a42a4693f161..37c0cd0332f8 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -175,6 +175,15 @@ module_param(gp_init_delay, int, 0444);
 static int gp_cleanup_delay;
 module_param(gp_cleanup_delay, int, 0444);
 
+/*
+ * This rcu parameter is runtime-read-only. It reflects
+ * a minimum allowed number of objects which can be cached
+ * per-CPU. Object size is equal to one page. This value
+ * can be changed at boot time.
+ */
+static int rcu_min_cached_objs = 2;
+module_param(rcu_min_cached_objs, int, 0444);
+
 /* Retrieve RCU kthreads priority for rcutorture */
 int rcu_get_gp_kthreads_prio(void)
 {
@@ -2997,7 +3006,6 @@ struct kfree_rcu_cpu_work {
  * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
  * @head: List of kfree_rcu() objects not yet waiting for a grace period
  * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period
- * @bcached: Keeps at most one object for later reuse when build chain blocks
  * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
  * @lock: Synchronize access to this structure
  * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
@@ -3013,13 +3021,22 @@ struct kfree_rcu_cpu_work {
 struct kfree_rcu_cpu {
 	struct rcu_head *head;
 	struct kfree_rcu_bulk_data *bhead;
-	struct kfree_rcu_bulk_data *bcached;
 	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
 	raw_spinlock_t lock;
 	struct delayed_work monitor_work;
 	bool monitor_todo;
 	bool initialized;
 	int count;
+
+	/*
+	 * A simple cache list that contains objects for
+	 * reuse purpose. In order to save some per-cpu
+	 * space the list is singular. Even though it is
+	 * lockless an access has to be protected by the
+	 * per-cpu lock.
+	 */
+	struct llist_head bkvcache;
+	int nr_bkv_objs;
 };
 
 static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
@@ -3056,6 +3073,31 @@ krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
 	local_irq_restore(flags);
 }
 
+static inline struct kfree_rcu_bulk_data *
+get_cached_bnode(struct kfree_rcu_cpu *krcp)
+{
+	if (!krcp->nr_bkv_objs)
+		return NULL;
+
+	krcp->nr_bkv_objs--;
+	return (struct kfree_rcu_bulk_data *)
+		llist_del_first(&krcp->bkvcache);
+}
+
+static inline bool
+put_cached_bnode(struct kfree_rcu_cpu *krcp,
+	struct kfree_rcu_bulk_data *bnode)
+{
+	// Check the limit.
+	if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
+		return false;
+
+	llist_add((struct llist_node *) bnode, &krcp->bkvcache);
+	krcp->nr_bkv_objs++;
+	return true;
+
+}
+
 /*
  * This function is invoked in workqueue context after a grace period.
  * It frees all the objects queued on ->bhead_free or ->head_free.
@@ -3091,7 +3133,12 @@ static void kfree_rcu_work(struct work_struct *work)
 		kfree_bulk(bhead->nr_records, bhead->records);
 		rcu_lock_release(&rcu_callback_map);
 
-		if (cmpxchg(&krcp->bcached, NULL, bhead))
+		krcp = krc_this_cpu_lock(&flags);
+		if (put_cached_bnode(krcp, bhead))
+			bhead = NULL;
+		krc_this_cpu_unlock(krcp, flags);
+
+		if (bhead)
 			free_page((unsigned long) bhead);
 
 		cond_resched_tasks_rcu_qs();
@@ -3224,7 +3271,7 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
 	/* Check if a new block is required. */
 	if (!krcp->bhead ||
 			krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) {
-		bnode = xchg(&krcp->bcached, NULL);
+		bnode = get_cached_bnode(krcp);
 		if (!bnode) {
 			WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE);
 
@@ -4277,12 +4324,23 @@ static void __init kfree_rcu_batch_init(void)
 
 	for_each_possible_cpu(cpu) {
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+		struct kfree_rcu_bulk_data *bnode;
 
 		for (i = 0; i < KFREE_N_BATCHES; i++) {
 			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
 			krcp->krw_arr[i].krcp = krcp;
 		}
 
+		for (i = 0; i < rcu_min_cached_objs; i++) {
+			bnode = (struct kfree_rcu_bulk_data *)
+				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+
+			if (bnode)
+				put_cached_bnode(krcp, bnode);
+			else
+				pr_err("Failed to preallocate for %d CPU!\n", cpu);
+		}
+
 		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
 		krcp->initialized = true;
 	}
-- 
cgit v1.2.3


From 5f3c8d620447d509e534962e23f7edfb85f4e533 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Mon, 25 May 2020 23:47:53 +0200
Subject: rcu/tree: Maintain separate array for vmalloc ptrs

To do so, we use an array of kvfree_rcu_bulk_data structures.
It consists of two elements:
 - index number 0 corresponds to slab pointers.
 - index number 1 corresponds to vmalloc pointers.

Keeping vmalloc pointers separated from slab pointers makes
it possible to invoke the right freeing API for the right
kind of pointer.

It also prepares us for future headless support for vmalloc
and SLAB objects. Such objects cannot be queued on a linked
list and are instead directly into an array.

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Co-developed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 173 +++++++++++++++++++++++++++++++-----------------------
 1 file changed, 100 insertions(+), 73 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 37c0cd0332f8..67c4b984c499 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -57,6 +57,8 @@
 #include <linux/slab.h>
 #include <linux/sched/isolation.h>
 #include <linux/sched/clock.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
 #include "../time/tick-internal.h"
 
 #include "tree.h"
@@ -2966,46 +2968,47 @@ EXPORT_SYMBOL_GPL(call_rcu);
 /* Maximum number of jiffies to wait before draining a batch. */
 #define KFREE_DRAIN_JIFFIES (HZ / 50)
 #define KFREE_N_BATCHES 2
+#define FREE_N_CHANNELS 2
 
 /**
- * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers
+ * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
  * @nr_records: Number of active pointers in the array
- * @records: Array of the kfree_rcu() pointers
  * @next: Next bulk object in the block chain
+ * @records: Array of the kvfree_rcu() pointers
  */
-struct kfree_rcu_bulk_data {
+struct kvfree_rcu_bulk_data {
 	unsigned long nr_records;
-	struct kfree_rcu_bulk_data *next;
+	struct kvfree_rcu_bulk_data *next;
 	void *records[];
 };
 
 /*
  * This macro defines how many entries the "records" array
  * will contain. It is based on the fact that the size of
- * kfree_rcu_bulk_data structure becomes exactly one page.
+ * kvfree_rcu_bulk_data structure becomes exactly one page.
  */
-#define KFREE_BULK_MAX_ENTR \
-	((PAGE_SIZE - sizeof(struct kfree_rcu_bulk_data)) / sizeof(void *))
+#define KVFREE_BULK_MAX_ENTR \
+	((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
 
 /**
  * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
  * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
  * @head_free: List of kfree_rcu() objects waiting for a grace period
- * @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period
+ * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
  * @krcp: Pointer to @kfree_rcu_cpu structure
  */
 
 struct kfree_rcu_cpu_work {
 	struct rcu_work rcu_work;
 	struct rcu_head *head_free;
-	struct kfree_rcu_bulk_data *bhead_free;
+	struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS];
 	struct kfree_rcu_cpu *krcp;
 };
 
 /**
  * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
  * @head: List of kfree_rcu() objects not yet waiting for a grace period
- * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period
+ * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
  * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
  * @lock: Synchronize access to this structure
  * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
@@ -3020,7 +3023,7 @@ struct kfree_rcu_cpu_work {
  */
 struct kfree_rcu_cpu {
 	struct rcu_head *head;
-	struct kfree_rcu_bulk_data *bhead;
+	struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS];
 	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
 	raw_spinlock_t lock;
 	struct delayed_work monitor_work;
@@ -3044,7 +3047,7 @@ static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
 };
 
 static __always_inline void
-debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead)
+debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
 {
 #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
 	int i;
@@ -3073,20 +3076,20 @@ krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
 	local_irq_restore(flags);
 }
 
-static inline struct kfree_rcu_bulk_data *
+static inline struct kvfree_rcu_bulk_data *
 get_cached_bnode(struct kfree_rcu_cpu *krcp)
 {
 	if (!krcp->nr_bkv_objs)
 		return NULL;
 
 	krcp->nr_bkv_objs--;
-	return (struct kfree_rcu_bulk_data *)
+	return (struct kvfree_rcu_bulk_data *)
 		llist_del_first(&krcp->bkvcache);
 }
 
 static inline bool
 put_cached_bnode(struct kfree_rcu_cpu *krcp,
-	struct kfree_rcu_bulk_data *bnode)
+	struct kvfree_rcu_bulk_data *bnode)
 {
 	// Check the limit.
 	if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
@@ -3105,43 +3108,63 @@ put_cached_bnode(struct kfree_rcu_cpu *krcp,
 static void kfree_rcu_work(struct work_struct *work)
 {
 	unsigned long flags;
+	struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext;
 	struct rcu_head *head, *next;
-	struct kfree_rcu_bulk_data *bhead, *bnext;
 	struct kfree_rcu_cpu *krcp;
 	struct kfree_rcu_cpu_work *krwp;
+	int i, j;
 
 	krwp = container_of(to_rcu_work(work),
 			    struct kfree_rcu_cpu_work, rcu_work);
 	krcp = krwp->krcp;
+
 	raw_spin_lock_irqsave(&krcp->lock, flags);
+	// Channels 1 and 2.
+	for (i = 0; i < FREE_N_CHANNELS; i++) {
+		bkvhead[i] = krwp->bkvhead_free[i];
+		krwp->bkvhead_free[i] = NULL;
+	}
+
+	// Channel 3.
 	head = krwp->head_free;
 	krwp->head_free = NULL;
-	bhead = krwp->bhead_free;
-	krwp->bhead_free = NULL;
 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
 
-	/* "bhead" is now private, so traverse locklessly. */
-	for (; bhead; bhead = bnext) {
-		bnext = bhead->next;
-
-		debug_rcu_bhead_unqueue(bhead);
-
-		rcu_lock_acquire(&rcu_callback_map);
-		trace_rcu_invoke_kfree_bulk_callback(rcu_state.name,
-			bhead->nr_records, bhead->records);
-
-		kfree_bulk(bhead->nr_records, bhead->records);
-		rcu_lock_release(&rcu_callback_map);
+	// Handle two first channels.
+	for (i = 0; i < FREE_N_CHANNELS; i++) {
+		for (; bkvhead[i]; bkvhead[i] = bnext) {
+			bnext = bkvhead[i]->next;
+			debug_rcu_bhead_unqueue(bkvhead[i]);
+
+			rcu_lock_acquire(&rcu_callback_map);
+			if (i == 0) { // kmalloc() / kfree().
+				trace_rcu_invoke_kfree_bulk_callback(
+					rcu_state.name, bkvhead[i]->nr_records,
+					bkvhead[i]->records);
+
+				kfree_bulk(bkvhead[i]->nr_records,
+					bkvhead[i]->records);
+			} else { // vmalloc() / vfree().
+				for (j = 0; j < bkvhead[i]->nr_records; j++) {
+					trace_rcu_invoke_kfree_callback(
+						rcu_state.name,
+						bkvhead[i]->records[j], 0);
+
+					vfree(bkvhead[i]->records[j]);
+				}
+			}
+			rcu_lock_release(&rcu_callback_map);
 
-		krcp = krc_this_cpu_lock(&flags);
-		if (put_cached_bnode(krcp, bhead))
-			bhead = NULL;
-		krc_this_cpu_unlock(krcp, flags);
+			krcp = krc_this_cpu_lock(&flags);
+			if (put_cached_bnode(krcp, bkvhead[i]))
+				bkvhead[i] = NULL;
+			krc_this_cpu_unlock(krcp, flags);
 
-		if (bhead)
-			free_page((unsigned long) bhead);
+			if (bkvhead[i])
+				free_page((unsigned long) bkvhead[i]);
 
-		cond_resched_tasks_rcu_qs();
+			cond_resched_tasks_rcu_qs();
+		}
 	}
 
 	/*
@@ -3159,7 +3182,7 @@ static void kfree_rcu_work(struct work_struct *work)
 		trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
 
 		if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset)))
-			kfree(ptr);
+			kvfree(ptr);
 
 		rcu_lock_release(&rcu_callback_map);
 		cond_resched_tasks_rcu_qs();
@@ -3176,7 +3199,7 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
 {
 	struct kfree_rcu_cpu_work *krwp;
 	bool repeat = false;
-	int i;
+	int i, j;
 
 	lockdep_assert_held(&krcp->lock);
 
@@ -3184,21 +3207,25 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
 		krwp = &(krcp->krw_arr[i]);
 
 		/*
-		 * Try to detach bhead or head and attach it over any
+		 * Try to detach bkvhead or head and attach it over any
 		 * available corresponding free channel. It can be that
 		 * a previous RCU batch is in progress, it means that
 		 * immediately to queue another one is not possible so
 		 * return false to tell caller to retry.
 		 */
-		if ((krcp->bhead && !krwp->bhead_free) ||
+		if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
+			(krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
 				(krcp->head && !krwp->head_free)) {
-			/* Channel 1. */
-			if (!krwp->bhead_free) {
-				krwp->bhead_free = krcp->bhead;
-				krcp->bhead = NULL;
+			// Channel 1 corresponds to SLAB ptrs.
+			// Channel 2 corresponds to vmalloc ptrs.
+			for (j = 0; j < FREE_N_CHANNELS; j++) {
+				if (!krwp->bkvhead_free[j]) {
+					krwp->bkvhead_free[j] = krcp->bkvhead[j];
+					krcp->bkvhead[j] = NULL;
+				}
 			}
 
-			/* Channel 2. */
+			// Channel 3 corresponds to emergency path.
 			if (!krwp->head_free) {
 				krwp->head_free = krcp->head;
 				krcp->head = NULL;
@@ -3207,16 +3234,17 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
 			WRITE_ONCE(krcp->count, 0);
 
 			/*
-			 * One work is per one batch, so there are two "free channels",
-			 * "bhead_free" and "head_free" the batch can handle. It can be
-			 * that the work is in the pending state when two channels have
-			 * been detached following each other, one by one.
+			 * One work is per one batch, so there are three
+			 * "free channels", the batch can handle. It can
+			 * be that the work is in the pending state when
+			 * channels have been detached following by each
+			 * other.
 			 */
 			queue_rcu_work(system_wq, &krwp->rcu_work);
 		}
 
-		/* Repeat if any "free" corresponding channel is still busy. */
-		if (krcp->bhead || krcp->head)
+		// Repeat if any "free" corresponding channel is still busy.
+		if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head)
 			repeat = true;
 	}
 
@@ -3258,23 +3286,22 @@ static void kfree_rcu_monitor(struct work_struct *work)
 }
 
 static inline bool
-kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
-	struct rcu_head *head, rcu_callback_t func)
+kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
 {
-	struct kfree_rcu_bulk_data *bnode;
+	struct kvfree_rcu_bulk_data *bnode;
+	int idx;
 
 	if (unlikely(!krcp->initialized))
 		return false;
 
 	lockdep_assert_held(&krcp->lock);
+	idx = !!is_vmalloc_addr(ptr);
 
 	/* Check if a new block is required. */
-	if (!krcp->bhead ||
-			krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) {
+	if (!krcp->bkvhead[idx] ||
+			krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
 		bnode = get_cached_bnode(krcp);
 		if (!bnode) {
-			WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE);
-
 			/*
 			 * To keep this path working on raw non-preemptible
 			 * sections, prevent the optional entry into the
@@ -3287,7 +3314,7 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
 			if (IS_ENABLED(CONFIG_PREEMPT_RT))
 				return false;
 
-			bnode = (struct kfree_rcu_bulk_data *)
+			bnode = (struct kvfree_rcu_bulk_data *)
 				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 		}
 
@@ -3297,30 +3324,30 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
 
 		/* Initialize the new block. */
 		bnode->nr_records = 0;
-		bnode->next = krcp->bhead;
+		bnode->next = krcp->bkvhead[idx];
 
 		/* Attach it to the head. */
-		krcp->bhead = bnode;
+		krcp->bkvhead[idx] = bnode;
 	}
 
 	/* Finally insert. */
-	krcp->bhead->records[krcp->bhead->nr_records++] =
-		(void *) head - (unsigned long) func;
+	krcp->bkvhead[idx]->records
+		[krcp->bkvhead[idx]->nr_records++] = ptr;
 
 	return true;
 }
 
 /*
- * Queue a request for lazy invocation of kfree_bulk()/kfree() after a grace
- * period. Please note there are two paths are maintained, one is the main one
- * that uses kfree_bulk() interface and second one is emergency one, that is
- * used only when the main path can not be maintained temporary, due to memory
- * pressure.
+ * Queue a request for lazy invocation of appropriate free routine after a
+ * grace period. Please note there are three paths are maintained, two are the
+ * main ones that use array of pointers interface and third one is emergency
+ * one, that is used only when the main path can not be maintained temporary,
+ * due to memory pressure.
  *
  * Each kfree_call_rcu() request is added to a batch. The batch will be drained
  * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
  * be free'd in workqueue context. This allows us to: batch requests together to
- * reduce the number of grace periods during heavy kfree_rcu() load.
+ * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
  */
 void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
@@ -3343,7 +3370,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 	 * Under high memory pressure GFP_NOWAIT can fail,
 	 * in that case the emergency path is maintained.
 	 */
-	if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func))) {
+	if (unlikely(!kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr))) {
 		head->func = func;
 		head->next = krcp->head;
 		krcp->head = head;
@@ -4324,7 +4351,7 @@ static void __init kfree_rcu_batch_init(void)
 
 	for_each_possible_cpu(cpu) {
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
-		struct kfree_rcu_bulk_data *bnode;
+		struct kvfree_rcu_bulk_data *bnode;
 
 		for (i = 0; i < KFREE_N_BATCHES; i++) {
 			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
@@ -4332,7 +4359,7 @@ static void __init kfree_rcu_batch_init(void)
 		}
 
 		for (i = 0; i < rcu_min_cached_objs; i++) {
-			bnode = (struct kfree_rcu_bulk_data *)
+			bnode = (struct kvfree_rcu_bulk_data *)
 				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 
 			if (bnode)
-- 
cgit v1.2.3


From 64d1d06ccb1b7de245ccf781b91517f328bebd9f Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Mon, 25 May 2020 23:47:54 +0200
Subject: rcu/tiny: support vmalloc in tiny-RCU

Replace kfree() with kvfree() in rcu_reclaim_tiny().
This makes it possible to release either SLAB or vmalloc
objects after a GP.

Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tiny.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index dd572ce7c747..4b99f7b88bee 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -23,6 +23,7 @@
 #include <linux/cpu.h>
 #include <linux/prefetch.h>
 #include <linux/slab.h>
+#include <linux/mm.h>
 
 #include "rcu.h"
 
@@ -86,7 +87,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head)
 	rcu_lock_acquire(&rcu_callback_map);
 	if (__is_kfree_rcu_offset(offset)) {
 		trace_rcu_invoke_kfree_callback("", head, offset);
-		kfree((void *)head - offset);
+		kvfree((void *)head - offset);
 		rcu_lock_release(&rcu_callback_map);
 		return true;
 	}
-- 
cgit v1.2.3


From c408b215f58f7156bb6bafb64c0263ee907033df Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Mon, 25 May 2020 23:47:55 +0200
Subject: rcu: Rename *_kfree_callback/*_kfree_rcu_offset/kfree_call_*

The following changes are introduced:

1. Rename rcu_invoke_kfree_callback() to rcu_invoke_kvfree_callback(),
as well as the associated trace events, so the rcu_kfree_callback(),
becomes rcu_kvfree_callback(). The reason is to be aligned with kvfree()
notation.

2. Rename __is_kfree_rcu_offset to __is_kvfree_rcu_offset. All RCU
paths use kvfree() now instead of kfree(), thus rename it.

3. Rename kfree_call_rcu() to the kvfree_call_rcu(). The reason is,
it is capable of freeing vmalloc() memory now. Do the same with
__kfree_rcu() macro, it becomes __kvfree_rcu(), the goal is the
same.

Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Co-developed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate.h   | 14 +++++++-------
 include/linux/rcutiny.h    |  2 +-
 include/linux/rcutree.h    |  2 +-
 include/trace/events/rcu.h |  8 ++++----
 kernel/rcu/tiny.c          |  4 ++--
 kernel/rcu/tree.c          | 16 ++++++++--------
 6 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 659cbfa7581a..b344fc800a9b 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -828,17 +828,17 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
 
 /*
  * Does the specified offset indicate that the corresponding rcu_head
- * structure can be handled by kfree_rcu()?
+ * structure can be handled by kvfree_rcu()?
  */
-#define __is_kfree_rcu_offset(offset) ((offset) < 4096)
+#define __is_kvfree_rcu_offset(offset) ((offset) < 4096)
 
 /*
  * Helper macro for kfree_rcu() to prevent argument-expansion eyestrain.
  */
-#define __kfree_rcu(head, offset) \
+#define __kvfree_rcu(head, offset) \
 	do { \
-		BUILD_BUG_ON(!__is_kfree_rcu_offset(offset)); \
-		kfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \
+		BUILD_BUG_ON(!__is_kvfree_rcu_offset(offset)); \
+		kvfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \
 	} while (0)
 
 /**
@@ -857,7 +857,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
  * Because the functions are not allowed in the low-order 4096 bytes of
  * kernel virtual memory, offsets up to 4095 bytes can be accommodated.
  * If the offset is larger than 4095 bytes, a compile-time error will
- * be generated in __kfree_rcu().  If this error is triggered, you can
+ * be generated in __kvfree_rcu(). If this error is triggered, you can
  * either fall back to use of call_rcu() or rearrange the structure to
  * position the rcu_head structure into the first 4096 bytes.
  *
@@ -872,7 +872,7 @@ do {									\
 	typeof (ptr) ___p = (ptr);					\
 									\
 	if (___p)							\
-		__kfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \
+		__kvfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \
 } while (0)
 
 /*
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 8512caeb7682..fb2eb39c484f 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -34,7 +34,7 @@ static inline void synchronize_rcu_expedited(void)
 	synchronize_rcu();
 }
 
-static inline void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
 	call_rcu(head, func);
 }
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index d5cc9d675987..d2f4064ebd1d 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -33,7 +33,7 @@ static inline void rcu_virt_note_context_switch(int cpu)
 }
 
 void synchronize_rcu_expedited(void);
-void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func);
+void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func);
 
 void rcu_barrier(void);
 bool rcu_eqs_special_set(int cpu);
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index f9a7811148e2..0ee93d0b1daa 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -506,13 +506,13 @@ TRACE_EVENT_RCU(rcu_callback,
 
 /*
  * Tracepoint for the registration of a single RCU callback of the special
- * kfree() form.  The first argument is the RCU type, the second argument
+ * kvfree() form.  The first argument is the RCU type, the second argument
  * is a pointer to the RCU callback, the third argument is the offset
  * of the callback within the enclosing RCU-protected data structure,
  * the fourth argument is the number of lazy callbacks queued, and the
  * fifth argument is the total number of callbacks queued.
  */
-TRACE_EVENT_RCU(rcu_kfree_callback,
+TRACE_EVENT_RCU(rcu_kvfree_callback,
 
 	TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset,
 		 long qlen),
@@ -596,12 +596,12 @@ TRACE_EVENT_RCU(rcu_invoke_callback,
 
 /*
  * Tracepoint for the invocation of a single RCU callback of the special
- * kfree() form.  The first argument is the RCU flavor, the second
+ * kvfree() form.  The first argument is the RCU flavor, the second
  * argument is a pointer to the RCU callback, and the third argument
  * is the offset of the callback within the enclosing RCU-protected
  * data structure.
  */
-TRACE_EVENT_RCU(rcu_invoke_kfree_callback,
+TRACE_EVENT_RCU(rcu_invoke_kvfree_callback,
 
 	TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset),
 
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 4b99f7b88bee..aa897c3f2e92 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -85,8 +85,8 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head)
 	unsigned long offset = (unsigned long)head->func;
 
 	rcu_lock_acquire(&rcu_callback_map);
-	if (__is_kfree_rcu_offset(offset)) {
-		trace_rcu_invoke_kfree_callback("", head, offset);
+	if (__is_kvfree_rcu_offset(offset)) {
+		trace_rcu_invoke_kvfree_callback("", head, offset);
 		kvfree((void *)head - offset);
 		rcu_lock_release(&rcu_callback_map);
 		return true;
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 67c4b984c499..f22c47e72287 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2905,8 +2905,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
 		return; // Enqueued onto ->nocb_bypass, so just leave.
 	// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
 	rcu_segcblist_enqueue(&rdp->cblist, head);
-	if (__is_kfree_rcu_offset((unsigned long)func))
-		trace_rcu_kfree_callback(rcu_state.name, head,
+	if (__is_kvfree_rcu_offset((unsigned long)func))
+		trace_rcu_kvfree_callback(rcu_state.name, head,
 					 (unsigned long)func,
 					 rcu_segcblist_n_cbs(&rdp->cblist));
 	else
@@ -3146,7 +3146,7 @@ static void kfree_rcu_work(struct work_struct *work)
 					bkvhead[i]->records);
 			} else { // vmalloc() / vfree().
 				for (j = 0; j < bkvhead[i]->nr_records; j++) {
-					trace_rcu_invoke_kfree_callback(
+					trace_rcu_invoke_kvfree_callback(
 						rcu_state.name,
 						bkvhead[i]->records[j], 0);
 
@@ -3179,9 +3179,9 @@ static void kfree_rcu_work(struct work_struct *work)
 		next = head->next;
 		debug_rcu_head_unqueue((struct rcu_head *)ptr);
 		rcu_lock_acquire(&rcu_callback_map);
-		trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
+		trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);
 
-		if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset)))
+		if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
 			kvfree(ptr);
 
 		rcu_lock_release(&rcu_callback_map);
@@ -3344,12 +3344,12 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
  * one, that is used only when the main path can not be maintained temporary,
  * due to memory pressure.
  *
- * Each kfree_call_rcu() request is added to a batch. The batch will be drained
+ * Each kvfree_call_rcu() request is added to a batch. The batch will be drained
  * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
  * be free'd in workqueue context. This allows us to: batch requests together to
  * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
  */
-void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
 	unsigned long flags;
 	struct kfree_rcu_cpu *krcp;
@@ -3388,7 +3388,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 unlock_return:
 	krc_this_cpu_unlock(krcp, flags);
 }
-EXPORT_SYMBOL_GPL(kfree_call_rcu);
+EXPORT_SYMBOL_GPL(kvfree_call_rcu);
 
 static unsigned long
 kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
-- 
cgit v1.2.3


From 3042f83f19bec2e0cd356f72b39e4d816e8cd5ff Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Mon, 25 May 2020 23:47:58 +0200
Subject: rcu: Support reclaim for head-less object

Update the kvfree_call_rcu() function with head-less support.
This allows RCU to reclaim objects without an embedded rcu_head.

tree-RCU:
We introduce two chains of arrays to store SLAB-backed and vmalloc
pointers, each.  Storage in either of these arrays does not require
embedding an rcu_head within the object.

Maintaining the arrays may become impossible due to high memory
pressure. For such cases there is an emergency path. Objects with
rcu_head inside are just queued on a backup rcu_head list. Later on
that list is drained. As for the head-less variant, as the current
context can sleep, the following emergency measures are applied:
   a) Synchronously wait until a grace period has elapsed.
   b) Call kvfree().

tiny-RCU:
For double argument calls, there are no new changes in behavior. For
single argument call, kvfree() is directly inlined on the current
stack after a synchronize_rcu() call. Note that for tiny-RCU, any
call to synchronize_rcu() is actually a quiescent state, therefore
it does nothing.

Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Co-developed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcutiny.h | 18 +++++++++++++++++-
 kernel/rcu/tree.c       | 45 +++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 60 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index fb2eb39c484f..5cc9637cac16 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -34,9 +34,25 @@ static inline void synchronize_rcu_expedited(void)
 	synchronize_rcu();
 }
 
+/*
+ * Add one more declaration of kvfree() here. It is
+ * not so straight forward to just include <linux/mm.h>
+ * where it is defined due to getting many compile
+ * errors caused by that include.
+ */
+extern void kvfree(const void *addr);
+
 static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
-	call_rcu(head, func);
+	if (head) {
+		call_rcu(head, func);
+		return;
+	}
+
+	// kvfree_rcu(one_arg) call.
+	might_sleep();
+	synchronize_rcu();
+	kvfree((void *) func);
 }
 
 void rcu_qs(void);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f22c47e72287..01f29e4500ba 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3314,6 +3314,13 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
 			if (IS_ENABLED(CONFIG_PREEMPT_RT))
 				return false;
 
+			/*
+			 * NOTE: For one argument of kvfree_rcu() we can
+			 * drop the lock and get the page in sleepable
+			 * context. That would allow to maintain an array
+			 * for the CONFIG_PREEMPT_RT as well if no cached
+			 * pages are available.
+			 */
 			bnode = (struct kvfree_rcu_bulk_data *)
 				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 		}
@@ -3353,16 +3360,33 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
 	unsigned long flags;
 	struct kfree_rcu_cpu *krcp;
+	bool success;
 	void *ptr;
 
+	if (head) {
+		ptr = (void *) head - (unsigned long) func;
+	} else {
+		/*
+		 * Please note there is a limitation for the head-less
+		 * variant, that is why there is a clear rule for such
+		 * objects: it can be used from might_sleep() context
+		 * only. For other places please embed an rcu_head to
+		 * your data.
+		 */
+		might_sleep();
+		ptr = (unsigned long *) func;
+	}
+
 	krcp = krc_this_cpu_lock(&flags);
-	ptr = (void *)head - (unsigned long)func;
 
 	// Queue the object but don't yet schedule the batch.
 	if (debug_rcu_head_queue(ptr)) {
 		// Probable double kfree_rcu(), just leak.
 		WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
 			  __func__, head);
+
+		// Mark as success and leave.
+		success = true;
 		goto unlock_return;
 	}
 
@@ -3370,10 +3394,16 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 	 * Under high memory pressure GFP_NOWAIT can fail,
 	 * in that case the emergency path is maintained.
 	 */
-	if (unlikely(!kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr))) {
+	success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
+	if (!success) {
+		if (head == NULL)
+			// Inline if kvfree_rcu(one_arg) call.
+			goto unlock_return;
+
 		head->func = func;
 		head->next = krcp->head;
 		krcp->head = head;
+		success = true;
 	}
 
 	WRITE_ONCE(krcp->count, krcp->count + 1);
@@ -3387,6 +3417,17 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 
 unlock_return:
 	krc_this_cpu_unlock(krcp, flags);
+
+	/*
+	 * Inline kvfree() after synchronize_rcu(). We can do
+	 * it from might_sleep() context only, so the current
+	 * CPU can pass the QS state.
+	 */
+	if (!success) {
+		debug_rcu_head_unqueue((struct rcu_head *) ptr);
+		synchronize_rcu();
+		kvfree(ptr);
+	}
 }
 EXPORT_SYMBOL_GPL(kvfree_call_rcu);
 
-- 
cgit v1.2.3


From ea6eed9f7d7382c7230202d4c3bf74185f193394 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 7 May 2020 16:47:13 -0700
Subject: rcu-tasks: Convert sleeps to idle priority

This commit converts the long-standing schedule_timeout_interruptible()
and schedule_timeout_uninterruptible() calls used by the various Tasks
RCU's grace-period kthreads to schedule_timeout_idle().  This conversion
avoids polluting the load-average with Tasks-RCU-related sleeping.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tasks.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index ce23f6cc5043..91fee8122acd 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -205,7 +205,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
 			if (!rtp->cbs_head) {
 				WARN_ON(signal_pending(current));
 				set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS);
-				schedule_timeout_interruptible(HZ/10);
+				schedule_timeout_idle(HZ/10);
 			}
 			continue;
 		}
@@ -227,7 +227,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
 			cond_resched();
 		}
 		/* Paranoid sleep to keep this from entering a tight loop */
-		schedule_timeout_uninterruptible(HZ/10);
+		schedule_timeout_idle(HZ/10);
 
 		set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
 	}
@@ -336,7 +336,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
 
 		/* Slowly back off waiting for holdouts */
 		set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS);
-		schedule_timeout_interruptible(HZ/fract);
+		schedule_timeout_idle(HZ/fract);
 
 		if (fract > 1)
 			fract--;
-- 
cgit v1.2.3


From 04a3c5aa7a8cb2ce97f9beb627ba742bc8b0fe03 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 28 May 2020 19:27:06 -0700
Subject: rcu-tasks: Make rcu_tasks_postscan() be static

The rcu_tasks_postscan() function is not used outside of RCU's tasks.h
file, so this commit makes it be static.

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tasks.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 91fee8122acd..da200e53d60d 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -402,7 +402,7 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
 }
 
 /* Processing between scanning taskslist and draining the holdout list. */
-void rcu_tasks_postscan(struct list_head *hop)
+static void rcu_tasks_postscan(struct list_head *hop)
 {
 	/*
 	 * Wait for tasks that are in the process of exiting.  This
-- 
cgit v1.2.3


From 5b3cc99bedf5885055fbaf35fe63d205f06b5be5 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 28 May 2020 19:33:47 -0700
Subject: rcu-tasks: Add #include of rcupdate_trace.h to update.c

Although this is in some strict sense unnecessary, it is good to allow
the compiler to compare the function declaration with its definition.
This commit therefore adds a #include of linux/rcupdate_trace.h to
kernel/rcu/update.c.

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/update.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 84843adfd939..c0fea809d738 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -42,6 +42,7 @@
 #include <linux/kprobes.h>
 #include <linux/slab.h>
 #include <linux/irq_work.h>
+#include <linux/rcupdate_trace.h>
 
 #define CREATE_TRACE_POINTS
 
-- 
cgit v1.2.3


From 8344496e8b49c4122c1808d6cd3f8dc71bccb595 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 28 May 2020 20:03:48 -0700
Subject: rcu-tasks: Conditionally compile show_rcu_tasks_gp_kthreads()

The show_rcu_tasks_gp_kthreads() function is not invoked by Tiny RCU,
but is nevertheless defined in Tiny RCU builds that enable Tasks Trace
RCU.  This commit therefore conditionally compiles this function so
that it is defined only in builds that actually use it.

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tasks.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index da200e53d60d..d5c003c1972c 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -103,6 +103,7 @@ module_param(rcu_task_stall_timeout, int, 0644);
 #define RTGS_WAIT_READERS	 9
 #define RTGS_INVOKE_CBS		10
 #define RTGS_WAIT_CBS		11
+#ifndef CONFIG_TINY_RCU
 static const char * const rcu_tasks_gp_state_names[] = {
 	"RTGS_INIT",
 	"RTGS_WAIT_WAIT_CBS",
@@ -117,6 +118,7 @@ static const char * const rcu_tasks_gp_state_names[] = {
 	"RTGS_INVOKE_CBS",
 	"RTGS_WAIT_CBS",
 };
+#endif /* #ifndef CONFIG_TINY_RCU */
 
 ////////////////////////////////////////////////////////////////////////
 //
@@ -129,6 +131,7 @@ static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate)
 	rtp->gp_jiffies = jiffies;
 }
 
+#ifndef CONFIG_TINY_RCU
 /* Return state name. */
 static const char *tasks_gp_state_getname(struct rcu_tasks *rtp)
 {
@@ -139,6 +142,7 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp)
 		return "???";
 	return rcu_tasks_gp_state_names[j];
 }
+#endif /* #ifndef CONFIG_TINY_RCU */
 
 // Enqueue a callback for the specified flavor of Tasks RCU.
 static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
@@ -268,6 +272,7 @@ static void __init rcu_tasks_bootup_oddness(void)
 
 #endif /* #ifndef CONFIG_TINY_RCU */
 
+#ifndef CONFIG_TINY_RCU
 /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */
 static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
 {
@@ -281,6 +286,7 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
 		".C"[!!data_race(rtp->cbs_head)],
 		s);
 }
+#endif /* #ifndef CONFIG_TINY_RCU */
 
 static void exit_tasks_rcu_finish_trace(struct task_struct *t);
 
@@ -557,10 +563,12 @@ static int __init rcu_spawn_tasks_kthread(void)
 }
 core_initcall(rcu_spawn_tasks_kthread);
 
+#ifndef CONFIG_TINY_RCU
 static void show_rcu_tasks_classic_gp_kthread(void)
 {
 	show_rcu_tasks_generic_gp_kthread(&rcu_tasks, "");
 }
+#endif /* #ifndef CONFIG_TINY_RCU */
 
 /* Do the srcu_read_lock() for the above synchronize_srcu().  */
 void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu)
@@ -682,10 +690,12 @@ static int __init rcu_spawn_tasks_rude_kthread(void)
 }
 core_initcall(rcu_spawn_tasks_rude_kthread);
 
+#ifndef CONFIG_TINY_RCU
 static void show_rcu_tasks_rude_gp_kthread(void)
 {
 	show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, "");
 }
+#endif /* #ifndef CONFIG_TINY_RCU */
 
 #else /* #ifdef CONFIG_TASKS_RUDE_RCU */
 static void show_rcu_tasks_rude_gp_kthread(void) {}
@@ -1164,6 +1174,7 @@ static int __init rcu_spawn_tasks_trace_kthread(void)
 }
 core_initcall(rcu_spawn_tasks_trace_kthread);
 
+#ifndef CONFIG_TINY_RCU
 static void show_rcu_tasks_trace_gp_kthread(void)
 {
 	char buf[64];
@@ -1174,18 +1185,21 @@ static void show_rcu_tasks_trace_gp_kthread(void)
 		data_race(n_heavy_reader_attempts));
 	show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf);
 }
+#endif /* #ifndef CONFIG_TINY_RCU */
 
 #else /* #ifdef CONFIG_TASKS_TRACE_RCU */
 static void exit_tasks_rcu_finish_trace(struct task_struct *t) { }
 static inline void show_rcu_tasks_trace_gp_kthread(void) {}
 #endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */
 
+#ifndef CONFIG_TINY_RCU
 void show_rcu_tasks_gp_kthreads(void)
 {
 	show_rcu_tasks_classic_gp_kthread();
 	show_rcu_tasks_rude_gp_kthread();
 	show_rcu_tasks_trace_gp_kthread();
 }
+#endif /* #ifndef CONFIG_TINY_RCU */
 
 #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
 static inline void rcu_tasks_bootup_oddness(void) {}
-- 
cgit v1.2.3


From 30d8aa5128f12c9d781b67c9694c1abfa4f6ce6a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 9 Jun 2020 09:24:51 -0700
Subject: rcu-tasks: Fix code-style issues

This commit declares trc_n_readers_need_end and trc_wait static and
replaced a "&" with "&&".  The "&" happened to work because the values
are bool, but accidents waiting to happen and all that...

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tasks.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index d5c003c1972c..828f222895f1 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -737,8 +737,8 @@ EXPORT_SYMBOL_GPL(rcu_trace_lock_map);
 
 #ifdef CONFIG_TASKS_TRACE_RCU
 
-atomic_t trc_n_readers_need_end;	// Number of waited-for readers.
-DECLARE_WAIT_QUEUE_HEAD(trc_wait);	// List of holdout tasks.
+static atomic_t trc_n_readers_need_end;		// Number of waited-for readers.
+static DECLARE_WAIT_QUEUE_HEAD(trc_wait);	// List of holdout tasks.
 
 // Record outstanding IPIs to each CPU.  No point in sending two...
 static DEFINE_PER_CPU(bool, trc_ipi_to_cpu);
@@ -845,7 +845,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg)
 	bool ofl = cpu_is_offline(cpu);
 
 	if (task_curr(t)) {
-		WARN_ON_ONCE(ofl & !is_idle_task(t));
+		WARN_ON_ONCE(ofl && !is_idle_task(t));
 
 		// If no chance of heavyweight readers, do it the hard way.
 		if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
-- 
cgit v1.2.3


From 7e866460cc18797b3a59360f5f8c444598a21729 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Mon, 25 May 2020 00:36:47 -0400
Subject: rcuperf: Remove useless while loops around wait_event

wait_event() already retries if the condition for the wake up is not
satisifed after wake up. Remove them from the rcuperf test.

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcuperf.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 16dd1e6b7c09..246da8fe199e 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -576,11 +576,8 @@ static int compute_real(int n)
 static int
 rcu_perf_shutdown(void *arg)
 {
-	do {
-		wait_event(shutdown_wq,
-			   atomic_read(&n_rcu_perf_writer_finished) >=
-			   nrealwriters);
-	} while (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters);
+	wait_event(shutdown_wq,
+		   atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters);
 	smp_mb(); /* Wake before output. */
 	rcu_perf_cleanup();
 	kernel_power_off();
@@ -693,11 +690,8 @@ kfree_perf_cleanup(void)
 static int
 kfree_perf_shutdown(void *arg)
 {
-	do {
-		wait_event(shutdown_wq,
-			   atomic_read(&n_kfree_perf_thread_ended) >=
-			   kfree_nrealthreads);
-	} while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads);
+	wait_event(shutdown_wq,
+		   atomic_read(&n_kfree_perf_thread_ended) >= kfree_nrealthreads);
 
 	smp_mb(); /* Wake before output. */
 
-- 
cgit v1.2.3


From 653ed64b01dc5989f8f579d0038e987476c2c023 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Mon, 25 May 2020 00:36:48 -0400
Subject: refperf: Add a test to measure performance of read-side
 synchronization

Add a test for comparing the performance of RCU with various read-side
synchronization mechanisms. The test has proved useful for collecting
data and performing these comparisons.

Currently RCU, SRCU, reader-writer lock, reader-writer semaphore and
reference counting can be measured using refperf.perf_type parameter.
Each invocation of the test runs measures performance of a specific
mechanism.

The maximum number of CPUs to concurrently run readers on is chosen by
the test itself and is 75% of the total number of CPUs. So if you had 24
CPUs, the test runs with a maximum of 18 parallel readers.

A number of experiments are conducted, and in each experiment, the
number of readers is increased by 1, upto the 75% of CPUs mark. During
each experiment, all readers execute an empty loop with refperf.loops
iterations and time the total loop duration. This is then averaged.

Example output:
Parameters "refperf.perf_type=srcu refperf.loops=2000000" looks like:

[    3.347133] srcu-ref-perf:
[    3.347133] Threads  Time(ns)
[    3.347133] 1        36
[    3.347133] 2        34
[    3.347133] 3        34
[    3.347133] 4        34
[    3.347133] 5        33
[    3.347133] 6        33
[    3.347133] 7        33
[    3.347133] 8        33
[    3.347133] 9        33
[    3.347133] 10       33
[    3.347133] 11       33
[    3.347133] 12       33
[    3.347133] 13       33
[    3.347133] 14       33
[    3.347133] 15       32
[    3.347133] 16       33
[    3.347133] 17       33
[    3.347133] 18       34

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/Kconfig.debug |  19 ++
 kernel/rcu/Makefile      |   1 +
 kernel/rcu/refperf.c     | 558 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 578 insertions(+)
 create mode 100644 kernel/rcu/refperf.c

(limited to 'kernel')

diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 452feae8de20..858765b7f644 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -61,6 +61,25 @@ config RCU_TORTURE_TEST
 	  Say M if you want the RCU torture tests to build as a module.
 	  Say N if you are unsure.
 
+config RCU_REF_PERF_TEST
+	tristate "Performance tests for read-side synchronization (RCU and others)"
+	depends on DEBUG_KERNEL
+	select TORTURE_TEST
+	select SRCU
+	select TASKS_RCU
+	select TASKS_RUDE_RCU
+	select TASKS_TRACE_RCU
+	default n
+	help
+	  This option provides a kernel module that runs performance tests
+	  useful comparing RCU with various read-side synchronization mechanisms.
+	  The kernel module may be built after the fact on the running kernel to be
+	  tested, if desired.
+
+	  Say Y here if you want these performance tests built into the kernel.
+	  Say M if you want to build it as a module instead.
+	  Say N if you are unsure.
+
 config RCU_CPU_STALL_TIMEOUT
 	int "RCU CPU stall timeout in seconds"
 	depends on RCU_STALL_COMMON
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index f91f2c2cf138..ba7d82609cbe 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_TREE_SRCU) += srcutree.o
 obj-$(CONFIG_TINY_SRCU) += srcutiny.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
+obj-$(CONFIG_RCU_REF_PERF_TEST) += refperf.o
 obj-$(CONFIG_TREE_RCU) += tree.o
 obj-$(CONFIG_TINY_RCU) += tiny.o
 obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o
diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
new file mode 100644
index 000000000000..61161530acc8
--- /dev/null
+++ b/kernel/rcu/refperf.c
@@ -0,0 +1,558 @@
+// SPDX-License-Identifier: GPL-2.0+
+//
+// Performance test comparing RCU vs other mechanisms
+// for acquiring references on objects.
+//
+// Copyright (C) Google, 2020.
+//
+// Author: Joel Fernandes <joel@joelfernandes.org>
+
+#define pr_fmt(fmt) fmt
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kthread.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/notifier.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/stat.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/torture.h>
+#include <linux/types.h>
+
+#include "rcu.h"
+
+#define PERF_FLAG "-ref-perf: "
+
+#define PERFOUT(s, x...) \
+	pr_alert("%s" PERF_FLAG s, perf_type, ## x)
+
+#define VERBOSE_PERFOUT(s, x...) \
+	do { if (verbose) pr_alert("%s" PERF_FLAG s, perf_type, ## x); } while (0)
+
+#define VERBOSE_PERFOUT_ERRSTRING(s, x...) \
+	do { if (verbose) pr_alert("%s" PERF_FLAG "!!! " s, perf_type, ## x); } while (0)
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>");
+
+static char *perf_type = "rcu";
+module_param(perf_type, charp, 0444);
+MODULE_PARM_DESC(perf_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock.");
+
+torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
+
+// Number of loops per experiment, all readers execute an operation concurrently
+torture_param(long, loops, 10000000, "Number of loops per experiment.");
+
+#ifdef MODULE
+# define REFPERF_SHUTDOWN 0
+#else
+# define REFPERF_SHUTDOWN 1
+#endif
+
+torture_param(bool, shutdown, REFPERF_SHUTDOWN,
+	      "Shutdown at end of performance tests.");
+
+struct reader_task {
+	struct task_struct *task;
+	atomic_t start;
+	wait_queue_head_t wq;
+	u64 last_duration_ns;
+
+	// The average latency When 1..<this reader> are concurrently
+	// running an experiment. For example, if this reader_task is
+	// of index 5 in the reader_tasks array, then result is for
+	// 6 cores.
+	u64 result_avg;
+};
+
+static struct task_struct *shutdown_task;
+static wait_queue_head_t shutdown_wq;
+
+static struct task_struct *main_task;
+static wait_queue_head_t main_wq;
+static int shutdown_start;
+
+static struct reader_task *reader_tasks;
+static int nreaders;
+
+// Number of readers that are part of the current experiment.
+static atomic_t nreaders_exp;
+
+// Use to wait for all threads to start.
+static atomic_t n_init;
+
+// Track which experiment is currently running.
+static int exp_idx;
+
+// Operations vector for selecting different types of tests.
+struct ref_perf_ops {
+	void (*init)(void);
+	void (*cleanup)(void);
+	int (*readlock)(void);
+	void (*readunlock)(int idx);
+	const char *name;
+};
+
+static struct ref_perf_ops *cur_ops;
+
+// Definitions for RCU ref perf testing.
+static int ref_rcu_read_lock(void) __acquires(RCU)
+{
+	rcu_read_lock();
+	return 0;
+}
+
+static void ref_rcu_read_unlock(int idx) __releases(RCU)
+{
+	rcu_read_unlock();
+}
+
+static void rcu_sync_perf_init(void)
+{
+}
+
+static struct ref_perf_ops rcu_ops = {
+	.init		= rcu_sync_perf_init,
+	.readlock	= ref_rcu_read_lock,
+	.readunlock	= ref_rcu_read_unlock,
+	.name		= "rcu"
+};
+
+
+// Definitions for SRCU ref perf testing.
+DEFINE_STATIC_SRCU(srcu_refctl_perf);
+static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf;
+
+static int srcu_ref_perf_read_lock(void) __acquires(srcu_ctlp)
+{
+	return srcu_read_lock(srcu_ctlp);
+}
+
+static void srcu_ref_perf_read_unlock(int idx) __releases(srcu_ctlp)
+{
+	srcu_read_unlock(srcu_ctlp, idx);
+}
+
+static struct ref_perf_ops srcu_ops = {
+	.init		= rcu_sync_perf_init,
+	.readlock	= srcu_ref_perf_read_lock,
+	.readunlock	= srcu_ref_perf_read_unlock,
+	.name		= "srcu"
+};
+
+// Definitions for reference count
+static atomic_t refcnt;
+
+static int srcu_ref_perf_refcnt_lock(void)
+{
+	atomic_inc(&refcnt);
+	return 0;
+}
+
+static void srcu_ref_perf_refcnt_unlock(int idx) __releases(srcu_ctlp)
+{
+	atomic_dec(&refcnt);
+	srcu_read_unlock(srcu_ctlp, idx);
+}
+
+static struct ref_perf_ops refcnt_ops = {
+	.init		= rcu_sync_perf_init,
+	.readlock	= srcu_ref_perf_refcnt_lock,
+	.readunlock	= srcu_ref_perf_refcnt_unlock,
+	.name		= "refcnt"
+};
+
+// Definitions for rwlock
+static rwlock_t test_rwlock;
+
+static void ref_perf_rwlock_init(void)
+{
+	rwlock_init(&test_rwlock);
+}
+
+static int ref_perf_rwlock_lock(void)
+{
+	read_lock(&test_rwlock);
+	return 0;
+}
+
+static void ref_perf_rwlock_unlock(int idx)
+{
+	read_unlock(&test_rwlock);
+}
+
+static struct ref_perf_ops rwlock_ops = {
+	.init		= ref_perf_rwlock_init,
+	.readlock	= ref_perf_rwlock_lock,
+	.readunlock	= ref_perf_rwlock_unlock,
+	.name		= "rwlock"
+};
+
+// Definitions for rwsem
+static struct rw_semaphore test_rwsem;
+
+static void ref_perf_rwsem_init(void)
+{
+	init_rwsem(&test_rwsem);
+}
+
+static int ref_perf_rwsem_lock(void)
+{
+	down_read(&test_rwsem);
+	return 0;
+}
+
+static void ref_perf_rwsem_unlock(int idx)
+{
+	up_read(&test_rwsem);
+}
+
+static struct ref_perf_ops rwsem_ops = {
+	.init		= ref_perf_rwsem_init,
+	.readlock	= ref_perf_rwsem_lock,
+	.readunlock	= ref_perf_rwsem_unlock,
+	.name		= "rwsem"
+};
+
+// Reader kthread.  Repeatedly does empty RCU read-side
+// critical section, minimizing update-side interference.
+static int
+ref_perf_reader(void *arg)
+{
+	unsigned long flags;
+	long me = (long)arg;
+	struct reader_task *rt = &(reader_tasks[me]);
+	unsigned long spincnt;
+	int idx;
+	u64 start;
+	s64 duration;
+
+	VERBOSE_PERFOUT("ref_perf_reader %ld: task started", me);
+	set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+	set_user_nice(current, MAX_NICE);
+	atomic_inc(&n_init);
+repeat:
+	VERBOSE_PERFOUT("ref_perf_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id());
+
+	// Wait for signal that this reader can start.
+	wait_event(rt->wq, (atomic_read(&nreaders_exp) && atomic_read(&rt->start)) ||
+			   torture_must_stop());
+
+	if (torture_must_stop())
+		goto end;
+
+	// Make sure that the CPU is affinitized appropriately during testing.
+	WARN_ON_ONCE(smp_processor_id() != me);
+
+	atomic_dec(&rt->start);
+
+	// To prevent noise, keep interrupts disabled. This also has the
+	// effect of preventing entries into slow path for rcu_read_unlock().
+	local_irq_save(flags);
+	start = ktime_get_mono_fast_ns();
+
+	VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx);
+
+	for (spincnt = 0; spincnt < loops; spincnt++) {
+		idx = cur_ops->readlock();
+		cur_ops->readunlock(idx);
+	}
+
+	duration = ktime_get_mono_fast_ns() - start;
+	local_irq_restore(flags);
+
+	rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration;
+
+	atomic_dec(&nreaders_exp);
+
+	VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d ended, (readers remaining=%d)",
+			me, exp_idx, atomic_read(&nreaders_exp));
+
+	if (!atomic_read(&nreaders_exp))
+		wake_up(&main_wq);
+
+	if (!torture_must_stop())
+		goto repeat;
+end:
+	torture_kthread_stopping("ref_perf_reader");
+	return 0;
+}
+
+void reset_readers(int n)
+{
+	int i;
+	struct reader_task *rt;
+
+	for (i = 0; i < n; i++) {
+		rt = &(reader_tasks[i]);
+
+		rt->last_duration_ns = 0;
+	}
+}
+
+// Print the results of each reader and return the sum of all their durations.
+u64 process_durations(int n)
+{
+	int i;
+	struct reader_task *rt;
+	char buf1[64];
+	char buf[512];
+	u64 sum = 0;
+
+	buf[0] = 0;
+	sprintf(buf, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)",
+		exp_idx);
+
+	for (i = 0; i <= n && !torture_must_stop(); i++) {
+		rt = &(reader_tasks[i]);
+		sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns);
+
+		if (i % 5 == 0)
+			strcat(buf, "\n");
+		strcat(buf, buf1);
+
+		sum += rt->last_duration_ns;
+	}
+	strcat(buf, "\n");
+
+	PERFOUT("%s\n", buf);
+
+	return sum;
+}
+
+// The main_func is the main orchestrator, it performs a bunch of
+// experiments.  For every experiment, it orders all the readers
+// involved to start and waits for them to finish the experiment. It
+// then reads their timestamps and starts the next experiment. Each
+// experiment progresses from 1 concurrent reader to N of them at which
+// point all the timestamps are printed.
+static int main_func(void *arg)
+{
+	int exp, r;
+	char buf1[64];
+	char buf[512];
+
+	set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids));
+	set_user_nice(current, MAX_NICE);
+
+	VERBOSE_PERFOUT("main_func task started");
+	atomic_inc(&n_init);
+
+	// Wait for all threads to start.
+	wait_event(main_wq, atomic_read(&n_init) == (nreaders + 1));
+
+	// Start exp readers up per experiment
+	for (exp = 0; exp < nreaders && !torture_must_stop(); exp++) {
+		if (torture_must_stop())
+			goto end;
+
+		reset_readers(exp);
+		atomic_set(&nreaders_exp, exp + 1);
+
+		exp_idx = exp;
+
+		for (r = 0; r <= exp; r++) {
+			atomic_set(&reader_tasks[r].start, 1);
+			wake_up(&reader_tasks[r].wq);
+		}
+
+		VERBOSE_PERFOUT("main_func: experiment started, waiting for %d readers",
+				exp);
+
+		wait_event(main_wq,
+			   !atomic_read(&nreaders_exp) || torture_must_stop());
+
+		VERBOSE_PERFOUT("main_func: experiment ended");
+
+		if (torture_must_stop())
+			goto end;
+
+		reader_tasks[exp].result_avg = process_durations(exp) / ((exp + 1) * loops);
+	}
+
+	// Print the average of all experiments
+	PERFOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n");
+
+	buf[0] = 0;
+	strcat(buf, "\n");
+	strcat(buf, "Threads\tTime(ns)\n");
+
+	for (exp = 0; exp < nreaders; exp++) {
+		sprintf(buf1, "%d\t%llu\n", exp + 1, reader_tasks[exp].result_avg);
+		strcat(buf, buf1);
+	}
+
+	PERFOUT("%s", buf);
+
+	// This will shutdown everything including us.
+	if (shutdown) {
+		shutdown_start = 1;
+		wake_up(&shutdown_wq);
+	}
+
+	// Wait for torture to stop us
+	while (!torture_must_stop())
+		schedule_timeout_uninterruptible(1);
+
+end:
+	torture_kthread_stopping("main_func");
+	return 0;
+}
+
+static void
+ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag)
+{
+	pr_alert("%s" PERF_FLAG
+		 "--- %s:  verbose=%d shutdown=%d loops=%ld\n", perf_type, tag,
+		 verbose, shutdown, loops);
+}
+
+static void
+ref_perf_cleanup(void)
+{
+	int i;
+
+	if (torture_cleanup_begin())
+		return;
+
+	if (!cur_ops) {
+		torture_cleanup_end();
+		return;
+	}
+
+	if (reader_tasks) {
+		for (i = 0; i < nreaders; i++)
+			torture_stop_kthread("ref_perf_reader",
+					     reader_tasks[i].task);
+	}
+	kfree(reader_tasks);
+
+	torture_stop_kthread("main_task", main_task);
+	kfree(main_task);
+
+	// Do perf-type-specific cleanup operations.
+	if (cur_ops->cleanup != NULL)
+		cur_ops->cleanup();
+
+	torture_cleanup_end();
+}
+
+// Shutdown kthread.  Just waits to be awakened, then shuts down system.
+static int
+ref_perf_shutdown(void *arg)
+{
+	wait_event(shutdown_wq, shutdown_start);
+
+	smp_mb(); // Wake before output.
+	ref_perf_cleanup();
+	kernel_power_off();
+
+	return -EINVAL;
+}
+
+static int __init
+ref_perf_init(void)
+{
+	long i;
+	int firsterr = 0;
+	static struct ref_perf_ops *perf_ops[] = {
+		&rcu_ops, &srcu_ops, &refcnt_ops, &rwlock_ops, &rwsem_ops,
+	};
+
+	if (!torture_init_begin(perf_type, verbose))
+		return -EBUSY;
+
+	for (i = 0; i < ARRAY_SIZE(perf_ops); i++) {
+		cur_ops = perf_ops[i];
+		if (strcmp(perf_type, cur_ops->name) == 0)
+			break;
+	}
+	if (i == ARRAY_SIZE(perf_ops)) {
+		pr_alert("rcu-perf: invalid perf type: \"%s\"\n", perf_type);
+		pr_alert("rcu-perf types:");
+		for (i = 0; i < ARRAY_SIZE(perf_ops); i++)
+			pr_cont(" %s", perf_ops[i]->name);
+		pr_cont("\n");
+		WARN_ON(!IS_MODULE(CONFIG_RCU_REF_PERF_TEST));
+		firsterr = -EINVAL;
+		cur_ops = NULL;
+		goto unwind;
+	}
+	if (cur_ops->init)
+		cur_ops->init();
+
+	ref_perf_print_module_parms(cur_ops, "Start of test");
+
+	// Shutdown task
+	if (shutdown) {
+		init_waitqueue_head(&shutdown_wq);
+		firsterr = torture_create_kthread(ref_perf_shutdown, NULL,
+						  shutdown_task);
+		if (firsterr)
+			goto unwind;
+		schedule_timeout_uninterruptible(1);
+	}
+
+	// Reader tasks (~75% of online CPUs).
+	nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2);
+	reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]),
+			       GFP_KERNEL);
+	if (!reader_tasks) {
+		VERBOSE_PERFOUT_ERRSTRING("out of memory");
+		firsterr = -ENOMEM;
+		goto unwind;
+	}
+
+	VERBOSE_PERFOUT("Starting %d reader threads\n", nreaders);
+
+	for (i = 0; i < nreaders; i++) {
+		firsterr = torture_create_kthread(ref_perf_reader, (void *)i,
+						  reader_tasks[i].task);
+		if (firsterr)
+			goto unwind;
+
+		init_waitqueue_head(&(reader_tasks[i].wq));
+	}
+
+	// Main Task
+	init_waitqueue_head(&main_wq);
+	firsterr = torture_create_kthread(main_func, NULL, main_task);
+	if (firsterr)
+		goto unwind;
+	schedule_timeout_uninterruptible(1);
+
+
+	// Wait until all threads start
+	while (atomic_read(&n_init) < nreaders + 1)
+		schedule_timeout_uninterruptible(1);
+
+	wake_up(&main_wq);
+
+	torture_init_end();
+	return 0;
+
+unwind:
+	torture_init_end();
+	ref_perf_cleanup();
+	return firsterr;
+}
+
+module_init(ref_perf_init);
+module_exit(ref_perf_cleanup);
-- 
cgit v1.2.3


From 708cda31652c02e64adaeafafe7b996e4e14c3eb Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 25 May 2020 09:22:24 -0700
Subject: rcuperf: Add comments explaining the high reader overhead

This commit adds comments explaining why the readers have otherwise insane
levels of measurement overhead, namely that they are intended as a test
load for update-side performance measurements, not as a straight-up
read-side performance test.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcuperf.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 246da8fe199e..d906ca987936 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -69,6 +69,11 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
  *	value specified by nr_cpus for a read-only test.
  *
  * Various other use cases may of course be specified.
+ *
+ * Note that this test's readers are intended only as a test load for
+ * the writers.  The reader performance statistics will be overly
+ * pessimistic due to the per-critical-section interrupt disabling,
+ * test-end checks, and the pair of calls through pointers.
  */
 
 #ifdef MODULE
@@ -309,8 +314,10 @@ static void rcu_perf_wait_shutdown(void)
 }
 
 /*
- * RCU perf reader kthread.  Repeatedly does empty RCU read-side
- * critical section, minimizing update-side interference.
+ * RCU perf reader kthread.  Repeatedly does empty RCU read-side critical
+ * section, minimizing update-side interference.  However, the point of
+ * this test is not to evaluate reader performance, but instead to serve
+ * as a test load for update-side performance testing.
  */
 static int
 rcu_perf_reader(void *arg)
-- 
cgit v1.2.3


From 777a54c908ec69fa0eccab54068a49ecda38ffde Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 25 May 2020 14:16:44 -0700
Subject: refperf: Add holdoff parameter to allow CPUs to come online

This commit adds an rcuperf module parameter named "holdoff" that
defaults to 10 seconds if refperf is built in and to zero otherwise.
The assumption is that all the CPUs are online by the time that the
modprobe and insmod commands are going to do anything, and that normal
systems will have all the CPUs online within ten seconds.

Larger systems may take many tens of seconds or even minutes to get
to this point, hence this being a module parameter instead of being a
hard-coded constant.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 61161530acc8..4d686fdc3105 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -57,7 +57,10 @@ MODULE_PARM_DESC(perf_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock.");
 
 torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
 
-// Number of loops per experiment, all readers execute an operation concurrently
+// Wait until there are multiple CPUs before starting test.
+torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_PERF_TEST) ? 10 : 0,
+	      "Holdoff time before test start (s)");
+// Number of loops per experiment, all readers execute operations concurrently.
 torture_param(long, loops, 10000000, "Number of loops per experiment.");
 
 #ifdef MODULE
@@ -248,6 +251,8 @@ ref_perf_reader(void *arg)
 	set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
 	set_user_nice(current, MAX_NICE);
 	atomic_inc(&n_init);
+	if (holdoff)
+		schedule_timeout_interruptible(holdoff * HZ);
 repeat:
 	VERBOSE_PERFOUT("ref_perf_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id());
 
@@ -357,6 +362,8 @@ static int main_func(void *arg)
 
 	// Wait for all threads to start.
 	wait_event(main_wq, atomic_read(&n_init) == (nreaders + 1));
+	if (holdoff)
+		schedule_timeout_interruptible(holdoff * HZ);
 
 	// Start exp readers up per experiment
 	for (exp = 0; exp < nreaders && !torture_must_stop(); exp++) {
@@ -420,8 +427,8 @@ static void
 ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag)
 {
 	pr_alert("%s" PERF_FLAG
-		 "--- %s:  verbose=%d shutdown=%d loops=%ld\n", perf_type, tag,
-		 verbose, shutdown, loops);
+		 "--- %s:  verbose=%d shutdown=%d holdoff=%d loops=%ld\n", perf_type, tag,
+		 verbose, shutdown, holdoff, loops);
 }
 
 static void
-- 
cgit v1.2.3


From 75dd8efef56ed5959c398974c785026f84aa0d1a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 25 May 2020 14:59:06 -0700
Subject: refperf: Hoist function-pointer calls out of the loop

Current runs show PREEMPT=n rcu_read_lock()/rcu_read_unlock() pairs
consuming between 20 and 30 nanoseconds, when in fact the actual value is
zero, give or take the barrier() asm's effect on compiler optimizations.
The additional overhead is caused by function calls through pointers
(especially in these days of Spectre mitigations) and perhaps also
needless argument passing, a non-const loop limit, and an upcounting loop.

This commit therefore combines the ->readlock() and ->readunlock()
function pointers into a single ->readsection() function pointer that
takes the loop count as a const parameter and keeps any data passed
from the read-lock to the read-unlock internal to this new function.

These changes reduce the measured overhead of the aforementioned
PREEMPT=n rcu_read_lock()/rcu_read_unlock() pairs from between 20 and
30 nanoseconds to somewhere south of 500 picoseconds.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 92 ++++++++++++++++++++++------------------------------
 1 file changed, 38 insertions(+), 54 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 4d686fdc3105..57c7b7a40bd2 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -108,23 +108,20 @@ static int exp_idx;
 struct ref_perf_ops {
 	void (*init)(void);
 	void (*cleanup)(void);
-	int (*readlock)(void);
-	void (*readunlock)(int idx);
+	void (*readsection)(const int nloops);
 	const char *name;
 };
 
 static struct ref_perf_ops *cur_ops;
 
-// Definitions for RCU ref perf testing.
-static int ref_rcu_read_lock(void) __acquires(RCU)
+static void ref_rcu_read_section(const int nloops)
 {
-	rcu_read_lock();
-	return 0;
-}
+	int i;
 
-static void ref_rcu_read_unlock(int idx) __releases(RCU)
-{
-	rcu_read_unlock();
+	for (i = nloops; i >= 0; i--) {
+		rcu_read_lock();
+		rcu_read_unlock();
+	}
 }
 
 static void rcu_sync_perf_init(void)
@@ -133,8 +130,7 @@ static void rcu_sync_perf_init(void)
 
 static struct ref_perf_ops rcu_ops = {
 	.init		= rcu_sync_perf_init,
-	.readlock	= ref_rcu_read_lock,
-	.readunlock	= ref_rcu_read_unlock,
+	.readsection	= ref_rcu_read_section,
 	.name		= "rcu"
 };
 
@@ -143,42 +139,39 @@ static struct ref_perf_ops rcu_ops = {
 DEFINE_STATIC_SRCU(srcu_refctl_perf);
 static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf;
 
-static int srcu_ref_perf_read_lock(void) __acquires(srcu_ctlp)
+static void srcu_ref_perf_read_section(int nloops)
 {
-	return srcu_read_lock(srcu_ctlp);
-}
+	int i;
+	int idx;
 
-static void srcu_ref_perf_read_unlock(int idx) __releases(srcu_ctlp)
-{
-	srcu_read_unlock(srcu_ctlp, idx);
+	for (i = nloops; i >= 0; i--) {
+		idx = srcu_read_lock(srcu_ctlp);
+		srcu_read_unlock(srcu_ctlp, idx);
+	}
 }
 
 static struct ref_perf_ops srcu_ops = {
 	.init		= rcu_sync_perf_init,
-	.readlock	= srcu_ref_perf_read_lock,
-	.readunlock	= srcu_ref_perf_read_unlock,
+	.readsection	= srcu_ref_perf_read_section,
 	.name		= "srcu"
 };
 
 // Definitions for reference count
 static atomic_t refcnt;
 
-static int srcu_ref_perf_refcnt_lock(void)
+static void ref_perf_refcnt_section(const int nloops)
 {
-	atomic_inc(&refcnt);
-	return 0;
-}
+	int i;
 
-static void srcu_ref_perf_refcnt_unlock(int idx) __releases(srcu_ctlp)
-{
-	atomic_dec(&refcnt);
-	srcu_read_unlock(srcu_ctlp, idx);
+	for (i = nloops; i >= 0; i--) {
+		atomic_inc(&refcnt);
+		atomic_dec(&refcnt);
+	}
 }
 
 static struct ref_perf_ops refcnt_ops = {
 	.init		= rcu_sync_perf_init,
-	.readlock	= srcu_ref_perf_refcnt_lock,
-	.readunlock	= srcu_ref_perf_refcnt_unlock,
+	.readsection	= ref_perf_refcnt_section,
 	.name		= "refcnt"
 };
 
@@ -190,21 +183,19 @@ static void ref_perf_rwlock_init(void)
 	rwlock_init(&test_rwlock);
 }
 
-static int ref_perf_rwlock_lock(void)
+static void ref_perf_rwlock_section(const int nloops)
 {
-	read_lock(&test_rwlock);
-	return 0;
-}
+	int i;
 
-static void ref_perf_rwlock_unlock(int idx)
-{
-	read_unlock(&test_rwlock);
+	for (i = nloops; i >= 0; i--) {
+		read_lock(&test_rwlock);
+		read_unlock(&test_rwlock);
+	}
 }
 
 static struct ref_perf_ops rwlock_ops = {
 	.init		= ref_perf_rwlock_init,
-	.readlock	= ref_perf_rwlock_lock,
-	.readunlock	= ref_perf_rwlock_unlock,
+	.readsection	= ref_perf_rwlock_section,
 	.name		= "rwlock"
 };
 
@@ -216,21 +207,19 @@ static void ref_perf_rwsem_init(void)
 	init_rwsem(&test_rwsem);
 }
 
-static int ref_perf_rwsem_lock(void)
+static void ref_perf_rwsem_section(const int nloops)
 {
-	down_read(&test_rwsem);
-	return 0;
-}
+	int i;
 
-static void ref_perf_rwsem_unlock(int idx)
-{
-	up_read(&test_rwsem);
+	for (i = nloops; i >= 0; i--) {
+		down_read(&test_rwsem);
+		up_read(&test_rwsem);
+	}
 }
 
 static struct ref_perf_ops rwsem_ops = {
 	.init		= ref_perf_rwsem_init,
-	.readlock	= ref_perf_rwsem_lock,
-	.readunlock	= ref_perf_rwsem_unlock,
+	.readsection	= ref_perf_rwsem_section,
 	.name		= "rwsem"
 };
 
@@ -242,8 +231,6 @@ ref_perf_reader(void *arg)
 	unsigned long flags;
 	long me = (long)arg;
 	struct reader_task *rt = &(reader_tasks[me]);
-	unsigned long spincnt;
-	int idx;
 	u64 start;
 	s64 duration;
 
@@ -275,10 +262,7 @@ repeat:
 
 	VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx);
 
-	for (spincnt = 0; spincnt < loops; spincnt++) {
-		idx = cur_ops->readlock();
-		cur_ops->readunlock(idx);
-	}
+	cur_ops->readsection(loops);
 
 	duration = ktime_get_mono_fast_ns() - start;
 	local_irq_restore(flags);
-- 
cgit v1.2.3


From 83b88c86da0e5f97faeac5a9bb19fe32f8c0394b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 25 May 2020 15:31:07 -0700
Subject: refperf: Allow decimal nanoseconds

The CONFIG_PREEMPT=n rcu_read_lock()/rcu_read_unlock() pair's overhead,
even including loop overhead, is far less than one nanosecond.
Since logscale plots are not all that happy with zero values, provide
picoseconds as decimals.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 57c7b7a40bd2..e991d4820f51 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -375,7 +375,7 @@ static int main_func(void *arg)
 		if (torture_must_stop())
 			goto end;
 
-		reader_tasks[exp].result_avg = process_durations(exp) / ((exp + 1) * loops);
+		reader_tasks[exp].result_avg = 1000 * process_durations(exp) / ((exp + 1) * loops);
 	}
 
 	// Print the average of all experiments
@@ -386,7 +386,7 @@ static int main_func(void *arg)
 	strcat(buf, "Threads\tTime(ns)\n");
 
 	for (exp = 0; exp < nreaders; exp++) {
-		sprintf(buf1, "%d\t%llu\n", exp + 1, reader_tasks[exp].result_avg);
+		sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, reader_tasks[exp].result_avg / 1000, (int)(reader_tasks[exp].result_avg % 1000));
 		strcat(buf, buf1);
 	}
 
-- 
cgit v1.2.3


From 8fc28783a0c3704ea27505a25dbde8333d75380c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 25 May 2020 15:48:38 -0700
Subject: refperf: Convert nreaders to a module parameter

This commit converts nreaders to a module parameter, with the default
of -1 specifying the old behavior of using 75% of the readers.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index e991d4820f51..020e55a9a64b 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -62,6 +62,12 @@ torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_PERF_TEST) ? 10 : 0,
 	      "Holdoff time before test start (s)");
 // Number of loops per experiment, all readers execute operations concurrently.
 torture_param(long, loops, 10000000, "Number of loops per experiment.");
+// Number of readers, with -1 defaulting to about 75% of the CPUs.
+torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs.");
+// Number of runs.
+torture_param(int, nruns, 30, "Number of experiments to run.");
+// Reader delay in nanoseconds, 0 for no delay.
+torture_param(int, readdelay, 0, "Read-side delay in nanoseconds.");
 
 #ifdef MODULE
 # define REFPERF_SHUTDOWN 0
@@ -93,7 +99,6 @@ static wait_queue_head_t main_wq;
 static int shutdown_start;
 
 static struct reader_task *reader_tasks;
-static int nreaders;
 
 // Number of readers that are part of the current experiment.
 static atomic_t nreaders_exp;
@@ -411,8 +416,8 @@ static void
 ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag)
 {
 	pr_alert("%s" PERF_FLAG
-		 "--- %s:  verbose=%d shutdown=%d holdoff=%d loops=%ld\n", perf_type, tag,
-		 verbose, shutdown, holdoff, loops);
+		 "--- %s:  verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d\n", perf_type, tag,
+		 verbose, shutdown, holdoff, loops, nreaders);
 }
 
 static void
@@ -501,8 +506,9 @@ ref_perf_init(void)
 		schedule_timeout_uninterruptible(1);
 	}
 
-	// Reader tasks (~75% of online CPUs).
-	nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2);
+	// Reader tasks (default to ~75% of online CPUs).
+	if (nreaders < 0)
+		nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2);
 	reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]),
 			       GFP_KERNEL);
 	if (!reader_tasks) {
-- 
cgit v1.2.3


From dbf28efdae7bb51032eeb0fe1b6bd07d6f0f9b6c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 25 May 2020 17:22:24 -0700
Subject: refperf: Provide module parameter to specify number of experiments

The current code uses the number of threads both to limit the number
of threads and to specify the number of experiments, but also varies
the number of threads as the experiments progress.  This commit takes
a different approach by adding an refperf.nruns module parameter that
specifies the number of experiments, and furthermore uses the same
number of threads for each experiment.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 43 +++++++++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 020e55a9a64b..6324449db404 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -83,12 +83,6 @@ struct reader_task {
 	atomic_t start;
 	wait_queue_head_t wq;
 	u64 last_duration_ns;
-
-	// The average latency When 1..<this reader> are concurrently
-	// running an experiment. For example, if this reader_task is
-	// of index 5 in the reader_tasks array, then result is for
-	// 6 cores.
-	u64 result_avg;
 };
 
 static struct task_struct *shutdown_task;
@@ -289,12 +283,12 @@ end:
 	return 0;
 }
 
-void reset_readers(int n)
+void reset_readers(void)
 {
 	int i;
 	struct reader_task *rt;
 
-	for (i = 0; i < n; i++) {
+	for (i = 0; i < nreaders; i++) {
 		rt = &(reader_tasks[i]);
 
 		rt->last_duration_ns = 0;
@@ -314,7 +308,7 @@ u64 process_durations(int n)
 	sprintf(buf, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)",
 		exp_idx);
 
-	for (i = 0; i <= n && !torture_must_stop(); i++) {
+	for (i = 0; i < n && !torture_must_stop(); i++) {
 		rt = &(reader_tasks[i]);
 		sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns);
 
@@ -342,11 +336,15 @@ static int main_func(void *arg)
 	int exp, r;
 	char buf1[64];
 	char buf[512];
+	u64 *result_avg;
 
 	set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids));
 	set_user_nice(current, MAX_NICE);
 
 	VERBOSE_PERFOUT("main_func task started");
+	result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL);
+	if (!result_avg)
+		VERBOSE_PERFOUT_ERRSTRING("out of memory");
 	atomic_inc(&n_init);
 
 	// Wait for all threads to start.
@@ -355,22 +353,24 @@ static int main_func(void *arg)
 		schedule_timeout_interruptible(holdoff * HZ);
 
 	// Start exp readers up per experiment
-	for (exp = 0; exp < nreaders && !torture_must_stop(); exp++) {
+	for (exp = 0; exp < nruns && !torture_must_stop(); exp++) {
+		if (!result_avg)
+			break;
 		if (torture_must_stop())
 			goto end;
 
-		reset_readers(exp);
-		atomic_set(&nreaders_exp, exp + 1);
+		reset_readers();
+		atomic_set(&nreaders_exp, nreaders);
 
 		exp_idx = exp;
 
-		for (r = 0; r <= exp; r++) {
+		for (r = 0; r < nreaders; r++) {
 			atomic_set(&reader_tasks[r].start, 1);
 			wake_up(&reader_tasks[r].wq);
 		}
 
 		VERBOSE_PERFOUT("main_func: experiment started, waiting for %d readers",
-				exp);
+				nreaders);
 
 		wait_event(main_wq,
 			   !atomic_read(&nreaders_exp) || torture_must_stop());
@@ -380,7 +380,7 @@ static int main_func(void *arg)
 		if (torture_must_stop())
 			goto end;
 
-		reader_tasks[exp].result_avg = 1000 * process_durations(exp) / ((exp + 1) * loops);
+		result_avg[exp] = 1000 * process_durations(nreaders) / (nreaders * loops);
 	}
 
 	// Print the average of all experiments
@@ -390,12 +390,15 @@ static int main_func(void *arg)
 	strcat(buf, "\n");
 	strcat(buf, "Threads\tTime(ns)\n");
 
-	for (exp = 0; exp < nreaders; exp++) {
-		sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, reader_tasks[exp].result_avg / 1000, (int)(reader_tasks[exp].result_avg % 1000));
+	for (exp = 0; exp < nruns; exp++) {
+		if (!result_avg)
+			break;
+		sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, result_avg[exp] / 1000, (int)(result_avg[exp] % 1000));
 		strcat(buf, buf1);
 	}
 
-	PERFOUT("%s", buf);
+	if (result_avg)
+		PERFOUT("%s", buf);
 
 	// This will shutdown everything including us.
 	if (shutdown) {
@@ -416,8 +419,8 @@ static void
 ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag)
 {
 	pr_alert("%s" PERF_FLAG
-		 "--- %s:  verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d\n", perf_type, tag,
-		 verbose, shutdown, holdoff, loops, nreaders);
+		 "--- %s:  verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d\n", perf_type, tag,
+		 verbose, shutdown, holdoff, loops, nreaders, nruns);
 }
 
 static void
-- 
cgit v1.2.3


From f518f154ecef347777db33b7c9b0581f245159f0 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 25 May 2020 17:32:56 -0700
Subject: refperf: Dynamically allocate experiment-summary output buffer

Currently, the buffer used to accumulate the experiment-summary output
is fixed size, which will cause problems if someone decides to run
one hundred experiments.  This commit therefore dynamically allocates
this buffer.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 6324449db404..75b9cceaece1 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -333,9 +333,10 @@ u64 process_durations(int n)
 // point all the timestamps are printed.
 static int main_func(void *arg)
 {
+	bool errexit = false;
 	int exp, r;
 	char buf1[64];
-	char buf[512];
+	char *buf;
 	u64 *result_avg;
 
 	set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids));
@@ -343,8 +344,11 @@ static int main_func(void *arg)
 
 	VERBOSE_PERFOUT("main_func task started");
 	result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL);
-	if (!result_avg)
+	buf = kzalloc(64 + nruns * 32, GFP_KERNEL);
+	if (!result_avg || !buf) {
 		VERBOSE_PERFOUT_ERRSTRING("out of memory");
+		errexit = true;
+	}
 	atomic_inc(&n_init);
 
 	// Wait for all threads to start.
@@ -354,7 +358,7 @@ static int main_func(void *arg)
 
 	// Start exp readers up per experiment
 	for (exp = 0; exp < nruns && !torture_must_stop(); exp++) {
-		if (!result_avg)
+		if (errexit)
 			break;
 		if (torture_must_stop())
 			goto end;
@@ -391,13 +395,13 @@ static int main_func(void *arg)
 	strcat(buf, "Threads\tTime(ns)\n");
 
 	for (exp = 0; exp < nruns; exp++) {
-		if (!result_avg)
+		if (errexit)
 			break;
 		sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, result_avg[exp] / 1000, (int)(result_avg[exp] % 1000));
 		strcat(buf, buf1);
 	}
 
-	if (result_avg)
+	if (!errexit)
 		PERFOUT("%s", buf);
 
 	// This will shutdown everything including us.
@@ -412,6 +416,8 @@ static int main_func(void *arg)
 
 end:
 	torture_kthread_stopping("main_func");
+	kfree(result_avg);
+	kfree(buf);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 2e90de76f226f11fe26c871aa321be28152f565a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 25 May 2020 17:45:03 -0700
Subject: refperf: Dynamically allocate thread-summary output buffer

Currently, the buffer used to accumulate the thread-summary output is
fixed size, which will cause problems if someone decides to run on a large
number of PCUs.  This commit therefore dynamically allocates this buffer.

[ paulmck: Fix memory allocation as suggested by KASAN. ]
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 75b9cceaece1..fc940e3dba1f 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -301,9 +301,12 @@ u64 process_durations(int n)
 	int i;
 	struct reader_task *rt;
 	char buf1[64];
-	char buf[512];
+	char *buf;
 	u64 sum = 0;
 
+	buf = kmalloc(128 + nreaders * 32, GFP_KERNEL);
+	if (!buf)
+		return 0;
 	buf[0] = 0;
 	sprintf(buf, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)",
 		exp_idx);
@@ -322,6 +325,7 @@ u64 process_durations(int n)
 
 	PERFOUT("%s\n", buf);
 
+	kfree(buf);
 	return sum;
 }
 
-- 
cgit v1.2.3


From 2990750bceb05c3cdeae3a6d2683cbc4ae4de15e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 26 May 2020 09:32:57 -0700
Subject: refperf: Make functions static

Because the reset_readers() and process_durations() functions are used
only within kernel/rcu/refperf.c, this commit makes them static.

Reported-by: kbuild test robot <lkp@intel.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index fc940e3dba1f..0a900f3ae151 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -283,7 +283,7 @@ end:
 	return 0;
 }
 
-void reset_readers(void)
+static void reset_readers(void)
 {
 	int i;
 	struct reader_task *rt;
@@ -296,7 +296,7 @@ void reset_readers(void)
 }
 
 // Print the results of each reader and return the sum of all their durations.
-u64 process_durations(int n)
+static u64 process_durations(int n)
 {
 	int i;
 	struct reader_task *rt;
-- 
cgit v1.2.3


From b864f89ff61492f56b4e8c6713a5efec6540a0e2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 26 May 2020 10:57:34 -0700
Subject: refperf: Tune reader measurement interval

This commit moves a printk() out of the measurement interval, converts
a atomic_dec()/atomic_read() pair to atomic_dec_and_test(), and adds
a smp_mb__before_atomic() to avoid potential wake/wait hangs.  These
changes have the added benefit of reducing the number of loops required
for amortizing loop overhead for CONFIG_PREEMPT=n RCU measurements from
1,000,000 to 10,000.  This reduction in turn shortens the test, reducing
the probability of interference.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 0a900f3ae151..8815ccfb6f98 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -252,15 +252,16 @@ repeat:
 	// Make sure that the CPU is affinitized appropriately during testing.
 	WARN_ON_ONCE(smp_processor_id() != me);
 
+	smp_mb__before_atomic();
 	atomic_dec(&rt->start);
 
+	VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx);
+
 	// To prevent noise, keep interrupts disabled. This also has the
 	// effect of preventing entries into slow path for rcu_read_unlock().
 	local_irq_save(flags);
 	start = ktime_get_mono_fast_ns();
 
-	VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx);
-
 	cur_ops->readsection(loops);
 
 	duration = ktime_get_mono_fast_ns() - start;
@@ -268,14 +269,12 @@ repeat:
 
 	rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration;
 
-	atomic_dec(&nreaders_exp);
+	if (atomic_dec_and_test(&nreaders_exp))
+		wake_up(&main_wq);
 
 	VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d ended, (readers remaining=%d)",
 			me, exp_idx, atomic_read(&nreaders_exp));
 
-	if (!atomic_read(&nreaders_exp))
-		wake_up(&main_wq);
-
 	if (!torture_must_stop())
 		goto repeat;
 end:
-- 
cgit v1.2.3


From af2789db13b8dc38d16e969f8c11b9468be42d46 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 26 May 2020 11:22:03 -0700
Subject: refperf: Convert reader_task structure's "start" field to int

This commit converts the reader_task structure's "start" field to int
in order to demote a full barrier to an smp_load_acquire() and also to
simplify the code a bit.  While in the area, and to enlist the compiler's
help in ensuring that nothing was missed, the field's name was changed
to start_reader.

Also while in the area, change the main_func() store to use
smp_store_release() to further fortify against wait/wake races.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 8815ccfb6f98..2fd3ed1a0d0d 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -80,7 +80,7 @@ torture_param(bool, shutdown, REFPERF_SHUTDOWN,
 
 struct reader_task {
 	struct task_struct *task;
-	atomic_t start;
+	int start_reader;
 	wait_queue_head_t wq;
 	u64 last_duration_ns;
 };
@@ -243,7 +243,7 @@ repeat:
 	VERBOSE_PERFOUT("ref_perf_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id());
 
 	// Wait for signal that this reader can start.
-	wait_event(rt->wq, (atomic_read(&nreaders_exp) && atomic_read(&rt->start)) ||
+	wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) ||
 			   torture_must_stop());
 
 	if (torture_must_stop())
@@ -252,8 +252,7 @@ repeat:
 	// Make sure that the CPU is affinitized appropriately during testing.
 	WARN_ON_ONCE(smp_processor_id() != me);
 
-	smp_mb__before_atomic();
-	atomic_dec(&rt->start);
+	WRITE_ONCE(rt->start_reader, 0);
 
 	VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx);
 
@@ -372,7 +371,7 @@ static int main_func(void *arg)
 		exp_idx = exp;
 
 		for (r = 0; r < nreaders; r++) {
-			atomic_set(&reader_tasks[r].start, 1);
+			smp_store_release(&reader_tasks[r].start_reader, 1);
 			wake_up(&reader_tasks[r].wq);
 		}
 
-- 
cgit v1.2.3


From 86e0da2bb8ed934d3dce5a337895f1118f59c087 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 26 May 2020 11:40:52 -0700
Subject: refperf: More closely synchronize reader start times

Currently, readers are awakened individually.  On most systems, this
results in significant wakeup delay from one reader to the next, which
can result in the first and last reader having sole access to the
synchronization primitive in question.  If that synchronization primitive
involves shared memory, those readers will rack up a huge number of
operations in a very short time, causing large perturbations in the
results.

This commit therefore has the readers busy-wait after being awakened,
and uses a new n_started variable to synchronize their start times.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 2fd3ed1a0d0d..234bb0e84a8b 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -99,6 +99,7 @@ static atomic_t nreaders_exp;
 
 // Use to wait for all threads to start.
 static atomic_t n_init;
+static atomic_t n_started;
 
 // Track which experiment is currently running.
 static int exp_idx;
@@ -253,6 +254,9 @@ repeat:
 	WARN_ON_ONCE(smp_processor_id() != me);
 
 	WRITE_ONCE(rt->start_reader, 0);
+	if (!atomic_dec_return(&n_started))
+		while (atomic_read_acquire(&n_started))
+			cpu_relax();
 
 	VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx);
 
@@ -367,6 +371,7 @@ static int main_func(void *arg)
 
 		reset_readers();
 		atomic_set(&nreaders_exp, nreaders);
+		atomic_set(&n_started, nreaders);
 
 		exp_idx = exp;
 
-- 
cgit v1.2.3


From 2db0bda38453f472640f4ece1e2a495cbd44f892 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 26 May 2020 12:34:57 -0700
Subject: refperf: Add warmup and cooldown processing phases

This commit causes all the readers to start running unmeasured load
until all readers have done at least one such run (thus having warmed
up), then run the measured load, and then run unmeasured load until all
readers have completed their measured load.  This approach avoids any
thread running measured load while other readers are idle.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 234bb0e84a8b..445190b97b05 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -100,6 +100,8 @@ static atomic_t nreaders_exp;
 // Use to wait for all threads to start.
 static atomic_t n_init;
 static atomic_t n_started;
+static atomic_t n_warmedup;
+static atomic_t n_cooleddown;
 
 // Track which experiment is currently running.
 static int exp_idx;
@@ -260,8 +262,15 @@ repeat:
 
 	VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx);
 
-	// To prevent noise, keep interrupts disabled. This also has the
-	// effect of preventing entries into slow path for rcu_read_unlock().
+
+	// To reduce noise, do an initial cache-warming invocation, check
+	// in, and then keep warming until everyone has checked in.
+	cur_ops->readsection(loops);
+	if (!atomic_dec_return(&n_warmedup))
+		while (atomic_read_acquire(&n_warmedup))
+			cur_ops->readsection(loops);
+	// Also keep interrupts disabled.  This also has the effect
+	// of preventing entries into slow path for rcu_read_unlock().
 	local_irq_save(flags);
 	start = ktime_get_mono_fast_ns();
 
@@ -271,6 +280,11 @@ repeat:
 	local_irq_restore(flags);
 
 	rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration;
+	// To reduce runtime-skew noise, do maintain-load invocations until
+	// everyone is done.
+	if (!atomic_dec_return(&n_cooleddown))
+		while (atomic_read_acquire(&n_cooleddown))
+			cur_ops->readsection(loops);
 
 	if (atomic_dec_and_test(&nreaders_exp))
 		wake_up(&main_wq);
@@ -372,6 +386,8 @@ static int main_func(void *arg)
 		reset_readers();
 		atomic_set(&nreaders_exp, nreaders);
 		atomic_set(&n_started, nreaders);
+		atomic_set(&n_warmedup, nreaders);
+		atomic_set(&n_cooleddown, nreaders);
 
 		exp_idx = exp;
 
-- 
cgit v1.2.3


From 6efb06340846c788336f402e3a472a24fabb431e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 26 May 2020 14:26:25 -0700
Subject: refperf: Label experiment-number column "Runs"

The experiment-number column is currently labeled "Threads", which is
misleading at best.  This commit therefore relabels it as "Runs", and
adjusts the scripts accordingly.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c                                          | 2 +-
 tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 445190b97b05..2d2d227d761a 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -415,7 +415,7 @@ static int main_func(void *arg)
 
 	buf[0] = 0;
 	strcat(buf, "\n");
-	strcat(buf, "Threads\tTime(ns)\n");
+	strcat(buf, "Runs\tTime(ns)\n");
 
 	for (exp = 0; exp < nruns; exp++) {
 		if (errexit)
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh
index 6fc06cd3538e..0660f3fab215 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh
@@ -24,7 +24,7 @@ configfile=`echo $i | sed -e 's/^.*\///'`
 
 sed -e 's/^\[[^]]*]//' < $i/console.log | tr -d '\015' |
 awk -v configfile="$configfile" '
-/^[ 	]*Threads	Time\(ns\) *$/ {
+/^[ 	]*Runs	Time\(ns\) *$/ {
 	if (dataphase + 0 == 0) {
 		dataphase = 1;
 		# print configfile, $0;
-- 
cgit v1.2.3


From 96af8669591d740a1e2695c4d96e544409dbf896 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 27 May 2020 16:46:56 -0700
Subject: refperf: Simplify initialization-time wakeup protocol

This commit moves the reader-launch wait loop from ref_perf_init()
to main_func(), removing one layer of wakeup and allowing slightly
faster system boot.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 2d2d227d761a..7839237ffc17 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -369,13 +369,14 @@ static int main_func(void *arg)
 		VERBOSE_PERFOUT_ERRSTRING("out of memory");
 		errexit = true;
 	}
-	atomic_inc(&n_init);
-
-	// Wait for all threads to start.
-	wait_event(main_wq, atomic_read(&n_init) == (nreaders + 1));
 	if (holdoff)
 		schedule_timeout_interruptible(holdoff * HZ);
 
+	// Wait for all threads to start.
+	atomic_inc(&n_init);
+	while (atomic_read(&n_init) < nreaders + 1)
+		schedule_timeout_uninterruptible(1);
+
 	// Start exp readers up per experiment
 	for (exp = 0; exp < nruns && !torture_must_stop(); exp++) {
 		if (errexit)
@@ -565,14 +566,6 @@ ref_perf_init(void)
 	firsterr = torture_create_kthread(main_func, NULL, main_task);
 	if (firsterr)
 		goto unwind;
-	schedule_timeout_uninterruptible(1);
-
-
-	// Wait until all threads start
-	while (atomic_read(&n_init) < nreaders + 1)
-		schedule_timeout_uninterruptible(1);
-
-	wake_up(&main_wq);
 
 	torture_init_end();
 	return 0;
-- 
cgit v1.2.3


From b4d1e34f6502a138e32275baabdb6d593d7ea432 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 28 May 2020 16:37:35 -0700
Subject: refperf: Add read-side delay module parameter

This commit adds a refperf.readdelay module parameter that controls the
duration of each critical section.  This parameter allows gathering data
showing how the performance differences between the various primitives
vary with critical-section length.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 108 ++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 89 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 7839237ffc17..57a750bbcaca 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -66,8 +66,8 @@ torture_param(long, loops, 10000000, "Number of loops per experiment.");
 torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs.");
 // Number of runs.
 torture_param(int, nruns, 30, "Number of experiments to run.");
-// Reader delay in nanoseconds, 0 for no delay.
-torture_param(int, readdelay, 0, "Read-side delay in nanoseconds.");
+// Reader delay in microseconds, 0 for no delay.
+torture_param(int, readdelay, 0, "Read-side delay in microseconds.");
 
 #ifdef MODULE
 # define REFPERF_SHUTDOWN 0
@@ -111,6 +111,7 @@ struct ref_perf_ops {
 	void (*init)(void);
 	void (*cleanup)(void);
 	void (*readsection)(const int nloops);
+	void (*delaysection)(const int nloops, const int ndelay);
 	const char *name;
 };
 
@@ -126,6 +127,17 @@ static void ref_rcu_read_section(const int nloops)
 	}
 }
 
+static void ref_rcu_delay_section(const int nloops, const int ndelay)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--) {
+		rcu_read_lock();
+		udelay(ndelay);
+		rcu_read_unlock();
+	}
+}
+
 static void rcu_sync_perf_init(void)
 {
 }
@@ -133,6 +145,7 @@ static void rcu_sync_perf_init(void)
 static struct ref_perf_ops rcu_ops = {
 	.init		= rcu_sync_perf_init,
 	.readsection	= ref_rcu_read_section,
+	.delaysection	= ref_rcu_delay_section,
 	.name		= "rcu"
 };
 
@@ -141,7 +154,7 @@ static struct ref_perf_ops rcu_ops = {
 DEFINE_STATIC_SRCU(srcu_refctl_perf);
 static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf;
 
-static void srcu_ref_perf_read_section(int nloops)
+static void srcu_ref_perf_read_section(const int nloops)
 {
 	int i;
 	int idx;
@@ -152,16 +165,29 @@ static void srcu_ref_perf_read_section(int nloops)
 	}
 }
 
+static void srcu_ref_perf_delay_section(const int nloops, const int ndelay)
+{
+	int i;
+	int idx;
+
+	for (i = nloops; i >= 0; i--) {
+		idx = srcu_read_lock(srcu_ctlp);
+		udelay(ndelay);
+		srcu_read_unlock(srcu_ctlp, idx);
+	}
+}
+
 static struct ref_perf_ops srcu_ops = {
 	.init		= rcu_sync_perf_init,
 	.readsection	= srcu_ref_perf_read_section,
+	.delaysection	= srcu_ref_perf_delay_section,
 	.name		= "srcu"
 };
 
 // Definitions for reference count
 static atomic_t refcnt;
 
-static void ref_perf_refcnt_section(const int nloops)
+static void ref_refcnt_section(const int nloops)
 {
 	int i;
 
@@ -171,45 +197,69 @@ static void ref_perf_refcnt_section(const int nloops)
 	}
 }
 
+static void ref_refcnt_delay_section(const int nloops, const int ndelay)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--) {
+		atomic_inc(&refcnt);
+		udelay(ndelay);
+		atomic_dec(&refcnt);
+	}
+}
+
 static struct ref_perf_ops refcnt_ops = {
 	.init		= rcu_sync_perf_init,
-	.readsection	= ref_perf_refcnt_section,
+	.readsection	= ref_refcnt_section,
+	.delaysection	= ref_refcnt_delay_section,
 	.name		= "refcnt"
 };
 
 // Definitions for rwlock
 static rwlock_t test_rwlock;
 
-static void ref_perf_rwlock_init(void)
+static void ref_rwlock_init(void)
 {
 	rwlock_init(&test_rwlock);
 }
 
-static void ref_perf_rwlock_section(const int nloops)
+static void ref_rwlock_section(const int nloops)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--) {
+		read_lock(&test_rwlock);
+		read_unlock(&test_rwlock);
+	}
+}
+
+static void ref_rwlock_delay_section(const int nloops, const int ndelay)
 {
 	int i;
 
 	for (i = nloops; i >= 0; i--) {
 		read_lock(&test_rwlock);
+		udelay(ndelay);
 		read_unlock(&test_rwlock);
 	}
 }
 
 static struct ref_perf_ops rwlock_ops = {
-	.init		= ref_perf_rwlock_init,
-	.readsection	= ref_perf_rwlock_section,
+	.init		= ref_rwlock_init,
+	.readsection	= ref_rwlock_section,
+	.delaysection	= ref_rwlock_delay_section,
 	.name		= "rwlock"
 };
 
 // Definitions for rwsem
 static struct rw_semaphore test_rwsem;
 
-static void ref_perf_rwsem_init(void)
+static void ref_rwsem_init(void)
 {
 	init_rwsem(&test_rwsem);
 }
 
-static void ref_perf_rwsem_section(const int nloops)
+static void ref_rwsem_section(const int nloops)
 {
 	int i;
 
@@ -219,12 +269,32 @@ static void ref_perf_rwsem_section(const int nloops)
 	}
 }
 
+static void ref_rwsem_delay_section(const int nloops, const int ndelay)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--) {
+		down_read(&test_rwsem);
+		udelay(ndelay);
+		up_read(&test_rwsem);
+	}
+}
+
 static struct ref_perf_ops rwsem_ops = {
-	.init		= ref_perf_rwsem_init,
-	.readsection	= ref_perf_rwsem_section,
+	.init		= ref_rwsem_init,
+	.readsection	= ref_rwsem_section,
+	.delaysection	= ref_rwsem_delay_section,
 	.name		= "rwsem"
 };
 
+static void rcu_perf_one_reader(void)
+{
+	if (readdelay <= 0)
+		cur_ops->readsection(loops);
+	else
+		cur_ops->delaysection(loops, readdelay);
+}
+
 // Reader kthread.  Repeatedly does empty RCU read-side
 // critical section, minimizing update-side interference.
 static int
@@ -265,16 +335,16 @@ repeat:
 
 	// To reduce noise, do an initial cache-warming invocation, check
 	// in, and then keep warming until everyone has checked in.
-	cur_ops->readsection(loops);
+	rcu_perf_one_reader();
 	if (!atomic_dec_return(&n_warmedup))
 		while (atomic_read_acquire(&n_warmedup))
-			cur_ops->readsection(loops);
+			rcu_perf_one_reader();
 	// Also keep interrupts disabled.  This also has the effect
 	// of preventing entries into slow path for rcu_read_unlock().
 	local_irq_save(flags);
 	start = ktime_get_mono_fast_ns();
 
-	cur_ops->readsection(loops);
+	rcu_perf_one_reader();
 
 	duration = ktime_get_mono_fast_ns() - start;
 	local_irq_restore(flags);
@@ -284,7 +354,7 @@ repeat:
 	// everyone is done.
 	if (!atomic_dec_return(&n_cooleddown))
 		while (atomic_read_acquire(&n_cooleddown))
-			cur_ops->readsection(loops);
+			rcu_perf_one_reader();
 
 	if (atomic_dec_and_test(&nreaders_exp))
 		wake_up(&main_wq);
@@ -449,8 +519,8 @@ static void
 ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag)
 {
 	pr_alert("%s" PERF_FLAG
-		 "--- %s:  verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d\n", perf_type, tag,
-		 verbose, shutdown, holdoff, loops, nreaders, nruns);
+		 "--- %s:  verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", perf_type, tag,
+		 verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay);
 }
 
 static void
-- 
cgit v1.2.3


From 4dd72a338a07486823037a6b45334d05192c913a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 29 May 2020 13:11:26 -0700
Subject: refperf: Adjust refperf.loop default value

With the various measurement optimizations, 10,000 loops normally
suffices.  This commit therefore reduces the refperf.loops default value
from 10,000,000 to 10,000.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 57a750bbcaca..063eeb0473a1 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -61,7 +61,7 @@ torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
 torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_PERF_TEST) ? 10 : 0,
 	      "Holdoff time before test start (s)");
 // Number of loops per experiment, all readers execute operations concurrently.
-torture_param(long, loops, 10000000, "Number of loops per experiment.");
+torture_param(long, loops, 10000, "Number of loops per experiment.");
 // Number of readers, with -1 defaulting to about 75% of the CPUs.
 torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs.");
 // Number of runs.
-- 
cgit v1.2.3


From 7c944d7c67daee84e3c756bb74ad2f32b28c41cf Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 29 May 2020 14:36:26 -0700
Subject: refperf: Work around 64-bit division
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A 64-bit division was introduced in refperf, breaking compilation
on all 32-bit architectures:

kernel/rcu/refperf.o: in function `main_func':
refperf.c:(.text+0x57c): undefined reference to `__aeabi_uldivmod'

Fix this by using div_u64 to mark the expensive operation.

[ paulmck: Update primitive and format per Nathan Chancellor. ]
Fixes: bd5b16d6c88d ("refperf: Allow decimal nanoseconds")
Reported-by: kbuild test robot <lkp@intel.com>
Reported-by: Valdis Klētnieks <valdis.kletnieks@vt.edu>
Acked-by: Randy Dunlap <rdunlap@infradead.org> # build-tested
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 063eeb0473a1..80d449060bdf 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -478,7 +478,7 @@ static int main_func(void *arg)
 		if (torture_must_stop())
 			goto end;
 
-		result_avg[exp] = 1000 * process_durations(nreaders) / (nreaders * loops);
+		result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops);
 	}
 
 	// Print the average of all experiments
@@ -489,9 +489,13 @@ static int main_func(void *arg)
 	strcat(buf, "Runs\tTime(ns)\n");
 
 	for (exp = 0; exp < nruns; exp++) {
+		u64 avg;
+		u32 rem;
+
 		if (errexit)
 			break;
-		sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, result_avg[exp] / 1000, (int)(result_avg[exp] % 1000));
+		avg = div_u64_rem(result_avg[exp], 1000, &rem);
+		sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem);
 		strcat(buf, buf1);
 	}
 
-- 
cgit v1.2.3


From 918b351d965560c7902ad482cf87049517843ff2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Sun, 31 May 2020 18:14:57 -0700
Subject: refperf: Change readdelay module parameter to nanoseconds

The current units of microseconds are too coarse, so this commit
changes the units to nanoseconds.  However, ndelay is used only for the
nanoseconds with udelay being used for whole microseconds.  For example,
setting refperf.readdelay=1500 results in a udelay(1) followed by an
ndelay(500).

Suggested-by: Akira Yokosawa <akiyks@gmail.com>
[ paulmck: Abstracted delay per Akira feedback and move from 80 to 100 lines. ]
[ paulmck: Fix names as suggested by kbuild test robot. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 80d449060bdf..49fffb9bce77 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -66,8 +66,8 @@ torture_param(long, loops, 10000, "Number of loops per experiment.");
 torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs.");
 // Number of runs.
 torture_param(int, nruns, 30, "Number of experiments to run.");
-// Reader delay in microseconds, 0 for no delay.
-torture_param(int, readdelay, 0, "Read-side delay in microseconds.");
+// Reader delay in nanoseconds, 0 for no delay.
+torture_param(int, readdelay, 0, "Read-side delay in nanoseconds.");
 
 #ifdef MODULE
 # define REFPERF_SHUTDOWN 0
@@ -111,12 +111,20 @@ struct ref_perf_ops {
 	void (*init)(void);
 	void (*cleanup)(void);
 	void (*readsection)(const int nloops);
-	void (*delaysection)(const int nloops, const int ndelay);
+	void (*delaysection)(const int nloops, const int udl, const int ndl);
 	const char *name;
 };
 
 static struct ref_perf_ops *cur_ops;
 
+static void un_delay(const int udl, const int ndl)
+{
+	if (udl)
+		udelay(udl);
+	if (ndl)
+		ndelay(ndl);
+}
+
 static void ref_rcu_read_section(const int nloops)
 {
 	int i;
@@ -127,13 +135,13 @@ static void ref_rcu_read_section(const int nloops)
 	}
 }
 
-static void ref_rcu_delay_section(const int nloops, const int ndelay)
+static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl)
 {
 	int i;
 
 	for (i = nloops; i >= 0; i--) {
 		rcu_read_lock();
-		udelay(ndelay);
+		un_delay(udl, ndl);
 		rcu_read_unlock();
 	}
 }
@@ -165,14 +173,14 @@ static void srcu_ref_perf_read_section(const int nloops)
 	}
 }
 
-static void srcu_ref_perf_delay_section(const int nloops, const int ndelay)
+static void srcu_ref_perf_delay_section(const int nloops, const int udl, const int ndl)
 {
 	int i;
 	int idx;
 
 	for (i = nloops; i >= 0; i--) {
 		idx = srcu_read_lock(srcu_ctlp);
-		udelay(ndelay);
+		un_delay(udl, ndl);
 		srcu_read_unlock(srcu_ctlp, idx);
 	}
 }
@@ -197,13 +205,13 @@ static void ref_refcnt_section(const int nloops)
 	}
 }
 
-static void ref_refcnt_delay_section(const int nloops, const int ndelay)
+static void ref_refcnt_delay_section(const int nloops, const int udl, const int ndl)
 {
 	int i;
 
 	for (i = nloops; i >= 0; i--) {
 		atomic_inc(&refcnt);
-		udelay(ndelay);
+		un_delay(udl, ndl);
 		atomic_dec(&refcnt);
 	}
 }
@@ -233,13 +241,13 @@ static void ref_rwlock_section(const int nloops)
 	}
 }
 
-static void ref_rwlock_delay_section(const int nloops, const int ndelay)
+static void ref_rwlock_delay_section(const int nloops, const int udl, const int ndl)
 {
 	int i;
 
 	for (i = nloops; i >= 0; i--) {
 		read_lock(&test_rwlock);
-		udelay(ndelay);
+		un_delay(udl, ndl);
 		read_unlock(&test_rwlock);
 	}
 }
@@ -269,13 +277,13 @@ static void ref_rwsem_section(const int nloops)
 	}
 }
 
-static void ref_rwsem_delay_section(const int nloops, const int ndelay)
+static void ref_rwsem_delay_section(const int nloops, const int udl, const int ndl)
 {
 	int i;
 
 	for (i = nloops; i >= 0; i--) {
 		down_read(&test_rwsem);
-		udelay(ndelay);
+		un_delay(udl, ndl);
 		up_read(&test_rwsem);
 	}
 }
@@ -292,7 +300,7 @@ static void rcu_perf_one_reader(void)
 	if (readdelay <= 0)
 		cur_ops->readsection(loops);
 	else
-		cur_ops->delaysection(loops, readdelay);
+		cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000);
 }
 
 // Reader kthread.  Repeatedly does empty RCU read-side
-- 
cgit v1.2.3


From 72bb749e7048d0a8d7663b59ec1a33bd56c51083 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 2 Jun 2020 08:34:41 -0700
Subject: refperf: Add test for RCU Tasks Trace readers.

This commit adds testing for RCU Tasks Trace readers to the refperf module.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 49fffb9bce77..da7de9ac548d 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -25,6 +25,7 @@
 #include <linux/notifier.h>
 #include <linux/percpu.h>
 #include <linux/rcupdate.h>
+#include <linux/rcupdate_trace.h>
 #include <linux/reboot.h>
 #include <linux/sched.h>
 #include <linux/spinlock.h>
@@ -157,7 +158,6 @@ static struct ref_perf_ops rcu_ops = {
 	.name		= "rcu"
 };
 
-
 // Definitions for SRCU ref perf testing.
 DEFINE_STATIC_SRCU(srcu_refctl_perf);
 static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf;
@@ -192,6 +192,35 @@ static struct ref_perf_ops srcu_ops = {
 	.name		= "srcu"
 };
 
+// Definitions for RCU Tasks Trace ref perf testing.
+static void rcu_trace_ref_perf_read_section(const int nloops)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--) {
+		rcu_read_lock_trace();
+		rcu_read_unlock_trace();
+	}
+}
+
+static void rcu_trace_ref_perf_delay_section(const int nloops, const int udl, const int ndl)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--) {
+		rcu_read_lock_trace();
+		un_delay(udl, ndl);
+		rcu_read_unlock_trace();
+	}
+}
+
+static struct ref_perf_ops rcu_trace_ops = {
+	.init		= rcu_sync_perf_init,
+	.readsection	= rcu_trace_ref_perf_read_section,
+	.delaysection	= rcu_trace_ref_perf_delay_section,
+	.name		= "rcu-trace"
+};
+
 // Definitions for reference count
 static atomic_t refcnt;
 
@@ -584,7 +613,7 @@ ref_perf_init(void)
 	long i;
 	int firsterr = 0;
 	static struct ref_perf_ops *perf_ops[] = {
-		&rcu_ops, &srcu_ops, &refcnt_ops, &rwlock_ops, &rwsem_ops,
+		&rcu_ops, &srcu_ops, &rcu_trace_ops, &refcnt_ops, &rwlock_ops, &rwsem_ops,
 	};
 
 	if (!torture_init_begin(perf_type, verbose))
-- 
cgit v1.2.3


From e13ef442fe522fa1f604efec8c899a0e1fc3d426 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 3 Jun 2020 11:56:34 -0700
Subject: refperf: Add test for RCU Tasks readers

This commit adds testing for RCU Tasks readers to the refperf module.
This also applies to RCU Rude readers, as both flavors have empty
(as in non-existent) read-side markers.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index da7de9ac548d..2bfdcdcb6bd1 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -192,6 +192,31 @@ static struct ref_perf_ops srcu_ops = {
 	.name		= "srcu"
 };
 
+// Definitions for RCU Tasks ref perf testing: Empty read markers.
+// These definitions also work for RCU Rude readers.
+static void rcu_tasks_ref_perf_read_section(const int nloops)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--)
+		continue;
+}
+
+static void rcu_tasks_ref_perf_delay_section(const int nloops, const int udl, const int ndl)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--)
+		un_delay(udl, ndl);
+}
+
+static struct ref_perf_ops rcu_tasks_ops = {
+	.init		= rcu_sync_perf_init,
+	.readsection	= rcu_tasks_ref_perf_read_section,
+	.delaysection	= rcu_tasks_ref_perf_delay_section,
+	.name		= "rcu-tasks"
+};
+
 // Definitions for RCU Tasks Trace ref perf testing.
 static void rcu_trace_ref_perf_read_section(const int nloops)
 {
@@ -613,7 +638,8 @@ ref_perf_init(void)
 	long i;
 	int firsterr = 0;
 	static struct ref_perf_ops *perf_ops[] = {
-		&rcu_ops, &srcu_ops, &rcu_trace_ops, &refcnt_ops, &rwlock_ops, &rwsem_ops,
+		&rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops,
+		&refcnt_ops, &rwlock_ops, &rwsem_ops,
 	};
 
 	if (!torture_init_begin(perf_type, verbose))
-- 
cgit v1.2.3


From c7dcf8106f7570b133b05ff68fd4100064965d9d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 12 Jun 2020 13:11:29 -0700
Subject: rcu-tasks: Fix synchronize_rcu_tasks_trace() header comment

The synchronize_rcu_tasks_trace() header comment incorrectly claims that
any number of things delimit RCU Tasks Trace read-side critical sections,
when in fact only rcu_read_lock_trace() and rcu_read_unlock_trace() do so.
This commit therefore fixes this comment, and, while in the area, fixes
a typo in the rcu_read_lock_trace() header comment.

Reported-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate_trace.h | 4 ++--
 kernel/rcu/tasks.h             | 9 ++++-----
 2 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h
index 4c25a41f8b27..d9015aac78c6 100644
--- a/include/linux/rcupdate_trace.h
+++ b/include/linux/rcupdate_trace.h
@@ -36,8 +36,8 @@ void rcu_read_unlock_trace_special(struct task_struct *t, int nesting);
 /**
  * rcu_read_lock_trace - mark beginning of RCU-trace read-side critical section
  *
- * When synchronize_rcu_trace() is invoked by one task, then that task
- * is guaranteed to block until all other tasks exit their read-side
+ * When synchronize_rcu_tasks_trace() is invoked by one task, then that
+ * task is guaranteed to block until all other tasks exit their read-side
  * critical sections.  Similarly, if call_rcu_trace() is invoked on one
  * task while other tasks are within RCU read-side critical sections,
  * invocation of the corresponding RCU callback is deferred until after
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index ce23f6cc5043..a77298c1d126 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -1118,11 +1118,10 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks_trace);
  * synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period
  *
  * Control will return to the caller some time after a trace rcu-tasks
- * grace period has elapsed, in other words after all currently
- * executing rcu-tasks read-side critical sections have elapsed.  These
- * read-side critical sections are delimited by calls to schedule(),
- * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory,
- * anyway) cond_resched().
+ * grace period has elapsed, in other words after all currently executing
+ * rcu-tasks read-side critical sections have elapsed.  These read-side
+ * critical sections are delimited by calls to rcu_read_lock_trace()
+ * and rcu_read_unlock_trace().
  *
  * This is a very specialized primitive, intended only for a few uses in
  * tracing and other situations requiring manipulation of function preambles
-- 
cgit v1.2.3


From 8e4ec3d02b549a731c94b4bcddff212bb92cdbaf Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 17 Jun 2020 11:33:54 -0700
Subject: refperf: Rename RCU_REF_PERF_TEST to RCU_REF_SCALE_TEST

The old Kconfig option name is all too easy to conflate with the
unrelated "perf" feature, so this commit renames RCU_REF_PERF_TEST to
RCU_REF_SCALE_TEST.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/Kconfig.debug                                    | 4 ++--
 kernel/rcu/Makefile                                         | 2 +-
 kernel/rcu/refperf.c                                        | 6 +++---
 tools/testing/selftests/rcutorture/configs/refperf/CFcommon | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 858765b7f644..3cf6132a4bb9 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -61,8 +61,8 @@ config RCU_TORTURE_TEST
 	  Say M if you want the RCU torture tests to build as a module.
 	  Say N if you are unsure.
 
-config RCU_REF_PERF_TEST
-	tristate "Performance tests for read-side synchronization (RCU and others)"
+config RCU_REF_SCALE_TEST
+	tristate "Scalability tests for read-side synchronization (RCU and others)"
 	depends on DEBUG_KERNEL
 	select TORTURE_TEST
 	select SRCU
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index ba7d82609cbe..45d562de279a 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -12,7 +12,7 @@ obj-$(CONFIG_TREE_SRCU) += srcutree.o
 obj-$(CONFIG_TINY_SRCU) += srcutiny.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
-obj-$(CONFIG_RCU_REF_PERF_TEST) += refperf.o
+obj-$(CONFIG_RCU_REF_SCALE_TEST) += refperf.o
 obj-$(CONFIG_TREE_RCU) += tree.o
 obj-$(CONFIG_TINY_RCU) += tiny.o
 obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o
diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 2bfdcdcb6bd1..7c980573acbe 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0+
 //
-// Performance test comparing RCU vs other mechanisms
+// Scalability test comparing RCU vs other mechanisms
 // for acquiring references on objects.
 //
 // Copyright (C) Google, 2020.
@@ -59,7 +59,7 @@ MODULE_PARM_DESC(perf_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock.");
 torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
 
 // Wait until there are multiple CPUs before starting test.
-torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_PERF_TEST) ? 10 : 0,
+torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0,
 	      "Holdoff time before test start (s)");
 // Number of loops per experiment, all readers execute operations concurrently.
 torture_param(long, loops, 10000, "Number of loops per experiment.");
@@ -656,7 +656,7 @@ ref_perf_init(void)
 		for (i = 0; i < ARRAY_SIZE(perf_ops); i++)
 			pr_cont(" %s", perf_ops[i]->name);
 		pr_cont("\n");
-		WARN_ON(!IS_MODULE(CONFIG_RCU_REF_PERF_TEST));
+		WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST));
 		firsterr = -EINVAL;
 		cur_ops = NULL;
 		goto unwind;
diff --git a/tools/testing/selftests/rcutorture/configs/refperf/CFcommon b/tools/testing/selftests/rcutorture/configs/refperf/CFcommon
index 8ba5ba207503..a98b58b54bb1 100644
--- a/tools/testing/selftests/rcutorture/configs/refperf/CFcommon
+++ b/tools/testing/selftests/rcutorture/configs/refperf/CFcommon
@@ -1,2 +1,2 @@
-CONFIG_RCU_REF_PERF_TEST=y
+CONFIG_RCU_REF_SCALE_TEST=y
 CONFIG_PRINTK_TIME=y
-- 
cgit v1.2.3


From 1fbeb3a8c4de29433a8d230ee600b13d369b6c0f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 17 Jun 2020 11:53:53 -0700
Subject: refperf: Rename refperf.c to refscale.c and change internal names

This commit further avoids conflation of refperf with the kernel's perf
feature by renaming kernel/rcu/refperf.c to kernel/rcu/refscale.c,
and also by similarly renaming the functions and variables inside
this file.  This has the side effect of changing the names of the
kernel boot parameters, so kernel-parameters.txt and ver_functions.sh
are also updated.

The rcutorture --torture type remains refperf, and this will be
addressed in a separate commit.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.txt    |  17 +-
 kernel/rcu/Makefile                                |   2 +-
 kernel/rcu/refperf.c                               | 717 ---------------------
 kernel/rcu/refscale.c                              | 717 +++++++++++++++++++++
 .../rcutorture/configs/refperf/ver_functions.sh    |   4 +-
 5 files changed, 730 insertions(+), 727 deletions(-)
 delete mode 100644 kernel/rcu/refperf.c
 create mode 100644 kernel/rcu/refscale.c

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 20cd00b78fc4..a4e4e0f6a550 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4407,13 +4407,13 @@
 			      reboot_cpu is s[mp]#### with #### being the processor
 					to be used for rebooting.
 
-	refperf.holdoff= [KNL]
+	refscale.holdoff= [KNL]
 			Set test-start holdoff period.  The purpose of
 			this parameter is to delay the start of the
 			test until boot completes in order to avoid
 			interference.
 
-	refperf.loops= [KNL]
+	refscale.loops= [KNL]
 			Set the number of loops over the synchronization
 			primitive under test.  Increasing this number
 			reduces noise due to loop start/end overhead,
@@ -4421,26 +4421,29 @@
 			noise to a handful of picoseconds on ca. 2020
 			x86 laptops.
 
-	refperf.nreaders= [KNL]
+	refscale.nreaders= [KNL]
 			Set number of readers.  The default value of -1
 			selects N, where N is roughly 75% of the number
 			of CPUs.  A value of zero is an interesting choice.
 
-	refperf.nruns= [KNL]
+	refscale.nruns= [KNL]
 			Set number of runs, each of which is dumped onto
 			the console log.
 
-	refperf.readdelay= [KNL]
+	refscale.readdelay= [KNL]
 			Set the read-side critical-section duration,
 			measured in microseconds.
 
-	refperf.shutdown= [KNL]
+	refscale.scale_type= [KNL]
+			Specify the read-protection implementation to test.
+
+	refscale.shutdown= [KNL]
 			Shut down the system at the end of the performance
 			test.  This defaults to 1 (shut it down) when
 			rcuperf is built into the kernel and to 0 (leave
 			it running) when rcuperf is built as a module.
 
-	refperf.verbose= [KNL]
+	refscale.verbose= [KNL]
 			Enable additional printk() statements.
 
 	relax_domain_level=
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 45d562de279a..95f5117ef8da 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -12,7 +12,7 @@ obj-$(CONFIG_TREE_SRCU) += srcutree.o
 obj-$(CONFIG_TINY_SRCU) += srcutiny.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
-obj-$(CONFIG_RCU_REF_SCALE_TEST) += refperf.o
+obj-$(CONFIG_RCU_REF_SCALE_TEST) += refscale.o
 obj-$(CONFIG_TREE_RCU) += tree.o
 obj-$(CONFIG_TINY_RCU) += tiny.o
 obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o
diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
deleted file mode 100644
index 7c980573acbe..000000000000
--- a/kernel/rcu/refperf.c
+++ /dev/null
@@ -1,717 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-//
-// Scalability test comparing RCU vs other mechanisms
-// for acquiring references on objects.
-//
-// Copyright (C) Google, 2020.
-//
-// Author: Joel Fernandes <joel@joelfernandes.org>
-
-#define pr_fmt(fmt) fmt
-
-#include <linux/atomic.h>
-#include <linux/bitops.h>
-#include <linux/completion.h>
-#include <linux/cpu.h>
-#include <linux/delay.h>
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/kthread.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/notifier.h>
-#include <linux/percpu.h>
-#include <linux/rcupdate.h>
-#include <linux/rcupdate_trace.h>
-#include <linux/reboot.h>
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-#include <linux/smp.h>
-#include <linux/stat.h>
-#include <linux/srcu.h>
-#include <linux/slab.h>
-#include <linux/torture.h>
-#include <linux/types.h>
-
-#include "rcu.h"
-
-#define PERF_FLAG "-ref-perf: "
-
-#define PERFOUT(s, x...) \
-	pr_alert("%s" PERF_FLAG s, perf_type, ## x)
-
-#define VERBOSE_PERFOUT(s, x...) \
-	do { if (verbose) pr_alert("%s" PERF_FLAG s, perf_type, ## x); } while (0)
-
-#define VERBOSE_PERFOUT_ERRSTRING(s, x...) \
-	do { if (verbose) pr_alert("%s" PERF_FLAG "!!! " s, perf_type, ## x); } while (0)
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>");
-
-static char *perf_type = "rcu";
-module_param(perf_type, charp, 0444);
-MODULE_PARM_DESC(perf_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock.");
-
-torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
-
-// Wait until there are multiple CPUs before starting test.
-torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0,
-	      "Holdoff time before test start (s)");
-// Number of loops per experiment, all readers execute operations concurrently.
-torture_param(long, loops, 10000, "Number of loops per experiment.");
-// Number of readers, with -1 defaulting to about 75% of the CPUs.
-torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs.");
-// Number of runs.
-torture_param(int, nruns, 30, "Number of experiments to run.");
-// Reader delay in nanoseconds, 0 for no delay.
-torture_param(int, readdelay, 0, "Read-side delay in nanoseconds.");
-
-#ifdef MODULE
-# define REFPERF_SHUTDOWN 0
-#else
-# define REFPERF_SHUTDOWN 1
-#endif
-
-torture_param(bool, shutdown, REFPERF_SHUTDOWN,
-	      "Shutdown at end of performance tests.");
-
-struct reader_task {
-	struct task_struct *task;
-	int start_reader;
-	wait_queue_head_t wq;
-	u64 last_duration_ns;
-};
-
-static struct task_struct *shutdown_task;
-static wait_queue_head_t shutdown_wq;
-
-static struct task_struct *main_task;
-static wait_queue_head_t main_wq;
-static int shutdown_start;
-
-static struct reader_task *reader_tasks;
-
-// Number of readers that are part of the current experiment.
-static atomic_t nreaders_exp;
-
-// Use to wait for all threads to start.
-static atomic_t n_init;
-static atomic_t n_started;
-static atomic_t n_warmedup;
-static atomic_t n_cooleddown;
-
-// Track which experiment is currently running.
-static int exp_idx;
-
-// Operations vector for selecting different types of tests.
-struct ref_perf_ops {
-	void (*init)(void);
-	void (*cleanup)(void);
-	void (*readsection)(const int nloops);
-	void (*delaysection)(const int nloops, const int udl, const int ndl);
-	const char *name;
-};
-
-static struct ref_perf_ops *cur_ops;
-
-static void un_delay(const int udl, const int ndl)
-{
-	if (udl)
-		udelay(udl);
-	if (ndl)
-		ndelay(ndl);
-}
-
-static void ref_rcu_read_section(const int nloops)
-{
-	int i;
-
-	for (i = nloops; i >= 0; i--) {
-		rcu_read_lock();
-		rcu_read_unlock();
-	}
-}
-
-static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl)
-{
-	int i;
-
-	for (i = nloops; i >= 0; i--) {
-		rcu_read_lock();
-		un_delay(udl, ndl);
-		rcu_read_unlock();
-	}
-}
-
-static void rcu_sync_perf_init(void)
-{
-}
-
-static struct ref_perf_ops rcu_ops = {
-	.init		= rcu_sync_perf_init,
-	.readsection	= ref_rcu_read_section,
-	.delaysection	= ref_rcu_delay_section,
-	.name		= "rcu"
-};
-
-// Definitions for SRCU ref perf testing.
-DEFINE_STATIC_SRCU(srcu_refctl_perf);
-static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf;
-
-static void srcu_ref_perf_read_section(const int nloops)
-{
-	int i;
-	int idx;
-
-	for (i = nloops; i >= 0; i--) {
-		idx = srcu_read_lock(srcu_ctlp);
-		srcu_read_unlock(srcu_ctlp, idx);
-	}
-}
-
-static void srcu_ref_perf_delay_section(const int nloops, const int udl, const int ndl)
-{
-	int i;
-	int idx;
-
-	for (i = nloops; i >= 0; i--) {
-		idx = srcu_read_lock(srcu_ctlp);
-		un_delay(udl, ndl);
-		srcu_read_unlock(srcu_ctlp, idx);
-	}
-}
-
-static struct ref_perf_ops srcu_ops = {
-	.init		= rcu_sync_perf_init,
-	.readsection	= srcu_ref_perf_read_section,
-	.delaysection	= srcu_ref_perf_delay_section,
-	.name		= "srcu"
-};
-
-// Definitions for RCU Tasks ref perf testing: Empty read markers.
-// These definitions also work for RCU Rude readers.
-static void rcu_tasks_ref_perf_read_section(const int nloops)
-{
-	int i;
-
-	for (i = nloops; i >= 0; i--)
-		continue;
-}
-
-static void rcu_tasks_ref_perf_delay_section(const int nloops, const int udl, const int ndl)
-{
-	int i;
-
-	for (i = nloops; i >= 0; i--)
-		un_delay(udl, ndl);
-}
-
-static struct ref_perf_ops rcu_tasks_ops = {
-	.init		= rcu_sync_perf_init,
-	.readsection	= rcu_tasks_ref_perf_read_section,
-	.delaysection	= rcu_tasks_ref_perf_delay_section,
-	.name		= "rcu-tasks"
-};
-
-// Definitions for RCU Tasks Trace ref perf testing.
-static void rcu_trace_ref_perf_read_section(const int nloops)
-{
-	int i;
-
-	for (i = nloops; i >= 0; i--) {
-		rcu_read_lock_trace();
-		rcu_read_unlock_trace();
-	}
-}
-
-static void rcu_trace_ref_perf_delay_section(const int nloops, const int udl, const int ndl)
-{
-	int i;
-
-	for (i = nloops; i >= 0; i--) {
-		rcu_read_lock_trace();
-		un_delay(udl, ndl);
-		rcu_read_unlock_trace();
-	}
-}
-
-static struct ref_perf_ops rcu_trace_ops = {
-	.init		= rcu_sync_perf_init,
-	.readsection	= rcu_trace_ref_perf_read_section,
-	.delaysection	= rcu_trace_ref_perf_delay_section,
-	.name		= "rcu-trace"
-};
-
-// Definitions for reference count
-static atomic_t refcnt;
-
-static void ref_refcnt_section(const int nloops)
-{
-	int i;
-
-	for (i = nloops; i >= 0; i--) {
-		atomic_inc(&refcnt);
-		atomic_dec(&refcnt);
-	}
-}
-
-static void ref_refcnt_delay_section(const int nloops, const int udl, const int ndl)
-{
-	int i;
-
-	for (i = nloops; i >= 0; i--) {
-		atomic_inc(&refcnt);
-		un_delay(udl, ndl);
-		atomic_dec(&refcnt);
-	}
-}
-
-static struct ref_perf_ops refcnt_ops = {
-	.init		= rcu_sync_perf_init,
-	.readsection	= ref_refcnt_section,
-	.delaysection	= ref_refcnt_delay_section,
-	.name		= "refcnt"
-};
-
-// Definitions for rwlock
-static rwlock_t test_rwlock;
-
-static void ref_rwlock_init(void)
-{
-	rwlock_init(&test_rwlock);
-}
-
-static void ref_rwlock_section(const int nloops)
-{
-	int i;
-
-	for (i = nloops; i >= 0; i--) {
-		read_lock(&test_rwlock);
-		read_unlock(&test_rwlock);
-	}
-}
-
-static void ref_rwlock_delay_section(const int nloops, const int udl, const int ndl)
-{
-	int i;
-
-	for (i = nloops; i >= 0; i--) {
-		read_lock(&test_rwlock);
-		un_delay(udl, ndl);
-		read_unlock(&test_rwlock);
-	}
-}
-
-static struct ref_perf_ops rwlock_ops = {
-	.init		= ref_rwlock_init,
-	.readsection	= ref_rwlock_section,
-	.delaysection	= ref_rwlock_delay_section,
-	.name		= "rwlock"
-};
-
-// Definitions for rwsem
-static struct rw_semaphore test_rwsem;
-
-static void ref_rwsem_init(void)
-{
-	init_rwsem(&test_rwsem);
-}
-
-static void ref_rwsem_section(const int nloops)
-{
-	int i;
-
-	for (i = nloops; i >= 0; i--) {
-		down_read(&test_rwsem);
-		up_read(&test_rwsem);
-	}
-}
-
-static void ref_rwsem_delay_section(const int nloops, const int udl, const int ndl)
-{
-	int i;
-
-	for (i = nloops; i >= 0; i--) {
-		down_read(&test_rwsem);
-		un_delay(udl, ndl);
-		up_read(&test_rwsem);
-	}
-}
-
-static struct ref_perf_ops rwsem_ops = {
-	.init		= ref_rwsem_init,
-	.readsection	= ref_rwsem_section,
-	.delaysection	= ref_rwsem_delay_section,
-	.name		= "rwsem"
-};
-
-static void rcu_perf_one_reader(void)
-{
-	if (readdelay <= 0)
-		cur_ops->readsection(loops);
-	else
-		cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000);
-}
-
-// Reader kthread.  Repeatedly does empty RCU read-side
-// critical section, minimizing update-side interference.
-static int
-ref_perf_reader(void *arg)
-{
-	unsigned long flags;
-	long me = (long)arg;
-	struct reader_task *rt = &(reader_tasks[me]);
-	u64 start;
-	s64 duration;
-
-	VERBOSE_PERFOUT("ref_perf_reader %ld: task started", me);
-	set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
-	set_user_nice(current, MAX_NICE);
-	atomic_inc(&n_init);
-	if (holdoff)
-		schedule_timeout_interruptible(holdoff * HZ);
-repeat:
-	VERBOSE_PERFOUT("ref_perf_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id());
-
-	// Wait for signal that this reader can start.
-	wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) ||
-			   torture_must_stop());
-
-	if (torture_must_stop())
-		goto end;
-
-	// Make sure that the CPU is affinitized appropriately during testing.
-	WARN_ON_ONCE(smp_processor_id() != me);
-
-	WRITE_ONCE(rt->start_reader, 0);
-	if (!atomic_dec_return(&n_started))
-		while (atomic_read_acquire(&n_started))
-			cpu_relax();
-
-	VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx);
-
-
-	// To reduce noise, do an initial cache-warming invocation, check
-	// in, and then keep warming until everyone has checked in.
-	rcu_perf_one_reader();
-	if (!atomic_dec_return(&n_warmedup))
-		while (atomic_read_acquire(&n_warmedup))
-			rcu_perf_one_reader();
-	// Also keep interrupts disabled.  This also has the effect
-	// of preventing entries into slow path for rcu_read_unlock().
-	local_irq_save(flags);
-	start = ktime_get_mono_fast_ns();
-
-	rcu_perf_one_reader();
-
-	duration = ktime_get_mono_fast_ns() - start;
-	local_irq_restore(flags);
-
-	rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration;
-	// To reduce runtime-skew noise, do maintain-load invocations until
-	// everyone is done.
-	if (!atomic_dec_return(&n_cooleddown))
-		while (atomic_read_acquire(&n_cooleddown))
-			rcu_perf_one_reader();
-
-	if (atomic_dec_and_test(&nreaders_exp))
-		wake_up(&main_wq);
-
-	VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d ended, (readers remaining=%d)",
-			me, exp_idx, atomic_read(&nreaders_exp));
-
-	if (!torture_must_stop())
-		goto repeat;
-end:
-	torture_kthread_stopping("ref_perf_reader");
-	return 0;
-}
-
-static void reset_readers(void)
-{
-	int i;
-	struct reader_task *rt;
-
-	for (i = 0; i < nreaders; i++) {
-		rt = &(reader_tasks[i]);
-
-		rt->last_duration_ns = 0;
-	}
-}
-
-// Print the results of each reader and return the sum of all their durations.
-static u64 process_durations(int n)
-{
-	int i;
-	struct reader_task *rt;
-	char buf1[64];
-	char *buf;
-	u64 sum = 0;
-
-	buf = kmalloc(128 + nreaders * 32, GFP_KERNEL);
-	if (!buf)
-		return 0;
-	buf[0] = 0;
-	sprintf(buf, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)",
-		exp_idx);
-
-	for (i = 0; i < n && !torture_must_stop(); i++) {
-		rt = &(reader_tasks[i]);
-		sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns);
-
-		if (i % 5 == 0)
-			strcat(buf, "\n");
-		strcat(buf, buf1);
-
-		sum += rt->last_duration_ns;
-	}
-	strcat(buf, "\n");
-
-	PERFOUT("%s\n", buf);
-
-	kfree(buf);
-	return sum;
-}
-
-// The main_func is the main orchestrator, it performs a bunch of
-// experiments.  For every experiment, it orders all the readers
-// involved to start and waits for them to finish the experiment. It
-// then reads their timestamps and starts the next experiment. Each
-// experiment progresses from 1 concurrent reader to N of them at which
-// point all the timestamps are printed.
-static int main_func(void *arg)
-{
-	bool errexit = false;
-	int exp, r;
-	char buf1[64];
-	char *buf;
-	u64 *result_avg;
-
-	set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids));
-	set_user_nice(current, MAX_NICE);
-
-	VERBOSE_PERFOUT("main_func task started");
-	result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL);
-	buf = kzalloc(64 + nruns * 32, GFP_KERNEL);
-	if (!result_avg || !buf) {
-		VERBOSE_PERFOUT_ERRSTRING("out of memory");
-		errexit = true;
-	}
-	if (holdoff)
-		schedule_timeout_interruptible(holdoff * HZ);
-
-	// Wait for all threads to start.
-	atomic_inc(&n_init);
-	while (atomic_read(&n_init) < nreaders + 1)
-		schedule_timeout_uninterruptible(1);
-
-	// Start exp readers up per experiment
-	for (exp = 0; exp < nruns && !torture_must_stop(); exp++) {
-		if (errexit)
-			break;
-		if (torture_must_stop())
-			goto end;
-
-		reset_readers();
-		atomic_set(&nreaders_exp, nreaders);
-		atomic_set(&n_started, nreaders);
-		atomic_set(&n_warmedup, nreaders);
-		atomic_set(&n_cooleddown, nreaders);
-
-		exp_idx = exp;
-
-		for (r = 0; r < nreaders; r++) {
-			smp_store_release(&reader_tasks[r].start_reader, 1);
-			wake_up(&reader_tasks[r].wq);
-		}
-
-		VERBOSE_PERFOUT("main_func: experiment started, waiting for %d readers",
-				nreaders);
-
-		wait_event(main_wq,
-			   !atomic_read(&nreaders_exp) || torture_must_stop());
-
-		VERBOSE_PERFOUT("main_func: experiment ended");
-
-		if (torture_must_stop())
-			goto end;
-
-		result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops);
-	}
-
-	// Print the average of all experiments
-	PERFOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n");
-
-	buf[0] = 0;
-	strcat(buf, "\n");
-	strcat(buf, "Runs\tTime(ns)\n");
-
-	for (exp = 0; exp < nruns; exp++) {
-		u64 avg;
-		u32 rem;
-
-		if (errexit)
-			break;
-		avg = div_u64_rem(result_avg[exp], 1000, &rem);
-		sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem);
-		strcat(buf, buf1);
-	}
-
-	if (!errexit)
-		PERFOUT("%s", buf);
-
-	// This will shutdown everything including us.
-	if (shutdown) {
-		shutdown_start = 1;
-		wake_up(&shutdown_wq);
-	}
-
-	// Wait for torture to stop us
-	while (!torture_must_stop())
-		schedule_timeout_uninterruptible(1);
-
-end:
-	torture_kthread_stopping("main_func");
-	kfree(result_avg);
-	kfree(buf);
-	return 0;
-}
-
-static void
-ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag)
-{
-	pr_alert("%s" PERF_FLAG
-		 "--- %s:  verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", perf_type, tag,
-		 verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay);
-}
-
-static void
-ref_perf_cleanup(void)
-{
-	int i;
-
-	if (torture_cleanup_begin())
-		return;
-
-	if (!cur_ops) {
-		torture_cleanup_end();
-		return;
-	}
-
-	if (reader_tasks) {
-		for (i = 0; i < nreaders; i++)
-			torture_stop_kthread("ref_perf_reader",
-					     reader_tasks[i].task);
-	}
-	kfree(reader_tasks);
-
-	torture_stop_kthread("main_task", main_task);
-	kfree(main_task);
-
-	// Do perf-type-specific cleanup operations.
-	if (cur_ops->cleanup != NULL)
-		cur_ops->cleanup();
-
-	torture_cleanup_end();
-}
-
-// Shutdown kthread.  Just waits to be awakened, then shuts down system.
-static int
-ref_perf_shutdown(void *arg)
-{
-	wait_event(shutdown_wq, shutdown_start);
-
-	smp_mb(); // Wake before output.
-	ref_perf_cleanup();
-	kernel_power_off();
-
-	return -EINVAL;
-}
-
-static int __init
-ref_perf_init(void)
-{
-	long i;
-	int firsterr = 0;
-	static struct ref_perf_ops *perf_ops[] = {
-		&rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops,
-		&refcnt_ops, &rwlock_ops, &rwsem_ops,
-	};
-
-	if (!torture_init_begin(perf_type, verbose))
-		return -EBUSY;
-
-	for (i = 0; i < ARRAY_SIZE(perf_ops); i++) {
-		cur_ops = perf_ops[i];
-		if (strcmp(perf_type, cur_ops->name) == 0)
-			break;
-	}
-	if (i == ARRAY_SIZE(perf_ops)) {
-		pr_alert("rcu-perf: invalid perf type: \"%s\"\n", perf_type);
-		pr_alert("rcu-perf types:");
-		for (i = 0; i < ARRAY_SIZE(perf_ops); i++)
-			pr_cont(" %s", perf_ops[i]->name);
-		pr_cont("\n");
-		WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST));
-		firsterr = -EINVAL;
-		cur_ops = NULL;
-		goto unwind;
-	}
-	if (cur_ops->init)
-		cur_ops->init();
-
-	ref_perf_print_module_parms(cur_ops, "Start of test");
-
-	// Shutdown task
-	if (shutdown) {
-		init_waitqueue_head(&shutdown_wq);
-		firsterr = torture_create_kthread(ref_perf_shutdown, NULL,
-						  shutdown_task);
-		if (firsterr)
-			goto unwind;
-		schedule_timeout_uninterruptible(1);
-	}
-
-	// Reader tasks (default to ~75% of online CPUs).
-	if (nreaders < 0)
-		nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2);
-	reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]),
-			       GFP_KERNEL);
-	if (!reader_tasks) {
-		VERBOSE_PERFOUT_ERRSTRING("out of memory");
-		firsterr = -ENOMEM;
-		goto unwind;
-	}
-
-	VERBOSE_PERFOUT("Starting %d reader threads\n", nreaders);
-
-	for (i = 0; i < nreaders; i++) {
-		firsterr = torture_create_kthread(ref_perf_reader, (void *)i,
-						  reader_tasks[i].task);
-		if (firsterr)
-			goto unwind;
-
-		init_waitqueue_head(&(reader_tasks[i].wq));
-	}
-
-	// Main Task
-	init_waitqueue_head(&main_wq);
-	firsterr = torture_create_kthread(main_func, NULL, main_task);
-	if (firsterr)
-		goto unwind;
-
-	torture_init_end();
-	return 0;
-
-unwind:
-	torture_init_end();
-	ref_perf_cleanup();
-	return firsterr;
-}
-
-module_init(ref_perf_init);
-module_exit(ref_perf_cleanup);
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
new file mode 100644
index 000000000000..d9291f883b54
--- /dev/null
+++ b/kernel/rcu/refscale.c
@@ -0,0 +1,717 @@
+// SPDX-License-Identifier: GPL-2.0+
+//
+// Scalability test comparing RCU vs other mechanisms
+// for acquiring references on objects.
+//
+// Copyright (C) Google, 2020.
+//
+// Author: Joel Fernandes <joel@joelfernandes.org>
+
+#define pr_fmt(fmt) fmt
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kthread.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/notifier.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/stat.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/torture.h>
+#include <linux/types.h>
+
+#include "rcu.h"
+
+#define SCALE_FLAG "-ref-scale: "
+
+#define SCALEOUT(s, x...) \
+	pr_alert("%s" SCALE_FLAG s, scale_type, ## x)
+
+#define VERBOSE_SCALEOUT(s, x...) \
+	do { if (verbose) pr_alert("%s" SCALE_FLAG s, scale_type, ## x); } while (0)
+
+#define VERBOSE_SCALEOUT_ERRSTRING(s, x...) \
+	do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! " s, scale_type, ## x); } while (0)
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>");
+
+static char *scale_type = "rcu";
+module_param(scale_type, charp, 0444);
+MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock.");
+
+torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
+
+// Wait until there are multiple CPUs before starting test.
+torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0,
+	      "Holdoff time before test start (s)");
+// Number of loops per experiment, all readers execute operations concurrently.
+torture_param(long, loops, 10000, "Number of loops per experiment.");
+// Number of readers, with -1 defaulting to about 75% of the CPUs.
+torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs.");
+// Number of runs.
+torture_param(int, nruns, 30, "Number of experiments to run.");
+// Reader delay in nanoseconds, 0 for no delay.
+torture_param(int, readdelay, 0, "Read-side delay in nanoseconds.");
+
+#ifdef MODULE
+# define REFSCALE_SHUTDOWN 0
+#else
+# define REFSCALE_SHUTDOWN 1
+#endif
+
+torture_param(bool, shutdown, REFSCALE_SHUTDOWN,
+	      "Shutdown at end of scalability tests.");
+
+struct reader_task {
+	struct task_struct *task;
+	int start_reader;
+	wait_queue_head_t wq;
+	u64 last_duration_ns;
+};
+
+static struct task_struct *shutdown_task;
+static wait_queue_head_t shutdown_wq;
+
+static struct task_struct *main_task;
+static wait_queue_head_t main_wq;
+static int shutdown_start;
+
+static struct reader_task *reader_tasks;
+
+// Number of readers that are part of the current experiment.
+static atomic_t nreaders_exp;
+
+// Use to wait for all threads to start.
+static atomic_t n_init;
+static atomic_t n_started;
+static atomic_t n_warmedup;
+static atomic_t n_cooleddown;
+
+// Track which experiment is currently running.
+static int exp_idx;
+
+// Operations vector for selecting different types of tests.
+struct ref_scale_ops {
+	void (*init)(void);
+	void (*cleanup)(void);
+	void (*readsection)(const int nloops);
+	void (*delaysection)(const int nloops, const int udl, const int ndl);
+	const char *name;
+};
+
+static struct ref_scale_ops *cur_ops;
+
+static void un_delay(const int udl, const int ndl)
+{
+	if (udl)
+		udelay(udl);
+	if (ndl)
+		ndelay(ndl);
+}
+
+static void ref_rcu_read_section(const int nloops)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--) {
+		rcu_read_lock();
+		rcu_read_unlock();
+	}
+}
+
+static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--) {
+		rcu_read_lock();
+		un_delay(udl, ndl);
+		rcu_read_unlock();
+	}
+}
+
+static void rcu_sync_scale_init(void)
+{
+}
+
+static struct ref_scale_ops rcu_ops = {
+	.init		= rcu_sync_scale_init,
+	.readsection	= ref_rcu_read_section,
+	.delaysection	= ref_rcu_delay_section,
+	.name		= "rcu"
+};
+
+// Definitions for SRCU ref scale testing.
+DEFINE_STATIC_SRCU(srcu_refctl_scale);
+static struct srcu_struct *srcu_ctlp = &srcu_refctl_scale;
+
+static void srcu_ref_scale_read_section(const int nloops)
+{
+	int i;
+	int idx;
+
+	for (i = nloops; i >= 0; i--) {
+		idx = srcu_read_lock(srcu_ctlp);
+		srcu_read_unlock(srcu_ctlp, idx);
+	}
+}
+
+static void srcu_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+	int i;
+	int idx;
+
+	for (i = nloops; i >= 0; i--) {
+		idx = srcu_read_lock(srcu_ctlp);
+		un_delay(udl, ndl);
+		srcu_read_unlock(srcu_ctlp, idx);
+	}
+}
+
+static struct ref_scale_ops srcu_ops = {
+	.init		= rcu_sync_scale_init,
+	.readsection	= srcu_ref_scale_read_section,
+	.delaysection	= srcu_ref_scale_delay_section,
+	.name		= "srcu"
+};
+
+// Definitions for RCU Tasks ref scale testing: Empty read markers.
+// These definitions also work for RCU Rude readers.
+static void rcu_tasks_ref_scale_read_section(const int nloops)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--)
+		continue;
+}
+
+static void rcu_tasks_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--)
+		un_delay(udl, ndl);
+}
+
+static struct ref_scale_ops rcu_tasks_ops = {
+	.init		= rcu_sync_scale_init,
+	.readsection	= rcu_tasks_ref_scale_read_section,
+	.delaysection	= rcu_tasks_ref_scale_delay_section,
+	.name		= "rcu-tasks"
+};
+
+// Definitions for RCU Tasks Trace ref scale testing.
+static void rcu_trace_ref_scale_read_section(const int nloops)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--) {
+		rcu_read_lock_trace();
+		rcu_read_unlock_trace();
+	}
+}
+
+static void rcu_trace_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--) {
+		rcu_read_lock_trace();
+		un_delay(udl, ndl);
+		rcu_read_unlock_trace();
+	}
+}
+
+static struct ref_scale_ops rcu_trace_ops = {
+	.init		= rcu_sync_scale_init,
+	.readsection	= rcu_trace_ref_scale_read_section,
+	.delaysection	= rcu_trace_ref_scale_delay_section,
+	.name		= "rcu-trace"
+};
+
+// Definitions for reference count
+static atomic_t refcnt;
+
+static void ref_refcnt_section(const int nloops)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--) {
+		atomic_inc(&refcnt);
+		atomic_dec(&refcnt);
+	}
+}
+
+static void ref_refcnt_delay_section(const int nloops, const int udl, const int ndl)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--) {
+		atomic_inc(&refcnt);
+		un_delay(udl, ndl);
+		atomic_dec(&refcnt);
+	}
+}
+
+static struct ref_scale_ops refcnt_ops = {
+	.init		= rcu_sync_scale_init,
+	.readsection	= ref_refcnt_section,
+	.delaysection	= ref_refcnt_delay_section,
+	.name		= "refcnt"
+};
+
+// Definitions for rwlock
+static rwlock_t test_rwlock;
+
+static void ref_rwlock_init(void)
+{
+	rwlock_init(&test_rwlock);
+}
+
+static void ref_rwlock_section(const int nloops)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--) {
+		read_lock(&test_rwlock);
+		read_unlock(&test_rwlock);
+	}
+}
+
+static void ref_rwlock_delay_section(const int nloops, const int udl, const int ndl)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--) {
+		read_lock(&test_rwlock);
+		un_delay(udl, ndl);
+		read_unlock(&test_rwlock);
+	}
+}
+
+static struct ref_scale_ops rwlock_ops = {
+	.init		= ref_rwlock_init,
+	.readsection	= ref_rwlock_section,
+	.delaysection	= ref_rwlock_delay_section,
+	.name		= "rwlock"
+};
+
+// Definitions for rwsem
+static struct rw_semaphore test_rwsem;
+
+static void ref_rwsem_init(void)
+{
+	init_rwsem(&test_rwsem);
+}
+
+static void ref_rwsem_section(const int nloops)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--) {
+		down_read(&test_rwsem);
+		up_read(&test_rwsem);
+	}
+}
+
+static void ref_rwsem_delay_section(const int nloops, const int udl, const int ndl)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--) {
+		down_read(&test_rwsem);
+		un_delay(udl, ndl);
+		up_read(&test_rwsem);
+	}
+}
+
+static struct ref_scale_ops rwsem_ops = {
+	.init		= ref_rwsem_init,
+	.readsection	= ref_rwsem_section,
+	.delaysection	= ref_rwsem_delay_section,
+	.name		= "rwsem"
+};
+
+static void rcu_scale_one_reader(void)
+{
+	if (readdelay <= 0)
+		cur_ops->readsection(loops);
+	else
+		cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000);
+}
+
+// Reader kthread.  Repeatedly does empty RCU read-side
+// critical section, minimizing update-side interference.
+static int
+ref_scale_reader(void *arg)
+{
+	unsigned long flags;
+	long me = (long)arg;
+	struct reader_task *rt = &(reader_tasks[me]);
+	u64 start;
+	s64 duration;
+
+	VERBOSE_SCALEOUT("ref_scale_reader %ld: task started", me);
+	set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+	set_user_nice(current, MAX_NICE);
+	atomic_inc(&n_init);
+	if (holdoff)
+		schedule_timeout_interruptible(holdoff * HZ);
+repeat:
+	VERBOSE_SCALEOUT("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id());
+
+	// Wait for signal that this reader can start.
+	wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) ||
+			   torture_must_stop());
+
+	if (torture_must_stop())
+		goto end;
+
+	// Make sure that the CPU is affinitized appropriately during testing.
+	WARN_ON_ONCE(smp_processor_id() != me);
+
+	WRITE_ONCE(rt->start_reader, 0);
+	if (!atomic_dec_return(&n_started))
+		while (atomic_read_acquire(&n_started))
+			cpu_relax();
+
+	VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d started", me, exp_idx);
+
+
+	// To reduce noise, do an initial cache-warming invocation, check
+	// in, and then keep warming until everyone has checked in.
+	rcu_scale_one_reader();
+	if (!atomic_dec_return(&n_warmedup))
+		while (atomic_read_acquire(&n_warmedup))
+			rcu_scale_one_reader();
+	// Also keep interrupts disabled.  This also has the effect
+	// of preventing entries into slow path for rcu_read_unlock().
+	local_irq_save(flags);
+	start = ktime_get_mono_fast_ns();
+
+	rcu_scale_one_reader();
+
+	duration = ktime_get_mono_fast_ns() - start;
+	local_irq_restore(flags);
+
+	rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration;
+	// To reduce runtime-skew noise, do maintain-load invocations until
+	// everyone is done.
+	if (!atomic_dec_return(&n_cooleddown))
+		while (atomic_read_acquire(&n_cooleddown))
+			rcu_scale_one_reader();
+
+	if (atomic_dec_and_test(&nreaders_exp))
+		wake_up(&main_wq);
+
+	VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)",
+			me, exp_idx, atomic_read(&nreaders_exp));
+
+	if (!torture_must_stop())
+		goto repeat;
+end:
+	torture_kthread_stopping("ref_scale_reader");
+	return 0;
+}
+
+static void reset_readers(void)
+{
+	int i;
+	struct reader_task *rt;
+
+	for (i = 0; i < nreaders; i++) {
+		rt = &(reader_tasks[i]);
+
+		rt->last_duration_ns = 0;
+	}
+}
+
+// Print the results of each reader and return the sum of all their durations.
+static u64 process_durations(int n)
+{
+	int i;
+	struct reader_task *rt;
+	char buf1[64];
+	char *buf;
+	u64 sum = 0;
+
+	buf = kmalloc(128 + nreaders * 32, GFP_KERNEL);
+	if (!buf)
+		return 0;
+	buf[0] = 0;
+	sprintf(buf, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)",
+		exp_idx);
+
+	for (i = 0; i < n && !torture_must_stop(); i++) {
+		rt = &(reader_tasks[i]);
+		sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns);
+
+		if (i % 5 == 0)
+			strcat(buf, "\n");
+		strcat(buf, buf1);
+
+		sum += rt->last_duration_ns;
+	}
+	strcat(buf, "\n");
+
+	SCALEOUT("%s\n", buf);
+
+	kfree(buf);
+	return sum;
+}
+
+// The main_func is the main orchestrator, it performs a bunch of
+// experiments.  For every experiment, it orders all the readers
+// involved to start and waits for them to finish the experiment. It
+// then reads their timestamps and starts the next experiment. Each
+// experiment progresses from 1 concurrent reader to N of them at which
+// point all the timestamps are printed.
+static int main_func(void *arg)
+{
+	bool errexit = false;
+	int exp, r;
+	char buf1[64];
+	char *buf;
+	u64 *result_avg;
+
+	set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids));
+	set_user_nice(current, MAX_NICE);
+
+	VERBOSE_SCALEOUT("main_func task started");
+	result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL);
+	buf = kzalloc(64 + nruns * 32, GFP_KERNEL);
+	if (!result_avg || !buf) {
+		VERBOSE_SCALEOUT_ERRSTRING("out of memory");
+		errexit = true;
+	}
+	if (holdoff)
+		schedule_timeout_interruptible(holdoff * HZ);
+
+	// Wait for all threads to start.
+	atomic_inc(&n_init);
+	while (atomic_read(&n_init) < nreaders + 1)
+		schedule_timeout_uninterruptible(1);
+
+	// Start exp readers up per experiment
+	for (exp = 0; exp < nruns && !torture_must_stop(); exp++) {
+		if (errexit)
+			break;
+		if (torture_must_stop())
+			goto end;
+
+		reset_readers();
+		atomic_set(&nreaders_exp, nreaders);
+		atomic_set(&n_started, nreaders);
+		atomic_set(&n_warmedup, nreaders);
+		atomic_set(&n_cooleddown, nreaders);
+
+		exp_idx = exp;
+
+		for (r = 0; r < nreaders; r++) {
+			smp_store_release(&reader_tasks[r].start_reader, 1);
+			wake_up(&reader_tasks[r].wq);
+		}
+
+		VERBOSE_SCALEOUT("main_func: experiment started, waiting for %d readers",
+				nreaders);
+
+		wait_event(main_wq,
+			   !atomic_read(&nreaders_exp) || torture_must_stop());
+
+		VERBOSE_SCALEOUT("main_func: experiment ended");
+
+		if (torture_must_stop())
+			goto end;
+
+		result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops);
+	}
+
+	// Print the average of all experiments
+	SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n");
+
+	buf[0] = 0;
+	strcat(buf, "\n");
+	strcat(buf, "Runs\tTime(ns)\n");
+
+	for (exp = 0; exp < nruns; exp++) {
+		u64 avg;
+		u32 rem;
+
+		if (errexit)
+			break;
+		avg = div_u64_rem(result_avg[exp], 1000, &rem);
+		sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem);
+		strcat(buf, buf1);
+	}
+
+	if (!errexit)
+		SCALEOUT("%s", buf);
+
+	// This will shutdown everything including us.
+	if (shutdown) {
+		shutdown_start = 1;
+		wake_up(&shutdown_wq);
+	}
+
+	// Wait for torture to stop us
+	while (!torture_must_stop())
+		schedule_timeout_uninterruptible(1);
+
+end:
+	torture_kthread_stopping("main_func");
+	kfree(result_avg);
+	kfree(buf);
+	return 0;
+}
+
+static void
+ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag)
+{
+	pr_alert("%s" SCALE_FLAG
+		 "--- %s:  verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
+		 verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay);
+}
+
+static void
+ref_scale_cleanup(void)
+{
+	int i;
+
+	if (torture_cleanup_begin())
+		return;
+
+	if (!cur_ops) {
+		torture_cleanup_end();
+		return;
+	}
+
+	if (reader_tasks) {
+		for (i = 0; i < nreaders; i++)
+			torture_stop_kthread("ref_scale_reader",
+					     reader_tasks[i].task);
+	}
+	kfree(reader_tasks);
+
+	torture_stop_kthread("main_task", main_task);
+	kfree(main_task);
+
+	// Do scale-type-specific cleanup operations.
+	if (cur_ops->cleanup != NULL)
+		cur_ops->cleanup();
+
+	torture_cleanup_end();
+}
+
+// Shutdown kthread.  Just waits to be awakened, then shuts down system.
+static int
+ref_scale_shutdown(void *arg)
+{
+	wait_event(shutdown_wq, shutdown_start);
+
+	smp_mb(); // Wake before output.
+	ref_scale_cleanup();
+	kernel_power_off();
+
+	return -EINVAL;
+}
+
+static int __init
+ref_scale_init(void)
+{
+	long i;
+	int firsterr = 0;
+	static struct ref_scale_ops *scale_ops[] = {
+		&rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops,
+		&refcnt_ops, &rwlock_ops, &rwsem_ops,
+	};
+
+	if (!torture_init_begin(scale_type, verbose))
+		return -EBUSY;
+
+	for (i = 0; i < ARRAY_SIZE(scale_ops); i++) {
+		cur_ops = scale_ops[i];
+		if (strcmp(scale_type, cur_ops->name) == 0)
+			break;
+	}
+	if (i == ARRAY_SIZE(scale_ops)) {
+		pr_alert("rcu-scale: invalid scale type: \"%s\"\n", scale_type);
+		pr_alert("rcu-scale types:");
+		for (i = 0; i < ARRAY_SIZE(scale_ops); i++)
+			pr_cont(" %s", scale_ops[i]->name);
+		pr_cont("\n");
+		WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST));
+		firsterr = -EINVAL;
+		cur_ops = NULL;
+		goto unwind;
+	}
+	if (cur_ops->init)
+		cur_ops->init();
+
+	ref_scale_print_module_parms(cur_ops, "Start of test");
+
+	// Shutdown task
+	if (shutdown) {
+		init_waitqueue_head(&shutdown_wq);
+		firsterr = torture_create_kthread(ref_scale_shutdown, NULL,
+						  shutdown_task);
+		if (firsterr)
+			goto unwind;
+		schedule_timeout_uninterruptible(1);
+	}
+
+	// Reader tasks (default to ~75% of online CPUs).
+	if (nreaders < 0)
+		nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2);
+	reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]),
+			       GFP_KERNEL);
+	if (!reader_tasks) {
+		VERBOSE_SCALEOUT_ERRSTRING("out of memory");
+		firsterr = -ENOMEM;
+		goto unwind;
+	}
+
+	VERBOSE_SCALEOUT("Starting %d reader threads\n", nreaders);
+
+	for (i = 0; i < nreaders; i++) {
+		firsterr = torture_create_kthread(ref_scale_reader, (void *)i,
+						  reader_tasks[i].task);
+		if (firsterr)
+			goto unwind;
+
+		init_waitqueue_head(&(reader_tasks[i].wq));
+	}
+
+	// Main Task
+	init_waitqueue_head(&main_wq);
+	firsterr = torture_create_kthread(main_func, NULL, main_task);
+	if (firsterr)
+		goto unwind;
+
+	torture_init_end();
+	return 0;
+
+unwind:
+	torture_init_end();
+	ref_scale_cleanup();
+	return firsterr;
+}
+
+module_init(ref_scale_init);
+module_exit(ref_scale_cleanup);
diff --git a/tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh
index 489f05dd929a..321e82641287 100644
--- a/tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh
+++ b/tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh
@@ -11,6 +11,6 @@
 #
 # Adds per-version torture-module parameters to kernels supporting them.
 per_version_boot_params () {
-	echo $1 refperf.shutdown=1 \
-		refperf.verbose=1
+	echo $1 refscale.shutdown=1 \
+		refscale.verbose=1
 }
-- 
cgit v1.2.3


From 7fef6cff8f2814bf8eb632e2bb8f0a987ffd9ece Mon Sep 17 00:00:00 2001
From: Ethon Paul <ethp@qq.com>
Date: Sat, 18 Apr 2020 19:46:47 +0800
Subject: srcu: Fix a typo in comment "amoritized"->"amortized"

This commit fixes a typo in a comment.

Signed-off-by: Ethon Paul <ethp@qq.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 6d3ef700fb0e..8ff71e5d0fe8 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -766,7 +766,7 @@ static void srcu_flip(struct srcu_struct *ssp)
  * it, if this function was preempted for enough time for the counters
  * to wrap, it really doesn't matter whether or not we expedite the grace
  * period.  The extra overhead of a needlessly expedited grace period is
- * negligible when amoritized over that time period, and the extra latency
+ * negligible when amortized over that time period, and the extra latency
  * of a needlessly non-expedited grace period is similarly negligible.
  */
 static bool srcu_might_be_idle(struct srcu_struct *ssp)
-- 
cgit v1.2.3


From bde50d8ff83e4ce9e576f7c5ba1edb48a3610a5b Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 26 May 2020 15:41:34 +0200
Subject: srcu: Avoid local_irq_save() before acquiring spinlock_t

SRCU disables interrupts to get a stable per-CPU pointer and then
acquires the spinlock which is in the per-CPU data structure. The
release uses spin_unlock_irqrestore(). While this is correct on a non-RT
kernel, this conflicts with the RT semantics because the spinlock is
converted to a 'sleeping' spinlock. Sleeping locks can obviously not be
acquired with interrupts disabled.

Acquire the per-CPU pointer `ssp->sda' without disabling preemption and
then acquire the spinlock_t of the per-CPU data structure. The lock will
ensure that the data is consistent.

The added call to check_init_srcu_struct() is now needed because a
statically defined srcu_struct may remain uninitialized until this
point and the newly introduced locking operation requires an initialized
spinlock_t.

This change was tested for four hours with 8*SRCU-N and 8*SRCU-P without
causing any warnings.

Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: rcu@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 8ff71e5d0fe8..c100acf332ed 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -777,14 +777,15 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
 	unsigned long t;
 	unsigned long tlast;
 
+	check_init_srcu_struct(ssp);
 	/* If the local srcu_data structure has callbacks, not idle.  */
-	local_irq_save(flags);
-	sdp = this_cpu_ptr(ssp->sda);
+	sdp = raw_cpu_ptr(ssp->sda);
+	spin_lock_irqsave_rcu_node(sdp, flags);
 	if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
-		local_irq_restore(flags);
+		spin_unlock_irqrestore_rcu_node(sdp, flags);
 		return false; /* Callbacks already present, so not idle. */
 	}
-	local_irq_restore(flags);
+	spin_unlock_irqrestore_rcu_node(sdp, flags);
 
 	/*
 	 * No local callbacks, so probabalistically probe global state.
@@ -864,9 +865,8 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
 	}
 	rhp->func = func;
 	idx = srcu_read_lock(ssp);
-	local_irq_save(flags);
-	sdp = this_cpu_ptr(ssp->sda);
-	spin_lock_rcu_node(sdp);
+	sdp = raw_cpu_ptr(ssp->sda);
+	spin_lock_irqsave_rcu_node(sdp, flags);
 	rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
 	rcu_segcblist_advance(&sdp->srcu_cblist,
 			      rcu_seq_current(&ssp->srcu_gp_seq));
-- 
cgit v1.2.3


From d02c6b52d12fa30eeabfaf5aefe12078eacb94b2 Mon Sep 17 00:00:00 2001
From: Zou Wei <zou_wei@huawei.com>
Date: Mon, 13 Apr 2020 20:02:59 +0800
Subject: locktorture: Use true and false to assign to bool variables

This commit fixes the following coccicheck warnings:

kernel/locking/locktorture.c:689:6-10: WARNING: Assignment of 0/1 to bool variable
kernel/locking/locktorture.c:907:2-20: WARNING: Assignment of 0/1 to bool variable
kernel/locking/locktorture.c:938:3-20: WARNING: Assignment of 0/1 to bool variable
kernel/locking/locktorture.c:668:2-19: WARNING: Assignment of 0/1 to bool variable
kernel/locking/locktorture.c:674:2-19: WARNING: Assignment of 0/1 to bool variable
kernel/locking/locktorture.c:634:2-20: WARNING: Assignment of 0/1 to bool variable
kernel/locking/locktorture.c:640:2-20: WARNING: Assignment of 0/1 to bool variable

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Zou Wei <zou_wei@huawei.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/locking/locktorture.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 5efbfc68ce99..8ff6f50e06a0 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -631,13 +631,13 @@ static int lock_torture_writer(void *arg)
 		cxt.cur_ops->writelock();
 		if (WARN_ON_ONCE(lock_is_write_held))
 			lwsp->n_lock_fail++;
-		lock_is_write_held = 1;
+		lock_is_write_held = true;
 		if (WARN_ON_ONCE(lock_is_read_held))
 			lwsp->n_lock_fail++; /* rare, but... */
 
 		lwsp->n_lock_acquired++;
 		cxt.cur_ops->write_delay(&rand);
-		lock_is_write_held = 0;
+		lock_is_write_held = false;
 		cxt.cur_ops->writeunlock();
 
 		stutter_wait("lock_torture_writer");
@@ -665,13 +665,13 @@ static int lock_torture_reader(void *arg)
 			schedule_timeout_uninterruptible(1);
 
 		cxt.cur_ops->readlock();
-		lock_is_read_held = 1;
+		lock_is_read_held = true;
 		if (WARN_ON_ONCE(lock_is_write_held))
 			lrsp->n_lock_fail++; /* rare, but... */
 
 		lrsp->n_lock_acquired++;
 		cxt.cur_ops->read_delay(&rand);
-		lock_is_read_held = 0;
+		lock_is_read_held = false;
 		cxt.cur_ops->readunlock();
 
 		stutter_wait("lock_torture_reader");
@@ -686,7 +686,7 @@ static int lock_torture_reader(void *arg)
 static void __torture_print_stats(char *page,
 				  struct lock_stress_stats *statp, bool write)
 {
-	bool fail = 0;
+	bool fail = false;
 	int i, n_stress;
 	long max = 0, min = statp ? statp[0].n_lock_acquired : 0;
 	long long sum = 0;
@@ -904,7 +904,7 @@ static int __init lock_torture_init(void)
 
 	/* Initialize the statistics so that each run gets its own numbers. */
 	if (nwriters_stress) {
-		lock_is_write_held = 0;
+		lock_is_write_held = false;
 		cxt.lwsa = kmalloc_array(cxt.nrealwriters_stress,
 					 sizeof(*cxt.lwsa),
 					 GFP_KERNEL);
@@ -935,7 +935,7 @@ static int __init lock_torture_init(void)
 		}
 
 		if (nreaders_stress) {
-			lock_is_read_held = 0;
+			lock_is_read_held = false;
 			cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress,
 						 sizeof(*cxt.lrsa),
 						 GFP_KERNEL);
-- 
cgit v1.2.3


From 4a5f133c15b77c4018e8d7996541868ac94afb4f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 24 Apr 2020 11:21:40 -0700
Subject: rcutorture: Add races with task-exit processing

Several variants of Linux-kernel RCU interact with task-exit processing,
including preemptible RCU, Tasks RCU, and Tasks Trace RCU.  This commit
therefore adds testing of this interaction to rcutorture by adding
rcutorture.read_exit_burst and rcutorture.read_exit_delay kernel-boot
parameters.  These kernel parameters control the frequency and spacing
of special read-then-exit kthreads that are spawned.

[ paulmck: Apply feedback from Dan Carpenter's static checker. ]
[ paulmck: Reduce latency to avoid false-positive shutdown hangs. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  14 +++
 include/linux/torture.h                         |   5 ++
 kernel/rcu/rcutorture.c                         | 112 +++++++++++++++++++++++-
 3 files changed, 128 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index fb95fad81c79..a0dcc925c8a2 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4258,6 +4258,20 @@
 			Set time (jiffies) between CPU-hotplug operations,
 			or zero to disable CPU-hotplug testing.
 
+	rcutorture.read_exit= [KNL]
+			Set the number of read-then-exit kthreads used
+			to test the interaction of RCU updaters and
+			task-exit processing.
+
+	rcutorture.read_exit_burst= [KNL]
+			The number of times in a given read-then-exit
+			episode that a set of read-then-exit kthreads
+			is spawned.
+
+	rcutorture.read_exit_delay= [KNL]
+			The delay, in seconds, between successive
+			read-then-exit testing episodes.
+
 	rcutorture.shuffle_interval= [KNL]
 			Set task-shuffle interval (s).  Shuffling tasks
 			allows some CPUs to go into dyntick-idle mode
diff --git a/include/linux/torture.h b/include/linux/torture.h
index 629b66e6c161..7f65bd1dd307 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -55,6 +55,11 @@ struct torture_random_state {
 #define DEFINE_TORTURE_RANDOM_PERCPU(name) \
 	DEFINE_PER_CPU(struct torture_random_state, name)
 unsigned long torture_random(struct torture_random_state *trsp);
+static inline void torture_random_init(struct torture_random_state *trsp)
+{
+	trsp->trs_state = 0;
+	trsp->trs_count = 0;
+}
 
 /* Task shuffler, which causes CPUs to occasionally go idle. */
 void torture_shuffle_task_register(struct task_struct *tp);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index efb792e13fca..2621a339c8a4 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -109,6 +109,10 @@ torture_param(int, object_debug, 0,
 torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
 torture_param(int, onoff_interval, 0,
 	     "Time between CPU hotplugs (jiffies), 0=disable");
+torture_param(int, read_exit_delay, 13,
+	      "Delay between read-then-exit episodes (s)");
+torture_param(int, read_exit_burst, 16,
+	      "# of read-then-exit bursts per episode, zero to disable");
 torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles");
 torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
 torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
@@ -146,6 +150,7 @@ static struct task_struct *stall_task;
 static struct task_struct *fwd_prog_task;
 static struct task_struct **barrier_cbs_tasks;
 static struct task_struct *barrier_task;
+static struct task_struct *read_exit_task;
 
 #define RCU_TORTURE_PIPE_LEN 10
 
@@ -177,6 +182,7 @@ static long n_rcu_torture_boosts;
 static atomic_long_t n_rcu_torture_timers;
 static long n_barrier_attempts;
 static long n_barrier_successes; /* did rcu_barrier test succeed? */
+static unsigned long n_read_exits;
 static struct list_head rcu_torture_removed;
 static unsigned long shutdown_jiffies;
 
@@ -1539,10 +1545,11 @@ rcu_torture_stats_print(void)
 		n_rcu_torture_boosts,
 		atomic_long_read(&n_rcu_torture_timers));
 	torture_onoff_stats();
-	pr_cont("barrier: %ld/%ld:%ld\n",
+	pr_cont("barrier: %ld/%ld:%ld ",
 		data_race(n_barrier_successes),
 		data_race(n_barrier_attempts),
 		data_race(n_rcu_torture_barrier_error));
+	pr_cont("read-exits: %ld\n", data_race(n_read_exits));
 
 	pr_alert("%s%s ", torture_type, TORTURE_FLAG);
 	if (atomic_read(&n_rcu_torture_mberror) ||
@@ -1634,7 +1641,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
 		 "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d "
 		 "stall_cpu_block=%d "
 		 "n_barrier_cbs=%d "
-		 "onoff_interval=%d onoff_holdoff=%d\n",
+		 "onoff_interval=%d onoff_holdoff=%d "
+		 "read_exit_delay=%d read_exit_burst=%d\n",
 		 torture_type, tag, nrealreaders, nfakewriters,
 		 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
 		 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
@@ -1643,7 +1651,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
 		 stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff,
 		 stall_cpu_block,
 		 n_barrier_cbs,
-		 onoff_interval, onoff_holdoff);
+		 onoff_interval, onoff_holdoff,
+		 read_exit_delay, read_exit_burst);
 }
 
 static int rcutorture_booster_cleanup(unsigned int cpu)
@@ -2338,6 +2347,99 @@ static bool rcu_torture_can_boost(void)
 	return true;
 }
 
+static bool read_exit_child_stop;
+static bool read_exit_child_stopped;
+static wait_queue_head_t read_exit_wq;
+
+// Child kthread which just does an rcutorture reader and exits.
+static int rcu_torture_read_exit_child(void *trsp_in)
+{
+	struct torture_random_state *trsp = trsp_in;
+
+	set_user_nice(current, MAX_NICE);
+	// Minimize time between reading and exiting.
+	while (!kthread_should_stop())
+		schedule_timeout_uninterruptible(1);
+	(void)rcu_torture_one_read(trsp);
+	return 0;
+}
+
+// Parent kthread which creates and destroys read-exit child kthreads.
+static int rcu_torture_read_exit(void *unused)
+{
+	int count = 0;
+	bool errexit = false;
+	int i;
+	struct task_struct *tsp;
+	DEFINE_TORTURE_RANDOM(trs);
+
+	// Allocate and initialize.
+	set_user_nice(current, MAX_NICE);
+	VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of test");
+
+	// Each pass through this loop does one read-exit episode.
+	do {
+		if (++count > read_exit_burst) {
+			VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode");
+			rcu_barrier(); // Wait for task_struct free, avoid OOM.
+			for (i = 0; i < read_exit_delay; i++) {
+				schedule_timeout_uninterruptible(HZ);
+				if (READ_ONCE(read_exit_child_stop))
+					break;
+			}
+			if (!READ_ONCE(read_exit_child_stop))
+				VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode");
+			count = 0;
+		}
+		if (READ_ONCE(read_exit_child_stop))
+			break;
+		// Spawn child.
+		tsp = kthread_run(rcu_torture_read_exit_child,
+				     &trs, "%s",
+				     "rcu_torture_read_exit_child");
+		if (IS_ERR(tsp)) {
+			VERBOSE_TOROUT_ERRSTRING("out of memory");
+			errexit = true;
+			tsp = NULL;
+			break;
+		}
+		cond_resched();
+		kthread_stop(tsp);
+		n_read_exits ++;
+		stutter_wait("rcu_torture_read_exit");
+	} while (!errexit && !READ_ONCE(read_exit_child_stop));
+
+	// Clean up and exit.
+	smp_store_release(&read_exit_child_stopped, true); // After reaping.
+	smp_mb(); // Store before wakeup.
+	wake_up(&read_exit_wq);
+	while (!torture_must_stop())
+		schedule_timeout_uninterruptible(1);
+	torture_kthread_stopping("rcu_torture_read_exit");
+	return 0;
+}
+
+static int rcu_torture_read_exit_init(void)
+{
+	if (read_exit_burst <= 0)
+		return -EINVAL;
+	init_waitqueue_head(&read_exit_wq);
+	read_exit_child_stop = false;
+	read_exit_child_stopped = false;
+	return torture_create_kthread(rcu_torture_read_exit, NULL,
+				      read_exit_task);
+}
+
+static void rcu_torture_read_exit_cleanup(void)
+{
+	if (!read_exit_task)
+		return;
+	WRITE_ONCE(read_exit_child_stop, true);
+	smp_mb(); // Above write before wait.
+	wait_event(read_exit_wq, smp_load_acquire(&read_exit_child_stopped));
+	torture_stop_kthread(rcutorture_read_exit, read_exit_task);
+}
+
 static enum cpuhp_state rcutor_hp;
 
 static void
@@ -2359,6 +2461,7 @@ rcu_torture_cleanup(void)
 	}
 
 	show_rcu_gp_kthreads();
+	rcu_torture_read_exit_cleanup();
 	rcu_torture_barrier_cleanup();
 	torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task);
 	torture_stop_kthread(rcu_torture_stall, stall_task);
@@ -2680,6 +2783,9 @@ rcu_torture_init(void)
 	if (firsterr)
 		goto unwind;
 	firsterr = rcu_torture_barrier_init();
+	if (firsterr)
+		goto unwind;
+	firsterr = rcu_torture_read_exit_init();
 	if (firsterr)
 		goto unwind;
 	if (object_debug)
-- 
cgit v1.2.3


From cae7cc6ba5bad320c2055ac54f73affd051e76ca Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Sun, 26 Apr 2020 19:20:37 -0700
Subject: rcutorture: NULL rcu_torture_current earlier in cleanup code

Currently, the rcu_torture_current variable remains non-NULL until after
all readers have stopped.  During this time, rcu_torture_stats_print()
will think that the test is still ongoing, which can result in confusing
dmesg output.  This commit therefore NULLs rcu_torture_current immediately
after the rcu_torture_writer() kthread has decided to stop, thus informing
rcu_torture_stats_print() much sooner.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 2621a339c8a4..59112077a6da 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1172,6 +1172,7 @@ rcu_torture_writer(void *arg)
 					WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
 				}
 	} while (!torture_must_stop());
+	rcu_torture_current = NULL;  // Let stats task know that we are done.
 	/* Reset expediting back to unexpedited. */
 	if (expediting > 0)
 		expediting = -expediting;
@@ -2473,7 +2474,6 @@ rcu_torture_cleanup(void)
 					     reader_tasks[i]);
 		kfree(reader_tasks);
 	}
-	rcu_torture_current = NULL;
 
 	if (fakewriter_tasks) {
 		for (i = 0; i < nfakewriters; i++) {
-- 
cgit v1.2.3


From 8f43d5911b38f00dfa46169dcb1feb1e101dd906 Mon Sep 17 00:00:00 2001
From: Jules Irenge <jbi.octave@gmail.com>
Date: Mon, 1 Jun 2020 19:45:48 +0100
Subject: rcu/rcutorture: Replace 0 with false

Coccinelle reports a warning

WARNING: Assignment of 0/1 to bool variable

The root cause is that the variable lastphase is a bool, but is
initialised with integer 0.  This commit therefore replaces the 0 with
a false.

Signed-off-by: Jules Irenge <jbi.octave@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 59112077a6da..37455a12898e 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -2185,7 +2185,7 @@ static void rcu_torture_barrier1cb(void *rcu_void)
 static int rcu_torture_barrier_cbs(void *arg)
 {
 	long myid = (long)arg;
-	bool lastphase = 0;
+	bool lastphase = false;
 	bool newphase;
 	struct rcu_head rcu;
 
-- 
cgit v1.2.3


From 775227511843202e65a7f194cbf64f38de01f004 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 11 Jun 2020 16:43:14 -0700
Subject: rcutorture: Check for unwatched readers

RCU is supposed to be watching all non-idle kernel code and also all
softirq handlers.  This commit adds some teeth to this statement by
adding a WARN_ON_ONCE().

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 37455a12898e..9c310016585b 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1377,6 +1377,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp)
 	struct rt_read_seg *rtrsp1;
 	unsigned long long ts;
 
+	WARN_ON_ONCE(!rcu_is_watching());
 	newstate = rcutorture_extend_mask(readstate, trsp);
 	rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++);
 	started = cur_ops->get_gp_seq();
-- 
cgit v1.2.3


From 2102ad290af06119ccfb56ddc3a0e5011a91537e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 16 Jun 2020 15:38:24 -0700
Subject: torture: Dump ftrace at shutdown only if requested

If there is a large number of torture tests running concurrently,
all of which are dumping large ftrace buffers at shutdown time, the
resulting dumping can take a very long time, particularly on systems
with rotating-rust storage.  This commit therefore adds a default-off
torture.ftrace_dump_at_shutdown module parameter that enables
shutdown-time ftrace-buffer dumping.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.txt | 7 +++++++
 kernel/torture.c                                | 6 +++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a0dcc925c8a2..9f11ff80d4ad 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5096,6 +5096,13 @@
 			Prevent the CPU-hotplug component of torturing
 			until after init has spawned.
 
+	torture.ftrace_dump_at_shutdown= [KNL]
+			Dump the ftrace buffer at torture-test shutdown,
+			even if there were no errors.  This can be a
+			very costly operation when many torture tests
+			are running concurrently, especially on systems
+			with rotating-rust storage.
+
 	tp720=		[HW,PS2]
 
 	tpm_suspend_pcr=[HW,TPM]
diff --git a/kernel/torture.c b/kernel/torture.c
index a1a41484ff6d..1061492f14bd 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -45,6 +45,9 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
 static bool disable_onoff_at_boot;
 module_param(disable_onoff_at_boot, bool, 0444);
 
+static bool ftrace_dump_at_shutdown;
+module_param(ftrace_dump_at_shutdown, bool, 0444);
+
 static char *torture_type;
 static int verbose;
 
@@ -527,7 +530,8 @@ static int torture_shutdown(void *arg)
 		torture_shutdown_hook();
 	else
 		VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping.");
-	rcu_ftrace_dump(DUMP_ALL);
+	if (ftrace_dump_at_shutdown)
+		rcu_ftrace_dump(DUMP_ALL);
 	kernel_power_off();	/* Shut down the system. */
 	return 0;
 }
-- 
cgit v1.2.3


From cda099b37d7165fc73a63961739acf026444cde2 Mon Sep 17 00:00:00 2001
From: Qian Cai <cai@lca.pw>
Date: Wed, 19 Feb 2020 11:00:54 -0800
Subject: fork: Annotate a data race in vm_area_dup()

struct vm_area_struct could be accessed concurrently as noticed by
KCSAN,

 write to 0xffff9cf8bba08ad8 of 8 bytes by task 14263 on cpu 35:
  vma_interval_tree_insert+0x101/0x150:
  rb_insert_augmented_cached at include/linux/rbtree_augmented.h:58
  (inlined by) vma_interval_tree_insert at mm/interval_tree.c:23
  __vma_link_file+0x6e/0xe0
  __vma_link_file at mm/mmap.c:629
  vma_link+0xa2/0x120
  mmap_region+0x753/0xb90
  do_mmap+0x45c/0x710
  vm_mmap_pgoff+0xc0/0x130
  ksys_mmap_pgoff+0x1d1/0x300
  __x64_sys_mmap+0x33/0x40
  do_syscall_64+0x91/0xc44
  entry_SYSCALL_64_after_hwframe+0x49/0xbe

 read to 0xffff9cf8bba08a80 of 200 bytes by task 14262 on cpu 122:
  vm_area_dup+0x6a/0xe0
  vm_area_dup at kernel/fork.c:362
  __split_vma+0x72/0x2a0
  __split_vma at mm/mmap.c:2661
  split_vma+0x5a/0x80
  mprotect_fixup+0x368/0x3f0
  do_mprotect_pkey+0x263/0x420
  __x64_sys_mprotect+0x51/0x70
  do_syscall_64+0x91/0xc44
  entry_SYSCALL_64_after_hwframe+0x49/0xbe

vm_area_dup() blindly copies all fields of original VMA to the new one.
This includes coping vm_area_struct::shared.rb which is normally
protected by i_mmap_lock. But this is fine because the read value will
be overwritten on the following __vma_link_file() under proper
protection. Thus, mark it as an intentional data race and insert a few
assertions for the fields that should not be modified concurrently.

Signed-off-by: Qian Cai <cai@lca.pw>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/fork.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 142b23645d82..bba10fbcdce7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -359,7 +359,13 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 	struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 
 	if (new) {
-		*new = *orig;
+		ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+		ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+		/*
+		 * orig->shared.rb may be modified concurrently, but the clone
+		 * will be reinitialized.
+		 */
+		*new = data_race(*orig);
 		INIT_LIST_HEAD(&new->anon_vma_chain);
 		new->vm_next = new->vm_prev = NULL;
 	}
-- 
cgit v1.2.3


From 1fe84fd4a4027a17d511a832f89ab14107650ba4 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Tue, 5 May 2020 20:28:21 +0200
Subject: kcsan: Add test suite

This adds KCSAN test focusing on behaviour of the integrated runtime.
Tests various race scenarios, and verifies the reports generated to
console. Makes use of KUnit for test organization, and the Torture
framework for test thread control.

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/kcsan/Makefile     |    3 +
 kernel/kcsan/kcsan-test.c | 1084 +++++++++++++++++++++++++++++++++++++++++++++
 lib/Kconfig.kcsan         |   23 +-
 3 files changed, 1109 insertions(+), 1 deletion(-)
 create mode 100644 kernel/kcsan/kcsan-test.c

(limited to 'kernel')

diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index d4999b38d1be..14533cf24bc3 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -12,3 +12,6 @@ CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \
 
 obj-y := core.o debugfs.o report.o
 obj-$(CONFIG_KCSAN_SELFTEST) += test.o
+
+CFLAGS_kcsan-test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer
+obj-$(CONFIG_KCSAN_TEST) += kcsan-test.o
diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c
new file mode 100644
index 000000000000..a8c11506dd2a
--- /dev/null
+++ b/kernel/kcsan/kcsan-test.c
@@ -0,0 +1,1084 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KCSAN test with various race scenarious to test runtime behaviour. Since the
+ * interface with which KCSAN's reports are obtained is via the console, this is
+ * the output we should verify. For each test case checks the presence (or
+ * absence) of generated reports. Relies on 'console' tracepoint to capture
+ * reports as they appear in the kernel log.
+ *
+ * Makes use of KUnit for test organization, and the Torture framework for test
+ * thread control.
+ *
+ * Copyright (C) 2020, Google LLC.
+ * Author: Marco Elver <elver@google.com>
+ */
+
+#include <kunit/test.h>
+#include <linux/jiffies.h>
+#include <linux/kcsan-checks.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/seqlock.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/timer.h>
+#include <linux/torture.h>
+#include <linux/tracepoint.h>
+#include <linux/types.h>
+#include <trace/events/printk.h>
+
+/* Points to current test-case memory access "kernels". */
+static void (*access_kernels[2])(void);
+
+static struct task_struct **threads; /* Lists of threads. */
+static unsigned long end_time;       /* End time of test. */
+
+/* Report as observed from console. */
+static struct {
+	spinlock_t lock;
+	int nlines;
+	char lines[3][512];
+} observed = {
+	.lock = __SPIN_LOCK_UNLOCKED(observed.lock),
+};
+
+/* Setup test checking loop. */
+static __no_kcsan_or_inline void
+begin_test_checks(void (*func1)(void), void (*func2)(void))
+{
+	kcsan_disable_current();
+
+	/*
+	 * Require at least as long as KCSAN_REPORT_ONCE_IN_MS, to ensure at
+	 * least one race is reported.
+	 */
+	end_time = jiffies + msecs_to_jiffies(CONFIG_KCSAN_REPORT_ONCE_IN_MS + 500);
+
+	/* Signal start; release potential initialization of shared data. */
+	smp_store_release(&access_kernels[0], func1);
+	smp_store_release(&access_kernels[1], func2);
+}
+
+/* End test checking loop. */
+static __no_kcsan_or_inline bool
+end_test_checks(bool stop)
+{
+	if (!stop && time_before(jiffies, end_time)) {
+		/* Continue checking */
+		might_sleep();
+		return false;
+	}
+
+	kcsan_enable_current();
+	return true;
+}
+
+/*
+ * Probe for console output: checks if a race was reported, and obtains observed
+ * lines of interest.
+ */
+__no_kcsan
+static void probe_console(void *ignore, const char *buf, size_t len)
+{
+	unsigned long flags;
+	int nlines;
+
+	/*
+	 * Note that KCSAN reports under a global lock, so we do not risk the
+	 * possibility of having multiple reports interleaved. If that were the
+	 * case, we'd expect tests to fail.
+	 */
+
+	spin_lock_irqsave(&observed.lock, flags);
+	nlines = observed.nlines;
+
+	if (strnstr(buf, "BUG: KCSAN: ", len) && strnstr(buf, "test_", len)) {
+		/*
+		 * KCSAN report and related to the test.
+		 *
+		 * The provided @buf is not NUL-terminated; copy no more than
+		 * @len bytes and let strscpy() add the missing NUL-terminator.
+		 */
+		strscpy(observed.lines[0], buf, min(len + 1, sizeof(observed.lines[0])));
+		nlines = 1;
+	} else if ((nlines == 1 || nlines == 2) && strnstr(buf, "bytes by", len)) {
+		strscpy(observed.lines[nlines++], buf, min(len + 1, sizeof(observed.lines[0])));
+
+		if (strnstr(buf, "race at unknown origin", len)) {
+			if (WARN_ON(nlines != 2))
+				goto out;
+
+			/* No second line of interest. */
+			strcpy(observed.lines[nlines++], "<none>");
+		}
+	}
+
+out:
+	WRITE_ONCE(observed.nlines, nlines); /* Publish new nlines. */
+	spin_unlock_irqrestore(&observed.lock, flags);
+}
+
+/* Check if a report related to the test exists. */
+__no_kcsan
+static bool report_available(void)
+{
+	return READ_ONCE(observed.nlines) == ARRAY_SIZE(observed.lines);
+}
+
+/* Report information we expect in a report. */
+struct expect_report {
+	/* Access information of both accesses. */
+	struct {
+		void *fn;    /* Function pointer to expected function of top frame. */
+		void *addr;  /* Address of access; unchecked if NULL. */
+		size_t size; /* Size of access; unchecked if @addr is NULL. */
+		int type;    /* Access type, see KCSAN_ACCESS definitions. */
+	} access[2];
+};
+
+/* Check observed report matches information in @r. */
+__no_kcsan
+static bool report_matches(const struct expect_report *r)
+{
+	const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT;
+	bool ret = false;
+	unsigned long flags;
+	typeof(observed.lines) expect;
+	const char *end;
+	char *cur;
+	int i;
+
+	/* Doubled-checked locking. */
+	if (!report_available())
+		return false;
+
+	/* Generate expected report contents. */
+
+	/* Title */
+	cur = expect[0];
+	end = &expect[0][sizeof(expect[0]) - 1];
+	cur += scnprintf(cur, end - cur, "BUG: KCSAN: %s in ",
+			 is_assert ? "assert: race" : "data-race");
+	if (r->access[1].fn) {
+		char tmp[2][64];
+		int cmp;
+
+		/* Expect lexographically sorted function names in title. */
+		scnprintf(tmp[0], sizeof(tmp[0]), "%pS", r->access[0].fn);
+		scnprintf(tmp[1], sizeof(tmp[1]), "%pS", r->access[1].fn);
+		cmp = strcmp(tmp[0], tmp[1]);
+		cur += scnprintf(cur, end - cur, "%ps / %ps",
+				 cmp < 0 ? r->access[0].fn : r->access[1].fn,
+				 cmp < 0 ? r->access[1].fn : r->access[0].fn);
+	} else {
+		scnprintf(cur, end - cur, "%pS", r->access[0].fn);
+		/* The exact offset won't match, remove it. */
+		cur = strchr(expect[0], '+');
+		if (cur)
+			*cur = '\0';
+	}
+
+	/* Access 1 */
+	cur = expect[1];
+	end = &expect[1][sizeof(expect[1]) - 1];
+	if (!r->access[1].fn)
+		cur += scnprintf(cur, end - cur, "race at unknown origin, with ");
+
+	/* Access 1 & 2 */
+	for (i = 0; i < 2; ++i) {
+		const char *const access_type =
+			(r->access[i].type & KCSAN_ACCESS_ASSERT) ?
+				((r->access[i].type & KCSAN_ACCESS_WRITE) ?
+					 "assert no accesses" :
+					 "assert no writes") :
+				((r->access[i].type & KCSAN_ACCESS_WRITE) ?
+					 "write" :
+					 "read");
+		const char *const access_type_aux =
+			(r->access[i].type & KCSAN_ACCESS_ATOMIC) ?
+				" (marked)" :
+				((r->access[i].type & KCSAN_ACCESS_SCOPED) ?
+					 " (scoped)" :
+					 "");
+
+		if (i == 1) {
+			/* Access 2 */
+			cur = expect[2];
+			end = &expect[2][sizeof(expect[2]) - 1];
+
+			if (!r->access[1].fn) {
+				/* Dummy string if no second access is available. */
+				strcpy(cur, "<none>");
+				break;
+			}
+		}
+
+		cur += scnprintf(cur, end - cur, "%s%s to ", access_type,
+				 access_type_aux);
+
+		if (r->access[i].addr) /* Address is optional. */
+			cur += scnprintf(cur, end - cur, "0x%px of %zu bytes",
+					 r->access[i].addr, r->access[i].size);
+	}
+
+	spin_lock_irqsave(&observed.lock, flags);
+	if (!report_available())
+		goto out; /* A new report is being captured. */
+
+	/* Finally match expected output to what we actually observed. */
+	ret = strstr(observed.lines[0], expect[0]) &&
+	      /* Access info may appear in any order. */
+	      ((strstr(observed.lines[1], expect[1]) &&
+		strstr(observed.lines[2], expect[2])) ||
+	       (strstr(observed.lines[1], expect[2]) &&
+		strstr(observed.lines[2], expect[1])));
+out:
+	spin_unlock_irqrestore(&observed.lock, flags);
+	return ret;
+}
+
+/* ===== Test kernels ===== */
+
+static long test_sink;
+static long test_var;
+/* @test_array should be large enough to fall into multiple watchpoint slots. */
+static long test_array[3 * PAGE_SIZE / sizeof(long)];
+static struct {
+	long val[8];
+} test_struct;
+static DEFINE_SEQLOCK(test_seqlock);
+
+/*
+ * Helper to avoid compiler optimizing out reads, and to generate source values
+ * for writes.
+ */
+__no_kcsan
+static noinline void sink_value(long v) { WRITE_ONCE(test_sink, v); }
+
+static noinline void test_kernel_read(void) { sink_value(test_var); }
+
+static noinline void test_kernel_write(void)
+{
+	test_var = READ_ONCE_NOCHECK(test_sink) + 1;
+}
+
+static noinline void test_kernel_write_nochange(void) { test_var = 42; }
+
+/* Suffixed by value-change exception filter. */
+static noinline void test_kernel_write_nochange_rcu(void) { test_var = 42; }
+
+static noinline void test_kernel_read_atomic(void)
+{
+	sink_value(READ_ONCE(test_var));
+}
+
+static noinline void test_kernel_write_atomic(void)
+{
+	WRITE_ONCE(test_var, READ_ONCE_NOCHECK(test_sink) + 1);
+}
+
+__no_kcsan
+static noinline void test_kernel_write_uninstrumented(void) { test_var++; }
+
+static noinline void test_kernel_data_race(void) { data_race(test_var++); }
+
+static noinline void test_kernel_assert_writer(void)
+{
+	ASSERT_EXCLUSIVE_WRITER(test_var);
+}
+
+static noinline void test_kernel_assert_access(void)
+{
+	ASSERT_EXCLUSIVE_ACCESS(test_var);
+}
+
+#define TEST_CHANGE_BITS 0xff00ff00
+
+static noinline void test_kernel_change_bits(void)
+{
+	if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) {
+		/*
+		 * Avoid race of unknown origin for this test, just pretend they
+		 * are atomic.
+		 */
+		kcsan_nestable_atomic_begin();
+		test_var ^= TEST_CHANGE_BITS;
+		kcsan_nestable_atomic_end();
+	} else
+		WRITE_ONCE(test_var, READ_ONCE(test_var) ^ TEST_CHANGE_BITS);
+}
+
+static noinline void test_kernel_assert_bits_change(void)
+{
+	ASSERT_EXCLUSIVE_BITS(test_var, TEST_CHANGE_BITS);
+}
+
+static noinline void test_kernel_assert_bits_nochange(void)
+{
+	ASSERT_EXCLUSIVE_BITS(test_var, ~TEST_CHANGE_BITS);
+}
+
+/* To check that scoped assertions do trigger anywhere in scope. */
+static noinline void test_enter_scope(void)
+{
+	int x = 0;
+
+	/* Unrelated accesses to scoped assert. */
+	READ_ONCE(test_sink);
+	kcsan_check_read(&x, sizeof(x));
+}
+
+static noinline void test_kernel_assert_writer_scoped(void)
+{
+	ASSERT_EXCLUSIVE_WRITER_SCOPED(test_var);
+	test_enter_scope();
+}
+
+static noinline void test_kernel_assert_access_scoped(void)
+{
+	ASSERT_EXCLUSIVE_ACCESS_SCOPED(test_var);
+	test_enter_scope();
+}
+
+static noinline void test_kernel_rmw_array(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(test_array); ++i)
+		test_array[i]++;
+}
+
+static noinline void test_kernel_write_struct(void)
+{
+	kcsan_check_write(&test_struct, sizeof(test_struct));
+	kcsan_disable_current();
+	test_struct.val[3]++; /* induce value change */
+	kcsan_enable_current();
+}
+
+static noinline void test_kernel_write_struct_part(void)
+{
+	test_struct.val[3] = 42;
+}
+
+static noinline void test_kernel_read_struct_zero_size(void)
+{
+	kcsan_check_read(&test_struct.val[3], 0);
+}
+
+static noinline void test_kernel_seqlock_reader(void)
+{
+	unsigned int seq;
+
+	do {
+		seq = read_seqbegin(&test_seqlock);
+		sink_value(test_var);
+	} while (read_seqretry(&test_seqlock, seq));
+}
+
+static noinline void test_kernel_seqlock_writer(void)
+{
+	unsigned long flags;
+
+	write_seqlock_irqsave(&test_seqlock, flags);
+	test_var++;
+	write_sequnlock_irqrestore(&test_seqlock, flags);
+}
+
+/* ===== Test cases ===== */
+
+/* Simple test with normal data race. */
+__no_kcsan
+static void test_basic(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			{ test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+			{ test_kernel_read, &test_var, sizeof(test_var), 0 },
+		},
+	};
+	static const struct expect_report never = {
+		.access = {
+			{ test_kernel_read, &test_var, sizeof(test_var), 0 },
+			{ test_kernel_read, &test_var, sizeof(test_var), 0 },
+		},
+	};
+	bool match_expect = false;
+	bool match_never = false;
+
+	begin_test_checks(test_kernel_write, test_kernel_read);
+	do {
+		match_expect |= report_matches(&expect);
+		match_never = report_matches(&never);
+	} while (!end_test_checks(match_never));
+	KUNIT_EXPECT_TRUE(test, match_expect);
+	KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/*
+ * Stress KCSAN with lots of concurrent races on different addresses until
+ * timeout.
+ */
+__no_kcsan
+static void test_concurrent_races(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			/* NULL will match any address. */
+			{ test_kernel_rmw_array, NULL, 0, KCSAN_ACCESS_WRITE },
+			{ test_kernel_rmw_array, NULL, 0, 0 },
+		},
+	};
+	static const struct expect_report never = {
+		.access = {
+			{ test_kernel_rmw_array, NULL, 0, 0 },
+			{ test_kernel_rmw_array, NULL, 0, 0 },
+		},
+	};
+	bool match_expect = false;
+	bool match_never = false;
+
+	begin_test_checks(test_kernel_rmw_array, test_kernel_rmw_array);
+	do {
+		match_expect |= report_matches(&expect);
+		match_never |= report_matches(&never);
+	} while (!end_test_checks(false));
+	KUNIT_EXPECT_TRUE(test, match_expect); /* Sanity check matches exist. */
+	KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/* Test the KCSAN_REPORT_VALUE_CHANGE_ONLY option. */
+__no_kcsan
+static void test_novalue_change(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+			{ test_kernel_read, &test_var, sizeof(test_var), 0 },
+		},
+	};
+	bool match_expect = false;
+
+	begin_test_checks(test_kernel_write_nochange, test_kernel_read);
+	do {
+		match_expect = report_matches(&expect);
+	} while (!end_test_checks(match_expect));
+	if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY))
+		KUNIT_EXPECT_FALSE(test, match_expect);
+	else
+		KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/*
+ * Test that the rules where the KCSAN_REPORT_VALUE_CHANGE_ONLY option should
+ * never apply work.
+ */
+__no_kcsan
+static void test_novalue_change_exception(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			{ test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+			{ test_kernel_read, &test_var, sizeof(test_var), 0 },
+		},
+	};
+	bool match_expect = false;
+
+	begin_test_checks(test_kernel_write_nochange_rcu, test_kernel_read);
+	do {
+		match_expect = report_matches(&expect);
+	} while (!end_test_checks(match_expect));
+	KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/* Test that data races of unknown origin are reported. */
+__no_kcsan
+static void test_unknown_origin(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			{ test_kernel_read, &test_var, sizeof(test_var), 0 },
+			{ NULL },
+		},
+	};
+	bool match_expect = false;
+
+	begin_test_checks(test_kernel_write_uninstrumented, test_kernel_read);
+	do {
+		match_expect = report_matches(&expect);
+	} while (!end_test_checks(match_expect));
+	if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN))
+		KUNIT_EXPECT_TRUE(test, match_expect);
+	else
+		KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
+/* Test KCSAN_ASSUME_PLAIN_WRITES_ATOMIC if it is selected. */
+__no_kcsan
+static void test_write_write_assume_atomic(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			{ test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+			{ test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+		},
+	};
+	bool match_expect = false;
+
+	begin_test_checks(test_kernel_write, test_kernel_write);
+	do {
+		sink_value(READ_ONCE(test_var)); /* induce value-change */
+		match_expect = report_matches(&expect);
+	} while (!end_test_checks(match_expect));
+	if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC))
+		KUNIT_EXPECT_FALSE(test, match_expect);
+	else
+		KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/*
+ * Test that data races with writes larger than word-size are always reported,
+ * even if KCSAN_ASSUME_PLAIN_WRITES_ATOMIC is selected.
+ */
+__no_kcsan
+static void test_write_write_struct(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+			{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+		},
+	};
+	bool match_expect = false;
+
+	begin_test_checks(test_kernel_write_struct, test_kernel_write_struct);
+	do {
+		match_expect = report_matches(&expect);
+	} while (!end_test_checks(match_expect));
+	KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/*
+ * Test that data races where only one write is larger than word-size are always
+ * reported, even if KCSAN_ASSUME_PLAIN_WRITES_ATOMIC is selected.
+ */
+__no_kcsan
+static void test_write_write_struct_part(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+			{ test_kernel_write_struct_part, &test_struct.val[3], sizeof(test_struct.val[3]), KCSAN_ACCESS_WRITE },
+		},
+	};
+	bool match_expect = false;
+
+	begin_test_checks(test_kernel_write_struct, test_kernel_write_struct_part);
+	do {
+		match_expect = report_matches(&expect);
+	} while (!end_test_checks(match_expect));
+	KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/* Test that races with atomic accesses never result in reports. */
+__no_kcsan
+static void test_read_atomic_write_atomic(struct kunit *test)
+{
+	bool match_never = false;
+
+	begin_test_checks(test_kernel_read_atomic, test_kernel_write_atomic);
+	do {
+		match_never = report_available();
+	} while (!end_test_checks(match_never));
+	KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/* Test that a race with an atomic and plain access result in reports. */
+__no_kcsan
+static void test_read_plain_atomic_write(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			{ test_kernel_read, &test_var, sizeof(test_var), 0 },
+			{ test_kernel_write_atomic, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC },
+		},
+	};
+	bool match_expect = false;
+
+	if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS))
+		return;
+
+	begin_test_checks(test_kernel_read, test_kernel_write_atomic);
+	do {
+		match_expect = report_matches(&expect);
+	} while (!end_test_checks(match_expect));
+	KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/* Zero-sized accesses should never cause data race reports. */
+__no_kcsan
+static void test_zero_size_access(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+			{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+		},
+	};
+	const struct expect_report never = {
+		.access = {
+			{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+			{ test_kernel_read_struct_zero_size, &test_struct.val[3], 0, 0 },
+		},
+	};
+	bool match_expect = false;
+	bool match_never = false;
+
+	begin_test_checks(test_kernel_write_struct, test_kernel_read_struct_zero_size);
+	do {
+		match_expect |= report_matches(&expect);
+		match_never = report_matches(&never);
+	} while (!end_test_checks(match_never));
+	KUNIT_EXPECT_TRUE(test, match_expect); /* Sanity check. */
+	KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/* Test the data_race() macro. */
+__no_kcsan
+static void test_data_race(struct kunit *test)
+{
+	bool match_never = false;
+
+	begin_test_checks(test_kernel_data_race, test_kernel_data_race);
+	do {
+		match_never = report_available();
+	} while (!end_test_checks(match_never));
+	KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+__no_kcsan
+static void test_assert_exclusive_writer(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+			{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+		},
+	};
+	bool match_expect = false;
+
+	begin_test_checks(test_kernel_assert_writer, test_kernel_write_nochange);
+	do {
+		match_expect = report_matches(&expect);
+	} while (!end_test_checks(match_expect));
+	KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+__no_kcsan
+static void test_assert_exclusive_access(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
+			{ test_kernel_read, &test_var, sizeof(test_var), 0 },
+		},
+	};
+	bool match_expect = false;
+
+	begin_test_checks(test_kernel_assert_access, test_kernel_read);
+	do {
+		match_expect = report_matches(&expect);
+	} while (!end_test_checks(match_expect));
+	KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+__no_kcsan
+static void test_assert_exclusive_access_writer(struct kunit *test)
+{
+	const struct expect_report expect_access_writer = {
+		.access = {
+			{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
+			{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+		},
+	};
+	const struct expect_report expect_access_access = {
+		.access = {
+			{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
+			{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
+		},
+	};
+	const struct expect_report never = {
+		.access = {
+			{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+			{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+		},
+	};
+	bool match_expect_access_writer = false;
+	bool match_expect_access_access = false;
+	bool match_never = false;
+
+	begin_test_checks(test_kernel_assert_access, test_kernel_assert_writer);
+	do {
+		match_expect_access_writer |= report_matches(&expect_access_writer);
+		match_expect_access_access |= report_matches(&expect_access_access);
+		match_never |= report_matches(&never);
+	} while (!end_test_checks(match_never));
+	KUNIT_EXPECT_TRUE(test, match_expect_access_writer);
+	KUNIT_EXPECT_TRUE(test, match_expect_access_access);
+	KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+__no_kcsan
+static void test_assert_exclusive_bits_change(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			{ test_kernel_assert_bits_change, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+			{ test_kernel_change_bits, &test_var, sizeof(test_var),
+				KCSAN_ACCESS_WRITE | (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) ? 0 : KCSAN_ACCESS_ATOMIC) },
+		},
+	};
+	bool match_expect = false;
+
+	begin_test_checks(test_kernel_assert_bits_change, test_kernel_change_bits);
+	do {
+		match_expect = report_matches(&expect);
+	} while (!end_test_checks(match_expect));
+	KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+__no_kcsan
+static void test_assert_exclusive_bits_nochange(struct kunit *test)
+{
+	bool match_never = false;
+
+	begin_test_checks(test_kernel_assert_bits_nochange, test_kernel_change_bits);
+	do {
+		match_never = report_available();
+	} while (!end_test_checks(match_never));
+	KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+__no_kcsan
+static void test_assert_exclusive_writer_scoped(struct kunit *test)
+{
+	const struct expect_report expect_start = {
+		.access = {
+			{ test_kernel_assert_writer_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED },
+			{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+		},
+	};
+	const struct expect_report expect_anywhere = {
+		.access = {
+			{ test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED },
+			{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+		},
+	};
+	bool match_expect_start = false;
+	bool match_expect_anywhere = false;
+
+	begin_test_checks(test_kernel_assert_writer_scoped, test_kernel_write_nochange);
+	do {
+		match_expect_start |= report_matches(&expect_start);
+		match_expect_anywhere |= report_matches(&expect_anywhere);
+	} while (!end_test_checks(match_expect_start && match_expect_anywhere));
+	KUNIT_EXPECT_TRUE(test, match_expect_start);
+	KUNIT_EXPECT_TRUE(test, match_expect_anywhere);
+}
+
+__no_kcsan
+static void test_assert_exclusive_access_scoped(struct kunit *test)
+{
+	const struct expect_report expect_start1 = {
+		.access = {
+			{ test_kernel_assert_access_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED },
+			{ test_kernel_read, &test_var, sizeof(test_var), 0 },
+		},
+	};
+	const struct expect_report expect_start2 = {
+		.access = { expect_start1.access[0], expect_start1.access[0] },
+	};
+	const struct expect_report expect_inscope = {
+		.access = {
+			{ test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED },
+			{ test_kernel_read, &test_var, sizeof(test_var), 0 },
+		},
+	};
+	bool match_expect_start = false;
+	bool match_expect_inscope = false;
+
+	begin_test_checks(test_kernel_assert_access_scoped, test_kernel_read);
+	end_time += msecs_to_jiffies(1000); /* This test requires a bit more time. */
+	do {
+		match_expect_start |= report_matches(&expect_start1) || report_matches(&expect_start2);
+		match_expect_inscope |= report_matches(&expect_inscope);
+	} while (!end_test_checks(match_expect_start && match_expect_inscope));
+	KUNIT_EXPECT_TRUE(test, match_expect_start);
+	KUNIT_EXPECT_TRUE(test, match_expect_inscope);
+}
+
+/* Test that racing accesses in seqlock critical sections are not reported. */
+__no_kcsan
+static void test_seqlock_noreport(struct kunit *test)
+{
+	bool match_never = false;
+
+	begin_test_checks(test_kernel_seqlock_reader, test_kernel_seqlock_writer);
+	do {
+		match_never = report_available();
+	} while (!end_test_checks(match_never));
+	KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/*
+ * Each test case is run with different numbers of threads. Until KUnit supports
+ * passing arguments for each test case, we encode #threads in the test case
+ * name (read by get_num_threads()). [The '-' was chosen as a stylistic
+ * preference to separate test name and #threads.]
+ *
+ * The thread counts are chosen to cover potentially interesting boundaries and
+ * corner cases (range 2-5), and then stress the system with larger counts.
+ */
+#define KCSAN_KUNIT_CASE(test_name)                                            \
+	{ .run_case = test_name, .name = #test_name "-02" },                   \
+	{ .run_case = test_name, .name = #test_name "-03" },                   \
+	{ .run_case = test_name, .name = #test_name "-04" },                   \
+	{ .run_case = test_name, .name = #test_name "-05" },                   \
+	{ .run_case = test_name, .name = #test_name "-08" },                   \
+	{ .run_case = test_name, .name = #test_name "-16" }
+
+static struct kunit_case kcsan_test_cases[] = {
+	KCSAN_KUNIT_CASE(test_basic),
+	KCSAN_KUNIT_CASE(test_concurrent_races),
+	KCSAN_KUNIT_CASE(test_novalue_change),
+	KCSAN_KUNIT_CASE(test_novalue_change_exception),
+	KCSAN_KUNIT_CASE(test_unknown_origin),
+	KCSAN_KUNIT_CASE(test_write_write_assume_atomic),
+	KCSAN_KUNIT_CASE(test_write_write_struct),
+	KCSAN_KUNIT_CASE(test_write_write_struct_part),
+	KCSAN_KUNIT_CASE(test_read_atomic_write_atomic),
+	KCSAN_KUNIT_CASE(test_read_plain_atomic_write),
+	KCSAN_KUNIT_CASE(test_zero_size_access),
+	KCSAN_KUNIT_CASE(test_data_race),
+	KCSAN_KUNIT_CASE(test_assert_exclusive_writer),
+	KCSAN_KUNIT_CASE(test_assert_exclusive_access),
+	KCSAN_KUNIT_CASE(test_assert_exclusive_access_writer),
+	KCSAN_KUNIT_CASE(test_assert_exclusive_bits_change),
+	KCSAN_KUNIT_CASE(test_assert_exclusive_bits_nochange),
+	KCSAN_KUNIT_CASE(test_assert_exclusive_writer_scoped),
+	KCSAN_KUNIT_CASE(test_assert_exclusive_access_scoped),
+	KCSAN_KUNIT_CASE(test_seqlock_noreport),
+	{},
+};
+
+/* ===== End test cases ===== */
+
+/* Get number of threads encoded in test name. */
+static bool __no_kcsan
+get_num_threads(const char *test, int *nthreads)
+{
+	int len = strlen(test);
+
+	if (WARN_ON(len < 3))
+		return false;
+
+	*nthreads = test[len - 1] - '0';
+	*nthreads += (test[len - 2] - '0') * 10;
+
+	if (WARN_ON(*nthreads < 0))
+		return false;
+
+	return true;
+}
+
+/* Concurrent accesses from interrupts. */
+__no_kcsan
+static void access_thread_timer(struct timer_list *timer)
+{
+	static atomic_t cnt = ATOMIC_INIT(0);
+	unsigned int idx;
+	void (*func)(void);
+
+	idx = (unsigned int)atomic_inc_return(&cnt) % ARRAY_SIZE(access_kernels);
+	/* Acquire potential initialization. */
+	func = smp_load_acquire(&access_kernels[idx]);
+	if (func)
+		func();
+}
+
+/* The main loop for each thread. */
+__no_kcsan
+static int access_thread(void *arg)
+{
+	struct timer_list timer;
+	unsigned int cnt = 0;
+	unsigned int idx;
+	void (*func)(void);
+
+	timer_setup_on_stack(&timer, access_thread_timer, 0);
+	do {
+		might_sleep();
+
+		if (!timer_pending(&timer))
+			mod_timer(&timer, jiffies + 1);
+		else {
+			/* Iterate through all kernels. */
+			idx = cnt++ % ARRAY_SIZE(access_kernels);
+			/* Acquire potential initialization. */
+			func = smp_load_acquire(&access_kernels[idx]);
+			if (func)
+				func();
+		}
+	} while (!torture_must_stop());
+	del_timer_sync(&timer);
+	destroy_timer_on_stack(&timer);
+
+	torture_kthread_stopping("access_thread");
+	return 0;
+}
+
+__no_kcsan
+static int test_init(struct kunit *test)
+{
+	unsigned long flags;
+	int nthreads;
+	int i;
+
+	spin_lock_irqsave(&observed.lock, flags);
+	for (i = 0; i < ARRAY_SIZE(observed.lines); ++i)
+		observed.lines[i][0] = '\0';
+	observed.nlines = 0;
+	spin_unlock_irqrestore(&observed.lock, flags);
+
+	if (!torture_init_begin((char *)test->name, 1))
+		return -EBUSY;
+
+	if (!get_num_threads(test->name, &nthreads))
+		goto err;
+
+	if (WARN_ON(threads))
+		goto err;
+
+	for (i = 0; i < ARRAY_SIZE(access_kernels); ++i) {
+		if (WARN_ON(access_kernels[i]))
+			goto err;
+	}
+
+	if (!IS_ENABLED(CONFIG_PREEMPT) || !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) {
+		/*
+		 * Without any preemption, keep 2 CPUs free for other tasks, one
+		 * of which is the main test case function checking for
+		 * completion or failure.
+		 */
+		const int min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 2 : 0;
+		const int min_required_cpus = 2 + min_unused_cpus;
+
+		if (num_online_cpus() < min_required_cpus) {
+			pr_err("%s: too few online CPUs (%u < %d) for test",
+			       test->name, num_online_cpus(), min_required_cpus);
+			goto err;
+		} else if (nthreads > num_online_cpus() - min_unused_cpus) {
+			nthreads = num_online_cpus() - min_unused_cpus;
+			pr_warn("%s: limiting number of threads to %d\n",
+				test->name, nthreads);
+		}
+	}
+
+	if (nthreads) {
+		threads = kcalloc(nthreads + 1, sizeof(struct task_struct *),
+				  GFP_KERNEL);
+		if (WARN_ON(!threads))
+			goto err;
+
+		threads[nthreads] = NULL;
+		for (i = 0; i < nthreads; ++i) {
+			if (torture_create_kthread(access_thread, NULL,
+						   threads[i]))
+				goto err;
+		}
+	}
+
+	torture_init_end();
+
+	return 0;
+
+err:
+	kfree(threads);
+	threads = NULL;
+	torture_init_end();
+	return -EINVAL;
+}
+
+__no_kcsan
+static void test_exit(struct kunit *test)
+{
+	struct task_struct **stop_thread;
+	int i;
+
+	if (torture_cleanup_begin())
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(access_kernels); ++i)
+		WRITE_ONCE(access_kernels[i], NULL);
+
+	if (threads) {
+		for (stop_thread = threads; *stop_thread; stop_thread++)
+			torture_stop_kthread(reader_thread, *stop_thread);
+
+		kfree(threads);
+		threads = NULL;
+	}
+
+	torture_cleanup_end();
+}
+
+static struct kunit_suite kcsan_test_suite = {
+	.name = "kcsan-test",
+	.test_cases = kcsan_test_cases,
+	.init = test_init,
+	.exit = test_exit,
+};
+static struct kunit_suite *kcsan_test_suites[] = { &kcsan_test_suite, NULL };
+
+__no_kcsan
+static void register_tracepoints(struct tracepoint *tp, void *ignore)
+{
+	check_trace_callback_type_console(probe_console);
+	if (!strcmp(tp->name, "console"))
+		WARN_ON(tracepoint_probe_register(tp, probe_console, NULL));
+}
+
+__no_kcsan
+static void unregister_tracepoints(struct tracepoint *tp, void *ignore)
+{
+	if (!strcmp(tp->name, "console"))
+		tracepoint_probe_unregister(tp, probe_console, NULL);
+}
+
+/*
+ * We only want to do tracepoints setup and teardown once, therefore we have to
+ * customize the init and exit functions and cannot rely on kunit_test_suite().
+ */
+static int __init kcsan_test_init(void)
+{
+	/*
+	 * Because we want to be able to build the test as a module, we need to
+	 * iterate through all known tracepoints, since the static registration
+	 * won't work here.
+	 */
+	for_each_kernel_tracepoint(register_tracepoints, NULL);
+	return __kunit_test_suites_init(kcsan_test_suites);
+}
+
+static void kcsan_test_exit(void)
+{
+	__kunit_test_suites_exit(kcsan_test_suites);
+	for_each_kernel_tracepoint(unregister_tracepoints, NULL);
+	tracepoint_synchronize_unregister();
+}
+
+late_initcall(kcsan_test_init);
+module_exit(kcsan_test_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Marco Elver <elver@google.com>");
diff --git a/lib/Kconfig.kcsan b/lib/Kconfig.kcsan
index 5ee88e5119c2..3f3b5bca7a8f 100644
--- a/lib/Kconfig.kcsan
+++ b/lib/Kconfig.kcsan
@@ -59,7 +59,28 @@ config KCSAN_SELFTEST
 	bool "Perform short selftests on boot"
 	default y
 	help
-	  Run KCSAN selftests on boot. On test failure, causes the kernel to panic.
+	  Run KCSAN selftests on boot. On test failure, causes the kernel to
+	  panic. Recommended to be enabled, ensuring critical functionality
+	  works as intended.
+
+config KCSAN_TEST
+	tristate "KCSAN test for integrated runtime behaviour"
+	depends on TRACEPOINTS && KUNIT
+	select TORTURE_TEST
+	help
+	  KCSAN test focusing on behaviour of the integrated runtime. Tests
+	  various race scenarios, and verifies the reports generated to
+	  console. Makes use of KUnit for test organization, and the Torture
+	  framework for test thread control.
+
+	  Each test case may run at least up to KCSAN_REPORT_ONCE_IN_MS
+	  milliseconds. Test run duration may be optimized by building the
+	  kernel and KCSAN test with KCSAN_REPORT_ONCE_IN_MS set to a lower
+	  than default value.
+
+	  Say Y here if you want the test to be built into the kernel and run
+	  during boot; say M if you want the test to build as a module; say N
+	  if you are unsure.
 
 config KCSAN_EARLY_ENABLE
 	bool "Early enable during boot"
-- 
cgit v1.2.3


From 33190b675ce2eacbeb4e75168c05b41110b506ec Mon Sep 17 00:00:00 2001
From: Qian Cai <cai@lca.pw>
Date: Tue, 11 Feb 2020 08:54:15 -0500
Subject: locking/osq_lock: Annotate a data race in osq_lock

The prev->next pointer can be accessed concurrently as noticed by KCSAN:

 write (marked) to 0xffff9d3370dbbe40 of 8 bytes by task 3294 on cpu 107:
  osq_lock+0x25f/0x350
  osq_wait_next at kernel/locking/osq_lock.c:79
  (inlined by) osq_lock at kernel/locking/osq_lock.c:185
  rwsem_optimistic_spin
  <snip>

 read to 0xffff9d3370dbbe40 of 8 bytes by task 3398 on cpu 100:
  osq_lock+0x196/0x350
  osq_lock at kernel/locking/osq_lock.c:157
  rwsem_optimistic_spin
  <snip>

Since the write only stores NULL to prev->next and the read tests if
prev->next equals to this_cpu_ptr(&osq_node). Even if the value is
shattered, the code is still working correctly. Thus, mark it as an
intentional data race using the data_race() macro.

Signed-off-by: Qian Cai <cai@lca.pw>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/locking/osq_lock.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index 1f7734949ac8..1de006ed3aa8 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -154,7 +154,11 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 	 */
 
 	for (;;) {
-		if (prev->next == node &&
+		/*
+		 * cpu_relax() below implies a compiler barrier which would
+		 * prevent this comparison being optimized away.
+		 */
+		if (data_race(prev->next) == node &&
 		    cmpxchg(&prev->next, node, NULL) == node)
 			break;
 
-- 
cgit v1.2.3


From 2888557f68db334a3839dcc262264a4c436f576b Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Tue, 2 Jun 2020 16:36:33 +0200
Subject: kcsan: Prefer '__no_kcsan inline' in test

Instead of __no_kcsan_or_inline, prefer '__no_kcsan inline' in test --
this is in case we decide to remove __no_kcsan_or_inline.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/kcsan/kcsan-test.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c
index a8c11506dd2a..3af420ad6ee7 100644
--- a/kernel/kcsan/kcsan-test.c
+++ b/kernel/kcsan/kcsan-test.c
@@ -43,7 +43,7 @@ static struct {
 };
 
 /* Setup test checking loop. */
-static __no_kcsan_or_inline void
+static __no_kcsan inline void
 begin_test_checks(void (*func1)(void), void (*func2)(void))
 {
 	kcsan_disable_current();
@@ -60,7 +60,7 @@ begin_test_checks(void (*func1)(void), void (*func2)(void))
 }
 
 /* End test checking loop. */
-static __no_kcsan_or_inline bool
+static __no_kcsan inline bool
 end_test_checks(bool stop)
 {
 	if (!stop && time_before(jiffies, end_time)) {
-- 
cgit v1.2.3


From 9dd979bae4cf76558ff816abe83283308fb1ae8c Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Tue, 16 Jun 2020 14:36:22 +0200
Subject: kcsan: Silence -Wmissing-prototypes warning with W=1

The functions here should not be forward declared for explicit use
elsewhere in the kernel, as they should only be emitted by the compiler
due to sanitizer instrumentation.  Add forward declarations a line above
their definition to shut up warnings in W=1 builds.

Link: https://lkml.kernel.org/r/202006060103.jSCpnV1g%lkp@intel.com
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/kcsan/core.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 15f67949d11e..1866bafda4fd 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -754,6 +754,7 @@ EXPORT_SYMBOL(__kcsan_check_access);
  */
 
 #define DEFINE_TSAN_READ_WRITE(size)                                           \
+	void __tsan_read##size(void *ptr);                                     \
 	void __tsan_read##size(void *ptr)                                      \
 	{                                                                      \
 		check_access(ptr, size, 0);                                    \
@@ -762,6 +763,7 @@ EXPORT_SYMBOL(__kcsan_check_access);
 	void __tsan_unaligned_read##size(void *ptr)                            \
 		__alias(__tsan_read##size);                                    \
 	EXPORT_SYMBOL(__tsan_unaligned_read##size);                            \
+	void __tsan_write##size(void *ptr);                                    \
 	void __tsan_write##size(void *ptr)                                     \
 	{                                                                      \
 		check_access(ptr, size, KCSAN_ACCESS_WRITE);                   \
@@ -777,12 +779,14 @@ DEFINE_TSAN_READ_WRITE(4);
 DEFINE_TSAN_READ_WRITE(8);
 DEFINE_TSAN_READ_WRITE(16);
 
+void __tsan_read_range(void *ptr, size_t size);
 void __tsan_read_range(void *ptr, size_t size)
 {
 	check_access(ptr, size, 0);
 }
 EXPORT_SYMBOL(__tsan_read_range);
 
+void __tsan_write_range(void *ptr, size_t size);
 void __tsan_write_range(void *ptr, size_t size)
 {
 	check_access(ptr, size, KCSAN_ACCESS_WRITE);
@@ -799,6 +803,7 @@ EXPORT_SYMBOL(__tsan_write_range);
  * the size-check of compiletime_assert_rwonce_type().
  */
 #define DEFINE_TSAN_VOLATILE_READ_WRITE(size)                                  \
+	void __tsan_volatile_read##size(void *ptr);                            \
 	void __tsan_volatile_read##size(void *ptr)                             \
 	{                                                                      \
 		const bool is_atomic = size <= sizeof(long long) &&            \
@@ -811,6 +816,7 @@ EXPORT_SYMBOL(__tsan_write_range);
 	void __tsan_unaligned_volatile_read##size(void *ptr)                   \
 		__alias(__tsan_volatile_read##size);                           \
 	EXPORT_SYMBOL(__tsan_unaligned_volatile_read##size);                   \
+	void __tsan_volatile_write##size(void *ptr);                           \
 	void __tsan_volatile_write##size(void *ptr)                            \
 	{                                                                      \
 		const bool is_atomic = size <= sizeof(long long) &&            \
@@ -836,14 +842,17 @@ DEFINE_TSAN_VOLATILE_READ_WRITE(16);
  * The below are not required by KCSAN, but can still be emitted by the
  * compiler.
  */
+void __tsan_func_entry(void *call_pc);
 void __tsan_func_entry(void *call_pc)
 {
 }
 EXPORT_SYMBOL(__tsan_func_entry);
+void __tsan_func_exit(void);
 void __tsan_func_exit(void)
 {
 }
 EXPORT_SYMBOL(__tsan_func_exit);
+void __tsan_init(void);
 void __tsan_init(void)
 {
 }
-- 
cgit v1.2.3


From acfa087ccf2d2eff46186477f53e4c3ffbdb033d Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Tue, 16 Jun 2020 14:36:23 +0200
Subject: kcsan: Rename test.c to selftest.c

Rename 'test.c' to 'selftest.c' to better reflect its purpose (Kconfig
variable and code inside already match this). This is to avoid confusion
with the test suite module in 'kcsan-test.c'.

No functional change.

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/kcsan/Makefile   |   2 +-
 kernel/kcsan/selftest.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/kcsan/test.c     | 131 ------------------------------------------------
 3 files changed, 132 insertions(+), 132 deletions(-)
 create mode 100644 kernel/kcsan/selftest.c
 delete mode 100644 kernel/kcsan/test.c

(limited to 'kernel')

diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index 14533cf24bc3..092ce58d2e56 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -11,7 +11,7 @@ CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \
 	$(call cc-option,-fno-stack-protector,)
 
 obj-y := core.o debugfs.o report.o
-obj-$(CONFIG_KCSAN_SELFTEST) += test.o
+obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o
 
 CFLAGS_kcsan-test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer
 obj-$(CONFIG_KCSAN_TEST) += kcsan-test.o
diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c
new file mode 100644
index 000000000000..d26a052d3383
--- /dev/null
+++ b/kernel/kcsan/selftest.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/printk.h>
+#include <linux/random.h>
+#include <linux/types.h>
+
+#include "encoding.h"
+
+#define ITERS_PER_TEST 2000
+
+/* Test requirements. */
+static bool test_requires(void)
+{
+	/* random should be initialized for the below tests */
+	return prandom_u32() + prandom_u32() != 0;
+}
+
+/*
+ * Test watchpoint encode and decode: check that encoding some access's info,
+ * and then subsequent decode preserves the access's info.
+ */
+static bool test_encode_decode(void)
+{
+	int i;
+
+	for (i = 0; i < ITERS_PER_TEST; ++i) {
+		size_t size = prandom_u32_max(MAX_ENCODABLE_SIZE) + 1;
+		bool is_write = !!prandom_u32_max(2);
+		unsigned long addr;
+
+		prandom_bytes(&addr, sizeof(addr));
+		if (WARN_ON(!check_encodable(addr, size)))
+			return false;
+
+		/* Encode and decode */
+		{
+			const long encoded_watchpoint =
+				encode_watchpoint(addr, size, is_write);
+			unsigned long verif_masked_addr;
+			size_t verif_size;
+			bool verif_is_write;
+
+			/* Check special watchpoints */
+			if (WARN_ON(decode_watchpoint(
+				    INVALID_WATCHPOINT, &verif_masked_addr,
+				    &verif_size, &verif_is_write)))
+				return false;
+			if (WARN_ON(decode_watchpoint(
+				    CONSUMED_WATCHPOINT, &verif_masked_addr,
+				    &verif_size, &verif_is_write)))
+				return false;
+
+			/* Check decoding watchpoint returns same data */
+			if (WARN_ON(!decode_watchpoint(
+				    encoded_watchpoint, &verif_masked_addr,
+				    &verif_size, &verif_is_write)))
+				return false;
+			if (WARN_ON(verif_masked_addr !=
+				    (addr & WATCHPOINT_ADDR_MASK)))
+				goto fail;
+			if (WARN_ON(verif_size != size))
+				goto fail;
+			if (WARN_ON(is_write != verif_is_write))
+				goto fail;
+
+			continue;
+fail:
+			pr_err("%s fail: %s %zu bytes @ %lx -> encoded: %lx -> %s %zu bytes @ %lx\n",
+			       __func__, is_write ? "write" : "read", size,
+			       addr, encoded_watchpoint,
+			       verif_is_write ? "write" : "read", verif_size,
+			       verif_masked_addr);
+			return false;
+		}
+	}
+
+	return true;
+}
+
+/* Test access matching function. */
+static bool test_matching_access(void)
+{
+	if (WARN_ON(!matching_access(10, 1, 10, 1)))
+		return false;
+	if (WARN_ON(!matching_access(10, 2, 11, 1)))
+		return false;
+	if (WARN_ON(!matching_access(10, 1, 9, 2)))
+		return false;
+	if (WARN_ON(matching_access(10, 1, 11, 1)))
+		return false;
+	if (WARN_ON(matching_access(9, 1, 10, 1)))
+		return false;
+
+	/*
+	 * An access of size 0 could match another access, as demonstrated here.
+	 * Rather than add more comparisons to 'matching_access()', which would
+	 * end up in the fast-path for *all* checks, check_access() simply
+	 * returns for all accesses of size 0.
+	 */
+	if (WARN_ON(!matching_access(8, 8, 12, 0)))
+		return false;
+
+	return true;
+}
+
+static int __init kcsan_selftest(void)
+{
+	int passed = 0;
+	int total = 0;
+
+#define RUN_TEST(do_test)                                                      \
+	do {                                                                   \
+		++total;                                                       \
+		if (do_test())                                                 \
+			++passed;                                              \
+		else                                                           \
+			pr_err("KCSAN selftest: " #do_test " failed");         \
+	} while (0)
+
+	RUN_TEST(test_requires);
+	RUN_TEST(test_encode_decode);
+	RUN_TEST(test_matching_access);
+
+	pr_info("KCSAN selftest: %d/%d tests passed\n", passed, total);
+	if (passed != total)
+		panic("KCSAN selftests failed");
+	return 0;
+}
+postcore_initcall(kcsan_selftest);
diff --git a/kernel/kcsan/test.c b/kernel/kcsan/test.c
deleted file mode 100644
index d26a052d3383..000000000000
--- a/kernel/kcsan/test.c
+++ /dev/null
@@ -1,131 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/printk.h>
-#include <linux/random.h>
-#include <linux/types.h>
-
-#include "encoding.h"
-
-#define ITERS_PER_TEST 2000
-
-/* Test requirements. */
-static bool test_requires(void)
-{
-	/* random should be initialized for the below tests */
-	return prandom_u32() + prandom_u32() != 0;
-}
-
-/*
- * Test watchpoint encode and decode: check that encoding some access's info,
- * and then subsequent decode preserves the access's info.
- */
-static bool test_encode_decode(void)
-{
-	int i;
-
-	for (i = 0; i < ITERS_PER_TEST; ++i) {
-		size_t size = prandom_u32_max(MAX_ENCODABLE_SIZE) + 1;
-		bool is_write = !!prandom_u32_max(2);
-		unsigned long addr;
-
-		prandom_bytes(&addr, sizeof(addr));
-		if (WARN_ON(!check_encodable(addr, size)))
-			return false;
-
-		/* Encode and decode */
-		{
-			const long encoded_watchpoint =
-				encode_watchpoint(addr, size, is_write);
-			unsigned long verif_masked_addr;
-			size_t verif_size;
-			bool verif_is_write;
-
-			/* Check special watchpoints */
-			if (WARN_ON(decode_watchpoint(
-				    INVALID_WATCHPOINT, &verif_masked_addr,
-				    &verif_size, &verif_is_write)))
-				return false;
-			if (WARN_ON(decode_watchpoint(
-				    CONSUMED_WATCHPOINT, &verif_masked_addr,
-				    &verif_size, &verif_is_write)))
-				return false;
-
-			/* Check decoding watchpoint returns same data */
-			if (WARN_ON(!decode_watchpoint(
-				    encoded_watchpoint, &verif_masked_addr,
-				    &verif_size, &verif_is_write)))
-				return false;
-			if (WARN_ON(verif_masked_addr !=
-				    (addr & WATCHPOINT_ADDR_MASK)))
-				goto fail;
-			if (WARN_ON(verif_size != size))
-				goto fail;
-			if (WARN_ON(is_write != verif_is_write))
-				goto fail;
-
-			continue;
-fail:
-			pr_err("%s fail: %s %zu bytes @ %lx -> encoded: %lx -> %s %zu bytes @ %lx\n",
-			       __func__, is_write ? "write" : "read", size,
-			       addr, encoded_watchpoint,
-			       verif_is_write ? "write" : "read", verif_size,
-			       verif_masked_addr);
-			return false;
-		}
-	}
-
-	return true;
-}
-
-/* Test access matching function. */
-static bool test_matching_access(void)
-{
-	if (WARN_ON(!matching_access(10, 1, 10, 1)))
-		return false;
-	if (WARN_ON(!matching_access(10, 2, 11, 1)))
-		return false;
-	if (WARN_ON(!matching_access(10, 1, 9, 2)))
-		return false;
-	if (WARN_ON(matching_access(10, 1, 11, 1)))
-		return false;
-	if (WARN_ON(matching_access(9, 1, 10, 1)))
-		return false;
-
-	/*
-	 * An access of size 0 could match another access, as demonstrated here.
-	 * Rather than add more comparisons to 'matching_access()', which would
-	 * end up in the fast-path for *all* checks, check_access() simply
-	 * returns for all accesses of size 0.
-	 */
-	if (WARN_ON(!matching_access(8, 8, 12, 0)))
-		return false;
-
-	return true;
-}
-
-static int __init kcsan_selftest(void)
-{
-	int passed = 0;
-	int total = 0;
-
-#define RUN_TEST(do_test)                                                      \
-	do {                                                                   \
-		++total;                                                       \
-		if (do_test())                                                 \
-			++passed;                                              \
-		else                                                           \
-			pr_err("KCSAN selftest: " #do_test " failed");         \
-	} while (0)
-
-	RUN_TEST(test_requires);
-	RUN_TEST(test_encode_decode);
-	RUN_TEST(test_matching_access);
-
-	pr_info("KCSAN selftest: %d/%d tests passed\n", passed, total);
-	if (passed != total)
-		panic("KCSAN selftests failed");
-	return 0;
-}
-postcore_initcall(kcsan_selftest);
-- 
cgit v1.2.3


From 7e766560e6e2c1cf2782f00e63c31564e4c9f0fe Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Tue, 16 Jun 2020 14:36:24 +0200
Subject: kcsan: Remove existing special atomic rules

Remove existing special atomic rules from kcsan_is_atomic_special()
because they are no longer needed. Since we rely on the compiler
emitting instrumentation distinguishing volatile accesses, the rules
have become redundant.

Let's keep kcsan_is_atomic_special() around, so that we have an obvious
place to add special rules should the need arise in future.

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/kcsan/atomic.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h
index be9e625227f3..75fe701f4127 100644
--- a/kernel/kcsan/atomic.h
+++ b/kernel/kcsan/atomic.h
@@ -3,8 +3,7 @@
 #ifndef _KERNEL_KCSAN_ATOMIC_H
 #define _KERNEL_KCSAN_ATOMIC_H
 
-#include <linux/jiffies.h>
-#include <linux/sched.h>
+#include <linux/types.h>
 
 /*
  * Special rules for certain memory where concurrent conflicting accesses are
@@ -13,8 +12,7 @@
  */
 static bool kcsan_is_atomic_special(const volatile void *ptr)
 {
-	/* volatile globals that have been observed in data races. */
-	return ptr == &jiffies || ptr == &current->state;
+	return false;
 }
 
 #endif /* _KERNEL_KCSAN_ATOMIC_H */
-- 
cgit v1.2.3


From 56b031f0abf55254d47a329010574733fa9a27b8 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Tue, 16 Jun 2020 14:36:25 +0200
Subject: kcsan: Add jiffies test to test suite

Add a test that KCSAN nor the compiler gets confused about accesses to
jiffies on different architectures.

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/kcsan/kcsan-test.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c
index 3af420ad6ee7..fed6fcb5768c 100644
--- a/kernel/kcsan/kcsan-test.c
+++ b/kernel/kcsan/kcsan-test.c
@@ -366,6 +366,11 @@ static noinline void test_kernel_read_struct_zero_size(void)
 	kcsan_check_read(&test_struct.val[3], 0);
 }
 
+static noinline void test_kernel_jiffies_reader(void)
+{
+	sink_value((long)jiffies);
+}
+
 static noinline void test_kernel_seqlock_reader(void)
 {
 	unsigned int seq;
@@ -817,6 +822,23 @@ static void test_assert_exclusive_access_scoped(struct kunit *test)
 	KUNIT_EXPECT_TRUE(test, match_expect_inscope);
 }
 
+/*
+ * jiffies is special (declared to be volatile) and its accesses are typically
+ * not marked; this test ensures that the compiler nor KCSAN gets confused about
+ * jiffies's declaration on different architectures.
+ */
+__no_kcsan
+static void test_jiffies_noreport(struct kunit *test)
+{
+	bool match_never = false;
+
+	begin_test_checks(test_kernel_jiffies_reader, test_kernel_jiffies_reader);
+	do {
+		match_never = report_available();
+	} while (!end_test_checks(match_never));
+	KUNIT_EXPECT_FALSE(test, match_never);
+}
+
 /* Test that racing accesses in seqlock critical sections are not reported. */
 __no_kcsan
 static void test_seqlock_noreport(struct kunit *test)
@@ -867,6 +889,7 @@ static struct kunit_case kcsan_test_cases[] = {
 	KCSAN_KUNIT_CASE(test_assert_exclusive_bits_nochange),
 	KCSAN_KUNIT_CASE(test_assert_exclusive_writer_scoped),
 	KCSAN_KUNIT_CASE(test_assert_exclusive_access_scoped),
+	KCSAN_KUNIT_CASE(test_jiffies_noreport),
 	KCSAN_KUNIT_CASE(test_seqlock_noreport),
 	{},
 };
-- 
cgit v1.2.3


From 2839a232071f588d334543fb86f5689b43353842 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 18 Jun 2020 11:31:17 +0200
Subject: kcsan: Simplify compiler flags

Simplify the set of compiler flags for the runtime by removing cc-option
from -fno-stack-protector, because all supported compilers support it.
This saves us one compiler invocation during build.

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/kcsan/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index 092ce58d2e56..fea064afc4f7 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -7,8 +7,8 @@ CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)
 
-CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \
-	$(call cc-option,-fno-stack-protector,)
+CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \
+	-fno-stack-protector
 
 obj-y := core.o debugfs.o report.o
 obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o
-- 
cgit v1.2.3


From 61d56d7aa5eca3b909bce51ba8125b0fa44d7e17 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 18 Jun 2020 11:31:18 +0200
Subject: kcsan: Disable branch tracing in core runtime

Disable branch tracing in core KCSAN runtime if branches are being
traced (TRACE_BRANCH_PROFILING). This it to avoid its performance
impact, but also avoid recursion in case KCSAN is enabled for the branch
tracing runtime.

The latter had already been a problem for KASAN:
https://lore.kernel.org/lkml/CANpmjNOeXmD5E3O50Z3MjkiuCYaYOPyi+1rq=GZvEKwBvLR0Ug@mail.gmail.com/

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/kcsan/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index fea064afc4f7..65ca5539c470 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -8,7 +8,7 @@ CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)
 
 CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \
-	-fno-stack-protector
+	-fno-stack-protector -DDISABLE_BRANCH_PROFILING
 
 obj-y := core.o debugfs.o report.o
 obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o
-- 
cgit v1.2.3


From 142240398e50e5fe3171bcf2459856603be13a39 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Sat, 27 Jun 2020 23:24:19 -0400
Subject: audit: add gfp parameter to audit_log_nfcfg

Fixed an inconsistent use of GFP flags in nft_obj_notify() that used
GFP_KERNEL when a GFP flag was passed in to that function.  Given this
allocated memory was then used in audit_log_nfcfg() it led to an audit
of all other GFP allocations in net/netfilter/nf_tables_api.c and a
modification of audit_log_nfcfg() to accept a GFP parameter.

Reported-by: Dan Carptenter <dan.carpenter@oracle.com>
Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/audit.h           |  8 ++++----
 kernel/auditsc.c                |  4 ++--
 net/bridge/netfilter/ebtables.c |  6 +++---
 net/netfilter/nf_tables_api.c   | 33 +++++++++++++++++++++------------
 net/netfilter/x_tables.c        |  5 +++--
 5 files changed, 33 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 604ede630580..d93739f7a35a 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -404,7 +404,7 @@ extern void __audit_fanotify(unsigned int response);
 extern void __audit_tk_injoffset(struct timespec64 offset);
 extern void __audit_ntp_log(const struct audit_ntp_data *ad);
 extern void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
-			      enum audit_nfcfgop op);
+			      enum audit_nfcfgop op, gfp_t gfp);
 
 static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
 {
@@ -542,10 +542,10 @@ static inline void audit_ntp_log(const struct audit_ntp_data *ad)
 
 static inline void audit_log_nfcfg(const char *name, u8 af,
 				   unsigned int nentries,
-				   enum audit_nfcfgop op)
+				   enum audit_nfcfgop op, gfp_t gfp)
 {
 	if (audit_enabled)
-		__audit_log_nfcfg(name, af, nentries, op);
+		__audit_log_nfcfg(name, af, nentries, op, gfp);
 }
 
 extern int audit_n_rules;
@@ -683,7 +683,7 @@ static inline void audit_ptrace(struct task_struct *t)
 
 static inline void audit_log_nfcfg(const char *name, u8 af,
 				   unsigned int nentries,
-				   enum audit_nfcfgop op)
+				   enum audit_nfcfgop op, gfp_t gfp)
 { }
 
 #define audit_n_rules 0
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3a9100e95fda..eae1a599ffe3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2572,12 +2572,12 @@ void __audit_ntp_log(const struct audit_ntp_data *ad)
 }
 
 void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
-		       enum audit_nfcfgop op)
+		       enum audit_nfcfgop op, gfp_t gfp)
 {
 	struct audit_buffer *ab;
 	char comm[sizeof(current->comm)];
 
-	ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_NETFILTER_CFG);
+	ab = audit_log_start(audit_context(), gfp, AUDIT_NETFILTER_CFG);
 	if (!ab)
 		return;
 	audit_log_format(ab, "table=%s family=%u entries=%u op=%s",
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index c83ffe912163..b13b49b9f75c 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1047,7 +1047,7 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
 	vfree(counterstmp);
 
 	audit_log_nfcfg(repl->name, AF_BRIDGE, repl->nentries,
-			AUDIT_XT_OP_REPLACE);
+			AUDIT_XT_OP_REPLACE, GFP_KERNEL);
 	return ret;
 
 free_unlock:
@@ -1123,7 +1123,7 @@ static void __ebt_unregister_table(struct net *net, struct ebt_table *table)
 	list_del(&table->list);
 	mutex_unlock(&ebt_mutex);
 	audit_log_nfcfg(table->name, AF_BRIDGE, table->private->nentries,
-			AUDIT_XT_OP_UNREGISTER);
+			AUDIT_XT_OP_UNREGISTER, GFP_KERNEL);
 	EBT_ENTRY_ITERATE(table->private->entries, table->private->entries_size,
 			  ebt_cleanup_entry, net, NULL);
 	if (table->private->nentries)
@@ -1218,7 +1218,7 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table,
 	}
 
 	audit_log_nfcfg(repl->name, AF_BRIDGE, repl->nentries,
-			AUDIT_XT_OP_REGISTER);
+			AUDIT_XT_OP_REGISTER, GFP_KERNEL);
 	return ret;
 free_unlock:
 	mutex_unlock(&ebt_mutex);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 164700273947..f7ff91479647 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -702,7 +702,8 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event)
 			ctx->table->use,
 			event == NFT_MSG_NEWTABLE ?
 				AUDIT_NFT_OP_TABLE_REGISTER :
-				AUDIT_NFT_OP_TABLE_UNREGISTER);
+				AUDIT_NFT_OP_TABLE_UNREGISTER,
+			GFP_KERNEL);
 	kfree(buf);
 
 	if (!ctx->report &&
@@ -1448,7 +1449,8 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
 			ctx->chain->use,
 			event == NFT_MSG_NEWCHAIN ?
 				AUDIT_NFT_OP_CHAIN_REGISTER :
-				AUDIT_NFT_OP_CHAIN_UNREGISTER);
+				AUDIT_NFT_OP_CHAIN_UNREGISTER,
+			GFP_KERNEL);
 	kfree(buf);
 
 	if (!ctx->report &&
@@ -2724,7 +2726,8 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx,
 			rule->handle,
 			event == NFT_MSG_NEWRULE ?
 				AUDIT_NFT_OP_RULE_REGISTER :
-				AUDIT_NFT_OP_RULE_UNREGISTER);
+				AUDIT_NFT_OP_RULE_UNREGISTER,
+			GFP_KERNEL);
 	kfree(buf);
 
 	if (!ctx->report &&
@@ -3737,7 +3740,8 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx,
 			set->field_count,
 			event == NFT_MSG_NEWSET ?
 				AUDIT_NFT_OP_SET_REGISTER :
-				AUDIT_NFT_OP_SET_UNREGISTER);
+				AUDIT_NFT_OP_SET_UNREGISTER,
+			gfp_flags);
 	kfree(buf);
 
 	if (!ctx->report &&
@@ -4864,7 +4868,8 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx,
 			set->handle,
 			event == NFT_MSG_NEWSETELEM ?
 				AUDIT_NFT_OP_SETELEM_REGISTER :
-				AUDIT_NFT_OP_SETELEM_UNREGISTER);
+				AUDIT_NFT_OP_SETELEM_UNREGISTER,
+			GFP_KERNEL);
 	kfree(buf);
 
 	if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
@@ -5956,7 +5961,8 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
 				audit_log_nfcfg(buf,
 						family,
 						obj->handle,
-						AUDIT_NFT_OP_OBJ_RESET);
+						AUDIT_NFT_OP_OBJ_RESET,
+						GFP_KERNEL);
 				kfree(buf);
 			}
 
@@ -6071,13 +6077,14 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
 		reset = true;
 
 	if (reset) {
-		char *buf = kasprintf(GFP_KERNEL, "%s:%llu;?:0",
+		char *buf = kasprintf(GFP_ATOMIC, "%s:%llu;?:0",
 				      table->name, table->handle);
 
 		audit_log_nfcfg(buf,
 				family,
 				obj->handle,
-				AUDIT_NFT_OP_OBJ_RESET);
+				AUDIT_NFT_OP_OBJ_RESET,
+				GFP_KERNEL);
 		kfree(buf);
 	}
 
@@ -6156,7 +6163,7 @@ void nft_obj_notify(struct net *net, const struct nft_table *table,
 {
 	struct sk_buff *skb;
 	int err;
-	char *buf = kasprintf(GFP_KERNEL, "%s:%llu;?:0",
+	char *buf = kasprintf(gfp, "%s:%llu;?:0",
 			      table->name, table->handle);
 
 	audit_log_nfcfg(buf,
@@ -6164,7 +6171,8 @@ void nft_obj_notify(struct net *net, const struct nft_table *table,
 			obj->handle,
 			event == NFT_MSG_NEWOBJ ?
 				AUDIT_NFT_OP_OBJ_REGISTER :
-				AUDIT_NFT_OP_OBJ_UNREGISTER);
+				AUDIT_NFT_OP_OBJ_UNREGISTER,
+			GFP_KERNEL);
 	kfree(buf);
 
 	if (!report &&
@@ -6954,7 +6962,8 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
 			flowtable->hooknum,
 			event == NFT_MSG_NEWFLOWTABLE ?
 				AUDIT_NFT_OP_FLOWTABLE_REGISTER :
-				AUDIT_NFT_OP_FLOWTABLE_UNREGISTER);
+				AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,
+			GFP_KERNEL);
 	kfree(buf);
 
 	if (ctx->report &&
@@ -7078,7 +7087,7 @@ static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb,
 	int err;
 
 	audit_log_nfcfg("?:0;?:0", 0, net->nft.base_seq,
-			AUDIT_NFT_OP_GEN_REGISTER);
+			AUDIT_NFT_OP_GEN_REGISTER, GFP_KERNEL);
 
 	if (nlmsg_report(nlh) &&
 	    !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 99a468be4a59..9ad8f3ff66f5 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1410,7 +1410,8 @@ xt_replace_table(struct xt_table *table,
 
 	audit_log_nfcfg(table->name, table->af, private->number,
 			!private->number ? AUDIT_XT_OP_REGISTER :
-					   AUDIT_XT_OP_REPLACE);
+					   AUDIT_XT_OP_REPLACE,
+			GFP_KERNEL);
 	return private;
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
@@ -1473,7 +1474,7 @@ void *xt_unregister_table(struct xt_table *table)
 	list_del(&table->list);
 	mutex_unlock(&xt[table->af].mutex);
 	audit_log_nfcfg(table->name, table->af, private->number,
-			AUDIT_XT_OP_UNREGISTER);
+			AUDIT_XT_OP_UNREGISTER, GFP_KERNEL);
 	kfree(table);
 
 	return private;
-- 
cgit v1.2.3


From 7ef282e05132d56b6f6b71e3873f317664bea78b Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Mon, 29 Jun 2020 23:45:56 -0400
Subject: tracing: Move pipe reference to trace array instead of current_tracer

If a process has the trace_pipe open on a trace_array, the current tracer
for that trace array should not be changed. This was original enforced by a
global lock, but when instances were introduced, it was moved to the
current_trace. But this structure is shared by all instances, and a
trace_pipe is for a single instance. There's no reason that a process that
has trace_pipe open on one instance should prevent another instance from
changing its current tracer. Move the reference counter to the trace_array
instead.

This is marked as "Fixes" but is more of a clean up than a true fix.
Backport if you want, but its not critical.

Fixes: cf6ab6d9143b1 ("tracing: Add ref count to tracer for when they are being read by pipe")
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 12 ++++++------
 kernel/trace/trace.h |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8241d1448d70..64c5b8146cca 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5891,7 +5891,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
 	}
 
 	/* If trace pipe files are being read, we can't change the tracer */
-	if (tr->current_trace->ref) {
+	if (tr->trace_ref) {
 		ret = -EBUSY;
 		goto out;
 	}
@@ -6107,7 +6107,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 
 	nonseekable_open(inode, filp);
 
-	tr->current_trace->ref++;
+	tr->trace_ref++;
 out:
 	mutex_unlock(&trace_types_lock);
 	return ret;
@@ -6126,7 +6126,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
 
 	mutex_lock(&trace_types_lock);
 
-	tr->current_trace->ref--;
+	tr->trace_ref--;
 
 	if (iter->trace->pipe_close)
 		iter->trace->pipe_close(iter);
@@ -7428,7 +7428,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
 
 	filp->private_data = info;
 
-	tr->current_trace->ref++;
+	tr->trace_ref++;
 
 	mutex_unlock(&trace_types_lock);
 
@@ -7529,7 +7529,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
 
 	mutex_lock(&trace_types_lock);
 
-	iter->tr->current_trace->ref--;
+	iter->tr->trace_ref--;
 
 	__trace_array_put(iter->tr);
 
@@ -8737,7 +8737,7 @@ static int __remove_instance(struct trace_array *tr)
 	int i;
 
 	/* Reference counter for a newly created trace array = 1. */
-	if (tr->ref > 1 || (tr->current_trace && tr->current_trace->ref))
+	if (tr->ref > 1 || (tr->current_trace && tr->trace_ref))
 		return -EBUSY;
 
 	list_del(&tr->list);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 13db4000af3f..f21607f87189 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -356,6 +356,7 @@ struct trace_array {
 	struct trace_event_file *trace_marker_file;
 	cpumask_var_t		tracing_cpumask; /* only trace on set CPUs */
 	int			ref;
+	int			trace_ref;
 #ifdef CONFIG_FUNCTION_TRACER
 	struct ftrace_ops	*ops;
 	struct trace_pid_list	__rcu *function_pids;
@@ -547,7 +548,6 @@ struct tracer {
 	struct tracer		*next;
 	struct tracer_flags	*flags;
 	int			enabled;
-	int			ref;
 	bool			print_max;
 	bool			allow_instances;
 #ifdef CONFIG_TRACER_MAX_TRACE
-- 
cgit v1.2.3


From a389d86f7fd0902e4ce4136a5601988dbd371eb1 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Sun, 28 Jun 2020 22:52:25 -0400
Subject: ring-buffer: Have nested events still record running time stamp

Up until now, if an event is interrupted while it is recorded by an
interrupt, and that interrupt records events, the time of those events will
all be the same. This is because events only record the delta of the time
since the previous event (or beginning of a page), and to handle updating
the time keeping for that of nested events is extremely racy. After years of
thinking about this and several failed attempts, I finally have a solution
to solve this puzzle.

The problem is that you need to atomically calculate the delta and then
update the time stamp you made the delta from, as well as then record it
into the buffer, all this while at any time an interrupt can come in and
do the same thing. This is easy to solve with heavy weight atomics, but that
would be detrimental to the performance of the ring buffer. The current
state of affairs sacrificed the time deltas for nested events for
performance.

The reason for previous failed attempts at solving this puzzle was because I
was trying to completely avoid slow atomic operations like cmpxchg. I final
came to the conclusion to always avoid cmpxchg is not possible, which is why
those previous attempts always failed. But it is possible to pick one path
(the most common case) and avoid cmpxchg in that path, which is the "fast
path". The most common case is that an event will not be interrupted and
have other events added into it. An event can detect if it has
interrupted another event, and for these cases we can make it the slow
path and use the heavy operations like cmpxchg.

One more player was added to the game that made this possible, and that is
the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59
bit time stamp. (Of course this breaks if a machine is running for more than
18 years without a reboot!).

There's barrier() placements around for being paranoid, even when they
are not needed because of other atomic functions near by. But those
should not hurt, as if they are not needed, they basically become a nop.

Note, this also makes the race window much smaller, which means there
are less slow paths to slow down the performance.

The basic idea is that there's two main paths taken.

 1) Not being interrupted between time stamps and reserving buffer space.
    In this case, the time stamps taken are true to the location in the
    buffer.

 2) Was interrupted by another path between taking time stamps and reserving
    buffer space.

The objective is to know what the delta is from the last reserved location
in the buffer.

As it is possible to detect if an event is interrupting another event before
reserving data, space is added to the length to be reserved to inject a full
time stamp along with the event being reserved.

When an event is not interrupted, the write stamp is always the time of the
last event written to the buffer.

In path 1, there's two sub paths we care about:

 a) The event did not interrupt another event.
 b) The event interrupted another event.

In case a, as the write stamp was read and known to be correct, the delta
between the current time stamp and the write stamp is the delta between the
current event and the previously recorded event.

In case b, extra space was reserved to just put the full time stamp into the
buffer. Which is done, as stated, in this path the time stamp taken is known
to match the location in the buffer.

In path 2, there's also two sub paths we care about:

 a) The event was not interrupted by another event since it reserved space
    on the buffer and re-reading the write stamp.
 b) The event was interrupted by another event.

In case a, the write stamp is that of the last event that interrupted this
event between taking the time stamps and reserving. As no event came in
after re-reading the write stamp, that event is known to be the time of the
event directly before this event and the delta can be the new time stamp and
the write stamp.

In case b, one or more events came in between reserving the event and
re-reading he write stamp. Since this event's buffer reservation is between
other events at this path, there's no way to know what the delta is. But
because an event interrupted this event after it started, its fine to just
give a zero delta, and take the same time stamp as the events that happened
within the event being recorded.

Here's the implementation of the design of this solution:

 All this is per cpu, and only needs to worry about nested events (not
 parallel events).

The players:

 write_tail: The index in the buffer where new events can be written to.
     It is incremented via local_add() to reserve space for a new event.

 before_stamp: A time stamp set by all events before reserving space.

 write_stamp: A time stamp updated by events after it has successfully
     reserved space.

	/* Save the current position of write */
 [A]	w = local_read(write_tail);
	barrier();
	/* Read both before and write stamps before touching anything */
	before = local_read(before_stamp);
	after = local_read(write_stamp);
	barrier();

	/*
	 * If before and after are the same, then this event is not
	 * interrupting a time update. If it is, then reserve space for adding
	 * a full time stamp (this can turn into a time extend which is
	 * just an extended time delta but fill up the extra space).
	 */
	if (after != before)
		abs = true;

	ts = clock();

	/* Now update the before_stamp (everyone does this!) */
 [B]	local_set(before_stamp, ts);

	/* Now reserve space on the buffer */
 [C]	write = local_add_return(len, write_tail);

	/* Set tail to be were this event's data is */
	tail = write - len;

 	if (w == tail) {

		/* Nothing interrupted this between A and C */
 [D]		local_set(write_stamp, ts);
		barrier();
 [E]		save_before = local_read(before_stamp);

 		if (!abs) {
			/* This did not interrupt a time update */
			delta = ts - after;
		} else {
			delta = ts; /* The full time stamp will be in use */
		}
		if (ts != save_before) {
			/* slow path - Was interrupted between C and E */
			/* The update to write_stamp could have overwritten the update to
			 * it by the interrupting event, but before and after should be
			 * the same for all completed top events */
			after = local_read(write_stamp);
			if (save_before > after)
				local_cmpxchg(write_stamp, after, save_before);
		}
	} else {
		/* slow path - Interrupted between A and C */

		after = local_read(write_stamp);
		temp_ts = clock();
		barrier();
 [F]		if (write == local_read(write_tail) && after < temp_ts) {
			/* This was not interrupted since C and F
			 * The last write_stamp is still valid for the previous event
			 * in the buffer. */
			delta = temp_ts - after;
			/* OK to keep this new time stamp */
			ts = temp_ts;
		} else {
			/* Interrupted between C and F
			 * Well, there's no use to try to know what the time stamp
			 * is for the previous event. Just set delta to zero and
			 * be the same time as that event that interrupted us before
			 * the reservation of the buffer. */

			delta = 0;
		}
		/* No need to use full timestamps here */
		abs = 0;
	}

Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home
Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org
Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org

Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 276 +++++++++++++++++++++++++++++----------------
 1 file changed, 181 insertions(+), 95 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 00867ff82412..026238c55b0c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -27,6 +27,7 @@
 #include <linux/oom.h>
 
 #include <asm/local.h>
+#include <asm/local64.h>
 
 static void update_pages_handler(struct work_struct *work);
 
@@ -418,6 +419,17 @@ struct rb_event_info {
 	int			add_timestamp;
 };
 
+/*
+ * Used for the add_timestamp
+ *  NONE
+ *  NORMAL - may be for either time extend or absolute
+ *  FORCE - force a full time stamp.
+ */
+enum {
+	RB_ADD_STAMP_NONE,
+	RB_ADD_STAMP_NORMAL,
+	RB_ADD_STAMP_FORCE
+};
 /*
  * Used for which event context the event is in.
  *  NMI     = 0
@@ -470,7 +482,8 @@ struct ring_buffer_per_cpu {
 	size_t				shortest_full;
 	unsigned long			read;
 	unsigned long			read_bytes;
-	u64				write_stamp;
+	local64_t			write_stamp;
+	local64_t			before_stamp;
 	u64				read_stamp;
 	/* ring buffer pages to update, > 0 to add, < 0 to remove */
 	long				nr_pages_to_update;
@@ -2416,16 +2429,13 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
 	unsigned length = info->length;
 	u64 delta = info->delta;
 
-	/* Only a commit updates the timestamp */
-	if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
-		delta = 0;
-
 	/*
 	 * If we need to add a timestamp, then we
 	 * add it to the start of the reserved space.
 	 */
 	if (unlikely(info->add_timestamp)) {
-		bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
+		bool abs = info->add_timestamp == RB_ADD_STAMP_FORCE ||
+			ring_buffer_time_stamp_abs(cpu_buffer->buffer);
 
 		event = rb_add_time_stamp(event, abs ? info->delta : delta, abs);
 		length -= RB_LEN_TIME_EXTEND;
@@ -2480,6 +2490,39 @@ static inline bool sched_clock_stable(void)
 }
 #endif
 
+static __always_inline bool
+rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+		   struct ring_buffer_event *event)
+{
+	unsigned long addr = (unsigned long)event;
+	unsigned long index;
+
+	index = rb_event_index(event);
+	addr &= PAGE_MASK;
+
+	return cpu_buffer->commit_page->page == (void *)addr &&
+		rb_commit_index(cpu_buffer) == index;
+}
+
+static u64 rb_time_delta(struct ring_buffer_event *event)
+{
+	switch (event->type_len) {
+	case RINGBUF_TYPE_PADDING:
+		return 0;
+
+	case RINGBUF_TYPE_TIME_EXTEND:
+		return ring_buffer_event_time_stamp(event);
+
+	case RINGBUF_TYPE_TIME_STAMP:
+		return 0;
+
+	case RINGBUF_TYPE_DATA:
+		return event->time_delta;
+	default:
+		return 0;
+	}
+}
+
 static inline int
 rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
 		  struct ring_buffer_event *event)
@@ -2488,6 +2531,8 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
 	struct buffer_page *bpage;
 	unsigned long index;
 	unsigned long addr;
+	u64 write_stamp;
+	u64 delta;
 
 	new_index = rb_event_index(event);
 	old_index = new_index + rb_event_ts_length(event);
@@ -2496,10 +2541,32 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
 
 	bpage = READ_ONCE(cpu_buffer->tail_page);
 
+	delta = rb_time_delta(event);
+
+	write_stamp = local64_read(&cpu_buffer->write_stamp);
+
+	/* Make sure the write stamp is read before testing the location */
+	barrier();
+
 	if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
 		unsigned long write_mask =
 			local_read(&bpage->write) & ~RB_WRITE_MASK;
 		unsigned long event_length = rb_event_length(event);
+		u64 ret;
+
+		ret = local64_cmpxchg(&cpu_buffer->write_stamp, write_stamp, write_stamp - delta);
+		/* Something came in, can't discard */
+		if (ret != write_stamp)
+			return 0;
+
+		/*
+		 * If an event were to come in now, it would see that the
+		 * write_stamp and the before_stamp are different, and assume
+		 * that this event just added itself before updating
+		 * the write stamp. The interrupting event will fix the
+		 * write stamp for us, and use the before stamp as its delta.
+		 */
+
 		/*
 		 * This is on the tail page. It is possible that
 		 * a write could come in and move the tail page
@@ -2551,10 +2618,6 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
 		local_set(&cpu_buffer->commit_page->page->commit,
 			  rb_page_write(cpu_buffer->commit_page));
 		rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
-		/* Only update the write stamp if the page has an event */
-		if (rb_page_write(cpu_buffer->commit_page))
-			cpu_buffer->write_stamp =
-				cpu_buffer->commit_page->page->time_stamp;
 		/* add barrier to keep gcc from optimizing too much */
 		barrier();
 	}
@@ -2626,54 +2689,10 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
 		event->time_delta = 1;
 }
 
-static __always_inline bool
-rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
-		   struct ring_buffer_event *event)
-{
-	unsigned long addr = (unsigned long)event;
-	unsigned long index;
-
-	index = rb_event_index(event);
-	addr &= PAGE_MASK;
-
-	return cpu_buffer->commit_page->page == (void *)addr &&
-		rb_commit_index(cpu_buffer) == index;
-}
-
-static __always_inline void
-rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
-		      struct ring_buffer_event *event)
-{
-	u64 delta;
-
-	/*
-	 * The event first in the commit queue updates the
-	 * time stamp.
-	 */
-	if (rb_event_is_commit(cpu_buffer, event)) {
-		/*
-		 * A commit event that is first on a page
-		 * updates the write timestamp with the page stamp
-		 */
-		if (!rb_event_index(event))
-			cpu_buffer->write_stamp =
-				cpu_buffer->commit_page->page->time_stamp;
-		else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
-			delta = ring_buffer_event_time_stamp(event);
-			cpu_buffer->write_stamp += delta;
-		} else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
-			delta = ring_buffer_event_time_stamp(event);
-			cpu_buffer->write_stamp = delta;
-		} else
-			cpu_buffer->write_stamp += event->time_delta;
-	}
-}
-
 static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
 		      struct ring_buffer_event *event)
 {
 	local_inc(&cpu_buffer->entries);
-	rb_update_write_stamp(cpu_buffer, event);
 	rb_end_commit(cpu_buffer);
 }
 
@@ -2872,13 +2891,13 @@ rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
 		  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
 		  (unsigned long long)info->delta,
 		  (unsigned long long)info->ts,
-		  (unsigned long long)cpu_buffer->write_stamp,
+		  (unsigned long long)local64_read(&cpu_buffer->write_stamp),
 		  sched_clock_stable() ? "" :
 		  "If you just came from a suspend/resume,\n"
 		  "please switch to the trace global clock:\n"
 		  "  echo global > /sys/kernel/debug/tracing/trace_clock\n"
 		  "or add trace_clock=global to the kernel command line\n");
-	info->add_timestamp = 1;
+	info->add_timestamp = RB_ADD_STAMP_NORMAL;
 }
 
 static struct ring_buffer_event *
@@ -2887,8 +2906,36 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 {
 	struct ring_buffer_event *event;
 	struct buffer_page *tail_page;
-	unsigned long tail, write;
+	unsigned long tail, write, w;
+	u64 before, after;
+	bool abs = false;
+
+	/* Don't let the compiler play games with cpu_buffer->tail_page */
+	tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
+
+ /*A*/	w = local_read(&tail_page->write) & RB_WRITE_MASK;
+	barrier();
+	before = local64_read(&cpu_buffer->before_stamp);
+	after = local64_read(&cpu_buffer->write_stamp);
+	barrier();
+	info->ts = rb_time_stamp(cpu_buffer->buffer);
+
+	if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
+		info->delta = info->ts;
+		abs = true;
+	} else {
+		info->delta = info->ts - after;
+	}
+
+	if (unlikely(test_time_stamp(info->delta)))
+		rb_handle_timestamp(cpu_buffer, info);
 
+	/*
+	 * If interrupting an event time update, we may need an absolute timestamp.
+	 * Don't bother if this is the start of a new page (w == 0).
+	 */
+	if (unlikely(before != after && w))
+		info->add_timestamp = RB_ADD_STAMP_FORCE;
 	/*
 	 * If the time delta since the last event is too big to
 	 * hold in the time field of the event, then we append a
@@ -2897,25 +2944,88 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	if (unlikely(info->add_timestamp))
 		info->length += RB_LEN_TIME_EXTEND;
 
-	/* Don't let the compiler play games with cpu_buffer->tail_page */
-	tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
-	write = local_add_return(info->length, &tail_page->write);
+ /*B*/	local64_set(&cpu_buffer->before_stamp, info->ts);
+
+ /*C*/	write = local_add_return(info->length, &tail_page->write);
 
 	/* set write to only the index of the write */
 	write &= RB_WRITE_MASK;
+
 	tail = write - info->length;
 
+	/* See if we shot pass the end of this buffer page */
+	if (unlikely(write > BUF_PAGE_SIZE)) {
+		if (tail != w) {
+			/* before and after may now different, fix it up*/
+			before = local64_read(&cpu_buffer->before_stamp);
+			after = local64_read(&cpu_buffer->write_stamp);
+			if (before != after)
+				(void)local64_cmpxchg(&cpu_buffer->before_stamp, before, after);
+		}
+		return rb_move_tail(cpu_buffer, tail, info);
+	}
+
+	if (likely(tail == w)) {
+		u64 save_before;
+
+		/* Nothing interrupted us between A and C */
+ /*D*/		local64_set(&cpu_buffer->write_stamp, info->ts);
+		barrier();
+ /*E*/		save_before = local64_read(&cpu_buffer->before_stamp);
+		if (likely(info->add_timestamp != RB_ADD_STAMP_FORCE))
+			/* This did not interrupt any time update */
+			info->delta = info->ts - after;
+		else
+			/* Just use full timestamp for inerrupting event */
+			info->delta = info->ts;
+		barrier();
+		if (unlikely(info->ts != save_before)) {
+			/* SLOW PATH - Interrupted between C and E */
+
+			after = local64_read(&cpu_buffer->write_stamp);
+			/* Write stamp must only go forward */
+			if (save_before > after) {
+				/*
+				 * We do not care about the result, only that
+				 * it gets updated atomically.
+				 */
+				(void)local64_cmpxchg(&cpu_buffer->write_stamp, after, save_before);
+			}
+		}
+	} else {
+		u64 ts;
+		/* SLOW PATH - Interrupted between A and C */
+		after = local64_read(&cpu_buffer->write_stamp);
+		ts = rb_time_stamp(cpu_buffer->buffer);
+		barrier();
+ /*E*/		if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
+		    after < ts) {
+			/* Nothing came after this event between C and E */
+			info->delta = ts - after;
+			(void)local64_cmpxchg(&cpu_buffer->write_stamp, after, info->ts);
+			info->ts = ts;
+		} else {
+			/*
+			 * Interrupted beween C and E:
+			 * Lost the previous events time stamp. Just set the
+			 * delta to zero, and this will be the same time as
+			 * the event this event interrupted. And the events that
+			 * came after this will still be correct (as they would
+			 * have built their delta on the previous event.
+			 */
+			info->delta = 0;
+		}
+		if (info->add_timestamp == RB_ADD_STAMP_FORCE)
+			info->add_timestamp = RB_ADD_STAMP_NORMAL;
+	}
+
 	/*
 	 * If this is the first commit on the page, then it has the same
 	 * timestamp as the page itself.
 	 */
-	if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer))
+	if (unlikely(!tail && info->add_timestamp != RB_ADD_STAMP_FORCE && !abs))
 		info->delta = 0;
 
-	/* See if we shot pass the end of this buffer page */
-	if (unlikely(write > BUF_PAGE_SIZE))
-		return rb_move_tail(cpu_buffer, tail, info);
-
 	/* We reserved something on the buffer */
 
 	event = __rb_page_index(tail_page, tail);
@@ -2944,9 +3054,9 @@ rb_reserve_next_event(struct trace_buffer *buffer,
 	struct ring_buffer_event *event;
 	struct rb_event_info info;
 	int nr_loops = 0;
-	u64 diff;
 
 	rb_start_commit(cpu_buffer);
+	/* The commit page can not change after this */
 
 #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
 	/*
@@ -2965,7 +3075,7 @@ rb_reserve_next_event(struct trace_buffer *buffer,
 
 	info.length = rb_calculate_event_length(length);
  again:
-	info.add_timestamp = 0;
+	info.add_timestamp = RB_ADD_STAMP_NONE;
 	info.delta = 0;
 
 	/*
@@ -2980,22 +3090,6 @@ rb_reserve_next_event(struct trace_buffer *buffer,
 	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
 		goto out_fail;
 
-	info.ts = rb_time_stamp(cpu_buffer->buffer);
-	diff = info.ts - cpu_buffer->write_stamp;
-
-	/* make sure this diff is calculated here */
-	barrier();
-
-	if (ring_buffer_time_stamp_abs(buffer)) {
-		info.delta = info.ts;
-		rb_handle_timestamp(cpu_buffer, &info);
-	} else /* Did the write stamp get updated already? */
-		if (likely(info.ts >= cpu_buffer->write_stamp)) {
-		info.delta = diff;
-		if (unlikely(test_time_stamp(info.delta)))
-			rb_handle_timestamp(cpu_buffer, &info);
-	}
-
 	event = __rb_reserve_next(cpu_buffer, &info);
 
 	if (unlikely(PTR_ERR(event) == -EAGAIN)) {
@@ -3004,11 +3098,8 @@ rb_reserve_next_event(struct trace_buffer *buffer,
 		goto again;
 	}
 
-	if (!event)
-		goto out_fail;
-
-	return event;
-
+	if (likely(event))
+		return event;
  out_fail:
 	rb_end_commit(cpu_buffer);
 	return NULL;
@@ -3154,11 +3245,6 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer,
 	if (rb_try_to_discard(cpu_buffer, event))
 		goto out;
 
-	/*
-	 * The commit is still visible by the reader, so we
-	 * must still update the timestamp.
-	 */
-	rb_update_write_stamp(cpu_buffer, event);
  out:
 	rb_end_commit(cpu_buffer);
 
@@ -4475,8 +4561,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->read = 0;
 	cpu_buffer->read_bytes = 0;
 
-	cpu_buffer->write_stamp = 0;
-	cpu_buffer->read_stamp = 0;
+	local64_set(&cpu_buffer->write_stamp, 0);
+	local64_set(&cpu_buffer->before_stamp, 0);
 
 	cpu_buffer->lost_events = 0;
 	cpu_buffer->last_overrun = 0;
-- 
cgit v1.2.3


From 7c4b4a5164fbedc11c23e3671bd90ba0d23a5efd Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Sun, 28 Jun 2020 22:52:26 -0400
Subject: ring-buffer: Incorporate absolute timestamp into add_timestamp logic

Instead of calling out the absolute test for each time to check if the
ring buffer wants absolute time stamps for all its recording, incorporate it
with the add_timestamp field and turn it into flags for faster processing
between wanting a absolute tag and needing to force one.

Link: http://lkml.kernel.org/r/20200629025259.154892368@goodmis.org

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 40 ++++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 026238c55b0c..7ee6619951ea 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -422,13 +422,15 @@ struct rb_event_info {
 /*
  * Used for the add_timestamp
  *  NONE
- *  NORMAL - may be for either time extend or absolute
+ *  EXTEND - wants a time extend
+ *  ABSOLUTE - the buffer requests all events to have absolute time stamps
  *  FORCE - force a full time stamp.
  */
 enum {
-	RB_ADD_STAMP_NONE,
-	RB_ADD_STAMP_NORMAL,
-	RB_ADD_STAMP_FORCE
+	RB_ADD_STAMP_NONE		= 0,
+	RB_ADD_STAMP_EXTEND		= BIT(1),
+	RB_ADD_STAMP_ABSOLUTE		= BIT(2),
+	RB_ADD_STAMP_FORCE		= BIT(3)
 };
 /*
  * Used for which event context the event is in.
@@ -2434,8 +2436,8 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
 	 * add it to the start of the reserved space.
 	 */
 	if (unlikely(info->add_timestamp)) {
-		bool abs = info->add_timestamp == RB_ADD_STAMP_FORCE ||
-			ring_buffer_time_stamp_abs(cpu_buffer->buffer);
+		bool abs = info->add_timestamp &
+			(RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE);
 
 		event = rb_add_time_stamp(event, abs ? info->delta : delta, abs);
 		length -= RB_LEN_TIME_EXTEND;
@@ -2884,8 +2886,8 @@ int ring_buffer_unlock_commit(struct trace_buffer *buffer,
 EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
 
 static noinline void
-rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
-		    struct rb_event_info *info)
+rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+		   struct rb_event_info *info)
 {
 	WARN_ONCE(info->delta > (1ULL << 59),
 		  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
@@ -2897,7 +2899,6 @@ rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
 		  "please switch to the trace global clock:\n"
 		  "  echo global > /sys/kernel/debug/tracing/trace_clock\n"
 		  "or add trace_clock=global to the kernel command line\n");
-	info->add_timestamp = RB_ADD_STAMP_NORMAL;
 }
 
 static struct ring_buffer_event *
@@ -2908,7 +2909,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	struct buffer_page *tail_page;
 	unsigned long tail, write, w;
 	u64 before, after;
-	bool abs = false;
 
 	/* Don't let the compiler play games with cpu_buffer->tail_page */
 	tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
@@ -2922,20 +2922,23 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 
 	if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
 		info->delta = info->ts;
-		abs = true;
+		info->add_timestamp = RB_ADD_STAMP_ABSOLUTE;
 	} else {
 		info->delta = info->ts - after;
 	}
 
-	if (unlikely(test_time_stamp(info->delta)))
-		rb_handle_timestamp(cpu_buffer, info);
+	if (unlikely(test_time_stamp(info->delta))) {
+		rb_check_timestamp(cpu_buffer, info);
+		info->add_timestamp |= RB_ADD_STAMP_EXTEND;
+	}
 
 	/*
 	 * If interrupting an event time update, we may need an absolute timestamp.
 	 * Don't bother if this is the start of a new page (w == 0).
 	 */
 	if (unlikely(before != after && w))
-		info->add_timestamp = RB_ADD_STAMP_FORCE;
+		info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
+
 	/*
 	 * If the time delta since the last event is too big to
 	 * hold in the time field of the event, then we append a
@@ -2972,7 +2975,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
  /*D*/		local64_set(&cpu_buffer->write_stamp, info->ts);
 		barrier();
  /*E*/		save_before = local64_read(&cpu_buffer->before_stamp);
-		if (likely(info->add_timestamp != RB_ADD_STAMP_FORCE))
+		if (likely(!(info->add_timestamp &
+			     (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
 			/* This did not interrupt any time update */
 			info->delta = info->ts - after;
 		else
@@ -3015,15 +3019,15 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 			 */
 			info->delta = 0;
 		}
-		if (info->add_timestamp == RB_ADD_STAMP_FORCE)
-			info->add_timestamp = RB_ADD_STAMP_NORMAL;
+		info->add_timestamp &= ~RB_ADD_STAMP_FORCE;
 	}
 
 	/*
 	 * If this is the first commit on the page, then it has the same
 	 * timestamp as the page itself.
 	 */
-	if (unlikely(!tail && info->add_timestamp != RB_ADD_STAMP_FORCE && !abs))
+	if (unlikely(!tail && !(info->add_timestamp &
+				(RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
 		info->delta = 0;
 
 	/* We reserved something on the buffer */
-- 
cgit v1.2.3


From 10464b4aa605ef93c937452f442e74cc0a4a6ceb Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Sun, 28 Jun 2020 22:52:27 -0400
Subject: ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit

After a discussion with the new time algorithm to have nested events still
have proper time keeping but required using local64_t atomic operations.
Mathieu was concerned about the performance this would have on 32 bit
machines, as in most cases, atomic 64 bit operations on them can be
expensive.

As the ring buffer's timing needs do not require full features of local64_t,
a wrapper is made to implement a new rb_time_t operation that uses two longs
on 32 bit machines but still uses the local64_t operations on 64 bit
machines. There's a switch that can be made in the file to force 64 bit to
use the 32 bit version just for testing purposes.

All reads do not need to succeed if a read happened while the stamp being
read is in the process of being updated. The requirement is that all reads
must succed that were done by an interrupting event (where this event was
interrupted by another event that did the write). Or if the event itself did
the write first. That is: rb_time_set(t, x) followed by rb_time_read(t) will
always succeed (even if it gets interrupted by another event that writes to
t. The result of the read will be either the previous set, or a set
performed by an interrupting event.

If the read is done by an event that interrupted another event that was in
the process of setting the time stamp, and no other event came along to
write to that time stamp, it will fail and the rb_time_read() will return
that it failed (the value to read will be undefined).

A set will always write to the time stamp and return with a valid time
stamp, such that any read after it will be valid.

A cmpxchg may fail if it interrupted an event that was in the process of
updating the time stamp just like the reads do. Other than that, it will act
like a normal cmpxchg.

The way this works is that the rb_time_t is made of of three fields. A cnt,
that gets updated atomically everyting a modification is made. A top that
represents the most significant 30 bits of the time, and a bottom to
represent the least significant 30 bits of the time. Notice, that the time
values is only 60 bits long (where the ring buffer only uses 59 bits, which
gives us 18 years of nanoseconds!).

The top two bits of both the top and bottom is a 2 bit counter that gets set
by the value of the least two significant bits of the cnt. A read of the top
and the bottom where both the top and bottom have the same most significant
top 2 bits, are considered a match and a valid 60 bit number can be created
from it. If they do not match, then the number is considered invalid, and
this must only happen if an event interrupted another event in the midst of
updating the time stamp.

This is only used for 32 bits machines as 64 bit machines can get better
performance out of the local64_t. This has been tested heavily by forcing 64
bit to use this logic.

Link: https://lore.kernel.org/r/20200625225345.18cf5881@oasis.local.home
Link: http://lkml.kernel.org/r/20200629025259.309232719@goodmis.org

Inspired-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 270 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 243 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7ee6619951ea..802bb38d9c81 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -27,7 +27,6 @@
 #include <linux/oom.h>
 
 #include <asm/local.h>
-#include <asm/local64.h>
 
 static void update_pages_handler(struct work_struct *work);
 
@@ -449,6 +448,28 @@ enum {
 	RB_CTX_MAX
 };
 
+#if BITS_PER_LONG == 32
+#define RB_TIME_32
+#endif
+
+/* To test on 64 bit machines */
+//#define RB_TIME_32
+
+#ifdef RB_TIME_32
+
+struct rb_time_struct {
+	local_t		cnt;
+	local_t		top;
+	local_t		bottom;
+};
+#else
+#include <asm/local64.h>
+struct rb_time_struct {
+	local64_t	time;
+};
+#endif
+typedef struct rb_time_struct rb_time_t;
+
 /*
  * head_page == tail_page && head == tail then buffer is empty.
  */
@@ -484,8 +505,8 @@ struct ring_buffer_per_cpu {
 	size_t				shortest_full;
 	unsigned long			read;
 	unsigned long			read_bytes;
-	local64_t			write_stamp;
-	local64_t			before_stamp;
+	rb_time_t			write_stamp;
+	rb_time_t			before_stamp;
 	u64				read_stamp;
 	/* ring buffer pages to update, > 0 to add, < 0 to remove */
 	long				nr_pages_to_update;
@@ -528,6 +549,189 @@ struct ring_buffer_iter {
 	int				missed_events;
 };
 
+#ifdef RB_TIME_32
+
+/*
+ * On 32 bit machines, local64_t is very expensive. As the ring
+ * buffer doesn't need all the features of a true 64 bit atomic,
+ * on 32 bit, it uses these functions (64 still uses local64_t).
+ *
+ * For the ring buffer, 64 bit required operations for the time is
+ * the following:
+ *
+ *  - Only need 59 bits (uses 60 to make it even).
+ *  - Reads may fail if it interrupted a modification of the time stamp.
+ *      It will succeed if it did not interrupt another write even if
+ *      the read itself is interrupted by a write.
+ *      It returns whether it was successful or not.
+ *
+ *  - Writes always succeed and will overwrite other writes and writes
+ *      that were done by events interrupting the current write.
+ *
+ *  - A write followed by a read of the same time stamp will always succeed,
+ *      but may not contain the same value.
+ *
+ *  - A cmpxchg will fail if it interrupted another write or cmpxchg.
+ *      Other than that, it acts like a normal cmpxchg.
+ *
+ * The 60 bit time stamp is broken up by 30 bits in a top and bottom half
+ *  (bottom being the least significant 30 bits of the 60 bit time stamp).
+ *
+ * The two most significant bits of each half holds a 2 bit counter (0-3).
+ * Each update will increment this counter by one.
+ * When reading the top and bottom, if the two counter bits match then the
+ *  top and bottom together make a valid 60 bit number.
+ */
+#define RB_TIME_SHIFT	30
+#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
+
+static inline int rb_time_cnt(unsigned long val)
+{
+	return (val >> RB_TIME_SHIFT) & 3;
+}
+
+static inline u64 rb_time_val(unsigned long top, unsigned long bottom)
+{
+	u64 val;
+
+	val = top & RB_TIME_VAL_MASK;
+	val <<= RB_TIME_SHIFT;
+	val |= bottom & RB_TIME_VAL_MASK;
+
+	return val;
+}
+
+static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
+{
+	unsigned long top, bottom;
+	unsigned long c;
+
+	/*
+	 * If the read is interrupted by a write, then the cnt will
+	 * be different. Loop until both top and bottom have been read
+	 * without interruption.
+	 */
+	do {
+		c = local_read(&t->cnt);
+		top = local_read(&t->top);
+		bottom = local_read(&t->bottom);
+	} while (c != local_read(&t->cnt));
+
+	*cnt = rb_time_cnt(top);
+
+	/* If top and bottom counts don't match, this interrupted a write */
+	if (*cnt != rb_time_cnt(bottom))
+		return false;
+
+	*ret = rb_time_val(top, bottom);
+	return true;
+}
+
+static bool rb_time_read(rb_time_t *t, u64 *ret)
+{
+	unsigned long cnt;
+
+	return __rb_time_read(t, ret, &cnt);
+}
+
+static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt)
+{
+	return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
+}
+
+static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom)
+{
+	*top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK);
+	*bottom = (unsigned long)(val & RB_TIME_VAL_MASK);
+}
+
+static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt)
+{
+	val = rb_time_val_cnt(val, cnt);
+	local_set(t, val);
+}
+
+static void rb_time_set(rb_time_t *t, u64 val)
+{
+	unsigned long cnt, top, bottom;
+
+	rb_time_split(val, &top, &bottom);
+
+	/* Writes always succeed with a valid number even if it gets interrupted. */
+	do {
+		cnt = local_inc_return(&t->cnt);
+		rb_time_val_set(&t->top, top, cnt);
+		rb_time_val_set(&t->bottom, bottom, cnt);
+	} while (cnt != local_read(&t->cnt));
+}
+
+static inline bool
+rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
+{
+	unsigned long ret;
+
+	ret = local_cmpxchg(l, expect, set);
+	return ret == expect;
+}
+
+static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
+{
+	unsigned long cnt, top, bottom;
+	unsigned long cnt2, top2, bottom2;
+	u64 val;
+
+	/* The cmpxchg always fails if it interrupted an update */
+	 if (!__rb_time_read(t, &val, &cnt2))
+		 return false;
+
+	 if (val != expect)
+		 return false;
+
+	 cnt = local_read(&t->cnt);
+	 if ((cnt & 3) != cnt2)
+		 return false;
+
+	 cnt2 = cnt + 1;
+
+	 rb_time_split(val, &top, &bottom);
+	 top = rb_time_val_cnt(top, cnt);
+	 bottom = rb_time_val_cnt(bottom, cnt);
+
+	 rb_time_split(set, &top2, &bottom2);
+	 top2 = rb_time_val_cnt(top2, cnt2);
+	 bottom2 = rb_time_val_cnt(bottom2, cnt2);
+
+	if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2))
+		return false;
+	if (!rb_time_read_cmpxchg(&t->top, top, top2))
+		return false;
+	if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2))
+		return false;
+	return true;
+}
+
+#else /* 64 bits */
+
+/* local64_t always succeeds */
+
+static inline bool rb_time_read(rb_time_t *t, u64 *ret)
+{
+	*ret = local64_read(&t->time);
+	return true;
+}
+static void rb_time_set(rb_time_t *t, u64 val)
+{
+	local64_set(&t->time, val);
+}
+
+static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
+{
+	u64 val;
+	val = local64_cmpxchg(&t->time, expect, set);
+	return val == expect;
+}
+#endif
+
 /**
  * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer
  * @buffer: The ring_buffer to get the number of pages from
@@ -2545,7 +2749,8 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
 
 	delta = rb_time_delta(event);
 
-	write_stamp = local64_read(&cpu_buffer->write_stamp);
+	if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp))
+		return 0;
 
 	/* Make sure the write stamp is read before testing the location */
 	barrier();
@@ -2554,11 +2759,10 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
 		unsigned long write_mask =
 			local_read(&bpage->write) & ~RB_WRITE_MASK;
 		unsigned long event_length = rb_event_length(event);
-		u64 ret;
 
-		ret = local64_cmpxchg(&cpu_buffer->write_stamp, write_stamp, write_stamp - delta);
 		/* Something came in, can't discard */
-		if (ret != write_stamp)
+		if (!rb_time_cmpxchg(&cpu_buffer->write_stamp,
+				       write_stamp, write_stamp - delta))
 			return 0;
 
 		/*
@@ -2889,11 +3093,13 @@ static noinline void
 rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
 		   struct rb_event_info *info)
 {
+	u64 write_stamp;
+
 	WARN_ONCE(info->delta > (1ULL << 59),
 		  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
 		  (unsigned long long)info->delta,
 		  (unsigned long long)info->ts,
-		  (unsigned long long)local64_read(&cpu_buffer->write_stamp),
+		  (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0),
 		  sched_clock_stable() ? "" :
 		  "If you just came from a suspend/resume,\n"
 		  "please switch to the trace global clock:\n"
@@ -2909,14 +3115,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	struct buffer_page *tail_page;
 	unsigned long tail, write, w;
 	u64 before, after;
+	bool a_ok;
+	bool b_ok;
 
 	/* Don't let the compiler play games with cpu_buffer->tail_page */
 	tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
 
  /*A*/	w = local_read(&tail_page->write) & RB_WRITE_MASK;
 	barrier();
-	before = local64_read(&cpu_buffer->before_stamp);
-	after = local64_read(&cpu_buffer->write_stamp);
+	b_ok = rb_time_read(&cpu_buffer->before_stamp, &before);
+	a_ok = rb_time_read(&cpu_buffer->write_stamp, &after);
 	barrier();
 	info->ts = rb_time_stamp(cpu_buffer->buffer);
 
@@ -2927,16 +3135,18 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		info->delta = info->ts - after;
 	}
 
-	if (unlikely(test_time_stamp(info->delta))) {
-		rb_check_timestamp(cpu_buffer, info);
-		info->add_timestamp |= RB_ADD_STAMP_EXTEND;
+	if (likely(a_ok && b_ok)) {
+		if (unlikely(test_time_stamp(info->delta))) {
+			rb_check_timestamp(cpu_buffer, info);
+			info->add_timestamp |= RB_ADD_STAMP_EXTEND;
+		}
 	}
 
 	/*
 	 * If interrupting an event time update, we may need an absolute timestamp.
 	 * Don't bother if this is the start of a new page (w == 0).
 	 */
-	if (unlikely(before != after && w))
+	if (unlikely(!a_ok || !b_ok || (before != after && w)))
 		info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
 
 	/*
@@ -2947,7 +3157,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	if (unlikely(info->add_timestamp))
 		info->length += RB_LEN_TIME_EXTEND;
 
- /*B*/	local64_set(&cpu_buffer->before_stamp, info->ts);
+ /*B*/	rb_time_set(&cpu_buffer->before_stamp, info->ts);
 
  /*C*/	write = local_add_return(info->length, &tail_page->write);
 
@@ -2960,21 +3170,23 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	if (unlikely(write > BUF_PAGE_SIZE)) {
 		if (tail != w) {
 			/* before and after may now different, fix it up*/
-			before = local64_read(&cpu_buffer->before_stamp);
-			after = local64_read(&cpu_buffer->write_stamp);
-			if (before != after)
-				(void)local64_cmpxchg(&cpu_buffer->before_stamp, before, after);
+			b_ok = rb_time_read(&cpu_buffer->before_stamp, &before);
+			a_ok = rb_time_read(&cpu_buffer->write_stamp, &after);
+			if (a_ok && b_ok && before != after)
+				(void)rb_time_cmpxchg(&cpu_buffer->before_stamp, before, after);
 		}
 		return rb_move_tail(cpu_buffer, tail, info);
 	}
 
 	if (likely(tail == w)) {
 		u64 save_before;
+		bool s_ok;
 
 		/* Nothing interrupted us between A and C */
- /*D*/		local64_set(&cpu_buffer->write_stamp, info->ts);
+ /*D*/		rb_time_set(&cpu_buffer->write_stamp, info->ts);
 		barrier();
- /*E*/		save_before = local64_read(&cpu_buffer->before_stamp);
+ /*E*/		s_ok = rb_time_read(&cpu_buffer->before_stamp, &save_before);
+		RB_WARN_ON(cpu_buffer, !s_ok);
 		if (likely(!(info->add_timestamp &
 			     (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
 			/* This did not interrupt any time update */
@@ -2986,27 +3198,31 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		if (unlikely(info->ts != save_before)) {
 			/* SLOW PATH - Interrupted between C and E */
 
-			after = local64_read(&cpu_buffer->write_stamp);
+			a_ok = rb_time_read(&cpu_buffer->write_stamp, &after);
+			RB_WARN_ON(cpu_buffer, !a_ok);
+
 			/* Write stamp must only go forward */
 			if (save_before > after) {
 				/*
 				 * We do not care about the result, only that
 				 * it gets updated atomically.
 				 */
-				(void)local64_cmpxchg(&cpu_buffer->write_stamp, after, save_before);
+				(void)rb_time_cmpxchg(&cpu_buffer->write_stamp, after, save_before);
 			}
 		}
 	} else {
 		u64 ts;
 		/* SLOW PATH - Interrupted between A and C */
-		after = local64_read(&cpu_buffer->write_stamp);
+		a_ok = rb_time_read(&cpu_buffer->write_stamp, &after);
+		/* Was interrupted before here, write_stamp must be valid */
+		RB_WARN_ON(cpu_buffer, !a_ok);
 		ts = rb_time_stamp(cpu_buffer->buffer);
 		barrier();
  /*E*/		if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
 		    after < ts) {
 			/* Nothing came after this event between C and E */
 			info->delta = ts - after;
-			(void)local64_cmpxchg(&cpu_buffer->write_stamp, after, info->ts);
+			(void)rb_time_cmpxchg(&cpu_buffer->write_stamp, after, info->ts);
 			info->ts = ts;
 		} else {
 			/*
@@ -4565,8 +4781,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->read = 0;
 	cpu_buffer->read_bytes = 0;
 
-	local64_set(&cpu_buffer->write_stamp, 0);
-	local64_set(&cpu_buffer->before_stamp, 0);
+	rb_time_set(&cpu_buffer->write_stamp, 0);
+	rb_time_set(&cpu_buffer->before_stamp, 0);
 
 	cpu_buffer->lost_events = 0;
 	cpu_buffer->last_overrun = 0;
-- 
cgit v1.2.3


From b23d7a5f4a07af02343cdd28fe1f7488bac3afda Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Thu, 25 Jun 2020 15:34:03 +1000
Subject: ring-buffer: speed up buffer resets by avoiding synchronize_rcu for
 each CPU

On a 144 thread system, `perf ftrace` takes about 20 seconds to start
up, due to calling synchronize_rcu() for each CPU.

  cat /proc/108560/stack
    0xc0003e7eb336f470
    __switch_to+0x2e0/0x480
    __wait_rcu_gp+0x20c/0x220
    synchronize_rcu+0x9c/0xc0
    ring_buffer_reset_cpu+0x88/0x2e0
    tracing_reset_online_cpus+0x84/0xe0
    tracing_open+0x1d4/0x1f0

On a system with 10x more threads, it starts to become an annoyance.

Batch these up so we disable all the per-cpu buffers first, then
synchronize_rcu() once, then reset each of the buffers. This brings
the time down to about 0.5s.

Link: https://lkml.kernel.org/r/20200625053403.2386972-1-npiggin@gmail.com

Tested-by: Anton Blanchard <anton@ozlabs.org>
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h |  1 +
 kernel/trace/ring_buffer.c  | 85 +++++++++++++++++++++++++++++++++++++--------
 kernel/trace/trace.c        |  4 +--
 3 files changed, 73 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index c76b2f3b3ac4..136ea0997e6d 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -143,6 +143,7 @@ bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter);
 unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu);
 
 void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu);
+void ring_buffer_reset_online_cpus(struct trace_buffer *buffer);
 void ring_buffer_reset(struct trace_buffer *buffer);
 
 #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 802bb38d9c81..ed1941304f69 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -270,6 +270,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
 #define for_each_buffer_cpu(buffer, cpu)		\
 	for_each_cpu(cpu, buffer->cpumask)
 
+#define for_each_online_buffer_cpu(buffer, cpu)		\
+	for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask)
+
 #define TS_SHIFT	27
 #define TS_MASK		((1ULL << TS_SHIFT) - 1)
 #define TS_DELTA_TEST	(~TS_MASK)
@@ -4790,6 +4793,26 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 	rb_head_page_activate(cpu_buffer);
 }
 
+/* Must have disabled the cpu buffer then done a synchronize_rcu */
+static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+	if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
+		goto out;
+
+	arch_spin_lock(&cpu_buffer->lock);
+
+	rb_reset_cpu(cpu_buffer);
+
+	arch_spin_unlock(&cpu_buffer->lock);
+
+ out:
+	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+}
+
 /**
  * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
  * @buffer: The ring buffer to reset a per cpu buffer of
@@ -4798,7 +4821,6 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
-	unsigned long flags;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return;
@@ -4809,24 +4831,42 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
 	/* Make sure all commits have finished */
 	synchronize_rcu();
 
-	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	reset_disabled_cpu_buffer(cpu_buffer);
 
-	if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
-		goto out;
+	atomic_dec(&cpu_buffer->record_disabled);
+	atomic_dec(&cpu_buffer->resize_disabled);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
 
-	arch_spin_lock(&cpu_buffer->lock);
+/**
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * @buffer: The ring buffer to reset a per cpu buffer of
+ * @cpu: The CPU buffer to be reset
+ */
+void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	int cpu;
 
-	rb_reset_cpu(cpu_buffer);
+	for_each_online_buffer_cpu(buffer, cpu) {
+		cpu_buffer = buffer->buffers[cpu];
 
-	arch_spin_unlock(&cpu_buffer->lock);
+		atomic_inc(&cpu_buffer->resize_disabled);
+		atomic_inc(&cpu_buffer->record_disabled);
+	}
 
- out:
-	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+	/* Make sure all commits have finished */
+	synchronize_rcu();
 
-	atomic_dec(&cpu_buffer->record_disabled);
-	atomic_dec(&cpu_buffer->resize_disabled);
+	for_each_online_buffer_cpu(buffer, cpu) {
+		cpu_buffer = buffer->buffers[cpu];
+
+		reset_disabled_cpu_buffer(cpu_buffer);
+
+		atomic_dec(&cpu_buffer->record_disabled);
+		atomic_dec(&cpu_buffer->resize_disabled);
+	}
 }
-EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
 
 /**
  * ring_buffer_reset - reset a ring buffer
@@ -4834,10 +4874,27 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
  */
 void ring_buffer_reset(struct trace_buffer *buffer)
 {
+	struct ring_buffer_per_cpu *cpu_buffer;
 	int cpu;
 
-	for_each_buffer_cpu(buffer, cpu)
-		ring_buffer_reset_cpu(buffer, cpu);
+	for_each_buffer_cpu(buffer, cpu) {
+		cpu_buffer = buffer->buffers[cpu];
+
+		atomic_inc(&cpu_buffer->resize_disabled);
+		atomic_inc(&cpu_buffer->record_disabled);
+	}
+
+	/* Make sure all commits have finished */
+	synchronize_rcu();
+
+	for_each_buffer_cpu(buffer, cpu) {
+		cpu_buffer = buffer->buffers[cpu];
+
+		reset_disabled_cpu_buffer(cpu_buffer);
+
+		atomic_dec(&cpu_buffer->record_disabled);
+		atomic_dec(&cpu_buffer->resize_disabled);
+	}
 }
 EXPORT_SYMBOL_GPL(ring_buffer_reset);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 64c5b8146cca..4aab712f9567 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2003,7 +2003,6 @@ static void tracing_reset_cpu(struct array_buffer *buf, int cpu)
 void tracing_reset_online_cpus(struct array_buffer *buf)
 {
 	struct trace_buffer *buffer = buf->buffer;
-	int cpu;
 
 	if (!buffer)
 		return;
@@ -2015,8 +2014,7 @@ void tracing_reset_online_cpus(struct array_buffer *buf)
 
 	buf->time_start = buffer_ftrace_now(buf, buf->cpu);
 
-	for_each_online_cpu(cpu)
-		ring_buffer_reset_cpu(buffer, cpu);
+	ring_buffer_reset_online_cpus(buffer);
 
 	ring_buffer_record_enable(buffer);
 }
-- 
cgit v1.2.3


From 75b21c6dfa2d816bcabac24a530d9cffc12092b8 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 30 Jun 2020 09:04:35 -0400
Subject: ring-buffer: Mark the !tail (crossing a page) as unlikely

It is the uncommon case where an event crosses a sub buffer boundary (page)
mark that check at the end of reserving an event as unlikely.

Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ed1941304f69..888bc9177937 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3260,7 +3260,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	 * If this is the first commit on the page, then update
 	 * its timestamp.
 	 */
-	if (!tail)
+	if (unlikely(!tail))
 		tail_page->page->time_stamp = info->ts;
 
 	/* account for these added bytes */
-- 
cgit v1.2.3


From fb37409a01b011a664347702f44dbf13fa7c7486 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Wed, 10 Jun 2020 09:45:20 +0300
Subject: arch: remove unicore32 port

The unicore32 port do not seem maintained for a long time now, there is no
upstream toolchain that can create unicore32 binaries and all the links to
prebuilt toolchains for unicore32 are dead. Even compilers that were
available are not supported by the kernel anymore.

Guenter Roeck says:

  I have stopped building unicore32 images since v4.19 since there is no
  available compiler that is still supported by the kernel. I am surprised
  that support for it has not been removed from the kernel.

Remove unicore32 port.

Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Guenter Roeck <linux@roeck-us.net>
---
 .../features/core/cBPF-JIT/arch-support.txt        |   1 -
 .../features/core/eBPF-JIT/arch-support.txt        |   1 -
 .../core/generic-idle-thread/arch-support.txt      |   1 -
 .../features/core/jump-labels/arch-support.txt     |   1 -
 .../features/core/tracehook/arch-support.txt       |   1 -
 .../features/debug/KASAN/arch-support.txt          |   1 -
 .../debug/debug-vm-pgtable/arch-support.txt        |   1 -
 .../debug/gcov-profile-all/arch-support.txt        |   1 -
 Documentation/features/debug/kgdb/arch-support.txt |   1 -
 .../debug/kprobes-on-ftrace/arch-support.txt       |   1 -
 .../features/debug/kprobes/arch-support.txt        |   1 -
 .../features/debug/kretprobes/arch-support.txt     |   1 -
 .../features/debug/optprobes/arch-support.txt      |   1 -
 .../features/debug/stackprotector/arch-support.txt |   1 -
 .../features/debug/uprobes/arch-support.txt        |   1 -
 .../debug/user-ret-profiler/arch-support.txt       |   1 -
 .../features/io/dma-contiguous/arch-support.txt    |   1 -
 .../locking/cmpxchg-local/arch-support.txt         |   1 -
 .../features/locking/lockdep/arch-support.txt      |   1 -
 .../locking/queued-rwlocks/arch-support.txt        |   1 -
 .../locking/queued-spinlocks/arch-support.txt      |   1 -
 .../features/perf/kprobes-event/arch-support.txt   |   1 -
 .../features/perf/perf-regs/arch-support.txt       |   1 -
 .../features/perf/perf-stackdump/arch-support.txt  |   1 -
 .../sched/membarrier-sync-core/arch-support.txt    |   1 -
 .../features/sched/numa-balancing/arch-support.txt |   1 -
 .../seccomp/seccomp-filter/arch-support.txt        |   1 -
 .../time/arch-tick-broadcast/arch-support.txt      |   1 -
 .../features/time/clockevents/arch-support.txt     |   1 -
 .../time/context-tracking/arch-support.txt         |   1 -
 .../features/time/irq-time-acct/arch-support.txt   |   1 -
 .../time/modern-timekeeping/arch-support.txt       |   1 -
 .../features/time/virt-cpuacct/arch-support.txt    |   1 -
 .../features/vm/ELF-ASLR/arch-support.txt          |   1 -
 .../features/vm/PG_uncached/arch-support.txt       |   1 -
 Documentation/features/vm/THP/arch-support.txt     |   1 -
 Documentation/features/vm/TLB/arch-support.txt     |   1 -
 .../features/vm/huge-vmap/arch-support.txt         |   1 -
 .../features/vm/ioremap_prot/arch-support.txt      |   1 -
 .../features/vm/pte_special/arch-support.txt       |   1 -
 MAINTAINERS                                        |   7 -
 arch/unicore32/.gitignore                          |  22 -
 arch/unicore32/Kconfig                             | 200 -----
 arch/unicore32/Kconfig.debug                       |  29 -
 arch/unicore32/Makefile                            |  59 --
 arch/unicore32/boot/Makefile                       |  39 -
 arch/unicore32/boot/compressed/Makefile            |  64 --
 arch/unicore32/boot/compressed/head.S              | 201 ------
 arch/unicore32/boot/compressed/misc.c              | 123 ----
 arch/unicore32/boot/compressed/piggy.S.in          |   6 -
 arch/unicore32/boot/compressed/vmlinux.lds.S       |  58 --
 arch/unicore32/configs/defconfig                   | 214 ------
 arch/unicore32/include/asm/Kbuild                  |   7 -
 arch/unicore32/include/asm/assembler.h             | 128 ----
 arch/unicore32/include/asm/barrier.h               |  16 -
 arch/unicore32/include/asm/bitops.h                |  46 --
 arch/unicore32/include/asm/bug.h                   |  20 -
 arch/unicore32/include/asm/cache.h                 |  24 -
 arch/unicore32/include/asm/cacheflush.h            | 186 -----
 arch/unicore32/include/asm/checksum.h              |  38 -
 arch/unicore32/include/asm/cmpxchg.h               |  58 --
 arch/unicore32/include/asm/cpu-single.h            |  42 --
 arch/unicore32/include/asm/cputype.h               |  30 -
 arch/unicore32/include/asm/delay.h                 |  49 --
 arch/unicore32/include/asm/dma.h                   |  20 -
 arch/unicore32/include/asm/elf.h                   |  90 ---
 arch/unicore32/include/asm/fpstate.h               |  23 -
 arch/unicore32/include/asm/fpu-ucf64.h             |  50 --
 arch/unicore32/include/asm/gpio.h                  | 101 ---
 arch/unicore32/include/asm/hwcap.h                 |  29 -
 arch/unicore32/include/asm/hwdef-copro.h           |  45 --
 arch/unicore32/include/asm/io.h                    |  69 --
 arch/unicore32/include/asm/irq.h                   | 102 ---
 arch/unicore32/include/asm/irqflags.h              |  50 --
 arch/unicore32/include/asm/linkage.h               |  19 -
 arch/unicore32/include/asm/memblock.h              |  43 --
 arch/unicore32/include/asm/memory.h                | 102 ---
 arch/unicore32/include/asm/mmu.h                   |  14 -
 arch/unicore32/include/asm/mmu_context.h           |  98 ---
 arch/unicore32/include/asm/page.h                  |  74 --
 arch/unicore32/include/asm/pci.h                   |  20 -
 arch/unicore32/include/asm/pgalloc.h               |  87 ---
 arch/unicore32/include/asm/pgtable-hwdef.h         |  51 --
 arch/unicore32/include/asm/pgtable.h               | 267 -------
 arch/unicore32/include/asm/processor.h             |  74 --
 arch/unicore32/include/asm/ptrace.h                |  58 --
 arch/unicore32/include/asm/stacktrace.h            |  28 -
 arch/unicore32/include/asm/string.h                |  35 -
 arch/unicore32/include/asm/suspend.h               |  26 -
 arch/unicore32/include/asm/switch_to.h             |  27 -
 arch/unicore32/include/asm/syscall.h               |  12 -
 arch/unicore32/include/asm/thread_info.h           | 133 ----
 arch/unicore32/include/asm/timex.h                 |  31 -
 arch/unicore32/include/asm/tlb.h                   |  24 -
 arch/unicore32/include/asm/tlbflush.h              | 192 -----
 arch/unicore32/include/asm/traps.h                 |  18 -
 arch/unicore32/include/asm/uaccess.h               |  38 -
 arch/unicore32/include/asm/vmalloc.h               |   4 -
 arch/unicore32/include/mach/PKUnity.h              |  95 ---
 arch/unicore32/include/mach/bitfield.h             |  21 -
 arch/unicore32/include/mach/dma.h                  |  45 --
 arch/unicore32/include/mach/hardware.h             |  30 -
 arch/unicore32/include/mach/map.h                  |  17 -
 arch/unicore32/include/mach/memory.h               |  54 --
 arch/unicore32/include/mach/ocd.h                  |  33 -
 arch/unicore32/include/mach/pm.h                   |  37 -
 arch/unicore32/include/mach/regs-ac97.h            |  33 -
 arch/unicore32/include/mach/regs-dmac.h            |  82 ---
 arch/unicore32/include/mach/regs-gpio.h            |  71 --
 arch/unicore32/include/mach/regs-i2c.h             |  64 --
 arch/unicore32/include/mach/regs-intc.h            |  29 -
 arch/unicore32/include/mach/regs-nand.h            |  80 --
 arch/unicore32/include/mach/regs-ost.h             |  91 ---
 arch/unicore32/include/mach/regs-pci.h             |  95 ---
 arch/unicore32/include/mach/regs-pm.h              | 127 ----
 arch/unicore32/include/mach/regs-ps2.h             |  21 -
 arch/unicore32/include/mach/regs-resetc.h          |  35 -
 arch/unicore32/include/mach/regs-rtc.h             |  38 -
 arch/unicore32/include/mach/regs-sdc.h             | 157 ----
 arch/unicore32/include/mach/regs-spi.h             |  99 ---
 arch/unicore32/include/mach/regs-uart.h            |   3 -
 arch/unicore32/include/mach/regs-umal.h            | 230 ------
 arch/unicore32/include/mach/regs-unigfx.h          | 201 ------
 arch/unicore32/include/mach/uncompress.h           |  31 -
 arch/unicore32/include/uapi/asm/Kbuild             |   2 -
 arch/unicore32/include/uapi/asm/byteorder.h        |  25 -
 arch/unicore32/include/uapi/asm/ptrace.h           |  91 ---
 arch/unicore32/include/uapi/asm/sigcontext.h       |  30 -
 arch/unicore32/include/uapi/asm/unistd.h           |  21 -
 arch/unicore32/kernel/Makefile                     |  31 -
 arch/unicore32/kernel/asm-offsets.c                | 108 ---
 arch/unicore32/kernel/clock.c                      | 387 ----------
 arch/unicore32/kernel/debug-macro.S                |  86 ---
 arch/unicore32/kernel/debug.S                      |  82 ---
 arch/unicore32/kernel/dma.c                        | 179 -----
 arch/unicore32/kernel/early_printk.c               |  46 --
 arch/unicore32/kernel/elf.c                        |  35 -
 arch/unicore32/kernel/entry.S                      | 802 ---------------------
 arch/unicore32/kernel/fpu-ucf64.c                  | 117 ---
 arch/unicore32/kernel/gpio.c                       | 121 ----
 arch/unicore32/kernel/head.S                       | 249 -------
 arch/unicore32/kernel/hibernate.c                  | 159 ----
 arch/unicore32/kernel/hibernate_asm.S              | 114 ---
 arch/unicore32/kernel/irq.c                        | 371 ----------
 arch/unicore32/kernel/ksyms.c                      |  57 --
 arch/unicore32/kernel/ksyms.h                      |  14 -
 arch/unicore32/kernel/module.c                     | 105 ---
 arch/unicore32/kernel/pci.c                        | 371 ----------
 arch/unicore32/kernel/pm.c                         | 121 ----
 arch/unicore32/kernel/process.c                    | 319 --------
 arch/unicore32/kernel/ptrace.c                     | 147 ----
 arch/unicore32/kernel/puv3-core.c                  | 276 -------
 arch/unicore32/kernel/puv3-nb0916.c                | 147 ----
 arch/unicore32/kernel/setup.c                      | 352 ---------
 arch/unicore32/kernel/setup.h                      |  36 -
 arch/unicore32/kernel/signal.c                     | 424 -----------
 arch/unicore32/kernel/sleep.S                      | 199 -----
 arch/unicore32/kernel/stacktrace.c                 | 127 ----
 arch/unicore32/kernel/sys.c                        |  37 -
 arch/unicore32/kernel/time.c                       | 128 ----
 arch/unicore32/kernel/traps.c                      | 322 ---------
 arch/unicore32/kernel/vmlinux.lds.S                |  59 --
 arch/unicore32/lib/Makefile                        |  28 -
 arch/unicore32/lib/backtrace.S                     | 168 -----
 arch/unicore32/lib/clear_user.S                    |  54 --
 arch/unicore32/lib/copy_from_user.S                | 101 ---
 arch/unicore32/lib/copy_page.S                     |  36 -
 arch/unicore32/lib/copy_template.S                 | 211 ------
 arch/unicore32/lib/copy_to_user.S                  |  93 ---
 arch/unicore32/lib/delay.S                         |  48 --
 arch/unicore32/lib/findbit.S                       |  97 ---
 arch/unicore32/lib/strncpy_from_user.S             |  42 --
 arch/unicore32/lib/strnlen_user.S                  |  39 -
 arch/unicore32/mm/Kconfig                          |  41 --
 arch/unicore32/mm/Makefile                         |  14 -
 arch/unicore32/mm/alignment.c                      | 524 --------------
 arch/unicore32/mm/cache-ucv2.S                     | 209 ------
 arch/unicore32/mm/extable.c                        |  21 -
 arch/unicore32/mm/fault.c                          | 481 ------------
 arch/unicore32/mm/flush.c                          |  94 ---
 arch/unicore32/mm/init.c                           | 261 -------
 arch/unicore32/mm/ioremap.c                        | 242 -------
 arch/unicore32/mm/mm.h                             |  31 -
 arch/unicore32/mm/mmu.c                            | 513 -------------
 arch/unicore32/mm/pgd.c                            | 102 ---
 arch/unicore32/mm/proc-macros.S                    | 142 ----
 arch/unicore32/mm/proc-syms.c                      |  19 -
 arch/unicore32/mm/proc-ucv2.S                      | 131 ----
 arch/unicore32/mm/tlb-ucv2.S                       |  86 ---
 kernel/reboot.c                                    |   2 +-
 190 files changed, 1 insertion(+), 15705 deletions(-)
 delete mode 100644 arch/unicore32/.gitignore
 delete mode 100644 arch/unicore32/Kconfig
 delete mode 100644 arch/unicore32/Kconfig.debug
 delete mode 100644 arch/unicore32/Makefile
 delete mode 100644 arch/unicore32/boot/Makefile
 delete mode 100644 arch/unicore32/boot/compressed/Makefile
 delete mode 100644 arch/unicore32/boot/compressed/head.S
 delete mode 100644 arch/unicore32/boot/compressed/misc.c
 delete mode 100644 arch/unicore32/boot/compressed/piggy.S.in
 delete mode 100644 arch/unicore32/boot/compressed/vmlinux.lds.S
 delete mode 100644 arch/unicore32/configs/defconfig
 delete mode 100644 arch/unicore32/include/asm/Kbuild
 delete mode 100644 arch/unicore32/include/asm/assembler.h
 delete mode 100644 arch/unicore32/include/asm/barrier.h
 delete mode 100644 arch/unicore32/include/asm/bitops.h
 delete mode 100644 arch/unicore32/include/asm/bug.h
 delete mode 100644 arch/unicore32/include/asm/cache.h
 delete mode 100644 arch/unicore32/include/asm/cacheflush.h
 delete mode 100644 arch/unicore32/include/asm/checksum.h
 delete mode 100644 arch/unicore32/include/asm/cmpxchg.h
 delete mode 100644 arch/unicore32/include/asm/cpu-single.h
 delete mode 100644 arch/unicore32/include/asm/cputype.h
 delete mode 100644 arch/unicore32/include/asm/delay.h
 delete mode 100644 arch/unicore32/include/asm/dma.h
 delete mode 100644 arch/unicore32/include/asm/elf.h
 delete mode 100644 arch/unicore32/include/asm/fpstate.h
 delete mode 100644 arch/unicore32/include/asm/fpu-ucf64.h
 delete mode 100644 arch/unicore32/include/asm/gpio.h
 delete mode 100644 arch/unicore32/include/asm/hwcap.h
 delete mode 100644 arch/unicore32/include/asm/hwdef-copro.h
 delete mode 100644 arch/unicore32/include/asm/io.h
 delete mode 100644 arch/unicore32/include/asm/irq.h
 delete mode 100644 arch/unicore32/include/asm/irqflags.h
 delete mode 100644 arch/unicore32/include/asm/linkage.h
 delete mode 100644 arch/unicore32/include/asm/memblock.h
 delete mode 100644 arch/unicore32/include/asm/memory.h
 delete mode 100644 arch/unicore32/include/asm/mmu.h
 delete mode 100644 arch/unicore32/include/asm/mmu_context.h
 delete mode 100644 arch/unicore32/include/asm/page.h
 delete mode 100644 arch/unicore32/include/asm/pci.h
 delete mode 100644 arch/unicore32/include/asm/pgalloc.h
 delete mode 100644 arch/unicore32/include/asm/pgtable-hwdef.h
 delete mode 100644 arch/unicore32/include/asm/pgtable.h
 delete mode 100644 arch/unicore32/include/asm/processor.h
 delete mode 100644 arch/unicore32/include/asm/ptrace.h
 delete mode 100644 arch/unicore32/include/asm/stacktrace.h
 delete mode 100644 arch/unicore32/include/asm/string.h
 delete mode 100644 arch/unicore32/include/asm/suspend.h
 delete mode 100644 arch/unicore32/include/asm/switch_to.h
 delete mode 100644 arch/unicore32/include/asm/syscall.h
 delete mode 100644 arch/unicore32/include/asm/thread_info.h
 delete mode 100644 arch/unicore32/include/asm/timex.h
 delete mode 100644 arch/unicore32/include/asm/tlb.h
 delete mode 100644 arch/unicore32/include/asm/tlbflush.h
 delete mode 100644 arch/unicore32/include/asm/traps.h
 delete mode 100644 arch/unicore32/include/asm/uaccess.h
 delete mode 100644 arch/unicore32/include/asm/vmalloc.h
 delete mode 100644 arch/unicore32/include/mach/PKUnity.h
 delete mode 100644 arch/unicore32/include/mach/bitfield.h
 delete mode 100644 arch/unicore32/include/mach/dma.h
 delete mode 100644 arch/unicore32/include/mach/hardware.h
 delete mode 100644 arch/unicore32/include/mach/map.h
 delete mode 100644 arch/unicore32/include/mach/memory.h
 delete mode 100644 arch/unicore32/include/mach/ocd.h
 delete mode 100644 arch/unicore32/include/mach/pm.h
 delete mode 100644 arch/unicore32/include/mach/regs-ac97.h
 delete mode 100644 arch/unicore32/include/mach/regs-dmac.h
 delete mode 100644 arch/unicore32/include/mach/regs-gpio.h
 delete mode 100644 arch/unicore32/include/mach/regs-i2c.h
 delete mode 100644 arch/unicore32/include/mach/regs-intc.h
 delete mode 100644 arch/unicore32/include/mach/regs-nand.h
 delete mode 100644 arch/unicore32/include/mach/regs-ost.h
 delete mode 100644 arch/unicore32/include/mach/regs-pci.h
 delete mode 100644 arch/unicore32/include/mach/regs-pm.h
 delete mode 100644 arch/unicore32/include/mach/regs-ps2.h
 delete mode 100644 arch/unicore32/include/mach/regs-resetc.h
 delete mode 100644 arch/unicore32/include/mach/regs-rtc.h
 delete mode 100644 arch/unicore32/include/mach/regs-sdc.h
 delete mode 100644 arch/unicore32/include/mach/regs-spi.h
 delete mode 100644 arch/unicore32/include/mach/regs-uart.h
 delete mode 100644 arch/unicore32/include/mach/regs-umal.h
 delete mode 100644 arch/unicore32/include/mach/regs-unigfx.h
 delete mode 100644 arch/unicore32/include/mach/uncompress.h
 delete mode 100644 arch/unicore32/include/uapi/asm/Kbuild
 delete mode 100644 arch/unicore32/include/uapi/asm/byteorder.h
 delete mode 100644 arch/unicore32/include/uapi/asm/ptrace.h
 delete mode 100644 arch/unicore32/include/uapi/asm/sigcontext.h
 delete mode 100644 arch/unicore32/include/uapi/asm/unistd.h
 delete mode 100644 arch/unicore32/kernel/Makefile
 delete mode 100644 arch/unicore32/kernel/asm-offsets.c
 delete mode 100644 arch/unicore32/kernel/clock.c
 delete mode 100644 arch/unicore32/kernel/debug-macro.S
 delete mode 100644 arch/unicore32/kernel/debug.S
 delete mode 100644 arch/unicore32/kernel/dma.c
 delete mode 100644 arch/unicore32/kernel/early_printk.c
 delete mode 100644 arch/unicore32/kernel/elf.c
 delete mode 100644 arch/unicore32/kernel/entry.S
 delete mode 100644 arch/unicore32/kernel/fpu-ucf64.c
 delete mode 100644 arch/unicore32/kernel/gpio.c
 delete mode 100644 arch/unicore32/kernel/head.S
 delete mode 100644 arch/unicore32/kernel/hibernate.c
 delete mode 100644 arch/unicore32/kernel/hibernate_asm.S
 delete mode 100644 arch/unicore32/kernel/irq.c
 delete mode 100644 arch/unicore32/kernel/ksyms.c
 delete mode 100644 arch/unicore32/kernel/ksyms.h
 delete mode 100644 arch/unicore32/kernel/module.c
 delete mode 100644 arch/unicore32/kernel/pci.c
 delete mode 100644 arch/unicore32/kernel/pm.c
 delete mode 100644 arch/unicore32/kernel/process.c
 delete mode 100644 arch/unicore32/kernel/ptrace.c
 delete mode 100644 arch/unicore32/kernel/puv3-core.c
 delete mode 100644 arch/unicore32/kernel/puv3-nb0916.c
 delete mode 100644 arch/unicore32/kernel/setup.c
 delete mode 100644 arch/unicore32/kernel/setup.h
 delete mode 100644 arch/unicore32/kernel/signal.c
 delete mode 100644 arch/unicore32/kernel/sleep.S
 delete mode 100644 arch/unicore32/kernel/stacktrace.c
 delete mode 100644 arch/unicore32/kernel/sys.c
 delete mode 100644 arch/unicore32/kernel/time.c
 delete mode 100644 arch/unicore32/kernel/traps.c
 delete mode 100644 arch/unicore32/kernel/vmlinux.lds.S
 delete mode 100644 arch/unicore32/lib/Makefile
 delete mode 100644 arch/unicore32/lib/backtrace.S
 delete mode 100644 arch/unicore32/lib/clear_user.S
 delete mode 100644 arch/unicore32/lib/copy_from_user.S
 delete mode 100644 arch/unicore32/lib/copy_page.S
 delete mode 100644 arch/unicore32/lib/copy_template.S
 delete mode 100644 arch/unicore32/lib/copy_to_user.S
 delete mode 100644 arch/unicore32/lib/delay.S
 delete mode 100644 arch/unicore32/lib/findbit.S
 delete mode 100644 arch/unicore32/lib/strncpy_from_user.S
 delete mode 100644 arch/unicore32/lib/strnlen_user.S
 delete mode 100644 arch/unicore32/mm/Kconfig
 delete mode 100644 arch/unicore32/mm/Makefile
 delete mode 100644 arch/unicore32/mm/alignment.c
 delete mode 100644 arch/unicore32/mm/cache-ucv2.S
 delete mode 100644 arch/unicore32/mm/extable.c
 delete mode 100644 arch/unicore32/mm/fault.c
 delete mode 100644 arch/unicore32/mm/flush.c
 delete mode 100644 arch/unicore32/mm/init.c
 delete mode 100644 arch/unicore32/mm/ioremap.c
 delete mode 100644 arch/unicore32/mm/mm.h
 delete mode 100644 arch/unicore32/mm/mmu.c
 delete mode 100644 arch/unicore32/mm/pgd.c
 delete mode 100644 arch/unicore32/mm/proc-macros.S
 delete mode 100644 arch/unicore32/mm/proc-syms.c
 delete mode 100644 arch/unicore32/mm/proc-ucv2.S
 delete mode 100644 arch/unicore32/mm/tlb-ucv2.S

(limited to 'kernel')

diff --git a/Documentation/features/core/cBPF-JIT/arch-support.txt b/Documentation/features/core/cBPF-JIT/arch-support.txt
index 8620c38d4db0..399935616813 100644
--- a/Documentation/features/core/cBPF-JIT/arch-support.txt
+++ b/Documentation/features/core/cBPF-JIT/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: | TODO |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/core/eBPF-JIT/arch-support.txt b/Documentation/features/core/eBPF-JIT/arch-support.txt
index 9ed964f65224..79409bfe0263 100644
--- a/Documentation/features/core/eBPF-JIT/arch-support.txt
+++ b/Documentation/features/core/eBPF-JIT/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/core/generic-idle-thread/arch-support.txt b/Documentation/features/core/generic-idle-thread/arch-support.txt
index 365df2c2ff0b..9ea60e416efd 100644
--- a/Documentation/features/core/generic-idle-thread/arch-support.txt
+++ b/Documentation/features/core/generic-idle-thread/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/core/jump-labels/arch-support.txt b/Documentation/features/core/jump-labels/arch-support.txt
index 632a1c7aefa2..f8ec5c13cde4 100644
--- a/Documentation/features/core/jump-labels/arch-support.txt
+++ b/Documentation/features/core/jump-labels/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/core/tracehook/arch-support.txt b/Documentation/features/core/tracehook/arch-support.txt
index 964667052eda..cd3510e2eedb 100644
--- a/Documentation/features/core/tracehook/arch-support.txt
+++ b/Documentation/features/core/tracehook/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/debug/KASAN/arch-support.txt b/Documentation/features/debug/KASAN/arch-support.txt
index 6ff38548923e..c3fe9b266e7b 100644
--- a/Documentation/features/debug/KASAN/arch-support.txt
+++ b/Documentation/features/debug/KASAN/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
index c527d05c0459..ca6bacb1e99e 100644
--- a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
+++ b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/debug/gcov-profile-all/arch-support.txt b/Documentation/features/debug/gcov-profile-all/arch-support.txt
index 210256f6a4cf..7563a494ddb8 100644
--- a/Documentation/features/debug/gcov-profile-all/arch-support.txt
+++ b/Documentation/features/debug/gcov-profile-all/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/debug/kgdb/arch-support.txt b/Documentation/features/debug/kgdb/arch-support.txt
index 38c40cfa0578..4b0a1d0d6ba4 100644
--- a/Documentation/features/debug/kgdb/arch-support.txt
+++ b/Documentation/features/debug/kgdb/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt
index 97cd7aa74905..6225cfe0c5bf 100644
--- a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt
+++ b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/debug/kprobes/arch-support.txt b/Documentation/features/debug/kprobes/arch-support.txt
index 8b316c6e03d4..371f0ac488f5 100644
--- a/Documentation/features/debug/kprobes/arch-support.txt
+++ b/Documentation/features/debug/kprobes/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/debug/kretprobes/arch-support.txt b/Documentation/features/debug/kretprobes/arch-support.txt
index b805aada395e..38e95251deed 100644
--- a/Documentation/features/debug/kretprobes/arch-support.txt
+++ b/Documentation/features/debug/kretprobes/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/debug/optprobes/arch-support.txt b/Documentation/features/debug/optprobes/arch-support.txt
index fb297a88f62c..7f4a20e6a12b 100644
--- a/Documentation/features/debug/optprobes/arch-support.txt
+++ b/Documentation/features/debug/optprobes/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/debug/stackprotector/arch-support.txt b/Documentation/features/debug/stackprotector/arch-support.txt
index 12410f606edc..3db4763aa3f5 100644
--- a/Documentation/features/debug/stackprotector/arch-support.txt
+++ b/Documentation/features/debug/stackprotector/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/debug/uprobes/arch-support.txt b/Documentation/features/debug/uprobes/arch-support.txt
index be8acbb95b54..43cac6ee0c68 100644
--- a/Documentation/features/debug/uprobes/arch-support.txt
+++ b/Documentation/features/debug/uprobes/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/debug/user-ret-profiler/arch-support.txt b/Documentation/features/debug/user-ret-profiler/arch-support.txt
index 6bfa36b0e017..d636ed0e679f 100644
--- a/Documentation/features/debug/user-ret-profiler/arch-support.txt
+++ b/Documentation/features/debug/user-ret-profiler/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/io/dma-contiguous/arch-support.txt b/Documentation/features/io/dma-contiguous/arch-support.txt
index 895c3b0f6492..dfc93d074e3d 100644
--- a/Documentation/features/io/dma-contiguous/arch-support.txt
+++ b/Documentation/features/io/dma-contiguous/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/locking/cmpxchg-local/arch-support.txt b/Documentation/features/locking/cmpxchg-local/arch-support.txt
index 242ff5a6586e..1815c7fed06d 100644
--- a/Documentation/features/locking/cmpxchg-local/arch-support.txt
+++ b/Documentation/features/locking/cmpxchg-local/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/locking/lockdep/arch-support.txt b/Documentation/features/locking/lockdep/arch-support.txt
index 98cb9d85c55d..4f844ecd0680 100644
--- a/Documentation/features/locking/lockdep/arch-support.txt
+++ b/Documentation/features/locking/lockdep/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: |  ok  |
     |          um: |  ok  |
-    |   unicore32: |  ok  |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/locking/queued-rwlocks/arch-support.txt b/Documentation/features/locking/queued-rwlocks/arch-support.txt
index ee922746a64c..5c6bcfcf8e1f 100644
--- a/Documentation/features/locking/queued-rwlocks/arch-support.txt
+++ b/Documentation/features/locking/queued-rwlocks/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/locking/queued-spinlocks/arch-support.txt b/Documentation/features/locking/queued-spinlocks/arch-support.txt
index c52116c1a049..b55e420a34ea 100644
--- a/Documentation/features/locking/queued-spinlocks/arch-support.txt
+++ b/Documentation/features/locking/queued-spinlocks/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/perf/kprobes-event/arch-support.txt b/Documentation/features/perf/kprobes-event/arch-support.txt
index 518f352fc727..04c17c2106a4 100644
--- a/Documentation/features/perf/kprobes-event/arch-support.txt
+++ b/Documentation/features/perf/kprobes-event/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/perf/perf-regs/arch-support.txt b/Documentation/features/perf/perf-regs/arch-support.txt
index c22cd6f8aa5e..e7450fbb8253 100644
--- a/Documentation/features/perf/perf-regs/arch-support.txt
+++ b/Documentation/features/perf/perf-regs/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/perf/perf-stackdump/arch-support.txt b/Documentation/features/perf/perf-stackdump/arch-support.txt
index 527fe4d0b074..98e79d128d9b 100644
--- a/Documentation/features/perf/perf-stackdump/arch-support.txt
+++ b/Documentation/features/perf/perf-stackdump/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/sched/membarrier-sync-core/arch-support.txt b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
index 8a521a622966..68658a6f8c5b 100644
--- a/Documentation/features/sched/membarrier-sync-core/arch-support.txt
+++ b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
@@ -51,7 +51,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/sched/numa-balancing/arch-support.txt b/Documentation/features/sched/numa-balancing/arch-support.txt
index 350823692f28..964457ad26c1 100644
--- a/Documentation/features/sched/numa-balancing/arch-support.txt
+++ b/Documentation/features/sched/numa-balancing/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ..  |
     |       sparc: | TODO |
     |          um: |  ..  |
-    |   unicore32: |  ..  |
     |         x86: |  ok  |
     |      xtensa: |  ..  |
     -----------------------
diff --git a/Documentation/features/seccomp/seccomp-filter/arch-support.txt b/Documentation/features/seccomp/seccomp-filter/arch-support.txt
index c7b837f735b1..f54ddfc06a12 100644
--- a/Documentation/features/seccomp/seccomp-filter/arch-support.txt
+++ b/Documentation/features/seccomp/seccomp-filter/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: |  ok  |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/time/arch-tick-broadcast/arch-support.txt b/Documentation/features/time/arch-tick-broadcast/arch-support.txt
index 593536f7925b..4d11cbb3c09b 100644
--- a/Documentation/features/time/arch-tick-broadcast/arch-support.txt
+++ b/Documentation/features/time/arch-tick-broadcast/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: | TODO |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/time/clockevents/arch-support.txt b/Documentation/features/time/clockevents/arch-support.txt
index 7a27157da408..8287b6aa522e 100644
--- a/Documentation/features/time/clockevents/arch-support.txt
+++ b/Documentation/features/time/clockevents/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: |  ok  |
     |          um: |  ok  |
-    |   unicore32: |  ok  |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/time/context-tracking/arch-support.txt b/Documentation/features/time/context-tracking/arch-support.txt
index 048bfb6d3872..a71f3a945285 100644
--- a/Documentation/features/time/context-tracking/arch-support.txt
+++ b/Documentation/features/time/context-tracking/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/time/irq-time-acct/arch-support.txt b/Documentation/features/time/irq-time-acct/arch-support.txt
index a14bbad8e948..d9082b91f10e 100644
--- a/Documentation/features/time/irq-time-acct/arch-support.txt
+++ b/Documentation/features/time/irq-time-acct/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: |  ..  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/time/modern-timekeeping/arch-support.txt b/Documentation/features/time/modern-timekeeping/arch-support.txt
index 1d46da165b75..a84c3b9d9a94 100644
--- a/Documentation/features/time/modern-timekeeping/arch-support.txt
+++ b/Documentation/features/time/modern-timekeeping/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: |  ok  |
     |          um: |  ok  |
-    |   unicore32: |  ok  |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/time/virt-cpuacct/arch-support.txt b/Documentation/features/time/virt-cpuacct/arch-support.txt
index fb0d0cab9cab..56b372da6b01 100644
--- a/Documentation/features/time/virt-cpuacct/arch-support.txt
+++ b/Documentation/features/time/virt-cpuacct/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/vm/ELF-ASLR/arch-support.txt b/Documentation/features/vm/ELF-ASLR/arch-support.txt
index adc25878d217..eccda0732474 100644
--- a/Documentation/features/vm/ELF-ASLR/arch-support.txt
+++ b/Documentation/features/vm/ELF-ASLR/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/vm/PG_uncached/arch-support.txt b/Documentation/features/vm/PG_uncached/arch-support.txt
index f05588f9e4b4..c74e3f8040e1 100644
--- a/Documentation/features/vm/PG_uncached/arch-support.txt
+++ b/Documentation/features/vm/PG_uncached/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/vm/THP/arch-support.txt b/Documentation/features/vm/THP/arch-support.txt
index cdfe8925f881..1c0b95f2b40d 100644
--- a/Documentation/features/vm/THP/arch-support.txt
+++ b/Documentation/features/vm/THP/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ..  |
     |       sparc: |  ok  |
     |          um: |  ..  |
-    |   unicore32: |  ..  |
     |         x86: |  ok  |
     |      xtensa: |  ..  |
     -----------------------
diff --git a/Documentation/features/vm/TLB/arch-support.txt b/Documentation/features/vm/TLB/arch-support.txt
index 2bdd3b6cee3c..30f75a79ce01 100644
--- a/Documentation/features/vm/TLB/arch-support.txt
+++ b/Documentation/features/vm/TLB/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: |  ..  |
-    |   unicore32: |  ..  |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/vm/huge-vmap/arch-support.txt b/Documentation/features/vm/huge-vmap/arch-support.txt
index 8525f1981f19..c5ff3a427722 100644
--- a/Documentation/features/vm/huge-vmap/arch-support.txt
+++ b/Documentation/features/vm/huge-vmap/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/vm/ioremap_prot/arch-support.txt b/Documentation/features/vm/ioremap_prot/arch-support.txt
index 3a6b87de6a19..1cb7406cd858 100644
--- a/Documentation/features/vm/ioremap_prot/arch-support.txt
+++ b/Documentation/features/vm/ioremap_prot/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/vm/pte_special/arch-support.txt b/Documentation/features/vm/pte_special/arch-support.txt
index 2e017387e228..13d0e1e17001 100644
--- a/Documentation/features/vm/pte_special/arch-support.txt
+++ b/Documentation/features/vm/pte_special/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/MAINTAINERS b/MAINTAINERS
index 496fd4eafb68..1de95aa44bbb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17532,13 +17532,6 @@ L:	linux-fsdevel@vger.kernel.org
 S:	Supported
 F:	fs/unicode/
 
-UNICORE32 ARCHITECTURE
-M:	Guan Xuetao <gxt@pku.edu.cn>
-S:	Maintained
-W:	http://mprc.pku.edu.cn/~guanxuetao/linux
-T:	git git://github.com/gxt/linux.git
-F:	arch/unicore32/
-
 UNIFDEF
 M:	Tony Finch <dot@dotat.at>
 S:	Maintained
diff --git a/arch/unicore32/.gitignore b/arch/unicore32/.gitignore
deleted file mode 100644
index e82f3fb57ba0..000000000000
--- a/arch/unicore32/.gitignore
+++ /dev/null
@@ -1,22 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# Generated include files
-#
-include/generated
-#
-# Generated ld script file
-#
-kernel/vmlinux.lds
-#
-# Generated images in boot
-#
-boot/Image
-boot/zImage
-boot/uImage
-#
-# Generated files in boot/compressed
-#
-boot/compressed/piggy.S
-boot/compressed/piggy.gzip
-boot/compressed/vmlinux
-boot/compressed/vmlinux.lds
diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig
deleted file mode 100644
index 11ba1839d198..000000000000
--- a/arch/unicore32/Kconfig
+++ /dev/null
@@ -1,200 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-config UNICORE32
-	def_bool y
-	select ARCH_32BIT_OFF_T
-	select ARCH_HAS_DEVMEM_IS_ALLOWED
-	select ARCH_HAS_KEEPINITRD
-	select ARCH_MIGHT_HAVE_PC_PARPORT
-	select ARCH_MIGHT_HAVE_PC_SERIO
-	select HAVE_KERNEL_GZIP
-	select HAVE_KERNEL_BZIP2
-	select GENERIC_ATOMIC64
-	select HAVE_KERNEL_LZO
-	select HAVE_KERNEL_LZMA
-	select HAVE_PCI
-	select VIRT_TO_BUS
-	select ARCH_HAVE_CUSTOM_GPIO_H
-	select GENERIC_FIND_FIRST_BIT
-	select GENERIC_IRQ_PROBE
-	select GENERIC_IRQ_SHOW
-	select ARCH_WANT_FRAME_POINTERS
-	select GENERIC_IOMAP
-	select MODULES_USE_ELF_REL
-	select NEED_DMA_MAP_STATE
-	select MMU_GATHER_NO_RANGE if MMU
-	help
-	  UniCore-32 is 32-bit Instruction Set Architecture,
-	  including a series of low-power-consumption RISC chip
-	  designs licensed by PKUnity Ltd.
-	  Please see web page at <http://www.pkunity.com/>.
-
-config GENERIC_CSUM
-	def_bool y
-
-config NO_IOPORT_MAP
-	bool
-
-config STACKTRACE_SUPPORT
-	def_bool y
-
-config LOCKDEP_SUPPORT
-	def_bool y
-
-config ARCH_HAS_ILOG2_U32
-	bool
-
-config ARCH_HAS_ILOG2_U64
-	bool
-
-config GENERIC_HWEIGHT
-	def_bool y
-
-config GENERIC_CALIBRATE_DELAY
-	def_bool y
-
-config ARCH_MAY_HAVE_PC_FDC
-	bool
-
-config ZONE_DMA
-	def_bool y
-
-menu "System Type"
-
-config MMU
-	def_bool y
-
-config ARCH_FPGA
-	bool
-
-config ARCH_PUV3
-	def_bool y
-	select CPU_UCV2
-	select GENERIC_CLOCKEVENTS
-	select HAVE_LEGACY_CLK
-	select GPIOLIB
-
-# CONFIGs for ARCH_PUV3
-
-if ARCH_PUV3
-
-choice
-	prompt "Board Selection"
-	default PUV3_DB0913
-
-config PUV3_FPGA_DLX200
-	select ARCH_FPGA
-	bool "FPGA board"
-
-config PUV3_DB0913
-	bool "DEBUG board (0913)"
-
-config PUV3_NB0916
-	bool "NetBook board (0916)"
-	select PWM
-	select PWM_PUV3
-
-config PUV3_SMW0919
-	bool "Security Mini-Workstation board (0919)"
-
-endchoice
-
-config PUV3_PM
-	def_bool y if !ARCH_FPGA
-
-endif
-
-source "arch/unicore32/mm/Kconfig"
-
-comment "Floating point support"
-
-config UNICORE_FPU_F64
-	def_bool y if !ARCH_FPGA
-
-endmenu
-
-menu "Kernel Features"
-
-source "kernel/Kconfig.hz"
-
-config LEDS
-	def_bool y
-	depends on GPIOLIB
-
-config ALIGNMENT_TRAP
-	def_bool y
-	help
-	  Unicore processors can not fetch/store information which is not
-	  naturally aligned on the bus, i.e., a 4 byte fetch must start at an
-	  address divisible by 4. On 32-bit Unicore processors, these non-aligned
-	  fetch/store instructions will be emulated in software if you say
-	  here, which has a severe performance impact. This is necessary for
-	  correct operation of some network protocols. With an IP-only
-	  configuration it is safe to say N, otherwise say Y.
-
-endmenu
-
-menu "Boot options"
-
-config CMDLINE
-	string "Default kernel command string"
-	default ""
-
-config CMDLINE_FORCE
-	bool "Always use the default kernel command string"
-	depends on CMDLINE != ""
-	help
-	  Always use the default kernel command string, even if the boot
-	  loader passes other arguments to the kernel.
-	  This is useful if you cannot or don't want to change the
-	  command-line options your boot loader passes to the kernel.
-
-	  If unsure, say N.
-
-endmenu
-
-menu "Power management options"
-
-source "kernel/power/Kconfig"
-
-source "drivers/cpufreq/Kconfig"
-
-config ARCH_SUSPEND_POSSIBLE
-	def_bool y if !ARCH_FPGA
-
-config ARCH_HIBERNATION_POSSIBLE
-	def_bool y if !ARCH_FPGA
-
-endmenu
-
-if ARCH_PUV3
-
-config PUV3_GPIO
-	bool
-	depends on !ARCH_FPGA
-	select GPIO_SYSFS
-	default y
-
-if PUV3_NB0916
-
-menu "PKUnity NetBook-0916 Features"
-
-config I2C_BATTERY_BQ27200
-	tristate "I2C Battery BQ27200 Support"
-	select I2C_PUV3
-	select POWER_SUPPLY
-	select BATTERY_BQ27XXX
-
-config I2C_EEPROM_AT24
-	tristate "I2C EEPROMs AT24 support"
-	select I2C_PUV3
-	select EEPROM_AT24
-
-config LCD_BACKLIGHT
-	tristate "LCD Backlight support"
-	select BACKLIGHT_PWM
-
-endmenu
-
-endif
-
-endif
diff --git a/arch/unicore32/Kconfig.debug b/arch/unicore32/Kconfig.debug
deleted file mode 100644
index ca0ff97657ef..000000000000
--- a/arch/unicore32/Kconfig.debug
+++ /dev/null
@@ -1,29 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-
-config EARLY_PRINTK
-	def_bool DEBUG_OCD
-	help
-	  Write kernel log output directly into the ocd or to a serial port.
-
-	  This is useful for kernel debugging when your machine crashes very
-	  early before the console code is initialized. For normal operation
-	  it is not recommended because it looks ugly and doesn't cooperate
-	  with klogd/syslogd or the X server. You should normally N here,
-	  unless you want to debug such a crash.
-
-# These options are only for real kernel hackers who want to get their hands dirty.
-config DEBUG_LL
-	bool "Kernel low-level debugging functions"
-	depends on DEBUG_KERNEL
-	help
-	  Say Y here to include definitions of printascii, printch, printhex
-	  in the kernel.  This is helpful if you are debugging code that
-	  executes before the console is initialized.
-
-config DEBUG_OCD
-	bool "Kernel low-level debugging via On-Chip-Debugger"
-	depends on DEBUG_LL
-	default y
-	help
-	  Say Y here if you want the debug print routines to direct their
-	  output to the UniCore On-Chip-Debugger channel using CP #1.
diff --git a/arch/unicore32/Makefile b/arch/unicore32/Makefile
deleted file mode 100644
index 390819947c37..000000000000
--- a/arch/unicore32/Makefile
+++ /dev/null
@@ -1,59 +0,0 @@
-#
-# arch/unicore32/Makefile
-#
-# This file is included by the global makefile so that you can add your own
-# architecture-specific flags and dependencies.
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-# Copyright (C) 2002~2010 by Guan Xue-tao
-#
-ifneq ($(SUBARCH),$(ARCH))
-	ifeq ($(CROSS_COMPILE),)
-		CROSS_COMPILE := $(call cc-cross-prefix, unicore32-linux-)
-	endif
-endif
-
-LDFLAGS_vmlinux		:= -p --no-undefined -X
-
-OBJCOPYFLAGS		:= -O binary -R .note -R .note.gnu.build-id -R .comment -S
-
-# Never generate .eh_frame
-KBUILD_CFLAGS		+= $(call cc-option,-fno-dwarf2-cfi-asm)
-
-# Never use hard float in kernel
-KBUILD_CFLAGS		+= -msoft-float
-
-ifeq ($(CONFIG_FRAME_POINTER),y)
-KBUILD_CFLAGS		+= -mno-sched-prolog
-endif
-
-CHECKFLAGS		+= -D__unicore32__
-
-head-y			:= arch/unicore32/kernel/head.o
-
-core-y			+= arch/unicore32/kernel/
-core-y			+= arch/unicore32/mm/
-
-libs-y			+= arch/unicore32/lib/
-
-boot			:= arch/unicore32/boot
-
-# Default target when executing plain make
-KBUILD_IMAGE		:= $(boot)/zImage
-
-all:	zImage
-
-zImage Image uImage: vmlinux
-	$(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
-
-archclean:
-	$(Q)$(MAKE) $(clean)=$(boot)
-
-define archhelp
-  echo  '* zImage        - Compressed kernel image (arch/$(ARCH)/boot/zImage)'
-  echo  '  Image         - Uncompressed kernel image (arch/$(ARCH)/boot/Image)'
-  echo  '  uImage        - U-Boot wrapped zImage'
-endef
diff --git a/arch/unicore32/boot/Makefile b/arch/unicore32/boot/Makefile
deleted file mode 100644
index 828855007b29..000000000000
--- a/arch/unicore32/boot/Makefile
+++ /dev/null
@@ -1,39 +0,0 @@
-#
-# arch/unicore32/boot/Makefile
-#
-# This file is included by the global makefile so that you can add your own
-# architecture-specific flags and dependencies.
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-# Copyright (C) 2001~2010 GUAN Xue-tao
-#
-
-targets := Image zImage uImage
-
-$(obj)/Image: vmlinux FORCE
-	$(call if_changed,objcopy)
-	@echo '  Kernel: $@ is ready'
-
-$(obj)/compressed/vmlinux: $(obj)/Image FORCE
-	$(Q)$(MAKE) $(build)=$(obj)/compressed $@
-
-$(obj)/zImage: $(obj)/compressed/vmlinux FORCE
-	$(call if_changed,objcopy)
-	@echo '  Kernel: $@ is ready'
-
-UIMAGE_ARCH = unicore
-UIMAGE_LOADADDR = 0x0
-
-$(obj)/uImage: $(obj)/zImage FORCE
-	$(call if_changed,uimage)
-	@echo '  Image $@ is ready'
-
-PHONY += initrd
-initrd:
-	@test "$(INITRD)" != "" || \
-	(echo You must specify INITRD; exit -1)
-
-subdir- := compressed
diff --git a/arch/unicore32/boot/compressed/Makefile b/arch/unicore32/boot/compressed/Makefile
deleted file mode 100644
index 150fafc32fb0..000000000000
--- a/arch/unicore32/boot/compressed/Makefile
+++ /dev/null
@@ -1,64 +0,0 @@
-#
-# linux/arch/unicore32/boot/compressed/Makefile
-#
-# create a compressed vmlinuz image from the original vmlinux
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-# Copyright (C) 2001~2010 GUAN Xue-tao
-#
-
-ccflags-y	:= -fpic -fno-builtin
-asflags-y	:= -Wa,-march=all
-
-OBJS		:= misc.o
-
-# font.c and font.o
-CFLAGS_font.o	:= -Dstatic=
-$(obj)/font.c: $(srctree)/lib/fonts/font_8x8.c
-	$(call cmd,shipped)
-
-# piggy.S and piggy.o
-suffix_$(CONFIG_KERNEL_GZIP)	:= gzip
-suffix_$(CONFIG_KERNEL_BZIP2)	:= bz2
-suffix_$(CONFIG_KERNEL_LZO)	:= lzo
-suffix_$(CONFIG_KERNEL_LZMA)	:= lzma
-
-$(obj)/piggy.$(suffix_y): $(obj)/../Image FORCE
-	$(call if_changed,$(suffix_y))
-
-SEDFLAGS_piggy	= s/DECOMP_SUFFIX/$(suffix_y)/
-$(obj)/piggy.S: $(obj)/piggy.S.in
-	@sed "$(SEDFLAGS_piggy)" < $< > $@
-
-$(obj)/piggy.o:  $(obj)/piggy.$(suffix_y) $(obj)/piggy.S FORCE
-
-targets		:= vmlinux vmlinux.lds font.o font.c head.o misc.o \
-			piggy.$(suffix_y) piggy.o piggy.S \
-
-# Make sure files are removed during clean
-extra-y		+= piggy.gzip piggy.bz2 piggy.lzo piggy.lzma
-
-# ?
-LDFLAGS_vmlinux += -p
-# Report unresolved symbol references
-LDFLAGS_vmlinux += --no-undefined
-# Delete all temporary local symbols
-LDFLAGS_vmlinux += -X
-# Next argument is a linker script
-LDFLAGS_vmlinux += -T
-
-# For uidivmod
-$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head.o $(obj)/piggy.o \
-		$(obj)/misc.o FORCE
-	$(call if_changed,ld)
-
-# We now have a PIC decompressor implementation.  Decompressors running
-# from RAM should not define ZTEXTADDR.  Decompressors running directly
-# from ROM or Flash must define ZTEXTADDR (preferably via the config)
-ZTEXTADDR	:= 0x03000000
-ZBSSADDR	:= ALIGN(4)
-
-CPPFLAGS_vmlinux.lds = -DTEXT_START="$(ZTEXTADDR)" -DBSS_START="$(ZBSSADDR)"
diff --git a/arch/unicore32/boot/compressed/head.S b/arch/unicore32/boot/compressed/head.S
deleted file mode 100644
index 5f72662cd294..000000000000
--- a/arch/unicore32/boot/compressed/head.S
+++ /dev/null
@@ -1,201 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/boot/compressed/head.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/linkage.h>
-#include <mach/memory.h>
-
-#define csub	cmpsub
-#define cand	cmpand
-#define nop8	nop; nop; nop; nop; nop; nop; nop; nop
-
-		.section ".start", #alloc, #execinstr
-		.text
-start:
-		.type	start,#function
-
-		/* Initialize ASR, PRIV mode and INTR off */
-		mov	r0, #0xD3
-		mov.a	asr, r0
-
-		adr	r0, LC0
-		ldm	(r1, r2, r3, r5, r6, r7, r8), [r0]+
-		ldw	sp, [r0+], #28
-		sub.a	r0, r0, r1		@ calculate the delta offset
-
-		/*
-		 * if delta is zero, we are running at the address
-		 * we were linked at.
-		 */
-		beq	not_relocated
-
-		/*
-		 * We're running at a different address.  We need to fix
-		 * up various pointers:
-		 *   r5 - zImage base address (_start)
-		 *   r7 - GOT start
-		 *   r8 - GOT end
-		 */
-		add	r5, r5, r0
-		add	r7, r7, r0
-		add	r8, r8, r0
-
-		/*
-		 * we need to fix up pointers into the BSS region.
-		 *   r2 - BSS start
-		 *   r3 - BSS end
-		 *   sp - stack pointer
-		 */
-		add	r2, r2, r0
-		add	r3, r3, r0
-		add	sp, sp, r0
-
-		/*
-		 * Relocate all entries in the GOT table.
-		 * This fixes up the C references.
-		 *   r7 - GOT start
-		 *   r8 - GOT end
-		 */
-1001:		ldw	r1, [r7+], #0
-		add	r1, r1, r0
-		stw.w	r1, [r7]+, #4
-		csub.a	r7, r8
-		bub	1001b
-
-not_relocated:
-		/*
-		 * Clear BSS region.
-		 *   r2 - BSS start
-		 *   r3 - BSS end
-		 */
-		mov	r0, #0
-1002:		stw.w	r0, [r2]+, #4
-		csub.a	r2, r3
-		bub	1002b
-
-		/*
-		 * Turn on the cache.
-		 */
-                mov     r0, #0
-                movc    p0.c5, r0, #28		@ cache invalidate all
-                nop8
-                movc    p0.c6, r0, #6		@ tlb invalidate all
-                nop8
-
-                mov     r0, #0x1c		@ en icache and wb dcache
-                movc    p0.c1, r0, #0
-                nop8
-
-		/*
-		 * Set up some pointers, for starting decompressing.
-		 */
-
-		mov	r1, sp			@ malloc space above stack
-		add	r2, sp, #0x10000	@ 64k max
-
-		/*
-		 * Check to see if we will overwrite ourselves.
-		 *   r4 = final kernel address
-		 *   r5 = start of this image
-		 *   r6 = size of decompressed image
-		 *   r2 = end of malloc space (and therefore this image)
-		 * We basically want:
-		 *   r4 >= r2 -> OK
-		 *   r4 + image length <= r5 -> OK
-		 */
-		ldw	r4, =KERNEL_IMAGE_START
-		csub.a	r4, r2
-		bea	wont_overwrite
-		add	r0, r4, r6
-		csub.a	r0, r5
-		beb	wont_overwrite
-
-		/*
-		 * If overwrite, just print error message
-		 */
-		b	__error_overwrite
-
-		/*
-		 * We're not in danger of overwriting ourselves.
-		 * Do this the simple way.
-		 */
-wont_overwrite:
-		/*
-		 * decompress_kernel:
-		 *   r0: output_start
-		 *   r1: free_mem_ptr_p
-		 *   r2: free_mem_ptr_end_p
-		 */
-		mov	r0, r4
-		b.l	decompress_kernel	@ C functions
-
-		/*
-		 * Clean and flush the cache to maintain consistency.
-		 */
-		mov	r0, #0
-                movc    p0.c5, r0, #14		@ flush dcache
-		nop8
-                movc    p0.c5, r0, #20		@ icache invalidate all
-                nop8
-
-		/*
-		 * Turn off the Cache and MMU.
-		 */
-		mov	r0, #0			@ disable i/d cache and MMU
-		movc	p0.c1, r0, #0
-                nop8
-
-		mov	r0, #0			@ must be zero
-		ldw	r4, =KERNEL_IMAGE_START
-		mov	pc, r4			@ call kernel
-
-
-		.align	2
-		.type	LC0, #object
-LC0:		.word	LC0			@ r1
-		.word	__bss_start		@ r2
-		.word	_end			@ r3
-		.word	_start			@ r5
-		.word	_image_size		@ r6
-		.word	_got_start		@ r7
-		.word	_got_end		@ r8
-		.word	decompress_stack_end	@ sp
-		.size	LC0, . - LC0
-
-print_string:
-#ifdef CONFIG_DEBUG_OCD
-2001:		ldb.w	r1, [r0]+, #1
-		csub.a	r1, #0
-		bne	2002f
-		mov	pc, lr
-2002:
-		movc	r2, p1.c0, #0
-		cand.a	r2, #2
-		bne	2002b
-		movc	p1.c1, r1, #1
-		csub.a	r1, #'\n'
-		cmoveq	r1, #'\r'
-		beq	2002b
-		b	2001b
-#else
-		mov	pc, lr
-#endif
-
-__error_overwrite:
-		adr	r0, str_error
-		b.l	print_string
-2001:		nop8
-		b	2001b
-str_error:	.asciz	"\nError: Kernel address OVERWRITE\n"
-		.align
-
-		.ltorg
-
-		.align	4
-		.section ".stack", "aw", %nobits
-decompress_stack:	.space	4096
-decompress_stack_end:
diff --git a/arch/unicore32/boot/compressed/misc.c b/arch/unicore32/boot/compressed/misc.c
deleted file mode 100644
index 450d3355de20..000000000000
--- a/arch/unicore32/boot/compressed/misc.c
+++ /dev/null
@@ -1,123 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/boot/compressed/misc.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#include <asm/unaligned.h>
-#include <mach/uncompress.h>
-
-/*
- * gzip delarations
- */
-unsigned char *output_data;
-unsigned long output_ptr;
-
-unsigned int free_mem_ptr;
-unsigned int free_mem_end_ptr;
-
-#define STATIC static
-#define STATIC_RW_DATA	/* non-static please */
-
-/*
- * arch-dependent implementations
- */
-#ifndef ARCH_HAVE_DECOMP_ERROR
-#define arch_decomp_error(x)
-#endif
-
-#ifndef ARCH_HAVE_DECOMP_SETUP
-#define arch_decomp_setup()
-#endif
-
-#ifndef ARCH_HAVE_DECOMP_PUTS
-#define arch_decomp_puts(p)
-#endif
-
-void *memcpy(void *dest, const void *src, size_t n)
-{
-	int i = 0;
-	unsigned char *d = (unsigned char *)dest, *s = (unsigned char *)src;
-
-	for (i = n >> 3; i > 0; i--) {
-		*d++ = *s++;
-		*d++ = *s++;
-		*d++ = *s++;
-		*d++ = *s++;
-		*d++ = *s++;
-		*d++ = *s++;
-		*d++ = *s++;
-		*d++ = *s++;
-	}
-
-	if (n & 1 << 2) {
-		*d++ = *s++;
-		*d++ = *s++;
-		*d++ = *s++;
-		*d++ = *s++;
-	}
-
-	if (n & 1 << 1) {
-		*d++ = *s++;
-		*d++ = *s++;
-	}
-
-	if (n & 1)
-		*d++ = *s++;
-
-	return dest;
-}
-
-void error(char *x)
-{
-	arch_decomp_puts("\n\n");
-	arch_decomp_puts(x);
-	arch_decomp_puts("\n\n -- System halted");
-
-	arch_decomp_error(x);
-
-	for (;;)
-		; /* Halt */
-}
-
-/* Heap size should be adjusted for different decompress method */
-#ifdef CONFIG_KERNEL_GZIP
-#include "../../../../lib/decompress_inflate.c"
-#endif
-
-#ifdef CONFIG_KERNEL_BZIP2
-#include "../../../../lib/decompress_bunzip2.c"
-#endif
-
-#ifdef CONFIG_KERNEL_LZO
-#include "../../../../lib/decompress_unlzo.c"
-#endif
-
-#ifdef CONFIG_KERNEL_LZMA
-#include "../../../../lib/decompress_unlzma.c"
-#endif
-
-unsigned long decompress_kernel(unsigned long output_start,
-		unsigned long free_mem_ptr_p,
-		unsigned long free_mem_ptr_end_p)
-{
-	unsigned char *tmp;
-
-	output_data		= (unsigned char *)output_start;
-	free_mem_ptr		= free_mem_ptr_p;
-	free_mem_end_ptr	= free_mem_ptr_end_p;
-
-	arch_decomp_setup();
-
-	tmp = (unsigned char *) (((unsigned long)input_data_end) - 4);
-	output_ptr = get_unaligned_le32(tmp);
-
-	arch_decomp_puts("Uncompressing Linux...");
-	__decompress(input_data, input_data_end - input_data, NULL, NULL,
-			output_data, 0, NULL, error);
-	arch_decomp_puts(" done, booting the kernel.\n");
-	return output_ptr;
-}
diff --git a/arch/unicore32/boot/compressed/piggy.S.in b/arch/unicore32/boot/compressed/piggy.S.in
deleted file mode 100644
index b79704d58026..000000000000
--- a/arch/unicore32/boot/compressed/piggy.S.in
+++ /dev/null
@@ -1,6 +0,0 @@
-	.section .piggydata,#alloc
-	.globl	input_data
-input_data:
-	.incbin	"arch/unicore32/boot/compressed/piggy.DECOMP_SUFFIX"
-	.globl	input_data_end
-input_data_end:
diff --git a/arch/unicore32/boot/compressed/vmlinux.lds.S b/arch/unicore32/boot/compressed/vmlinux.lds.S
deleted file mode 100644
index edda4ddfa357..000000000000
--- a/arch/unicore32/boot/compressed/vmlinux.lds.S
+++ /dev/null
@@ -1,58 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore/boot/compressed/vmlinux.lds.in
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-OUTPUT_ARCH(unicore32)
-ENTRY(_start)
-SECTIONS
-{
-  /DISCARD/ : {
-    /*
-     * Discard any r/w data - this produces a link error if we have any,
-     * which is required for PIC decompression.  Local data generates
-     * GOTOFF relocations, which prevents it being relocated independently
-     * of the text/got segments.
-     */
-    *(.data)
-  }
-
-  . = TEXT_START;
-  _text = .;
-
-  .text : {
-    _start = .;
-    *(.start)
-    *(.text)
-    *(.text.*)
-    *(.fixup)
-    *(.gnu.warning)
-    *(.rodata)
-    *(.rodata.*)
-    *(.piggydata)
-    . = ALIGN(4);
-  }
-
-  _etext = .;
-
-  /* Assume size of decompressed image is 4x the compressed image */
-  _image_size = (_etext - _text) * 4;
-
-  _got_start = .;
-  .got			: { *(.got) }
-  _got_end = .;
-  .got.plt		: { *(.got.plt) }
-  _edata = .;
-
-  . = BSS_START;
-  __bss_start = .;
-  .bss			: { *(.bss) }
-  _end = .;
-
-  .stack		: { *(.stack) }
-  .comment 0		: { *(.comment) }
-}
-
diff --git a/arch/unicore32/configs/defconfig b/arch/unicore32/configs/defconfig
deleted file mode 100644
index 360cc9abcdb0..000000000000
--- a/arch/unicore32/configs/defconfig
+++ /dev/null
@@ -1,214 +0,0 @@
-### General setup
-CONFIG_EXPERIMENTAL=y
-CONFIG_LOCALVERSION="-unicore32"
-CONFIG_SWAP=y
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_HOTPLUG=y
-#	Initial RAM filesystem and RAM disk (initramfs/initrd) support
-#CONFIG_BLK_DEV_INITRD=y
-#CONFIG_INITRAMFS_SOURCE="arch/unicore/ramfs/ramfs_config"
-
-### Enable loadable module support
-CONFIG_MODULES=n
-CONFIG_MODULE_UNLOAD=y
-
-### System Type
-CONFIG_ARCH_PUV3=y
-#	Board Selection
-CONFIG_PUV3_NB0916=y
-#	Processor Features
-CONFIG_CPU_DCACHE_LINE_DISABLE=y
-CONFIG_CPU_TLB_SINGLE_ENTRY_DISABLE=n
-
-### Bus support
-CONFIG_PCI=y
-CONFIG_PCI_LEGACY=n
-
-### Boot options
-#	for debug, adding: earlyprintk=ocd,keep initcall_debug
-#	others support: test_suspend=mem root=/dev/sda
-#	hibernate support: resume=/dev/sda3
-CONFIG_CMDLINE="earlyprintk=ocd,keep ignore_loglevel"
-# TODO: mem=512M video=unifb:1024x600-16@75
-# for nfs: root=/dev/nfs rw nfsroot=192.168.10.88:/home/udb/nfs/,rsize=1024,wsize=1024
-#	ip=192.168.10.83:192.168.10.88:192.168.10.1:255.255.255.0::eth0:off
-CONFIG_CMDLINE_FORCE=y
-
-### Power management options
-CONFIG_PM=y
-CONFIG_HIBERNATION=y
-CONFIG_PM_STD_PARTITION="/dev/sda3"
-CONFIG_CPU_FREQ=n
-CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y
-
-### Networking support
-CONFIG_NET=y
-#	Networking options
-CONFIG_PACKET=m
-CONFIG_UNIX=m
-#	TCP/IP networking
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-CONFIG_IP_PNP=y
-CONFIG_IPV6=n
-#	Wireless
-CONFIG_WIRELESS=y
-CONFIG_WIRELESS_EXT=y
-CONFIG_MAC80211=m
-
-### PKUnity SoC Features
-CONFIG_USB_WLAN_HED_AQ3=n
-CONFIG_USB_CMMB_INNOFIDEI=n
-CONFIG_I2C_BATTERY_BQ27200=n
-CONFIG_I2C_EEPROM_AT24=n
-CONFIG_LCD_BACKLIGHT=n
-
-CONFIG_PUV3_UMAL=y
-CONFIG_PUV3_MUSB=n
-CONFIG_PUV3_AC97=n
-CONFIG_PUV3_NAND=n
-CONFIG_PUV3_MMC=n
-CONFIG_PUV3_UART=n
-
-### Device Drivers
-#	Memory Technology Device (MTD) support
-CONFIG_MTD=m
-CONFIG_MTD_UBI=m
-CONFIG_MTD_PARTITIONS=y
-CONFIG_MTD_CHAR=m
-CONFIG_MTD_BLKDEVS=m
-#	RAM/ROM/Flash chip drivers
-CONFIG_MTD_CFI=m
-CONFIG_MTD_JEDECPROBE=m
-CONFIG_MTD_CFI_AMDSTD=m
-#	Mapping drivers for chip access
-CONFIG_MTD_PHYSMAP=m
-
-#	Block devices
-CONFIG_BLK_DEV_LOOP=m
-
-#	SCSI device support
-CONFIG_SCSI=y
-CONFIG_BLK_DEV_SD=y
-CONFIG_BLK_DEV_SR=m
-CONFIG_CHR_DEV_SG=m
-
-#	Serial ATA (prod) and Parallel ATA (experimental) drivers
-CONFIG_ATA=y
-CONFIG_SATA_VIA=y
-
-#	Network device support
-CONFIG_NETDEVICES=y
-CONFIG_NET_ETHERNET=y
-CONFIG_NETDEV_1000=y
-#	Wireless LAN
-CONFIG_WLAN_80211=n
-CONFIG_RT2X00=n
-CONFIG_RT73USB=n
-
-#	Input device support
-CONFIG_INPUT_EVDEV=m
-#	Keyboards
-CONFIG_KEYBOARD_GPIO=m
-
-#	I2C support
-CONFIG_I2C=y
-CONFIG_I2C_PUV3=y
-
-#	Hardware Monitoring support
-#CONFIG_SENSORS_LM75=m
-#	Generic Thermal sysfs driver
-#CONFIG_THERMAL=y
-#CONFIG_THERMAL_HWMON=y
-
-#	Multimedia support
-CONFIG_MEDIA_SUPPORT=n
-CONFIG_VIDEO_DEV=n
-CONFIG_USB_VIDEO_CLASS=n
-
-#	Graphics support
-CONFIG_FB=y
-CONFIG_FB_PUV3_UNIGFX=y
-#	Console display driver support
-CONFIG_VGA_CONSOLE=n
-CONFIG_FRAMEBUFFER_CONSOLE=y
-CONFIG_FONTS=y
-CONFIG_FONT_8x8=y
-CONFIG_FONT_8x16=y
-#	Bootup logo
-CONFIG_LOGO=n
-
-#	Sound card support
-CONFIG_SOUND=m
-#	Advanced Linux Sound Architecture
-CONFIG_SND=m
-CONFIG_SND_MIXER_OSS=m
-CONFIG_SND_PCM_OSS=m
-
-#	USB support
-CONFIG_USB_ARCH_HAS_HCD=n
-CONFIG_USB=n
-CONFIG_USB_PRINTER=n
-CONFIG_USB_STORAGE=n
-#	Inventra Highspeed Dual Role Controller
-CONFIG_USB_MUSB_HDRC=n
-
-#	LED Support
-CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=y
-CONFIG_LEDS_GPIO=y
-#	LED Triggers
-CONFIG_LEDS_TRIGGERS=y
-CONFIG_LEDS_TRIGGER_TIMER=y
-CONFIG_LEDS_TRIGGER_DISK=y
-CONFIG_LEDS_TRIGGER_HEARTBEAT=y
-
-#	Real Time Clock
-CONFIG_RTC_LIB=y
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_PUV3=y
-
-### File systems
-CONFIG_EXT2_FS=m
-CONFIG_EXT3_FS=y
-CONFIG_EXT4_FS=y
-CONFIG_FUSE_FS=m
-#	CD-ROM/DVD Filesystems
-CONFIG_ISO9660_FS=m
-CONFIG_JOLIET=y
-CONFIG_UDF_FS=m
-#	DOS/FAT/NT Filesystems
-CONFIG_VFAT_FS=m
-#	Pseudo filesystems
-CONFIG_PROC_FS=y
-CONFIG_SYSFS=y
-CONFIG_TMPFS=y
-#	Miscellaneous filesystems
-CONFIG_MISC_FILESYSTEMS=y
-CONFIG_JFFS2_FS=m
-CONFIG_UBIFS_FS=m
-#	Network File Systems
-CONFIG_NETWORK_FILESYSTEMS=y
-CONFIG_NFS_FS=y
-CONFIG_NFS_V3=y
-CONFIG_ROOT_NFS=y
-#	Partition Types
-CONFIG_PARTITION_ADVANCED=y
-CONFIG_MSDOS_PARTITION=y
-#	Native language support
-CONFIG_NLS=y
-CONFIG_NLS_CODEPAGE_437=m
-CONFIG_NLS_CODEPAGE_936=m
-CONFIG_NLS_ISO8859_1=m
-CONFIG_NLS_UTF8=m
-
-### Kernel hacking
-CONFIG_FRAME_WARN=8096
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DEBUG_KERNEL=y
-CONFIG_PROVE_LOCKING=n
-CONFIG_DEBUG_BUGVERBOSE=y
-CONFIG_FRAME_POINTER=y
-CONFIG_DEBUG_LL=y
-
diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild
deleted file mode 100644
index 55026e8240d8..000000000000
--- a/arch/unicore32/include/asm/Kbuild
+++ /dev/null
@@ -1,7 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-generic-y += extable.h
-generic-y += kvm_para.h
-generic-y += mcs_spinlock.h
-generic-y += parport.h
-generic-y += syscalls.h
-generic-y += user.h
diff --git a/arch/unicore32/include/asm/assembler.h b/arch/unicore32/include/asm/assembler.h
deleted file mode 100644
index 3de843d92850..000000000000
--- a/arch/unicore32/include/asm/assembler.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/assembler.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- *  Do not include any C declarations in this file - it is included by
- *  assembler source.
- */
-#ifndef __ASSEMBLY__
-#error "Only include this from assembly code"
-#endif
-
-#include <asm/ptrace.h>
-
-/*
- * Little Endian independent macros for shifting bytes within registers.
- */
-#define pull            >>
-#define push            <<
-#define get_byte_0      << #0
-#define get_byte_1	>> #8
-#define get_byte_2	>> #16
-#define get_byte_3	>> #24
-#define put_byte_0      << #0
-#define put_byte_1	<< #8
-#define put_byte_2	<< #16
-#define put_byte_3	<< #24
-
-#define cadd		cmpadd
-#define cand		cmpand
-#define csub		cmpsub
-#define cxor		cmpxor
-
-/*
- * Enable and disable interrupts
- */
-	.macro disable_irq, temp
-	mov	\temp, asr
-	andn     \temp, \temp, #0xFF
-	or	\temp, \temp, #PSR_I_BIT | PRIV_MODE
-	mov.a	asr, \temp
-	.endm
-
-	.macro enable_irq, temp
-	mov	\temp, asr
-	andn     \temp, \temp, #0xFF
-	or	\temp, \temp, #PRIV_MODE
-	mov.a	asr, \temp
-	.endm
-
-#define USER(x...)				\
-9999:	x;					\
-	.pushsection __ex_table, "a";		\
-	.align	3;				\
-	.long	9999b, 9001f;			\
-	.popsection
-
-	.macro	notcond, cond, nexti = .+8
-	.ifc	\cond, eq
-		bne	\nexti
-	.else;	.ifc	\cond, ne
-		beq	\nexti
-	.else;	.ifc	\cond, ea
-		bub	\nexti
-	.else;	.ifc	\cond, ub
-		bea	\nexti
-	.else;	.ifc	\cond, fs
-		bns	\nexti
-	.else;	.ifc	\cond, ns
-		bfs	\nexti
-	.else;	.ifc	\cond, fv
-		bnv	\nexti
-	.else;	.ifc	\cond, nv
-		bfv	\nexti
-	.else;	.ifc	\cond, ua
-		beb	\nexti
-	.else;	.ifc	\cond, eb
-		bua	\nexti
-	.else;	.ifc	\cond, eg
-		bsl	\nexti
-	.else;	.ifc	\cond, sl
-		beg	\nexti
-	.else;	.ifc	\cond, sg
-		bel	\nexti
-	.else;	.ifc	\cond, el
-		bsg	\nexti
-	.else;	.ifnc	\cond, al
-		.error  "Unknown cond in notcond macro argument"
-	.endif;	.endif;	.endif;	.endif;	.endif;	.endif;	.endif
-	.endif;	.endif;	.endif;	.endif;	.endif;	.endif;	.endif
-	.endif
-	.endm
-
-	.macro	usracc, instr, reg, ptr, inc, cond, rept, abort
-	.rept	\rept
-	notcond	\cond, .+8
-9999 :
-	.if	\inc == 1
-	\instr\()b.u \reg, [\ptr], #\inc
-	.elseif	\inc == 4
-	\instr\()w.u \reg, [\ptr], #\inc
-	.else
-	.error	"Unsupported inc macro argument"
-	.endif
-
-	.pushsection __ex_table, "a"
-	.align	3
-	.long	9999b, \abort
-	.popsection
-	.endr
-	.endm
-
-	.macro	strusr, reg, ptr, inc, cond = al, rept = 1, abort = 9001f
-	usracc	st, \reg, \ptr, \inc, \cond, \rept, \abort
-	.endm
-
-	.macro	ldrusr, reg, ptr, inc, cond = al, rept = 1, abort = 9001f
-	usracc	ld, \reg, \ptr, \inc, \cond, \rept, \abort
-	.endm
-
-	.macro	nop8
-	.rept	8
-		nop
-	.endr
-	.endm
diff --git a/arch/unicore32/include/asm/barrier.h b/arch/unicore32/include/asm/barrier.h
deleted file mode 100644
index efb81de87507..000000000000
--- a/arch/unicore32/include/asm/barrier.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Memory barrier implementations for PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2012 GUAN Xue-tao
- */
-#ifndef __UNICORE_BARRIER_H__
-#define __UNICORE_BARRIER_H__
-
-#define isb() __asm__ __volatile__ ("" : : : "memory")
-#define dsb() __asm__ __volatile__ ("" : : : "memory")
-#define dmb() __asm__ __volatile__ ("" : : : "memory")
-
-#include <asm-generic/barrier.h>
-
-#endif /* __UNICORE_BARRIER_H__ */
diff --git a/arch/unicore32/include/asm/bitops.h b/arch/unicore32/include/asm/bitops.h
deleted file mode 100644
index deeb2163f35e..000000000000
--- a/arch/unicore32/include/asm/bitops.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/bitops.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#ifndef __UNICORE_BITOPS_H__
-#define __UNICORE_BITOPS_H__
-
-#define _ASM_GENERIC_BITOPS_FLS_H_
-#define _ASM_GENERIC_BITOPS___FLS_H_
-#define _ASM_GENERIC_BITOPS_FFS_H_
-#define _ASM_GENERIC_BITOPS___FFS_H_
-/*
- * On UNICORE, those functions can be implemented around
- * the cntlz instruction for much better code efficiency.
- */
-
-static inline int fls(unsigned int x)
-{
-	int ret;
-
-	asm("cntlz\t%0, %1" : "=r" (ret) : "r" (x) : "cc");
-	ret = 32 - ret;
-
-	return ret;
-}
-
-#define __fls(x) (fls(x) - 1)
-#define ffs(x) ({ unsigned long __t = (x); fls(__t & -__t); })
-#define __ffs(x) (ffs(x) - 1)
-
-#include <asm-generic/bitops.h>
-
-/* following definitions: to avoid using codes in lib/find_*.c */
-#define find_next_bit		find_next_bit
-#define find_next_zero_bit	find_next_zero_bit
-#define find_first_bit		find_first_bit
-#define find_first_zero_bit	find_first_zero_bit
-
-#include <asm-generic/bitops/find.h>
-
-#endif /* __UNICORE_BITOPS_H__ */
diff --git a/arch/unicore32/include/asm/bug.h b/arch/unicore32/include/asm/bug.h
deleted file mode 100644
index 99acea84a865..000000000000
--- a/arch/unicore32/include/asm/bug.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Bug handling for PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2012 GUAN Xue-tao
- */
-#ifndef __UNICORE_BUG_H__
-#define __UNICORE_BUG_H__
-
-#include <asm-generic/bug.h>
-
-struct pt_regs;
-struct siginfo;
-
-extern void die(const char *msg, struct pt_regs *regs, int err);
-extern void uc32_notify_die(const char *str, struct pt_regs *regs,
-		int sig, int code, void __user *addr,
-		unsigned long err, unsigned long trap);
-
-#endif /* __UNICORE_BUG_H__ */
diff --git a/arch/unicore32/include/asm/cache.h b/arch/unicore32/include/asm/cache.h
deleted file mode 100644
index 44ecd1f300fe..000000000000
--- a/arch/unicore32/include/asm/cache.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/cache.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_CACHE_H__
-#define __UNICORE_CACHE_H__
-
-#define L1_CACHE_SHIFT		(5)
-#define L1_CACHE_BYTES		(1 << L1_CACHE_SHIFT)
-
-/*
- * Memory returned by kmalloc() may be used for DMA, so we must make
- * sure that all such allocations are cache aligned. Otherwise,
- * unrelated code may cause parts of the buffer to be read into the
- * cache before the transfer is done, causing old data to be seen by
- * the CPU.
- */
-#define ARCH_DMA_MINALIGN	L1_CACHE_BYTES
-
-#endif
diff --git a/arch/unicore32/include/asm/cacheflush.h b/arch/unicore32/include/asm/cacheflush.h
deleted file mode 100644
index ff0be92ebc32..000000000000
--- a/arch/unicore32/include/asm/cacheflush.h
+++ /dev/null
@@ -1,186 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/cacheflush.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_CACHEFLUSH_H__
-#define __UNICORE_CACHEFLUSH_H__
-
-#include <linux/mm.h>
-
-#include <asm/shmparam.h>
-
-#define CACHE_COLOUR(vaddr)	((vaddr & (SHMLBA - 1)) >> PAGE_SHIFT)
-
-/*
- * This flag is used to indicate that the page pointed to by a pte is clean
- * and does not require cleaning before returning it to the user.
- */
-#define PG_dcache_clean PG_arch_1
-
-/*
- *	MM Cache Management
- *	===================
- *
- *	The arch/unicore32/mm/cache.S files implement these methods.
- *
- *	Start addresses are inclusive and end addresses are exclusive;
- *	start addresses should be rounded down, end addresses up.
- *
- *	See Documentation/core-api/cachetlb.rst for more information.
- *	Please note that the implementation of these, and the required
- *	effects are cache-type (VIVT/VIPT/PIPT) specific.
- *
- *	flush_icache_all()
- *
- *		Unconditionally clean and invalidate the entire icache.
- *		Currently only needed for cache-v6.S and cache-v7.S, see
- *		__flush_icache_all for the generic implementation.
- *
- *	flush_kern_all()
- *
- *		Unconditionally clean and invalidate the entire cache.
- *
- *	flush_user_all()
- *
- *		Clean and invalidate all user space cache entries
- *		before a change of page tables.
- *
- *	flush_user_range(start, end, flags)
- *
- *		Clean and invalidate a range of cache entries in the
- *		specified address space before a change of page tables.
- *		- start - user start address (inclusive, page aligned)
- *		- end   - user end address   (exclusive, page aligned)
- *		- flags - vma->vm_flags field
- *
- *	coherent_kern_range(start, end)
- *
- *		Ensure coherency between the Icache and the Dcache in the
- *		region described by start, end.  If you have non-snooping
- *		Harvard caches, you need to implement this function.
- *		- start  - virtual start address
- *		- end    - virtual end address
- *
- *	coherent_user_range(start, end)
- *
- *		Ensure coherency between the Icache and the Dcache in the
- *		region described by start, end.  If you have non-snooping
- *		Harvard caches, you need to implement this function.
- *		- start  - virtual start address
- *		- end    - virtual end address
- *
- *	flush_kern_dcache_area(kaddr, size)
- *
- *		Ensure that the data held in page is written back.
- *		- kaddr  - page address
- *		- size   - region size
- *
- *	DMA Cache Coherency
- *	===================
- *
- *	dma_flush_range(start, end)
- *
- *		Clean and invalidate the specified virtual address range.
- *		- start  - virtual start address
- *		- end    - virtual end address
- */
-
-extern void __cpuc_flush_icache_all(void);
-extern void __cpuc_flush_kern_all(void);
-extern void __cpuc_flush_user_all(void);
-extern void __cpuc_flush_user_range(unsigned long, unsigned long, unsigned int);
-extern void __cpuc_coherent_kern_range(unsigned long, unsigned long);
-extern void __cpuc_coherent_user_range(unsigned long, unsigned long);
-extern void __cpuc_flush_dcache_area(void *, size_t);
-extern void __cpuc_flush_kern_dcache_area(void *addr, size_t size);
-
-/*
- * Copy user data from/to a page which is mapped into a different
- * processes address space.  Really, we want to allow our "user
- * space" model to handle this.
- */
-extern void copy_to_user_page(struct vm_area_struct *, struct page *,
-	unsigned long, void *, const void *, unsigned long);
-#define copy_from_user_page(vma, page, vaddr, dst, src, len)	\
-	do {							\
-		memcpy(dst, src, len);				\
-	} while (0)
-
-/*
- * Convert calls to our calling convention.
- */
-/* Invalidate I-cache */
-static inline void __flush_icache_all(void)
-{
-	asm("movc	p0.c5, %0, #20;\n"
-	    "nop; nop; nop; nop; nop; nop; nop; nop\n"
-	    :
-	    : "r" (0));
-}
-
-#define flush_cache_all()		__cpuc_flush_kern_all()
-
-extern void flush_cache_mm(struct mm_struct *mm);
-extern void flush_cache_range(struct vm_area_struct *vma,
-		unsigned long start, unsigned long end);
-extern void flush_cache_page(struct vm_area_struct *vma,
-		unsigned long user_addr, unsigned long pfn);
-
-#define flush_cache_dup_mm(mm) flush_cache_mm(mm)
-
-/*
- * Perform necessary cache operations to ensure that data previously
- * stored within this range of addresses can be executed by the CPU.
- */
-#define flush_icache_range(s, e)	__cpuc_coherent_kern_range(s, e)
-
-/*
- * Perform necessary cache operations to ensure that the TLB will
- * see data written in the specified area.
- */
-#define clean_dcache_area(start, size)	cpu_dcache_clean_area(start, size)
-
-/*
- * flush_dcache_page is used when the kernel has written to the page
- * cache page at virtual address page->virtual.
- *
- * If this page isn't mapped (ie, page_mapping == NULL), or it might
- * have userspace mappings, then we _must_ always clean + invalidate
- * the dcache entries associated with the kernel mapping.
- *
- * Otherwise we can defer the operation, and clean the cache when we are
- * about to change to user space.  This is the same method as used on SPARC64.
- * See update_mmu_cache for the user space part.
- */
-#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
-extern void flush_dcache_page(struct page *);
-
-#define flush_dcache_mmap_lock(mapping)		do { } while (0)
-#define flush_dcache_mmap_unlock(mapping)	do { } while (0)
-
-/*
- * We don't appear to need to do anything here.  In fact, if we did, we'd
- * duplicate cache flushing elsewhere performed by flush_dcache_page().
- */
-#define flush_icache_page(vma, page)	do { } while (0)
-
-/*
- * flush_cache_vmap() is used when creating mappings (eg, via vmap,
- * vmalloc, ioremap etc) in kernel space for pages.  On non-VIPT
- * caches, since the direct-mappings of these pages may contain cached
- * data, we need to do a full cache flush to ensure that writebacks
- * don't corrupt data placed into these pages via the new mappings.
- */
-static inline void flush_cache_vmap(unsigned long start, unsigned long end)
-{
-}
-
-static inline void flush_cache_vunmap(unsigned long start, unsigned long end)
-{
-}
-
-#endif
diff --git a/arch/unicore32/include/asm/checksum.h b/arch/unicore32/include/asm/checksum.h
deleted file mode 100644
index e774ca268c15..000000000000
--- a/arch/unicore32/include/asm/checksum.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/checksum.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- * IP checksum routines
- */
-#ifndef __UNICORE_CHECKSUM_H__
-#define __UNICORE_CHECKSUM_H__
-
-/*
- * computes the checksum of the TCP/UDP pseudo-header
- * returns a 16-bit checksum, already complemented
- */
-
-static inline __wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
-		   __u8 proto, __wsum sum)
-{
-	__asm__(
-	"add.a	%0, %1, %2\n"
-	"addc.a	%0, %0, %3\n"
-	"addc.a	%0, %0, %4 << #8\n"
-	"addc.a	%0, %0, %5\n"
-	"addc	%0, %0, #0\n"
-	: "=&r"(sum)
-	: "r" (sum), "r" (daddr), "r" (saddr), "r" (len), "Ir" (htons(proto))
-	: "cc");
-	return sum;
-}
-#define csum_tcpudp_nofold	csum_tcpudp_nofold
-
-#include <asm-generic/checksum.h>
-
-#endif
diff --git a/arch/unicore32/include/asm/cmpxchg.h b/arch/unicore32/include/asm/cmpxchg.h
deleted file mode 100644
index 87f960a2e4f0..000000000000
--- a/arch/unicore32/include/asm/cmpxchg.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Atomics xchg/cmpxchg for PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2012 GUAN Xue-tao
- */
-#ifndef __UNICORE_CMPXCHG_H__
-#define __UNICORE_CMPXCHG_H__
-
-/*
- * Generate a link failure on undefined symbol if the pointer points to a value
- * of unsupported size.
- */
-extern void __xchg_bad_pointer(void);
-
-static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
-		int size)
-{
-	unsigned long ret;
-
-	switch (size) {
-	case 1:
-		asm volatile("swapb	%0, %1, [%2]"
-			: "=&r" (ret)
-			: "r" (x), "r" (ptr)
-			: "memory", "cc");
-		break;
-	case 4:
-		asm volatile("swapw	%0, %1, [%2]"
-			: "=&r" (ret)
-			: "r" (x), "r" (ptr)
-			: "memory", "cc");
-		break;
-	default:
-		__xchg_bad_pointer();
-	}
-
-	return ret;
-}
-
-#define xchg(ptr, x) \
-	((__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr), sizeof(*(ptr))))
-
-#include <asm-generic/cmpxchg-local.h>
-
-/*
- * cmpxchg_local and cmpxchg64_local are atomic wrt current CPU. Always make
- * them available.
- */
-#define cmpxchg_local(ptr, o, n)					\
-		((__typeof__(*(ptr)))__cmpxchg_local_generic((ptr),	\
-		(unsigned long)(o), (unsigned long)(n), sizeof(*(ptr))))
-#define cmpxchg64_local(ptr, o, n)					\
-		__cmpxchg64_local_generic((ptr), (o), (n))
-
-#include <asm-generic/cmpxchg.h>
-
-#endif /* __UNICORE_CMPXCHG_H__ */
diff --git a/arch/unicore32/include/asm/cpu-single.h b/arch/unicore32/include/asm/cpu-single.h
deleted file mode 100644
index 1b419d697fd1..000000000000
--- a/arch/unicore32/include/asm/cpu-single.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/cpu-single.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_CPU_SINGLE_H__
-#define __UNICORE_CPU_SINGLE_H__
-
-#include <asm/page.h>
-#include <asm/memory.h>
-
-#ifdef __KERNEL__
-#ifndef __ASSEMBLY__
-
-#define cpu_switch_mm(pgd, mm) cpu_do_switch_mm(virt_to_phys(pgd), mm)
-
-#define cpu_get_pgd()					\
-	({						\
-		unsigned long pg;			\
-		__asm__("movc	%0, p0.c2, #0"		\
-			 : "=r" (pg) : : "cc");		\
-		pg &= ~0x0fff;				\
-		(pgd_t *)phys_to_virt(pg);		\
-	})
-
-struct mm_struct;
-
-/* declare all the functions as extern */
-extern void cpu_proc_fin(void);
-extern int cpu_do_idle(void);
-extern void cpu_dcache_clean_area(void *, int);
-extern void cpu_do_switch_mm(unsigned long pgd_phys, struct mm_struct *mm);
-extern void cpu_set_pte(pte_t *ptep, pte_t pte);
-extern void cpu_reset(unsigned long addr) __attribute__((noreturn));
-
-#endif /* __ASSEMBLY__ */
-#endif /* __KERNEL__ */
-
-#endif /* __UNICORE_CPU_SINGLE_H__ */
diff --git a/arch/unicore32/include/asm/cputype.h b/arch/unicore32/include/asm/cputype.h
deleted file mode 100644
index 08a47e3bdbcc..000000000000
--- a/arch/unicore32/include/asm/cputype.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/cputype.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_CPUTYPE_H__
-#define __UNICORE_CPUTYPE_H__
-
-#include <linux/stringify.h>
-
-#define CPUID_CPUID	0
-#define CPUID_CACHETYPE	1
-
-#define read_cpuid(reg)							\
-	({								\
-		unsigned int __val;					\
-		asm("movc	%0, p0.c0, #" __stringify(reg)		\
-		    : "=r" (__val)					\
-		    :							\
-		    : "cc");						\
-		__val;							\
-	})
-
-#define uc32_cpuid		read_cpuid(CPUID_CPUID)
-#define uc32_cachetype		read_cpuid(CPUID_CACHETYPE)
-
-#endif
diff --git a/arch/unicore32/include/asm/delay.h b/arch/unicore32/include/asm/delay.h
deleted file mode 100644
index 934193edfa66..000000000000
--- a/arch/unicore32/include/asm/delay.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/delay.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- * Delay routines, using a pre-computed "loops_per_second" value.
- */
-#ifndef __UNICORE_DELAY_H__
-#define __UNICORE_DELAY_H__
-
-#include <asm/param.h>	/* HZ */
-
-extern void __delay(int loops);
-
-/*
- * This function intentionally does not exist; if you see references to
- * it, it means that you're calling udelay() with an out of range value.
- *
- * With currently imposed limits, this means that we support a max delay
- * of 2000us. Further limits: HZ<=1000 and bogomips<=3355
- */
-extern void __bad_udelay(void);
-
-/*
- * division by multiplication: you don't have to worry about
- * loss of precision.
- *
- * Use only for very small delays ( < 1 msec).  Should probably use a
- * lookup table, really, as the multiplications take much too long with
- * short delays.  This is a "reasonable" implementation, though (and the
- * first constant multiplications gets optimized away if the delay is
- * a constant)
- */
-extern void __udelay(unsigned long usecs);
-extern void __const_udelay(unsigned long);
-
-#define MAX_UDELAY_MS 2
-
-#define udelay(n)							\
-	(__builtin_constant_p(n) ?					\
-	  ((n) > (MAX_UDELAY_MS * 1000) ? __bad_udelay() :		\
-			__const_udelay((n) * ((2199023U*HZ)>>11))) :	\
-	  __udelay(n))
-
-#endif /* __UNICORE_DELAY_H__ */
-
diff --git a/arch/unicore32/include/asm/dma.h b/arch/unicore32/include/asm/dma.h
deleted file mode 100644
index 1326310b21e6..000000000000
--- a/arch/unicore32/include/asm/dma.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/dma.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#ifndef __UNICORE_DMA_H__
-#define __UNICORE_DMA_H__
-
-#include <asm/memory.h>
-#include <asm-generic/dma.h>
-
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#endif
-
-#endif /* __UNICORE_DMA_H__ */
diff --git a/arch/unicore32/include/asm/elf.h b/arch/unicore32/include/asm/elf.h
deleted file mode 100644
index a464ed5f05d4..000000000000
--- a/arch/unicore32/include/asm/elf.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/elf.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#ifndef __UNICORE_ELF_H__
-#define __UNICORE_ELF_H__
-
-#include <asm/hwcap.h>
-
-/*
- * ELF register definitions..
- */
-#include <asm/ptrace.h>
-#include <linux/elf-em.h>
-
-typedef unsigned long elf_greg_t;
-typedef unsigned long elf_freg_t[3];
-
-#define ELF_NGREG (sizeof(struct pt_regs) / sizeof(elf_greg_t))
-typedef elf_greg_t elf_gregset_t[ELF_NGREG];
-
-typedef struct fp_state elf_fpregset_t;
-
-#define R_UNICORE_NONE		0
-#define R_UNICORE_PC24		1
-#define R_UNICORE_ABS32		2
-#define R_UNICORE_CALL		28
-#define R_UNICORE_JUMP24	29
-
-/*
- * These are used to set parameters in the core dumps.
- */
-#define ELF_CLASS	ELFCLASS32
-#define ELF_DATA	ELFDATA2LSB
-#define ELF_ARCH	EM_UNICORE
-
-/*
- * This yields a string that ld.so will use to load implementation
- * specific libraries for optimization.  This is more specific in
- * intent than poking at uname or /proc/cpuinfo.
- *
- */
-#define ELF_PLATFORM_SIZE 8
-#define ELF_PLATFORM	(elf_platform)
-
-extern char elf_platform[];
-
-struct elf32_hdr;
-
-/*
- * This is used to ensure we don't load something for the wrong architecture.
- */
-extern int elf_check_arch(const struct elf32_hdr *);
-#define elf_check_arch elf_check_arch
-
-struct task_struct;
-int dump_task_regs(struct task_struct *t, elf_gregset_t *elfregs);
-#define ELF_CORE_COPY_TASK_REGS dump_task_regs
-
-#define ELF_EXEC_PAGESIZE	4096
-
-/* This is the location that an ET_DYN program is loaded if exec'ed.  Typical
-   use of this is to invoke "./ld.so someprog" to test out a new version of
-   the loader.  We need to make sure that it is out of the way of the program
-   that it will "exec", and that there is sufficient room for the brk.  */
-
-#define ELF_ET_DYN_BASE	(2 * TASK_SIZE / 3)
-
-/* When the program starts, a1 contains a pointer to a function to be
-   registered with atexit, as per the SVR4 ABI.  A value of 0 means we
-   have no such handler.  */
-#define ELF_PLAT_INIT(_r, load_addr)	{(_r)->UCreg_00 = 0; }
-
-extern void elf_set_personality(const struct elf32_hdr *);
-#define SET_PERSONALITY(ex)	elf_set_personality(&(ex))
-
-struct mm_struct;
-extern unsigned long arch_randomize_brk(struct mm_struct *mm);
-#define arch_randomize_brk arch_randomize_brk
-
-extern int vectors_user_mapping(void);
-#define arch_setup_additional_pages(bprm, uses_interp) vectors_user_mapping()
-#define ARCH_HAS_SETUP_ADDITIONAL_PAGES
-
-#endif
diff --git a/arch/unicore32/include/asm/fpstate.h b/arch/unicore32/include/asm/fpstate.h
deleted file mode 100644
index 5811293e7a7e..000000000000
--- a/arch/unicore32/include/asm/fpstate.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/fpstate.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#ifndef __UNICORE_FPSTATE_H__
-#define __UNICORE_FPSTATE_H__
-
-#ifndef __ASSEMBLY__
-
-#define FP_REGS_NUMBER		33
-
-struct fp_state {
-	unsigned int regs[FP_REGS_NUMBER];
-} __attribute__((aligned(8)));
-
-#endif
-
-#endif
diff --git a/arch/unicore32/include/asm/fpu-ucf64.h b/arch/unicore32/include/asm/fpu-ucf64.h
deleted file mode 100644
index 7a0c8a9e05d4..000000000000
--- a/arch/unicore32/include/asm/fpu-ucf64.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/fpu-ucf64.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-#define FPSCR			s31
-
-/* FPSCR bits */
-#define FPSCR_DEFAULT_NAN	(1<<25)
-
-#define FPSCR_CMPINSTR_BIT	(1<<31)
-
-#define FPSCR_CON		(1<<29)
-#define FPSCR_TRAP		(1<<27)
-
-/* RND mode */
-#define FPSCR_ROUND_NEAREST	(0<<0)
-#define FPSCR_ROUND_PLUSINF	(2<<0)
-#define FPSCR_ROUND_MINUSINF	(3<<0)
-#define FPSCR_ROUND_TOZERO	(1<<0)
-#define FPSCR_RMODE_BIT		(0)
-#define FPSCR_RMODE_MASK	(7 << FPSCR_RMODE_BIT)
-
-/* trap enable */
-#define FPSCR_IOE		(1<<16)
-#define FPSCR_OFE		(1<<14)
-#define FPSCR_UFE		(1<<13)
-#define FPSCR_IXE		(1<<12)
-#define FPSCR_HIE		(1<<11)
-#define FPSCR_NDE		(1<<10)	/* non denomal */
-
-/* flags */
-#define FPSCR_IDC		(1<<24)
-#define FPSCR_HIC		(1<<23)
-#define FPSCR_IXC		(1<<22)
-#define FPSCR_OFC		(1<<21)
-#define FPSCR_UFC		(1<<20)
-#define FPSCR_IOC		(1<<19)
-
-/* stick bits */
-#define FPSCR_IOS		(1<<9)
-#define FPSCR_OFS		(1<<7)
-#define FPSCR_UFS		(1<<6)
-#define FPSCR_IXS		(1<<5)
-#define FPSCR_HIS		(1<<4)
-#define FPSCR_NDS		(1<<3)	/*non denomal */
diff --git a/arch/unicore32/include/asm/gpio.h b/arch/unicore32/include/asm/gpio.h
deleted file mode 100644
index dfad04ca0a65..000000000000
--- a/arch/unicore32/include/asm/gpio.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/gpio.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#ifndef __UNICORE_GPIO_H__
-#define __UNICORE_GPIO_H__
-
-#include <linux/io.h>
-#include <asm/irq.h>
-#include <mach/hardware.h>
-#include <asm-generic/gpio.h>
-
-#define GPI_OTP_INT             0
-#define GPI_PCI_INTA            1
-#define GPI_PCI_INTB            2
-#define GPI_PCI_INTC            3
-#define GPI_PCI_INTD            4
-#define GPI_BAT_DET             5
-#define GPI_SD_CD               6
-#define GPI_SOFF_REQ            7
-#define GPI_SD_WP               8
-#define GPI_LCD_CASE_OFF        9
-#define GPO_WIFI_EN             10
-#define GPO_HDD_LED             11
-#define GPO_VGA_EN              12
-#define GPO_LCD_EN              13
-#define GPO_LED_DATA            14
-#define GPO_LED_CLK             15
-#define GPO_CAM_PWR_EN          16
-#define GPO_LCD_VCC_EN          17
-#define GPO_SOFT_OFF            18
-#define GPO_BT_EN               19
-#define GPO_FAN_ON              20
-#define GPO_SPKR                21
-#define GPO_SET_V1              23
-#define GPO_SET_V2              24
-#define GPO_CPU_HEALTH          25
-#define GPO_LAN_SEL             26
-
-#ifdef CONFIG_PUV3_NB0916
-#define GPI_BTN_TOUCH		14
-#define GPIO_IN			0x000043ff /* 1 for input */
-#define GPIO_OUT		0x0fffbc00 /* 1 for output */
-#endif	/* CONFIG_PUV3_NB0916 */
-
-#ifdef CONFIG_PUV3_SMW0919
-#define GPIO_IN			0x000003ff /* 1 for input */
-#define GPIO_OUT		0x0ffffc00 /* 1 for output */
-#endif  /* CONFIG_PUV3_SMW0919 */
-
-#ifdef CONFIG_PUV3_DB0913
-#define GPIO_IN			0x000001df /* 1 for input */
-#define GPIO_OUT		0x03fee800 /* 1 for output */
-#endif  /* CONFIG_PUV3_DB0913 */
-
-#define GPIO_DIR                (~((GPIO_IN) | 0xf0000000))
-				/* 0 input, 1 output */
-
-static inline int gpio_get_value(unsigned gpio)
-{
-	if (__builtin_constant_p(gpio) && (gpio <= GPIO_MAX))
-		return readl(GPIO_GPLR) & GPIO_GPIO(gpio);
-	else
-		return __gpio_get_value(gpio);
-}
-
-static inline void gpio_set_value(unsigned gpio, int value)
-{
-	if (__builtin_constant_p(gpio) && (gpio <= GPIO_MAX))
-		if (value)
-			writel(GPIO_GPIO(gpio), GPIO_GPSR);
-		else
-			writel(GPIO_GPIO(gpio), GPIO_GPCR);
-	else
-		__gpio_set_value(gpio, value);
-}
-
-#define gpio_cansleep	__gpio_cansleep
-
-static inline unsigned gpio_to_irq(unsigned gpio)
-{
-	if ((gpio < IRQ_GPIOHIGH) && (FIELD(1, 1, gpio) & readl(GPIO_GPIR)))
-		return IRQ_GPIOLOW0 + gpio;
-	else
-		return IRQ_GPIO0 + gpio;
-}
-
-static inline unsigned irq_to_gpio(unsigned irq)
-{
-	if (irq < IRQ_GPIOHIGH)
-		return irq - IRQ_GPIOLOW0;
-	else
-		return irq - IRQ_GPIO0;
-}
-
-#endif /* __UNICORE_GPIO_H__ */
diff --git a/arch/unicore32/include/asm/hwcap.h b/arch/unicore32/include/asm/hwcap.h
deleted file mode 100644
index 2e15ffbe8391..000000000000
--- a/arch/unicore32/include/asm/hwcap.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/hwcap.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_HWCAP_H__
-#define __UNICORE_HWCAP_H__
-
-/*
- * HWCAP flags
- */
-#define HWCAP_MSP		1
-#define HWCAP_UNICORE16		2
-#define HWCAP_CMOV		4
-#define HWCAP_UNICORE_F64       8
-#define HWCAP_TLS		0x80
-
-#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
-/*
- * This yields a mask that user programs can use to figure out what
- * instruction set this cpu supports.
- */
-#define ELF_HWCAP		(HWCAP_CMOV | HWCAP_UNICORE_F64)
-#endif
-
-#endif
diff --git a/arch/unicore32/include/asm/hwdef-copro.h b/arch/unicore32/include/asm/hwdef-copro.h
deleted file mode 100644
index 2db8cf864e43..000000000000
--- a/arch/unicore32/include/asm/hwdef-copro.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Co-processor register definitions for PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2012 GUAN Xue-tao
- */
-#ifndef __UNICORE_HWDEF_COPRO_H__
-#define __UNICORE_HWDEF_COPRO_H__
-
-/*
- * Control Register bits (CP#0 CR1)
- */
-#define CR_M	(1 << 0)	/* MMU enable				*/
-#define CR_A	(1 << 1)	/* Alignment abort enable		*/
-#define CR_D	(1 << 2)	/* Dcache enable			*/
-#define CR_I	(1 << 3)	/* Icache enable			*/
-#define CR_B	(1 << 4)	/* Dcache write mechanism: write back	*/
-#define CR_T	(1 << 5)	/* Burst enable				*/
-#define CR_V	(1 << 13)	/* Vectors relocated to 0xffff0000	*/
-
-#ifndef __ASSEMBLY__
-
-#define vectors_high()		(cr_alignment & CR_V)
-
-extern unsigned long cr_no_alignment;	/* defined in entry.S */
-extern unsigned long cr_alignment;	/* defined in entry.S */
-
-static inline unsigned int get_cr(void)
-{
-	unsigned int val;
-	asm("movc %0, p0.c1, #0" : "=r" (val) : : "cc");
-	return val;
-}
-
-static inline void set_cr(unsigned int val)
-{
-	asm volatile("movc p0.c1, %0, #0" : : "r" (val) : "cc");
-	isb();
-}
-
-extern void adjust_cr(unsigned long mask, unsigned long set);
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* __UNICORE_HWDEF_COPRO_H__ */
diff --git a/arch/unicore32/include/asm/io.h b/arch/unicore32/include/asm/io.h
deleted file mode 100644
index bd4e7c332f85..000000000000
--- a/arch/unicore32/include/asm/io.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/io.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_IO_H__
-#define __UNICORE_IO_H__
-
-#ifdef __KERNEL__
-
-#include <asm/byteorder.h>
-#include <asm/memory.h>
-
-#define PCI_IOBASE	PKUNITY_PCILIO_BASE
-#include <asm-generic/io.h>
-
-/*
- * __uc32_ioremap takes CPU physical address.
- */
-extern void __iomem *__uc32_ioremap(unsigned long, size_t);
-extern void __uc32_iounmap(volatile void __iomem *addr);
-
-/*
- * ioremap and friends.
- *
- * ioremap takes a PCI memory address, as specified in
- * Documentation/driver-api/io-mapping.rst.
- *
- */
-#define ioremap(cookie, size)		__uc32_ioremap(cookie, size)
-#define iounmap(cookie)			__uc32_iounmap(cookie)
-
-#define readb_relaxed readb
-#define readw_relaxed readw
-#define readl_relaxed readl
-
-#define HAVE_ARCH_PIO_SIZE
-#define PIO_OFFSET		(unsigned int)(PCI_IOBASE)
-#define PIO_MASK		(unsigned int)(IO_SPACE_LIMIT)
-#define PIO_RESERVED		(PIO_OFFSET + PIO_MASK + 1)
-
-#ifdef CONFIG_STRICT_DEVMEM
-
-#include <linux/ioport.h>
-#include <linux/mm.h>
-
-/*
- * devmem_is_allowed() checks to see if /dev/mem access to a certain
- * address is valid. The argument is a physical page number.
- * We mimic x86 here by disallowing access to system RAM as well as
- * device-exclusive MMIO regions. This effectively disable read()/write()
- * on /dev/mem.
- */
-static inline int devmem_is_allowed(unsigned long pfn)
-{
-	if (iomem_is_exclusive(pfn << PAGE_SHIFT))
-		return 0;
-	if (!page_is_ram(pfn))
-		return 1;
-	return 0;
-}
-
-#endif /* CONFIG_STRICT_DEVMEM */
-
-#endif	/* __KERNEL__ */
-#endif	/* __UNICORE_IO_H__ */
diff --git a/arch/unicore32/include/asm/irq.h b/arch/unicore32/include/asm/irq.h
deleted file mode 100644
index 3f7f07c0338c..000000000000
--- a/arch/unicore32/include/asm/irq.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/irq.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_IRQ_H__
-#define __UNICORE_IRQ_H__
-
-#include <asm-generic/irq.h>
-
-#define	IRQ_GPIOLOW0		0x00
-#define	IRQ_GPIOLOW1		0x01
-#define	IRQ_GPIOLOW2		0x02
-#define	IRQ_GPIOLOW3		0x03
-#define	IRQ_GPIOLOW4		0x04
-#define	IRQ_GPIOLOW5		0x05
-#define	IRQ_GPIOLOW6		0x06
-#define	IRQ_GPIOLOW7		0x07
-#define IRQ_GPIOHIGH		0x08
-#define IRQ_USB			0x09
-#define IRQ_SDC			0x0a
-#define IRQ_AC97		0x0b
-#define IRQ_SATA		0x0c
-#define IRQ_MME			0x0d
-#define IRQ_PCI_BRIDGE		0x0e
-#define	IRQ_DDR			0x0f
-#define	IRQ_SPI			0x10
-#define	IRQ_UNIGFX		0x11
-#define	IRQ_I2C			0x11
-#define	IRQ_UART1		0x12
-#define	IRQ_UART0		0x13
-#define IRQ_UMAL		0x14
-#define IRQ_NAND		0x15
-#define IRQ_PS2_KBD		0x16
-#define IRQ_PS2_AUX		0x17
-#define IRQ_DMA			0x18
-#define IRQ_DMAERR		0x19
-#define	IRQ_TIMER0		0x1a
-#define	IRQ_TIMER1		0x1b
-#define	IRQ_TIMER2		0x1c
-#define	IRQ_TIMER3		0x1d
-#define	IRQ_RTC			0x1e
-#define	IRQ_RTCAlarm		0x1f
-
-#define	IRQ_GPIO0		0x20
-#define	IRQ_GPIO1		0x21
-#define	IRQ_GPIO2		0x22
-#define	IRQ_GPIO3		0x23
-#define	IRQ_GPIO4		0x24
-#define	IRQ_GPIO5		0x25
-#define	IRQ_GPIO6		0x26
-#define	IRQ_GPIO7		0x27
-#define IRQ_GPIO8		0x28
-#define IRQ_GPIO9		0x29
-#define IRQ_GPIO10		0x2a
-#define IRQ_GPIO11		0x2b
-#define IRQ_GPIO12		0x2c
-#define IRQ_GPIO13		0x2d
-#define IRQ_GPIO14		0x2e
-#define IRQ_GPIO15		0x2f
-#define IRQ_GPIO16		0x30
-#define IRQ_GPIO17		0x31
-#define IRQ_GPIO18		0x32
-#define IRQ_GPIO19		0x33
-#define IRQ_GPIO20		0x34
-#define IRQ_GPIO21		0x35
-#define IRQ_GPIO22		0x36
-#define IRQ_GPIO23		0x37
-#define IRQ_GPIO24		0x38
-#define IRQ_GPIO25		0x39
-#define IRQ_GPIO26		0x3a
-#define IRQ_GPIO27		0x3b
-
-#ifdef CONFIG_ARCH_FPGA
-#define IRQ_PCIINTA             IRQ_GPIOLOW2
-#define IRQ_PCIINTB             IRQ_GPIOLOW1
-#define IRQ_PCIINTC             IRQ_GPIOLOW0
-#define IRQ_PCIINTD             IRQ_GPIOLOW6
-#endif
-
-#if defined(CONFIG_PUV3_DB0913) || defined(CONFIG_PUV3_NB0916)	\
-	|| defined(CONFIG_PUV3_SMW0919)
-#define IRQ_PCIINTA             IRQ_GPIOLOW1
-#define IRQ_PCIINTB             IRQ_GPIOLOW2
-#define IRQ_PCIINTC             IRQ_GPIOLOW3
-#define IRQ_PCIINTD             IRQ_GPIOLOW4
-#endif
-
-#define IRQ_SD_CD               IRQ_GPIO6 /* falling or rising trigger */
-
-#ifndef __ASSEMBLY__
-struct pt_regs;
-
-extern void asm_do_IRQ(unsigned int, struct pt_regs *);
-
-#endif
-
-#endif
-
diff --git a/arch/unicore32/include/asm/irqflags.h b/arch/unicore32/include/asm/irqflags.h
deleted file mode 100644
index f64c82e3eae6..000000000000
--- a/arch/unicore32/include/asm/irqflags.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/irqflags.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_IRQFLAGS_H__
-#define __UNICORE_IRQFLAGS_H__
-
-#ifdef __KERNEL__
-
-#include <asm/ptrace.h>
-
-#define ARCH_IRQ_DISABLED	(PRIV_MODE | PSR_I_BIT)
-#define ARCH_IRQ_ENABLED	(PRIV_MODE)
-
-/*
- * Save the current interrupt enable state.
- */
-static inline unsigned long arch_local_save_flags(void)
-{
-	unsigned long temp;
-
-	asm volatile("mov %0, asr" : "=r" (temp) : : "memory", "cc");
-
-	return temp & PSR_c;
-}
-
-/*
- * restore saved IRQ state
- */
-static inline void arch_local_irq_restore(unsigned long flags)
-{
-	unsigned long temp;
-
-	asm volatile(
-		"mov	%0, asr\n"
-		"mov.a	asr, %1\n"
-		"mov.f	asr, %0"
-		: "=&r" (temp)
-		: "r" (flags)
-		: "memory", "cc");
-}
-
-#include <asm-generic/irqflags.h>
-
-#endif
-#endif
diff --git a/arch/unicore32/include/asm/linkage.h b/arch/unicore32/include/asm/linkage.h
deleted file mode 100644
index 8e341ba7bc4a..000000000000
--- a/arch/unicore32/include/asm/linkage.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/linkage.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_LINKAGE_H__
-#define __UNICORE_LINKAGE_H__
-
-#define __ALIGN .align 0
-#define __ALIGN_STR ".align 0"
-
-#define ENDPROC(name) \
-	.type name, %function; \
-	END(name)
-
-#endif
diff --git a/arch/unicore32/include/asm/memblock.h b/arch/unicore32/include/asm/memblock.h
deleted file mode 100644
index eb56a6ddce83..000000000000
--- a/arch/unicore32/include/asm/memblock.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/memblock.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#ifndef __UNICORE_MEMBLOCK_H__
-#define __UNICORE_MEMBLOCK_H__
-
-/*
- * Memory map description
- */
-# define NR_BANKS 8
-
-struct membank {
-	unsigned long start;
-	unsigned long size;
-	unsigned int highmem;
-};
-
-struct meminfo {
-	int nr_banks;
-	struct membank bank[NR_BANKS];
-};
-
-extern struct meminfo meminfo;
-
-#define for_each_bank(iter, mi)				\
-	for (iter = 0; iter < (mi)->nr_banks; iter++)
-
-#define bank_pfn_start(bank)	__phys_to_pfn((bank)->start)
-#define bank_pfn_end(bank)	__phys_to_pfn((bank)->start + (bank)->size)
-#define bank_pfn_size(bank)	((bank)->size >> PAGE_SHIFT)
-#define bank_phys_start(bank)	((bank)->start)
-#define bank_phys_end(bank)	((bank)->start + (bank)->size)
-#define bank_phys_size(bank)	((bank)->size)
-
-extern void uc32_memblock_init(struct meminfo *);
-
-#endif
diff --git a/arch/unicore32/include/asm/memory.h b/arch/unicore32/include/asm/memory.h
deleted file mode 100644
index 66285178dd9b..000000000000
--- a/arch/unicore32/include/asm/memory.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/memory.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- *  Note: this file should not be included by non-asm/.h files
- */
-#ifndef __UNICORE_MEMORY_H__
-#define __UNICORE_MEMORY_H__
-
-#include <linux/compiler.h>
-#include <linux/const.h>
-#include <linux/sizes.h>
-#include <mach/memory.h>
-
-/*
- * PAGE_OFFSET - the virtual address of the start of the kernel image
- * TASK_SIZE - the maximum size of a user space task.
- * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area
- */
-#define PAGE_OFFSET		UL(0xC0000000)
-#define TASK_SIZE		(PAGE_OFFSET - UL(0x41000000))
-#define TASK_UNMAPPED_BASE	(PAGE_OFFSET / 3)
-
-/*
- * The module space lives between the addresses given by TASK_SIZE
- * and PAGE_OFFSET - it must be within 32MB of the kernel text.
- */
-#define MODULES_VADDR		(PAGE_OFFSET - 16*1024*1024)
-#if TASK_SIZE > MODULES_VADDR
-#error Top of user space clashes with start of module space
-#endif
-
-#define MODULES_END		(PAGE_OFFSET)
-
-/*
- * Allow 16MB-aligned ioremap pages
- */
-#define IOREMAP_MAX_ORDER	24
-
-/*
- * Physical vs virtual RAM address space conversion.  These are
- * private definitions which should NOT be used outside memory.h
- * files.  Use virt_to_phys/phys_to_virt/__pa/__va instead.
- */
-#ifndef __virt_to_phys
-#define __virt_to_phys(x)	((x) - PAGE_OFFSET + PHYS_OFFSET)
-#define __phys_to_virt(x)	((x) - PHYS_OFFSET + PAGE_OFFSET)
-#endif
-
-/*
- * Convert a page to/from a physical address
- */
-#define page_to_phys(page)	(__pfn_to_phys(page_to_pfn(page)))
-#define phys_to_page(phys)	(pfn_to_page(__phys_to_pfn(phys)))
-
-#ifndef __ASSEMBLY__
-
-#ifndef arch_adjust_zones
-#define arch_adjust_zones(max_zone_pfn) do { } while (0)
-#endif
-
-/*
- * PFNs are used to describe any physical page; this means
- * PFN 0 == physical address 0.
- *
- * This is the PFN of the first RAM page in the kernel
- * direct-mapped view.  We assume this is the first page
- * of RAM in the mem_map as well.
- */
-#define PHYS_PFN_OFFSET	(PHYS_OFFSET >> PAGE_SHIFT)
-
-/*
- * Drivers should NOT use these either.
- */
-#define __pa(x)			__virt_to_phys((unsigned long)(x))
-#define __va(x)			((void *)__phys_to_virt((unsigned long)(x)))
-#define pfn_to_kaddr(pfn)	__va((pfn) << PAGE_SHIFT)
-
-/*
- * Conversion between a struct page and a physical address.
- *
- *  page_to_pfn(page)	convert a struct page * to a PFN number
- *  pfn_to_page(pfn)	convert a _valid_ PFN number to struct page *
- *
- *  virt_to_page(k)	convert a _valid_ virtual address to struct page *
- *  virt_addr_valid(k)	indicates whether a virtual address is valid
- */
-#define ARCH_PFN_OFFSET		PHYS_PFN_OFFSET
-
-#define virt_to_page(kaddr)	pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
-#define virt_addr_valid(kaddr)	((unsigned long)(kaddr) >= PAGE_OFFSET && \
-		(unsigned long)(kaddr) < (unsigned long)high_memory)
-
-#endif
-
-#include <asm-generic/memory_model.h>
-
-#endif
diff --git a/arch/unicore32/include/asm/mmu.h b/arch/unicore32/include/asm/mmu.h
deleted file mode 100644
index 8ad4e7eae17b..000000000000
--- a/arch/unicore32/include/asm/mmu.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/mmu.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_MMU_H__
-#define __UNICORE_MMU_H__
-
-typedef	unsigned long mm_context_t;
-
-#endif
diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
deleted file mode 100644
index 388c0c811c68..000000000000
--- a/arch/unicore32/include/asm/mmu_context.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/mmu_context.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_MMU_CONTEXT_H__
-#define __UNICORE_MMU_CONTEXT_H__
-
-#include <linux/compiler.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/vmacache.h>
-#include <linux/io.h>
-
-#include <asm/cacheflush.h>
-#include <asm/cpu-single.h>
-
-#define init_new_context(tsk, mm)	0
-
-#define destroy_context(mm)		do { } while (0)
-
-/*
- * This is called when "tsk" is about to enter lazy TLB mode.
- *
- * mm:  describes the currently active mm context
- * tsk: task which is entering lazy tlb
- * cpu: cpu number which is entering lazy tlb
- *
- * tsk->mm will be NULL
- */
-static inline void
-enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-}
-
-/*
- * This is the actual mm switch as far as the scheduler
- * is concerned.  No registers are touched.  We avoid
- * calling the CPU specific function when the mm hasn't
- * actually changed.
- */
-static inline void
-switch_mm(struct mm_struct *prev, struct mm_struct *next,
-	  struct task_struct *tsk)
-{
-	unsigned int cpu = smp_processor_id();
-
-	if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next)) || prev != next)
-		cpu_switch_mm(next->pgd, next);
-}
-
-#define deactivate_mm(tsk, mm)	do { } while (0)
-#define activate_mm(prev, next)	switch_mm(prev, next, NULL)
-
-/*
- * We are inserting a "fake" vma for the user-accessible vector page so
- * gdb and friends can get to it through ptrace and /proc/<pid>/mem.
- * But we also want to remove it before the generic code gets to see it
- * during process exit or the unmapping of it would  cause total havoc.
- * (the macro is used as remove_vma() is static to mm/mmap.c)
- */
-#define arch_exit_mmap(mm) \
-do { \
-	struct vm_area_struct *high_vma = find_vma(mm, 0xffff0000); \
-	if (high_vma) { \
-		BUG_ON(high_vma->vm_next);  /* it should be last */ \
-		if (high_vma->vm_prev) \
-			high_vma->vm_prev->vm_next = NULL; \
-		else \
-			mm->mmap = NULL; \
-		rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
-		vmacache_invalidate(mm); \
-		mm->map_count--; \
-		remove_vma(high_vma); \
-	} \
-} while (0)
-
-static inline int arch_dup_mmap(struct mm_struct *oldmm,
-				struct mm_struct *mm)
-{
-	return 0;
-}
-
-static inline void arch_unmap(struct mm_struct *mm,
-			unsigned long start, unsigned long end)
-{
-}
-
-static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
-		bool write, bool execute, bool foreign)
-{
-	/* by default, allow everything */
-	return true;
-}
-#endif
diff --git a/arch/unicore32/include/asm/page.h b/arch/unicore32/include/asm/page.h
deleted file mode 100644
index 96d6bdf180bd..000000000000
--- a/arch/unicore32/include/asm/page.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/page.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_PAGE_H__
-#define __UNICORE_PAGE_H__
-
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT		12
-#define PAGE_SIZE		(_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK		(~(PAGE_SIZE-1))
-
-#ifndef __ASSEMBLY__
-
-struct page;
-struct vm_area_struct;
-
-#define clear_page(page)	memset((void *)(page), 0, PAGE_SIZE)
-extern void copy_page(void *to, const void *from);
-
-#define clear_user_page(page, vaddr, pg)	clear_page(page)
-#define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
-
-#undef STRICT_MM_TYPECHECKS
-
-#ifdef STRICT_MM_TYPECHECKS
-/*
- * These are used to make use of C type-checking..
- */
-typedef struct { unsigned long pte; } pte_t;
-typedef struct { unsigned long pgd; } pgd_t;
-typedef struct { unsigned long pgprot; } pgprot_t;
-
-#define pte_val(x)      ((x).pte)
-#define pgd_val(x)	((x).pgd)
-#define pgprot_val(x)   ((x).pgprot)
-
-#define __pte(x)        ((pte_t) { (x) })
-#define __pgd(x)	((pgd_t) { (x) })
-#define __pgprot(x)     ((pgprot_t) { (x) })
-
-#else
-/*
- * .. while these make it easier on the compiler
- */
-typedef unsigned long pte_t;
-typedef unsigned long pgd_t;
-typedef unsigned long pgprot_t;
-
-#define pte_val(x)      (x)
-#define pgd_val(x)      (x)
-#define pgprot_val(x)   (x)
-
-#define __pte(x)        (x)
-#define __pgd(x)	(x)
-#define __pgprot(x)     (x)
-
-#endif /* STRICT_MM_TYPECHECKS */
-
-typedef struct page *pgtable_t;
-
-extern int pfn_valid(unsigned long);
-
-#include <asm/memory.h>
-
-#endif /* !__ASSEMBLY__ */
-
-#include <asm-generic/getorder.h>
-
-#endif
diff --git a/arch/unicore32/include/asm/pci.h b/arch/unicore32/include/asm/pci.h
deleted file mode 100644
index 3efa8ee1afce..000000000000
--- a/arch/unicore32/include/asm/pci.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/pci.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_PCI_H__
-#define __UNICORE_PCI_H__
-
-#ifdef __KERNEL__
-#include <asm-generic/pci.h>
-#include <mach/hardware.h> /* for PCIBIOS_MIN_* */
-
-#define HAVE_PCI_MMAP
-#define ARCH_GENERIC_PCI_MMAP_RESOURCE
-
-#endif /* __KERNEL__ */
-#endif
diff --git a/arch/unicore32/include/asm/pgalloc.h b/arch/unicore32/include/asm/pgalloc.h
deleted file mode 100644
index ba1c9a79993b..000000000000
--- a/arch/unicore32/include/asm/pgalloc.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/pgalloc.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_PGALLOC_H__
-#define __UNICORE_PGALLOC_H__
-
-#include <asm/pgtable-hwdef.h>
-#include <asm/processor.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-
-#define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
-#define __HAVE_ARCH_PTE_ALLOC_ONE
-#include <asm-generic/pgalloc.h>
-
-#define _PAGE_USER_TABLE	(PMD_TYPE_TABLE | PMD_PRESENT)
-#define _PAGE_KERNEL_TABLE	(PMD_TYPE_TABLE | PMD_PRESENT)
-
-extern pgd_t *get_pgd_slow(struct mm_struct *mm);
-extern void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd);
-
-#define pgd_alloc(mm)			get_pgd_slow(mm)
-#define pgd_free(mm, pgd)		free_pgd_slow(mm, pgd)
-
-/*
- * Allocate one PTE table.
- */
-static inline pte_t *
-pte_alloc_one_kernel(struct mm_struct *mm)
-{
-	pte_t *pte = __pte_alloc_one_kernel(mm);
-
-	if (pte)
-		clean_dcache_area(pte, PTRS_PER_PTE * sizeof(pte_t));
-
-	return pte;
-}
-
-static inline pgtable_t
-pte_alloc_one(struct mm_struct *mm)
-{
-	struct page *pte;
-
-	pte = __pte_alloc_one(mm, GFP_PGTABLE_USER);
-	if (!pte)
-		return NULL;
-	if (!PageHighMem(pte))
-		clean_pte_table(page_address(pte));
-	return pte;
-}
-
-static inline void __pmd_populate(pmd_t *pmdp, unsigned long pmdval)
-{
-	set_pmd(pmdp, __pmd(pmdval));
-	flush_pmd_entry(pmdp);
-}
-
-/*
- * Populate the pmdp entry with a pointer to the pte.  This pmd is part
- * of the mm address space.
- */
-static inline void
-pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep)
-{
-	unsigned long pte_ptr = (unsigned long)ptep;
-
-	/*
-	 * The pmd must be loaded with the physical
-	 * address of the PTE table
-	 */
-	__pmd_populate(pmdp, __pa(pte_ptr) | _PAGE_KERNEL_TABLE);
-}
-
-static inline void
-pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep)
-{
-	__pmd_populate(pmdp,
-			page_to_pfn(ptep) << PAGE_SHIFT | _PAGE_USER_TABLE);
-}
-#define pmd_pgtable(pmd) pmd_page(pmd)
-
-#endif
diff --git a/arch/unicore32/include/asm/pgtable-hwdef.h b/arch/unicore32/include/asm/pgtable-hwdef.h
deleted file mode 100644
index f28b58c61db9..000000000000
--- a/arch/unicore32/include/asm/pgtable-hwdef.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/pgtable-hwdef.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_PGTABLE_HWDEF_H__
-#define __UNICORE_PGTABLE_HWDEF_H__
-
-/*
- * Hardware page table definitions.
- *
- * + Level 1 descriptor (PMD)
- *   - common
- */
-#define PMD_TYPE_MASK		(3 << 0)
-#define PMD_TYPE_TABLE		(0 << 0)
-/*#define PMD_TYPE_LARGE	(1 << 0) */
-#define PMD_TYPE_INVALID	(2 << 0)
-#define PMD_TYPE_SECT		(3 << 0)
-
-#define PMD_PRESENT		(1 << 2)
-#define PMD_YOUNG		(1 << 3)
-
-/*#define PMD_SECT_DIRTY	(1 << 4) */
-#define PMD_SECT_CACHEABLE	(1 << 5)
-#define PMD_SECT_EXEC		(1 << 6)
-#define PMD_SECT_WRITE		(1 << 7)
-#define PMD_SECT_READ		(1 << 8)
-
-/*
- * + Level 2 descriptor (PTE)
- *   - common
- */
-#define PTE_TYPE_MASK		(3 << 0)
-#define PTE_TYPE_SMALL		(0 << 0)
-#define PTE_TYPE_MIDDLE		(1 << 0)
-#define PTE_TYPE_LARGE		(2 << 0)
-#define PTE_TYPE_INVALID	(3 << 0)
-
-#define PTE_PRESENT		(1 << 2)
-#define PTE_YOUNG		(1 << 3)
-#define PTE_DIRTY		(1 << 4)
-#define PTE_CACHEABLE		(1 << 5)
-#define PTE_EXEC		(1 << 6)
-#define PTE_WRITE		(1 << 7)
-#define PTE_READ		(1 << 8)
-
-#endif
diff --git a/arch/unicore32/include/asm/pgtable.h b/arch/unicore32/include/asm/pgtable.h
deleted file mode 100644
index 97f564c8ecba..000000000000
--- a/arch/unicore32/include/asm/pgtable.h
+++ /dev/null
@@ -1,267 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/pgtable.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_PGTABLE_H__
-#define __UNICORE_PGTABLE_H__
-
-#include <asm-generic/pgtable-nopmd.h>
-#include <asm/cpu-single.h>
-
-#include <asm/memory.h>
-#include <asm/pgtable-hwdef.h>
-
-/*
- * Just any arbitrary offset to the start of the vmalloc VM area: the
- * current 8MB value just means that there will be a 8MB "hole" after the
- * physical memory until the kernel virtual memory starts.  That means that
- * any out-of-bounds memory accesses will hopefully be caught.
- * The vmalloc() routines leaves a hole of 4kB between each vmalloced
- * area for the same reason. ;)
- *
- * Note that platforms may override VMALLOC_START, but they must provide
- * VMALLOC_END.  VMALLOC_END defines the (exclusive) limit of this space,
- * which may not overlap IO space.
- */
-#ifndef VMALLOC_START
-#define VMALLOC_OFFSET		SZ_8M
-#define VMALLOC_START		(((unsigned long)high_memory + VMALLOC_OFFSET) \
-					& ~(VMALLOC_OFFSET-1))
-#define VMALLOC_END		(0xff000000UL)
-#endif
-
-#define PTRS_PER_PTE		1024
-#define PTRS_PER_PGD		1024
-
-/*
- * PGDIR_SHIFT determines what a third-level page table entry can map
- */
-#define PGDIR_SHIFT		22
-
-#ifndef __ASSEMBLY__
-extern void __pte_error(const char *file, int line, unsigned long val);
-extern void __pgd_error(const char *file, int line, unsigned long val);
-
-#define pte_ERROR(pte)		__pte_error(__FILE__, __LINE__, pte_val(pte))
-#define pgd_ERROR(pgd)		__pgd_error(__FILE__, __LINE__, pgd_val(pgd))
-#endif /* !__ASSEMBLY__ */
-
-#define PGDIR_SIZE		(1UL << PGDIR_SHIFT)
-#define PGDIR_MASK		(~(PGDIR_SIZE-1))
-
-/*
- * This is the lowest virtual address we can permit any user space
- * mapping to be mapped at.  This is particularly important for
- * non-high vector CPUs.
- */
-#define FIRST_USER_ADDRESS	PAGE_SIZE
-
-#define FIRST_USER_PGD_NR	1
-#define USER_PTRS_PER_PGD	((TASK_SIZE/PGDIR_SIZE) - FIRST_USER_PGD_NR)
-
-/*
- * section address mask and size definitions.
- */
-#define SECTION_SHIFT		22
-#define SECTION_SIZE		(1UL << SECTION_SHIFT)
-#define SECTION_MASK		(~(SECTION_SIZE-1))
-
-#ifndef __ASSEMBLY__
-
-/*
- * The pgprot_* and protection_map entries will be fixed up in runtime
- * to include the cachable bits based on memory policy, as well as any
- * architecture dependent bits.
- */
-#define _PTE_DEFAULT		(PTE_PRESENT | PTE_YOUNG | PTE_CACHEABLE)
-
-extern pgprot_t pgprot_user;
-extern pgprot_t pgprot_kernel;
-
-#define PAGE_NONE		pgprot_user
-#define PAGE_SHARED		__pgprot(pgprot_val(pgprot_user | PTE_READ \
-								| PTE_WRITE))
-#define PAGE_SHARED_EXEC	__pgprot(pgprot_val(pgprot_user | PTE_READ \
-								| PTE_WRITE \
-								| PTE_EXEC))
-#define PAGE_COPY		__pgprot(pgprot_val(pgprot_user | PTE_READ)
-#define PAGE_COPY_EXEC		__pgprot(pgprot_val(pgprot_user | PTE_READ \
-								| PTE_EXEC))
-#define PAGE_READONLY		__pgprot(pgprot_val(pgprot_user | PTE_READ))
-#define PAGE_READONLY_EXEC	__pgprot(pgprot_val(pgprot_user | PTE_READ \
-								| PTE_EXEC))
-#define PAGE_KERNEL		pgprot_kernel
-#define PAGE_KERNEL_EXEC	__pgprot(pgprot_val(pgprot_kernel | PTE_EXEC))
-
-#define __PAGE_NONE		__pgprot(_PTE_DEFAULT)
-#define __PAGE_SHARED		__pgprot(_PTE_DEFAULT | PTE_READ \
-							| PTE_WRITE)
-#define __PAGE_SHARED_EXEC	__pgprot(_PTE_DEFAULT | PTE_READ \
-							| PTE_WRITE \
-							| PTE_EXEC)
-#define __PAGE_COPY		__pgprot(_PTE_DEFAULT | PTE_READ)
-#define __PAGE_COPY_EXEC	__pgprot(_PTE_DEFAULT | PTE_READ \
-							| PTE_EXEC)
-#define __PAGE_READONLY		__pgprot(_PTE_DEFAULT | PTE_READ)
-#define __PAGE_READONLY_EXEC	__pgprot(_PTE_DEFAULT | PTE_READ \
-							| PTE_EXEC)
-
-#endif /* __ASSEMBLY__ */
-
-/*
- * The table below defines the page protection levels that we insert into our
- * Linux page table version.  These get translated into the best that the
- * architecture can perform.  Note that on UniCore hardware:
- *  1) We cannot do execute protection
- *  2) If we could do execute protection, then read is implied
- *  3) write implies read permissions
- */
-#define __P000  __PAGE_NONE
-#define __P001  __PAGE_READONLY
-#define __P010  __PAGE_COPY
-#define __P011  __PAGE_COPY
-#define __P100  __PAGE_READONLY_EXEC
-#define __P101  __PAGE_READONLY_EXEC
-#define __P110  __PAGE_COPY_EXEC
-#define __P111  __PAGE_COPY_EXEC
-
-#define __S000  __PAGE_NONE
-#define __S001  __PAGE_READONLY
-#define __S010  __PAGE_SHARED
-#define __S011  __PAGE_SHARED
-#define __S100  __PAGE_READONLY_EXEC
-#define __S101  __PAGE_READONLY_EXEC
-#define __S110  __PAGE_SHARED_EXEC
-#define __S111  __PAGE_SHARED_EXEC
-
-#ifndef __ASSEMBLY__
-/*
- * ZERO_PAGE is a global shared page that is always zero: used
- * for zero-mapped memory areas etc..
- */
-extern struct page *empty_zero_page;
-#define ZERO_PAGE(vaddr)		(empty_zero_page)
-
-#define pte_pfn(pte)			(pte_val(pte) >> PAGE_SHIFT)
-#define pfn_pte(pfn, prot)		(__pte(((pfn) << PAGE_SHIFT) \
-						| pgprot_val(prot)))
-
-#define pte_none(pte)			(!pte_val(pte))
-#define pte_clear(mm, addr, ptep)	set_pte(ptep, __pte(0))
-#define pte_page(pte)			(pfn_to_page(pte_pfn(pte)))
-
-#define set_pte(ptep, pte)	cpu_set_pte(ptep, pte)
-
-#define set_pte_at(mm, addr, ptep, pteval)	\
-	do {					\
-		set_pte(ptep, pteval);          \
-	} while (0)
-
-/*
- * The following only work if pte_present() is true.
- * Undefined behaviour if not..
- */
-#define pte_present(pte)	(pte_val(pte) & PTE_PRESENT)
-#define pte_write(pte)		(pte_val(pte) & PTE_WRITE)
-#define pte_dirty(pte)		(pte_val(pte) & PTE_DIRTY)
-#define pte_young(pte)		(pte_val(pte) & PTE_YOUNG)
-#define pte_exec(pte)		(pte_val(pte) & PTE_EXEC)
-
-#define PTE_BIT_FUNC(fn, op) \
-static inline pte_t pte_##fn(pte_t pte) { pte_val(pte) op; return pte; }
-
-PTE_BIT_FUNC(wrprotect, &= ~PTE_WRITE);
-PTE_BIT_FUNC(mkwrite,   |= PTE_WRITE);
-PTE_BIT_FUNC(mkclean,   &= ~PTE_DIRTY);
-PTE_BIT_FUNC(mkdirty,   |= PTE_DIRTY);
-PTE_BIT_FUNC(mkold,     &= ~PTE_YOUNG);
-PTE_BIT_FUNC(mkyoung,   |= PTE_YOUNG);
-
-/*
- * Mark the prot value as uncacheable.
- */
-#define pgprot_noncached(prot)		\
-	__pgprot(pgprot_val(prot) & ~PTE_CACHEABLE)
-#define pgprot_writecombine(prot)	\
-	__pgprot(pgprot_val(prot) & ~PTE_CACHEABLE)
-
-#define pmd_none(pmd)		(!pmd_val(pmd))
-#define pmd_present(pmd)	(pmd_val(pmd) & PMD_PRESENT)
-#define pmd_bad(pmd)		(((pmd_val(pmd) &		\
-				(PMD_PRESENT | PMD_TYPE_MASK))	\
-				!= (PMD_PRESENT | PMD_TYPE_TABLE)))
-
-#define set_pmd(pmdpd, pmdval)		\
-	do {				\
-		*(pmdpd) = pmdval;	\
-	} while (0)
-
-#define pmd_clear(pmdp)			\
-	do {				\
-		set_pmd(pmdp, __pmd(0));\
-		clean_pmd_entry(pmdp);	\
-	} while (0)
-
-#define pmd_page_vaddr(pmd) ((pte_t *)__va(pmd_val(pmd) & PAGE_MASK))
-#define pmd_page(pmd)		pfn_to_page(__phys_to_pfn(pmd_val(pmd)))
-
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- */
-#define mk_pte(page, prot)	pfn_pte(page_to_pfn(page), prot)
-
-static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
-{
-	const unsigned long mask = PTE_EXEC | PTE_WRITE | PTE_READ;
-	pte_val(pte) = (pte_val(pte) & ~mask) | (pgprot_val(newprot) & mask);
-	return pte;
-}
-
-extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
-
-/*
- * Encode and decode a swap entry.  Swap entries are stored in the Linux
- * page tables as follows:
- *
- *   3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1
- *   1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
- *   <--------------- offset --------------> <--- type --> 0 0 0 0 0
- *
- * This gives us up to 127 swap files and 32GB per swap file.  Note that
- * the offset field is always non-zero.
- */
-#define __SWP_TYPE_SHIFT	5
-#define __SWP_TYPE_BITS		7
-#define __SWP_TYPE_MASK		((1 << __SWP_TYPE_BITS) - 1)
-#define __SWP_OFFSET_SHIFT	(__SWP_TYPE_BITS + __SWP_TYPE_SHIFT)
-
-#define __swp_type(x)		(((x).val >> __SWP_TYPE_SHIFT)		\
-				& __SWP_TYPE_MASK)
-#define __swp_offset(x)		((x).val >> __SWP_OFFSET_SHIFT)
-#define __swp_entry(type, offset) ((swp_entry_t) {			\
-				((type) << __SWP_TYPE_SHIFT) |		\
-				((offset) << __SWP_OFFSET_SHIFT) })
-
-#define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val(pte) })
-#define __swp_entry_to_pte(swp)	((pte_t) { (swp).val })
-
-/*
- * It is an error for the kernel to have more swap files than we can
- * encode in the PTEs.  This ensures that we know when MAX_SWAPFILES
- * is increased beyond what we presently support.
- */
-#define MAX_SWAPFILES_CHECK()	\
-	BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS)
-
-/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
-/* FIXME: this is not correct */
-#define kern_addr_valid(addr)	(1)
-
-#endif /* !__ASSEMBLY__ */
-
-#endif /* __UNICORE_PGTABLE_H__ */
diff --git a/arch/unicore32/include/asm/processor.h b/arch/unicore32/include/asm/processor.h
deleted file mode 100644
index 6f01620da3d1..000000000000
--- a/arch/unicore32/include/asm/processor.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/processor.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#ifndef __UNICORE_PROCESSOR_H__
-#define __UNICORE_PROCESSOR_H__
-
-#ifdef __KERNEL__
-
-#include <asm/ptrace.h>
-#include <asm/types.h>
-
-#ifdef __KERNEL__
-#define STACK_TOP	TASK_SIZE
-#define STACK_TOP_MAX	TASK_SIZE
-#endif
-
-struct debug_entry {
-	u32			address;
-	u32			insn;
-};
-
-struct debug_info {
-	int			nsaved;
-	struct debug_entry	bp[2];
-};
-
-struct thread_struct {
-							/* fault info	  */
-	unsigned long		address;
-	unsigned long		trap_no;
-	unsigned long		error_code;
-							/* debugging	  */
-	struct debug_info	debug;
-};
-
-#define INIT_THREAD  {	}
-
-#define start_thread(regs, pc, sp)					\
-({									\
-	unsigned long *stack = (unsigned long *)sp;			\
-	memset(regs->uregs, 0, sizeof(regs->uregs));			\
-	regs->UCreg_asr = USER_MODE;					\
-	regs->UCreg_pc = pc & ~1;	/* pc */                        \
-	regs->UCreg_sp = sp;		/* sp */                        \
-	regs->UCreg_02 = stack[2];	/* r2 (envp) */                 \
-	regs->UCreg_01 = stack[1];	/* r1 (argv) */                 \
-	regs->UCreg_00 = stack[0];	/* r0 (argc) */                 \
-})
-
-/* Forward declaration, a strange C thing */
-struct task_struct;
-
-/* Free all resources held by a thread. */
-extern void release_thread(struct task_struct *);
-
-unsigned long get_wchan(struct task_struct *p);
-
-#define cpu_relax()			barrier()
-
-#define task_pt_regs(p) \
-	((struct pt_regs *)(THREAD_START_SP + task_stack_page(p)) - 1)
-
-#define KSTK_EIP(tsk)	(task_pt_regs(tsk)->UCreg_pc)
-#define KSTK_ESP(tsk)	(task_pt_regs(tsk)->UCreg_sp)
-
-#endif
-
-#endif /* __UNICORE_PROCESSOR_H__ */
diff --git a/arch/unicore32/include/asm/ptrace.h b/arch/unicore32/include/asm/ptrace.h
deleted file mode 100644
index bb4cbc42c321..000000000000
--- a/arch/unicore32/include/asm/ptrace.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/ptrace.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_PTRACE_H__
-#define __UNICORE_PTRACE_H__
-
-#include <uapi/asm/ptrace.h>
-
-#ifndef __ASSEMBLY__
-
-#define user_mode(regs)	\
-	(processor_mode(regs) == USER_MODE)
-
-#define processor_mode(regs) \
-	((regs)->UCreg_asr & MODE_MASK)
-
-#define interrupts_enabled(regs) \
-	(!((regs)->UCreg_asr & PSR_I_BIT))
-
-#define fast_interrupts_enabled(regs) \
-	(!((regs)->UCreg_asr & PSR_R_BIT))
-
-/* Are the current registers suitable for user mode?
- * (used to maintain security in signal handlers)
- */
-static inline int valid_user_regs(struct pt_regs *regs)
-{
-	unsigned long mode = regs->UCreg_asr & MODE_MASK;
-
-	/*
-	 * Always clear the R (REAL) bits
-	 */
-	regs->UCreg_asr &= ~(PSR_R_BIT);
-
-	if ((regs->UCreg_asr & PSR_I_BIT) == 0) {
-		if (mode == USER_MODE)
-			return 1;
-	}
-
-	/*
-	 * Force ASR to something logical...
-	 */
-	regs->UCreg_asr &= PSR_f | USER_MODE;
-
-	return 0;
-}
-
-#define instruction_pointer(regs)	((regs)->UCreg_pc)
-#define user_stack_pointer(regs)	((regs)->UCreg_sp)
-#define profile_pc(regs)		instruction_pointer(regs)
-
-#endif /* __ASSEMBLY__ */
-#endif
diff --git a/arch/unicore32/include/asm/stacktrace.h b/arch/unicore32/include/asm/stacktrace.h
deleted file mode 100644
index 3e59f9d2faed..000000000000
--- a/arch/unicore32/include/asm/stacktrace.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/stacktrace.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#ifndef __UNICORE_STACKTRACE_H__
-#define __UNICORE_STACKTRACE_H__
-
-struct stackframe {
-	unsigned long fp;
-	unsigned long sp;
-	unsigned long lr;
-	unsigned long pc;
-};
-
-#ifdef CONFIG_FRAME_POINTER
-extern int unwind_frame(struct stackframe *frame);
-#else
-#define unwind_frame(f) (-EINVAL)
-#endif
-extern void walk_stackframe(struct stackframe *frame,
-			    int (*fn)(struct stackframe *, void *), void *data);
-
-#endif	/* __UNICORE_STACKTRACE_H__ */
diff --git a/arch/unicore32/include/asm/string.h b/arch/unicore32/include/asm/string.h
deleted file mode 100644
index 1649b0e4271b..000000000000
--- a/arch/unicore32/include/asm/string.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/string.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_STRING_H__
-#define __UNICORE_STRING_H__
-
-/*
- * We don't do inline string functions, since the
- * optimised inline asm versions are not small.
- */
-
-#define __HAVE_ARCH_STRRCHR
-extern char *strrchr(const char *s, int c);
-
-#define __HAVE_ARCH_STRCHR
-extern char *strchr(const char *s, int c);
-
-#define __HAVE_ARCH_MEMCPY
-extern void *memcpy(void *, const void *, __kernel_size_t);
-
-#define __HAVE_ARCH_MEMMOVE
-extern void *memmove(void *, const void *, __kernel_size_t);
-
-#define __HAVE_ARCH_MEMCHR
-extern void *memchr(const void *, int, __kernel_size_t);
-
-#define __HAVE_ARCH_MEMSET
-extern void *memset(void *, int, __kernel_size_t);
-
-#endif
diff --git a/arch/unicore32/include/asm/suspend.h b/arch/unicore32/include/asm/suspend.h
deleted file mode 100644
index 72bd89c44d10..000000000000
--- a/arch/unicore32/include/asm/suspend.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/suspend.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#ifndef __UNICORE_SUSPEND_H__
-#define __UNICORE_SUSPEND_H__
-
-#ifndef __ASSEMBLY__
-
-#include <asm/ptrace.h>
-
-struct swsusp_arch_regs {
-	struct cpu_context_save	cpu_context;	/* cpu context */
-#ifdef CONFIG_UNICORE_FPU_F64
-	struct fp_state		fpstate __attribute__((aligned(8)));
-#endif
-};
-#endif
-
-#endif /* __UNICORE_SUSPEND_H__ */
-
diff --git a/arch/unicore32/include/asm/switch_to.h b/arch/unicore32/include/asm/switch_to.h
deleted file mode 100644
index 12e534b3bfa5..000000000000
--- a/arch/unicore32/include/asm/switch_to.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Task switching for PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2012 GUAN Xue-tao
- */
-#ifndef __UNICORE_SWITCH_TO_H__
-#define __UNICORE_SWITCH_TO_H__
-
-struct task_struct;
-struct thread_info;
-
-/*
- * switch_to(prev, next) should switch from task `prev' to `next'
- * `prev' will never be the same as `next'.  schedule() itself
- * contains the memory barrier to tell GCC not to cache `current'.
- */
-extern struct task_struct *__switch_to(struct task_struct *,
-		struct thread_info *, struct thread_info *);
-
-#define switch_to(prev, next, last)					\
-	do {								\
-		last = __switch_to(prev, task_thread_info(prev),	\
-					task_thread_info(next));	\
-	} while (0)
-
-#endif /* __UNICORE_SWITCH_TO_H__ */
diff --git a/arch/unicore32/include/asm/syscall.h b/arch/unicore32/include/asm/syscall.h
deleted file mode 100644
index 607961797fff..000000000000
--- a/arch/unicore32/include/asm/syscall.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_UNICORE_SYSCALL_H
-#define _ASM_UNICORE_SYSCALL_H
-
-#include <uapi/linux/audit.h>
-
-static inline int syscall_get_arch(struct task_struct *task)
-{
-	return AUDIT_ARCH_UNICORE;
-}
-
-#endif	/* _ASM_UNICORE_SYSCALL_H */
diff --git a/arch/unicore32/include/asm/thread_info.h b/arch/unicore32/include/asm/thread_info.h
deleted file mode 100644
index d8a6d6b7a403..000000000000
--- a/arch/unicore32/include/asm/thread_info.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/thread_info.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_THREAD_INFO_H__
-#define __UNICORE_THREAD_INFO_H__
-
-#ifdef __KERNEL__
-
-#include <linux/compiler.h>
-#include <asm/fpstate.h>
-
-#define THREAD_SIZE_ORDER	1
-#define THREAD_SIZE		8192
-#define THREAD_START_SP		(THREAD_SIZE - 8)
-
-#ifndef __ASSEMBLY__
-
-struct task_struct;
-
-#include <asm/types.h>
-
-typedef struct {
-	unsigned long seg;
-} mm_segment_t;
-
-struct cpu_context_save {
-	__u32	r4;
-	__u32	r5;
-	__u32	r6;
-	__u32	r7;
-	__u32	r8;
-	__u32	r9;
-	__u32	r10;
-	__u32	r11;
-	__u32	r12;
-	__u32	r13;
-	__u32	r14;
-	__u32	r15;
-	__u32	r16;
-	__u32	r17;
-	__u32	r18;
-	__u32	r19;
-	__u32	r20;
-	__u32	r21;
-	__u32	r22;
-	__u32	r23;
-	__u32	r24;
-	__u32	r25;
-	__u32	r26;
-	__u32	fp;
-	__u32	sp;
-	__u32	pc;
-};
-
-/*
- * low level task data that entry.S needs immediate access to.
- * __switch_to() assumes cpu_context follows immediately after cpu_domain.
- */
-struct thread_info {
-	unsigned long		flags;		/* low level flags */
-	int			preempt_count;	/* 0 => preemptable */
-						/* <0 => bug */
-	mm_segment_t		addr_limit;	/* address limit */
-	struct task_struct	*task;		/* main task structure */
-	__u32			cpu;		/* cpu */
-	struct cpu_context_save	cpu_context;	/* cpu context */
-	__u32			syscall;	/* syscall number */
-	__u8			used_cp[16];	/* thread used copro */
-#ifdef CONFIG_UNICORE_FPU_F64
-	struct fp_state		fpstate __attribute__((aligned(8)));
-#endif
-};
-
-#define INIT_THREAD_INFO(tsk)						\
-{									\
-	.task		= &tsk,						\
-	.flags		= 0,						\
-	.preempt_count	= INIT_PREEMPT_COUNT,				\
-	.addr_limit	= KERNEL_DS,					\
-}
-
-/*
- * how to get the thread information struct from C
- */
-static inline struct thread_info *current_thread_info(void) __attribute_const__;
-
-static inline struct thread_info *current_thread_info(void)
-{
-	register unsigned long sp asm ("sp");
-	return (struct thread_info *)(sp & ~(THREAD_SIZE - 1));
-}
-
-#define thread_saved_pc(tsk)	\
-	((unsigned long)(task_thread_info(tsk)->cpu_context.pc))
-#define thread_saved_sp(tsk)	\
-	((unsigned long)(task_thread_info(tsk)->cpu_context.sp))
-#define thread_saved_fp(tsk)	\
-	((unsigned long)(task_thread_info(tsk)->cpu_context.fp))
-
-#endif
-
-/*
- * thread information flags:
- *  TIF_SYSCALL_TRACE	- syscall trace active
- *  TIF_SIGPENDING	- signal pending
- *  TIF_NEED_RESCHED	- rescheduling necessary
- *  TIF_NOTIFY_RESUME	- callback before returning to user
- */
-#define TIF_SIGPENDING		0
-#define TIF_NEED_RESCHED	1
-#define TIF_NOTIFY_RESUME	2	/* callback before returning to user */
-#define TIF_SYSCALL_TRACE	8
-#define TIF_MEMDIE		18
-#define TIF_RESTORE_SIGMASK	20
-
-#define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
-#define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
-#define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
-#define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
-
-/*
- * Change these and you break ASM code in entry-common.S
- */
-#define _TIF_WORK_MASK \
-	(_TIF_SIGPENDING | _TIF_NEED_RESCHED | _TIF_NOTIFY_RESUME)
-
-#endif /* __KERNEL__ */
-#endif /* __UNICORE_THREAD_INFO_H__ */
diff --git a/arch/unicore32/include/asm/timex.h b/arch/unicore32/include/asm/timex.h
deleted file mode 100644
index d714af3dbce1..000000000000
--- a/arch/unicore32/include/asm/timex.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/timex.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#ifndef __UNICORE_TIMEX_H__
-#define __UNICORE_TIMEX_H__
-
-#ifdef	CONFIG_ARCH_FPGA
-
-/* in FPGA, APB clock is 33M, and OST clock is 32K, */
-/* so, 1M is selected for timer interrupt correctly */
-#define	CLOCK_TICK_RATE		(32*1024)
-
-#endif
-
-#if defined(CONFIG_PUV3_DB0913)		\
-	|| defined(CONFIG_PUV3_NB0916)	\
-	|| defined(CONFIG_PUV3_SMW0919)
-
-#define  CLOCK_TICK_RATE         (14318000)
-
-#endif
-
-#include <asm-generic/timex.h>
-
-#endif
diff --git a/arch/unicore32/include/asm/tlb.h b/arch/unicore32/include/asm/tlb.h
deleted file mode 100644
index 4663d8cc80ef..000000000000
--- a/arch/unicore32/include/asm/tlb.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/tlb.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_TLB_H__
-#define __UNICORE_TLB_H__
-
-/*
- * unicore32 lacks an efficient flush_tlb_range(), use flush_tlb_mm().
- */
-
-#define __pte_free_tlb(tlb, pte, addr)				\
-	do {							\
-		pgtable_pte_page_dtor(pte);			\
-		tlb_remove_page((tlb), (pte));			\
-	} while (0)
-
-#include <asm-generic/tlb.h>
-
-#endif
diff --git a/arch/unicore32/include/asm/tlbflush.h b/arch/unicore32/include/asm/tlbflush.h
deleted file mode 100644
index 1cf18ef55515..000000000000
--- a/arch/unicore32/include/asm/tlbflush.h
+++ /dev/null
@@ -1,192 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/tlbflush.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_TLBFLUSH_H__
-#define __UNICORE_TLBFLUSH_H__
-
-#ifndef __ASSEMBLY__
-
-#include <linux/sched.h>
-
-extern void __cpu_flush_user_tlb_range(unsigned long, unsigned long,
-					struct vm_area_struct *);
-extern void __cpu_flush_kern_tlb_range(unsigned long, unsigned long);
-
-/*
- *	TLB Management
- *	==============
- *
- *	The arch/unicore/mm/tlb-*.S files implement these methods.
- *
- *	The TLB specific code is expected to perform whatever tests it
- *	needs to determine if it should invalidate the TLB for each
- *	call.  Start addresses are inclusive and end addresses are
- *	exclusive; it is safe to round these addresses down.
- *
- *	flush_tlb_all()
- *
- *		Invalidate the entire TLB.
- *
- *	flush_tlb_mm(mm)
- *
- *		Invalidate all TLB entries in a particular address
- *		space.
- *		- mm	- mm_struct describing address space
- *
- *	flush_tlb_range(mm,start,end)
- *
- *		Invalidate a range of TLB entries in the specified
- *		address space.
- *		- mm	- mm_struct describing address space
- *		- start - start address (may not be aligned)
- *		- end	- end address (exclusive, may not be aligned)
- *
- *	flush_tlb_page(vaddr,vma)
- *
- *		Invalidate the specified page in the specified address range.
- *		- vaddr - virtual address (may not be aligned)
- *		- vma	- vma_struct describing address range
- *
- *	flush_kern_tlb_page(kaddr)
- *
- *		Invalidate the TLB entry for the specified page.  The address
- *		will be in the kernels virtual memory space.  Current uses
- *		only require the D-TLB to be invalidated.
- *		- kaddr - Kernel virtual memory address
- */
-
-static inline void local_flush_tlb_all(void)
-{
-	const int zero = 0;
-
-	/* TLB invalidate all */
-	asm("movc p0.c6, %0, #6; nop; nop; nop; nop; nop; nop; nop; nop"
-		: : "r" (zero) : "cc");
-}
-
-static inline void local_flush_tlb_mm(struct mm_struct *mm)
-{
-	const int zero = 0;
-
-	if (cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
-		/* TLB invalidate all */
-		asm("movc p0.c6, %0, #6; nop; nop; nop; nop; nop; nop; nop; nop"
-			: : "r" (zero) : "cc");
-	}
-	put_cpu();
-}
-
-static inline void
-local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
-{
-	if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) {
-#ifndef CONFIG_CPU_TLB_SINGLE_ENTRY_DISABLE
-		/* iTLB invalidate page */
-		asm("movc p0.c6, %0, #5; nop; nop; nop; nop; nop; nop; nop; nop"
-			: : "r" (uaddr & PAGE_MASK) : "cc");
-		/* dTLB invalidate page */
-		asm("movc p0.c6, %0, #3; nop; nop; nop; nop; nop; nop; nop; nop"
-			: : "r" (uaddr & PAGE_MASK) : "cc");
-#else
-		/* TLB invalidate all */
-		asm("movc p0.c6, %0, #6; nop; nop; nop; nop; nop; nop; nop; nop"
-			: : "r" (uaddr & PAGE_MASK) : "cc");
-#endif
-	}
-}
-
-static inline void local_flush_tlb_kernel_page(unsigned long kaddr)
-{
-#ifndef CONFIG_CPU_TLB_SINGLE_ENTRY_DISABLE
-	/* iTLB invalidate page */
-	asm("movc p0.c6, %0, #5; nop; nop; nop; nop; nop; nop; nop; nop"
-		: : "r" (kaddr & PAGE_MASK) : "cc");
-	/* dTLB invalidate page */
-	asm("movc p0.c6, %0, #3; nop; nop; nop; nop; nop; nop; nop; nop"
-		: : "r" (kaddr & PAGE_MASK) : "cc");
-#else
-	/* TLB invalidate all */
-	asm("movc p0.c6, %0, #6; nop; nop; nop; nop; nop; nop; nop; nop"
-		: : "r" (kaddr & PAGE_MASK) : "cc");
-#endif
-}
-
-/*
- *	flush_pmd_entry
- *
- *	Flush a PMD entry (word aligned, or double-word aligned) to
- *	RAM if the TLB for the CPU we are running on requires this.
- *	This is typically used when we are creating PMD entries.
- *
- *	clean_pmd_entry
- *
- *	Clean (but don't drain the write buffer) if the CPU requires
- *	these operations.  This is typically used when we are removing
- *	PMD entries.
- */
-static inline void flush_pmd_entry(pmd_t *pmd)
-{
-#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE
-	/* flush dcache line, see dcacheline_flush in proc-macros.S */
-	asm("mov	r1, %0 << #20\n"
-		"ldw	r2, =_stext\n"
-		"add	r2, r2, r1 >> #20\n"
-		"ldw	r1, [r2+], #0x0000\n"
-		"ldw	r1, [r2+], #0x1000\n"
-		"ldw	r1, [r2+], #0x2000\n"
-		"ldw	r1, [r2+], #0x3000\n"
-		: : "r" (pmd) : "r1", "r2");
-#else
-	/* flush dcache all */
-	asm("movc p0.c5, %0, #14; nop; nop; nop; nop; nop; nop; nop; nop"
-		: : "r" (pmd) : "cc");
-#endif
-}
-
-static inline void clean_pmd_entry(pmd_t *pmd)
-{
-#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE
-	/* clean dcache line */
-	asm("movc p0.c5, %0, #11; nop; nop; nop; nop; nop; nop; nop; nop"
-		: : "r" (__pa(pmd) & ~(L1_CACHE_BYTES - 1)) : "cc");
-#else
-	/* clean dcache all */
-	asm("movc p0.c5, %0, #10; nop; nop; nop; nop; nop; nop; nop; nop"
-		: : "r" (pmd) : "cc");
-#endif
-}
-
-/*
- * Convert calls to our calling convention.
- */
-#define local_flush_tlb_range(vma, start, end)	\
-	__cpu_flush_user_tlb_range(start, end, vma)
-#define local_flush_tlb_kernel_range(s, e)	\
-	__cpu_flush_kern_tlb_range(s, e)
-
-#define flush_tlb_all		local_flush_tlb_all
-#define flush_tlb_mm		local_flush_tlb_mm
-#define flush_tlb_page		local_flush_tlb_page
-#define flush_tlb_kernel_page	local_flush_tlb_kernel_page
-#define flush_tlb_range		local_flush_tlb_range
-#define flush_tlb_kernel_range	local_flush_tlb_kernel_range
-
-/*
- * if PG_dcache_clean is not set for the page, we need to ensure that any
- * cache entries for the kernels virtual memory range are written
- * back to the page.
- */
-extern void update_mmu_cache(struct vm_area_struct *vma,
-		unsigned long addr, pte_t *ptep);
-
-extern void do_bad_area(unsigned long addr, unsigned int fsr,
-		struct pt_regs *regs);
-
-#endif
-
-#endif
diff --git a/arch/unicore32/include/asm/traps.h b/arch/unicore32/include/asm/traps.h
deleted file mode 100644
index ad1508a9a903..000000000000
--- a/arch/unicore32/include/asm/traps.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/traps.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_TRAP_H__
-#define __UNICORE_TRAP_H__
-
-extern void __init early_trap_init(void);
-extern void dump_backtrace_entry(unsigned long where,
-		unsigned long from, unsigned long frame);
-
-extern void do_DataAbort(unsigned long addr, unsigned int fsr,
-		 struct pt_regs *regs);
-#endif
diff --git a/arch/unicore32/include/asm/uaccess.h b/arch/unicore32/include/asm/uaccess.h
deleted file mode 100644
index 33c24f430511..000000000000
--- a/arch/unicore32/include/asm/uaccess.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/uaccess.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_UACCESS_H__
-#define __UNICORE_UACCESS_H__
-
-#include <asm/memory.h>
-
-#define __strncpy_from_user	__strncpy_from_user
-#define __strnlen_user		__strnlen_user
-#define __clear_user		__clear_user
-
-#define __kernel_ok		(uaccess_kernel())
-#define __user_ok(addr, size)	(((size) <= TASK_SIZE)			\
-				&& ((addr) <= TASK_SIZE - (size)))
-#define __access_ok(addr, size)	(__kernel_ok || __user_ok((addr), (size)))
-
-extern unsigned long __must_check
-raw_copy_from_user(void *to, const void __user *from, unsigned long n);
-extern unsigned long __must_check
-raw_copy_to_user(void __user *to, const void *from, unsigned long n);
-extern unsigned long __must_check
-__clear_user(void __user *addr, unsigned long n);
-extern unsigned long __must_check
-__strncpy_from_user(char *to, const char __user *from, unsigned long count);
-extern unsigned long
-__strnlen_user(const char __user *s, long n);
-#define INLINE_COPY_FROM_USER
-#define INLINE_COPY_TO_USER
-
-#include <asm-generic/uaccess.h>
-
-#endif /* __UNICORE_UACCESS_H__ */
diff --git a/arch/unicore32/include/asm/vmalloc.h b/arch/unicore32/include/asm/vmalloc.h
deleted file mode 100644
index 054435818a14..000000000000
--- a/arch/unicore32/include/asm/vmalloc.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef _ASM_UNICORE32_VMALLOC_H
-#define _ASM_UNICORE32_VMALLOC_H
-
-#endif /* _ASM_UNICORE32_VMALLOC_H */
diff --git a/arch/unicore32/include/mach/PKUnity.h b/arch/unicore32/include/mach/PKUnity.h
deleted file mode 100644
index 78f77517c1c7..000000000000
--- a/arch/unicore32/include/mach/PKUnity.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/mach/PKUnity.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-/* Be sure that virtual mapping is defined right */
-#ifndef __MACH_PUV3_HARDWARE_H__
-#error You must include hardware.h not PKUnity.h
-#endif
-
-#include <mach/bitfield.h>
-
-/*
- * Memory Definitions
- */
-#define PKUNITY_SDRAM_BASE		0x00000000 /* 0x00000000 - 0x7FFFFFFF 2GB */
-#define PKUNITY_MMIO_BASE		0x80000000 /* 0x80000000 - 0xFFFFFFFF 2GB */
-
-/*
- * PKUNITY System Bus Addresses (PCI): 0x80000000 - 0xBFFFFFFF (1GB)
- * 0x80000000 - 0x8000000B 12B    PCI Configuration regs
- * 0x80010000 - 0x80010250 592B   PCI Bridge Base
- * 0x80030000 - 0x8003FFFF 64KB   PCI Legacy IO
- * 0x90000000 - 0x97FFFFFF 128MB  PCI AHB-PCI MEM-mapping
- * 0x98000000 - 0x9FFFFFFF 128MB  PCI PCI-AHB MEM-mapping
- */
-#define PKUNITY_PCI_BASE		io_p2v(0x80000000) /* 0x80000000 - 0xBFFFFFFF 1GB */
-#include <mach/regs-pci.h>
-
-#define PKUNITY_PCICFG_BASE		(PKUNITY_PCI_BASE + 0x0)
-#define PKUNITY_PCIBRI_BASE		(PKUNITY_PCI_BASE + 0x00010000)
-#define PKUNITY_PCILIO_BASE		(PKUNITY_PCI_BASE + 0x00030000)
-#define PKUNITY_PCIMEM_BASE		(PKUNITY_PCI_BASE + 0x10000000)
-#define PKUNITY_PCIAHB_BASE		(PKUNITY_PCI_BASE + 0x18000000)
-
-/*
- * PKUNITY System Bus Addresses (AHB): 0xC0000000 - 0xEDFFFFFF (640MB)
- */
-#define PKUNITY_AHB_BASE		io_p2v(0xC0000000)
-
-/* AHB-0 is DDR2 SDRAM */
-/* AHB-1 is PCI Space */
-#define PKUNITY_ARBITER_BASE		(PKUNITY_AHB_BASE + 0x000000) /* AHB-2 */
-#define PKUNITY_DDR2CTRL_BASE		(PKUNITY_AHB_BASE + 0x100000) /* AHB-3 */
-#define PKUNITY_DMAC_BASE		(PKUNITY_AHB_BASE + 0x200000) /* AHB-4 */
-#include <mach/regs-dmac.h>
-#define PKUNITY_UMAL_BASE		(PKUNITY_AHB_BASE + 0x300000) /* AHB-5 */
-#include <mach/regs-umal.h>
-#define PKUNITY_USB_BASE		(PKUNITY_AHB_BASE + 0x400000) /* AHB-6 */
-#define PKUNITY_SATA_BASE		(PKUNITY_AHB_BASE + 0x500000) /* AHB-7 */
-#define PKUNITY_SMC_BASE		(PKUNITY_AHB_BASE + 0x600000) /* AHB-8 */
-/* AHB-9 is for APB bridge */
-#define PKUNITY_MME_BASE		(PKUNITY_AHB_BASE + 0x700000) /* AHB-10 */
-#define PKUNITY_UNIGFX_BASE		(PKUNITY_AHB_BASE + 0x800000) /* AHB-11 */
-#include <mach/regs-unigfx.h>
-#define PKUNITY_NAND_BASE		(PKUNITY_AHB_BASE + 0x900000) /* AHB-12 */
-#include <mach/regs-nand.h>
-#define PKUNITY_H264D_BASE		(PKUNITY_AHB_BASE + 0xA00000) /* AHB-13 */
-#define PKUNITY_H264E_BASE		(PKUNITY_AHB_BASE + 0xB00000) /* AHB-14 */
-
-/*
- * PKUNITY Peripheral Bus Addresses (APB): 0xEE000000 - 0xEFFFFFFF (128MB)
- */
-#define PKUNITY_APB_BASE		io_p2v(0xEE000000)
-
-#define PKUNITY_UART0_BASE		(PKUNITY_APB_BASE + 0x000000) /* APB-0 */
-#define PKUNITY_UART1_BASE		(PKUNITY_APB_BASE + 0x100000) /* APB-1 */
-#include <mach/regs-uart.h>
-#define PKUNITY_I2C_BASE		(PKUNITY_APB_BASE + 0x200000) /* APB-2 */
-#include <mach/regs-i2c.h>
-#define PKUNITY_SPI_BASE		(PKUNITY_APB_BASE + 0x300000) /* APB-3 */
-#include <mach/regs-spi.h>
-#define PKUNITY_AC97_BASE		(PKUNITY_APB_BASE + 0x400000) /* APB-4 */
-#include <mach/regs-ac97.h>
-#define PKUNITY_GPIO_BASE		(PKUNITY_APB_BASE + 0x500000) /* APB-5 */
-#include <mach/regs-gpio.h>
-#define PKUNITY_INTC_BASE		(PKUNITY_APB_BASE + 0x600000) /* APB-6 */
-#include <mach/regs-intc.h>
-#define PKUNITY_RTC_BASE		(PKUNITY_APB_BASE + 0x700000) /* APB-7 */
-#include <mach/regs-rtc.h>
-#define PKUNITY_OST_BASE		(PKUNITY_APB_BASE + 0x800000) /* APB-8 */
-#include <mach/regs-ost.h>
-#define PKUNITY_RESETC_BASE		(PKUNITY_APB_BASE + 0x900000) /* APB-9 */
-#include <mach/regs-resetc.h>
-#define PKUNITY_PM_BASE			(PKUNITY_APB_BASE + 0xA00000) /* APB-10 */
-#include <mach/regs-pm.h>
-#define PKUNITY_PS2_BASE		(PKUNITY_APB_BASE + 0xB00000) /* APB-11 */
-#include <mach/regs-ps2.h>
-#define PKUNITY_SDC_BASE		(PKUNITY_APB_BASE + 0xC00000) /* APB-12 */
-#include <mach/regs-sdc.h>
-
diff --git a/arch/unicore32/include/mach/bitfield.h b/arch/unicore32/include/mach/bitfield.h
deleted file mode 100644
index 766b7f01f1cd..000000000000
--- a/arch/unicore32/include/mach/bitfield.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/mach/bitfield.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __MACH_PUV3_BITFIELD_H__
-#define __MACH_PUV3_BITFIELD_H__
-
-#ifndef __ASSEMBLY__
-#define UData(Data)	((unsigned long) (Data))
-#else
-#define UData(Data)	(Data)
-#endif
-
-#define FIELD(val, vmask, vshift)	(((val) & ((UData(1) << (vmask)) - 1)) << (vshift))
-#define FMASK(vmask, vshift)		(((UData(1) << (vmask)) - 1) << (vshift))
-
-#endif /* __MACH_PUV3_BITFIELD_H__ */
diff --git a/arch/unicore32/include/mach/dma.h b/arch/unicore32/include/mach/dma.h
deleted file mode 100644
index 271001cd13c4..000000000000
--- a/arch/unicore32/include/mach/dma.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/mach/dma.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __MACH_PUV3_DMA_H__
-#define __MACH_PUV3_DMA_H__
-
-/*
- * The PKUnity has six internal DMA channels.
- */
-#define MAX_DMA_CHANNELS	6
-
-typedef enum {
-	DMA_PRIO_HIGH = 0,
-	DMA_PRIO_MEDIUM = 1,
-	DMA_PRIO_LOW = 2
-} puv3_dma_prio;
-
-/*
- * DMA registration
- */
-
-extern int puv3_request_dma(char *name,
-			 puv3_dma_prio prio,
-			 void (*irq_handler)(int, void *),
-			 void (*err_handler)(int, void *),
-			 void *data);
-
-extern void puv3_free_dma(int dma_ch);
-
-static inline void puv3_stop_dma(int ch)
-{
-	writel(readl(DMAC_CONFIG(ch)) & ~DMAC_CONFIG_EN, DMAC_CONFIG(ch));
-}
-
-static inline void puv3_resume_dma(int ch)
-{
-	writel(readl(DMAC_CONFIG(ch)) | DMAC_CONFIG_EN, DMAC_CONFIG(ch));
-}
-
-#endif /* __MACH_PUV3_DMA_H__ */
diff --git a/arch/unicore32/include/mach/hardware.h b/arch/unicore32/include/mach/hardware.h
deleted file mode 100644
index 2d7571cbd1d0..000000000000
--- a/arch/unicore32/include/mach/hardware.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/mach/hardware.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- * This file contains the hardware definitions for PKUnity architecture
- */
-
-#ifndef __MACH_PUV3_HARDWARE_H__
-#define __MACH_PUV3_HARDWARE_H__
-
-#include <mach/PKUnity.h>
-
-#ifndef __ASSEMBLY__
-#define io_p2v(x)	(void __iomem *)((x) - PKUNITY_MMIO_BASE)
-#define io_v2p(x)	(phys_addr_t)((x) + PKUNITY_MMIO_BASE)
-#else
-#define io_p2v(x)	((x) - PKUNITY_MMIO_BASE)
-#define io_v2p(x)	((x) + PKUNITY_MMIO_BASE)
-#endif
-
-#define PCIBIOS_MIN_IO			0x4000 /* should lower than 64KB */
-#define PCIBIOS_MIN_MEM			io_v2p(PKUNITY_PCIMEM_BASE)
-
-#define pcibios_assign_all_busses()	1
-
-#endif  /* __MACH_PUV3_HARDWARE_H__ */
diff --git a/arch/unicore32/include/mach/map.h b/arch/unicore32/include/mach/map.h
deleted file mode 100644
index 7a83eeeb1287..000000000000
--- a/arch/unicore32/include/mach/map.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/mach/map.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- *  Page table mapping constructs and function prototypes
- */
-#define MT_DEVICE		0
-#define MT_DEVICE_CACHED	2
-#define MT_KUSER		7
-#define MT_HIGH_VECTORS		8
-#define MT_MEMORY		9
-#define MT_ROM			10
-
diff --git a/arch/unicore32/include/mach/memory.h b/arch/unicore32/include/mach/memory.h
deleted file mode 100644
index b4e6035cb9a3..000000000000
--- a/arch/unicore32/include/mach/memory.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/mach/memory.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __MACH_PUV3_MEMORY_H__
-#define __MACH_PUV3_MEMORY_H__
-
-#include <mach/hardware.h>
-
-/* Physical DRAM offset. */
-#define PHYS_OFFSET	UL(0x00000000)
-/* The base address of exception vectors. */
-#define VECTORS_BASE	UL(0xffff0000)
-/* The base address of kuser area. */
-#define KUSER_BASE	UL(0x80000000)
-
-#ifdef __ASSEMBLY__
-/* The byte offset of the kernel image in RAM from the start of RAM. */
-#define KERNEL_IMAGE_START	0x00408000
-#endif
-
-#if !defined(__ASSEMBLY__) && defined(CONFIG_PCI)
-
-void puv3_pci_adjust_zones(unsigned long *max_zone_pfn);
-
-#define arch_adjust_zones(max_zone_pfn) \
-	puv3_pci_adjust_zones(max_zone_pfn)
-
-#endif
-
-/*
- * PCI controller in PKUnity-3 masks highest 5-bit for upstream channel,
- * so we must limit the DMA allocation within 128M physical memory for
- * supporting PCI devices.
- */
-#define PCI_DMA_THRESHOLD	(PHYS_OFFSET + SZ_128M - 1)
-
-#define is_pcibus_device(dev)	(dev &&			\
-				(strncmp(dev->bus->name, "pci", 3) == 0))
-
-#define __virt_to_pcibus(x)     (__virt_to_phys((x) + PKUNITY_PCIAHB_BASE))
-#define __pcibus_to_virt(x)     (__phys_to_virt(x) - PKUNITY_PCIAHB_BASE)
-
-/* kuser area */
-#define KUSER_VECPAGE_BASE	(KUSER_BASE + UL(0x3fff0000))
-/* kuser_vecpage (0xbfff0000) is ro, and vectors page (0xffff0000) is rw */
-#define kuser_vecpage_to_vectors(x)	((x) - (KUSER_VECPAGE_BASE)	\
-					+ (VECTORS_BASE))
-
-#endif
diff --git a/arch/unicore32/include/mach/ocd.h b/arch/unicore32/include/mach/ocd.h
deleted file mode 100644
index 2a814929e389..000000000000
--- a/arch/unicore32/include/mach/ocd.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/mach/ocd.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#ifndef __MACH_PUV3_OCD_H__
-#define __MACH_PUV3_OCD_H__
-
-#if defined(CONFIG_DEBUG_OCD)
-static inline void ocd_putc(unsigned int c)
-{
-	int status, i = 0x2000000;
-
-	do {
-		if (--i < 0)
-			return;
-
-		asm volatile ("movc %0, p1.c0, #0" : "=r" (status));
-	} while (status & 2);
-
-	asm("movc p1.c1, %0, #1" : : "r" (c));
-}
-
-#define putc(ch)	ocd_putc(ch)
-#else
-#define putc(ch)
-#endif
-
-#endif
diff --git a/arch/unicore32/include/mach/pm.h b/arch/unicore32/include/mach/pm.h
deleted file mode 100644
index cb40b8490a57..000000000000
--- a/arch/unicore32/include/mach/pm.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore/include/mach/pm.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __PUV3_PM_H__
-#define __PUV3_PM_H__
-
-#include <linux/suspend.h>
-
-struct puv3_cpu_pm_fns {
-	int	save_count;
-	void	(*save)(unsigned long *);
-	void	(*restore)(unsigned long *);
-	int	(*valid)(suspend_state_t state);
-	void	(*enter)(suspend_state_t state);
-	int	(*prepare)(void);
-	void	(*finish)(void);
-};
-
-extern struct puv3_cpu_pm_fns *puv3_cpu_pm_fns;
-
-/* sleep.S */
-extern void puv3_cpu_suspend(unsigned int);
-
-extern void puv3_cpu_resume(void);
-
-extern int puv3_pm_enter(suspend_state_t state);
-
-/* Defined in hibernate_asm.S */
-extern int restore_image(pgd_t *resume_pg_dir, struct pbe *restore_pblist);
-
-extern struct pbe *restore_pblist;
-#endif
diff --git a/arch/unicore32/include/mach/regs-ac97.h b/arch/unicore32/include/mach/regs-ac97.h
deleted file mode 100644
index 85c601898d02..000000000000
--- a/arch/unicore32/include/mach/regs-ac97.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity AC97 Registers
- */
-
-#define PKUNITY_AC97_CONR		(PKUNITY_AC97_BASE + 0x0000)
-#define PKUNITY_AC97_OCR		(PKUNITY_AC97_BASE + 0x0004)
-#define PKUNITY_AC97_ICR		(PKUNITY_AC97_BASE + 0x0008)
-#define PKUNITY_AC97_CRAC		(PKUNITY_AC97_BASE + 0x000C)
-#define PKUNITY_AC97_INTR		(PKUNITY_AC97_BASE + 0x0010)
-#define PKUNITY_AC97_INTRSTAT		(PKUNITY_AC97_BASE + 0x0014)
-#define PKUNITY_AC97_INTRCLEAR		(PKUNITY_AC97_BASE + 0x0018)
-#define PKUNITY_AC97_ENABLE		(PKUNITY_AC97_BASE + 0x001C)
-#define PKUNITY_AC97_OUT_FIFO		(PKUNITY_AC97_BASE + 0x0020)
-#define PKUNITY_AC97_IN_FIFO		(PKUNITY_AC97_BASE + 0x0030)
-
-#define AC97_CODEC_REG(v)               FIELD((v), 7, 16)
-#define AC97_CODEC_VAL(v)               FIELD((v), 16, 0)
-#define AC97_CODEC_WRITECOMPLETE        FIELD(1, 1, 2)
-
-/*
- * VAR PLAY SAMPLE RATE
- */
-#define AC97_CMD_VPSAMPLE		(FIELD(3, 2, 16) | FIELD(3, 2, 0))
-
-/*
- * FIX CAPTURE SAMPLE RATE
- */
-#define AC97_CMD_FCSAMPLE		FIELD(7, 3, 0)
-
-#define AC97_CMD_RESET			FIELD(1, 1, 0)
-#define AC97_CMD_ENABLE			FIELD(1, 1, 0)
-#define AC97_CMD_DISABLE		FIELD(0, 1, 0)
diff --git a/arch/unicore32/include/mach/regs-dmac.h b/arch/unicore32/include/mach/regs-dmac.h
deleted file mode 100644
index bbdc52d06a98..000000000000
--- a/arch/unicore32/include/mach/regs-dmac.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity Direct Memory Access Controller (DMAC)
- */
-
-/*
- * Interrupt Status Reg DMAC_ISR.
- */
-#define DMAC_ISR		(PKUNITY_DMAC_BASE + 0x0020)
-/*
- * Interrupt Transfer Complete Status Reg DMAC_ITCSR.
- */
-#define DMAC_ITCSR		(PKUNITY_DMAC_BASE + 0x0050)
-/*
- * Interrupt Transfer Complete Clear Reg DMAC_ITCCR.
- */
-#define DMAC_ITCCR		(PKUNITY_DMAC_BASE + 0x0060)
-/*
- * Interrupt Error Status Reg DMAC_IESR.
- */
-#define DMAC_IESR		(PKUNITY_DMAC_BASE + 0x0080)
-/*
- * Interrupt Error Clear Reg DMAC_IECR.
- */
-#define DMAC_IECR		(PKUNITY_DMAC_BASE + 0x0090)
-/*
- * Enable Channels Reg DMAC_ENCH.
- */
-#define DMAC_ENCH		(PKUNITY_DMAC_BASE + 0x00B0)
-
-/*
- * DMA control reg. Space [byte]
- */
-#define DMASp                   0x00000100
-
-/*
- * Source Addr DMAC_SRCADDR(ch).
- */
-#define DMAC_SRCADDR(ch)	(PKUNITY_DMAC_BASE + (ch)*DMASp + 0x00)
-/*
- * Destination Addr DMAC_DESTADDR(ch).
- */
-#define DMAC_DESTADDR(ch)	(PKUNITY_DMAC_BASE + (ch)*DMASp + 0x04)
-/*
- * Control Reg DMAC_CONTROL(ch).
- */
-#define DMAC_CONTROL(ch)	(PKUNITY_DMAC_BASE + (ch)*DMASp + 0x0C)
-/*
- * Configuration Reg DMAC_CONFIG(ch).
- */
-#define DMAC_CONFIG(ch)		(PKUNITY_DMAC_BASE + (ch)*DMASp + 0x10)
-
-#define DMAC_IR_MASK            FMASK(6, 0)
-/*
- * select channel (ch)
- */
-#define DMAC_CHANNEL(ch)	FIELD(1, 1, (ch))
-
-#define DMAC_CONTROL_SIZE_BYTE(v)       (FIELD((v), 12, 14) | \
-					FIELD(0, 3, 9) | FIELD(0, 3, 6))
-#define DMAC_CONTROL_SIZE_HWORD(v)      (FIELD((v) >> 1, 12, 14) | \
-					FIELD(1, 3, 9) | FIELD(1, 3, 6))
-#define DMAC_CONTROL_SIZE_WORD(v)       (FIELD((v) >> 2, 12, 14) | \
-					FIELD(2, 3, 9) | FIELD(2, 3, 6))
-#define DMAC_CONTROL_DI                 FIELD(1, 1, 13)
-#define DMAC_CONTROL_SI                 FIELD(1, 1, 12)
-#define DMAC_CONTROL_BURST_1BYTE        (FIELD(0, 3, 3) | FIELD(0, 3, 0))
-#define DMAC_CONTROL_BURST_4BYTE        (FIELD(3, 3, 3) | FIELD(3, 3, 0))
-#define DMAC_CONTROL_BURST_8BYTE        (FIELD(5, 3, 3) | FIELD(5, 3, 0))
-#define DMAC_CONTROL_BURST_16BYTE       (FIELD(7, 3, 3) | FIELD(7, 3, 0))
-
-#define	DMAC_CONFIG_UART0_WR    (FIELD(2, 4, 11) | FIELD(1, 2, 1))
-#define	DMAC_CONFIG_UART0_RD    (FIELD(2, 4, 7)  | FIELD(2, 2, 1))
-#define	DMAC_CONFIG_UART1_WR    (FIELD(3, 4, 11) | FIELD(1, 2, 1))
-#define	DMAC_CONFIG_UART1RD     (FIELD(3, 4, 7)  | FIELD(2, 2, 1))
-#define	DMAC_CONFIG_AC97WR      (FIELD(4, 4, 11) | FIELD(1, 2, 1))
-#define	DMAC_CONFIG_AC97RD      (FIELD(4, 4, 7)  | FIELD(2, 2, 1))
-#define	DMAC_CONFIG_MMCWR       (FIELD(7, 4, 11) | FIELD(1, 2, 1))
-#define	DMAC_CONFIG_MMCRD       (FIELD(7, 4, 7)  | FIELD(2, 2, 1))
-#define DMAC_CONFIG_MASKITC     FIELD(1, 1, 4)
-#define DMAC_CONFIG_MASKIE      FIELD(1, 1, 3)
-#define DMAC_CONFIG_EN          FIELD(1, 1, 0)
diff --git a/arch/unicore32/include/mach/regs-gpio.h b/arch/unicore32/include/mach/regs-gpio.h
deleted file mode 100644
index 5fc701ee33e3..000000000000
--- a/arch/unicore32/include/mach/regs-gpio.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity General-Purpose Input/Output (GPIO) Registers
- */
-
-/*
- * Voltage Status Reg GPIO_GPLR.
- */
-#define GPIO_GPLR	(PKUNITY_GPIO_BASE + 0x0000)
-/*
- * Pin Direction Reg GPIO_GPDR.
- */
-#define GPIO_GPDR	(PKUNITY_GPIO_BASE + 0x0004)
-/*
- * Output Pin Set Reg GPIO_GPSR.
- */
-#define GPIO_GPSR	(PKUNITY_GPIO_BASE + 0x0008)
-/*
- * Output Pin Clear Reg GPIO_GPCR.
- */
-#define GPIO_GPCR	(PKUNITY_GPIO_BASE + 0x000C)
-/*
- * Raise Edge Detect Reg GPIO_GRER.
- */
-#define GPIO_GRER	(PKUNITY_GPIO_BASE + 0x0010)
-/*
- * Fall Edge Detect Reg GPIO_GFER.
- */
-#define GPIO_GFER	(PKUNITY_GPIO_BASE + 0x0014)
-/*
- * Edge Status Reg GPIO_GEDR.
- */
-#define GPIO_GEDR	(PKUNITY_GPIO_BASE + 0x0018)
-/*
- * Special Voltage Detect Reg GPIO_GPIR.
- */
-#define GPIO_GPIR	(PKUNITY_GPIO_BASE + 0x0020)
-
-#define GPIO_MIN	(0)
-#define GPIO_MAX	(27)
-
-#define GPIO_GPIO(Nb)	(0x00000001 << (Nb))	/* GPIO [0..27] */
-#define GPIO_GPIO0	GPIO_GPIO(0)	/* GPIO  [0] */
-#define GPIO_GPIO1	GPIO_GPIO(1)	/* GPIO  [1] */
-#define GPIO_GPIO2	GPIO_GPIO(2)	/* GPIO  [2] */
-#define GPIO_GPIO3	GPIO_GPIO(3)	/* GPIO  [3] */
-#define GPIO_GPIO4	GPIO_GPIO(4)	/* GPIO  [4] */
-#define GPIO_GPIO5	GPIO_GPIO(5)	/* GPIO  [5] */
-#define GPIO_GPIO6	GPIO_GPIO(6)	/* GPIO  [6] */
-#define GPIO_GPIO7	GPIO_GPIO(7)	/* GPIO  [7] */
-#define GPIO_GPIO8	GPIO_GPIO(8)	/* GPIO  [8] */
-#define GPIO_GPIO9	GPIO_GPIO(9)	/* GPIO  [9] */
-#define GPIO_GPIO10	GPIO_GPIO(10)	/* GPIO [10] */
-#define GPIO_GPIO11	GPIO_GPIO(11)	/* GPIO [11] */
-#define GPIO_GPIO12	GPIO_GPIO(12)	/* GPIO [12] */
-#define GPIO_GPIO13	GPIO_GPIO(13)	/* GPIO [13] */
-#define GPIO_GPIO14	GPIO_GPIO(14)	/* GPIO [14] */
-#define GPIO_GPIO15	GPIO_GPIO(15)	/* GPIO [15] */
-#define GPIO_GPIO16	GPIO_GPIO(16)	/* GPIO [16] */
-#define GPIO_GPIO17	GPIO_GPIO(17)	/* GPIO [17] */
-#define GPIO_GPIO18	GPIO_GPIO(18)	/* GPIO [18] */
-#define GPIO_GPIO19	GPIO_GPIO(19)	/* GPIO [19] */
-#define GPIO_GPIO20	GPIO_GPIO(20)	/* GPIO [20] */
-#define GPIO_GPIO21	GPIO_GPIO(21)	/* GPIO [21] */
-#define GPIO_GPIO22	GPIO_GPIO(22)	/* GPIO [22] */
-#define GPIO_GPIO23	GPIO_GPIO(23)	/* GPIO [23] */
-#define GPIO_GPIO24	GPIO_GPIO(24)	/* GPIO [24] */
-#define GPIO_GPIO25	GPIO_GPIO(25)	/* GPIO [25] */
-#define GPIO_GPIO26	GPIO_GPIO(26)	/* GPIO [26] */
-#define GPIO_GPIO27	GPIO_GPIO(27)	/* GPIO [27] */
-
diff --git a/arch/unicore32/include/mach/regs-i2c.h b/arch/unicore32/include/mach/regs-i2c.h
deleted file mode 100644
index b41aa7c92430..000000000000
--- a/arch/unicore32/include/mach/regs-i2c.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity Inter-integrated Circuit (I2C) Registers
- */
-
-/*
- * Control Reg I2C_CON.
- */
-#define I2C_CON		(PKUNITY_I2C_BASE + 0x0000)
-/*
- * Target Address Reg I2C_TAR.
- */
-#define I2C_TAR		(PKUNITY_I2C_BASE + 0x0004)
-/*
- * Data buffer and command Reg I2C_DATACMD.
- */
-#define I2C_DATACMD	(PKUNITY_I2C_BASE + 0x0010)
-/*
- * Enable Reg I2C_ENABLE.
- */
-#define I2C_ENABLE	(PKUNITY_I2C_BASE + 0x006C)
-/*
- * Status Reg I2C_STATUS.
- */
-#define I2C_STATUS	(PKUNITY_I2C_BASE + 0x0070)
-/*
- * Tx FIFO Length Reg I2C_TXFLR.
- */
-#define I2C_TXFLR	(PKUNITY_I2C_BASE + 0x0074)
-/*
- * Rx FIFO Length Reg I2C_RXFLR.
- */
-#define I2C_RXFLR	(PKUNITY_I2C_BASE + 0x0078)
-/*
- * Enable Status Reg I2C_ENSTATUS.
- */
-#define I2C_ENSTATUS	(PKUNITY_I2C_BASE + 0x009C)
-
-#define I2C_CON_MASTER          FIELD(1, 1, 0)
-#define I2C_CON_SPEED_STD       FIELD(1, 2, 1)
-#define I2C_CON_SPEED_FAST      FIELD(2, 2, 1)
-#define I2C_CON_RESTART         FIELD(1, 1, 5)
-#define I2C_CON_SLAVEDISABLE    FIELD(1, 1, 6)
-
-#define I2C_DATACMD_READ        FIELD(1, 1, 8)
-#define I2C_DATACMD_WRITE       FIELD(0, 1, 8)
-#define I2C_DATACMD_DAT_MASK    FMASK(8, 0)
-#define I2C_DATACMD_DAT(v)      FIELD((v), 8, 0)
-
-#define I2C_ENABLE_ENABLE       FIELD(1, 1, 0)
-#define I2C_ENABLE_DISABLE      FIELD(0, 1, 0)
-
-#define I2C_STATUS_RFF          FIELD(1, 1, 4)
-#define I2C_STATUS_RFNE         FIELD(1, 1, 3)
-#define I2C_STATUS_TFE          FIELD(1, 1, 2)
-#define I2C_STATUS_TFNF         FIELD(1, 1, 1)
-#define I2C_STATUS_ACTIVITY     FIELD(1, 1, 0)
-
-#define I2C_ENSTATUS_ENABLE	FIELD(1, 1, 0)
-
-#define I2C_TAR_THERMAL	0x4f
-#define I2C_TAR_SPD	0x50
-#define I2C_TAR_PWIC    0x55
-#define I2C_TAR_EEPROM	0x57
diff --git a/arch/unicore32/include/mach/regs-intc.h b/arch/unicore32/include/mach/regs-intc.h
deleted file mode 100644
index 4eb1b5b571bb..000000000000
--- a/arch/unicore32/include/mach/regs-intc.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUNITY Interrupt Controller (INTC) Registers
- */
-/*
- * INTC Level Reg INTC_ICLR.
- */
-#define INTC_ICLR	(PKUNITY_INTC_BASE + 0x0000)
-/*
- * INTC Mask Reg INTC_ICMR.
- */
-#define INTC_ICMR	(PKUNITY_INTC_BASE + 0x0004)
-/*
- * INTC Pending Reg INTC_ICPR.
- */
-#define INTC_ICPR	(PKUNITY_INTC_BASE + 0x0008)
-/*
- * INTC IRQ Pending Reg INTC_ICIP.
- */
-#define INTC_ICIP	(PKUNITY_INTC_BASE + 0x000C)
-/*
- * INTC REAL Pending Reg INTC_ICFP.
- */
-#define INTC_ICFP	(PKUNITY_INTC_BASE + 0x0010)
-/*
- * INTC Control Reg INTC_ICCR.
- */
-#define INTC_ICCR	(PKUNITY_INTC_BASE + 0x0014)
-
diff --git a/arch/unicore32/include/mach/regs-nand.h b/arch/unicore32/include/mach/regs-nand.h
deleted file mode 100644
index 7f29939251ef..000000000000
--- a/arch/unicore32/include/mach/regs-nand.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity NAND Controller Registers
- */
-/*
- * ID Reg. 0 NAND_IDR0
- */
-#define NAND_IDR0	(PKUNITY_NAND_BASE + 0x0000)
-/*
- * ID Reg. 1 NAND_IDR1
- */
-#define NAND_IDR1	(PKUNITY_NAND_BASE + 0x0004)
-/*
- * ID Reg. 2 NAND_IDR2
- */
-#define NAND_IDR2	(PKUNITY_NAND_BASE + 0x0008)
-/*
- * ID Reg. 3 NAND_IDR3
- */
-#define NAND_IDR3	(PKUNITY_NAND_BASE + 0x000C)
-/*
- * Page Address Reg 0 NAND_PAR0
- */
-#define NAND_PAR0	(PKUNITY_NAND_BASE + 0x0010)
-/*
- * Page Address Reg 1 NAND_PAR1
- */
-#define NAND_PAR1	(PKUNITY_NAND_BASE + 0x0014)
-/*
- * Page Address Reg 2 NAND_PAR2
- */
-#define NAND_PAR2	(PKUNITY_NAND_BASE + 0x0018)
-/*
- * ECC Enable Reg NAND_ECCEN
- */
-#define NAND_ECCEN	(PKUNITY_NAND_BASE + 0x001C)
-/*
- * Buffer Reg NAND_BUF
- */
-#define NAND_BUF	(PKUNITY_NAND_BASE + 0x0020)
-/*
- * ECC Status Reg NAND_ECCSR
- */
-#define NAND_ECCSR	(PKUNITY_NAND_BASE + 0x0024)
-/*
- * Command Reg NAND_CMD
- */
-#define NAND_CMD	(PKUNITY_NAND_BASE + 0x0028)
-/*
- * DMA Configure Reg NAND_DMACR
- */
-#define NAND_DMACR	(PKUNITY_NAND_BASE + 0x002C)
-/*
- * Interrupt Reg NAND_IR
- */
-#define NAND_IR		(PKUNITY_NAND_BASE + 0x0030)
-/*
- * Interrupt Mask Reg NAND_IMR
- */
-#define NAND_IMR	(PKUNITY_NAND_BASE + 0x0034)
-/*
- * Chip Enable Reg NAND_CHIPEN
- */
-#define NAND_CHIPEN	(PKUNITY_NAND_BASE + 0x0038)
-/*
- * Address Reg NAND_ADDR
- */
-#define NAND_ADDR	(PKUNITY_NAND_BASE + 0x003C)
-
-/*
- * Command bits NAND_CMD_CMD_MASK
- */
-#define NAND_CMD_CMD_MASK		FMASK(4, 4)
-#define NAND_CMD_CMD_READPAGE		FIELD(0x0, 4, 4)
-#define NAND_CMD_CMD_ERASEBLOCK		FIELD(0x6, 4, 4)
-#define NAND_CMD_CMD_READSTATUS		FIELD(0x7, 4, 4)
-#define NAND_CMD_CMD_WRITEPAGE		FIELD(0x8, 4, 4)
-#define NAND_CMD_CMD_READID		FIELD(0x9, 4, 4)
-#define NAND_CMD_CMD_RESET		FIELD(0xf, 4, 4)
-
diff --git a/arch/unicore32/include/mach/regs-ost.h b/arch/unicore32/include/mach/regs-ost.h
deleted file mode 100644
index 6c63e7b7569e..000000000000
--- a/arch/unicore32/include/mach/regs-ost.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity Operating System Timer (OST) Registers
- */
-/*
- * Match Reg 0 OST_OSMR0
- */
-#define OST_OSMR0	(PKUNITY_OST_BASE + 0x0000)
-/*
- * Match Reg 1 OST_OSMR1
- */
-#define OST_OSMR1	(PKUNITY_OST_BASE + 0x0004)
-/*
- * Match Reg 2 OST_OSMR2
- */
-#define OST_OSMR2	(PKUNITY_OST_BASE + 0x0008)
-/*
- * Match Reg 3 OST_OSMR3
- */
-#define OST_OSMR3	(PKUNITY_OST_BASE + 0x000C)
-/*
- * Counter Reg OST_OSCR
- */
-#define OST_OSCR	(PKUNITY_OST_BASE + 0x0010)
-/*
- * Status Reg OST_OSSR
- */
-#define OST_OSSR	(PKUNITY_OST_BASE + 0x0014)
-/*
- * Watchdog Enable Reg OST_OWER
- */
-#define OST_OWER	(PKUNITY_OST_BASE + 0x0018)
-/*
- * Interrupt Enable Reg OST_OIER
- */
-#define OST_OIER	(PKUNITY_OST_BASE + 0x001C)
-
-/*
- * PWM Registers: IO base address: PKUNITY_OST_BASE + 0x80
- *      PWCR: Pulse Width Control Reg
- *      DCCR: Duty Cycle Control Reg
- *      PCR: Period Control Reg
- */
-#define OST_PWM_PWCR	(0x00)
-#define OST_PWM_DCCR	(0x04)
-#define OST_PWM_PCR 	(0x08)
-
-/*
- * Match detected 0 OST_OSSR_M0
- */
-#define OST_OSSR_M0		FIELD(1, 1, 0)
-/*
- * Match detected 1 OST_OSSR_M1
- */
-#define OST_OSSR_M1		FIELD(1, 1, 1)
-/*
- * Match detected 2 OST_OSSR_M2
- */
-#define OST_OSSR_M2		FIELD(1, 1, 2)
-/*
- * Match detected 3 OST_OSSR_M3
- */
-#define OST_OSSR_M3		FIELD(1, 1, 3)
-
-/*
- * Interrupt enable 0 OST_OIER_E0
- */
-#define OST_OIER_E0		FIELD(1, 1, 0)
-/*
- * Interrupt enable 1 OST_OIER_E1
- */
-#define OST_OIER_E1		FIELD(1, 1, 1)
-/*
- * Interrupt enable 2 OST_OIER_E2
- */
-#define OST_OIER_E2		FIELD(1, 1, 2)
-/*
- * Interrupt enable 3 OST_OIER_E3
- */
-#define OST_OIER_E3		FIELD(1, 1, 3)
-
-/*
- * Watchdog Match Enable OST_OWER_WME
- */
-#define OST_OWER_WME		FIELD(1, 1, 0)
-
-/*
- * PWM Full Duty Cycle OST_PWMDCCR_FDCYCLE
- */
-#define OST_PWMDCCR_FDCYCLE	FIELD(1, 1, 10)
-
diff --git a/arch/unicore32/include/mach/regs-pci.h b/arch/unicore32/include/mach/regs-pci.h
deleted file mode 100644
index 25bb307b87c3..000000000000
--- a/arch/unicore32/include/mach/regs-pci.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity AHB-PCI Bridge Registers
- */
-
-/*
- * AHB/PCI fixed physical address for pci addess configuration
- */
-/*
- * PCICFG Bridge Base Reg.
- */
-#define PCICFG_BRIBASE          (PKUNITY_PCICFG_BASE + 0x0000)
-/*
- * PCICFG Address Reg.
- */
-#define PCICFG_ADDR             (PKUNITY_PCICFG_BASE + 0x0004)
-/*
- * PCICFG Address Reg.
- */
-#define PCICFG_DATA             (PKUNITY_PCICFG_BASE + 0x0008)
-
-/*
- * PCI Bridge configuration space
- */
-#define PCIBRI_ID		(PKUNITY_PCIBRI_BASE + 0x0000)
-#define PCIBRI_CMD		(PKUNITY_PCIBRI_BASE + 0x0004)
-#define PCIBRI_CLASS		(PKUNITY_PCIBRI_BASE + 0x0008)
-#define PCIBRI_LTR		(PKUNITY_PCIBRI_BASE + 0x000C)
-#define PCIBRI_BAR0		(PKUNITY_PCIBRI_BASE + 0x0010)
-#define PCIBRI_BAR1		(PKUNITY_PCIBRI_BASE + 0x0014)
-#define PCIBRI_BAR2		(PKUNITY_PCIBRI_BASE + 0x0018)
-#define PCIBRI_BAR3		(PKUNITY_PCIBRI_BASE + 0x001C)
-#define PCIBRI_BAR4		(PKUNITY_PCIBRI_BASE + 0x0020)
-#define PCIBRI_BAR5		(PKUNITY_PCIBRI_BASE + 0x0024)
-
-#define PCIBRI_PCICTL0		(PKUNITY_PCIBRI_BASE + 0x0100)
-#define PCIBRI_PCIBAR0		(PKUNITY_PCIBRI_BASE + 0x0104)
-#define PCIBRI_PCIAMR0		(PKUNITY_PCIBRI_BASE + 0x0108)
-#define PCIBRI_PCITAR0		(PKUNITY_PCIBRI_BASE + 0x010C)
-#define PCIBRI_PCICTL1		(PKUNITY_PCIBRI_BASE + 0x0110)
-#define PCIBRI_PCIBAR1		(PKUNITY_PCIBRI_BASE + 0x0114)
-#define PCIBRI_PCIAMR1		(PKUNITY_PCIBRI_BASE + 0x0118)
-#define PCIBRI_PCITAR1		(PKUNITY_PCIBRI_BASE + 0x011C)
-#define PCIBRI_PCICTL2		(PKUNITY_PCIBRI_BASE + 0x0120)
-#define PCIBRI_PCIBAR2		(PKUNITY_PCIBRI_BASE + 0x0124)
-#define PCIBRI_PCIAMR2		(PKUNITY_PCIBRI_BASE + 0x0128)
-#define PCIBRI_PCITAR2		(PKUNITY_PCIBRI_BASE + 0x012C)
-#define PCIBRI_PCICTL3		(PKUNITY_PCIBRI_BASE + 0x0130)
-#define PCIBRI_PCIBAR3		(PKUNITY_PCIBRI_BASE + 0x0134)
-#define PCIBRI_PCIAMR3		(PKUNITY_PCIBRI_BASE + 0x0138)
-#define PCIBRI_PCITAR3		(PKUNITY_PCIBRI_BASE + 0x013C)
-#define PCIBRI_PCICTL4		(PKUNITY_PCIBRI_BASE + 0x0140)
-#define PCIBRI_PCIBAR4		(PKUNITY_PCIBRI_BASE + 0x0144)
-#define PCIBRI_PCIAMR4		(PKUNITY_PCIBRI_BASE + 0x0148)
-#define PCIBRI_PCITAR4		(PKUNITY_PCIBRI_BASE + 0x014C)
-#define PCIBRI_PCICTL5		(PKUNITY_PCIBRI_BASE + 0x0150)
-#define PCIBRI_PCIBAR5		(PKUNITY_PCIBRI_BASE + 0x0154)
-#define PCIBRI_PCIAMR5		(PKUNITY_PCIBRI_BASE + 0x0158)
-#define PCIBRI_PCITAR5		(PKUNITY_PCIBRI_BASE + 0x015C)
-
-#define PCIBRI_AHBCTL0		(PKUNITY_PCIBRI_BASE + 0x0180)
-#define PCIBRI_AHBBAR0		(PKUNITY_PCIBRI_BASE + 0x0184)
-#define PCIBRI_AHBAMR0		(PKUNITY_PCIBRI_BASE + 0x0188)
-#define PCIBRI_AHBTAR0		(PKUNITY_PCIBRI_BASE + 0x018C)
-#define PCIBRI_AHBCTL1		(PKUNITY_PCIBRI_BASE + 0x0190)
-#define PCIBRI_AHBBAR1		(PKUNITY_PCIBRI_BASE + 0x0194)
-#define PCIBRI_AHBAMR1		(PKUNITY_PCIBRI_BASE + 0x0198)
-#define PCIBRI_AHBTAR1		(PKUNITY_PCIBRI_BASE + 0x019C)
-#define PCIBRI_AHBCTL2		(PKUNITY_PCIBRI_BASE + 0x01A0)
-#define PCIBRI_AHBBAR2		(PKUNITY_PCIBRI_BASE + 0x01A4)
-#define PCIBRI_AHBAMR2		(PKUNITY_PCIBRI_BASE + 0x01A8)
-#define PCIBRI_AHBTAR2		(PKUNITY_PCIBRI_BASE + 0x01AC)
-#define PCIBRI_AHBCTL3		(PKUNITY_PCIBRI_BASE + 0x01B0)
-#define PCIBRI_AHBBAR3		(PKUNITY_PCIBRI_BASE + 0x01B4)
-#define PCIBRI_AHBAMR3		(PKUNITY_PCIBRI_BASE + 0x01B8)
-#define PCIBRI_AHBTAR3		(PKUNITY_PCIBRI_BASE + 0x01BC)
-#define PCIBRI_AHBCTL4		(PKUNITY_PCIBRI_BASE + 0x01C0)
-#define PCIBRI_AHBBAR4		(PKUNITY_PCIBRI_BASE + 0x01C4)
-#define PCIBRI_AHBAMR4		(PKUNITY_PCIBRI_BASE + 0x01C8)
-#define PCIBRI_AHBTAR4		(PKUNITY_PCIBRI_BASE + 0x01CC)
-#define PCIBRI_AHBCTL5		(PKUNITY_PCIBRI_BASE + 0x01D0)
-#define PCIBRI_AHBBAR5		(PKUNITY_PCIBRI_BASE + 0x01D4)
-#define PCIBRI_AHBAMR5		(PKUNITY_PCIBRI_BASE + 0x01D8)
-#define PCIBRI_AHBTAR5		(PKUNITY_PCIBRI_BASE + 0x01DC)
-
-#define PCIBRI_CTLx_AT          FIELD(1, 1, 2)
-#define PCIBRI_CTLx_PREF        FIELD(1, 1, 1)
-#define PCIBRI_CTLx_MRL         FIELD(1, 1, 0)
-
-#define PCIBRI_BARx_ADDR        FIELD(0xFFFFFFFC, 30, 2)
-#define PCIBRI_BARx_IO          FIELD(1, 1, 0)
-#define PCIBRI_BARx_MEM         FIELD(0, 1, 0)
-
-#define PCIBRI_CMD_IO           FIELD(1, 1, 0)
-#define PCIBRI_CMD_MEM          FIELD(1, 1, 1)
diff --git a/arch/unicore32/include/mach/regs-pm.h b/arch/unicore32/include/mach/regs-pm.h
deleted file mode 100644
index 777b1ace39b9..000000000000
--- a/arch/unicore32/include/mach/regs-pm.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUNITY Power Manager (PM) Registers
- */
-/*
- * PM Control Reg PM_PMCR
- */
-#define PM_PMCR                 (PKUNITY_PM_BASE + 0x0000)
-/*
- * PM General Conf. Reg PM_PGCR
- */
-#define PM_PGCR                 (PKUNITY_PM_BASE + 0x0004)
-/*
- * PM PLL Conf. Reg PM_PPCR
- */
-#define PM_PPCR                 (PKUNITY_PM_BASE + 0x0008)
-/*
- * PM Wakeup Enable Reg PM_PWER
- */
-#define PM_PWER                 (PKUNITY_PM_BASE + 0x000C)
-/*
- * PM GPIO Sleep Status Reg PM_PGSR
- */
-#define PM_PGSR                 (PKUNITY_PM_BASE + 0x0010)
-/*
- * PM Clock Gate Reg PM_PCGR
- */
-#define PM_PCGR                 (PKUNITY_PM_BASE + 0x0014)
-/*
- * PM SYS PLL Conf. Reg PM_PLLSYSCFG
- */
-#define PM_PLLSYSCFG            (PKUNITY_PM_BASE + 0x0018)
-/*
- * PM DDR PLL Conf. Reg PM_PLLDDRCFG
- */
-#define PM_PLLDDRCFG            (PKUNITY_PM_BASE + 0x001C)
-/*
- * PM VGA PLL Conf. Reg PM_PLLVGACFG
- */
-#define PM_PLLVGACFG            (PKUNITY_PM_BASE + 0x0020)
-/*
- * PM Div Conf. Reg PM_DIVCFG
- */
-#define PM_DIVCFG               (PKUNITY_PM_BASE + 0x0024)
-/*
- * PM SYS PLL Status Reg PM_PLLSYSSTATUS
- */
-#define PM_PLLSYSSTATUS         (PKUNITY_PM_BASE + 0x0028)
-/*
- * PM DDR PLL Status Reg PM_PLLDDRSTATUS
- */
-#define PM_PLLDDRSTATUS         (PKUNITY_PM_BASE + 0x002C)
-/*
- * PM VGA PLL Status Reg PM_PLLVGASTATUS
- */
-#define PM_PLLVGASTATUS         (PKUNITY_PM_BASE + 0x0030)
-/*
- * PM Div Status Reg PM_DIVSTATUS
- */
-#define PM_DIVSTATUS            (PKUNITY_PM_BASE + 0x0034)
-/*
- * PM Software Reset Reg PM_SWRESET
- */
-#define PM_SWRESET              (PKUNITY_PM_BASE + 0x0038)
-/*
- * PM DDR2 PAD Start Reg PM_DDR2START
- */
-#define PM_DDR2START            (PKUNITY_PM_BASE + 0x003C)
-/*
- * PM DDR2 PAD Status Reg PM_DDR2CAL0
- */
-#define PM_DDR2CAL0             (PKUNITY_PM_BASE + 0x0040)
-/*
- * PM PLL DFC Done Reg PM_PLLDFCDONE
- */
-#define PM_PLLDFCDONE           (PKUNITY_PM_BASE + 0x0044)
-
-#define PM_PMCR_SFB             FIELD(1, 1, 0)
-#define PM_PMCR_IFB             FIELD(1, 1, 1)
-#define PM_PMCR_CFBSYS          FIELD(1, 1, 2)
-#define PM_PMCR_CFBDDR          FIELD(1, 1, 3)
-#define PM_PMCR_CFBVGA          FIELD(1, 1, 4)
-#define PM_PMCR_CFBDIVBCLK      FIELD(1, 1, 5)
-
-/*
- * GPIO 8~27 wake-up enable PM_PWER_GPIOHIGH
- */
-#define PM_PWER_GPIOHIGH        FIELD(1, 1, 8)
-/*
- * RTC alarm wake-up enable PM_PWER_RTC
- */
-#define PM_PWER_RTC             FIELD(1, 1, 31)
-
-#define PM_PCGR_BCLK64DDR	FIELD(1, 1, 0)
-#define PM_PCGR_BCLK64VGA	FIELD(1, 1, 1)
-#define PM_PCGR_BCLKDDR		FIELD(1, 1, 2)
-#define PM_PCGR_BCLKPCI		FIELD(1, 1, 4)
-#define PM_PCGR_BCLKDMAC	FIELD(1, 1, 5)
-#define PM_PCGR_BCLKUMAL	FIELD(1, 1, 6)
-#define PM_PCGR_BCLKUSB		FIELD(1, 1, 7)
-#define PM_PCGR_BCLKMME		FIELD(1, 1, 10)
-#define PM_PCGR_BCLKNAND	FIELD(1, 1, 11)
-#define PM_PCGR_BCLKH264E	FIELD(1, 1, 12)
-#define PM_PCGR_BCLKVGA		FIELD(1, 1, 13)
-#define PM_PCGR_BCLKH264D	FIELD(1, 1, 14)
-#define PM_PCGR_VECLK		FIELD(1, 1, 15)
-#define PM_PCGR_HECLK		FIELD(1, 1, 16)
-#define PM_PCGR_HDCLK		FIELD(1, 1, 17)
-#define PM_PCGR_NANDCLK		FIELD(1, 1, 18)
-#define PM_PCGR_GECLK		FIELD(1, 1, 19)
-#define PM_PCGR_VGACLK          FIELD(1, 1, 20)
-#define PM_PCGR_PCICLK		FIELD(1, 1, 21)
-#define PM_PCGR_SATACLK		FIELD(1, 1, 25)
-
-/*
- * [23:20]PM_DIVCFG_VGACLK(v)
- */
-#define PM_DIVCFG_VGACLK_MASK   FMASK(4, 20)
-#define PM_DIVCFG_VGACLK(v)	FIELD((v), 4, 20)
-
-#define PM_SWRESET_USB          FIELD(1, 1, 6)
-#define PM_SWRESET_VGADIV       FIELD(1, 1, 26)
-#define PM_SWRESET_GEDIV        FIELD(1, 1, 27)
-
-#define PM_PLLDFCDONE_SYSDFC    FIELD(1, 1, 0)
-#define PM_PLLDFCDONE_DDRDFC    FIELD(1, 1, 1)
-#define PM_PLLDFCDONE_VGADFC    FIELD(1, 1, 2)
diff --git a/arch/unicore32/include/mach/regs-ps2.h b/arch/unicore32/include/mach/regs-ps2.h
deleted file mode 100644
index d539d7482462..000000000000
--- a/arch/unicore32/include/mach/regs-ps2.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity PS2 Controller Registers
- */
-/*
- * the same as I8042_DATA_REG PS2_DATA
- */
-#define PS2_DATA	(PKUNITY_PS2_BASE + 0x0060)
-/*
- * the same as I8042_COMMAND_REG PS2_COMMAND
- */
-#define PS2_COMMAND	(PKUNITY_PS2_BASE + 0x0064)
-/*
- * the same as I8042_STATUS_REG PS2_STATUS
- */
-#define PS2_STATUS	(PKUNITY_PS2_BASE + 0x0064)
-/*
- * counter reg PS2_CNT
- */
-#define PS2_CNT		(PKUNITY_PS2_BASE + 0x0068)
-
diff --git a/arch/unicore32/include/mach/regs-resetc.h b/arch/unicore32/include/mach/regs-resetc.h
deleted file mode 100644
index 5f2b9d77a9ec..000000000000
--- a/arch/unicore32/include/mach/regs-resetc.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity Reset Controller (RC) Registers
- */
-/*
- * Software Reset Register
- */
-#define RESETC_SWRR	(PKUNITY_RESETC_BASE + 0x0000)
-/*
- * Reset Status Register
- */
-#define RESETC_RSSR	(PKUNITY_RESETC_BASE + 0x0004)
-
-/*
- * Software Reset Bit
- */
-#define RESETC_SWRR_SRB		FIELD(1, 1, 0)
-
-/*
- * Hardware Reset
- */
-#define RESETC_RSSR_HWR		FIELD(1, 1, 0)
-/*
- * Software Reset
- */
-#define RESETC_RSSR_SWR		FIELD(1, 1, 1)
-/*
- * Watchdog Reset
- */
-#define RESETC_RSSR_WDR		FIELD(1, 1, 2)
-/*
- * Sleep Mode Reset
- */
-#define RESETC_RSSR_SMR		FIELD(1, 1, 3)
-
diff --git a/arch/unicore32/include/mach/regs-rtc.h b/arch/unicore32/include/mach/regs-rtc.h
deleted file mode 100644
index f2f7f47eb65e..000000000000
--- a/arch/unicore32/include/mach/regs-rtc.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity Real-Time Clock (RTC) control registers
- */
-/*
- * RTC Alarm Reg RTC_RTAR
- */
-#define RTC_RTAR	(PKUNITY_RTC_BASE + 0x0000)
-/*
- * RTC Count Reg RTC_RCNR
- */
-#define RTC_RCNR	(PKUNITY_RTC_BASE + 0x0004)
-/*
- * RTC Trim Reg RTC_RTTR
- */
-#define RTC_RTTR	(PKUNITY_RTC_BASE + 0x0008)
-/*
- * RTC Status Reg RTC_RTSR
- */
-#define RTC_RTSR	(PKUNITY_RTC_BASE + 0x0010)
-
-/*
- * ALarm detected RTC_RTSR_AL
- */
-#define RTC_RTSR_AL		FIELD(1, 1, 0)
-/*
- * 1 Hz clock detected RTC_RTSR_HZ
- */
-#define RTC_RTSR_HZ		FIELD(1, 1, 1)
-/*
- * ALarm interrupt Enable RTC_RTSR_ALE
- */
-#define RTC_RTSR_ALE		FIELD(1, 1, 2)
-/*
- * 1 Hz clock interrupt Enable RTC_RTSR_HZE
- */
-#define RTC_RTSR_HZE		FIELD(1, 1, 3)
-
diff --git a/arch/unicore32/include/mach/regs-sdc.h b/arch/unicore32/include/mach/regs-sdc.h
deleted file mode 100644
index 658bfaf4cb3c..000000000000
--- a/arch/unicore32/include/mach/regs-sdc.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity Multi-Media Card and Security Digital Card (MMC/SD) Registers
- */
-/*
- * Clock Control Reg SDC_CCR
- */
-#define SDC_CCR		(PKUNITY_SDC_BASE + 0x0000)
-/*
- * Software Reset Reg SDC_SRR
- */
-#define SDC_SRR		(PKUNITY_SDC_BASE + 0x0004)
-/*
- * Argument Reg SDC_ARGUMENT
- */
-#define SDC_ARGUMENT	(PKUNITY_SDC_BASE + 0x0008)
-/*
- * Command Reg SDC_COMMAND
- */
-#define SDC_COMMAND	(PKUNITY_SDC_BASE + 0x000C)
-/*
- * Block Size Reg SDC_BLOCKSIZE
- */
-#define SDC_BLOCKSIZE	(PKUNITY_SDC_BASE + 0x0010)
-/*
- * Block Cound Reg SDC_BLOCKCOUNT
- */
-#define SDC_BLOCKCOUNT	(PKUNITY_SDC_BASE + 0x0014)
-/*
- * Transfer Mode Reg SDC_TMR
- */
-#define SDC_TMR		(PKUNITY_SDC_BASE + 0x0018)
-/*
- * Response Reg. 0 SDC_RES0
- */
-#define SDC_RES0	(PKUNITY_SDC_BASE + 0x001C)
-/*
- * Response Reg. 1 SDC_RES1
- */
-#define SDC_RES1	(PKUNITY_SDC_BASE + 0x0020)
-/*
- * Response Reg. 2 SDC_RES2
- */
-#define SDC_RES2	(PKUNITY_SDC_BASE + 0x0024)
-/*
- * Response Reg. 3 SDC_RES3
- */
-#define SDC_RES3	(PKUNITY_SDC_BASE + 0x0028)
-/*
- * Read Timeout Control Reg SDC_RTCR
- */
-#define SDC_RTCR	(PKUNITY_SDC_BASE + 0x002C)
-/*
- * Interrupt Status Reg SDC_ISR
- */
-#define SDC_ISR		(PKUNITY_SDC_BASE + 0x0030)
-/*
- * Interrupt Status Mask Reg SDC_ISMR
- */
-#define SDC_ISMR	(PKUNITY_SDC_BASE + 0x0034)
-/*
- * RX FIFO SDC_RXFIFO
- */
-#define SDC_RXFIFO	(PKUNITY_SDC_BASE + 0x0038)
-/*
- * TX FIFO SDC_TXFIFO
- */
-#define SDC_TXFIFO	(PKUNITY_SDC_BASE + 0x003C)
-
-/*
- * SD Clock Enable SDC_CCR_CLKEN
- */
-#define SDC_CCR_CLKEN			FIELD(1, 1, 2)
-/*
- * [15:8] SDC_CCR_PDIV(v)
- */
-#define SDC_CCR_PDIV(v)			FIELD((v), 8, 8)
-
-/*
- * Software reset enable SDC_SRR_ENABLE
- */
-#define SDC_SRR_ENABLE			FIELD(0, 1, 0)
-/*
- * Software reset disable SDC_SRR_DISABLE
- */
-#define SDC_SRR_DISABLE			FIELD(1, 1, 0)
-
-/*
- * Response type SDC_COMMAND_RESTYPE_MASK
- */
-#define SDC_COMMAND_RESTYPE_MASK	FMASK(2, 0)
-/*
- * No response SDC_COMMAND_RESTYPE_NONE
- */
-#define SDC_COMMAND_RESTYPE_NONE	FIELD(0, 2, 0)
-/*
- * 136-bit long response SDC_COMMAND_RESTYPE_LONG
- */
-#define SDC_COMMAND_RESTYPE_LONG	FIELD(1, 2, 0)
-/*
- * 48-bit short response SDC_COMMAND_RESTYPE_SHORT
- */
-#define SDC_COMMAND_RESTYPE_SHORT	FIELD(2, 2, 0)
-/*
- * 48-bit short and test if busy response SDC_COMMAND_RESTYPE_SHORTBUSY
- */
-#define SDC_COMMAND_RESTYPE_SHORTBUSY	FIELD(3, 2, 0)
-/*
- * data ready SDC_COMMAND_DATAREADY
- */
-#define SDC_COMMAND_DATAREADY		FIELD(1, 1, 2)
-#define SDC_COMMAND_CMDEN		FIELD(1, 1, 3)
-/*
- * [10:5] SDC_COMMAND_CMDINDEX(v)
- */
-#define SDC_COMMAND_CMDINDEX(v)		FIELD((v), 6, 5)
-
-/*
- * [10:0] SDC_BLOCKSIZE_BSMASK(v)
- */
-#define SDC_BLOCKSIZE_BSMASK(v)		FIELD((v), 11, 0)
-/*
- * [11:0] SDC_BLOCKCOUNT_BCMASK(v)
- */
-#define SDC_BLOCKCOUNT_BCMASK(v)	FIELD((v), 12, 0)
-
-/*
- * Data Width 1bit SDC_TMR_WTH_1BIT
- */
-#define SDC_TMR_WTH_1BIT		FIELD(0, 1, 0)
-/*
- * Data Width 4bit SDC_TMR_WTH_4BIT
- */
-#define SDC_TMR_WTH_4BIT		FIELD(1, 1, 0)
-/*
- * Read SDC_TMR_DIR_READ
- */
-#define SDC_TMR_DIR_READ		FIELD(0, 1, 1)
-/*
- * Write SDC_TMR_DIR_WRITE
- */
-#define SDC_TMR_DIR_WRITE		FIELD(1, 1, 1)
-
-#define SDC_IR_MASK			FMASK(13, 0)
-#define SDC_IR_RESTIMEOUT		FIELD(1, 1, 0)
-#define SDC_IR_WRITECRC			FIELD(1, 1, 1)
-#define SDC_IR_READCRC			FIELD(1, 1, 2)
-#define SDC_IR_TXFIFOREAD		FIELD(1, 1, 3)
-#define SDC_IR_RXFIFOWRITE		FIELD(1, 1, 4)
-#define SDC_IR_READTIMEOUT		FIELD(1, 1, 5)
-#define SDC_IR_DATACOMPLETE		FIELD(1, 1, 6)
-#define SDC_IR_CMDCOMPLETE		FIELD(1, 1, 7)
-#define SDC_IR_RXFIFOFULL		FIELD(1, 1, 8)
-#define SDC_IR_RXFIFOEMPTY		FIELD(1, 1, 9)
-#define SDC_IR_TXFIFOFULL		FIELD(1, 1, 10)
-#define SDC_IR_TXFIFOEMPTY		FIELD(1, 1, 11)
-#define SDC_IR_ENDCMDWITHRES		FIELD(1, 1, 12)
diff --git a/arch/unicore32/include/mach/regs-spi.h b/arch/unicore32/include/mach/regs-spi.h
deleted file mode 100644
index 3460647a9c2a..000000000000
--- a/arch/unicore32/include/mach/regs-spi.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity Serial Peripheral Interface (SPI) Registers
- */
-/*
- * Control reg. 0 SPI_CR0
- */
-#define SPI_CR0		(PKUNITY_SPI_BASE + 0x0000)
-/*
- * Control reg. 1 SPI_CR1
- */
-#define SPI_CR1		(PKUNITY_SPI_BASE + 0x0004)
-/*
- * Enable reg SPI_SSIENR
- */
-#define SPI_SSIENR	(PKUNITY_SPI_BASE + 0x0008)
-/*
- * Status reg SPI_SR
- */
-#define SPI_SR		(PKUNITY_SPI_BASE + 0x0028)
-/*
- * Interrupt Mask reg SPI_IMR
- */
-#define SPI_IMR		(PKUNITY_SPI_BASE + 0x002C)
-/*
- * Interrupt Status reg SPI_ISR
- */
-#define SPI_ISR		(PKUNITY_SPI_BASE + 0x0030)
-
-/*
- * Enable SPI Controller SPI_SSIENR_EN
- */
-#define SPI_SSIENR_EN		FIELD(1, 1, 0)
-
-/*
- * SPI Busy SPI_SR_BUSY
- */
-#define SPI_SR_BUSY		FIELD(1, 1, 0)
-/*
- * Transmit FIFO Not Full SPI_SR_TFNF
- */
-#define SPI_SR_TFNF		FIELD(1, 1, 1)
-/*
- * Transmit FIFO Empty SPI_SR_TFE
- */
-#define SPI_SR_TFE		FIELD(1, 1, 2)
-/*
- * Receive FIFO Not Empty SPI_SR_RFNE
- */
-#define SPI_SR_RFNE		FIELD(1, 1, 3)
-/*
- * Receive FIFO Full SPI_SR_RFF
- */
-#define SPI_SR_RFF		FIELD(1, 1, 4)
-
-/*
- * Trans. FIFO Empty Interrupt Status SPI_ISR_TXEIS
- */
-#define SPI_ISR_TXEIS		FIELD(1, 1, 0)
-/*
- * Trans. FIFO Overflow Interrupt Status SPI_ISR_TXOIS
- */
-#define SPI_ISR_TXOIS		FIELD(1, 1, 1)
-/*
- * Receiv. FIFO Underflow Interrupt Status SPI_ISR_RXUIS
- */
-#define SPI_ISR_RXUIS		FIELD(1, 1, 2)
-/*
- * Receiv. FIFO Overflow Interrupt Status SPI_ISR_RXOIS
- */
-#define SPI_ISR_RXOIS		FIELD(1, 1, 3)
-/*
- * Receiv. FIFO Full Interrupt Status SPI_ISR_RXFIS
- */
-#define SPI_ISR_RXFIS		FIELD(1, 1, 4)
-#define SPI_ISR_MSTIS		FIELD(1, 1, 5)
-
-/*
- * Trans. FIFO Empty Interrupt Mask SPI_IMR_TXEIM
- */
-#define SPI_IMR_TXEIM		FIELD(1, 1, 0)
-/*
- * Trans. FIFO Overflow Interrupt Mask SPI_IMR_TXOIM
- */
-#define SPI_IMR_TXOIM		FIELD(1, 1, 1)
-/*
- * Receiv. FIFO Underflow Interrupt Mask SPI_IMR_RXUIM
- */
-#define SPI_IMR_RXUIM		FIELD(1, 1, 2)
-/*
- * Receiv. FIFO Overflow Interrupt Mask SPI_IMR_RXOIM
- */
-#define SPI_IMR_RXOIM		FIELD(1, 1, 3)
-/*
- * Receiv. FIFO Full Interrupt Mask SPI_IMR_RXFIM
- */
-#define SPI_IMR_RXFIM		FIELD(1, 1, 4)
-#define SPI_IMR_MSTIM		FIELD(1, 1, 5)
-
diff --git a/arch/unicore32/include/mach/regs-uart.h b/arch/unicore32/include/mach/regs-uart.h
deleted file mode 100644
index 9fa6b1938b77..000000000000
--- a/arch/unicore32/include/mach/regs-uart.h
+++ /dev/null
@@ -1,3 +0,0 @@
-/*
- * PKUnity Universal Asynchronous Receiver/Transmitter (UART) Registers
- */
diff --git a/arch/unicore32/include/mach/regs-umal.h b/arch/unicore32/include/mach/regs-umal.h
deleted file mode 100644
index 7023089c61c6..000000000000
--- a/arch/unicore32/include/mach/regs-umal.h
+++ /dev/null
@@ -1,230 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity Ultra Media Access Layer (UMAL) Ethernet MAC Registers
- */
-
-/* MAC module of UMAL */
-/* UMAL's MAC module includes G/MII interface, several additional PHY
- * interfaces, and MAC control sub-layer, which provides support for control
- * frames (e.g. PAUSE frames).
- */
-/*
- * TX/RX reset and control UMAL_CFG1
- */
-#define UMAL_CFG1		(PKUNITY_UMAL_BASE + 0x0000)
-/*
- * MAC interface mode control UMAL_CFG2
- */
-#define UMAL_CFG2		(PKUNITY_UMAL_BASE + 0x0004)
-/*
- * Inter Packet/Frame Gap UMAL_IPGIFG
- */
-#define UMAL_IPGIFG		(PKUNITY_UMAL_BASE + 0x0008)
-/*
- * Collision retry or backoff UMAL_HALFDUPLEX
- */
-#define UMAL_HALFDUPLEX		(PKUNITY_UMAL_BASE + 0x000c)
-/*
- * Maximum Frame Length UMAL_MAXFRAME
- */
-#define UMAL_MAXFRAME		(PKUNITY_UMAL_BASE + 0x0010)
-/*
- * Test Regsiter UMAL_TESTREG
- */
-#define UMAL_TESTREG		(PKUNITY_UMAL_BASE + 0x001c)
-/*
- * MII Management Configure UMAL_MIICFG
- */
-#define UMAL_MIICFG		(PKUNITY_UMAL_BASE + 0x0020)
-/*
- * MII Management Command UMAL_MIICMD
- */
-#define UMAL_MIICMD		(PKUNITY_UMAL_BASE + 0x0024)
-/*
- * MII Management Address UMAL_MIIADDR
- */
-#define UMAL_MIIADDR		(PKUNITY_UMAL_BASE + 0x0028)
-/*
- * MII Management Control UMAL_MIICTRL
- */
-#define UMAL_MIICTRL		(PKUNITY_UMAL_BASE + 0x002c)
-/*
- * MII Management Status UMAL_MIISTATUS
- */
-#define UMAL_MIISTATUS		(PKUNITY_UMAL_BASE + 0x0030)
-/*
- * MII Management Indicator UMAL_MIIIDCT
- */
-#define UMAL_MIIIDCT		(PKUNITY_UMAL_BASE + 0x0034)
-/*
- * Interface Control UMAL_IFCTRL
- */
-#define UMAL_IFCTRL		(PKUNITY_UMAL_BASE + 0x0038)
-/*
- * Interface Status UMAL_IFSTATUS
- */
-#define UMAL_IFSTATUS		(PKUNITY_UMAL_BASE + 0x003c)
-/*
- * MAC address (high 4 bytes) UMAL_STADDR1
- */
-#define UMAL_STADDR1		(PKUNITY_UMAL_BASE + 0x0040)
-/*
- * MAC address (low 2 bytes) UMAL_STADDR2
- */
-#define UMAL_STADDR2		(PKUNITY_UMAL_BASE + 0x0044)
-
-/* FIFO MODULE OF UMAL */
-/* UMAL's FIFO module provides data queuing for increased system level
- * throughput
- */
-#define UMAL_FIFOCFG0		(PKUNITY_UMAL_BASE + 0x0048)
-#define UMAL_FIFOCFG1		(PKUNITY_UMAL_BASE + 0x004c)
-#define UMAL_FIFOCFG2		(PKUNITY_UMAL_BASE + 0x0050)
-#define UMAL_FIFOCFG3		(PKUNITY_UMAL_BASE + 0x0054)
-#define UMAL_FIFOCFG4		(PKUNITY_UMAL_BASE + 0x0058)
-#define UMAL_FIFOCFG5		(PKUNITY_UMAL_BASE + 0x005c)
-#define UMAL_FIFORAM0		(PKUNITY_UMAL_BASE + 0x0060)
-#define UMAL_FIFORAM1		(PKUNITY_UMAL_BASE + 0x0064)
-#define UMAL_FIFORAM2		(PKUNITY_UMAL_BASE + 0x0068)
-#define UMAL_FIFORAM3		(PKUNITY_UMAL_BASE + 0x006c)
-#define UMAL_FIFORAM4		(PKUNITY_UMAL_BASE + 0x0070)
-#define UMAL_FIFORAM5		(PKUNITY_UMAL_BASE + 0x0074)
-#define UMAL_FIFORAM6		(PKUNITY_UMAL_BASE + 0x0078)
-#define UMAL_FIFORAM7		(PKUNITY_UMAL_BASE + 0x007c)
-
-/* MAHBE MODULE OF UMAL */
-/* UMAL's MAHBE module interfaces to the host system through 32-bit AHB Master
- * and Slave ports.Registers within the M-AHBE provide Control and Status
- * information concerning these transfers.
- */
-/*
- * Transmit Control UMAL_DMATxCtrl
- */
-#define UMAL_DMATxCtrl		(PKUNITY_UMAL_BASE + 0x0180)
-/*
- * Pointer to TX Descripter UMAL_DMATxDescriptor
- */
-#define UMAL_DMATxDescriptor	(PKUNITY_UMAL_BASE + 0x0184)
-/*
- * Status of Tx Packet Transfers UMAL_DMATxStatus
- */
-#define UMAL_DMATxStatus	(PKUNITY_UMAL_BASE + 0x0188)
-/*
- * Receive Control UMAL_DMARxCtrl
- */
-#define UMAL_DMARxCtrl		(PKUNITY_UMAL_BASE + 0x018c)
-/*
- * Pointer to Rx Descriptor UMAL_DMARxDescriptor
- */
-#define UMAL_DMARxDescriptor	(PKUNITY_UMAL_BASE + 0x0190)
-/*
- * Status of Rx Packet Transfers UMAL_DMARxStatus
- */
-#define UMAL_DMARxStatus	(PKUNITY_UMAL_BASE + 0x0194)
-/*
- * Interrupt Mask UMAL_DMAIntrMask
- */
-#define UMAL_DMAIntrMask	(PKUNITY_UMAL_BASE + 0x0198)
-/*
- * Interrupts, read only UMAL_DMAInterrupt
- */
-#define UMAL_DMAInterrupt	(PKUNITY_UMAL_BASE + 0x019c)
-
-/*
- * Commands for UMAL_CFG1 register
- */
-#define UMAL_CFG1_TXENABLE	FIELD(1, 1, 0)
-#define UMAL_CFG1_RXENABLE	FIELD(1, 1, 2)
-#define UMAL_CFG1_TXFLOWCTL	FIELD(1, 1, 4)
-#define UMAL_CFG1_RXFLOWCTL	FIELD(1, 1, 5)
-#define UMAL_CFG1_CONFLPBK	FIELD(1, 1, 8)
-#define UMAL_CFG1_RESET		FIELD(1, 1, 31)
-#define UMAL_CFG1_CONFFLCTL	(MAC_TX_FLOW_CTL | MAC_RX_FLOW_CTL)
-
-/*
- * Commands for UMAL_CFG2 register
- */
-#define UMAL_CFG2_FULLDUPLEX	FIELD(1, 1, 0)
-#define UMAL_CFG2_CRCENABLE	FIELD(1, 1, 1)
-#define UMAL_CFG2_PADCRC	FIELD(1, 1, 2)
-#define UMAL_CFG2_LENGTHCHECK	FIELD(1, 1, 4)
-#define UMAL_CFG2_MODEMASK	FMASK(2, 8)
-#define UMAL_CFG2_NIBBLEMODE	FIELD(1, 2, 8)
-#define UMAL_CFG2_BYTEMODE	FIELD(2, 2, 8)
-#define UMAL_CFG2_PREAMBLENMASK	FMASK(4, 12)
-#define UMAL_CFG2_DEFPREAMBLEN	FIELD(7, 4, 12)
-#define UMAL_CFG2_FD100		(UMAL_CFG2_DEFPREAMBLEN | UMAL_CFG2_NIBBLEMODE \
-				| UMAL_CFG2_LENGTHCHECK | UMAL_CFG2_PADCRC \
-				| UMAL_CFG2_CRCENABLE | UMAL_CFG2_FULLDUPLEX)
-#define UMAL_CFG2_FD1000	(UMAL_CFG2_DEFPREAMBLEN | UMAL_CFG2_BYTEMODE \
-				| UMAL_CFG2_LENGTHCHECK | UMAL_CFG2_PADCRC \
-				| UMAL_CFG2_CRCENABLE | UMAL_CFG2_FULLDUPLEX)
-#define UMAL_CFG2_HD100		(UMAL_CFG2_DEFPREAMBLEN | UMAL_CFG2_NIBBLEMODE \
-				| UMAL_CFG2_LENGTHCHECK | UMAL_CFG2_PADCRC \
-				| UMAL_CFG2_CRCENABLE)
-
-/*
- * Command for UMAL_IFCTRL register
- */
-#define UMAL_IFCTRL_RESET	FIELD(1, 1, 31)
-
-/*
- * Command for UMAL_MIICFG register
- */
-#define UMAL_MIICFG_RESET	FIELD(1, 1, 31)
-
-/*
- * Command for UMAL_MIICMD register
- */
-#define UMAL_MIICMD_READ	FIELD(1, 1, 0)
-
-/*
- * Command for UMAL_MIIIDCT register
- */
-#define UMAL_MIIIDCT_BUSY	FIELD(1, 1, 0)
-#define UMAL_MIIIDCT_NOTVALID	FIELD(1, 1, 2)
-
-/*
- * Commands for DMATxCtrl regesters
- */
-#define UMAL_DMA_Enable		FIELD(1, 1, 0)
-
-/*
- * Commands for DMARxCtrl regesters
- */
-#define UMAL_DMAIntrMask_ENABLEHALFWORD	FIELD(1, 1, 16)
-
-/*
- * Command for DMARxStatus
- */
-#define CLR_RX_BUS_ERR		FIELD(1, 1, 3)
-#define CLR_RX_OVERFLOW		FIELD(1, 1, 2)
-#define CLR_RX_PKT		FIELD(1, 1, 0)
-
-/*
- * Command for DMATxStatus
- */
-#define CLR_TX_BUS_ERR		FIELD(1, 1, 3)
-#define CLR_TX_UNDERRUN		FIELD(1, 1, 1)
-#define CLR_TX_PKT		FIELD(1, 1, 0)
-
-/*
- * Commands for DMAIntrMask and DMAInterrupt register
- */
-#define INT_RX_MASK		FIELD(0xd, 4, 4)
-#define INT_TX_MASK		FIELD(0xb, 4, 0)
-
-#define INT_RX_BUS_ERR		FIELD(1, 1, 7)
-#define INT_RX_OVERFLOW		FIELD(1, 1, 6)
-#define INT_RX_PKT		FIELD(1, 1, 4)
-#define INT_TX_BUS_ERR		FIELD(1, 1, 3)
-#define INT_TX_UNDERRUN		FIELD(1, 1, 1)
-#define INT_TX_PKT		FIELD(1, 1, 0)
-
-/*
- * MARCOS of UMAL's descriptors
- */
-#define UMAL_DESC_PACKETSIZE_EMPTY	FIELD(1, 1, 31)
-#define UMAL_DESC_PACKETSIZE_NONEMPTY	FIELD(0, 1, 31)
-#define UMAL_DESC_PACKETSIZE_SIZEMASK	FMASK(12, 0)
-
diff --git a/arch/unicore32/include/mach/regs-unigfx.h b/arch/unicore32/include/mach/regs-unigfx.h
deleted file mode 100644
index 553d1157c6b2..000000000000
--- a/arch/unicore32/include/mach/regs-unigfx.h
+++ /dev/null
@@ -1,201 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity UNIGFX Registers
- */
-
-#define UDE_BASE      (PKUNITY_UNIGFX_BASE + 0x1400)
-#define UGE_BASE      (PKUNITY_UNIGFX_BASE + 0x0000)
-
-/*
- * command reg for UNIGFX DE
- */
-/*
- * control reg UDE_CFG
- */
-#define UDE_CFG       (UDE_BASE + 0x0000)
-/*
- * framebuffer start address reg UDE_FSA
- */
-#define UDE_FSA       (UDE_BASE + 0x0004)
-/*
- * line size reg UDE_LS
- */
-#define UDE_LS        (UDE_BASE + 0x0008)
-/*
- * pitch size reg UDE_PS
- */
-#define UDE_PS        (UDE_BASE + 0x000C)
-/*
- * horizontal active time reg UDE_HAT
- */
-#define UDE_HAT       (UDE_BASE + 0x0010)
-/*
- * horizontal blank time reg UDE_HBT
- */
-#define UDE_HBT       (UDE_BASE + 0x0014)
-/*
- * horizontal sync time reg UDE_HST
- */
-#define UDE_HST       (UDE_BASE + 0x0018)
-/*
- * vertival active time reg UDE_VAT
- */
-#define UDE_VAT       (UDE_BASE + 0x001C)
-/*
- * vertival blank time reg UDE_VBT
- */
-#define UDE_VBT       (UDE_BASE + 0x0020)
-/*
- * vertival sync time reg UDE_VST
- */
-#define UDE_VST       (UDE_BASE + 0x0024)
-/*
- * cursor position UDE_CXY
- */
-#define UDE_CXY       (UDE_BASE + 0x0028)
-/*
- * cursor front color UDE_CC0
- */
-#define UDE_CC0       (UDE_BASE + 0x002C)
-/*
- * cursor background color UDE_CC1
- */
-#define UDE_CC1       (UDE_BASE + 0x0030)
-/*
- * video position UDE_VXY
- */
-#define UDE_VXY       (UDE_BASE + 0x0034)
-/*
- * video start address reg UDE_VSA
- */
-#define UDE_VSA       (UDE_BASE + 0x0040)
-/*
- * video size reg UDE_VS
- */
-#define UDE_VS        (UDE_BASE + 0x004C)
-
-/*
- * command reg for UNIGFX GE
- */
-/*
- * src xy reg UGE_SRCXY
- */
-#define UGE_SRCXY     (UGE_BASE + 0x0000)
-/*
- * dst xy reg UGE_DSTXY
- */
-#define UGE_DSTXY     (UGE_BASE + 0x0004)
-/*
- * pitch reg UGE_PITCH
- */
-#define UGE_PITCH     (UGE_BASE + 0x0008)
-/*
- * src start reg UGE_SRCSTART
- */
-#define UGE_SRCSTART  (UGE_BASE + 0x000C)
-/*
- * dst start reg UGE_DSTSTART
- */
-#define UGE_DSTSTART  (UGE_BASE + 0x0010)
-/*
- * width height reg UGE_WIDHEIGHT
- */
-#define UGE_WIDHEIGHT (UGE_BASE + 0x0014)
-/*
- * rop alpah reg UGE_ROPALPHA
- */
-#define UGE_ROPALPHA  (UGE_BASE + 0x0018)
-/*
- * front color UGE_FCOLOR
- */
-#define UGE_FCOLOR    (UGE_BASE + 0x001C)
-/*
- * background color UGE_BCOLOR
- */
-#define UGE_BCOLOR    (UGE_BASE + 0x0020)
-/*
- * src color key for high value UGE_SCH
- */
-#define UGE_SCH       (UGE_BASE + 0x0024)
-/*
- * dst color key for high value UGE_DCH
- */
-#define UGE_DCH       (UGE_BASE + 0x0028)
-/*
- * src color key for low value UGE_SCL
- */
-#define UGE_SCL       (UGE_BASE + 0x002C)
-/*
- * dst color key for low value UGE_DCL
- */
-#define UGE_DCL       (UGE_BASE + 0x0030)
-/*
- * clip 0 reg UGE_CLIP0
- */
-#define UGE_CLIP0     (UGE_BASE + 0x0034)
-/*
- * clip 1 reg UGE_CLIP1
- */
-#define UGE_CLIP1     (UGE_BASE + 0x0038)
-/*
- * command reg UGE_COMMAND
- */
-#define UGE_COMMAND   (UGE_BASE + 0x003C)
-/*
- * pattern 0 UGE_P0
- */
-#define UGE_P0        (UGE_BASE + 0x0040)
-#define UGE_P1        (UGE_BASE + 0x0044)
-#define UGE_P2        (UGE_BASE + 0x0048)
-#define UGE_P3        (UGE_BASE + 0x004C)
-#define UGE_P4        (UGE_BASE + 0x0050)
-#define UGE_P5        (UGE_BASE + 0x0054)
-#define UGE_P6        (UGE_BASE + 0x0058)
-#define UGE_P7        (UGE_BASE + 0x005C)
-#define UGE_P8        (UGE_BASE + 0x0060)
-#define UGE_P9        (UGE_BASE + 0x0064)
-#define UGE_P10       (UGE_BASE + 0x0068)
-#define UGE_P11       (UGE_BASE + 0x006C)
-#define UGE_P12       (UGE_BASE + 0x0070)
-#define UGE_P13       (UGE_BASE + 0x0074)
-#define UGE_P14       (UGE_BASE + 0x0078)
-#define UGE_P15       (UGE_BASE + 0x007C)
-#define UGE_P16       (UGE_BASE + 0x0080)
-#define UGE_P17       (UGE_BASE + 0x0084)
-#define UGE_P18       (UGE_BASE + 0x0088)
-#define UGE_P19       (UGE_BASE + 0x008C)
-#define UGE_P20       (UGE_BASE + 0x0090)
-#define UGE_P21       (UGE_BASE + 0x0094)
-#define UGE_P22       (UGE_BASE + 0x0098)
-#define UGE_P23       (UGE_BASE + 0x009C)
-#define UGE_P24       (UGE_BASE + 0x00A0)
-#define UGE_P25       (UGE_BASE + 0x00A4)
-#define UGE_P26       (UGE_BASE + 0x00A8)
-#define UGE_P27       (UGE_BASE + 0x00AC)
-#define UGE_P28       (UGE_BASE + 0x00B0)
-#define UGE_P29       (UGE_BASE + 0x00B4)
-#define UGE_P30       (UGE_BASE + 0x00B8)
-#define UGE_P31       (UGE_BASE + 0x00BC)
-
-#define UDE_CFG_DST_MASK	FMASK(2, 8)
-#define UDE_CFG_DST8            FIELD(0x0, 2, 8)
-#define UDE_CFG_DST16           FIELD(0x1, 2, 8)
-#define UDE_CFG_DST24           FIELD(0x2, 2, 8)
-#define UDE_CFG_DST32           FIELD(0x3, 2, 8)
-
-/*
- * GDEN enable UDE_CFG_GDEN_ENABLE
- */
-#define UDE_CFG_GDEN_ENABLE     FIELD(1, 1, 3)
-/*
- * VDEN enable UDE_CFG_VDEN_ENABLE
- */
-#define UDE_CFG_VDEN_ENABLE     FIELD(1, 1, 4)
-/*
- * CDEN enable UDE_CFG_CDEN_ENABLE
- */
-#define UDE_CFG_CDEN_ENABLE     FIELD(1, 1, 5)
-/*
- * TIMEUP enable UDE_CFG_TIMEUP_ENABLE
- */
-#define UDE_CFG_TIMEUP_ENABLE   FIELD(1, 1, 6)
diff --git a/arch/unicore32/include/mach/uncompress.h b/arch/unicore32/include/mach/uncompress.h
deleted file mode 100644
index 0c1a56a1913f..000000000000
--- a/arch/unicore32/include/mach/uncompress.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/mach/uncompress.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#ifndef __MACH_PUV3_UNCOMPRESS_H__
-#define __MACH_PUV3_UNCOMPRESS_H__
-
-#include <mach/hardware.h>
-#include <mach/ocd.h>
-
-extern char input_data[];
-extern char input_data_end[];
-
-static void arch_decomp_puts(const char *ptr)
-{
-	char c;
-
-	while ((c = *ptr++) != '\0') {
-		if (c == '\n')
-			putc('\r');
-		putc(c);
-	}
-}
-#define ARCH_HAVE_DECOMP_PUTS
-
-#endif /* __MACH_PUV3_UNCOMPRESS_H__ */
diff --git a/arch/unicore32/include/uapi/asm/Kbuild b/arch/unicore32/include/uapi/asm/Kbuild
deleted file mode 100644
index e78470141932..000000000000
--- a/arch/unicore32/include/uapi/asm/Kbuild
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-generic-y += ucontext.h
diff --git a/arch/unicore32/include/uapi/asm/byteorder.h b/arch/unicore32/include/uapi/asm/byteorder.h
deleted file mode 100644
index 864fe4814cf4..000000000000
--- a/arch/unicore32/include/uapi/asm/byteorder.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * linux/arch/unicore32/include/asm/byteorder.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * UniCore ONLY support Little Endian mode, the data bus is connected such
- * that byte accesses appear as:
- *  0 = d0...d7, 1 = d8...d15, 2 = d16...d23, 3 = d24...d31
- * and word accesses (data or instruction) appear as:
- *  d0...d31
- */
-#ifndef __UNICORE_BYTEORDER_H__
-#define __UNICORE_BYTEORDER_H__
-
-#include <linux/byteorder/little_endian.h>
-
-#endif
-
diff --git a/arch/unicore32/include/uapi/asm/ptrace.h b/arch/unicore32/include/uapi/asm/ptrace.h
deleted file mode 100644
index 2820de83e37d..000000000000
--- a/arch/unicore32/include/uapi/asm/ptrace.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * linux/arch/unicore32/include/asm/ptrace.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef _UAPI__UNICORE_PTRACE_H__
-#define _UAPI__UNICORE_PTRACE_H__
-
-#define PTRACE_GET_THREAD_AREA	22
-
-/*
- * PSR bits
- */
-#define USER_MODE	0x00000010
-#define REAL_MODE	0x00000011
-#define INTR_MODE	0x00000012
-#define PRIV_MODE	0x00000013
-#define ABRT_MODE	0x00000017
-#define EXTN_MODE	0x0000001b
-#define SUSR_MODE	0x0000001f
-#define MODE_MASK	0x0000001f
-#define PSR_R_BIT	0x00000040
-#define PSR_I_BIT	0x00000080
-#define PSR_V_BIT	0x10000000
-#define PSR_C_BIT	0x20000000
-#define PSR_Z_BIT	0x40000000
-#define PSR_S_BIT	0x80000000
-
-/*
- * Groups of PSR bits
- */
-#define PSR_f		0xff000000	/* Flags		*/
-#define PSR_c		0x000000ff	/* Control		*/
-
-#ifndef __ASSEMBLY__
-
-/*
- * This struct defines the way the registers are stored on the
- * stack during a system call.  Note that sizeof(struct pt_regs)
- * has to be a multiple of 8.
- */
-struct pt_regs {
-	unsigned long uregs[34];
-};
-
-#define UCreg_asr		uregs[32]
-#define UCreg_pc		uregs[31]
-#define UCreg_lr		uregs[30]
-#define UCreg_sp		uregs[29]
-#define UCreg_ip		uregs[28]
-#define UCreg_fp		uregs[27]
-#define UCreg_26		uregs[26]
-#define UCreg_25		uregs[25]
-#define UCreg_24		uregs[24]
-#define UCreg_23		uregs[23]
-#define UCreg_22		uregs[22]
-#define UCreg_21		uregs[21]
-#define UCreg_20		uregs[20]
-#define UCreg_19		uregs[19]
-#define UCreg_18		uregs[18]
-#define UCreg_17		uregs[17]
-#define UCreg_16		uregs[16]
-#define UCreg_15		uregs[15]
-#define UCreg_14		uregs[14]
-#define UCreg_13		uregs[13]
-#define UCreg_12		uregs[12]
-#define UCreg_11		uregs[11]
-#define UCreg_10		uregs[10]
-#define UCreg_09		uregs[9]
-#define UCreg_08		uregs[8]
-#define UCreg_07		uregs[7]
-#define UCreg_06		uregs[6]
-#define UCreg_05		uregs[5]
-#define UCreg_04		uregs[4]
-#define UCreg_03		uregs[3]
-#define UCreg_02		uregs[2]
-#define UCreg_01		uregs[1]
-#define UCreg_00		uregs[0]
-#define UCreg_ORIG_00		uregs[33]
-
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _UAPI__UNICORE_PTRACE_H__ */
diff --git a/arch/unicore32/include/uapi/asm/sigcontext.h b/arch/unicore32/include/uapi/asm/sigcontext.h
deleted file mode 100644
index 79e56f28e4b5..000000000000
--- a/arch/unicore32/include/uapi/asm/sigcontext.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * linux/arch/unicore32/include/asm/sigcontext.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __UNICORE_SIGCONTEXT_H__
-#define __UNICORE_SIGCONTEXT_H__
-
-#include <asm/ptrace.h>
-/*
- * Signal context structure - contains all info to do with the state
- * before the signal handler was invoked.  Note: only add new entries
- * to the end of the structure.
- */
-struct sigcontext {
-	unsigned long trap_no;
-	unsigned long error_code;
-	unsigned long oldmask;
-	unsigned long fault_address;
-	struct pt_regs regs;
-};
-
-#endif
diff --git a/arch/unicore32/include/uapi/asm/unistd.h b/arch/unicore32/include/uapi/asm/unistd.h
deleted file mode 100644
index 54a7378a70b1..000000000000
--- a/arch/unicore32/include/uapi/asm/unistd.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * linux/arch/unicore32/include/asm/unistd.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#define __ARCH_WANT_RENAMEAT
-#define __ARCH_WANT_SET_GET_RLIMIT
-#define __ARCH_WANT_STAT64
-#define __ARCH_WANT_TIME32_SYSCALLS
-
-/* Use the standard ABI for syscalls. */
-#include <asm-generic/unistd.h>
-#define __ARCH_WANT_SYS_CLONE
diff --git a/arch/unicore32/kernel/Makefile b/arch/unicore32/kernel/Makefile
deleted file mode 100644
index 2f79aa56735b..000000000000
--- a/arch/unicore32/kernel/Makefile
+++ /dev/null
@@ -1,31 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for the linux kernel.
-#
-
-# Object file lists.
-obj-y				:= dma.o elf.o entry.o process.o ptrace.o
-obj-y				+= setup.o signal.o sys.o stacktrace.o traps.o
-
-obj-$(CONFIG_MODULES)		+= ksyms.o module.o
-obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
-
-obj-$(CONFIG_UNICORE_FPU_F64)	+= fpu-ucf64.o
-
-# obj-y for architecture PKUnity v3
-obj-$(CONFIG_ARCH_PUV3)		+= clock.o irq.o time.o
-
-obj-$(CONFIG_PUV3_GPIO)		+= gpio.o
-obj-$(CONFIG_PUV3_PM)		+= pm.o sleep.o
-obj-$(CONFIG_HIBERNATION)	+= hibernate.o hibernate_asm.o
-
-obj-$(CONFIG_PCI)		+= pci.o
-
-# obj-y for specific machines
-obj-$(CONFIG_ARCH_PUV3)		+= puv3-core.o
-obj-$(CONFIG_PUV3_NB0916)	+= puv3-nb0916.o
-
-head-y				:= head.o
-obj-$(CONFIG_DEBUG_LL)		+= debug.o
-
-extra-y				:= $(head-y) vmlinux.lds
diff --git a/arch/unicore32/kernel/asm-offsets.c b/arch/unicore32/kernel/asm-offsets.c
deleted file mode 100644
index f7d672267549..000000000000
--- a/arch/unicore32/kernel/asm-offsets.c
+++ /dev/null
@@ -1,108 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/asm-offsets.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- * Generate definitions needed by assembly language modules.
- * This code generates raw asm output which is post-processed to extract
- * and format the required data.
- */
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/dma-mapping.h>
-#include <linux/kbuild.h>
-#include <linux/suspend.h>
-#include <linux/thread_info.h>
-#include <asm/memory.h>
-#include <asm/suspend.h>
-
-/*
- * GCC 3.0, 3.1: general bad code generation.
- * GCC 3.2.0: incorrect function argument offset calculation.
- * GCC 3.2.x: miscompiles NEW_AUX_ENT in fs/binfmt_elf.c
- *	(http://gcc.gnu.org/PR8896) and incorrect structure
- *		initialisation in fs/jffs2/erase.c
- */
-#if (__GNUC__ < 4)
-#error Your compiler should upgrade to uc4
-#error	Known good compilers: 4.2.2
-#endif
-
-int main(void)
-{
-	DEFINE(TSK_ACTIVE_MM,	offsetof(struct task_struct, active_mm));
-	BLANK();
-	DEFINE(TI_FLAGS,	offsetof(struct thread_info, flags));
-	DEFINE(TI_PREEMPT,	offsetof(struct thread_info, preempt_count));
-	DEFINE(TI_ADDR_LIMIT,	offsetof(struct thread_info, addr_limit));
-	DEFINE(TI_TASK,		offsetof(struct thread_info, task));
-	DEFINE(TI_CPU,		offsetof(struct thread_info, cpu));
-	DEFINE(TI_CPU_SAVE,	offsetof(struct thread_info, cpu_context));
-	DEFINE(TI_USED_CP,	offsetof(struct thread_info, used_cp));
-#ifdef CONFIG_UNICORE_FPU_F64
-	DEFINE(TI_FPSTATE,	offsetof(struct thread_info, fpstate));
-#endif
-	BLANK();
-	DEFINE(S_R0,		offsetof(struct pt_regs, UCreg_00));
-	DEFINE(S_R1,		offsetof(struct pt_regs, UCreg_01));
-	DEFINE(S_R2,		offsetof(struct pt_regs, UCreg_02));
-	DEFINE(S_R3,		offsetof(struct pt_regs, UCreg_03));
-	DEFINE(S_R4,		offsetof(struct pt_regs, UCreg_04));
-	DEFINE(S_R5,		offsetof(struct pt_regs, UCreg_05));
-	DEFINE(S_R6,		offsetof(struct pt_regs, UCreg_06));
-	DEFINE(S_R7,		offsetof(struct pt_regs, UCreg_07));
-	DEFINE(S_R8,		offsetof(struct pt_regs, UCreg_08));
-	DEFINE(S_R9,		offsetof(struct pt_regs, UCreg_09));
-	DEFINE(S_R10,		offsetof(struct pt_regs, UCreg_10));
-	DEFINE(S_R11,		offsetof(struct pt_regs, UCreg_11));
-	DEFINE(S_R12,		offsetof(struct pt_regs, UCreg_12));
-	DEFINE(S_R13,		offsetof(struct pt_regs, UCreg_13));
-	DEFINE(S_R14,		offsetof(struct pt_regs, UCreg_14));
-	DEFINE(S_R15,		offsetof(struct pt_regs, UCreg_15));
-	DEFINE(S_R16,		offsetof(struct pt_regs, UCreg_16));
-	DEFINE(S_R17,		offsetof(struct pt_regs, UCreg_17));
-	DEFINE(S_R18,		offsetof(struct pt_regs, UCreg_18));
-	DEFINE(S_R19,		offsetof(struct pt_regs, UCreg_19));
-	DEFINE(S_R20,		offsetof(struct pt_regs, UCreg_20));
-	DEFINE(S_R21,		offsetof(struct pt_regs, UCreg_21));
-	DEFINE(S_R22,		offsetof(struct pt_regs, UCreg_22));
-	DEFINE(S_R23,		offsetof(struct pt_regs, UCreg_23));
-	DEFINE(S_R24,		offsetof(struct pt_regs, UCreg_24));
-	DEFINE(S_R25,		offsetof(struct pt_regs, UCreg_25));
-	DEFINE(S_R26,		offsetof(struct pt_regs, UCreg_26));
-	DEFINE(S_FP,		offsetof(struct pt_regs, UCreg_fp));
-	DEFINE(S_IP,		offsetof(struct pt_regs, UCreg_ip));
-	DEFINE(S_SP,		offsetof(struct pt_regs, UCreg_sp));
-	DEFINE(S_LR,		offsetof(struct pt_regs, UCreg_lr));
-	DEFINE(S_PC,		offsetof(struct pt_regs, UCreg_pc));
-	DEFINE(S_PSR,		offsetof(struct pt_regs, UCreg_asr));
-	DEFINE(S_OLD_R0,	offsetof(struct pt_regs, UCreg_ORIG_00));
-	DEFINE(S_FRAME_SIZE,	sizeof(struct pt_regs));
-	BLANK();
-	DEFINE(VMA_VM_MM,	offsetof(struct vm_area_struct, vm_mm));
-	DEFINE(VMA_VM_FLAGS,	offsetof(struct vm_area_struct, vm_flags));
-	BLANK();
-	DEFINE(VM_EXEC,		VM_EXEC);
-	BLANK();
-	DEFINE(PAGE_SZ,		PAGE_SIZE);
-	BLANK();
-	DEFINE(SYS_ERROR0,	0x9f0000);
-	BLANK();
-	DEFINE(PBE_ADDRESS,		offsetof(struct pbe, address));
-	DEFINE(PBE_ORIN_ADDRESS,	offsetof(struct pbe, orig_address));
-	DEFINE(PBE_NEXT,		offsetof(struct pbe, next));
-	DEFINE(SWSUSP_CPU,		offsetof(struct swsusp_arch_regs, \
-							cpu_context));
-#ifdef	CONFIG_UNICORE_FPU_F64
-	DEFINE(SWSUSP_FPSTATE,		offsetof(struct swsusp_arch_regs, \
-							fpstate));
-#endif
-	BLANK();
-	DEFINE(DMA_BIDIRECTIONAL,	DMA_BIDIRECTIONAL);
-	DEFINE(DMA_TO_DEVICE,		DMA_TO_DEVICE);
-	DEFINE(DMA_FROM_DEVICE,		DMA_FROM_DEVICE);
-	return 0;
-}
diff --git a/arch/unicore32/kernel/clock.c b/arch/unicore32/kernel/clock.c
deleted file mode 100644
index 41df6be0a3b2..000000000000
--- a/arch/unicore32/kernel/clock.c
+++ /dev/null
@@ -1,387 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/clock.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/device.h>
-#include <linux/list.h>
-#include <linux/errno.h>
-#include <linux/err.h>
-#include <linux/string.h>
-#include <linux/clk.h>
-#include <linux/mutex.h>
-#include <linux/delay.h>
-#include <linux/io.h>
-
-#include <mach/hardware.h>
-
-/*
- * Very simple clock implementation
- */
-struct clk {
-	struct list_head	node;
-	unsigned long		rate;
-	const char		*name;
-};
-
-static struct clk clk_ost_clk = {
-	.name		= "OST_CLK",
-	.rate		= CLOCK_TICK_RATE,
-};
-
-static struct clk clk_mclk_clk = {
-	.name		= "MAIN_CLK",
-};
-
-static struct clk clk_bclk32_clk = {
-	.name		= "BUS32_CLK",
-};
-
-static struct clk clk_ddr_clk = {
-	.name		= "DDR_CLK",
-};
-
-static struct clk clk_vga_clk = {
-	.name		= "VGA_CLK",
-};
-
-static LIST_HEAD(clocks);
-static DEFINE_MUTEX(clocks_mutex);
-
-struct clk *clk_get(struct device *dev, const char *id)
-{
-	struct clk *p, *clk = ERR_PTR(-ENOENT);
-
-	mutex_lock(&clocks_mutex);
-	list_for_each_entry(p, &clocks, node) {
-		if (strcmp(id, p->name) == 0) {
-			clk = p;
-			break;
-		}
-	}
-	mutex_unlock(&clocks_mutex);
-
-	return clk;
-}
-EXPORT_SYMBOL(clk_get);
-
-void clk_put(struct clk *clk)
-{
-}
-EXPORT_SYMBOL(clk_put);
-
-int clk_enable(struct clk *clk)
-{
-	return 0;
-}
-EXPORT_SYMBOL(clk_enable);
-
-void clk_disable(struct clk *clk)
-{
-}
-EXPORT_SYMBOL(clk_disable);
-
-unsigned long clk_get_rate(struct clk *clk)
-{
-	return clk->rate;
-}
-EXPORT_SYMBOL(clk_get_rate);
-
-struct {
-	unsigned long rate;
-	unsigned long cfg;
-	unsigned long div;
-} vga_clk_table[] = {
-	{.rate =  25175000, .cfg = 0x00002001, .div = 0x9},
-	{.rate =  31500000, .cfg = 0x00002001, .div = 0x7},
-	{.rate =  40000000, .cfg = 0x00003801, .div = 0x9},
-	{.rate =  49500000, .cfg = 0x00003801, .div = 0x7},
-	{.rate =  65000000, .cfg = 0x00002c01, .div = 0x4},
-	{.rate =  78750000, .cfg = 0x00002400, .div = 0x7},
-	{.rate = 108000000, .cfg = 0x00002c01, .div = 0x2},
-	{.rate = 106500000, .cfg = 0x00003c01, .div = 0x3},
-	{.rate =  50650000, .cfg = 0x00106400, .div = 0x9},
-	{.rate =  61500000, .cfg = 0x00106400, .div = 0xa},
-	{.rate =  85500000, .cfg = 0x00002800, .div = 0x6},
-};
-
-struct {
-	unsigned long mrate;
-	unsigned long prate;
-} mclk_clk_table[] = {
-	{.mrate = 500000000, .prate = 0x00109801},
-	{.mrate = 525000000, .prate = 0x00104C00},
-	{.mrate = 550000000, .prate = 0x00105000},
-	{.mrate = 575000000, .prate = 0x00105400},
-	{.mrate = 600000000, .prate = 0x00105800},
-	{.mrate = 625000000, .prate = 0x00105C00},
-	{.mrate = 650000000, .prate = 0x00106000},
-	{.mrate = 675000000, .prate = 0x00106400},
-	{.mrate = 700000000, .prate = 0x00106800},
-	{.mrate = 725000000, .prate = 0x00106C00},
-	{.mrate = 750000000, .prate = 0x00107000},
-	{.mrate = 775000000, .prate = 0x00107400},
-	{.mrate = 800000000, .prate = 0x00107800},
-};
-
-int clk_set_rate(struct clk *clk, unsigned long rate)
-{
-	if (clk == &clk_vga_clk) {
-		unsigned long pll_vgacfg, pll_vgadiv;
-		int ret, i;
-
-		/* lookup vga_clk_table */
-		ret = -EINVAL;
-		for (i = 0; i < ARRAY_SIZE(vga_clk_table); i++) {
-			if (rate == vga_clk_table[i].rate) {
-				pll_vgacfg = vga_clk_table[i].cfg;
-				pll_vgadiv = vga_clk_table[i].div;
-				ret = 0;
-				break;
-			}
-		}
-
-		if (ret)
-			return ret;
-
-		if (readl(PM_PLLVGACFG) == pll_vgacfg)
-			return 0;
-
-		/* set pll vga cfg reg. */
-		writel(pll_vgacfg, PM_PLLVGACFG);
-
-		writel(PM_PMCR_CFBVGA, PM_PMCR);
-		while ((readl(PM_PLLDFCDONE) & PM_PLLDFCDONE_VGADFC)
-				!= PM_PLLDFCDONE_VGADFC)
-			udelay(100); /* about 1ms */
-
-		/* set div cfg reg. */
-		writel(readl(PM_PCGR) | PM_PCGR_VGACLK, PM_PCGR);
-
-		writel((readl(PM_DIVCFG) & ~PM_DIVCFG_VGACLK_MASK)
-				| PM_DIVCFG_VGACLK(pll_vgadiv), PM_DIVCFG);
-
-		writel(readl(PM_SWRESET) | PM_SWRESET_VGADIV, PM_SWRESET);
-		while ((readl(PM_SWRESET) & PM_SWRESET_VGADIV)
-				== PM_SWRESET_VGADIV)
-			udelay(100); /* 65536 bclk32, about 320us */
-
-		writel(readl(PM_PCGR) & ~PM_PCGR_VGACLK, PM_PCGR);
-	}
-#ifdef CONFIG_CPU_FREQ
-	if (clk == &clk_mclk_clk) {
-		u32 pll_rate, divstatus = readl(PM_DIVSTATUS);
-		int ret, i;
-
-		/* lookup mclk_clk_table */
-		ret = -EINVAL;
-		for (i = 0; i < ARRAY_SIZE(mclk_clk_table); i++) {
-			if (rate == mclk_clk_table[i].mrate) {
-				pll_rate = mclk_clk_table[i].prate;
-				clk_mclk_clk.rate = mclk_clk_table[i].mrate;
-				ret = 0;
-				break;
-			}
-		}
-
-		if (ret)
-			return ret;
-
-		if (clk_mclk_clk.rate)
-			clk_bclk32_clk.rate = clk_mclk_clk.rate
-				/ (((divstatus & 0x0000f000) >> 12) + 1);
-
-		/* set pll sys cfg reg. */
-		writel(pll_rate, PM_PLLSYSCFG);
-
-		writel(PM_PMCR_CFBSYS, PM_PMCR);
-		while ((readl(PM_PLLDFCDONE) & PM_PLLDFCDONE_SYSDFC)
-				!= PM_PLLDFCDONE_SYSDFC)
-			udelay(100);
-			/* about 1ms */
-	}
-#endif
-	return 0;
-}
-EXPORT_SYMBOL(clk_set_rate);
-
-int clk_register(struct clk *clk)
-{
-	mutex_lock(&clocks_mutex);
-	list_add(&clk->node, &clocks);
-	mutex_unlock(&clocks_mutex);
-	printk(KERN_DEFAULT "PKUnity PM: %s %lu.%02luM\n", clk->name,
-		(clk->rate)/1000000, (clk->rate)/10000 % 100);
-	return 0;
-}
-EXPORT_SYMBOL(clk_register);
-
-void clk_unregister(struct clk *clk)
-{
-	mutex_lock(&clocks_mutex);
-	list_del(&clk->node);
-	mutex_unlock(&clocks_mutex);
-}
-EXPORT_SYMBOL(clk_unregister);
-
-struct {
-	unsigned long prate;
-	unsigned long rate;
-} pllrate_table[] = {
-	{.prate = 0x00002001, .rate = 250000000},
-	{.prate = 0x00104801, .rate = 250000000},
-	{.prate = 0x00104C01, .rate = 262500000},
-	{.prate = 0x00002401, .rate = 275000000},
-	{.prate = 0x00105001, .rate = 275000000},
-	{.prate = 0x00105401, .rate = 287500000},
-	{.prate = 0x00002801, .rate = 300000000},
-	{.prate = 0x00105801, .rate = 300000000},
-	{.prate = 0x00105C01, .rate = 312500000},
-	{.prate = 0x00002C01, .rate = 325000000},
-	{.prate = 0x00106001, .rate = 325000000},
-	{.prate = 0x00106401, .rate = 337500000},
-	{.prate = 0x00003001, .rate = 350000000},
-	{.prate = 0x00106801, .rate = 350000000},
-	{.prate = 0x00106C01, .rate = 362500000},
-	{.prate = 0x00003401, .rate = 375000000},
-	{.prate = 0x00107001, .rate = 375000000},
-	{.prate = 0x00107401, .rate = 387500000},
-	{.prate = 0x00003801, .rate = 400000000},
-	{.prate = 0x00107801, .rate = 400000000},
-	{.prate = 0x00107C01, .rate = 412500000},
-	{.prate = 0x00003C01, .rate = 425000000},
-	{.prate = 0x00108001, .rate = 425000000},
-	{.prate = 0x00108401, .rate = 437500000},
-	{.prate = 0x00004001, .rate = 450000000},
-	{.prate = 0x00108801, .rate = 450000000},
-	{.prate = 0x00108C01, .rate = 462500000},
-	{.prate = 0x00004401, .rate = 475000000},
-	{.prate = 0x00109001, .rate = 475000000},
-	{.prate = 0x00109401, .rate = 487500000},
-	{.prate = 0x00004801, .rate = 500000000},
-	{.prate = 0x00109801, .rate = 500000000},
-	{.prate = 0x00104C00, .rate = 525000000},
-	{.prate = 0x00002400, .rate = 550000000},
-	{.prate = 0x00105000, .rate = 550000000},
-	{.prate = 0x00105400, .rate = 575000000},
-	{.prate = 0x00002800, .rate = 600000000},
-	{.prate = 0x00105800, .rate = 600000000},
-	{.prate = 0x00105C00, .rate = 625000000},
-	{.prate = 0x00002C00, .rate = 650000000},
-	{.prate = 0x00106000, .rate = 650000000},
-	{.prate = 0x00106400, .rate = 675000000},
-	{.prate = 0x00003000, .rate = 700000000},
-	{.prate = 0x00106800, .rate = 700000000},
-	{.prate = 0x00106C00, .rate = 725000000},
-	{.prate = 0x00003400, .rate = 750000000},
-	{.prate = 0x00107000, .rate = 750000000},
-	{.prate = 0x00107400, .rate = 775000000},
-	{.prate = 0x00003800, .rate = 800000000},
-	{.prate = 0x00107800, .rate = 800000000},
-	{.prate = 0x00107C00, .rate = 825000000},
-	{.prate = 0x00003C00, .rate = 850000000},
-	{.prate = 0x00108000, .rate = 850000000},
-	{.prate = 0x00108400, .rate = 875000000},
-	{.prate = 0x00004000, .rate = 900000000},
-	{.prate = 0x00108800, .rate = 900000000},
-	{.prate = 0x00108C00, .rate = 925000000},
-	{.prate = 0x00004400, .rate = 950000000},
-	{.prate = 0x00109000, .rate = 950000000},
-	{.prate = 0x00109400, .rate = 975000000},
-	{.prate = 0x00004800, .rate = 1000000000},
-	{.prate = 0x00109800, .rate = 1000000000},
-};
-
-struct {
-	unsigned long prate;
-	unsigned long drate;
-} pddr_table[] = {
-	{.prate = 0x00100800, .drate = 44236800},
-	{.prate = 0x00100C00, .drate = 66355200},
-	{.prate = 0x00101000, .drate = 88473600},
-	{.prate = 0x00101400, .drate = 110592000},
-	{.prate = 0x00101800, .drate = 132710400},
-	{.prate = 0x00101C01, .drate = 154828800},
-	{.prate = 0x00102001, .drate = 176947200},
-	{.prate = 0x00102401, .drate = 199065600},
-	{.prate = 0x00102801, .drate = 221184000},
-	{.prate = 0x00102C01, .drate = 243302400},
-	{.prate = 0x00103001, .drate = 265420800},
-	{.prate = 0x00103401, .drate = 287539200},
-	{.prate = 0x00103801, .drate = 309657600},
-	{.prate = 0x00103C01, .drate = 331776000},
-	{.prate = 0x00104001, .drate = 353894400},
-};
-
-static int __init clk_init(void)
-{
-#ifdef CONFIG_PUV3_PM
-	u32 pllrate, divstatus = readl(PM_DIVSTATUS);
-	u32 pcgr_val = readl(PM_PCGR);
-	int i;
-
-	pcgr_val |= PM_PCGR_BCLKMME | PM_PCGR_BCLKH264E | PM_PCGR_BCLKH264D
-			| PM_PCGR_HECLK | PM_PCGR_HDCLK;
-	writel(pcgr_val, PM_PCGR);
-
-	pllrate = readl(PM_PLLSYSSTATUS);
-
-	/* lookup pmclk_table */
-	clk_mclk_clk.rate = 0;
-	for (i = 0; i < ARRAY_SIZE(pllrate_table); i++) {
-		if (pllrate == pllrate_table[i].prate) {
-			clk_mclk_clk.rate = pllrate_table[i].rate;
-			break;
-		}
-	}
-
-	if (clk_mclk_clk.rate)
-		clk_bclk32_clk.rate = clk_mclk_clk.rate /
-			(((divstatus & 0x0000f000) >> 12) + 1);
-
-	pllrate = readl(PM_PLLDDRSTATUS);
-
-	/* lookup pddr_table */
-	clk_ddr_clk.rate = 0;
-	for (i = 0; i < ARRAY_SIZE(pddr_table); i++) {
-		if (pllrate == pddr_table[i].prate) {
-			clk_ddr_clk.rate = pddr_table[i].drate;
-			break;
-		}
-	}
-
-	pllrate = readl(PM_PLLVGASTATUS);
-
-	/* lookup pvga_table */
-	clk_vga_clk.rate = 0;
-	for (i = 0; i < ARRAY_SIZE(pllrate_table); i++) {
-		if (pllrate == pllrate_table[i].prate) {
-			clk_vga_clk.rate = pllrate_table[i].rate;
-			break;
-		}
-	}
-
-	if (clk_vga_clk.rate)
-		clk_vga_clk.rate = clk_vga_clk.rate /
-			(((divstatus & 0x00f00000) >> 20) + 1);
-
-	clk_register(&clk_vga_clk);
-#endif
-#ifdef CONFIG_ARCH_FPGA
-	clk_ddr_clk.rate = 33000000;
-	clk_mclk_clk.rate = 33000000;
-	clk_bclk32_clk.rate = 33000000;
-#endif
-	clk_register(&clk_ddr_clk);
-	clk_register(&clk_mclk_clk);
-	clk_register(&clk_bclk32_clk);
-	clk_register(&clk_ost_clk);
-	return 0;
-}
-core_initcall(clk_init);
diff --git a/arch/unicore32/kernel/debug-macro.S b/arch/unicore32/kernel/debug-macro.S
deleted file mode 100644
index 7e2da0de4f71..000000000000
--- a/arch/unicore32/kernel/debug-macro.S
+++ /dev/null
@@ -1,86 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/kernel/debug-macro.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- * Debugging macro include header
- */
-#include <generated/asm-offsets.h>
-#include <mach/hardware.h>
-
-		.macro	put_word_ocd, rd, rx=r16
-1001:		movc		\rx, p1.c0, #0
-		cand.a	\rx, #2
-		bne	1001b
-		movc		p1.c1, \rd, #1
-		.endm
-
-#ifdef CONFIG_DEBUG_OCD
-		/* debug using UniCore On-Chip-Debugger */
-		.macro	addruart, rx
-		.endm
-
-		.macro	senduart, rd, rx
-		put_word_ocd	\rd, \rx
-		.endm
-
-		.macro	busyuart, rd, rx
-		.endm
-
-		.macro	waituart, rd, rx
-		.endm
-#else
-#define UART_CLK_DEFAULT        3686400 * 20
-	/* Uartclk = MCLK/ 2, The MCLK on my board is 3686400 * 40  */
-#define BAUD_RATE_DEFAULT	115200
-	/* The baud rate of the serial port */
-
-#define UART_DIVISOR_DEFAULT	(UART_CLK_DEFAULT \
-				/ (16 * BAUD_RATE_DEFAULT) - 1)
-
-		.macro	addruart,rx
-		mrc	p0, #0, \rx, c1, c0
-		tst	\rx, #1			@ MMU enabled?
-		moveq	\rx, #0xee000000	@ physical base address
-		movne	\rx, #0x6e000000	@ virtual address
-
-		@ We probe for the active serial port here
-		@ However, now we assume UART0 is active:	epip4d
-		@ We assume r1 and r2 can be clobbered.
-
-		movl 	r2, #UART_DIVISOR_DEFAULT
-		mov 	r1, #0x80
-		str	r1, [\rx, #UART_LCR_OFFSET]
-		and	r1, r2, #0xff00
-		mov	r1, r1, lsr #8
-		str	r1, [\rx, #UART_DLH_OFFSET]
-		and	r1, r2, #0xff
-		str	r1, [\rx, #UART_DLL_OFFSET]
-		mov 	r1, #0x7
-		str	r1, [\rx, #UART_FCR_OFFSET]
-		mov 	r1, #0x3
-		str	r1, [\rx, #UART_LCR_OFFSET]
-		mov 	r1, #0x0
-		str	r1, [\rx, #UART_IER_OFFSET]
-		.endm
-
-		.macro	senduart,rd,rx
-		str	\rd, [\rx, #UART_THR_OFFSET]
-		.endm
-
-		.macro	waituart,rd,rx
-1001:		ldr	\rd, [\rx, #UART_LSR_OFFSET]
-		tst	\rd, #UART_LSR_THRE
-		beq	1001b
-		.endm
-
-		.macro	busyuart,rd,rx
-1001:		ldr	\rd, [\rx, #UART_LSR_OFFSET]
-		tst	\rd, #UART_LSR_TEMT
-		bne	1001b
-		.endm
-#endif
-
diff --git a/arch/unicore32/kernel/debug.S b/arch/unicore32/kernel/debug.S
deleted file mode 100644
index 13bc8c8550e4..000000000000
--- a/arch/unicore32/kernel/debug.S
+++ /dev/null
@@ -1,82 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/kernel/debug.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- *  32-bit debugging code
- */
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-		.text
-
-/*
- * Some debugging routines (useful if you've got MM problems and
- * printk isn't working).  For DEBUGGING ONLY!!!  Do not leave
- * references to these in a production kernel!
- */
-#include "debug-macro.S"
-
-/*
- * Useful debugging routines
- */
-ENTRY(printhex8)
-		mov	r1, #8
-		b	printhex
-ENDPROC(printhex8)
-
-ENTRY(printhex4)
-		mov	r1, #4
-		b	printhex
-ENDPROC(printhex4)
-
-ENTRY(printhex2)
-		mov	r1, #2
-printhex:	adr	r2, hexbuf
-		add	r3, r2, r1
-		mov	r1, #0
-		stb	r1, [r3]
-1:		and	r1, r0, #15
-		mov	r0, r0 >> #4
-		csub.a	r1, #10
-		beg	2f
-		add	r1, r1, #'0' - 'a' + 10
-2:		add	r1, r1, #'a' - 10
-		stb.w	r1, [r3+], #-1
-		cxor.a	r3, r2
-		bne	1b
-		mov	r0, r2
-		b	printascii
-ENDPROC(printhex2)
-
-		.ltorg
-
-ENTRY(printascii)
-		addruart r3
-		b	2f
-1:		waituart r2, r3
-		senduart r1, r3
-		busyuart r2, r3
-		cxor.a	r1, #'\n'
-		cmoveq	r1, #'\r'
-		beq	1b
-2:		cxor.a	r0, #0
-		beq	3f
-		ldb.w	r1, [r0]+, #1
-		cxor.a	r1, #0
-		bne	1b
-3:		mov	pc, lr
-ENDPROC(printascii)
-
-ENTRY(printch)
-		addruart r3
-		mov	r1, r0
-		mov	r0, #0
-		b	1b
-ENDPROC(printch)
-
-hexbuf:		.space 16
-
diff --git a/arch/unicore32/kernel/dma.c b/arch/unicore32/kernel/dma.c
deleted file mode 100644
index 7a0e2d4d6077..000000000000
--- a/arch/unicore32/kernel/dma.c
+++ /dev/null
@@ -1,179 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/dma.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/errno.h>
-#include <linux/io.h>
-
-#include <asm/irq.h>
-#include <mach/hardware.h>
-#include <mach/dma.h>
-
-struct dma_channel {
-	char *name;
-	puv3_dma_prio prio;
-	void (*irq_handler)(int, void *);
-	void (*err_handler)(int, void *);
-	void *data;
-};
-
-static struct dma_channel dma_channels[MAX_DMA_CHANNELS];
-
-int puv3_request_dma(char *name, puv3_dma_prio prio,
-			 void (*irq_handler)(int, void *),
-			 void (*err_handler)(int, void *),
-			 void *data)
-{
-	unsigned long flags;
-	int i, found = 0;
-
-	/* basic sanity checks */
-	if (!name)
-		return -EINVAL;
-
-	local_irq_save(flags);
-
-	do {
-		/* try grabbing a DMA channel with the requested priority */
-		for (i = 0; i < MAX_DMA_CHANNELS; i++) {
-			if ((dma_channels[i].prio == prio) &&
-			    !dma_channels[i].name) {
-				found = 1;
-				break;
-			}
-		}
-		/* if requested prio group is full, try a hier priority */
-	} while (!found && prio--);
-
-	if (found) {
-		dma_channels[i].name = name;
-		dma_channels[i].irq_handler = irq_handler;
-		dma_channels[i].err_handler = err_handler;
-		dma_channels[i].data = data;
-	} else {
-		printk(KERN_WARNING "No more available DMA channels for %s\n",
-				name);
-		i = -ENODEV;
-	}
-
-	local_irq_restore(flags);
-	return i;
-}
-EXPORT_SYMBOL(puv3_request_dma);
-
-void puv3_free_dma(int dma_ch)
-{
-	unsigned long flags;
-
-	if (!dma_channels[dma_ch].name) {
-		printk(KERN_CRIT
-			"%s: trying to free channel %d which is already freed\n",
-			__func__, dma_ch);
-		return;
-	}
-
-	local_irq_save(flags);
-	dma_channels[dma_ch].name = NULL;
-	dma_channels[dma_ch].err_handler = NULL;
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL(puv3_free_dma);
-
-static irqreturn_t dma_irq_handler(int irq, void *dev_id)
-{
-	int i, dint;
-
-	dint = readl(DMAC_ITCSR);
-	for (i = 0; i < MAX_DMA_CHANNELS; i++) {
-		if (dint & DMAC_CHANNEL(i)) {
-			struct dma_channel *channel = &dma_channels[i];
-
-			/* Clear TC interrupt of channel i */
-			writel(DMAC_CHANNEL(i), DMAC_ITCCR);
-			writel(0, DMAC_ITCCR);
-
-			if (channel->name && channel->irq_handler) {
-				channel->irq_handler(i, channel->data);
-			} else {
-				/*
-				 * IRQ for an unregistered DMA channel:
-				 * let's clear the interrupts and disable it.
-				 */
-				printk(KERN_WARNING "spurious IRQ for"
-						" DMA channel %d\n", i);
-			}
-		}
-	}
-	return IRQ_HANDLED;
-}
-
-static irqreturn_t dma_err_handler(int irq, void *dev_id)
-{
-	int i, dint;
-
-	dint = readl(DMAC_IESR);
-	for (i = 0; i < MAX_DMA_CHANNELS; i++) {
-		if (dint & DMAC_CHANNEL(i)) {
-			struct dma_channel *channel = &dma_channels[i];
-
-			/* Clear Err interrupt of channel i */
-			writel(DMAC_CHANNEL(i), DMAC_IECR);
-			writel(0, DMAC_IECR);
-
-			if (channel->name && channel->err_handler) {
-				channel->err_handler(i, channel->data);
-			} else {
-				/*
-				 * IRQ for an unregistered DMA channel:
-				 * let's clear the interrupts and disable it.
-				 */
-				printk(KERN_WARNING "spurious IRQ for"
-						" DMA channel %d\n", i);
-			}
-		}
-	}
-	return IRQ_HANDLED;
-}
-
-int __init puv3_init_dma(void)
-{
-	int i, ret;
-
-	/* dma channel priorities on v8 processors:
-	 * ch 0 - 1  <--> (0) DMA_PRIO_HIGH
-	 * ch 2 - 3  <--> (1) DMA_PRIO_MEDIUM
-	 * ch 4 - 5  <--> (2) DMA_PRIO_LOW
-	 */
-	for (i = 0; i < MAX_DMA_CHANNELS; i++) {
-		puv3_stop_dma(i);
-		dma_channels[i].name = NULL;
-		dma_channels[i].prio = min((i & 0x7) >> 1, DMA_PRIO_LOW);
-	}
-
-	ret = request_irq(IRQ_DMA, dma_irq_handler, 0, "DMA", NULL);
-	if (ret) {
-		printk(KERN_CRIT "Can't register IRQ for DMA\n");
-		return ret;
-	}
-
-	ret = request_irq(IRQ_DMAERR, dma_err_handler, 0, "DMAERR", NULL);
-	if (ret) {
-		printk(KERN_CRIT "Can't register IRQ for DMAERR\n");
-		free_irq(IRQ_DMA, "DMA");
-		return ret;
-	}
-
-	return 0;
-}
-
-postcore_initcall(puv3_init_dma);
diff --git a/arch/unicore32/kernel/early_printk.c b/arch/unicore32/kernel/early_printk.c
deleted file mode 100644
index c00b6712b8f7..000000000000
--- a/arch/unicore32/kernel/early_printk.c
+++ /dev/null
@@ -1,46 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/early_printk.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/console.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <mach/ocd.h>
-
-/* On-Chip-Debugger functions */
-
-static void early_ocd_write(struct console *con, const char *s, unsigned n)
-{
-	while (*s && n-- > 0) {
-		if (*s == '\n')
-			ocd_putc((int)'\r');
-		ocd_putc((int)*s);
-		s++;
-	}
-}
-
-static struct console early_ocd_console = {
-	.name =		"earlyocd",
-	.write =	early_ocd_write,
-	.flags =	CON_PRINTBUFFER,
-	.index =	-1,
-};
-
-static int __init setup_early_printk(char *buf)
-{
-	if (!buf || early_console)
-		return 0;
-
-	early_console = &early_ocd_console;
-	if (strstr(buf, "keep"))
-		early_console->flags &= ~CON_BOOT;
-	else
-		early_console->flags |= CON_BOOT;
-	register_console(early_console);
-	return 0;
-}
-early_param("earlyprintk", setup_early_printk);
diff --git a/arch/unicore32/kernel/elf.c b/arch/unicore32/kernel/elf.c
deleted file mode 100644
index 22adc65a03e9..000000000000
--- a/arch/unicore32/kernel/elf.c
+++ /dev/null
@@ -1,35 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/elf.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/personality.h>
-#include <linux/binfmts.h>
-#include <linux/elf.h>
-
-int elf_check_arch(const struct elf32_hdr *x)
-{
-	/* Make sure it's an UniCore executable */
-	if (x->e_machine != EM_UNICORE)
-		return 0;
-
-	/* Make sure the entry address is reasonable */
-	if (x->e_entry & 3)
-		return 0;
-
-	return 1;
-}
-EXPORT_SYMBOL(elf_check_arch);
-
-void elf_set_personality(const struct elf32_hdr *x)
-{
-	unsigned int personality = PER_LINUX;
-
-	set_personality(personality);
-}
-EXPORT_SYMBOL(elf_set_personality);
diff --git a/arch/unicore32/kernel/entry.S b/arch/unicore32/kernel/entry.S
deleted file mode 100644
index b35dc83069cb..000000000000
--- a/arch/unicore32/kernel/entry.S
+++ /dev/null
@@ -1,802 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/kernel/entry.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- *  Low-level vector interface routines
- */
-#include <linux/init.h>
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <asm/errno.h>
-#include <asm/thread_info.h>
-#include <asm/memory.h>
-#include <asm/unistd.h>
-#include <generated/asm-offsets.h>
-#include "debug-macro.S"
-
-@
-@ Most of the stack format comes from struct pt_regs, but with
-@ the addition of 8 bytes for storing syscall args 5 and 6.
-@
-#define S_OFF		8
-
-/*
- * The SWI code relies on the fact that R0 is at the bottom of the stack
- * (due to slow/fast restore user regs).
- */
-#if S_R0 != 0
-#error "Please fix"
-#endif
-
-	.macro	zero_fp
-#ifdef CONFIG_FRAME_POINTER
-	mov	fp, #0
-#endif
-	.endm
-
-	.macro	alignment_trap, rtemp
-#ifdef CONFIG_ALIGNMENT_TRAP
-	ldw	\rtemp, .LCcralign
-	ldw	\rtemp, [\rtemp]
-	movc	p0.c1, \rtemp, #0
-#endif
-	.endm
-
-	.macro	load_user_sp_lr, rd, rtemp, offset = 0
-	mov	\rtemp, asr
-	xor	\rtemp, \rtemp, #(PRIV_MODE ^ SUSR_MODE)
-	mov.a	asr, \rtemp			@ switch to the SUSR mode
-
-	ldw	sp, [\rd+], #\offset		@ load sp_user
-	ldw	lr, [\rd+], #\offset + 4	@ load lr_user
-
-	xor	\rtemp, \rtemp, #(PRIV_MODE ^ SUSR_MODE)
-	mov.a	asr, \rtemp			@ switch back to the PRIV mode
-	.endm
-
-	.macro	priv_exit, rpsr
-	mov.a	bsr, \rpsr
-	ldm.w	(r0 - r15), [sp]+
-	ldm.b	(r16 - pc), [sp]+		@ load r0 - pc, asr
-	.endm
-
-	.macro	restore_user_regs, fast = 0, offset = 0
-	ldw	r1, [sp+], #\offset + S_PSR	@ get calling asr
-	ldw	lr, [sp+], #\offset + S_PC	@ get pc
-	mov.a	bsr, r1				@ save in bsr_priv
-	.if	\fast
-	add	sp, sp, #\offset + S_R1		@ r0 is syscall return value
-	ldm.w	(r1 - r15), [sp]+		@ get calling r1 - r15
-	ldur	(r16 - lr), [sp]+		@ get calling r16 - lr
-	.else
-	ldm.w	(r0 - r15), [sp]+		@ get calling r0 - r15
-	ldur	(r16 - lr), [sp]+		@ get calling r16 - lr
-	.endif
-	nop
-	add	sp, sp, #S_FRAME_SIZE - S_R16
-	mov.a	pc, lr				@ return
-						@ and move bsr_priv into asr
-	.endm
-
-	.macro	get_thread_info, rd
-	mov	\rd, sp >> #13
-	mov	\rd, \rd << #13
-	.endm
-
-	.macro	get_irqnr_and_base, irqnr, irqstat, base, tmp
-	ldw	\base, =(PKUNITY_INTC_BASE)
-	ldw	\irqstat, [\base+], #0xC	@ INTC_ICIP
-	ldw	\tmp,	  [\base+], #0x4	@ INTC_ICMR
-	and.a	\irqstat, \irqstat, \tmp
-	beq	1001f
-	cntlz	\irqnr, \irqstat
-	rsub	\irqnr, \irqnr, #31
-1001:	/* EQ will be set if no irqs pending */
-	.endm
-
-#ifdef CONFIG_DEBUG_LL
-	.macro	printreg, reg, temp
-		adr	\temp, 901f
-		stm	(r0-r3), [\temp]+
-		stw	lr, [\temp+], #0x10
-		mov	r0, \reg
-		b.l	printhex8
-		mov	r0, #':'
-		b.l	printch
-		mov	r0, pc
-		b.l	printhex8
-		adr	r0, 902f
-		b.l	printascii
-		adr	\temp, 901f
-		ldm	(r0-r3), [\temp]+
-		ldw	lr, [\temp+], #0x10
-		b	903f
-901:	.word	0, 0, 0, 0, 0	@ r0-r3, lr
-902:	.asciz	": epip4d\n"
-	.align
-903:
-	.endm
-#endif
-
-/*
- * These are the registers used in the syscall handler, and allow us to
- * have in theory up to 7 arguments to a function - r0 to r6.
- *
- * Note that tbl == why is intentional.
- *
- * We must set at least "tsk" and "why" when calling ret_with_reschedule.
- */
-scno	.req	r21		@ syscall number
-tbl	.req	r22		@ syscall table pointer
-why	.req	r22		@ Linux syscall (!= 0)
-tsk	.req	r23		@ current thread_info
-
-/*
- * Interrupt handling.  Preserves r17, r18, r19
- */
-	.macro	intr_handler
-1:	get_irqnr_and_base r0, r6, r5, lr
-	beq	2f
-	mov	r1, sp
-	@
-	@ routine called with r0 = irq number, r1 = struct pt_regs *
-	@
-	adr	lr, 1b
-	b	asm_do_IRQ
-2:
-	.endm
-
-/*
- * PRIV mode handlers
- */
-	.macro	priv_entry
-	sub	sp, sp, #(S_FRAME_SIZE - 4)
-	stm	(r1 - r15), [sp]+
-	add	r5, sp, #S_R15
-	stm	(r16 - r28), [r5]+
-
-	ldm	(r1 - r3), [r0]+
-	add	r5, sp, #S_SP - 4	@ here for interlock avoidance
-	mov	r4, #-1			@  ""  ""      ""       ""
-	add	r0, sp, #(S_FRAME_SIZE - 4)
-	stw.w	r1, [sp+], #-4		@ save the "real" r0 copied
-					@ from the exception stack
-
-	mov	r1, lr
-
-	@
-	@ We are now ready to fill in the remaining blanks on the stack:
-	@
-	@  r0 - sp_priv
-	@  r1 - lr_priv
-	@  r2 - lr_<exception>, already fixed up for correct return/restart
-	@  r3 - bsr_<exception>
-	@  r4 - orig_r0 (see pt_regs definition in ptrace.h)
-	@
-	stm	(r0 - r4), [r5]+
-	.endm
-
-/*
- * User mode handlers
- *
- */
-	.macro	user_entry
-	sub	sp, sp, #S_FRAME_SIZE
-	stm	(r1 - r15), [sp+]
-	add	r4, sp, #S_R16
-	stm	(r16 - r28), [r4]+
-
-	ldm	(r1 - r3), [r0]+
-	add	r0, sp, #S_PC		@ here for interlock avoidance
-	mov	r4, #-1			@  ""  ""     ""        ""
-
-	stw	r1, [sp]		@ save the "real" r0 copied
-					@ from the exception stack
-
-	@
-	@ We are now ready to fill in the remaining blanks on the stack:
-	@
-	@  r2 - lr_<exception>, already fixed up for correct return/restart
-	@  r3 - bsr_<exception>
-	@  r4 - orig_r0 (see pt_regs definition in ptrace.h)
-	@
-	@ Also, separately save sp_user and lr_user
-	@
-	stm	(r2 - r4), [r0]+
-	stur	(sp, lr), [r0-]
-
-	@
-	@ Enable the alignment trap while in kernel mode
-	@
-	alignment_trap r0
-
-	@
-	@ Clear FP to mark the first stack frame
-	@
-	zero_fp
-	.endm
-
-	.text
-
-@
-@ __invalid - generic code for failed exception
-@			(re-entrant version of handlers)
-@
-__invalid:
-	sub	sp, sp, #S_FRAME_SIZE
-	stm	(r1 - r15), [sp+]
-	add	r1, sp, #S_R16
-	stm	(r16 - r28, sp, lr), [r1]+
-
-	zero_fp
-
-	ldm	(r4 - r6), [r0]+
-	add	r0, sp, #S_PC		@ here for interlock avoidance
-	mov	r7, #-1			@  ""   ""    ""        ""
-	stw	r4, [sp]		@ save preserved r0
-	stm	(r5 - r7), [r0]+	@ lr_<exception>,
-					@ asr_<exception>, "old_r0"
-
-	mov	r0, sp
-	mov	r1, asr
-	b	bad_mode
-ENDPROC(__invalid)
-
-	.align	5
-__dabt_priv:
-	priv_entry
-
-	@
-	@ get ready to re-enable interrupts if appropriate
-	@
-	mov	r17, asr
-	cand.a	r3, #PSR_I_BIT
-	bne	1f
-	andn	r17, r17, #PSR_I_BIT
-1:
-
-	@
-	@ Call the processor-specific abort handler:
-	@
-	@  r2 - aborted context pc
-	@  r3 - aborted context asr
-	@
-	@ The abort handler must return the aborted address in r0, and
-	@ the fault status register in r1.
-	@
-	movc	r1, p0.c3, #0		@ get FSR
-	movc	r0, p0.c4, #0		@ get FAR
-
-	@
-	@ set desired INTR state, then call main handler
-	@
-	mov.a	asr, r17
-	mov	r2, sp
-	b.l	do_DataAbort
-
-	@
-	@ INTRs off again before pulling preserved data off the stack
-	@
-	disable_irq r0
-
-	@
-	@ restore BSR and restart the instruction
-	@
-	ldw	r2, [sp+], #S_PSR
-	priv_exit r2				@ return from exception
-ENDPROC(__dabt_priv)
-
-	.align	5
-__intr_priv:
-	priv_entry
-
-	intr_handler
-
-	mov	r0, #0				@ epip4d
-	movc	p0.c5, r0, #14
-	nop; nop; nop; nop; nop; nop; nop; nop
-
-	ldw	r4, [sp+], #S_PSR		@ irqs are already disabled
-
-	priv_exit r4				@ return from exception
-ENDPROC(__intr_priv)
-
-	.ltorg
-
-	.align	5
-__extn_priv:
-	priv_entry
-
-	mov	r0, sp				@ struct pt_regs *regs
-	mov	r1, asr
-	b	bad_mode			@ not supported
-ENDPROC(__extn_priv)
-
-	.align	5
-__pabt_priv:
-	priv_entry
-
-	@
-	@ re-enable interrupts if appropriate
-	@
-	mov	r17, asr
-	cand.a	r3, #PSR_I_BIT
-	bne	1f
-	andn	r17, r17, #PSR_I_BIT
-1:
-
-	@
-	@ set args, then call main handler
-	@
-	@  r0 - address of faulting instruction
-	@  r1 - pointer to registers on stack
-	@
-	mov	r0, r2			@ pass address of aborted instruction
-	mov	r1, #5
-	mov.a	asr, r17
-	mov	r2, sp			@ regs
-	b.l	do_PrefetchAbort	@ call abort handler
-
-	@
-	@ INTRs off again before pulling preserved data off the stack
-	@
-	disable_irq r0
-
-	@
-	@ restore BSR and restart the instruction
-	@
-	ldw	r2, [sp+], #S_PSR
-	priv_exit r2			@ return from exception
-ENDPROC(__pabt_priv)
-
-	.align	5
-.LCcralign:
-	.word	cr_alignment
-
-	.align	5
-__dabt_user:
-	user_entry
-
-#ifdef CONFIG_UNICORE_FPU_F64
-	cff	ip, s31
-	cand.a	ip, #0x08000000		@ FPU execption traps?
-	beq	209f
-
-	ldw	ip, [sp+], #S_PC
-	add	ip, ip, #4
-	stw	ip, [sp+], #S_PC
-	@
-	@ fall through to the emulation code, which returns using r19 if
-	@ it has emulated the instruction, or the more conventional lr
-	@ if we are to treat this as a real extended instruction
-	@
-	@  r0 - instruction
-	@
-1:	ldw.u	r0, [r2]
-	adr	r19, ret_from_exception
-	adr	lr, 209f
-	@
-	@ fallthrough to call do_uc_f64
-	@
-/*
- * Check whether the instruction is a co-processor instruction.
- * If yes, we need to call the relevant co-processor handler.
- *
- * Note that we don't do a full check here for the co-processor
- * instructions; all instructions with bit 27 set are well
- * defined.  The only instructions that should fault are the
- * co-processor instructions.
- *
- * Emulators may wish to make use of the following registers:
- *  r0  = instruction opcode.
- *  r2  = PC
- *  r19 = normal "successful" return address
- *  r20 = this threads thread_info structure.
- *  lr  = unrecognised instruction return address
- */
-	get_thread_info r20			@ get current thread
-	and	r8, r0, #0x00003c00		@ mask out CP number
-	mov	r7, #1
-	stb	r7, [r20+], #TI_USED_CP + 2	@ set appropriate used_cp[]
-
-	@ F64 hardware support entry point.
-	@  r0  = faulted instruction
-	@  r19 = return address
-	@  r20 = fp_state
-	enable_irq r4
-	add	r20, r20, #TI_FPSTATE	@ r20 = workspace
-	cff	r1, s31			@ get fpu FPSCR
-	andn    r2, r1, #0x08000000
-	ctf     r2, s31			@ clear 27 bit
-	mov	r2, sp			@ nothing stacked - regdump is at TOS
-	mov	lr, r19			@ setup for a return to the user code
-
-	@ Now call the C code to package up the bounce to the support code
-	@   r0 holds the trigger instruction
-	@   r1 holds the FPSCR value
-	@   r2 pointer to register dump
-	b	ucf64_exchandler
-209:
-#endif
-	@
-	@ Call the processor-specific abort handler:
-	@
-	@  r2 - aborted context pc
-	@  r3 - aborted context asr
-	@
-	@ The abort handler must return the aborted address in r0, and
-	@ the fault status register in r1.
-	@
-	movc	r1, p0.c3, #0		@ get FSR
-	movc	r0, p0.c4, #0		@ get FAR
-
-	@
-	@ INTRs on, then call the main handler
-	@
-	enable_irq r2
-	mov	r2, sp
-	adr	lr, ret_from_exception
-	b	do_DataAbort
-ENDPROC(__dabt_user)
-
-	.align	5
-__intr_user:
-	user_entry
-
-	get_thread_info tsk
-
-	intr_handler
-
-	mov	why, #0
-	b	ret_to_user
-ENDPROC(__intr_user)
-
-	.ltorg
-
-	.align	5
-__extn_user:
-	user_entry
-
-	mov	r0, sp
-	mov	r1, asr
-	b	bad_mode
-ENDPROC(__extn_user)
-
-	.align	5
-__pabt_user:
-	user_entry
-
-	mov	r0, r2			@ pass address of aborted instruction.
-	mov	r1, #5
-	enable_irq r1			@ Enable interrupts
-	mov	r2, sp			@ regs
-	b.l	do_PrefetchAbort	@ call abort handler
-	/* fall through */
-/*
- * This is the return code to user mode for abort handlers
- */
-ENTRY(ret_from_exception)
-	get_thread_info tsk
-	mov	why, #0
-	b	ret_to_user
-ENDPROC(__pabt_user)
-ENDPROC(ret_from_exception)
-
-/*
- * Register switch for UniCore V2 processors
- * r0 = previous task_struct, r1 = previous thread_info, r2 = next thread_info
- * previous and next are guaranteed not to be the same.
- */
-ENTRY(__switch_to)
-	add	ip, r1, #TI_CPU_SAVE
-	stm.w	(r4 - r15), [ip]+
-	stm.w	(r16 - r27, sp, lr), [ip]+
-
-#ifdef	CONFIG_UNICORE_FPU_F64
-	add	ip, r1, #TI_FPSTATE
-	sfm.w	(f0  - f7 ), [ip]+
-	sfm.w	(f8  - f15), [ip]+
-	sfm.w	(f16 - f23), [ip]+
-	sfm.w	(f24 - f31), [ip]+
-	cff	r4, s31
-	stw	r4, [ip]
-
-	add	ip, r2, #TI_FPSTATE
-	lfm.w	(f0  - f7 ), [ip]+
-	lfm.w	(f8  - f15), [ip]+
-	lfm.w	(f16 - f23), [ip]+
-	lfm.w	(f24 - f31), [ip]+
-	ldw	r4, [ip]
-	ctf	r4, s31
-#endif
-	add	ip, r2, #TI_CPU_SAVE
-	ldm.w	(r4 - r15), [ip]+
-	ldm	(r16 - r27, sp, pc), [ip]+	@ Load all regs saved previously
-ENDPROC(__switch_to)
-
-	.align	5
-/*
- * This is the fast syscall return path.  We do as little as
- * possible here, and this includes saving r0 back into the PRIV
- * stack.
- */
-ret_fast_syscall:
-	disable_irq r1				@ disable interrupts
-	ldw	r1, [tsk+], #TI_FLAGS
-	cand.a	r1, #_TIF_WORK_MASK
-	bne	fast_work_pending
-
-	@ fast_restore_user_regs
-	restore_user_regs fast = 1, offset = S_OFF
-
-/*
- * Ok, we need to do extra processing, enter the slow path.
- */
-fast_work_pending:
-	stw.w	r0, [sp+], #S_R0+S_OFF		@ returned r0
-work_pending:
-	cand.a	r1, #_TIF_NEED_RESCHED
-	bne	work_resched
-	mov	r0, sp				@ 'regs'
-	mov	r2, why				@ 'syscall'
-	cand.a	r1, #_TIF_SIGPENDING		@ delivering a signal?
-	cmovne	why, #0				@ prevent further restarts
-	b.l	do_notify_resume
-	b	ret_slow_syscall		@ Check work again
-
-work_resched:
-	b.l	schedule
-/*
- * "slow" syscall return path.  "why" tells us if this was a real syscall.
- */
-ENTRY(ret_to_user)
-ret_slow_syscall:
-	disable_irq r1				@ disable interrupts
-	get_thread_info tsk			@ epip4d, one path error?!
-	ldw	r1, [tsk+], #TI_FLAGS
-	cand.a	r1, #_TIF_WORK_MASK
-	bne	work_pending
-no_work_pending:
-	@ slow_restore_user_regs
-	restore_user_regs fast = 0, offset = 0
-ENDPROC(ret_to_user)
-
-/*
- * This is how we return from a fork.
- */
-ENTRY(ret_from_fork)
-	b.l	schedule_tail
-	b	ret_slow_syscall
-ENDPROC(ret_from_fork)
-
-ENTRY(ret_from_kernel_thread)
-	b.l	schedule_tail
-	mov	r0, r5
-	adr	lr, ret_slow_syscall
-	mov	pc, r4
-ENDPROC(ret_from_kernel_thread)
-
-/*=============================================================================
- * SWI handler
- *-----------------------------------------------------------------------------
- */
-	.align	5
-ENTRY(vector_swi)
-	sub	sp, sp, #S_FRAME_SIZE
-	stm	(r0 - r15), [sp]+		@ Calling r0 - r15
-	add	r8, sp, #S_R16
-	stm	(r16 - r28), [r8]+		@ Calling r16 - r28
-	add	r8, sp, #S_PC
-	stur	(sp, lr), [r8-]			@ Calling sp, lr
-	mov	r8, bsr				@ called from non-REAL mode
-	stw	lr, [sp+], #S_PC		@ Save calling PC
-	stw	r8, [sp+], #S_PSR		@ Save ASR
-	stw	r0, [sp+], #S_OLD_R0		@ Save OLD_R0
-	zero_fp
-
-	/*
-	 * Get the system call number.
-	 */
-	sub	ip, lr, #4
-	ldw.u	scno, [ip]			@ get SWI instruction
-
-#ifdef CONFIG_ALIGNMENT_TRAP
-	ldw	ip, __cr_alignment
-	ldw	ip, [ip]
-	movc	p0.c1, ip, #0                   @ update control register
-#endif
-	enable_irq ip
-
-	get_thread_info tsk
-	ldw	tbl, =sys_call_table		@ load syscall table pointer
-
-	andn	scno, scno, #0xff000000		@ mask off SWI op-code
-	andn	scno, scno, #0x00ff0000		@ mask off SWI op-code
-
-	stm.w	(r4, r5), [sp-]			@ push fifth and sixth args
-	ldw	ip, [tsk+], #TI_FLAGS		@ check for syscall tracing
-	cand.a	ip, #_TIF_SYSCALL_TRACE		@ are we tracing syscalls?
-	bne	__sys_trace
-
-	csub.a	scno, #__NR_syscalls		@ check upper syscall limit
-	adr	lr, ret_fast_syscall		@ return address
-	bea	1f
-	ldw	pc, [tbl+], scno << #2		@ call sys_* routine
-1:
-	add	r1, sp, #S_OFF
-2:	mov	why, #0				@ no longer a real syscall
-	b	sys_ni_syscall			@ not private func
-
-	/*
-	 * This is the really slow path.  We're going to be doing
-	 * context switches, and waiting for our parent to respond.
-	 */
-__sys_trace:
-	mov	r2, scno
-	add	r1, sp, #S_OFF
-	mov	r0, #0				@ trace entry [IP = 0]
-	b.l	syscall_trace
-
-	adr	lr, __sys_trace_return		@ return address
-	mov	scno, r0			@ syscall number (possibly new)
-	add	r1, sp, #S_R0 + S_OFF		@ pointer to regs
-	csub.a	scno, #__NR_syscalls		@ check upper syscall limit
-	bea	2b
-	ldm	(r0 - r3), [r1]+		@ have to reload r0 - r3
-	ldw	pc, [tbl+], scno << #2		@ call sys_* routine
-
-__sys_trace_return:
-	stw.w	r0, [sp+], #S_R0 + S_OFF	@ save returned r0
-	mov	r2, scno
-	mov	r1, sp
-	mov	r0, #1				@ trace exit [IP = 1]
-	b.l	syscall_trace
-	b	ret_slow_syscall
-
-	.align	5
-#ifdef CONFIG_ALIGNMENT_TRAP
-	.type	__cr_alignment, #object
-__cr_alignment:
-	.word	cr_alignment
-#endif
-	.ltorg
-
-ENTRY(sys_rt_sigreturn)
-		add	r0, sp, #S_OFF
-		mov	why, #0		@ prevent syscall restart handling
-		b	__sys_rt_sigreturn
-ENDPROC(sys_rt_sigreturn)
-
-	__INIT
-
-/*
- * Vector stubs.
- *
- * This code is copied to 0xffff0200 so we can use branches in the
- * vectors, rather than ldr's.  Note that this code must not
- * exceed 0x300 bytes.
- *
- * Common stub entry macro:
- *   Enter in INTR mode, bsr = PRIV/USER ASR, lr = PRIV/USER PC
- *
- * SP points to a minimal amount of processor-private memory, the address
- * of which is copied into r0 for the mode specific abort handler.
- */
-	.macro	vector_stub, name, mode
-	.align	5
-
-vector_\name:
-	@
-	@ Save r0, lr_<exception> (parent PC) and bsr_<exception>
-	@ (parent ASR)
-	@
-	stw	r0, [sp]
-	stw	lr, [sp+], #4		@ save r0, lr
-	mov	lr, bsr
-	stw	lr, [sp+], #8		@ save bsr
-
-	@
-	@ Prepare for PRIV mode.  INTRs remain disabled.
-	@
-	mov	r0, asr
-	xor	r0, r0, #(\mode ^ PRIV_MODE)
-	mov.a	bsr, r0
-
-	@
-	@ the branch table must immediately follow this code
-	@
-	and	lr, lr, #0x03
-	add	lr, lr, #1
-	mov	r0, sp
-	ldw	lr, [pc+], lr << #2
-	mov.a	pc, lr			@ branch to handler in PRIV mode
-ENDPROC(vector_\name)
-	.align	2
-	@ handler addresses follow this label
-	.endm
-
-	.globl	__stubs_start
-__stubs_start:
-/*
- * Interrupt dispatcher
- */
-	vector_stub	intr, INTR_MODE
-
-	.long	__intr_user			@  0  (USER)
-	.long	__invalid			@  1
-	.long	__invalid			@  2
-	.long	__intr_priv			@  3  (PRIV)
-
-/*
- * Data abort dispatcher
- * Enter in ABT mode, bsr = USER ASR, lr = USER PC
- */
-	vector_stub	dabt, ABRT_MODE
-
-	.long	__dabt_user			@  0  (USER)
-	.long	__invalid			@  1
-	.long	__invalid			@  2  (INTR)
-	.long	__dabt_priv			@  3  (PRIV)
-
-/*
- * Prefetch abort dispatcher
- * Enter in ABT mode, bsr = USER ASR, lr = USER PC
- */
-	vector_stub	pabt, ABRT_MODE
-
-	.long	__pabt_user			@  0 (USER)
-	.long	__invalid			@  1
-	.long	__invalid			@  2 (INTR)
-	.long	__pabt_priv			@  3 (PRIV)
-
-/*
- * Undef instr entry dispatcher
- * Enter in EXTN mode, bsr = PRIV/USER ASR, lr = PRIV/USER PC
- */
-	vector_stub	extn, EXTN_MODE
-
-	.long	__extn_user			@  0 (USER)
-	.long	__invalid			@  1
-	.long	__invalid			@  2 (INTR)
-	.long	__extn_priv			@  3 (PRIV)
-
-/*
- * We group all the following data together to optimise
- * for CPUs with separate I & D caches.
- */
-	.align	5
-
-.LCvswi:
-	.word	vector_swi
-
-	.globl	__stubs_end
-__stubs_end:
-
-	.equ	stubs_offset, __vectors_start + 0x200 - __stubs_start
-
-	.globl	__vectors_start
-__vectors_start:
-	jepriv	SYS_ERROR0
-	b	vector_extn + stubs_offset
-	ldw	pc, .LCvswi + stubs_offset
-	b	vector_pabt + stubs_offset
-	b	vector_dabt + stubs_offset
-	jepriv	SYS_ERROR0
-	b	vector_intr + stubs_offset
-	jepriv	SYS_ERROR0
-
-	.globl	__vectors_end
-__vectors_end:
-
-	.data
-
-	.globl	cr_alignment
-	.globl	cr_no_alignment
-cr_alignment:
-	.space	4
-cr_no_alignment:
-	.space	4
diff --git a/arch/unicore32/kernel/fpu-ucf64.c b/arch/unicore32/kernel/fpu-ucf64.c
deleted file mode 100644
index 85f0af29d29b..000000000000
--- a/arch/unicore32/kernel/fpu-ucf64.c
+++ /dev/null
@@ -1,117 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/fpu-ucf64.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/signal.h>
-#include <linux/sched/signal.h>
-#include <linux/init.h>
-
-#include <asm/fpu-ucf64.h>
-
-/*
- * A special flag to tell the normalisation code not to normalise.
- */
-#define F64_NAN_FLAG	0x100
-
-/*
- * A bit pattern used to indicate the initial (unset) value of the
- * exception mask, in case nothing handles an instruction.  This
- * doesn't include the NAN flag, which get masked out before
- * we check for an error.
- */
-#define F64_EXCEPTION_ERROR	((u32)-1 & ~F64_NAN_FLAG)
-
-/*
- * Since we aren't building with -mfpu=f64, we need to code
- * these instructions using their MRC/MCR equivalents.
- */
-#define f64reg(_f64_) #_f64_
-
-#define cff(_f64_) ({			\
-	u32 __v;			\
-	asm("cff %0, " f64reg(_f64_) "@ fmrx	%0, " #_f64_	\
-	    : "=r" (__v) : : "cc");	\
-	__v;				\
-	})
-
-#define ctf(_f64_, _var_)		\
-	asm("ctf %0, " f64reg(_f64_) "@ fmxr	" #_f64_ ", %0"	\
-	   : : "r" (_var_) : "cc")
-
-/*
- * Raise a SIGFPE for the current process.
- * sicode describes the signal being raised.
- */
-void ucf64_raise_sigfpe(struct pt_regs *regs)
-{
-	/*
-	 * This is the same as NWFPE, because it's not clear what
-	 * this is used for
-	 */
-	current->thread.error_code = 0;
-	current->thread.trap_no = 6;
-
-	send_sig_fault(SIGFPE, FPE_FLTUNK,
-		       (void __user *)(instruction_pointer(regs) - 4),
-		       current);
-}
-
-/*
- * Handle exceptions of UniCore-F64.
- */
-void ucf64_exchandler(u32 inst, u32 fpexc, struct pt_regs *regs)
-{
-	u32 tmp = fpexc;
-	u32 exc = F64_EXCEPTION_ERROR & fpexc;
-
-	pr_debug("UniCore-F64: instruction %08x fpscr %08x\n",
-			inst, fpexc);
-
-	if (exc & FPSCR_CMPINSTR_BIT) {
-		if (exc & FPSCR_CON)
-			tmp |= FPSCR_CON;
-		else
-			tmp &= ~(FPSCR_CON);
-		exc &= ~(FPSCR_CMPINSTR_BIT | FPSCR_CON);
-	} else {
-		pr_debug("UniCore-F64 Error: unhandled exceptions\n");
-		pr_debug("UniCore-F64 FPSCR 0x%08x INST 0x%08x\n",
-				cff(FPSCR), inst);
-
-		ucf64_raise_sigfpe(regs);
-		return;
-	}
-
-	/*
-	 * Update the FPSCR with the additional exception flags.
-	 * Comparison instructions always return at least one of
-	 * these flags set.
-	 */
-	tmp &= ~(FPSCR_TRAP | FPSCR_IOS | FPSCR_OFS | FPSCR_UFS |
-			FPSCR_IXS | FPSCR_HIS | FPSCR_IOC | FPSCR_OFC |
-			FPSCR_UFC | FPSCR_IXC | FPSCR_HIC);
-
-	tmp |= exc;
-	ctf(FPSCR, tmp);
-}
-
-/*
- * F64 support code initialisation.
- */
-static int __init ucf64_init(void)
-{
-	ctf(FPSCR, 0x0);     /* FPSCR_UFE | FPSCR_NDE perhaps better */
-
-	printk(KERN_INFO "Enable UniCore-F64 support.\n");
-
-	return 0;
-}
-
-late_initcall(ucf64_init);
diff --git a/arch/unicore32/kernel/gpio.c b/arch/unicore32/kernel/gpio.c
deleted file mode 100644
index 36d395b54b7c..000000000000
--- a/arch/unicore32/kernel/gpio.c
+++ /dev/null
@@ -1,121 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/gpio.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-/* in FPGA, no GPIO support */
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/gpio/driver.h>
-/* FIXME: needed for gpio_set_value() - convert to use descriptors or hogs */
-#include <linux/gpio.h>
-#include <mach/hardware.h>
-
-#ifdef CONFIG_LEDS
-#include <linux/leds.h>
-#include <linux/platform_device.h>
-
-static const struct gpio_led puv3_gpio_leds[] = {
-	{ .name = "cpuhealth", .gpio = GPO_CPU_HEALTH, .active_low = 0,
-		.default_trigger = "heartbeat",	},
-	{ .name = "hdd_led", .gpio = GPO_HDD_LED, .active_low = 1,
-		.default_trigger = "disk-activity", },
-};
-
-static const struct gpio_led_platform_data puv3_gpio_led_data = {
-	.num_leds =	ARRAY_SIZE(puv3_gpio_leds),
-	.leds =		(void *) puv3_gpio_leds,
-};
-
-static struct platform_device puv3_gpio_gpio_leds = {
-	.name =		"leds-gpio",
-	.id =		-1,
-	.dev = {
-		.platform_data = (void *) &puv3_gpio_led_data,
-	}
-};
-
-static int __init puv3_gpio_leds_init(void)
-{
-	platform_device_register(&puv3_gpio_gpio_leds);
-	return 0;
-}
-
-device_initcall(puv3_gpio_leds_init);
-#endif
-
-static int puv3_gpio_get(struct gpio_chip *chip, unsigned offset)
-{
-	return !!(readl(GPIO_GPLR) & GPIO_GPIO(offset));
-}
-
-static void puv3_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
-{
-	if (value)
-		writel(GPIO_GPIO(offset), GPIO_GPSR);
-	else
-		writel(GPIO_GPIO(offset), GPIO_GPCR);
-}
-
-static int puv3_direction_input(struct gpio_chip *chip, unsigned offset)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	writel(readl(GPIO_GPDR) & ~GPIO_GPIO(offset), GPIO_GPDR);
-	local_irq_restore(flags);
-	return 0;
-}
-
-static int puv3_direction_output(struct gpio_chip *chip, unsigned offset,
-		int value)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	puv3_gpio_set(chip, offset, value);
-	writel(readl(GPIO_GPDR) | GPIO_GPIO(offset), GPIO_GPDR);
-	local_irq_restore(flags);
-	return 0;
-}
-
-static struct gpio_chip puv3_gpio_chip = {
-	.label			= "gpio",
-	.direction_input	= puv3_direction_input,
-	.direction_output	= puv3_direction_output,
-	.set			= puv3_gpio_set,
-	.get			= puv3_gpio_get,
-	.base			= 0,
-	.ngpio			= GPIO_MAX + 1,
-};
-
-void __init puv3_init_gpio(void)
-{
-	writel(GPIO_DIR, GPIO_GPDR);
-#if	defined(CONFIG_PUV3_NB0916) || defined(CONFIG_PUV3_SMW0919)	\
-	|| defined(CONFIG_PUV3_DB0913)
-	gpio_set_value(GPO_WIFI_EN, 1);
-	gpio_set_value(GPO_HDD_LED, 1);
-	gpio_set_value(GPO_VGA_EN, 1);
-	gpio_set_value(GPO_LCD_EN, 1);
-	gpio_set_value(GPO_CAM_PWR_EN, 0);
-	gpio_set_value(GPO_LCD_VCC_EN, 1);
-	gpio_set_value(GPO_SOFT_OFF, 1);
-	gpio_set_value(GPO_BT_EN, 1);
-	gpio_set_value(GPO_FAN_ON, 0);
-	gpio_set_value(GPO_SPKR, 0);
-	gpio_set_value(GPO_CPU_HEALTH, 1);
-	gpio_set_value(GPO_LAN_SEL, 1);
-/*
- * DO NOT modify the GPO_SET_V1 and GPO_SET_V2 in kernel
- *	gpio_set_value(GPO_SET_V1, 1);
- *	gpio_set_value(GPO_SET_V2, 1);
- */
-#endif
-	gpiochip_add_data(&puv3_gpio_chip, NULL);
-}
diff --git a/arch/unicore32/kernel/head.S b/arch/unicore32/kernel/head.S
deleted file mode 100644
index 9bbb8668f9f7..000000000000
--- a/arch/unicore32/kernel/head.S
+++ /dev/null
@@ -1,249 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/kernel/head.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/linkage.h>
-#include <linux/init.h>
-
-#include <asm/assembler.h>
-#include <asm/ptrace.h>
-#include <generated/asm-offsets.h>
-#include <asm/memory.h>
-#include <asm/thread_info.h>
-#include <asm/hwdef-copro.h>
-#include <asm/pgtable-hwdef.h>
-
-#if (PHYS_OFFSET & 0x003fffff)
-#error "PHYS_OFFSET must be at an even 4MiB boundary!"
-#endif
-
-#define KERNEL_RAM_VADDR	(PAGE_OFFSET + KERNEL_IMAGE_START)
-#define KERNEL_RAM_PADDR	(PHYS_OFFSET + KERNEL_IMAGE_START)
-
-#define KERNEL_PGD_PADDR	(KERNEL_RAM_PADDR - 0x1000)
-#define KERNEL_PGD_VADDR	(KERNEL_RAM_VADDR - 0x1000)
-
-#define KERNEL_START		KERNEL_RAM_VADDR
-#define KERNEL_END		_end
-
-/*
- * swapper_pg_dir is the virtual address of the initial page table.
- * We place the page tables 4K below KERNEL_RAM_VADDR.  Therefore, we must
- * make sure that KERNEL_RAM_VADDR is correctly set.  Currently, we expect
- * the least significant 16 bits to be 0x8000, but we could probably
- * relax this restriction to KERNEL_RAM_VADDR >= PAGE_OFFSET + 0x1000.
- */
-#if (KERNEL_RAM_VADDR & 0xffff) != 0x8000
-#error KERNEL_RAM_VADDR must start at 0xXXXX8000
-#endif
-
-	.globl	swapper_pg_dir
-	.equ	swapper_pg_dir, KERNEL_RAM_VADDR - 0x1000
-
-/*
- * Kernel startup entry point.
- * ---------------------------
- *
- * This is normally called from the decompressor code.  The requirements
- * are: MMU = off, D-cache = off, I-cache = dont care
- *
- * This code is mostly position independent, so if you link the kernel at
- * 0xc0008000, you call this at __pa(0xc0008000).
- */
-	__HEAD
-ENTRY(stext)
-	@ set asr
-	mov	r0, #PRIV_MODE			@ ensure priv mode
-	or	r0, #PSR_R_BIT | PSR_I_BIT	@ disable irqs
-	mov.a	asr, r0
-
-	@ process identify
-	movc	r0, p0.c0, #0			@ cpuid
-	movl	r1, 0xff00ffff			@ mask
-	movl	r2, 0x4d000863			@ value
-	and	r0, r1, r0
-	cxor.a	r0, r2
-	bne	__error_p			@ invalid processor id
-
-	/*
-	 * Clear the 4K level 1 swapper page table
-	 */
-	movl	r0, #KERNEL_PGD_PADDR		@ page table address
-	mov	r1, #0
-	add	r2, r0, #0x1000
-101:	stw.w	r1, [r0]+, #4
-	stw.w	r1, [r0]+, #4
-	stw.w	r1, [r0]+, #4
-	stw.w	r1, [r0]+, #4
-	cxor.a	r0, r2
-	bne	101b
-
-	movl	r4, #KERNEL_PGD_PADDR		@ page table address
-	mov	r7, #PMD_TYPE_SECT | PMD_PRESENT	@ page size: section
-	or	r7, r7, #PMD_SECT_CACHEABLE		@ cacheable
-	or	r7, r7, #PMD_SECT_READ | PMD_SECT_WRITE | PMD_SECT_EXEC
-
-	/*
-	 * Create identity mapping for first 4MB of kernel to
-	 * cater for the MMU enable.  This identity mapping
-	 * will be removed by paging_init().  We use our current program
-	 * counter to determine corresponding section base address.
-	 */
-	mov	r6, pc
-	mov	r6, r6 >> #22			@ start of kernel section
-	or	r1, r7, r6 << #22		@ flags + kernel base
-	stw	r1, [r4+], r6 << #2		@ identity mapping
-
-	/*
-	 * Now setup the pagetables for our kernel direct
-	 * mapped region.
-	 */
-	add	r0, r4,  #(KERNEL_START & 0xff000000) >> 20
-	stw.w	r1, [r0+], #(KERNEL_START & 0x00c00000) >> 20
-	movl	r6, #(KERNEL_END - 1)
-	add	r0, r0, #4
-	add	r6, r4, r6 >> #20
-102:	csub.a	r0, r6
-	add	r1, r1, #1 << 22
-	bua	103f
-	stw.w	r1, [r0]+, #4
-	b	102b
-103:
-	/*
-	 * Then map first 4MB of ram in case it contains our boot params.
-	 */
-	add	r0, r4, #PAGE_OFFSET >> 20
-	or	r6, r7, #(PHYS_OFFSET & 0xffc00000)
-	stw	r6, [r0]
-
-	ldw	r15, __switch_data		@ address to jump to after
-
-	/*
-	 * Initialise TLB, Caches, and MMU state ready to switch the MMU
-	 * on.
-	 */
-	mov	r0, #0
-	movc	p0.c5, r0, #28			@ cache invalidate all
-	nop8
-	movc	p0.c6, r0, #6			@ TLB invalidate all
-	nop8
-
-	/*
-	 * ..V. .... ..TB IDAM
-	 * ..1. .... ..01 1111
-	 */
-	movl	r0, #0x201f			@ control register setting
-
-	/*
-	 * Setup common bits before finally enabling the MMU.  Essentially
-	 * this is just loading the page table pointer and domain access
-	 * registers.
-	 */
-	#ifndef CONFIG_ALIGNMENT_TRAP
-		andn	r0, r0, #CR_A
-	#endif
-	#ifdef CONFIG_CPU_DCACHE_DISABLE
-		andn	r0, r0, #CR_D
-	#endif
-	#ifdef CONFIG_CPU_DCACHE_WRITETHROUGH
-		andn	r0, r0, #CR_B
-	#endif
-	#ifdef CONFIG_CPU_ICACHE_DISABLE
-		andn	r0, r0, #CR_I
-	#endif
-
-	movc	p0.c2, r4, #0			@ set pgd
-	b	__turn_mmu_on
-ENDPROC(stext)
-
-/*
- * Enable the MMU.  This completely changes the structure of the visible
- * memory space.  You will not be able to trace execution through this.
- *
- *  r0  = cp#0 control register
- *  r15 = *virtual* address to jump to upon completion
- */
-	.align	5
-__turn_mmu_on:
-	mov	r0, r0
-	movc	p0.c1, r0, #0			@ write control reg
-	nop					@ fetch inst by phys addr
-	mov	pc, r15
-	nop8					@ fetch inst by phys addr
-ENDPROC(__turn_mmu_on)
-
-/*
- * Setup the initial page tables.  We only setup the barest
- * amount which are required to get the kernel running, which
- * generally means mapping in the kernel code.
- *
- * r9  = cpuid
- * r10 = procinfo
- *
- * Returns:
- *  r0, r3, r6, r7 corrupted
- *  r4 = physical page table address
- */
-	.ltorg
-
-	.align	2
-	.type	__switch_data, %object
-__switch_data:
-	.long	__mmap_switched
-	.long	__bss_start			@ r6
-	.long	_end				@ r7
-	.long	cr_alignment			@ r8
-	.long	init_thread_union + THREAD_START_SP @ sp
-
-/*
- * The following fragment of code is executed with the MMU on in MMU mode,
- * and uses absolute addresses; this is not position independent.
- *
- *  r0  = cp#0 control register
- */
-__mmap_switched:
-	adr	r3, __switch_data + 4
-
-	ldm.w	(r6, r7, r8), [r3]+
-	ldw	sp, [r3]
-
-	mov	fp, #0				@ Clear BSS (and zero fp)
-203:	csub.a	r6, r7
-	bea	204f
-	stw.w	fp, [r6]+,#4
-	b	203b
-204:
-	andn	r1, r0, #CR_A			@ Clear 'A' bit
-	stm	(r0, r1), [r8]+			@ Save control register values
-	b	start_kernel
-ENDPROC(__mmap_switched)
-
-/*
- * Exception handling.  Something went wrong and we can't proceed.  We
- * ought to tell the user, but since we don't have any guarantee that
- * we're even running on the right architecture, we do virtually nothing.
- *
- * If CONFIG_DEBUG_LL is set we try to print out something about the error
- * and hope for the best (useful if bootloader fails to pass a proper
- * machine ID for example).
- */
-__error_p:
-#ifdef CONFIG_DEBUG_LL
-	adr	r0, str_p1
-	b.l	printascii
-	mov	r0, r9
-	b.l	printhex8
-	adr	r0, str_p2
-	b.l	printascii
-901:	nop8
-	b	901b
-str_p1:	.asciz	"\nError: unrecognized processor variant (0x"
-str_p2:	.asciz	").\n"
-	.align
-#endif
-ENDPROC(__error_p)
-
diff --git a/arch/unicore32/kernel/hibernate.c b/arch/unicore32/kernel/hibernate.c
deleted file mode 100644
index 4cdf3c846a2d..000000000000
--- a/arch/unicore32/kernel/hibernate.c
+++ /dev/null
@@ -1,159 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  linux/arch/unicore32/kernel/hibernate.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-
-#include <linux/gfp.h>
-#include <linux/suspend.h>
-#include <linux/memblock.h>
-#include <linux/pgtable.h>
-
-#include <asm/page.h>
-#include <asm/pgalloc.h>
-#include <asm/sections.h>
-#include <asm/suspend.h>
-
-#include "mach/pm.h"
-
-/* Pointer to the temporary resume page tables */
-pgd_t *resume_pg_dir;
-
-struct swsusp_arch_regs swsusp_arch_regs_cpu0;
-
-/*
- * Create a middle page table on a resume-safe page and put a pointer to it in
- * the given global directory entry.  This only returns the gd entry
- * in non-PAE compilation mode, since the middle layer is folded.
- */
-static pmd_t *resume_one_md_table_init(pgd_t *pgd)
-{
-	pud_t *pud;
-	p4d_t *p4d;
-	pmd_t *pmd_table;
-
-	p4d = p4d_offset(pgd, 0);
-	pud = pud_offset(p4d, 0);
-	pmd_table = pmd_offset(pud, 0);
-
-	return pmd_table;
-}
-
-/*
- * Create a page table on a resume-safe page and place a pointer to it in
- * a middle page directory entry.
- */
-static pte_t *resume_one_page_table_init(pmd_t *pmd)
-{
-	if (pmd_none(*pmd)) {
-		pte_t *page_table = (pte_t *)get_safe_page(GFP_ATOMIC);
-		if (!page_table)
-			return NULL;
-
-		set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_KERNEL_TABLE));
-
-		BUG_ON(page_table != pte_offset_kernel(pmd, 0));
-
-		return page_table;
-	}
-
-	return pte_offset_kernel(pmd, 0);
-}
-
-/*
- * This maps the physical memory to kernel virtual address space, a total
- * of max_low_pfn pages, by creating page tables starting from address
- * PAGE_OFFSET.  The page tables are allocated out of resume-safe pages.
- */
-static int resume_physical_mapping_init(pgd_t *pgd_base)
-{
-	unsigned long pfn;
-	pgd_t *pgd;
-	pmd_t *pmd;
-	pte_t *pte;
-	int pgd_idx, pmd_idx;
-
-	pgd_idx = pgd_index(PAGE_OFFSET);
-	pgd = pgd_base + pgd_idx;
-	pfn = 0;
-
-	for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
-		pmd = resume_one_md_table_init(pgd);
-		if (!pmd)
-			return -ENOMEM;
-
-		if (pfn >= max_low_pfn)
-			continue;
-
-		for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD; pmd++, pmd_idx++) {
-			pte_t *max_pte;
-
-			if (pfn >= max_low_pfn)
-				break;
-
-			/* Map with normal page tables.
-			 * NOTE: We can mark everything as executable here
-			 */
-			pte = resume_one_page_table_init(pmd);
-			if (!pte)
-				return -ENOMEM;
-
-			max_pte = pte + PTRS_PER_PTE;
-			for (; pte < max_pte; pte++, pfn++) {
-				if (pfn >= max_low_pfn)
-					break;
-
-				set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
-			}
-		}
-	}
-
-	return 0;
-}
-
-static inline void resume_init_first_level_page_table(pgd_t *pg_dir)
-{
-}
-
-int swsusp_arch_resume(void)
-{
-	int error;
-
-	resume_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC);
-	if (!resume_pg_dir)
-		return -ENOMEM;
-
-	resume_init_first_level_page_table(resume_pg_dir);
-	error = resume_physical_mapping_init(resume_pg_dir);
-	if (error)
-		return error;
-
-	/* We have got enough memory and from now on we cannot recover */
-	restore_image(resume_pg_dir, restore_pblist);
-	return 0;
-}
-
-/*
- *	pfn_is_nosave - check if given pfn is in the 'nosave' section
- */
-
-int pfn_is_nosave(unsigned long pfn)
-{
-	unsigned long begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
-	unsigned long end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
-
-	return (pfn >= begin_pfn) && (pfn < end_pfn);
-}
-
-void save_processor_state(void)
-{
-}
-
-void restore_processor_state(void)
-{
-	local_flush_tlb_all();
-}
diff --git a/arch/unicore32/kernel/hibernate_asm.S b/arch/unicore32/kernel/hibernate_asm.S
deleted file mode 100644
index a589bc189e24..000000000000
--- a/arch/unicore32/kernel/hibernate_asm.S
+++ /dev/null
@@ -1,114 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/kernel/hibernate_asm.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-
-#include <linux/sys.h>
-#include <linux/errno.h>
-#include <linux/linkage.h>
-#include <linux/pgtable.h>
-#include <generated/asm-offsets.h>
-#include <asm/page.h>
-#include <asm/assembler.h>
-
-@ restore_image(pgd_t *resume_pg_dir, struct pbe *restore_pblist)
-@ r0: resume_pg_dir
-@ r1: restore_pblist
-@ copy restore_pblist pages
-@ restore registers from swsusp_arch_regs_cpu0
-@
-ENTRY(restore_image)
-	sub	r0, r0, #PAGE_OFFSET
-	mov	r5, #0
-	movc	p0.c6, r5, #6	@invalidate ITLB & DTLB
-	movc	p0.c2, r0, #0
-	nop
-	nop
-	nop
-	nop
-	nop
-	nop
-	nop
-
-	.p2align 4,,7
-101:
-	csub.a	r1, #0
-	beq	109f
-
-	ldw	r6, [r1+], #PBE_ADDRESS
-	ldw	r7, [r1+], #PBE_ORIN_ADDRESS
-
-	movl	ip, #128
-102:	ldm.w	(r8 - r15), [r6]+
-	stm.w	(r8 - r15), [r7]+
-	sub.a	ip, ip, #1
-	bne	102b
-
-	ldw	r1, [r1+], #PBE_NEXT
-	b	101b
-
-	.p2align 4,,7
-109:
-	/* go back to the original page tables */
-	ldw	r0, =swapper_pg_dir
-	sub	r0, r0, #PAGE_OFFSET
-	mov	r5, #0
-	movc	p0.c6, r5, #6
-	movc	p0.c2, r0, #0
-	nop
-	nop
-	nop
-	nop
-	nop
-	nop
-	nop
-
-#ifdef	CONFIG_UNICORE_FPU_F64
-	ldw	ip, 1f
-	add	ip, ip, #SWSUSP_FPSTATE
-	lfm.w	(f0  - f7 ), [ip]+
-	lfm.w	(f8  - f15), [ip]+
-	lfm.w	(f16 - f23), [ip]+
-	lfm.w	(f24 - f31), [ip]+
-	ldw	r4, [ip]
-	ctf	r4, s31
-#endif
-	mov	r0, #0x0
-	ldw	ip, 1f
-	add	ip, ip, #SWSUSP_CPU
-	ldm.w	(r4 - r15), [ip]+
-	ldm	(r16 - r27, sp, pc), [ip]+	@ Load all regs saved previously
-
-	.align	2
-1:	.long	swsusp_arch_regs_cpu0
-
-
-@ swsusp_arch_suspend()
-@ - prepare pc for resume, return from function without swsusp_save on resume
-@ - save registers in swsusp_arch_regs_cpu0
-@ - call swsusp_save write suspend image
-
-ENTRY(swsusp_arch_suspend)
-	ldw	ip, 1f
-	add	ip, ip, #SWSUSP_CPU
-	stm.w	(r4 - r15), [ip]+
-	stm.w	(r16 - r27, sp, lr), [ip]+
-
-#ifdef	CONFIG_UNICORE_FPU_F64
-	ldw	ip, 1f
-	add	ip, ip, #SWSUSP_FPSTATE
-	sfm.w	(f0  - f7 ), [ip]+
-	sfm.w	(f8  - f15), [ip]+
-	sfm.w	(f16 - f23), [ip]+
-	sfm.w	(f24 - f31), [ip]+
-	cff	r4, s31
-	stw	r4, [ip]
-#endif
-	b	swsusp_save			@ no return
-
-1:	.long	swsusp_arch_regs_cpu0
diff --git a/arch/unicore32/kernel/irq.c b/arch/unicore32/kernel/irq.c
deleted file mode 100644
index c014ae3c3e48..000000000000
--- a/arch/unicore32/kernel/irq.c
+++ /dev/null
@@ -1,371 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/irq.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/kernel_stat.h>
-#include <linux/module.h>
-#include <linux/signal.h>
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/random.h>
-#include <linux/smp.h>
-#include <linux/init.h>
-#include <linux/seq_file.h>
-#include <linux/errno.h>
-#include <linux/list.h>
-#include <linux/kallsyms.h>
-#include <linux/proc_fs.h>
-#include <linux/syscore_ops.h>
-
-#include <mach/hardware.h>
-
-#include "setup.h"
-
-/*
- * PKUnity GPIO edge detection for IRQs:
- * IRQs are generated on Falling-Edge, Rising-Edge, or both.
- * Use this instead of directly setting GRER/GFER.
- */
-static int GPIO_IRQ_rising_edge;
-static int GPIO_IRQ_falling_edge;
-static int GPIO_IRQ_mask = 0;
-
-#define GPIO_MASK(irq)		(1 << (irq - IRQ_GPIO0))
-
-static int puv3_gpio_type(struct irq_data *d, unsigned int type)
-{
-	unsigned int mask;
-
-	if (d->irq < IRQ_GPIOHIGH)
-		mask = 1 << d->irq;
-	else
-		mask = GPIO_MASK(d->irq);
-
-	if (type == IRQ_TYPE_PROBE) {
-		if ((GPIO_IRQ_rising_edge | GPIO_IRQ_falling_edge) & mask)
-			return 0;
-		type = IRQ_TYPE_EDGE_RISING | IRQ_TYPE_EDGE_FALLING;
-	}
-
-	if (type & IRQ_TYPE_EDGE_RISING)
-		GPIO_IRQ_rising_edge |= mask;
-	else
-		GPIO_IRQ_rising_edge &= ~mask;
-	if (type & IRQ_TYPE_EDGE_FALLING)
-		GPIO_IRQ_falling_edge |= mask;
-	else
-		GPIO_IRQ_falling_edge &= ~mask;
-
-	writel(GPIO_IRQ_rising_edge & GPIO_IRQ_mask, GPIO_GRER);
-	writel(GPIO_IRQ_falling_edge & GPIO_IRQ_mask, GPIO_GFER);
-
-	return 0;
-}
-
-/*
- * GPIO IRQs must be acknowledged.  This is for IRQs from 0 to 7.
- */
-static void puv3_low_gpio_ack(struct irq_data *d)
-{
-	writel((1 << d->irq), GPIO_GEDR);
-}
-
-static void puv3_low_gpio_mask(struct irq_data *d)
-{
-	writel(readl(INTC_ICMR) & ~(1 << d->irq), INTC_ICMR);
-}
-
-static void puv3_low_gpio_unmask(struct irq_data *d)
-{
-	writel(readl(INTC_ICMR) | (1 << d->irq), INTC_ICMR);
-}
-
-static int puv3_low_gpio_wake(struct irq_data *d, unsigned int on)
-{
-	if (on)
-		writel(readl(PM_PWER) | (1 << d->irq), PM_PWER);
-	else
-		writel(readl(PM_PWER) & ~(1 << d->irq), PM_PWER);
-	return 0;
-}
-
-static struct irq_chip puv3_low_gpio_chip = {
-	.name		= "GPIO-low",
-	.irq_ack	= puv3_low_gpio_ack,
-	.irq_mask	= puv3_low_gpio_mask,
-	.irq_unmask	= puv3_low_gpio_unmask,
-	.irq_set_type	= puv3_gpio_type,
-	.irq_set_wake	= puv3_low_gpio_wake,
-};
-
-/*
- * IRQ8 (GPIO0 through 27) handler.  We enter here with the
- * irq_controller_lock held, and IRQs disabled.  Decode the IRQ
- * and call the handler.
- */
-static void puv3_gpio_handler(struct irq_desc *desc)
-{
-	unsigned int mask, irq;
-
-	mask = readl(GPIO_GEDR);
-	do {
-		/*
-		 * clear down all currently active IRQ sources.
-		 * We will be processing them all.
-		 */
-		writel(mask, GPIO_GEDR);
-
-		irq = IRQ_GPIO0;
-		do {
-			if (mask & 1)
-				generic_handle_irq(irq);
-			mask >>= 1;
-			irq++;
-		} while (mask);
-		mask = readl(GPIO_GEDR);
-	} while (mask);
-}
-
-/*
- * GPIO0-27 edge IRQs need to be handled specially.
- * In addition, the IRQs are all collected up into one bit in the
- * interrupt controller registers.
- */
-static void puv3_high_gpio_ack(struct irq_data *d)
-{
-	unsigned int mask = GPIO_MASK(d->irq);
-
-	writel(mask, GPIO_GEDR);
-}
-
-static void puv3_high_gpio_mask(struct irq_data *d)
-{
-	unsigned int mask = GPIO_MASK(d->irq);
-
-	GPIO_IRQ_mask &= ~mask;
-
-	writel(readl(GPIO_GRER) & ~mask, GPIO_GRER);
-	writel(readl(GPIO_GFER) & ~mask, GPIO_GFER);
-}
-
-static void puv3_high_gpio_unmask(struct irq_data *d)
-{
-	unsigned int mask = GPIO_MASK(d->irq);
-
-	GPIO_IRQ_mask |= mask;
-
-	writel(GPIO_IRQ_rising_edge & GPIO_IRQ_mask, GPIO_GRER);
-	writel(GPIO_IRQ_falling_edge & GPIO_IRQ_mask, GPIO_GFER);
-}
-
-static int puv3_high_gpio_wake(struct irq_data *d, unsigned int on)
-{
-	if (on)
-		writel(readl(PM_PWER) | PM_PWER_GPIOHIGH, PM_PWER);
-	else
-		writel(readl(PM_PWER) & ~PM_PWER_GPIOHIGH, PM_PWER);
-	return 0;
-}
-
-static struct irq_chip puv3_high_gpio_chip = {
-	.name		= "GPIO-high",
-	.irq_ack	= puv3_high_gpio_ack,
-	.irq_mask	= puv3_high_gpio_mask,
-	.irq_unmask	= puv3_high_gpio_unmask,
-	.irq_set_type	= puv3_gpio_type,
-	.irq_set_wake	= puv3_high_gpio_wake,
-};
-
-/*
- * We don't need to ACK IRQs on the PKUnity unless they're GPIOs
- * this is for internal IRQs i.e. from 8 to 31.
- */
-static void puv3_mask_irq(struct irq_data *d)
-{
-	writel(readl(INTC_ICMR) & ~(1 << d->irq), INTC_ICMR);
-}
-
-static void puv3_unmask_irq(struct irq_data *d)
-{
-	writel(readl(INTC_ICMR) | (1 << d->irq), INTC_ICMR);
-}
-
-/*
- * Apart form GPIOs, only the RTC alarm can be a wakeup event.
- */
-static int puv3_set_wake(struct irq_data *d, unsigned int on)
-{
-	if (d->irq == IRQ_RTCAlarm) {
-		if (on)
-			writel(readl(PM_PWER) | PM_PWER_RTC, PM_PWER);
-		else
-			writel(readl(PM_PWER) & ~PM_PWER_RTC, PM_PWER);
-		return 0;
-	}
-	return -EINVAL;
-}
-
-static struct irq_chip puv3_normal_chip = {
-	.name		= "PKUnity-v3",
-	.irq_ack	= puv3_mask_irq,
-	.irq_mask	= puv3_mask_irq,
-	.irq_unmask	= puv3_unmask_irq,
-	.irq_set_wake	= puv3_set_wake,
-};
-
-static struct resource irq_resource = {
-	.name	= "irqs",
-	.start	= io_v2p(PKUNITY_INTC_BASE),
-	.end	= io_v2p(PKUNITY_INTC_BASE) + 0xFFFFF,
-};
-
-static struct puv3_irq_state {
-	unsigned int	saved;
-	unsigned int	icmr;
-	unsigned int	iclr;
-	unsigned int	iccr;
-} puv3_irq_state;
-
-static int puv3_irq_suspend(void)
-{
-	struct puv3_irq_state *st = &puv3_irq_state;
-
-	st->saved = 1;
-	st->icmr = readl(INTC_ICMR);
-	st->iclr = readl(INTC_ICLR);
-	st->iccr = readl(INTC_ICCR);
-
-	/*
-	 * Disable all GPIO-based interrupts.
-	 */
-	writel(readl(INTC_ICMR) & ~(0x1ff), INTC_ICMR);
-
-	/*
-	 * Set the appropriate edges for wakeup.
-	 */
-	writel(readl(PM_PWER) & GPIO_IRQ_rising_edge, GPIO_GRER);
-	writel(readl(PM_PWER) & GPIO_IRQ_falling_edge, GPIO_GFER);
-
-	/*
-	 * Clear any pending GPIO interrupts.
-	 */
-	writel(readl(GPIO_GEDR), GPIO_GEDR);
-
-	return 0;
-}
-
-static void puv3_irq_resume(void)
-{
-	struct puv3_irq_state *st = &puv3_irq_state;
-
-	if (st->saved) {
-		writel(st->iccr, INTC_ICCR);
-		writel(st->iclr, INTC_ICLR);
-
-		writel(GPIO_IRQ_rising_edge & GPIO_IRQ_mask, GPIO_GRER);
-		writel(GPIO_IRQ_falling_edge & GPIO_IRQ_mask, GPIO_GFER);
-
-		writel(st->icmr, INTC_ICMR);
-	}
-}
-
-static struct syscore_ops puv3_irq_syscore_ops = {
-	.suspend	= puv3_irq_suspend,
-	.resume		= puv3_irq_resume,
-};
-
-static int __init puv3_irq_init_syscore(void)
-{
-	register_syscore_ops(&puv3_irq_syscore_ops);
-	return 0;
-}
-
-device_initcall(puv3_irq_init_syscore);
-
-void __init init_IRQ(void)
-{
-	unsigned int irq;
-
-	request_resource(&iomem_resource, &irq_resource);
-
-	/* disable all IRQs */
-	writel(0, INTC_ICMR);
-
-	/* all IRQs are IRQ, not REAL */
-	writel(0, INTC_ICLR);
-
-	/* clear all GPIO edge detects */
-	writel(FMASK(8, 0) & ~FIELD(1, 1, GPI_SOFF_REQ), GPIO_GPIR);
-	writel(0, GPIO_GFER);
-	writel(0, GPIO_GRER);
-	writel(0x0FFFFFFF, GPIO_GEDR);
-
-	writel(1, INTC_ICCR);
-
-	for (irq = 0; irq < IRQ_GPIOHIGH; irq++) {
-		irq_set_chip(irq, &puv3_low_gpio_chip);
-		irq_set_handler(irq, handle_edge_irq);
-		irq_modify_status(irq,
-			IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN,
-			0);
-	}
-
-	for (irq = IRQ_GPIOHIGH + 1; irq < IRQ_GPIO0; irq++) {
-		irq_set_chip(irq, &puv3_normal_chip);
-		irq_set_handler(irq, handle_level_irq);
-		irq_modify_status(irq,
-			IRQ_NOREQUEST | IRQ_NOAUTOEN,
-			IRQ_NOPROBE);
-	}
-
-	for (irq = IRQ_GPIO0; irq <= IRQ_GPIO27; irq++) {
-		irq_set_chip(irq, &puv3_high_gpio_chip);
-		irq_set_handler(irq, handle_edge_irq);
-		irq_modify_status(irq,
-			IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN,
-			0);
-	}
-
-	/*
-	 * Install handler for GPIO 0-27 edge detect interrupts
-	 */
-	irq_set_chip(IRQ_GPIOHIGH, &puv3_normal_chip);
-	irq_set_chained_handler(IRQ_GPIOHIGH, puv3_gpio_handler);
-
-#ifdef CONFIG_PUV3_GPIO
-	puv3_init_gpio();
-#endif
-}
-
-/*
- * do_IRQ handles all hardware IRQ's.  Decoded IRQs should not
- * come via this function.  Instead, they should provide their
- * own 'handler'
- */
-asmlinkage void asm_do_IRQ(unsigned int irq, struct pt_regs *regs)
-{
-	struct pt_regs *old_regs = set_irq_regs(regs);
-
-	irq_enter();
-
-	/*
-	 * Some hardware gives randomly wrong interrupts.  Rather
-	 * than crashing, do something sensible.
-	 */
-	if (unlikely(irq >= nr_irqs)) {
-		if (printk_ratelimit())
-			printk(KERN_WARNING "Bad IRQ%u\n", irq);
-		ack_bad_irq(irq);
-	} else {
-		generic_handle_irq(irq);
-	}
-
-	irq_exit();
-	set_irq_regs(old_regs);
-}
-
diff --git a/arch/unicore32/kernel/ksyms.c b/arch/unicore32/kernel/ksyms.c
deleted file mode 100644
index 731445008932..000000000000
--- a/arch/unicore32/kernel/ksyms.c
+++ /dev/null
@@ -1,57 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/ksyms.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/delay.h>
-#include <linux/in6.h>
-#include <linux/syscalls.h>
-#include <linux/uaccess.h>
-#include <linux/io.h>
-
-#include <asm/checksum.h>
-
-#include "ksyms.h"
-
-EXPORT_SYMBOL(find_first_bit);
-EXPORT_SYMBOL(find_first_zero_bit);
-EXPORT_SYMBOL(find_next_zero_bit);
-EXPORT_SYMBOL(find_next_bit);
-
-	/* platform dependent support */
-EXPORT_SYMBOL(__udelay);
-EXPORT_SYMBOL(__const_udelay);
-
-	/* string / mem functions */
-EXPORT_SYMBOL(strchr);
-EXPORT_SYMBOL(strrchr);
-EXPORT_SYMBOL(memset);
-EXPORT_SYMBOL(memcpy);
-EXPORT_SYMBOL(memmove);
-EXPORT_SYMBOL(memchr);
-
-	/* user mem (segment) */
-EXPORT_SYMBOL(__strnlen_user);
-EXPORT_SYMBOL(__strncpy_from_user);
-
-EXPORT_SYMBOL(copy_page);
-
-EXPORT_SYMBOL(raw_copy_from_user);
-EXPORT_SYMBOL(raw_copy_to_user);
-EXPORT_SYMBOL(__clear_user);
-
-EXPORT_SYMBOL(__ashldi3);
-EXPORT_SYMBOL(__ashrdi3);
-EXPORT_SYMBOL(__divsi3);
-EXPORT_SYMBOL(__lshrdi3);
-EXPORT_SYMBOL(__modsi3);
-EXPORT_SYMBOL(__ucmpdi2);
-EXPORT_SYMBOL(__udivsi3);
-EXPORT_SYMBOL(__umodsi3);
-
diff --git a/arch/unicore32/kernel/ksyms.h b/arch/unicore32/kernel/ksyms.h
deleted file mode 100644
index 5d2d5ba324ac..000000000000
--- a/arch/unicore32/kernel/ksyms.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * libgcc functions - functions that are used internally by the
- * compiler...  (prototypes are not correct though, but that
- * doesn't really matter since they're not versioned).
- */
-extern void __ashldi3(void);
-extern void __ashrdi3(void);
-extern void __divsi3(void);
-extern void __lshrdi3(void);
-extern void __modsi3(void);
-extern void __ucmpdi2(void);
-extern void __udivsi3(void);
-extern void __umodsi3(void);
diff --git a/arch/unicore32/kernel/module.c b/arch/unicore32/kernel/module.c
deleted file mode 100644
index 67c89ef2d6ee..000000000000
--- a/arch/unicore32/kernel/module.c
+++ /dev/null
@@ -1,105 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/module.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/module.h>
-#include <linux/moduleloader.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/elf.h>
-#include <linux/vmalloc.h>
-#include <linux/fs.h>
-#include <linux/string.h>
-#include <linux/gfp.h>
-
-#include <asm/sections.h>
-
-void *module_alloc(unsigned long size)
-{
-	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
-				__builtin_return_address(0));
-}
-
-int
-apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
-	       unsigned int relindex, struct module *module)
-{
-	Elf32_Shdr *symsec = sechdrs + symindex;
-	Elf32_Shdr *relsec = sechdrs + relindex;
-	Elf32_Shdr *dstsec = sechdrs + relsec->sh_info;
-	Elf32_Rel *rel = (void *)relsec->sh_addr;
-	unsigned int i;
-
-	for (i = 0; i < relsec->sh_size / sizeof(Elf32_Rel); i++, rel++) {
-		unsigned long loc;
-		Elf32_Sym *sym;
-		s32 offset;
-
-		offset = ELF32_R_SYM(rel->r_info);
-		if (offset < 0 || offset >
-				(symsec->sh_size / sizeof(Elf32_Sym))) {
-			printk(KERN_ERR "%s: bad relocation, "
-					"section %d reloc %d\n",
-					module->name, relindex, i);
-			return -ENOEXEC;
-		}
-
-		sym = ((Elf32_Sym *)symsec->sh_addr) + offset;
-
-		if (rel->r_offset < 0 || rel->r_offset >
-				dstsec->sh_size - sizeof(u32)) {
-			printk(KERN_ERR "%s: out of bounds relocation, "
-				"section %d reloc %d offset %d size %d\n",
-				module->name, relindex, i, rel->r_offset,
-				dstsec->sh_size);
-			return -ENOEXEC;
-		}
-
-		loc = dstsec->sh_addr + rel->r_offset;
-
-		switch (ELF32_R_TYPE(rel->r_info)) {
-		case R_UNICORE_NONE:
-			/* ignore */
-			break;
-
-		case R_UNICORE_ABS32:
-			*(u32 *)loc += sym->st_value;
-			break;
-
-		case R_UNICORE_PC24:
-		case R_UNICORE_CALL:
-		case R_UNICORE_JUMP24:
-			offset = (*(u32 *)loc & 0x00ffffff) << 2;
-			if (offset & 0x02000000)
-				offset -= 0x04000000;
-
-			offset += sym->st_value - loc;
-			if (offset & 3 ||
-			    offset <= (s32)0xfe000000 ||
-			    offset >= (s32)0x02000000) {
-				printk(KERN_ERR
-				       "%s: relocation out of range, section "
-				       "%d reloc %d sym '%s'\n", module->name,
-				       relindex, i, strtab + sym->st_name);
-				return -ENOEXEC;
-			}
-
-			offset >>= 2;
-
-			*(u32 *)loc &= 0xff000000;
-			*(u32 *)loc |= offset & 0x00ffffff;
-			break;
-
-		default:
-			printk(KERN_ERR "%s: unknown relocation: %u\n",
-			       module->name, ELF32_R_TYPE(rel->r_info));
-			return -ENOEXEC;
-		}
-	}
-	return 0;
-}
diff --git a/arch/unicore32/kernel/pci.c b/arch/unicore32/kernel/pci.c
deleted file mode 100644
index 0d098aa05b47..000000000000
--- a/arch/unicore32/kernel/pci.c
+++ /dev/null
@@ -1,371 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/pci.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- *  PCI bios-type initialisation for PCI machines
- */
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/pci.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/io.h>
-
-static int debug_pci;
-
-#define CONFIG_CMD(bus, devfn, where)	\
-	(0x80000000 | (bus->number << 16) | (devfn << 8) | (where & ~3))
-
-static int
-puv3_read_config(struct pci_bus *bus, unsigned int devfn, int where,
-			int size, u32 *value)
-{
-	writel(CONFIG_CMD(bus, devfn, where), PCICFG_ADDR);
-	switch (size) {
-	case 1:
-		*value = (readl(PCICFG_DATA) >> ((where & 3) * 8)) & 0xFF;
-		break;
-	case 2:
-		*value = (readl(PCICFG_DATA) >> ((where & 2) * 8)) & 0xFFFF;
-		break;
-	case 4:
-		*value = readl(PCICFG_DATA);
-		break;
-	}
-	return PCIBIOS_SUCCESSFUL;
-}
-
-static int
-puv3_write_config(struct pci_bus *bus, unsigned int devfn, int where,
-			int size, u32 value)
-{
-	writel(CONFIG_CMD(bus, devfn, where), PCICFG_ADDR);
-	switch (size) {
-	case 1:
-		writel((readl(PCICFG_DATA) & ~FMASK(8, (where&3)*8))
-			| FIELD(value, 8, (where&3)*8), PCICFG_DATA);
-		break;
-	case 2:
-		writel((readl(PCICFG_DATA) & ~FMASK(16, (where&2)*8))
-			| FIELD(value, 16, (where&2)*8), PCICFG_DATA);
-		break;
-	case 4:
-		writel(value, PCICFG_DATA);
-		break;
-	}
-	return PCIBIOS_SUCCESSFUL;
-}
-
-struct pci_ops pci_puv3_ops = {
-	.read  = puv3_read_config,
-	.write = puv3_write_config,
-};
-
-void pci_puv3_preinit(void)
-{
-	printk(KERN_DEBUG "PCI: PKUnity PCI Controller Initializing ...\n");
-	/* config PCI bridge base */
-	writel(io_v2p(PKUNITY_PCIBRI_BASE), PCICFG_BRIBASE);
-
-	writel(0, PCIBRI_AHBCTL0);
-	writel(io_v2p(PKUNITY_PCIBRI_BASE) | PCIBRI_BARx_MEM, PCIBRI_AHBBAR0);
-	writel(0xFFFF0000, PCIBRI_AHBAMR0);
-	writel(0, PCIBRI_AHBTAR0);
-
-	writel(PCIBRI_CTLx_AT, PCIBRI_AHBCTL1);
-	writel(io_v2p(PKUNITY_PCILIO_BASE) | PCIBRI_BARx_IO, PCIBRI_AHBBAR1);
-	writel(0xFFFF0000, PCIBRI_AHBAMR1);
-	writel(0x00000000, PCIBRI_AHBTAR1);
-
-	writel(PCIBRI_CTLx_PREF, PCIBRI_AHBCTL2);
-	writel(io_v2p(PKUNITY_PCIMEM_BASE) | PCIBRI_BARx_MEM, PCIBRI_AHBBAR2);
-	writel(0xF8000000, PCIBRI_AHBAMR2);
-	writel(0, PCIBRI_AHBTAR2);
-
-	writel(io_v2p(PKUNITY_PCIAHB_BASE) | PCIBRI_BARx_MEM, PCIBRI_BAR1);
-
-	writel(PCIBRI_CTLx_AT | PCIBRI_CTLx_PREF, PCIBRI_PCICTL0);
-	writel(io_v2p(PKUNITY_PCIAHB_BASE) | PCIBRI_BARx_MEM, PCIBRI_PCIBAR0);
-	writel(0xF8000000, PCIBRI_PCIAMR0);
-	writel(PKUNITY_SDRAM_BASE, PCIBRI_PCITAR0);
-
-	writel(readl(PCIBRI_CMD) | PCIBRI_CMD_IO | PCIBRI_CMD_MEM, PCIBRI_CMD);
-}
-
-static int pci_puv3_map_irq(const struct pci_dev *dev, u8 slot, u8 pin)
-{
-	if (dev->bus->number == 0) {
-#ifdef CONFIG_ARCH_FPGA /* 4 pci slots */
-		if      (dev->devfn == 0x00)
-			return IRQ_PCIINTA;
-		else if (dev->devfn == 0x08)
-			return IRQ_PCIINTB;
-		else if (dev->devfn == 0x10)
-			return IRQ_PCIINTC;
-		else if (dev->devfn == 0x18)
-			return IRQ_PCIINTD;
-#endif
-#ifdef CONFIG_PUV3_DB0913 /* 3 pci slots */
-		if      (dev->devfn == 0x30)
-			return IRQ_PCIINTB;
-		else if (dev->devfn == 0x60)
-			return IRQ_PCIINTC;
-		else if (dev->devfn == 0x58)
-			return IRQ_PCIINTD;
-#endif
-#if	defined(CONFIG_PUV3_NB0916) || defined(CONFIG_PUV3_SMW0919)
-		/* only support 2 pci devices */
-		if      (dev->devfn == 0x00)
-			return IRQ_PCIINTC; /* sata */
-#endif
-	}
-	return -1;
-}
-
-/*
- * Only first 128MB of memory can be accessed via PCI.
- * We use GFP_DMA to allocate safe buffers to do map/unmap.
- * This is really ugly and we need a better way of specifying
- * DMA-capable regions of memory.
- */
-void __init puv3_pci_adjust_zones(unsigned long max_zone_pfn)
-{
-	unsigned int sz = SZ_128M >> PAGE_SHIFT;
-
-	max_zone_pfn[ZONE_DMA] = sz;
-}
-
-/*
- * If the bus contains any of these devices, then we must not turn on
- * parity checking of any kind.
- */
-static inline int pdev_bad_for_parity(struct pci_dev *dev)
-{
-	return 0;
-}
-
-/*
- * pcibios_fixup_bus - Called after each bus is probed,
- * but before its children are examined.
- */
-void pcibios_fixup_bus(struct pci_bus *bus)
-{
-	struct pci_dev *dev;
-	u16 features = PCI_COMMAND_SERR
-		| PCI_COMMAND_PARITY
-		| PCI_COMMAND_FAST_BACK;
-
-	bus->resource[0] = &ioport_resource;
-	bus->resource[1] = &iomem_resource;
-
-	/*
-	 * Walk the devices on this bus, working out what we can
-	 * and can't support.
-	 */
-	list_for_each_entry(dev, &bus->devices, bus_list) {
-		u16 status;
-
-		pci_read_config_word(dev, PCI_STATUS, &status);
-
-		/*
-		 * If any device on this bus does not support fast back
-		 * to back transfers, then the bus as a whole is not able
-		 * to support them.  Having fast back to back transfers
-		 * on saves us one PCI cycle per transaction.
-		 */
-		if (!(status & PCI_STATUS_FAST_BACK))
-			features &= ~PCI_COMMAND_FAST_BACK;
-
-		if (pdev_bad_for_parity(dev))
-			features &= ~(PCI_COMMAND_SERR
-					| PCI_COMMAND_PARITY);
-
-		switch (dev->class >> 8) {
-		case PCI_CLASS_BRIDGE_PCI:
-			pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &status);
-			status |= PCI_BRIDGE_CTL_PARITY
-				| PCI_BRIDGE_CTL_MASTER_ABORT;
-			status &= ~(PCI_BRIDGE_CTL_BUS_RESET
-				| PCI_BRIDGE_CTL_FAST_BACK);
-			pci_write_config_word(dev, PCI_BRIDGE_CONTROL, status);
-			break;
-
-		case PCI_CLASS_BRIDGE_CARDBUS:
-			pci_read_config_word(dev, PCI_CB_BRIDGE_CONTROL,
-					&status);
-			status |= PCI_CB_BRIDGE_CTL_PARITY
-				| PCI_CB_BRIDGE_CTL_MASTER_ABORT;
-			pci_write_config_word(dev, PCI_CB_BRIDGE_CONTROL,
-					status);
-			break;
-		}
-	}
-
-	/*
-	 * Now walk the devices again, this time setting them up.
-	 */
-	list_for_each_entry(dev, &bus->devices, bus_list) {
-		u16 cmd;
-
-		pci_read_config_word(dev, PCI_COMMAND, &cmd);
-		cmd |= features;
-		pci_write_config_word(dev, PCI_COMMAND, cmd);
-
-		pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE,
-				      L1_CACHE_BYTES >> 2);
-	}
-
-	/*
-	 * Propagate the flags to the PCI bridge.
-	 */
-	if (bus->self && bus->self->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
-		if (features & PCI_COMMAND_FAST_BACK)
-			bus->bridge_ctl |= PCI_BRIDGE_CTL_FAST_BACK;
-		if (features & PCI_COMMAND_PARITY)
-			bus->bridge_ctl |= PCI_BRIDGE_CTL_PARITY;
-	}
-
-	/*
-	 * Report what we did for this bus
-	 */
-	printk(KERN_INFO "PCI: bus%d: Fast back to back transfers %sabled\n",
-		bus->number, (features & PCI_COMMAND_FAST_BACK) ? "en" : "dis");
-}
-EXPORT_SYMBOL(pcibios_fixup_bus);
-
-static struct resource busn_resource = {
-	.name	= "PCI busn",
-	.start	= 0,
-	.end	= 255,
-	.flags	= IORESOURCE_BUS,
-};
-
-static int __init pci_common_init(void)
-{
-	struct pci_bus *puv3_bus;
-	struct pci_host_bridge *bridge;
-	int ret;
-
-	bridge = pci_alloc_host_bridge(0);
-	if (!bridge)
-		return -ENOMEM;
-
-	pci_puv3_preinit();
-
-	pci_add_resource(&bridge->windows, &ioport_resource);
-	pci_add_resource(&bridge->windows, &iomem_resource);
-	pci_add_resource(&bridge->windows, &busn_resource);
-	bridge->sysdata = NULL;
-	bridge->busnr = 0;
-	bridge->ops = &pci_puv3_ops;
-	bridge->swizzle_irq = pci_common_swizzle;
-	bridge->map_irq = pci_puv3_map_irq;
-
-	/* Scan our single hose.  */
-	ret = pci_scan_root_bus_bridge(bridge);
-	if (ret) {
-		pci_free_host_bridge(bridge);
-		return;
-	}
-
-	puv3_bus = bridge->bus;
-
-	if (!puv3_bus)
-		panic("PCI: unable to scan bus!");
-
-	pci_bus_size_bridges(puv3_bus);
-	pci_bus_assign_resources(puv3_bus);
-	pci_bus_add_devices(puv3_bus);
-	return 0;
-}
-subsys_initcall(pci_common_init);
-
-char * __init pcibios_setup(char *str)
-{
-	if (!strcmp(str, "debug")) {
-		debug_pci = 1;
-		return NULL;
-	}
-	return str;
-}
-
-void pcibios_set_master(struct pci_dev *dev)
-{
-	/* No special bus mastering setup handling */
-}
-
-/*
- * From arch/i386/kernel/pci-i386.c:
- *
- * We need to avoid collisions with `mirrored' VGA ports
- * and other strange ISA hardware, so we always want the
- * addresses to be allocated in the 0x000-0x0ff region
- * modulo 0x400.
- *
- * Why? Because some silly external IO cards only decode
- * the low 10 bits of the IO address. The 0x00-0xff region
- * is reserved for motherboard devices that decode all 16
- * bits, so it's ok to allocate at, say, 0x2800-0x28ff,
- * but we want to try to avoid allocating at 0x2900-0x2bff
- * which might be mirrored at 0x0100-0x03ff..
- */
-resource_size_t pcibios_align_resource(void *data, const struct resource *res,
-				resource_size_t size, resource_size_t align)
-{
-	resource_size_t start = res->start;
-
-	if (res->flags & IORESOURCE_IO && start & 0x300)
-		start = (start + 0x3ff) & ~0x3ff;
-
-	start = (start + align - 1) & ~(align - 1);
-
-	return start;
-}
-
-/**
- * pcibios_enable_device - Enable I/O and memory.
- * @dev: PCI device to be enabled
- */
-int pcibios_enable_device(struct pci_dev *dev, int mask)
-{
-	u16 cmd, old_cmd;
-	int idx;
-	struct resource *r;
-
-	pci_read_config_word(dev, PCI_COMMAND, &cmd);
-	old_cmd = cmd;
-	for (idx = 0; idx < 6; idx++) {
-		/* Only set up the requested stuff */
-		if (!(mask & (1 << idx)))
-			continue;
-
-		r = dev->resource + idx;
-		if (!r->start && r->end) {
-			printk(KERN_ERR "PCI: Device %s not available because"
-			       " of resource collisions\n", pci_name(dev));
-			return -EINVAL;
-		}
-		if (r->flags & IORESOURCE_IO)
-			cmd |= PCI_COMMAND_IO;
-		if (r->flags & IORESOURCE_MEM)
-			cmd |= PCI_COMMAND_MEMORY;
-	}
-
-	/*
-	 * Bridges (eg, cardbus bridges) need to be fully enabled
-	 */
-	if ((dev->class >> 16) == PCI_BASE_CLASS_BRIDGE)
-		cmd |= PCI_COMMAND_IO | PCI_COMMAND_MEMORY;
-
-	if (cmd != old_cmd) {
-		printk("PCI: enabling device %s (%04x -> %04x)\n",
-		       pci_name(dev), old_cmd, cmd);
-		pci_write_config_word(dev, PCI_COMMAND, cmd);
-	}
-	return 0;
-}
diff --git a/arch/unicore32/kernel/pm.c b/arch/unicore32/kernel/pm.c
deleted file mode 100644
index 94b7f9df6c1a..000000000000
--- a/arch/unicore32/kernel/pm.c
+++ /dev/null
@@ -1,121 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/pm.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/suspend.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/io.h>
-
-#include <mach/hardware.h>
-#include <mach/pm.h>
-
-#include "setup.h"
-
-struct puv3_cpu_pm_fns *puv3_cpu_pm_fns;
-static unsigned long *sleep_save;
-
-int puv3_pm_enter(suspend_state_t state)
-{
-	unsigned long sleep_save_checksum = 0, checksum = 0;
-	int i;
-
-	/* skip registers saving for standby */
-	if (state != PM_SUSPEND_STANDBY) {
-		puv3_cpu_pm_fns->save(sleep_save);
-		/* before sleeping, calculate and save a checksum */
-		for (i = 0; i < puv3_cpu_pm_fns->save_count - 1; i++)
-			sleep_save_checksum += sleep_save[i];
-	}
-
-	/* *** go zzz *** */
-	puv3_cpu_pm_fns->enter(state);
-	cpu_init();
-#ifdef CONFIG_INPUT_KEYBOARD
-	puv3_ps2_init();
-#endif
-#ifdef CONFIG_PCI
-	pci_puv3_preinit();
-#endif
-	if (state != PM_SUSPEND_STANDBY) {
-		/* after sleeping, validate the checksum */
-		for (i = 0; i < puv3_cpu_pm_fns->save_count - 1; i++)
-			checksum += sleep_save[i];
-
-		/* if invalid, display message and wait for a hardware reset */
-		if (checksum != sleep_save_checksum) {
-			while (1)
-				puv3_cpu_pm_fns->enter(state);
-		}
-		puv3_cpu_pm_fns->restore(sleep_save);
-	}
-
-	pr_debug("*** made it back from resume\n");
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(puv3_pm_enter);
-
-unsigned long sleep_phys_sp(void *sp)
-{
-	return virt_to_phys(sp);
-}
-
-static int puv3_pm_valid(suspend_state_t state)
-{
-	if (puv3_cpu_pm_fns)
-		return puv3_cpu_pm_fns->valid(state);
-
-	return -EINVAL;
-}
-
-static int puv3_pm_prepare(void)
-{
-	int ret = 0;
-
-	if (puv3_cpu_pm_fns && puv3_cpu_pm_fns->prepare)
-		ret = puv3_cpu_pm_fns->prepare();
-
-	return ret;
-}
-
-static void puv3_pm_finish(void)
-{
-	if (puv3_cpu_pm_fns && puv3_cpu_pm_fns->finish)
-		puv3_cpu_pm_fns->finish();
-}
-
-static struct platform_suspend_ops puv3_pm_ops = {
-	.valid		= puv3_pm_valid,
-	.enter		= puv3_pm_enter,
-	.prepare	= puv3_pm_prepare,
-	.finish		= puv3_pm_finish,
-};
-
-static int __init puv3_pm_init(void)
-{
-	if (!puv3_cpu_pm_fns) {
-		printk(KERN_ERR "no valid puv3_cpu_pm_fns defined\n");
-		return -EINVAL;
-	}
-
-	sleep_save = kmalloc_array(puv3_cpu_pm_fns->save_count,
-				   sizeof(unsigned long),
-				   GFP_KERNEL);
-	if (!sleep_save) {
-		printk(KERN_ERR "failed to alloc memory for pm save\n");
-		return -ENOMEM;
-	}
-
-	suspend_set_ops(&puv3_pm_ops);
-	return 0;
-}
-
-device_initcall(puv3_pm_init);
diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c
deleted file mode 100644
index b4fd3a604a18..000000000000
--- a/arch/unicore32/kernel/process.c
+++ /dev/null
@@ -1,319 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/process.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <stdarg.h>
-
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/sched/debug.h>
-#include <linux/sched/task.h>
-#include <linux/sched/task_stack.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/stddef.h>
-#include <linux/unistd.h>
-#include <linux/delay.h>
-#include <linux/reboot.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/cpu.h>
-#include <linux/elfcore.h>
-#include <linux/pm.h>
-#include <linux/tick.h>
-#include <linux/utsname.h>
-#include <linux/uaccess.h>
-#include <linux/random.h>
-#include <linux/gpio.h>
-#include <linux/stacktrace.h>
-
-#include <asm/cacheflush.h>
-#include <asm/processor.h>
-#include <asm/stacktrace.h>
-
-#include "setup.h"
-
-static const char * const processor_modes[] = {
-	"UK00", "UK01", "UK02", "UK03", "UK04", "UK05", "UK06", "UK07",
-	"UK08", "UK09", "UK0A", "UK0B", "UK0C", "UK0D", "UK0E", "UK0F",
-	"USER", "REAL", "INTR", "PRIV", "UK14", "UK15", "UK16", "ABRT",
-	"UK18", "UK19", "UK1A", "EXTN", "UK1C", "UK1D", "UK1E", "SUSR"
-};
-
-void arch_cpu_idle(void)
-{
-	cpu_do_idle();
-	local_irq_enable();
-}
-
-void machine_halt(void)
-{
-	gpio_set_value(GPO_SOFT_OFF, 0);
-}
-
-/*
- * Function pointers to optional machine specific functions
- */
-void (*pm_power_off)(void) = NULL;
-EXPORT_SYMBOL(pm_power_off);
-
-void machine_power_off(void)
-{
-	if (pm_power_off)
-		pm_power_off();
-	machine_halt();
-}
-
-void machine_restart(char *cmd)
-{
-	/* Disable interrupts first */
-	local_irq_disable();
-
-	/*
-	 * Tell the mm system that we are going to reboot -
-	 * we may need it to insert some 1:1 mappings so that
-	 * soft boot works.
-	 */
-	setup_mm_for_reboot();
-
-	/* Clean and invalidate caches */
-	flush_cache_all();
-
-	/* Turn off caching */
-	cpu_proc_fin();
-
-	/* Push out any further dirty data, and ensure cache is empty */
-	flush_cache_all();
-
-	/*
-	 * Now handle reboot code.
-	 */
-	if (reboot_mode == REBOOT_SOFT) {
-		/* Jump into ROM at address 0xffff0000 */
-		cpu_reset(VECTORS_BASE);
-	} else {
-		writel(0x00002001, PM_PLLSYSCFG); /* cpu clk = 250M */
-		writel(0x00100800, PM_PLLDDRCFG); /* ddr clk =  44M */
-		writel(0x00002001, PM_PLLVGACFG); /* vga clk = 250M */
-
-		/* Use on-chip reset capability */
-		/* following instructions must be in one icache line */
-		__asm__ __volatile__(
-			"	.align 5\n\t"
-			"	stw	%1, [%0]\n\t"
-			"201:	ldw	r0, [%0]\n\t"
-			"	cmpsub.a	r0, #0\n\t"
-			"	bne	201b\n\t"
-			"	stw	%3, [%2]\n\t"
-			"	nop; nop; nop\n\t"
-			/* prefetch 3 instructions at most */
-			:
-			: "r" (PM_PMCR),
-			  "r" (PM_PMCR_CFBSYS | PM_PMCR_CFBDDR
-				| PM_PMCR_CFBVGA),
-			  "r" (RESETC_SWRR),
-			  "r" (RESETC_SWRR_SRB)
-			: "r0", "memory");
-	}
-
-	/*
-	 * Whoops - the architecture was unable to reboot.
-	 * Tell the user!
-	 */
-	mdelay(1000);
-	printk(KERN_EMERG "Reboot failed -- System halted\n");
-	do { } while (1);
-}
-
-void __show_regs(struct pt_regs *regs)
-{
-	unsigned long flags;
-	char buf[64];
-
-	show_regs_print_info(KERN_DEFAULT);
-	printk("PC is at %pS\n", (void *)instruction_pointer(regs));
-	printk("LR is at %pS\n", (void *)regs->UCreg_lr);
-	printk(KERN_DEFAULT "pc : [<%08lx>]    lr : [<%08lx>]    psr: %08lx\n"
-	       "sp : %08lx  ip : %08lx  fp : %08lx\n",
-		regs->UCreg_pc, regs->UCreg_lr, regs->UCreg_asr,
-		regs->UCreg_sp, regs->UCreg_ip, regs->UCreg_fp);
-	printk(KERN_DEFAULT "r26: %08lx  r25: %08lx  r24: %08lx\n",
-		regs->UCreg_26, regs->UCreg_25,
-		regs->UCreg_24);
-	printk(KERN_DEFAULT "r23: %08lx  r22: %08lx  r21: %08lx  r20: %08lx\n",
-		regs->UCreg_23, regs->UCreg_22,
-		regs->UCreg_21, regs->UCreg_20);
-	printk(KERN_DEFAULT "r19: %08lx  r18: %08lx  r17: %08lx  r16: %08lx\n",
-		regs->UCreg_19, regs->UCreg_18,
-		regs->UCreg_17, regs->UCreg_16);
-	printk(KERN_DEFAULT "r15: %08lx  r14: %08lx  r13: %08lx  r12: %08lx\n",
-		regs->UCreg_15, regs->UCreg_14,
-		regs->UCreg_13, regs->UCreg_12);
-	printk(KERN_DEFAULT "r11: %08lx  r10: %08lx  r9 : %08lx  r8 : %08lx\n",
-		regs->UCreg_11, regs->UCreg_10,
-		regs->UCreg_09, regs->UCreg_08);
-	printk(KERN_DEFAULT "r7 : %08lx  r6 : %08lx  r5 : %08lx  r4 : %08lx\n",
-		regs->UCreg_07, regs->UCreg_06,
-		regs->UCreg_05, regs->UCreg_04);
-	printk(KERN_DEFAULT "r3 : %08lx  r2 : %08lx  r1 : %08lx  r0 : %08lx\n",
-		regs->UCreg_03, regs->UCreg_02,
-		regs->UCreg_01, regs->UCreg_00);
-
-	flags = regs->UCreg_asr;
-	buf[0] = flags & PSR_S_BIT ? 'S' : 's';
-	buf[1] = flags & PSR_Z_BIT ? 'Z' : 'z';
-	buf[2] = flags & PSR_C_BIT ? 'C' : 'c';
-	buf[3] = flags & PSR_V_BIT ? 'V' : 'v';
-	buf[4] = '\0';
-
-	printk(KERN_DEFAULT "Flags: %s  INTR o%s  REAL o%s  Mode %s  Segment %s\n",
-		buf, interrupts_enabled(regs) ? "n" : "ff",
-		fast_interrupts_enabled(regs) ? "n" : "ff",
-		processor_modes[processor_mode(regs)],
-		uaccess_kernel() ? "kernel" : "user");
-	{
-		unsigned int ctrl;
-
-		buf[0] = '\0';
-		{
-			unsigned int transbase;
-			asm("movc %0, p0.c2, #0\n"
-			    : "=r" (transbase));
-			snprintf(buf, sizeof(buf), "  Table: %08x", transbase);
-		}
-		asm("movc %0, p0.c1, #0\n" : "=r" (ctrl));
-
-		printk(KERN_DEFAULT "Control: %08x%s\n", ctrl, buf);
-	}
-}
-
-void show_regs(struct pt_regs *regs)
-{
-	printk(KERN_DEFAULT "\n");
-	printk(KERN_DEFAULT "Pid: %d, comm: %20s\n",
-			task_pid_nr(current), current->comm);
-	__show_regs(regs);
-	__backtrace();
-}
-
-void flush_thread(void)
-{
-	struct thread_info *thread = current_thread_info();
-	struct task_struct *tsk = current;
-
-	memset(thread->used_cp, 0, sizeof(thread->used_cp));
-	memset(&tsk->thread.debug, 0, sizeof(struct debug_info));
-#ifdef CONFIG_UNICORE_FPU_F64
-	memset(&thread->fpstate, 0, sizeof(struct fp_state));
-#endif
-}
-
-void release_thread(struct task_struct *dead_task)
-{
-}
-
-asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
-asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread");
-
-int
-copy_thread(unsigned long clone_flags, unsigned long stack_start,
-	    unsigned long stk_sz, struct task_struct *p)
-{
-	struct thread_info *thread = task_thread_info(p);
-	struct pt_regs *childregs = task_pt_regs(p);
-
-	memset(&thread->cpu_context, 0, sizeof(struct cpu_context_save));
-	thread->cpu_context.sp = (unsigned long)childregs;
-	if (unlikely(p->flags & PF_KTHREAD)) {
-		thread->cpu_context.pc = (unsigned long)ret_from_kernel_thread;
-		thread->cpu_context.r4 = stack_start;
-		thread->cpu_context.r5 = stk_sz;
-		memset(childregs, 0, sizeof(struct pt_regs));
-	} else {
-		thread->cpu_context.pc = (unsigned long)ret_from_fork;
-		*childregs = *current_pt_regs();
-		childregs->UCreg_00 = 0;
-		if (stack_start)
-			childregs->UCreg_sp = stack_start;
-
-		if (clone_flags & CLONE_SETTLS)
-			childregs->UCreg_16 = childregs->UCreg_03;
-	}
-	return 0;
-}
-
-/*
- * Fill in the task's elfregs structure for a core dump.
- */
-int dump_task_regs(struct task_struct *t, elf_gregset_t *elfregs)
-{
-	elf_core_copy_regs(elfregs, task_pt_regs(t));
-	return 1;
-}
-
-/*
- * fill in the fpe structure for a core dump...
- */
-int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fp)
-{
-	struct thread_info *thread = current_thread_info();
-	int used_math = thread->used_cp[1] | thread->used_cp[2];
-
-#ifdef CONFIG_UNICORE_FPU_F64
-	if (used_math)
-		memcpy(fp, &thread->fpstate, sizeof(*fp));
-#endif
-	return used_math != 0;
-}
-EXPORT_SYMBOL(dump_fpu);
-
-unsigned long get_wchan(struct task_struct *p)
-{
-	struct stackframe frame;
-	int count = 0;
-	if (!p || p == current || p->state == TASK_RUNNING)
-		return 0;
-
-	frame.fp = thread_saved_fp(p);
-	frame.sp = thread_saved_sp(p);
-	frame.lr = 0;			/* recovered from the stack */
-	frame.pc = thread_saved_pc(p);
-	do {
-		int ret = unwind_frame(&frame);
-		if (ret < 0)
-			return 0;
-		if (!in_sched_functions(frame.pc))
-			return frame.pc;
-	} while ((count++) < 16);
-	return 0;
-}
-
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
-	return randomize_page(mm->brk, 0x02000000);
-}
-
-/*
- * The vectors page is always readable from user space for the
- * atomic helpers and the signal restart code.  Let's declare a mapping
- * for it so it is visible through ptrace and /proc/<pid>/mem.
- */
-
-int vectors_user_mapping(void)
-{
-	struct mm_struct *mm = current->mm;
-	return install_special_mapping(mm, 0xffff0000, PAGE_SIZE,
-				       VM_READ | VM_EXEC |
-				       VM_MAYREAD | VM_MAYEXEC |
-				       VM_DONTEXPAND | VM_DONTDUMP,
-				       NULL);
-}
-
-const char *arch_vma_name(struct vm_area_struct *vma)
-{
-	return (vma->vm_start == 0xffff0000) ? "[vectors]" : NULL;
-}
diff --git a/arch/unicore32/kernel/ptrace.c b/arch/unicore32/kernel/ptrace.c
deleted file mode 100644
index 0f216567b90a..000000000000
--- a/arch/unicore32/kernel/ptrace.c
+++ /dev/null
@@ -1,147 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/ptrace.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- * By Ross Biro 1/23/92
- */
-#include <linux/kernel.h>
-#include <linux/ptrace.h>
-#include <linux/signal.h>
-#include <linux/uaccess.h>
-#include <linux/sched/task_stack.h>
-
-/*
- * this routine will get a word off of the processes privileged stack.
- * the offset is how far from the base addr as stored in the THREAD.
- * this routine assumes that all the privileged stacks are in our
- * data space.
- */
-static inline long get_user_reg(struct task_struct *task, int offset)
-{
-	return task_pt_regs(task)->uregs[offset];
-}
-
-/*
- * this routine will put a word on the processes privileged stack.
- * the offset is how far from the base addr as stored in the THREAD.
- * this routine assumes that all the privileged stacks are in our
- * data space.
- */
-static inline int
-put_user_reg(struct task_struct *task, int offset, long data)
-{
-	struct pt_regs newregs, *regs = task_pt_regs(task);
-	int ret = -EINVAL;
-
-	newregs = *regs;
-	newregs.uregs[offset] = data;
-
-	if (valid_user_regs(&newregs)) {
-		regs->uregs[offset] = data;
-		ret = 0;
-	}
-
-	return ret;
-}
-
-/*
- * Called by kernel/ptrace.c when detaching..
- */
-void ptrace_disable(struct task_struct *child)
-{
-}
-
-/*
- * We actually access the pt_regs stored on the kernel stack.
- */
-static int ptrace_read_user(struct task_struct *tsk, unsigned long off,
-			    unsigned long __user *ret)
-{
-	unsigned long tmp;
-
-	tmp = 0;
-	if (off < sizeof(struct pt_regs))
-		tmp = get_user_reg(tsk, off >> 2);
-
-	return put_user(tmp, ret);
-}
-
-/*
- * We actually access the pt_regs stored on the kernel stack.
- */
-static int ptrace_write_user(struct task_struct *tsk, unsigned long off,
-			     unsigned long val)
-{
-	if (off >= sizeof(struct pt_regs))
-		return 0;
-
-	return put_user_reg(tsk, off >> 2, val);
-}
-
-long arch_ptrace(struct task_struct *child, long request,
-		 unsigned long addr, unsigned long data)
-{
-	int ret;
-	unsigned long __user *datap = (unsigned long __user *) data;
-
-	switch (request) {
-	case PTRACE_PEEKUSR:
-		ret = ptrace_read_user(child, addr, datap);
-		break;
-
-	case PTRACE_POKEUSR:
-		ret = ptrace_write_user(child, addr, data);
-		break;
-
-	case PTRACE_GET_THREAD_AREA:
-		ret = put_user(task_pt_regs(child)->UCreg_16,
-			       datap);
-		break;
-
-	default:
-		ret = ptrace_request(child, request, addr, data);
-		break;
-	}
-
-	return ret;
-}
-
-asmlinkage int syscall_trace(int why, struct pt_regs *regs, int scno)
-{
-	unsigned long ip;
-
-	if (!test_thread_flag(TIF_SYSCALL_TRACE))
-		return scno;
-	if (!(current->ptrace & PT_PTRACED))
-		return scno;
-
-	/*
-	 * Save IP.  IP is used to denote syscall entry/exit:
-	 *  IP = 0 -> entry, = 1 -> exit
-	 */
-	ip = regs->UCreg_ip;
-	regs->UCreg_ip = why;
-
-	current_thread_info()->syscall = scno;
-
-	/* the 0x80 provides a way for the tracing parent to distinguish
-	   between a syscall stop and SIGTRAP delivery */
-	ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
-				 ? 0x80 : 0));
-	/*
-	 * this isn't the same as continuing with a signal, but it will do
-	 * for normal use.  strace only continues with a signal if the
-	 * stopping signal is not SIGTRAP.  -brl
-	 */
-	if (current->exit_code) {
-		send_sig(current->exit_code, current, 1);
-		current->exit_code = 0;
-	}
-	regs->UCreg_ip = ip;
-
-	return current_thread_info()->syscall;
-}
diff --git a/arch/unicore32/kernel/puv3-core.c b/arch/unicore32/kernel/puv3-core.c
deleted file mode 100644
index 78f12e627365..000000000000
--- a/arch/unicore32/kernel/puv3-core.c
+++ /dev/null
@@ -1,276 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  linux/arch/unicore32/kernel/puv3-core.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-
-#include <linux/init.h>
-#include <linux/device.h>
-#include <linux/amba/bus.h>
-#include <linux/platform_device.h>
-#include <linux/io.h>
-#include <linux/cnt32_to_63.h>
-#include <linux/usb/musb.h>
-
-#include <asm/irq.h>
-#include <mach/hardware.h>
-#include <mach/pm.h>
-
-/*
- * This is the PKUnity sched_clock implementation.  This has
- * a resolution of 271ns, and a maximum value of 32025597s (370 days).
- *
- * The return value is guaranteed to be monotonic in that range as
- * long as there is always less than 582 seconds between successive
- * calls to this function.
- *
- *  ( * 1E9 / CLOCK_TICK_RATE ) -> about 2235/32
- */
-unsigned long long sched_clock(void)
-{
-	unsigned long long v = cnt32_to_63(readl(OST_OSCR));
-
-	/* original conservative method, but overflow frequently
-	 * v *= NSEC_PER_SEC >> 12;
-	 * do_div(v, CLOCK_TICK_RATE >> 12);
-	 */
-	v = ((v & 0x7fffffffffffffffULL) * 2235) >> 5;
-
-	return v;
-}
-
-static struct resource puv3_usb_resources[] = {
-	/* order is significant! */
-	{
-		.start		= io_v2p(PKUNITY_USB_BASE),
-		.end		= io_v2p(PKUNITY_USB_BASE) + 0x3ff,
-		.flags		= IORESOURCE_MEM,
-	}, {
-		.start		= IRQ_USB,
-		.flags		= IORESOURCE_IRQ,
-	}, {
-		.start		= IRQ_USB,
-		.flags		= IORESOURCE_IRQ,
-	},
-};
-
-static struct musb_hdrc_config	puv3_usb_config[] = {
-	{
-		.num_eps = 16,
-		.multipoint = 1,
-#ifdef CONFIG_USB_INVENTRA_DMA
-		.dma = 1,
-		.dma_channels = 8,
-#endif
-	},
-};
-
-static struct musb_hdrc_platform_data puv3_usb_plat = {
-	.mode		= MUSB_HOST,
-	.min_power	= 100,
-	.clock		= 0,
-	.config		= puv3_usb_config,
-};
-
-static struct resource puv3_mmc_resources[] = {
-	[0] = {
-		.start	= io_v2p(PKUNITY_SDC_BASE),
-		.end	= io_v2p(PKUNITY_SDC_BASE) + 0xfff,
-		.flags	= IORESOURCE_MEM,
-	},
-	[1] = {
-		.start	= IRQ_SDC,
-		.end	= IRQ_SDC,
-		.flags	= IORESOURCE_IRQ,
-	},
-};
-
-static struct resource puv3_unigfx_resources[] = {
-	[0] = {
-		.start	= io_v2p(PKUNITY_UNIGFX_BASE),
-		.end	= io_v2p(PKUNITY_UNIGFX_BASE) + 0xfff,
-		.flags	= IORESOURCE_MEM,
-	},
-};
-
-static struct resource puv3_rtc_resources[] = {
-	[0] = {
-		.start = io_v2p(PKUNITY_RTC_BASE),
-		.end   = io_v2p(PKUNITY_RTC_BASE) + 0xff,
-		.flags = IORESOURCE_MEM,
-	},
-	[1] = {
-		.start = IRQ_RTCAlarm,
-		.end   = IRQ_RTCAlarm,
-		.flags = IORESOURCE_IRQ,
-	},
-	[2] = {
-		.start = IRQ_RTC,
-		.end   = IRQ_RTC,
-		.flags = IORESOURCE_IRQ
-	}
-};
-
-static struct resource puv3_pwm_resources[] = {
-	[0] = {
-		.start	= io_v2p(PKUNITY_OST_BASE) + 0x80,
-		.end	= io_v2p(PKUNITY_OST_BASE) + 0xff,
-		.flags	= IORESOURCE_MEM,
-	},
-};
-
-static struct resource puv3_uart0_resources[] = {
-	[0] = {
-		.start = io_v2p(PKUNITY_UART0_BASE),
-		.end   = io_v2p(PKUNITY_UART0_BASE) + 0xff,
-		.flags = IORESOURCE_MEM,
-	},
-	[1] = {
-		.start = IRQ_UART0,
-		.end   = IRQ_UART0,
-		.flags = IORESOURCE_IRQ
-	}
-};
-
-static struct resource puv3_uart1_resources[] = {
-	[0] = {
-		.start = io_v2p(PKUNITY_UART1_BASE),
-		.end   = io_v2p(PKUNITY_UART1_BASE) + 0xff,
-		.flags = IORESOURCE_MEM,
-	},
-	[1] = {
-		.start = IRQ_UART1,
-		.end   = IRQ_UART1,
-		.flags = IORESOURCE_IRQ
-	}
-};
-
-static struct resource puv3_umal_resources[] = {
-	[0] = {
-		.start = io_v2p(PKUNITY_UMAL_BASE),
-		.end   = io_v2p(PKUNITY_UMAL_BASE) + 0x1fff,
-		.flags = IORESOURCE_MEM,
-	},
-	[1] = {
-		.start = IRQ_UMAL,
-		.end   = IRQ_UMAL,
-		.flags = IORESOURCE_IRQ
-	}
-};
-
-#ifdef CONFIG_PUV3_PM
-
-#define SAVE(x)		sleep_save[SLEEP_SAVE_##x] = x
-#define RESTORE(x)	x = sleep_save[SLEEP_SAVE_##x]
-
-/*
- * List of global PXA peripheral registers to preserve.
- * More ones like CP and general purpose register values are preserved
- * with the stack pointer in sleep.S.
- */
-enum {
-	SLEEP_SAVE_PM_PLLDDRCFG,
-	SLEEP_SAVE_COUNT
-};
-
-
-static void puv3_cpu_pm_save(unsigned long *sleep_save)
-{
-/*	SAVE(PM_PLLDDRCFG); */
-}
-
-static void puv3_cpu_pm_restore(unsigned long *sleep_save)
-{
-/*	RESTORE(PM_PLLDDRCFG); */
-}
-
-static int puv3_cpu_pm_prepare(void)
-{
-	/* set resume return address */
-	writel(virt_to_phys(puv3_cpu_resume), PM_DIVCFG);
-	return 0;
-}
-
-static void puv3_cpu_pm_enter(suspend_state_t state)
-{
-	/* Clear reset status */
-	writel(RESETC_RSSR_HWR | RESETC_RSSR_WDR
-			| RESETC_RSSR_SMR | RESETC_RSSR_SWR, RESETC_RSSR);
-
-	switch (state) {
-/*	case PM_SUSPEND_ON:
-		puv3_cpu_idle();
-		break; */
-	case PM_SUSPEND_MEM:
-		puv3_cpu_pm_prepare();
-		puv3_cpu_suspend(PM_PMCR_SFB);
-		break;
-	}
-}
-
-static int puv3_cpu_pm_valid(suspend_state_t state)
-{
-	return state == PM_SUSPEND_MEM;
-}
-
-static void puv3_cpu_pm_finish(void)
-{
-	/* ensure not to come back here if it wasn't intended */
-	/* PSPR = 0; */
-}
-
-static struct puv3_cpu_pm_fns puv3_cpu_pm_fnss = {
-	.save_count	= SLEEP_SAVE_COUNT,
-	.valid		= puv3_cpu_pm_valid,
-	.save		= puv3_cpu_pm_save,
-	.restore	= puv3_cpu_pm_restore,
-	.enter		= puv3_cpu_pm_enter,
-	.prepare	= puv3_cpu_pm_prepare,
-	.finish		= puv3_cpu_pm_finish,
-};
-
-static void __init puv3_init_pm(void)
-{
-	puv3_cpu_pm_fns = &puv3_cpu_pm_fnss;
-}
-#else
-static inline void puv3_init_pm(void) {}
-#endif
-
-void puv3_ps2_init(void)
-{
-	struct clk *bclk32;
-
-	bclk32 = clk_get(NULL, "BUS32_CLK");
-	writel(clk_get_rate(bclk32) / 200000, PS2_CNT); /* should > 5us */
-}
-
-void __init puv3_core_init(void)
-{
-	puv3_init_pm();
-	puv3_ps2_init();
-
-	platform_device_register_simple("PKUnity-v3-RTC", -1,
-			puv3_rtc_resources, ARRAY_SIZE(puv3_rtc_resources));
-	platform_device_register_simple("PKUnity-v3-UMAL", -1,
-			puv3_umal_resources, ARRAY_SIZE(puv3_umal_resources));
-	platform_device_register_simple("PKUnity-v3-MMC", -1,
-			puv3_mmc_resources, ARRAY_SIZE(puv3_mmc_resources));
-	platform_device_register_simple("PKUnity-v3-UNIGFX", -1,
-			puv3_unigfx_resources, ARRAY_SIZE(puv3_unigfx_resources));
-	platform_device_register_simple("PKUnity-v3-PWM", -1,
-			puv3_pwm_resources, ARRAY_SIZE(puv3_pwm_resources));
-	platform_device_register_simple("PKUnity-v3-UART", 0,
-			puv3_uart0_resources, ARRAY_SIZE(puv3_uart0_resources));
-	platform_device_register_simple("PKUnity-v3-UART", 1,
-			puv3_uart1_resources, ARRAY_SIZE(puv3_uart1_resources));
-	platform_device_register_simple("PKUnity-v3-AC97", -1, NULL, 0);
-	platform_device_register_resndata(NULL, "musb_hdrc", -1,
-			puv3_usb_resources, ARRAY_SIZE(puv3_usb_resources),
-			&puv3_usb_plat, sizeof(puv3_usb_plat));
-}
-
diff --git a/arch/unicore32/kernel/puv3-nb0916.c b/arch/unicore32/kernel/puv3-nb0916.c
deleted file mode 100644
index e251f5028396..000000000000
--- a/arch/unicore32/kernel/puv3-nb0916.c
+++ /dev/null
@@ -1,147 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/puv3-nb0916.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-
-#include <linux/init.h>
-#include <linux/device.h>
-#include <linux/platform_device.h>
-#include <linux/mtd/physmap.h>
-#include <linux/io.h>
-#include <linux/reboot.h>
-#include <linux/interrupt.h>
-#include <linux/i2c.h>
-#include <linux/pwm.h>
-#include <linux/pwm_backlight.h>
-#include <linux/gpio.h>
-#include <linux/gpio_keys.h>
-#include <linux/input.h>
-
-#include <mach/hardware.h>
-
-static struct physmap_flash_data physmap_flash_data = {
-	.width		= 1,
-};
-
-static struct resource physmap_flash_resource = {
-	.start		= 0xFFF80000,
-	.end		= 0xFFFFFFFF,
-	.flags		= IORESOURCE_MEM,
-};
-
-static struct resource puv3_i2c_resources[] = {
-	[0] = {
-		.start = io_v2p(PKUNITY_I2C_BASE),
-		.end   = io_v2p(PKUNITY_I2C_BASE) + 0xff,
-		.flags = IORESOURCE_MEM,
-	},
-	[1] = {
-		.start = IRQ_I2C,
-		.end   = IRQ_I2C,
-		.flags = IORESOURCE_IRQ,
-	}
-};
-
-static struct pwm_lookup nb0916_pwm_lookup[] = {
-	PWM_LOOKUP("PKUnity-v3-PWM", 0, "pwm-backlight", NULL, 70 * 1024,
-		   PWM_POLARITY_NORMAL),
-};
-
-static struct platform_pwm_backlight_data nb0916_backlight_data = {
-	.max_brightness	= 100,
-	.dft_brightness	= 100,
-};
-
-static struct gpio_keys_button nb0916_gpio_keys[] = {
-	{
-		.type	= EV_KEY,
-		.code	= KEY_POWER,
-		.gpio	= GPI_SOFF_REQ,
-		.desc	= "Power Button",
-		.wakeup = 1,
-		.active_low = 1,
-	},
-	{
-		.type	= EV_KEY,
-		.code	= BTN_TOUCH,
-		.gpio	= GPI_BTN_TOUCH,
-		.desc	= "Touchpad Button",
-		.wakeup = 1,
-		.active_low = 1,
-	},
-};
-
-static struct gpio_keys_platform_data nb0916_gpio_button_data = {
-	.buttons	= nb0916_gpio_keys,
-	.nbuttons	= ARRAY_SIZE(nb0916_gpio_keys),
-};
-
-static irqreturn_t nb0916_lcdcaseoff_handler(int irq, void *dev_id)
-{
-	if (gpio_get_value(GPI_LCD_CASE_OFF))
-		gpio_set_value(GPO_LCD_EN, 1);
-	else
-		gpio_set_value(GPO_LCD_EN, 0);
-
-	return IRQ_HANDLED;
-}
-
-static irqreturn_t nb0916_overheat_handler(int irq, void *dev_id)
-{
-	machine_halt();
-	/* SYSTEM HALT, NO RETURN */
-	return IRQ_HANDLED;
-}
-
-static struct i2c_board_info __initdata puv3_i2c_devices[] = {
-	{	I2C_BOARD_INFO("lm75",		I2C_TAR_THERMAL),	},
-	{	I2C_BOARD_INFO("bq27200",	I2C_TAR_PWIC),		},
-	{	I2C_BOARD_INFO("24c02",		I2C_TAR_EEPROM),	},
-};
-
-int __init mach_nb0916_init(void)
-{
-	i2c_register_board_info(0, puv3_i2c_devices,
-			ARRAY_SIZE(puv3_i2c_devices));
-
-	platform_device_register_simple("PKUnity-v3-I2C", -1,
-			puv3_i2c_resources, ARRAY_SIZE(puv3_i2c_resources));
-
-	pwm_add_table(nb0916_pwm_lookup, ARRAY_SIZE(nb0916_pwm_lookup));
-
-	platform_device_register_data(NULL, "pwm-backlight", -1,
-			&nb0916_backlight_data, sizeof(nb0916_backlight_data));
-
-	platform_device_register_data(NULL, "gpio-keys", -1,
-			&nb0916_gpio_button_data, sizeof(nb0916_gpio_button_data));
-
-	platform_device_register_resndata(NULL, "physmap-flash", -1,
-			&physmap_flash_resource, 1,
-			&physmap_flash_data, sizeof(physmap_flash_data));
-
-	if (request_irq(gpio_to_irq(GPI_LCD_CASE_OFF),
-		&nb0916_lcdcaseoff_handler,
-		IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING,
-		"NB0916 lcd case off", NULL) < 0) {
-
-		printk(KERN_DEBUG "LCD-Case-OFF IRQ %d not available\n",
-			gpio_to_irq(GPI_LCD_CASE_OFF));
-	}
-
-	if (request_irq(gpio_to_irq(GPI_OTP_INT), &nb0916_overheat_handler,
-		IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING,
-		"NB0916 overheating protection", NULL) < 0) {
-
-		printk(KERN_DEBUG "Overheating Protection IRQ %d not available\n",
-			gpio_to_irq(GPI_OTP_INT));
-	}
-
-	return 0;
-}
-
-subsys_initcall_sync(mach_nb0916_init);
diff --git a/arch/unicore32/kernel/setup.c b/arch/unicore32/kernel/setup.c
deleted file mode 100644
index 0c4242a5ee1d..000000000000
--- a/arch/unicore32/kernel/setup.c
+++ /dev/null
@@ -1,352 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/setup.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/stddef.h>
-#include <linux/ioport.h>
-#include <linux/delay.h>
-#include <linux/utsname.h>
-#include <linux/initrd.h>
-#include <linux/console.h>
-#include <linux/memblock.h>
-#include <linux/seq_file.h>
-#include <linux/screen_info.h>
-#include <linux/init.h>
-#include <linux/root_dev.h>
-#include <linux/cpu.h>
-#include <linux/interrupt.h>
-#include <linux/smp.h>
-#include <linux/fs.h>
-#include <linux/proc_fs.h>
-#include <linux/elf.h>
-#include <linux/io.h>
-
-#include <asm/cputype.h>
-#include <asm/sections.h>
-#include <asm/setup.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-#include <asm/traps.h>
-#include <asm/memblock.h>
-
-#include "setup.h"
-
-#ifndef MEM_SIZE
-#define MEM_SIZE	(16*1024*1024)
-#endif
-
-struct stack {
-	u32 irq[3];
-	u32 abt[3];
-	u32 und[3];
-} ____cacheline_aligned;
-
-static struct stack stacks[NR_CPUS];
-
-#ifdef CONFIG_VGA_CONSOLE
-struct screen_info screen_info;
-#endif
-
-char elf_platform[ELF_PLATFORM_SIZE];
-EXPORT_SYMBOL(elf_platform);
-
-static char __initdata cmd_line[COMMAND_LINE_SIZE];
-
-static char default_command_line[COMMAND_LINE_SIZE] __initdata = CONFIG_CMDLINE;
-
-/*
- * Standard memory resources
- */
-static struct resource mem_res[] = {
-	{
-		.name = "Kernel code",
-		.start = 0,
-		.end = 0,
-		.flags = IORESOURCE_SYSTEM_RAM
-	},
-	{
-		.name = "Kernel data",
-		.start = 0,
-		.end = 0,
-		.flags = IORESOURCE_SYSTEM_RAM
-	}
-};
-
-#define kernel_code mem_res[0]
-#define kernel_data mem_res[1]
-
-/*
- * These functions re-use the assembly code in head.S, which
- * already provide the required functionality.
- */
-static void __init setup_processor(void)
-{
-	printk(KERN_DEFAULT "CPU: UniCore-II [%08x] revision %d, cr=%08lx\n",
-	       uc32_cpuid, (int)(uc32_cpuid >> 16) & 15, cr_alignment);
-
-	sprintf(init_utsname()->machine, "puv3");
-	sprintf(elf_platform, "ucv2");
-}
-
-/*
- * cpu_init - initialise one CPU.
- *
- * cpu_init sets up the per-CPU stacks.
- */
-void cpu_init(void)
-{
-	unsigned int cpu = smp_processor_id();
-	struct stack *stk = &stacks[cpu];
-
-	/*
-	 * setup stacks for re-entrant exception handlers
-	 */
-	__asm__ (
-	"mov.a	asr, %1\n\t"
-	"add	sp, %0, %2\n\t"
-	"mov.a	asr, %3\n\t"
-	"add	sp, %0, %4\n\t"
-	"mov.a	asr, %5\n\t"
-	"add	sp, %0, %6\n\t"
-	"mov.a	asr, %7"
-	    :
-	    : "r" (stk),
-	      "r" (PSR_R_BIT | PSR_I_BIT | INTR_MODE),
-	      "I" (offsetof(struct stack, irq[0])),
-	      "r" (PSR_R_BIT | PSR_I_BIT | ABRT_MODE),
-	      "I" (offsetof(struct stack, abt[0])),
-	      "r" (PSR_R_BIT | PSR_I_BIT | EXTN_MODE),
-	      "I" (offsetof(struct stack, und[0])),
-	      "r" (PSR_R_BIT | PSR_I_BIT | PRIV_MODE)
-	: "r30", "cc");
-}
-
-static int __init uc32_add_memory(unsigned long start, unsigned long size)
-{
-	struct membank *bank = &meminfo.bank[meminfo.nr_banks];
-
-	if (meminfo.nr_banks >= NR_BANKS) {
-		printk(KERN_CRIT "NR_BANKS too low, "
-			"ignoring memory at %#lx\n", start);
-		return -EINVAL;
-	}
-
-	/*
-	 * Ensure that start/size are aligned to a page boundary.
-	 * Size is appropriately rounded down, start is rounded up.
-	 */
-	size -= start & ~PAGE_MASK;
-
-	bank->start = PAGE_ALIGN(start);
-	bank->size  = size & PAGE_MASK;
-
-	/*
-	 * Check whether this memory region has non-zero size or
-	 * invalid node number.
-	 */
-	if (bank->size == 0)
-		return -EINVAL;
-
-	meminfo.nr_banks++;
-	return 0;
-}
-
-/*
- * Pick out the memory size.  We look for mem=size@start,
- * where start and size are "size[KkMm]"
- */
-static int __init early_mem(char *p)
-{
-	static int usermem __initdata = 1;
-	unsigned long size, start;
-	char *endp;
-
-	/*
-	 * If the user specifies memory size, we
-	 * blow away any automatically generated
-	 * size.
-	 */
-	if (usermem) {
-		usermem = 0;
-		meminfo.nr_banks = 0;
-	}
-
-	start = PHYS_OFFSET;
-	size  = memparse(p, &endp);
-	if (*endp == '@')
-		start = memparse(endp + 1, NULL);
-
-	uc32_add_memory(start, size);
-
-	return 0;
-}
-early_param("mem", early_mem);
-
-static void __init
-request_standard_resources(struct meminfo *mi)
-{
-	struct resource *res;
-	int i;
-
-	kernel_code.start   = virt_to_phys(_stext);
-	kernel_code.end     = virt_to_phys(_etext - 1);
-	kernel_data.start   = virt_to_phys(_sdata);
-	kernel_data.end     = virt_to_phys(_end - 1);
-
-	for (i = 0; i < mi->nr_banks; i++) {
-		if (mi->bank[i].size == 0)
-			continue;
-
-		res = memblock_alloc_low(sizeof(*res), SMP_CACHE_BYTES);
-		if (!res)
-			panic("%s: Failed to allocate %zu bytes align=%x\n",
-			      __func__, sizeof(*res), SMP_CACHE_BYTES);
-
-		res->name  = "System RAM";
-		res->start = mi->bank[i].start;
-		res->end   = mi->bank[i].start + mi->bank[i].size - 1;
-		res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
-
-		request_resource(&iomem_resource, res);
-
-		if (kernel_code.start >= res->start &&
-		    kernel_code.end <= res->end)
-			request_resource(res, &kernel_code);
-		if (kernel_data.start >= res->start &&
-		    kernel_data.end <= res->end)
-			request_resource(res, &kernel_data);
-	}
-}
-
-static void (*init_machine)(void) __initdata;
-
-static int __init customize_machine(void)
-{
-	/* customizes platform devices, or adds new ones */
-	if (init_machine)
-		init_machine();
-	return 0;
-}
-arch_initcall(customize_machine);
-
-void __init setup_arch(char **cmdline_p)
-{
-	char *from = default_command_line;
-
-	setup_processor();
-
-	init_mm.start_code = (unsigned long) _stext;
-	init_mm.end_code   = (unsigned long) _etext;
-	init_mm.end_data   = (unsigned long) _edata;
-	init_mm.brk	   = (unsigned long) _end;
-
-	/* parse_early_param needs a boot_command_line */
-	strlcpy(boot_command_line, from, COMMAND_LINE_SIZE);
-
-	/* populate cmd_line too for later use, preserving boot_command_line */
-	strlcpy(cmd_line, boot_command_line, COMMAND_LINE_SIZE);
-	*cmdline_p = cmd_line;
-
-	parse_early_param();
-
-	uc32_memblock_init(&meminfo);
-
-	paging_init();
-	request_standard_resources(&meminfo);
-
-	cpu_init();
-
-	/*
-	 * Set up various architecture-specific pointers
-	 */
-	init_machine = puv3_core_init;
-
-#ifdef CONFIG_VT
-#if defined(CONFIG_VGA_CONSOLE)
-	conswitchp = &vga_con;
-#endif
-#endif
-	early_trap_init();
-}
-
-static struct cpu cpuinfo_unicore;
-
-static int __init topology_init(void)
-{
-	int i;
-
-	for_each_possible_cpu(i)
-		register_cpu(&cpuinfo_unicore, i);
-
-	return 0;
-}
-subsys_initcall(topology_init);
-
-#ifdef CONFIG_HAVE_PROC_CPU
-static int __init proc_cpu_init(void)
-{
-	struct proc_dir_entry *res;
-
-	res = proc_mkdir("cpu", NULL);
-	if (!res)
-		return -ENOMEM;
-	return 0;
-}
-fs_initcall(proc_cpu_init);
-#endif
-
-static int c_show(struct seq_file *m, void *v)
-{
-	seq_printf(m, "Processor\t: UniCore-II rev %d (%s)\n",
-		   (int)(uc32_cpuid >> 16) & 15, elf_platform);
-
-	seq_printf(m, "BogoMIPS\t: %lu.%02lu\n",
-		   loops_per_jiffy / (500000/HZ),
-		   (loops_per_jiffy / (5000/HZ)) % 100);
-
-	/* dump out the processor features */
-	seq_puts(m, "Features\t: CMOV UC-F64");
-
-	seq_printf(m, "\nCPU implementer\t: 0x%02x\n", uc32_cpuid >> 24);
-	seq_printf(m, "CPU architecture: 2\n");
-	seq_printf(m, "CPU revision\t: %d\n", (uc32_cpuid >> 16) & 15);
-
-	seq_printf(m, "Cache type\t: write-back\n"
-			"Cache clean\t: cp0 c5 ops\n"
-			"Cache lockdown\t: not support\n"
-			"Cache format\t: Harvard\n");
-
-	seq_puts(m, "\n");
-
-	seq_printf(m, "Hardware\t: PKUnity v3\n");
-
-	return 0;
-}
-
-static void *c_start(struct seq_file *m, loff_t *pos)
-{
-	return *pos < 1 ? (void *)1 : NULL;
-}
-
-static void *c_next(struct seq_file *m, void *v, loff_t *pos)
-{
-	++*pos;
-	return NULL;
-}
-
-static void c_stop(struct seq_file *m, void *v)
-{
-}
-
-const struct seq_operations cpuinfo_op = {
-	.start	= c_start,
-	.next	= c_next,
-	.stop	= c_stop,
-	.show	= c_show
-};
diff --git a/arch/unicore32/kernel/setup.h b/arch/unicore32/kernel/setup.h
deleted file mode 100644
index 967352323185..000000000000
--- a/arch/unicore32/kernel/setup.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/kernel/setup.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_KERNEL_SETUP_H__
-#define __UNICORE_KERNEL_SETUP_H__
-
-#include <asm/hwdef-copro.h>
-
-extern void paging_init(void);
-extern void puv3_core_init(void);
-extern void cpu_init(void);
-
-extern void puv3_ps2_init(void);
-extern void pci_puv3_preinit(void);
-extern void __init puv3_init_gpio(void);
-
-extern void setup_mm_for_reboot(void);
-
-extern char __stubs_start[], __stubs_end[];
-extern char __vectors_start[], __vectors_end[];
-
-extern void kernel_thread_helper(void);
-
-extern void __init early_signal_init(void);
-
-extern asmlinkage void __backtrace(void);
-extern asmlinkage void c_backtrace(unsigned long fp, const char *loglvl);
-
-extern void __show_regs(struct pt_regs *);
-
-#endif
diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c
deleted file mode 100644
index 3946182a835d..000000000000
--- a/arch/unicore32/kernel/signal.c
+++ /dev/null
@@ -1,424 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/signal.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/personality.h>
-#include <linux/uaccess.h>
-#include <linux/tracehook.h>
-#include <linux/elf.h>
-#include <linux/unistd.h>
-
-#include <asm/cacheflush.h>
-#include <asm/ucontext.h>
-
-/*
- * For UniCore syscalls, we encode the syscall number into the instruction.
- */
-#define SWI_SYS_SIGRETURN	(0xff000000) /* error number for new abi */
-#define SWI_SYS_RT_SIGRETURN	(0xff000000 | (__NR_rt_sigreturn))
-#define SWI_SYS_RESTART		(0xff000000 | (__NR_restart_syscall))
-
-#define KERN_SIGRETURN_CODE	(KUSER_VECPAGE_BASE + 0x00000500)
-#define KERN_RESTART_CODE	(KERN_SIGRETURN_CODE + sizeof(sigreturn_codes))
-
-const unsigned long sigreturn_codes[3] = {
-	SWI_SYS_SIGRETURN, SWI_SYS_RT_SIGRETURN,
-};
-
-const unsigned long syscall_restart_code[2] = {
-	SWI_SYS_RESTART,	/* swi	__NR_restart_syscall */
-	0x69efc004,		/* ldr	pc, [sp], #4 */
-};
-
-/*
- * Do a signal return; undo the signal stack.  These are aligned to 64-bit.
- */
-struct sigframe {
-	struct ucontext uc;
-	unsigned long retcode[2];
-};
-
-struct rt_sigframe {
-	struct siginfo info;
-	struct sigframe sig;
-};
-
-static int restore_sigframe(struct pt_regs *regs, struct sigframe __user *sf)
-{
-	sigset_t set;
-	int err;
-
-	err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set));
-	if (err == 0)
-		set_current_blocked(&set);
-
-	err |= __get_user(regs->UCreg_00, &sf->uc.uc_mcontext.regs.UCreg_00);
-	err |= __get_user(regs->UCreg_01, &sf->uc.uc_mcontext.regs.UCreg_01);
-	err |= __get_user(regs->UCreg_02, &sf->uc.uc_mcontext.regs.UCreg_02);
-	err |= __get_user(regs->UCreg_03, &sf->uc.uc_mcontext.regs.UCreg_03);
-	err |= __get_user(regs->UCreg_04, &sf->uc.uc_mcontext.regs.UCreg_04);
-	err |= __get_user(regs->UCreg_05, &sf->uc.uc_mcontext.regs.UCreg_05);
-	err |= __get_user(regs->UCreg_06, &sf->uc.uc_mcontext.regs.UCreg_06);
-	err |= __get_user(regs->UCreg_07, &sf->uc.uc_mcontext.regs.UCreg_07);
-	err |= __get_user(regs->UCreg_08, &sf->uc.uc_mcontext.regs.UCreg_08);
-	err |= __get_user(regs->UCreg_09, &sf->uc.uc_mcontext.regs.UCreg_09);
-	err |= __get_user(regs->UCreg_10, &sf->uc.uc_mcontext.regs.UCreg_10);
-	err |= __get_user(regs->UCreg_11, &sf->uc.uc_mcontext.regs.UCreg_11);
-	err |= __get_user(regs->UCreg_12, &sf->uc.uc_mcontext.regs.UCreg_12);
-	err |= __get_user(regs->UCreg_13, &sf->uc.uc_mcontext.regs.UCreg_13);
-	err |= __get_user(regs->UCreg_14, &sf->uc.uc_mcontext.regs.UCreg_14);
-	err |= __get_user(regs->UCreg_15, &sf->uc.uc_mcontext.regs.UCreg_15);
-	err |= __get_user(regs->UCreg_16, &sf->uc.uc_mcontext.regs.UCreg_16);
-	err |= __get_user(regs->UCreg_17, &sf->uc.uc_mcontext.regs.UCreg_17);
-	err |= __get_user(regs->UCreg_18, &sf->uc.uc_mcontext.regs.UCreg_18);
-	err |= __get_user(regs->UCreg_19, &sf->uc.uc_mcontext.regs.UCreg_19);
-	err |= __get_user(regs->UCreg_20, &sf->uc.uc_mcontext.regs.UCreg_20);
-	err |= __get_user(regs->UCreg_21, &sf->uc.uc_mcontext.regs.UCreg_21);
-	err |= __get_user(regs->UCreg_22, &sf->uc.uc_mcontext.regs.UCreg_22);
-	err |= __get_user(regs->UCreg_23, &sf->uc.uc_mcontext.regs.UCreg_23);
-	err |= __get_user(regs->UCreg_24, &sf->uc.uc_mcontext.regs.UCreg_24);
-	err |= __get_user(regs->UCreg_25, &sf->uc.uc_mcontext.regs.UCreg_25);
-	err |= __get_user(regs->UCreg_26, &sf->uc.uc_mcontext.regs.UCreg_26);
-	err |= __get_user(regs->UCreg_fp, &sf->uc.uc_mcontext.regs.UCreg_fp);
-	err |= __get_user(regs->UCreg_ip, &sf->uc.uc_mcontext.regs.UCreg_ip);
-	err |= __get_user(regs->UCreg_sp, &sf->uc.uc_mcontext.regs.UCreg_sp);
-	err |= __get_user(regs->UCreg_lr, &sf->uc.uc_mcontext.regs.UCreg_lr);
-	err |= __get_user(regs->UCreg_pc, &sf->uc.uc_mcontext.regs.UCreg_pc);
-	err |= __get_user(regs->UCreg_asr, &sf->uc.uc_mcontext.regs.UCreg_asr);
-
-	err |= !valid_user_regs(regs);
-
-	return err;
-}
-
-asmlinkage int __sys_rt_sigreturn(struct pt_regs *regs)
-{
-	struct rt_sigframe __user *frame;
-
-	/* Always make any pending restarted system calls return -EINTR */
-	current->restart_block.fn = do_no_restart_syscall;
-
-	/*
-	 * Since we stacked the signal on a 64-bit boundary,
-	 * then 'sp' should be word aligned here.  If it's
-	 * not, then the user is trying to mess with us.
-	 */
-	if (regs->UCreg_sp & 7)
-		goto badframe;
-
-	frame = (struct rt_sigframe __user *)regs->UCreg_sp;
-
-	if (!access_ok(frame, sizeof(*frame)))
-		goto badframe;
-
-	if (restore_sigframe(regs, &frame->sig))
-		goto badframe;
-
-	if (restore_altstack(&frame->sig.uc.uc_stack))
-		goto badframe;
-
-	return regs->UCreg_00;
-
-badframe:
-	force_sig(SIGSEGV);
-	return 0;
-}
-
-static int setup_sigframe(struct sigframe __user *sf, struct pt_regs *regs,
-		sigset_t *set)
-{
-	int err = 0;
-
-	err |= __put_user(regs->UCreg_00, &sf->uc.uc_mcontext.regs.UCreg_00);
-	err |= __put_user(regs->UCreg_01, &sf->uc.uc_mcontext.regs.UCreg_01);
-	err |= __put_user(regs->UCreg_02, &sf->uc.uc_mcontext.regs.UCreg_02);
-	err |= __put_user(regs->UCreg_03, &sf->uc.uc_mcontext.regs.UCreg_03);
-	err |= __put_user(regs->UCreg_04, &sf->uc.uc_mcontext.regs.UCreg_04);
-	err |= __put_user(regs->UCreg_05, &sf->uc.uc_mcontext.regs.UCreg_05);
-	err |= __put_user(regs->UCreg_06, &sf->uc.uc_mcontext.regs.UCreg_06);
-	err |= __put_user(regs->UCreg_07, &sf->uc.uc_mcontext.regs.UCreg_07);
-	err |= __put_user(regs->UCreg_08, &sf->uc.uc_mcontext.regs.UCreg_08);
-	err |= __put_user(regs->UCreg_09, &sf->uc.uc_mcontext.regs.UCreg_09);
-	err |= __put_user(regs->UCreg_10, &sf->uc.uc_mcontext.regs.UCreg_10);
-	err |= __put_user(regs->UCreg_11, &sf->uc.uc_mcontext.regs.UCreg_11);
-	err |= __put_user(regs->UCreg_12, &sf->uc.uc_mcontext.regs.UCreg_12);
-	err |= __put_user(regs->UCreg_13, &sf->uc.uc_mcontext.regs.UCreg_13);
-	err |= __put_user(regs->UCreg_14, &sf->uc.uc_mcontext.regs.UCreg_14);
-	err |= __put_user(regs->UCreg_15, &sf->uc.uc_mcontext.regs.UCreg_15);
-	err |= __put_user(regs->UCreg_16, &sf->uc.uc_mcontext.regs.UCreg_16);
-	err |= __put_user(regs->UCreg_17, &sf->uc.uc_mcontext.regs.UCreg_17);
-	err |= __put_user(regs->UCreg_18, &sf->uc.uc_mcontext.regs.UCreg_18);
-	err |= __put_user(regs->UCreg_19, &sf->uc.uc_mcontext.regs.UCreg_19);
-	err |= __put_user(regs->UCreg_20, &sf->uc.uc_mcontext.regs.UCreg_20);
-	err |= __put_user(regs->UCreg_21, &sf->uc.uc_mcontext.regs.UCreg_21);
-	err |= __put_user(regs->UCreg_22, &sf->uc.uc_mcontext.regs.UCreg_22);
-	err |= __put_user(regs->UCreg_23, &sf->uc.uc_mcontext.regs.UCreg_23);
-	err |= __put_user(regs->UCreg_24, &sf->uc.uc_mcontext.regs.UCreg_24);
-	err |= __put_user(regs->UCreg_25, &sf->uc.uc_mcontext.regs.UCreg_25);
-	err |= __put_user(regs->UCreg_26, &sf->uc.uc_mcontext.regs.UCreg_26);
-	err |= __put_user(regs->UCreg_fp, &sf->uc.uc_mcontext.regs.UCreg_fp);
-	err |= __put_user(regs->UCreg_ip, &sf->uc.uc_mcontext.regs.UCreg_ip);
-	err |= __put_user(regs->UCreg_sp, &sf->uc.uc_mcontext.regs.UCreg_sp);
-	err |= __put_user(regs->UCreg_lr, &sf->uc.uc_mcontext.regs.UCreg_lr);
-	err |= __put_user(regs->UCreg_pc, &sf->uc.uc_mcontext.regs.UCreg_pc);
-	err |= __put_user(regs->UCreg_asr, &sf->uc.uc_mcontext.regs.UCreg_asr);
-
-	err |= __put_user(current->thread.trap_no,
-			&sf->uc.uc_mcontext.trap_no);
-	err |= __put_user(current->thread.error_code,
-			&sf->uc.uc_mcontext.error_code);
-	err |= __put_user(current->thread.address,
-			&sf->uc.uc_mcontext.fault_address);
-	err |= __put_user(set->sig[0], &sf->uc.uc_mcontext.oldmask);
-
-	err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set));
-
-	return err;
-}
-
-static inline void __user *get_sigframe(struct k_sigaction *ka,
-		struct pt_regs *regs, int framesize)
-{
-	unsigned long sp = regs->UCreg_sp;
-	void __user *frame;
-
-	/*
-	 * This is the X/Open sanctioned signal stack switching.
-	 */
-	if ((ka->sa.sa_flags & SA_ONSTACK) && !sas_ss_flags(sp))
-		sp = current->sas_ss_sp + current->sas_ss_size;
-
-	/*
-	 * ATPCS B01 mandates 8-byte alignment
-	 */
-	frame = (void __user *)((sp - framesize) & ~7);
-
-	/*
-	 * Check that we can actually write to the signal frame.
-	 */
-	if (!access_ok(frame, framesize))
-		frame = NULL;
-
-	return frame;
-}
-
-static int setup_return(struct pt_regs *regs, struct k_sigaction *ka,
-	     unsigned long __user *rc, void __user *frame, int usig)
-{
-	unsigned long handler = (unsigned long)ka->sa.sa_handler;
-	unsigned long retcode;
-	unsigned long asr = regs->UCreg_asr & ~PSR_f;
-
-	unsigned int idx = 0;
-
-	if (ka->sa.sa_flags & SA_SIGINFO)
-		idx += 1;
-
-	if (__put_user(sigreturn_codes[idx],   rc) ||
-	    __put_user(sigreturn_codes[idx+1], rc+1))
-		return 1;
-
-	retcode = KERN_SIGRETURN_CODE + (idx << 2);
-
-	regs->UCreg_00 = usig;
-	regs->UCreg_sp = (unsigned long)frame;
-	regs->UCreg_lr = retcode;
-	regs->UCreg_pc = handler;
-	regs->UCreg_asr = asr;
-
-	return 0;
-}
-
-static int setup_frame(struct ksignal *ksig, sigset_t *set,
-		       struct pt_regs *regs)
-{
-	struct sigframe __user *frame = get_sigframe(&ksig->ka, regs, sizeof(*frame));
-	int err = 0;
-
-	if (!frame)
-		return 1;
-
-	/*
-	 * Set uc.uc_flags to a value which sc.trap_no would never have.
-	 */
-	err |= __put_user(0x5ac3c35a, &frame->uc.uc_flags);
-
-	err |= setup_sigframe(frame, regs, set);
-	if (err == 0)
-		err |= setup_return(regs, &ksig->ka, frame->retcode, frame,
-				    ksig->sig);
-
-	return err;
-}
-
-static int setup_rt_frame(struct ksignal *ksig, sigset_t *set,
-			  struct pt_regs *regs)
-{
-	struct rt_sigframe __user *frame =
-			get_sigframe(&ksig->ka, regs, sizeof(*frame));
-	int err = 0;
-
-	if (!frame)
-		return 1;
-
-	err |= copy_siginfo_to_user(&frame->info, &ksig->info);
-
-	err |= __put_user(0, &frame->sig.uc.uc_flags);
-	err |= __put_user(NULL, &frame->sig.uc.uc_link);
-	err |= __save_altstack(&frame->sig.uc.uc_stack, regs->UCreg_sp);
-	err |= setup_sigframe(&frame->sig, regs, set);
-	if (err == 0)
-		err |= setup_return(regs, &ksig->ka, frame->sig.retcode, frame,
-				    ksig->sig);
-
-	if (err == 0) {
-		/*
-		 * For realtime signals we must also set the second and third
-		 * arguments for the signal handler.
-		 */
-		regs->UCreg_01 = (unsigned long)&frame->info;
-		regs->UCreg_02 = (unsigned long)&frame->sig.uc;
-	}
-
-	return err;
-}
-
-static inline void setup_syscall_restart(struct pt_regs *regs)
-{
-	regs->UCreg_00 = regs->UCreg_ORIG_00;
-	regs->UCreg_pc -= 4;
-}
-
-/*
- * OK, we're invoking a handler
- */
-static void handle_signal(struct ksignal *ksig, struct pt_regs *regs,
-			  int syscall)
-{
-	struct thread_info *thread = current_thread_info();
-	sigset_t *oldset = sigmask_to_save();
-	int usig = ksig->sig;
-	int ret;
-
-	/*
-	 * If we were from a system call, check for system call restarting...
-	 */
-	if (syscall) {
-		switch (regs->UCreg_00) {
-		case -ERESTART_RESTARTBLOCK:
-		case -ERESTARTNOHAND:
-			regs->UCreg_00 = -EINTR;
-			break;
-		case -ERESTARTSYS:
-			if (!(ksig->ka.sa.sa_flags & SA_RESTART)) {
-				regs->UCreg_00 = -EINTR;
-				break;
-			}
-			/* fallthrough */
-		case -ERESTARTNOINTR:
-			setup_syscall_restart(regs);
-		}
-	}
-
-	/*
-	 * Set up the stack frame
-	 */
-	if (ksig->ka.sa.sa_flags & SA_SIGINFO)
-		ret = setup_rt_frame(ksig, oldset, regs);
-	else
-		ret = setup_frame(ksig, oldset, regs);
-
-	/*
-	 * Check that the resulting registers are actually sane.
-	 */
-	ret |= !valid_user_regs(regs);
-
-	signal_setup_done(ret, ksig, 0);
-}
-
-/*
- * Note that 'init' is a special process: it doesn't get signals it doesn't
- * want to handle. Thus you cannot kill init even with a SIGKILL even by
- * mistake.
- *
- * Note that we go through the signals twice: once to check the signals that
- * the kernel can handle, and then we build all the user-level signal handling
- * stack-frames in one go after that.
- */
-static void do_signal(struct pt_regs *regs, int syscall)
-{
-	struct ksignal ksig;
-
-	/*
-	 * We want the common case to go fast, which
-	 * is why we may in certain cases get here from
-	 * kernel mode. Just return without doing anything
-	 * if so.
-	 */
-	if (!user_mode(regs))
-		return;
-
-	if (get_signal(&ksig)) {
-		handle_signal(&ksig, regs, syscall);
-		return;
-	}
-
-	/*
-	 * No signal to deliver to the process - restart the syscall.
-	 */
-	if (syscall) {
-		if (regs->UCreg_00 == -ERESTART_RESTARTBLOCK) {
-				u32 __user *usp;
-
-				regs->UCreg_sp -= 4;
-				usp = (u32 __user *)regs->UCreg_sp;
-
-				if (put_user(regs->UCreg_pc, usp) == 0) {
-					regs->UCreg_pc = KERN_RESTART_CODE;
-				} else {
-					regs->UCreg_sp += 4;
-					force_sigsegv(0);
-				}
-		}
-		if (regs->UCreg_00 == -ERESTARTNOHAND ||
-		    regs->UCreg_00 == -ERESTARTSYS ||
-		    regs->UCreg_00 == -ERESTARTNOINTR) {
-			setup_syscall_restart(regs);
-		}
-	}
-	/* If there's no signal to deliver, we just put the saved
-	 * sigmask back.
-	 */
-	restore_saved_sigmask();
-}
-
-asmlinkage void do_notify_resume(struct pt_regs *regs,
-		unsigned int thread_flags, int syscall)
-{
-	if (thread_flags & _TIF_SIGPENDING)
-		do_signal(regs, syscall);
-
-	if (thread_flags & _TIF_NOTIFY_RESUME) {
-		clear_thread_flag(TIF_NOTIFY_RESUME);
-		tracehook_notify_resume(regs);
-	}
-}
-
-/*
- * Copy signal return handlers into the vector page, and
- * set sigreturn to be a pointer to these.
- */
-void __init early_signal_init(void)
-{
-	memcpy((void *)kuser_vecpage_to_vectors(KERN_SIGRETURN_CODE),
-			sigreturn_codes, sizeof(sigreturn_codes));
-	memcpy((void *)kuser_vecpage_to_vectors(KERN_RESTART_CODE),
-			syscall_restart_code, sizeof(syscall_restart_code));
-	/* Need not to flush icache, since early_trap_init will do it last. */
-}
diff --git a/arch/unicore32/kernel/sleep.S b/arch/unicore32/kernel/sleep.S
deleted file mode 100644
index 23151abe53c6..000000000000
--- a/arch/unicore32/kernel/sleep.S
+++ /dev/null
@@ -1,199 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/kernel/sleep.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <mach/hardware.h>
-
-		.text
-
-pkunity_cpu_save_cp:
-
-	@ get coprocessor registers
-
-	movc	r3, p0.c7, #0			@ PID
-	movc	r4, p0.c2, #0			@ translation table base addr
-	movc	r5, p0.c1, #0			@ control reg
-
-
-	@ store them plus current virtual stack ptr on stack
-	mov	r6, sp
-	stm.w	(r3 - r6), [sp-]
-
-	mov	pc, lr
-
-pkunity_cpu_save_sp:
-	@ preserve phys address of stack
-	mov	r0, sp
-	stw.w	lr, [sp+], #-4
-	b.l	sleep_phys_sp
-	ldw	r1, =sleep_save_sp
-	stw	r0, [r1]
-	ldw.w	pc, [sp]+, #4
-
-/*
- * puv3_cpu_suspend()
- *
- * Forces CPU into sleep state.
- *
- * r0 = value for PWRMODE M field for desired sleep state
- */
-
-ENTRY(puv3_cpu_suspend)
-	stm.w	(r16 - r27, lr), [sp-]		@ save registers on stack
-	stm.w	(r4 - r15), [sp-]		@ save registers on stack
-
-#ifdef	CONFIG_UNICORE_FPU_F64
-	sfm.w	(f0  - f7 ), [sp-]
-	sfm.w	(f8  - f15), [sp-]
-	sfm.w	(f16 - f23), [sp-]
-	sfm.w	(f24 - f31), [sp-]
-	cff	r4, s31
-	stm.w	(r4), [sp-]
-#endif
-	b.l	pkunity_cpu_save_cp
-
-	b.l	pkunity_cpu_save_sp
-
-	@ clean data cache
-	mov	r1, #0
-	movc	p0.c5, r1, #14
-	nop
-	nop
-	nop
-	nop
-
-
-
-	@ DDR2 BaseAddr
-	ldw	r0, =(PKUNITY_DDR2CTRL_BASE)
-
-	@ PM BaseAddr
-	ldw	r1, =(PKUNITY_PM_BASE)
-
-	@ set PLL_SYS_CFG reg, 275
-	movl	r6, #0x00002401
-	stw	r6, [r1+], #0x18
-	@ set PLL_DDR_CFG reg, 66MHz
-	movl	r6, #0x00100c00
-	stw	r6, [r1+], #0x1c
-
-	@ set wake up source
-	movl	r8, #0x800001ff		@ epip4d
-	stw	r8, [r1+], #0xc
-
-	@ set PGSR
-	movl	r5, #0x40000
-	stw	r5, [r1+], #0x10
-
-	@ prepare DDR2 refresh settings
-	ldw	r5, [r0+], #0x24
-	or	r5, r5, #0x00000001
-
-	@ prepare PMCR for PLL changing
-	movl	r6, #0xc
-
-	@ prepare for closing PLL
-	movl	r7, #0x1
-
-	@ prepare sleep mode
-	mov	r8, #0x1
-
-@	movl	r0, 0x11111111
-@	put_word_ocd r0
-	b	pkunity_cpu_do_suspend
-
-	.ltorg
-	.align	5
-pkunity_cpu_do_suspend:
-	b	101f
-	@ put DDR2 into self-refresh
-100:	stw	r5, [r0+], #0x24
-	@ change PLL
-	stw	r6, [r1]
-	b	1f
-
-	.ltorg
-	.align	5
-101:	b	102f
-	@ wait for PLL changing complete
-1:	ldw	r6, [r1+], #0x44
-	csub.a	r6, #0x1
-	bne	1b
-	b	2f
-
-	.ltorg
-	.align	5
-102:	b	100b
-	@ close PLL
-2:	stw	r7, [r1+], #0x4
-	@ enter sleep mode
-	stw	r8, [r1]
-3:	b	3b
-
-
-
-
-/*
- * puv3_cpu_resume()
- *
- * entry point from bootloader into kernel during resume
- *
- * Note: Yes, part of the following code is located into the .data section.
- *       This is to allow sleep_save_sp to be accessed with a relative load
- *       while we can't rely on any MMU translation.  We could have put
- *       sleep_save_sp in the .text section as well, but some setups might
- *       insist on it to be truly read-only.
- */
-
-	.data
-	.align 5
-ENTRY(puv3_cpu_resume)
-@	movl	r0, 0x20202020
-@	put_word_ocd r0
-
-	ldw	r0, sleep_save_sp		@ stack phys addr
-	ldw	r2, =resume_after_mmu		@ its absolute virtual address
-	ldm	(r3 - r6), [r0]+		@ CP regs + virt stack ptr
-	mov	sp, r6				@ CP regs + virt stack ptr
-
-	mov	r1, #0
-	movc	p0.c6, r1, #6			@ invalidate I & D TLBs
-	movc	p0.c5, r1, #28			@ invalidate I & D caches, BTB
-
-	movc	p0.c7, r3, #0			@ PID
-	movc	p0.c2, r4, #0			@ translation table base addr
-	movc	p0.c1, r5, #0			@ control reg, turn on mmu
-	nop
-	jump	r2
-	nop
-	nop
-	nop
-	nop
-	nop
-
-sleep_save_sp:
-	.word	0				@ preserve stack phys ptr here
-
-	.text
-resume_after_mmu:
-@	movl	r0, 0x30303030
-@	put_word_ocd r0
-
-#ifdef	CONFIG_UNICORE_FPU_F64
-	lfm.w	(f0  - f7 ), [sp]+
-	lfm.w	(f8  - f15), [sp]+
-	lfm.w	(f16 - f23), [sp]+
-	lfm.w	(f24 - f31), [sp]+
-	ldm.w	(r4), [sp]+
-	ctf	r4, s31
-#endif
-	ldm.w	(r4 - r15), [sp]+		@ restore registers from stack
-	ldm.w	(r16 - r27, pc), [sp]+		@ return to caller
diff --git a/arch/unicore32/kernel/stacktrace.c b/arch/unicore32/kernel/stacktrace.c
deleted file mode 100644
index c9d8650e9d78..000000000000
--- a/arch/unicore32/kernel/stacktrace.c
+++ /dev/null
@@ -1,127 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/stacktrace.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/sched/debug.h>
-#include <linux/stacktrace.h>
-
-#include <asm/stacktrace.h>
-
-#if defined(CONFIG_FRAME_POINTER)
-/*
- * Unwind the current stack frame and store the new register values in the
- * structure passed as argument. Unwinding is equivalent to a function return,
- * hence the new PC value rather than LR should be used for backtrace.
- *
- * With framepointer enabled, a simple function prologue looks like this:
- *	mov	ip, sp
- *	stmdb	sp!, {fp, ip, lr, pc}
- *	sub	fp, ip, #4
- *
- * A simple function epilogue looks like this:
- *	ldm	sp, {fp, sp, pc}
- *
- * Note that with framepointer enabled, even the leaf functions have the same
- * prologue and epilogue, therefore we can ignore the LR value in this case.
- */
-int notrace unwind_frame(struct stackframe *frame)
-{
-	unsigned long high, low;
-	unsigned long fp = frame->fp;
-
-	/* only go to a higher address on the stack */
-	low = frame->sp;
-	high = ALIGN(low, THREAD_SIZE);
-
-	/* check current frame pointer is within bounds */
-	if (fp < (low + 12) || fp + 4 >= high)
-		return -EINVAL;
-
-	/* restore the registers from the stack frame */
-	frame->fp = *(unsigned long *)(fp - 12);
-	frame->sp = *(unsigned long *)(fp - 8);
-	frame->pc = *(unsigned long *)(fp - 4);
-
-	return 0;
-}
-#endif
-
-void notrace walk_stackframe(struct stackframe *frame,
-		     int (*fn)(struct stackframe *, void *), void *data)
-{
-	while (1) {
-		int ret;
-
-		if (fn(frame, data))
-			break;
-		ret = unwind_frame(frame);
-		if (ret < 0)
-			break;
-	}
-}
-EXPORT_SYMBOL(walk_stackframe);
-
-#ifdef CONFIG_STACKTRACE
-struct stack_trace_data {
-	struct stack_trace *trace;
-	unsigned int no_sched_functions;
-	unsigned int skip;
-};
-
-static int save_trace(struct stackframe *frame, void *d)
-{
-	struct stack_trace_data *data = d;
-	struct stack_trace *trace = data->trace;
-	unsigned long addr = frame->pc;
-
-	if (data->no_sched_functions && in_sched_functions(addr))
-		return 0;
-	if (data->skip) {
-		data->skip--;
-		return 0;
-	}
-
-	trace->entries[trace->nr_entries++] = addr;
-
-	return trace->nr_entries >= trace->max_entries;
-}
-
-void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
-{
-	struct stack_trace_data data;
-	struct stackframe frame;
-
-	data.trace = trace;
-	data.skip = trace->skip;
-
-	if (tsk != current) {
-		data.no_sched_functions = 1;
-		frame.fp = thread_saved_fp(tsk);
-		frame.sp = thread_saved_sp(tsk);
-		frame.lr = 0;		/* recovered from the stack */
-		frame.pc = thread_saved_pc(tsk);
-	} else {
-		register unsigned long current_sp asm("sp");
-
-		data.no_sched_functions = 0;
-		frame.fp = (unsigned long)__builtin_frame_address(0);
-		frame.sp = current_sp;
-		frame.lr = (unsigned long)__builtin_return_address(0);
-		frame.pc = (unsigned long)save_stack_trace_tsk;
-	}
-
-	walk_stackframe(&frame, save_trace, &data);
-}
-
-void save_stack_trace(struct stack_trace *trace)
-{
-	save_stack_trace_tsk(current, trace);
-}
-EXPORT_SYMBOL_GPL(save_stack_trace);
-#endif
diff --git a/arch/unicore32/kernel/sys.c b/arch/unicore32/kernel/sys.c
deleted file mode 100644
index 256fb4082296..000000000000
--- a/arch/unicore32/kernel/sys.c
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/sys.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/sem.h>
-#include <linux/msg.h>
-#include <linux/shm.h>
-#include <linux/stat.h>
-#include <linux/syscalls.h>
-#include <linux/mman.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/ipc.h>
-#include <linux/uaccess.h>
-
-#include <asm/syscalls.h>
-#include <asm/cacheflush.h>
-
-/* Provide the actual syscall number to call mapping. */
-#undef __SYSCALL
-#define __SYSCALL(nr, call)	[nr] = (call),
-
-#define sys_mmap2 sys_mmap_pgoff
-/* Note that we don't include <linux/unistd.h> but <asm/unistd.h> */
-void *sys_call_table[__NR_syscalls] = {
-	[0 ... __NR_syscalls-1] = sys_ni_syscall,
-#include <asm/unistd.h>
-};
diff --git a/arch/unicore32/kernel/time.c b/arch/unicore32/kernel/time.c
deleted file mode 100644
index c3a37edf4d40..000000000000
--- a/arch/unicore32/kernel/time.c
+++ /dev/null
@@ -1,128 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/time.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-#include <linux/init.h>
-#include <linux/errno.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/timex.h>
-#include <linux/clockchips.h>
-
-#include <mach/hardware.h>
-
-#define MIN_OSCR_DELTA 2
-
-static irqreturn_t puv3_ost0_interrupt(int irq, void *dev_id)
-{
-	struct clock_event_device *c = dev_id;
-
-	/* Disarm the compare/match, signal the event. */
-	writel(readl(OST_OIER) & ~OST_OIER_E0, OST_OIER);
-	writel(readl(OST_OSSR) & ~OST_OSSR_M0, OST_OSSR);
-	c->event_handler(c);
-
-	return IRQ_HANDLED;
-}
-
-static int
-puv3_osmr0_set_next_event(unsigned long delta, struct clock_event_device *c)
-{
-	unsigned long next, oscr;
-
-	writel(readl(OST_OIER) | OST_OIER_E0, OST_OIER);
-	next = readl(OST_OSCR) + delta;
-	writel(next, OST_OSMR0);
-	oscr = readl(OST_OSCR);
-
-	return (signed)(next - oscr) <= MIN_OSCR_DELTA ? -ETIME : 0;
-}
-
-static int puv3_osmr0_shutdown(struct clock_event_device *evt)
-{
-	writel(readl(OST_OIER) & ~OST_OIER_E0, OST_OIER);
-	writel(readl(OST_OSSR) & ~OST_OSSR_M0, OST_OSSR);
-	return 0;
-}
-
-static struct clock_event_device ckevt_puv3_osmr0 = {
-	.name			= "osmr0",
-	.features		= CLOCK_EVT_FEAT_ONESHOT,
-	.rating			= 200,
-	.set_next_event		= puv3_osmr0_set_next_event,
-	.set_state_shutdown	= puv3_osmr0_shutdown,
-	.set_state_oneshot	= puv3_osmr0_shutdown,
-};
-
-static u64 puv3_read_oscr(struct clocksource *cs)
-{
-	return readl(OST_OSCR);
-}
-
-static struct clocksource cksrc_puv3_oscr = {
-	.name		= "oscr",
-	.rating		= 200,
-	.read		= puv3_read_oscr,
-	.mask		= CLOCKSOURCE_MASK(32),
-	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-void __init time_init(void)
-{
-	writel(0, OST_OIER);		/* disable any timer interrupts */
-	writel(0, OST_OSSR);		/* clear status on all timers */
-
-	clockevents_calc_mult_shift(&ckevt_puv3_osmr0, CLOCK_TICK_RATE, 5);
-
-	ckevt_puv3_osmr0.max_delta_ns =
-		clockevent_delta2ns(0x7fffffff, &ckevt_puv3_osmr0);
-	ckevt_puv3_osmr0.max_delta_ticks = 0x7fffffff;
-	ckevt_puv3_osmr0.min_delta_ns =
-		clockevent_delta2ns(MIN_OSCR_DELTA * 2, &ckevt_puv3_osmr0) + 1;
-	ckevt_puv3_osmr0.min_delta_ticks = MIN_OSCR_DELTA * 2;
-	ckevt_puv3_osmr0.cpumask = cpumask_of(0);
-
-	if (request_irq(IRQ_TIMER0, puv3_ost0_interrupt,
-			IRQF_TIMER | IRQF_IRQPOLL, "ost0", &ckevt_puv3_osmr0))
-		pr_err("Failed to register ost0 interrupt\n");
-
-	clocksource_register_hz(&cksrc_puv3_oscr, CLOCK_TICK_RATE);
-	clockevents_register_device(&ckevt_puv3_osmr0);
-}
-
-#ifdef CONFIG_PM
-unsigned long osmr[4], oier;
-
-void puv3_timer_suspend(void)
-{
-	osmr[0] = readl(OST_OSMR0);
-	osmr[1] = readl(OST_OSMR1);
-	osmr[2] = readl(OST_OSMR2);
-	osmr[3] = readl(OST_OSMR3);
-	oier = readl(OST_OIER);
-}
-
-void puv3_timer_resume(void)
-{
-	writel(0, OST_OSSR);
-	writel(osmr[0], OST_OSMR0);
-	writel(osmr[1], OST_OSMR1);
-	writel(osmr[2], OST_OSMR2);
-	writel(osmr[3], OST_OSMR3);
-	writel(oier, OST_OIER);
-
-	/*
-	 * OSMR0 is the system timer: make sure OSCR is sufficiently behind
-	 */
-	writel(readl(OST_OSMR0) - LATCH, OST_OSCR);
-}
-#else
-void puv3_timer_suspend(void) { };
-void puv3_timer_resume(void) { };
-#endif
-
diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c
deleted file mode 100644
index a3ac01df1a2e..000000000000
--- a/arch/unicore32/kernel/traps.c
+++ /dev/null
@@ -1,322 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/traps.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- *  'traps.c' handles hardware exceptions after we have saved some state.
- *  Mostly a debugging aid, but will probably kill the offending process.
- */
-#include <linux/module.h>
-#include <linux/signal.h>
-#include <linux/sched/signal.h>
-#include <linux/sched/debug.h>
-#include <linux/sched/task_stack.h>
-#include <linux/spinlock.h>
-#include <linux/personality.h>
-#include <linux/kallsyms.h>
-#include <linux/kdebug.h>
-#include <linux/uaccess.h>
-#include <linux/delay.h>
-#include <linux/hardirq.h>
-#include <linux/init.h>
-#include <linux/atomic.h>
-#include <linux/unistd.h>
-
-#include <asm/cacheflush.h>
-#include <asm/traps.h>
-
-#include "setup.h"
-
-static void dump_mem(const char *, const char *, unsigned long, unsigned long);
-
-void dump_backtrace_entry(unsigned long where,
-		unsigned long from, unsigned long frame)
-{
-#ifdef CONFIG_KALLSYMS
-	printk(KERN_DEFAULT "[<%08lx>] (%pS) from [<%08lx>] (%pS)\n",
-			where, (void *)where, from, (void *)from);
-#else
-	printk(KERN_DEFAULT "Function entered at [<%08lx>] from [<%08lx>]\n",
-			where, from);
-#endif
-}
-
-/*
- * Stack pointers should always be within the kernels view of
- * physical memory.  If it is not there, then we can't dump
- * out any information relating to the stack.
- */
-static int verify_stack(unsigned long sp)
-{
-	if (sp < PAGE_OFFSET ||
-	    (sp > (unsigned long)high_memory && high_memory != NULL))
-		return -EFAULT;
-
-	return 0;
-}
-
-/*
- * Dump out the contents of some memory nicely...
- */
-static void dump_mem(const char *lvl, const char *str, unsigned long bottom,
-		     unsigned long top)
-{
-	unsigned long first;
-	mm_segment_t fs;
-	int i;
-
-	/*
-	 * We need to switch to kernel mode so that we can use __get_user
-	 * to safely read from kernel space.  Note that we now dump the
-	 * code first, just in case the backtrace kills us.
-	 */
-	fs = get_fs();
-	set_fs(KERNEL_DS);
-
-	printk(KERN_DEFAULT "%s%s(0x%08lx to 0x%08lx)\n",
-			lvl, str, bottom, top);
-
-	for (first = bottom & ~31; first < top; first += 32) {
-		unsigned long p;
-		char str[sizeof(" 12345678") * 8 + 1];
-
-		memset(str, ' ', sizeof(str));
-		str[sizeof(str) - 1] = '\0';
-
-		for (p = first, i = 0; i < 8 && p < top; i++, p += 4) {
-			if (p >= bottom && p < top) {
-				unsigned long val;
-				if (__get_user(val, (unsigned long *)p) == 0)
-					sprintf(str + i * 9, " %08lx", val);
-				else
-					sprintf(str + i * 9, " ????????");
-			}
-		}
-		printk(KERN_DEFAULT "%s%04lx:%s\n", lvl, first & 0xffff, str);
-	}
-
-	set_fs(fs);
-}
-
-static void dump_instr(const char *lvl, struct pt_regs *regs)
-{
-	unsigned long addr = instruction_pointer(regs);
-	const int width = 8;
-	mm_segment_t fs;
-	char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str;
-	int i;
-
-	/*
-	 * We need to switch to kernel mode so that we can use __get_user
-	 * to safely read from kernel space.  Note that we now dump the
-	 * code first, just in case the backtrace kills us.
-	 */
-	fs = get_fs();
-	set_fs(KERNEL_DS);
-
-	for (i = -4; i < 1; i++) {
-		unsigned int val, bad;
-
-		bad = __get_user(val, &((u32 *)addr)[i]);
-
-		if (!bad)
-			p += sprintf(p, i == 0 ? "(%0*x) " : "%0*x ",
-					width, val);
-		else {
-			p += sprintf(p, "bad PC value");
-			break;
-		}
-	}
-	printk(KERN_DEFAULT "%sCode: %s\n", lvl, str);
-
-	set_fs(fs);
-}
-
-static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk,
-			   const char *loglvl)
-{
-	unsigned int fp;
-	int ok = 1;
-
-	printk("%sBacktrace: ", loglvl);
-
-	if (!tsk)
-		tsk = current;
-
-	if (regs)
-		fp = regs->UCreg_fp;
-	else if (tsk != current)
-		fp = thread_saved_fp(tsk);
-	else
-		asm("mov %0, fp" : "=r" (fp) : : "cc");
-
-	if (!fp) {
-		printk("%sno frame pointer", loglvl);
-		ok = 0;
-	} else if (verify_stack(fp)) {
-		printk("%sinvalid frame pointer 0x%08x", loglvl, fp);
-		ok = 0;
-	} else if (fp < (unsigned long)end_of_stack(tsk))
-		printk("%sframe pointer underflow", loglvl);
-	printk("%s\n", loglvl);
-
-	if (ok)
-		c_backtrace(fp, loglvl);
-}
-
-void show_stack(struct task_struct *tsk, unsigned long *sp,
-		       const char *loglvl)
-{
-	dump_backtrace(NULL, tsk, loglvl);
-	barrier();
-}
-
-static int __die(const char *str, int err, struct thread_info *thread,
-		struct pt_regs *regs)
-{
-	struct task_struct *tsk = thread->task;
-	static int die_counter;
-	int ret;
-
-	printk(KERN_EMERG "Internal error: %s: %x [#%d]\n",
-	       str, err, ++die_counter);
-
-	/* trap and error numbers are mostly meaningless on UniCore */
-	ret = notify_die(DIE_OOPS, str, regs, err, tsk->thread.trap_no, \
-			SIGSEGV);
-	if (ret == NOTIFY_STOP)
-		return ret;
-
-	print_modules();
-	__show_regs(regs);
-	printk(KERN_EMERG "Process %.*s (pid: %d, stack limit = 0x%p)\n",
-		TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), thread + 1);
-
-	if (!user_mode(regs) || in_interrupt()) {
-		dump_mem(KERN_EMERG, "Stack: ", regs->UCreg_sp,
-			 THREAD_SIZE + (unsigned long)task_stack_page(tsk));
-		dump_backtrace(regs, tsk, KERN_EMERG);
-		dump_instr(KERN_EMERG, regs);
-	}
-
-	return ret;
-}
-
-DEFINE_SPINLOCK(die_lock);
-
-/*
- * This function is protected against re-entrancy.
- */
-void die(const char *str, struct pt_regs *regs, int err)
-{
-	struct thread_info *thread = current_thread_info();
-	int ret;
-
-	oops_enter();
-
-	spin_lock_irq(&die_lock);
-	console_verbose();
-	bust_spinlocks(1);
-	ret = __die(str, err, thread, regs);
-
-	bust_spinlocks(0);
-	add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
-	spin_unlock_irq(&die_lock);
-	oops_exit();
-
-	if (in_interrupt())
-		panic("Fatal exception in interrupt");
-	if (panic_on_oops)
-		panic("Fatal exception");
-	if (ret != NOTIFY_STOP)
-		do_exit(SIGSEGV);
-}
-
-void uc32_notify_die(const char *str, struct pt_regs *regs,
-		int sig, int code, void __user *addr,
-		unsigned long err, unsigned long trap)
-{
-	if (user_mode(regs)) {
-		current->thread.error_code = err;
-		current->thread.trap_no = trap;
-
-		force_sig_fault(sig, code, addr);
-	} else
-		die(str, regs, err);
-}
-
-/*
- * bad_mode handles the impossible case in the vectors.  If you see one of
- * these, then it's extremely serious, and could mean you have buggy hardware.
- * It never returns, and never tries to sync.  We hope that we can at least
- * dump out some state information...
- */
-asmlinkage void bad_mode(struct pt_regs *regs, unsigned int reason)
-{
-	console_verbose();
-
-	printk(KERN_CRIT "Bad mode detected with reason 0x%x\n", reason);
-
-	die("Oops - bad mode", regs, 0);
-	local_irq_disable();
-	panic("bad mode");
-}
-
-void __pte_error(const char *file, int line, unsigned long val)
-{
-	printk(KERN_DEFAULT "%s:%d: bad pte %08lx.\n", file, line, val);
-}
-
-void __pmd_error(const char *file, int line, unsigned long val)
-{
-	printk(KERN_DEFAULT "%s:%d: bad pmd %08lx.\n", file, line, val);
-}
-
-void __pgd_error(const char *file, int line, unsigned long val)
-{
-	printk(KERN_DEFAULT "%s:%d: bad pgd %08lx.\n", file, line, val);
-}
-
-asmlinkage void __div0(void)
-{
-	printk(KERN_DEFAULT "Division by zero in kernel.\n");
-	dump_stack();
-}
-EXPORT_SYMBOL(__div0);
-
-void abort(void)
-{
-	BUG();
-
-	/* if that doesn't kill us, halt */
-	panic("Oops failed to kill thread");
-}
-
-void __init trap_init(void)
-{
-	return;
-}
-
-void __init early_trap_init(void)
-{
-	unsigned long vectors = VECTORS_BASE;
-
-	/*
-	 * Copy the vectors, stubs (in entry-unicore.S)
-	 * into the vector page, mapped at 0xffff0000, and ensure these
-	 * are visible to the instruction stream.
-	 */
-	memcpy((void *)vectors,
-			__vectors_start,
-			__vectors_end - __vectors_start);
-	memcpy((void *)vectors + 0x200,
-			__stubs_start,
-			__stubs_end - __stubs_start);
-
-	early_signal_init();
-
-	flush_icache_range(vectors, vectors + PAGE_SIZE);
-}
diff --git a/arch/unicore32/kernel/vmlinux.lds.S b/arch/unicore32/kernel/vmlinux.lds.S
deleted file mode 100644
index 6fb320b337ef..000000000000
--- a/arch/unicore32/kernel/vmlinux.lds.S
+++ /dev/null
@@ -1,59 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/kernel/vmlinux.lds.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#include <asm-generic/vmlinux.lds.h>
-#include <asm/thread_info.h>
-#include <asm/memory.h>
-#include <asm/page.h>
-#include <asm/cache.h>
-
-OUTPUT_ARCH(unicore32)
-ENTRY(stext)
-
-jiffies = jiffies_64;
-
-SECTIONS
-{
-	. = PAGE_OFFSET + KERNEL_IMAGE_START;
-
-	_text = .;
-	__init_begin = .;
-	HEAD_TEXT_SECTION
-	INIT_TEXT_SECTION(PAGE_SIZE)
-	INIT_DATA_SECTION(16)
-	PERCPU_SECTION(L1_CACHE_BYTES)
-	__init_end = .;
-
-	_stext = .;
-	.text : {		/* Real text segment */
-		TEXT_TEXT
-		SCHED_TEXT
-		CPUIDLE_TEXT
-		LOCK_TEXT
-
-		*(.fixup)
-		*(.gnu.warning)
-	}
-	_etext = .;
-
-	_sdata = .;
-	RO_DATA(PAGE_SIZE)
-	RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
-	_edata = .;
-
-	EXCEPTION_TABLE(L1_CACHE_BYTES)
-
-	BSS_SECTION(0, 0, 0)
-	_end = .;
-
-	STABS_DEBUG
-	DWARF_DEBUG
-
-	DISCARDS		/* Exit code and data */
-}
diff --git a/arch/unicore32/lib/Makefile b/arch/unicore32/lib/Makefile
deleted file mode 100644
index 5af06645b8f0..000000000000
--- a/arch/unicore32/lib/Makefile
+++ /dev/null
@@ -1,28 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# linux/arch/unicore32/lib/Makefile
-#
-# Copyright (C) 2001-2010 GUAN Xue-tao
-#
-
-lib-y	:= backtrace.o delay.o findbit.o
-lib-y	+= strncpy_from_user.o strnlen_user.o
-lib-y	+= clear_user.o copy_page.o
-lib-y	+= copy_from_user.o copy_to_user.o
-
-GNU_LIBC_A		= $(shell $(CC) $(KBUILD_CFLAGS) -print-file-name=libc.a)
-GNU_LIBC_A_OBJS		:= memchr.o memcpy.o memmove.o memset.o
-GNU_LIBC_A_OBJS		+= strchr.o strrchr.o
-GNU_LIBC_A_OBJS		+= rawmemchr.o			# needed by strrchr.o
-
-GNU_LIBGCC_A		= $(shell $(CC) $(KBUILD_CFLAGS) -print-file-name=libgcc.a)
-GNU_LIBGCC_A_OBJS	:= _ashldi3.o _ashrdi3.o _lshrdi3.o
-GNU_LIBGCC_A_OBJS	+= _divsi3.o _modsi3.o _ucmpdi2.o _umodsi3.o _udivsi3.o
-
-lib-y	+= $(GNU_LIBC_A_OBJS) $(GNU_LIBGCC_A_OBJS)
-
-$(addprefix $(obj)/, $(GNU_LIBC_A_OBJS)):
-	$(Q)$(AR) p $(GNU_LIBC_A) $(notdir $@) > $@
-
-$(addprefix $(obj)/, $(GNU_LIBGCC_A_OBJS)):
-	$(Q)$(AR) p $(GNU_LIBGCC_A) $(notdir $@) > $@
diff --git a/arch/unicore32/lib/backtrace.S b/arch/unicore32/lib/backtrace.S
deleted file mode 100644
index 6221944b81f3..000000000000
--- a/arch/unicore32/lib/backtrace.S
+++ /dev/null
@@ -1,168 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/lib/backtrace.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-		.text
-
-@ fp is 0 or stack frame
-
-#define frame	v4
-#define sv_fp	v5
-#define sv_pc	v6
-#define offset	v8
-#define loglvl	v9
-
-ENTRY(__backtrace)
-		mov	r0, fp
-
-ENTRY(c_backtrace)
-
-#if !defined(CONFIG_FRAME_POINTER) || !defined(CONFIG_PRINTK)
-		mov	pc, lr
-ENDPROC(__backtrace)
-ENDPROC(c_backtrace)
-#else
-		stm.w	(v4 - v10, lr), [sp-]	@ Save an extra register
-						@ so we have a location...
-		mov.a	frame, r0		@ if frame pointer is zero
-		beq	no_frame		@ we have no stack frames
-		mov	loglvl, r1
-
-1:		stm.w	(pc), [sp-]		@ calculate offset of PC stored
-		ldw.w	r0, [sp]+, #4		@ by stmfd for this CPU
-		adr	r1, 1b
-		sub	offset, r0, r1
-
-/*
- * Stack frame layout:
- *             optionally saved caller registers (r4 - r10)
- *             saved fp
- *             saved sp
- *             saved lr
- *    frame => saved pc
- *             optionally saved arguments (r0 - r3)
- * saved sp => <next word>
- *
- * Functions start with the following code sequence:
- *                  mov   ip, sp
- *                  stm.w (r0 - r3), [sp-] (optional)
- * corrected pc =>  stm.w sp, (..., fp, ip, lr, pc)
- */
-for_each_frame:
-
-1001:		ldw	sv_pc, [frame+], #0	@ get saved pc
-1002:		ldw	sv_fp, [frame+], #-12	@ get saved fp
-
-		sub	sv_pc, sv_pc, offset	@ Correct PC for prefetching
-
-1003:		ldw	r2, [sv_pc+], #-4	@ if stmfd sp, {args} exists,
-		ldw	r3, .Ldsi+4		@ adjust saved 'pc' back one
-		cxor.a	r3, r2 >> #14		@ instruction
-		beq	201f
-		sub	r0, sv_pc, #4		@ allow for mov
-		b	202f
-201:
-		sub	r0, sv_pc, #8		@ allow for mov + stmia
-202:
-		ldw	r1, [frame+], #-4	@ get saved lr
-		mov	r2, frame
-		b.l	dump_backtrace_entry
-
-		ldw	r1, [sv_pc+], #-4	@ if stmfd sp, {args} exists,
-		ldw	r3, .Ldsi+4
-		cxor.a	r3, r1 >> #14
-		bne	1004f
-		ldw	r0, [frame+], #-8	@ get sp
-		sub	r0, r0, #4		@ point at the last arg
-		b.l	.Ldumpstm		@ dump saved registers
-
-1004:		ldw	r1, [sv_pc+], #0	@ if stmfd {, fp, ip, lr, pc}
-		ldw	r3, .Ldsi		@ instruction exists,
-		cxor.a	r3, r1 >> #14
-		bne	201f
-		sub	r0, frame, #16
-		b.l	.Ldumpstm		@ dump saved registers
-201:
-		cxor.a	sv_fp, #0		@ zero saved fp means
-		beq	no_frame		@ no further frames
-
-		csub.a	sv_fp, frame		@ next frame must be
-		mov	frame, sv_fp		@ above the current frame
-		bua	for_each_frame
-
-1006:		adr	r0, .Lbad
-		mov	r1, loglvl
-		mov	r2, frame
-		b.l	printk
-no_frame:	ldm.w	(v4 - v10, pc), [sp]+
-ENDPROC(__backtrace)
-ENDPROC(c_backtrace)
-
-		.pushsection __ex_table,"a"
-		.align	3
-		.long	1001b, 1006b
-		.long	1002b, 1006b
-		.long	1003b, 1006b
-		.long	1004b, 1006b
-		.popsection
-
-#define instr v4
-#define reg   v5
-#define stack v6
-
-.Ldumpstm:	stm.w	(instr, reg, stack, v7, lr), [sp-]
-		mov	stack, r0
-		mov	instr, r1
-		mov	reg, #14
-		mov	v7, #0
-1:		mov	r3, #1
-		csub.a	reg, #8
-		bne	201f
-		sub	reg, reg, #3
-201:
-		cand.a	instr, r3 << reg
-		beq	2f
-		add	v7, v7, #1
-		cxor.a	v7, #6
-		cmoveq	v7, #1
-		bne	201f
-		adr	r0, .Lcr
-		mov	r1, loglvl
-		b.l	printk
-201:
-		ldw.w	r3, [stack]+, #-4
-		mov	r2, reg
-		csub.a	r2, #8
-		bsl	201f
-		sub	r2, r2, #3
-201:
-		cand.a	instr, #0x40		@ if H is 1, high 16 regs
-		beq	201f
-		add	r2, r2, #0x10		@ so r2 need add 16
-201:
-		adr	r0, .Lfp
-		mov	r1, loglvl
-		b.l	printk
-2:		sub.a	reg, reg, #1
-		bns	1b
-		cxor.a	v7, #0
-		beq	201f
-		adr	r0, .Lcr
-		mov	r1, loglvl
-		b.l	printk
-201:		ldm.w	(instr, reg, stack, v7, pc), [sp]+
-
-.Lfp:		.asciz	"%sr%d:%08x "
-.Lcr:		.asciz	"%s\n"
-.Lbad:		.asciz	"%sBacktrace aborted due to bad frame pointer <%p>\n"
-		.align
-.Ldsi:		.word	0x92eec000 >> 14	@ stm.w sp, (... fp, ip, lr, pc)
-		.word	0x92e10000 >> 14	@ stm.w sp, ()
-
-#endif
diff --git a/arch/unicore32/lib/clear_user.S b/arch/unicore32/lib/clear_user.S
deleted file mode 100644
index c6ca431b1090..000000000000
--- a/arch/unicore32/lib/clear_user.S
+++ /dev/null
@@ -1,54 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/lib/clear_user.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-		.text
-
-/* Prototype: int __clear_user(void *addr, size_t sz)
- * Purpose  : clear some user memory
- * Params   : addr - user memory address to clear
- *          : sz   - number of bytes to clear
- * Returns  : number of bytes NOT cleared
- */
-WEAK(__clear_user)
-		stm.w	(lr), [sp-]
-		stm.w	(r1), [sp-]
-		mov	r2, #0
-		csub.a	r1, #4
-		bsl	2f
-		and.a	ip, r0, #3
-		beq	1f
-		csub.a	ip, #2
-		strusr	r2, r0, 1
-		strusr	r2, r0, 1, el
-		strusr	r2, r0, 1, sl
-		rsub	ip, ip, #4
-		sub	r1, r1, ip		@  7  6  5  4  3  2  1
-1:		sub.a	r1, r1, #8		@ -1 -2 -3 -4 -5 -6 -7
-		strusr	r2, r0, 4, ns, rept=2
-		bns	1b
-		add.a	r1, r1, #4		@  3  2  1  0 -1 -2 -3
-		strusr	r2, r0, 4, ns
-2:		cand.a	r1, #2			@ 1x 1x 0x 0x 1x 1x 0x
-		strusr	r2, r0, 1, ne, rept=2
-		cand.a	r1, #1			@ x1 x0 x1 x0 x1 x0 x1
-		beq	3f
-USER(		stb.u	r2, [r0])
-3:		mov	r0, #0
-		ldm.w	(r1), [sp]+
-		ldm.w	(pc), [sp]+
-ENDPROC(__clear_user)
-
-		.pushsection .fixup,"ax"
-		.align	0
-9001:		ldm.w	(r0), [sp]+
-		ldm.w	(pc), [sp]+
-		.popsection
-
diff --git a/arch/unicore32/lib/copy_from_user.S b/arch/unicore32/lib/copy_from_user.S
deleted file mode 100644
index affb43920ac0..000000000000
--- a/arch/unicore32/lib/copy_from_user.S
+++ /dev/null
@@ -1,101 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/lib/copy_from_user.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-/*
- * Prototype:
- *
- *	size_t raw_copy_from_user(void *to, const void *from, size_t n)
- *
- * Purpose:
- *
- *	copy a block to kernel memory from user memory
- *
- * Params:
- *
- *	to = kernel memory
- *	from = user memory
- *	n = number of bytes to copy
- *
- * Return value:
- *
- *	Number of bytes NOT copied.
- */
-
-	.macro ldr1w ptr reg abort
-	ldrusr	\reg, \ptr, 4, abort=\abort
-	.endm
-
-	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
-100:	ldm.w	(\reg1, \reg2, \reg3, \reg4), [\ptr]+
-	.pushsection __ex_table, "a"
-	.align	3
-	.long 100b, \abort
-	.popsection
-	.endm
-
-	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
-100:	ldm.w (\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8), [\ptr]+
-	.pushsection __ex_table, "a"
-	.align	3
-	.long 100b, \abort
-	.popsection
-	.endm
-
-	.macro ldr1b ptr reg cond=al abort
-	ldrusr	\reg, \ptr, 1, \cond, abort=\abort
-	.endm
-
-	.macro str1w ptr reg abort
-	stw.w \reg, [\ptr]+, #4
-	.endm
-
-	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
-	stm.w (\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8), [\ptr]+
-	.endm
-
-	.macro str1b ptr reg cond=al abort
-	.ifnc	\cond, al
-	b\cond	201f
-	b	202f
-	.endif
-201:	stb.w \reg, [\ptr]+, #1
-202:
-	.endm
-
-	.macro enter
-	mov	r3, #0
-	stm.w	(r0, r2, r3), [sp-]
-	.endm
-
-	.macro exit
-	add	sp, sp, #8
-	ldm.w	(r0), [sp]+
-	mov	pc, lr
-	.endm
-
-	.text
-
-ENTRY(raw_copy_from_user)
-
-#include "copy_template.S"
-
-ENDPROC(raw_copy_from_user)
-
-	.pushsection .fixup,"ax"
-	.align 0
-	copy_abort_preamble
-	ldm.w	(r1, r2, r3), [sp]+
-	sub	r0, r0, r1
-	rsub	r0, r0, r2
-	copy_abort_end
-	.popsection
-
diff --git a/arch/unicore32/lib/copy_page.S b/arch/unicore32/lib/copy_page.S
deleted file mode 100644
index dc163f2d1af0..000000000000
--- a/arch/unicore32/lib/copy_page.S
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/lib/copy_page.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- *  ASM optimised string functions
- */
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <generated/asm-offsets.h>
-#include <asm/cache.h>
-
-#define COPY_COUNT (PAGE_SZ/256)
-
-		.text
-		.align	5
-/*
- * UniCore optimised copy_page routine
- */
-ENTRY(copy_page)
-		stm.w	(r17 - r19, lr), [sp-]
-		mov	r17, r0
-		mov	r18, r1
-		mov	r19, #COPY_COUNT
-1:
-	.rept	4
-		ldm.w	(r0 - r15), [r18]+
-		stm.w	(r0 - r15), [r17]+
-	.endr
-		sub.a	r19, r19, #1
-		bne	1b
-		ldm.w	(r17 - r19, pc), [sp]+
-ENDPROC(copy_page)
diff --git a/arch/unicore32/lib/copy_template.S b/arch/unicore32/lib/copy_template.S
deleted file mode 100644
index 02a7aef83fbf..000000000000
--- a/arch/unicore32/lib/copy_template.S
+++ /dev/null
@@ -1,211 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/lib/copy_template.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-/*
- * Theory of operation
- * -------------------
- *
- * This file provides the core code for a forward memory copy used in
- * the implementation of memcopy(), copy_to_user() and copy_from_user().
- *
- * The including file must define the following accessor macros
- * according to the need of the given function:
- *
- * ldr1w ptr reg abort
- *
- *	This loads one word from 'ptr', stores it in 'reg' and increments
- *	'ptr' to the next word. The 'abort' argument is used for fixup tables.
- *
- * ldr4w ptr reg1 reg2 reg3 reg4 abort
- * ldr8w ptr, reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
- *
- *	This loads four or eight words starting from 'ptr', stores them
- *	in provided registers and increments 'ptr' past those words.
- *	The'abort' argument is used for fixup tables.
- *
- * ldr1b ptr reg cond abort
- *
- *	Similar to ldr1w, but it loads a byte and increments 'ptr' one byte.
- *	It also must apply the condition code if provided, otherwise the
- *	"al" condition is assumed by default.
- *
- * str1w ptr reg abort
- * str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
- * str1b ptr reg cond abort
- *
- *	Same as their ldr* counterparts, but data is stored to 'ptr' location
- *	rather than being loaded.
- *
- * enter
- *
- *	Preserve the provided registers on the stack plus any additional
- *	data as needed by the implementation including this code. Called
- *	upon code entry.
- *
- * exit
- *
- *	Restore registers with the values previously saved with the
- *	'preserv' macro. Called upon code termination.
- */
-
-
-		enter
-
-		sub.a	r2, r2, #4
-		bsl	8f
-		and.a	ip, r0, #3
-		bne	9f
-		and.a	ip, r1, #3
-		bne	10f
-
-1:		sub.a	r2, r2, #(28)
-		stm.w	(r5 - r8), [sp-]
-		bsl	5f
-
-3:
-4:		ldr8w	r1, r3, r4, r5, r6, r7, r8, r10, r11, abort=20f
-		sub.a	r2, r2, #32
-		str8w	r0, r3, r4, r5, r6, r7, r8, r10, r11, abort=20f
-		beg	3b
-
-5:		and.a	ip, r2, #28
-		rsub	ip, ip, #32
-		beq	7f
-		add	pc, pc, ip		@ C is always clear here
-		nop
-
-		ldr1w	r1, r3, abort=20f
-		ldr1w	r1, r4, abort=20f
-		ldr1w	r1, r5, abort=20f
-		ldr1w	r1, r6, abort=20f
-		ldr1w	r1, r7, abort=20f
-		ldr1w	r1, r8, abort=20f
-		ldr1w	r1, r11, abort=20f
-
-		add	pc, pc, ip
-		nop
-
-		str1w	r0, r3, abort=20f
-		str1w	r0, r4, abort=20f
-		str1w	r0, r5, abort=20f
-		str1w	r0, r6, abort=20f
-		str1w	r0, r7, abort=20f
-		str1w	r0, r8, abort=20f
-		str1w	r0, r11, abort=20f
-
-7:		ldm.w	(r5 - r8), [sp]+
-
-8:		mov.a	r2, r2 << #31
-		ldr1b	r1, r3, ne, abort=21f
-		ldr1b	r1, r4, ea, abort=21f
-		ldr1b	r1, r10, ea, abort=21f
-		str1b	r0, r3, ne, abort=21f
-		str1b	r0, r4, ea, abort=21f
-		str1b	r0, r10, ea, abort=21f
-
-		exit
-
-9:		rsub	ip, ip, #4
-		csub.a	ip, #2
-		ldr1b	r1, r3, sg, abort=21f
-		ldr1b	r1, r4, eg, abort=21f
-		ldr1b	r1, r11, abort=21f
-		str1b	r0, r3, sg, abort=21f
-		str1b	r0, r4, eg, abort=21f
-		sub.a	r2, r2, ip
-		str1b	r0, r11, abort=21f
-		bsl	8b
-		and.a	ip, r1, #3
-		beq	1b
-
-10:		andn	r1, r1, #3
-		csub.a	ip, #2
-		ldr1w	r1, r11, abort=21f
-		beq	17f
-		bsg	18f
-
-
-		.macro	forward_copy_shift a b
-
-		sub.a	r2, r2, #28
-		bsl	14f
-
-11:		stm.w	(r5 - r9), [sp-]
-
-12:
-		ldr4w	r1, r4, r5, r6, r7, abort=19f
-		mov	r3, r11 pull #\a
-		sub.a	r2, r2, #32
-		ldr4w	r1, r8, r9, r10, r11, abort=19f
-		or	r3, r3, r4 push #\b
-		mov	r4, r4 pull #\a
-		or	r4, r4, r5 push #\b
-		mov	r5, r5 pull #\a
-		or	r5, r5, r6 push #\b
-		mov	r6, r6 pull #\a
-		or	r6, r6, r7 push #\b
-		mov	r7, r7 pull #\a
-		or	r7, r7, r8 push #\b
-		mov	r8, r8 pull #\a
-		or	r8, r8, r9 push #\b
-		mov	r9, r9 pull #\a
-		or	r9, r9, r10 push #\b
-		mov	r10, r10 pull #\a
-		or	r10, r10, r11 push #\b
-		str8w	r0, r3, r4, r5, r6, r7, r8, r9, r10, , abort=19f
-		beg	12b
-
-		ldm.w	(r5 - r9), [sp]+
-
-14:		and.a	ip, r2, #28
-		beq	16f
-
-15:		mov	r3, r11 pull #\a
-		ldr1w	r1, r11, abort=21f
-		sub.a	ip, ip, #4
-		or	r3, r3, r11 push #\b
-		str1w	r0, r3, abort=21f
-		bsg	15b
-
-16:		sub	r1, r1, #(\b / 8)
-		b	8b
-
-		.endm
-
-
-		forward_copy_shift	a=8	b=24
-
-17:		forward_copy_shift	a=16	b=16
-
-18:		forward_copy_shift	a=24	b=8
-
-
-/*
- * Abort preamble and completion macros.
- * If a fixup handler is required then those macros must surround it.
- * It is assumed that the fixup code will handle the private part of
- * the exit macro.
- */
-
-	.macro	copy_abort_preamble
-19:	ldm.w	(r5 - r9), [sp]+
-	b	21f
-299:	.word	0			@ store lr
-					@ to avoid function call in fixup
-20:	ldm.w	(r5 - r8), [sp]+
-21:
-	adr	r1, 299b
-	stw	lr, [r1]
-	.endm
-
-	.macro	copy_abort_end
-	adr	lr, 299b
-	ldw	pc, [lr]
-	.endm
-
diff --git a/arch/unicore32/lib/copy_to_user.S b/arch/unicore32/lib/copy_to_user.S
deleted file mode 100644
index c867f08f89ce..000000000000
--- a/arch/unicore32/lib/copy_to_user.S
+++ /dev/null
@@ -1,93 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/lib/copy_to_user.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-/*
- * Prototype:
- *
- *	size_t raw_copy_to_user(void *to, const void *from, size_t n)
- *
- * Purpose:
- *
- *	copy a block to user memory from kernel memory
- *
- * Params:
- *
- *	to = user memory
- *	from = kernel memory
- *	n = number of bytes to copy
- *
- * Return value:
- *
- *	Number of bytes NOT copied.
- */
-
-	.macro ldr1w ptr reg abort
-	ldw.w \reg, [\ptr]+, #4
-	.endm
-
-	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
-	ldm.w	(\reg1, \reg2, \reg3, \reg4), [\ptr]+
-	.endm
-
-	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
-	ldm.w (\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8), [\ptr]+
-	.endm
-
-	.macro ldr1b ptr reg cond=al abort
-	notcond	\cond, .+8
-	ldb.w \reg, [\ptr]+, #1
-	.endm
-
-	.macro str1w ptr reg abort
-	strusr	\reg, \ptr, 4, abort=\abort
-	.endm
-
-	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
-100:	stm.w (\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8), [\ptr]+
-
-	.pushsection __ex_table, "a"
-	.long 100b, \abort
-	.popsection
-	.endm
-
-	.macro str1b ptr reg cond=al abort
-	strusr	\reg, \ptr, 1, \cond, abort=\abort
-	.endm
-
-	.macro enter
-	mov	r3, #0
-	stm.w	(r0, r2, r3), [sp-]
-	.endm
-
-	.macro exit
-	add	sp, sp, #8
-	ldm.w	(r0), [sp]+
-	mov	pc, lr
-	.endm
-
-	.text
-
-WEAK(raw_copy_to_user)
-
-#include "copy_template.S"
-
-ENDPROC(raw_copy_to_user)
-
-	.pushsection .fixup,"ax"
-	.align 0
-	copy_abort_preamble
-	ldm.w	(r1, r2, r3), [sp]+
-	sub	r0, r0, r1
-	rsub	r0, r0, r2
-	copy_abort_end
-	.popsection
-
diff --git a/arch/unicore32/lib/delay.S b/arch/unicore32/lib/delay.S
deleted file mode 100644
index 6a359dd034e5..000000000000
--- a/arch/unicore32/lib/delay.S
+++ /dev/null
@@ -1,48 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/lib/delay.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <asm/param.h>
-		.text
-
-.LC0:		.word	loops_per_jiffy
-.LC1:		.word	(2199023*HZ)>>11
-
-/*
- * r0  <= 2000
- * lpj <= 0x01ffffff (max. 3355 bogomips)
- * HZ  <= 1000
- */
-
-ENTRY(__udelay)
-		ldw	r2, .LC1
-		mul	r0, r2, r0
-ENTRY(__const_udelay)				@ 0 <= r0 <= 0x7fffff06
-		ldw	r2, .LC0
-		ldw	r2, [r2]		@ max = 0x01ffffff
-		mov	r0, r0 >> #14		@ max = 0x0001ffff
-		mov	r2, r2 >> #10		@ max = 0x00007fff
-		mul	r0, r2, r0		@ max = 2^32-1
-		mov.a	r0, r0 >> #6
-		cmoveq	pc, lr
-
-/*
- * loops = r0 * HZ * loops_per_jiffy / 1000000
- *
- * Oh, if only we had a cycle counter...
- */
-
-@ Delay routine
-ENTRY(__delay)
-		sub.a	r0, r0, #2
-		bua	__delay
-		mov	pc, lr
-ENDPROC(__udelay)
-ENDPROC(__const_udelay)
-ENDPROC(__delay)
diff --git a/arch/unicore32/lib/findbit.S b/arch/unicore32/lib/findbit.S
deleted file mode 100644
index 42f1282670d2..000000000000
--- a/arch/unicore32/lib/findbit.S
+++ /dev/null
@@ -1,97 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/lib/findbit.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-                .text
-
-/*
- * Purpose  : Find a 'zero' bit
- * Prototype: int find_first_zero_bit(void *addr, unsigned int maxbit);
- */
-ENTRY(find_first_zero_bit)
-		cxor.a	r1, #0
-		beq	3f
-		mov	r2, #0
-1:		ldb	r3, [r0+], r2 >> #3
-		xor.a	r3, r3, #0xff		@ invert bits
-		bne	.L_found		@ any now set - found zero bit
-		add	r2, r2, #8		@ next bit pointer
-2:		csub.a	r2, r1			@ any more?
-		bub	1b
-3:		mov	r0, r1			@ no free bits
-		mov	pc, lr
-ENDPROC(find_first_zero_bit)
-
-/*
- * Purpose  : Find next 'zero' bit
- * Prototype: int find_next_zero_bit
- *		(void *addr, unsigned int maxbit, int offset)
- */
-ENTRY(find_next_zero_bit)
-		cxor.a	r1, #0
-		beq	3b
-		and.a	ip, r2, #7
-		beq	1b			@ If new byte, goto old routine
-		ldb	r3, [r0+], r2 >> #3
-		xor	r3, r3, #0xff		@ now looking for a 1 bit
-		mov.a	r3, r3 >> ip		@ shift off unused bits
-		bne	.L_found
-		or	r2, r2, #7		@ if zero, then no bits here
-		add	r2, r2, #1		@ align bit pointer
-		b	2b			@ loop for next bit
-ENDPROC(find_next_zero_bit)
-
-/*
- * Purpose  : Find a 'one' bit
- * Prototype: int find_first_bit
- *		(const unsigned long *addr, unsigned int maxbit);
- */
-ENTRY(find_first_bit)
-		cxor.a	r1, #0
-		beq	3f
-		mov	r2, #0
-1:		ldb	r3, [r0+], r2 >> #3
-		mov.a	r3, r3
-		bne	.L_found		@ any now set - found zero bit
-		add	r2, r2, #8		@ next bit pointer
-2:		csub.a	r2, r1			@ any more?
-		bub	1b
-3:		mov	r0, r1			@ no free bits
-		mov	pc, lr
-ENDPROC(find_first_bit)
-
-/*
- * Purpose  : Find next 'one' bit
- * Prototype: int find_next_zero_bit
- *		(void *addr, unsigned int maxbit, int offset)
- */
-ENTRY(find_next_bit)
-		cxor.a	r1, #0
-		beq	3b
-		and.a	ip, r2, #7
-		beq	1b			@ If new byte, goto old routine
-		ldb	r3, [r0+], r2 >> #3
-		mov.a	r3, r3 >> ip		@ shift off unused bits
-		bne	.L_found
-		or	r2, r2, #7		@ if zero, then no bits here
-		add	r2, r2, #1		@ align bit pointer
-		b	2b			@ loop for next bit
-ENDPROC(find_next_bit)
-
-/*
- * One or more bits in the LSB of r3 are assumed to be set.
- */
-.L_found:
-		rsub	r1, r3, #0
-		and	r3, r3, r1
-		cntlz	r3, r3
-		rsub	r3, r3, #31
-		add	r0, r2, r3
-		mov	pc, lr
-
diff --git a/arch/unicore32/lib/strncpy_from_user.S b/arch/unicore32/lib/strncpy_from_user.S
deleted file mode 100644
index f227b8227a4c..000000000000
--- a/arch/unicore32/lib/strncpy_from_user.S
+++ /dev/null
@@ -1,42 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/lib/strncpy_from_user.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <asm/errno.h>
-
-	.text
-	.align	5
-
-/*
- * Copy a string from user space to kernel space.
- *  r0 = dst, r1 = src, r2 = byte length
- * returns the number of characters copied (strlen of copied string),
- *  -EFAULT on exception, or "len" if we fill the whole buffer
- */
-ENTRY(__strncpy_from_user)
-	mov	ip, r1
-1:	sub.a	r2, r2, #1
-	ldrusr	r3, r1, 1, ns
-	bfs	2f
-	stb.w	r3, [r0]+, #1
-	cxor.a	r3, #0
-	bne	1b
-	sub	r1, r1, #1	@ take NUL character out of count
-2:	sub	r0, r1, ip
-	mov	pc, lr
-ENDPROC(__strncpy_from_user)
-
-	.pushsection .fixup,"ax"
-	.align	0
-9001:	mov	r3, #0
-	stb	r3, [r0+], #0	@ null terminate
-	mov	r0, #-EFAULT
-	mov	pc, lr
-	.popsection
-
diff --git a/arch/unicore32/lib/strnlen_user.S b/arch/unicore32/lib/strnlen_user.S
deleted file mode 100644
index c836b12776fe..000000000000
--- a/arch/unicore32/lib/strnlen_user.S
+++ /dev/null
@@ -1,39 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/lib/strnlen_user.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <asm/errno.h>
-
-	.text
-	.align	5
-
-/* Prototype: unsigned long __strnlen_user(const char *str, long n)
- * Purpose  : get length of a string in user memory
- * Params   : str - address of string in user memory
- * Returns  : length of string *including terminator*
- *	      or zero on exception, or n + 1 if too long
- */
-ENTRY(__strnlen_user)
-	mov	r2, r0
-1:
-	ldrusr	r3, r0, 1
-	cxor.a	r3, #0
-	beq	2f
-	sub.a	r1, r1, #1
-	bne	1b
-	add	r0, r0, #1
-2:	sub	r0, r0, r2
-	mov	pc, lr
-ENDPROC(__strnlen_user)
-
-	.pushsection .fixup,"ax"
-	.align	0
-9001:	mov	r0, #0
-	mov	pc, lr
-	.popsection
diff --git a/arch/unicore32/mm/Kconfig b/arch/unicore32/mm/Kconfig
deleted file mode 100644
index 82759b6aba67..000000000000
--- a/arch/unicore32/mm/Kconfig
+++ /dev/null
@@ -1,41 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-comment "Processor Type"
-
-# Select CPU types depending on the architecture selected.  This selects
-# which CPUs we support in the kernel image, and the compiler instruction
-# optimiser behaviour.
-
-config CPU_UCV2
-	def_bool y
-
-comment "Processor Features"
-
-config CPU_ICACHE_DISABLE
-	bool "Disable I-Cache (I-bit)"
-	help
-	  Say Y here to disable the processor instruction cache. Unless
-	  you have a reason not to or are unsure, say N.
-
-config CPU_DCACHE_DISABLE
-	bool "Disable D-Cache (D-bit)"
-	help
-	  Say Y here to disable the processor data cache. Unless
-	  you have a reason not to or are unsure, say N.
-
-config CPU_DCACHE_WRITETHROUGH
-	bool "Force write through D-cache"
-	help
-	  Say Y here to use the data cache in writethrough mode. Unless you
-	  specifically require this or are unsure, say N.
-
-config CPU_DCACHE_LINE_DISABLE
-	bool "Disable D-cache line ops"
-	default y
-	help
-	  Say Y here to disable the data cache line operations.
-
-config CPU_TLB_SINGLE_ENTRY_DISABLE
-	bool "Disable TLB single entry ops"
-	default y
-	help
-	  Say Y here to disable the TLB single entry operations.
diff --git a/arch/unicore32/mm/Makefile b/arch/unicore32/mm/Makefile
deleted file mode 100644
index 8106260583ab..000000000000
--- a/arch/unicore32/mm/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for the linux unicore-specific parts of the memory manager.
-#
-
-obj-y				:= extable.o fault.o init.o pgd.o mmu.o
-obj-y				+= flush.o ioremap.o
-
-obj-$(CONFIG_MODULES)		+= proc-syms.o
-
-obj-$(CONFIG_ALIGNMENT_TRAP)	+= alignment.o
-
-obj-$(CONFIG_CPU_UCV2)		+= cache-ucv2.o tlb-ucv2.o proc-ucv2.o
-
diff --git a/arch/unicore32/mm/alignment.c b/arch/unicore32/mm/alignment.c
deleted file mode 100644
index 2ea98f7a4156..000000000000
--- a/arch/unicore32/mm/alignment.c
+++ /dev/null
@@ -1,524 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/mm/alignment.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-/*
- * TODO:
- *  FPU ldm/stm not handling
- */
-#include <linux/compiler.h>
-#include <linux/kernel.h>
-#include <linux/sched/debug.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/uaccess.h>
-#include <linux/pgtable.h>
-
-#include <asm/tlbflush.h>
-#include <asm/unaligned.h>
-
-#include "mm.h"
-
-#define CODING_BITS(i)	(i & 0xe0000120)
-
-#define LDST_P_BIT(i)	(i & (1 << 28))	/* Preindex             */
-#define LDST_U_BIT(i)	(i & (1 << 27))	/* Add offset           */
-#define LDST_W_BIT(i)	(i & (1 << 25))	/* Writeback            */
-#define LDST_L_BIT(i)	(i & (1 << 24))	/* Load                 */
-
-#define LDST_P_EQ_U(i)	((((i) ^ ((i) >> 1)) & (1 << 27)) == 0)
-
-#define LDSTH_I_BIT(i)	(i & (1 << 26))	/* half-word immed      */
-#define LDM_S_BIT(i)	(i & (1 << 26))	/* write ASR from BSR */
-#define LDM_H_BIT(i)	(i & (1 << 6))	/* select r0-r15 or r16-r31 */
-
-#define RN_BITS(i)	((i >> 19) & 31)	/* Rn                   */
-#define RD_BITS(i)	((i >> 14) & 31)	/* Rd                   */
-#define RM_BITS(i)	(i & 31)	/* Rm                   */
-
-#define REGMASK_BITS(i)	(((i & 0x7fe00) >> 3) | (i & 0x3f))
-#define OFFSET_BITS(i)	(i & 0x03fff)
-
-#define SHIFT_BITS(i)	((i >> 9) & 0x1f)
-#define SHIFT_TYPE(i)	(i & 0xc0)
-#define SHIFT_LSL	0x00
-#define SHIFT_LSR	0x40
-#define SHIFT_ASR	0x80
-#define SHIFT_RORRRX	0xc0
-
-union offset_union {
-	unsigned long un;
-	signed long sn;
-};
-
-#define TYPE_ERROR	0
-#define TYPE_FAULT	1
-#define TYPE_LDST	2
-#define TYPE_DONE	3
-#define TYPE_SWAP  4
-#define TYPE_COLS  5		/* Coprocessor load/store */
-
-#define get8_unaligned_check(val, addr, err)		\
-	__asm__(					\
-	"1:	ldb.u	%1, [%2], #1\n"			\
-	"2:\n"						\
-	"	.pushsection .fixup,\"ax\"\n"		\
-	"	.align	2\n"				\
-	"3:	mov	%0, #1\n"			\
-	"	b	2b\n"				\
-	"	.popsection\n"				\
-	"	.pushsection __ex_table,\"a\"\n"		\
-	"	.align	3\n"				\
-	"	.long	1b, 3b\n"			\
-	"	.popsection\n"				\
-	: "=r" (err), "=&r" (val), "=r" (addr)		\
-	: "0" (err), "2" (addr))
-
-#define get8t_unaligned_check(val, addr, err)		\
-	__asm__(					\
-	"1:	ldb.u	%1, [%2], #1\n"			\
-	"2:\n"						\
-	"	.pushsection .fixup,\"ax\"\n"		\
-	"	.align	2\n"				\
-	"3:	mov	%0, #1\n"			\
-	"	b	2b\n"				\
-	"	.popsection\n"				\
-	"	.pushsection __ex_table,\"a\"\n"		\
-	"	.align	3\n"				\
-	"	.long	1b, 3b\n"			\
-	"	.popsection\n"				\
-	: "=r" (err), "=&r" (val), "=r" (addr)		\
-	: "0" (err), "2" (addr))
-
-#define get16_unaligned_check(val, addr)			\
-	do {							\
-		unsigned int err = 0, v, a = addr;		\
-		get8_unaligned_check(val, a, err);		\
-		get8_unaligned_check(v, a, err);		\
-		val |= v << 8;					\
-		if (err)					\
-			goto fault;				\
-	} while (0)
-
-#define put16_unaligned_check(val, addr)			\
-	do {							\
-		unsigned int err = 0, v = val, a = addr;	\
-		__asm__(					\
-		"1:	stb.u	%1, [%2], #1\n"			\
-		"	mov	%1, %1 >> #8\n"			\
-		"2:	stb.u	%1, [%2]\n"			\
-		"3:\n"						\
-		"	.pushsection .fixup,\"ax\"\n"		\
-		"	.align	2\n"				\
-		"4:	mov	%0, #1\n"			\
-		"	b	3b\n"				\
-		"	.popsection\n"				\
-		"	.pushsection __ex_table,\"a\"\n"		\
-		"	.align	3\n"				\
-		"	.long	1b, 4b\n"			\
-		"	.long	2b, 4b\n"			\
-		"	.popsection\n"				\
-		: "=r" (err), "=&r" (v), "=&r" (a)		\
-		: "0" (err), "1" (v), "2" (a));			\
-		if (err)					\
-			goto fault;				\
-	} while (0)
-
-#define __put32_unaligned_check(ins, val, addr)			\
-	do {							\
-		unsigned int err = 0, v = val, a = addr;	\
-		__asm__(					\
-		"1:	"ins"	%1, [%2], #1\n"			\
-		"	mov	%1, %1 >> #8\n"			\
-		"2:	"ins"	%1, [%2], #1\n"			\
-		"	mov	%1, %1 >> #8\n"			\
-		"3:	"ins"	%1, [%2], #1\n"			\
-		"	mov	%1, %1 >> #8\n"			\
-		"4:	"ins"	%1, [%2]\n"			\
-		"5:\n"						\
-		"	.pushsection .fixup,\"ax\"\n"		\
-		"	.align	2\n"				\
-		"6:	mov	%0, #1\n"			\
-		"	b	5b\n"				\
-		"	.popsection\n"				\
-		"	.pushsection __ex_table,\"a\"\n"		\
-		"	.align	3\n"				\
-		"	.long	1b, 6b\n"			\
-		"	.long	2b, 6b\n"			\
-		"	.long	3b, 6b\n"			\
-		"	.long	4b, 6b\n"			\
-		"	.popsection\n"				\
-		: "=r" (err), "=&r" (v), "=&r" (a)		\
-		: "0" (err), "1" (v), "2" (a));			\
-		if (err)					\
-			goto fault;				\
-	} while (0)
-
-#define get32_unaligned_check(val, addr)			\
-	do {							\
-		unsigned int err = 0, v, a = addr;		\
-		get8_unaligned_check(val, a, err);		\
-		get8_unaligned_check(v, a, err);		\
-		val |= v << 8;					\
-		get8_unaligned_check(v, a, err);		\
-		val |= v << 16;					\
-		get8_unaligned_check(v, a, err);		\
-		val |= v << 24;					\
-		if (err)					\
-			goto fault;				\
-	} while (0)
-
-#define put32_unaligned_check(val, addr)			\
-	__put32_unaligned_check("stb.u", val, addr)
-
-#define get32t_unaligned_check(val, addr)			\
-	do {							\
-		unsigned int err = 0, v, a = addr;		\
-		get8t_unaligned_check(val, a, err);		\
-		get8t_unaligned_check(v, a, err);		\
-		val |= v << 8;					\
-		get8t_unaligned_check(v, a, err);		\
-		val |= v << 16;					\
-		get8t_unaligned_check(v, a, err);		\
-		val |= v << 24;					\
-		if (err)					\
-			goto fault;				\
-	} while (0)
-
-#define put32t_unaligned_check(val, addr)			\
-	__put32_unaligned_check("stb.u", val, addr)
-
-static void
-do_alignment_finish_ldst(unsigned long addr, unsigned long instr,
-			 struct pt_regs *regs, union offset_union offset)
-{
-	if (!LDST_U_BIT(instr))
-		offset.un = -offset.un;
-
-	if (!LDST_P_BIT(instr))
-		addr += offset.un;
-
-	if (!LDST_P_BIT(instr) || LDST_W_BIT(instr))
-		regs->uregs[RN_BITS(instr)] = addr;
-}
-
-static int
-do_alignment_ldrhstrh(unsigned long addr, unsigned long instr,
-		      struct pt_regs *regs)
-{
-	unsigned int rd = RD_BITS(instr);
-
-	/* old value 0x40002120, can't judge swap instr correctly */
-	if ((instr & 0x4b003fe0) == 0x40000120)
-		goto swp;
-
-	if (LDST_L_BIT(instr)) {
-		unsigned long val;
-		get16_unaligned_check(val, addr);
-
-		/* signed half-word? */
-		if (instr & 0x80)
-			val = (signed long)((signed short)val);
-
-		regs->uregs[rd] = val;
-	} else
-		put16_unaligned_check(regs->uregs[rd], addr);
-
-	return TYPE_LDST;
-
-swp:
-	/* only handle swap word
-	 * for swap byte should not active this alignment exception */
-	get32_unaligned_check(regs->uregs[RD_BITS(instr)], addr);
-	put32_unaligned_check(regs->uregs[RM_BITS(instr)], addr);
-	return TYPE_SWAP;
-
-fault:
-	return TYPE_FAULT;
-}
-
-static int
-do_alignment_ldrstr(unsigned long addr, unsigned long instr,
-		    struct pt_regs *regs)
-{
-	unsigned int rd = RD_BITS(instr);
-
-	if (!LDST_P_BIT(instr) && LDST_W_BIT(instr))
-		goto trans;
-
-	if (LDST_L_BIT(instr))
-		get32_unaligned_check(regs->uregs[rd], addr);
-	else
-		put32_unaligned_check(regs->uregs[rd], addr);
-	return TYPE_LDST;
-
-trans:
-	if (LDST_L_BIT(instr))
-		get32t_unaligned_check(regs->uregs[rd], addr);
-	else
-		put32t_unaligned_check(regs->uregs[rd], addr);
-	return TYPE_LDST;
-
-fault:
-	return TYPE_FAULT;
-}
-
-/*
- * LDM/STM alignment handler.
- *
- * There are 4 variants of this instruction:
- *
- * B = rn pointer before instruction, A = rn pointer after instruction
- *              ------ increasing address ----->
- *	        |    | r0 | r1 | ... | rx |    |
- * PU = 01             B                    A
- * PU = 11        B                    A
- * PU = 00        A                    B
- * PU = 10             A                    B
- */
-static int
-do_alignment_ldmstm(unsigned long addr, unsigned long instr,
-		    struct pt_regs *regs)
-{
-	unsigned int rd, rn, pc_correction, reg_correction, nr_regs, regbits;
-	unsigned long eaddr, newaddr;
-
-	if (LDM_S_BIT(instr))
-		goto bad;
-
-	pc_correction = 4;	/* processor implementation defined */
-
-	/* count the number of registers in the mask to be transferred */
-	nr_regs = hweight16(REGMASK_BITS(instr)) * 4;
-
-	rn = RN_BITS(instr);
-	newaddr = eaddr = regs->uregs[rn];
-
-	if (!LDST_U_BIT(instr))
-		nr_regs = -nr_regs;
-	newaddr += nr_regs;
-	if (!LDST_U_BIT(instr))
-		eaddr = newaddr;
-
-	if (LDST_P_EQ_U(instr))	/* U = P */
-		eaddr += 4;
-
-	/*
-	 * This is a "hint" - we already have eaddr worked out by the
-	 * processor for us.
-	 */
-	if (addr != eaddr) {
-		printk(KERN_ERR "LDMSTM: PC = %08lx, instr = %08lx, "
-		       "addr = %08lx, eaddr = %08lx\n",
-		       instruction_pointer(regs), instr, addr, eaddr);
-		show_regs(regs);
-	}
-
-	if (LDM_H_BIT(instr))
-		reg_correction = 0x10;
-	else
-		reg_correction = 0x00;
-
-	for (regbits = REGMASK_BITS(instr), rd = 0; regbits;
-	     regbits >>= 1, rd += 1)
-		if (regbits & 1) {
-			if (LDST_L_BIT(instr))
-				get32_unaligned_check(regs->
-					uregs[rd + reg_correction], eaddr);
-			else
-				put32_unaligned_check(regs->
-					uregs[rd + reg_correction], eaddr);
-			eaddr += 4;
-		}
-
-	if (LDST_W_BIT(instr))
-		regs->uregs[rn] = newaddr;
-	return TYPE_DONE;
-
-fault:
-	regs->UCreg_pc -= pc_correction;
-	return TYPE_FAULT;
-
-bad:
-	printk(KERN_ERR "Alignment trap: not handling ldm with s-bit set\n");
-	return TYPE_ERROR;
-}
-
-static int
-do_alignment(unsigned long addr, unsigned int error_code, struct pt_regs *regs)
-{
-	union offset_union offset;
-	unsigned long instr, instrptr;
-	int (*handler) (unsigned long addr, unsigned long instr,
-			struct pt_regs *regs);
-	unsigned int type;
-
-	instrptr = instruction_pointer(regs);
-	if (instrptr >= PAGE_OFFSET)
-		instr = *(unsigned long *)instrptr;
-	else {
-		__asm__ __volatile__(
-				"ldw.u	%0, [%1]\n"
-				: "=&r"(instr)
-				: "r"(instrptr));
-	}
-
-	regs->UCreg_pc += 4;
-
-	switch (CODING_BITS(instr)) {
-	case 0x40000120:	/* ldrh or strh */
-		if (LDSTH_I_BIT(instr))
-			offset.un = (instr & 0x3e00) >> 4 | (instr & 31);
-		else
-			offset.un = regs->uregs[RM_BITS(instr)];
-		handler = do_alignment_ldrhstrh;
-		break;
-
-	case 0x60000000:	/* ldr or str immediate */
-	case 0x60000100:	/* ldr or str immediate */
-	case 0x60000020:	/* ldr or str immediate */
-	case 0x60000120:	/* ldr or str immediate */
-		offset.un = OFFSET_BITS(instr);
-		handler = do_alignment_ldrstr;
-		break;
-
-	case 0x40000000:	/* ldr or str register */
-		offset.un = regs->uregs[RM_BITS(instr)];
-		{
-			unsigned int shiftval = SHIFT_BITS(instr);
-
-			switch (SHIFT_TYPE(instr)) {
-			case SHIFT_LSL:
-				offset.un <<= shiftval;
-				break;
-
-			case SHIFT_LSR:
-				offset.un >>= shiftval;
-				break;
-
-			case SHIFT_ASR:
-				offset.sn >>= shiftval;
-				break;
-
-			case SHIFT_RORRRX:
-				if (shiftval == 0) {
-					offset.un >>= 1;
-					if (regs->UCreg_asr & PSR_C_BIT)
-						offset.un |= 1 << 31;
-				} else
-					offset.un = offset.un >> shiftval |
-					    offset.un << (32 - shiftval);
-				break;
-			}
-		}
-		handler = do_alignment_ldrstr;
-		break;
-
-	case 0x80000000:	/* ldm or stm */
-	case 0x80000020:	/* ldm or stm */
-		handler = do_alignment_ldmstm;
-		break;
-
-	default:
-		goto bad;
-	}
-
-	type = handler(addr, instr, regs);
-
-	if (type == TYPE_ERROR || type == TYPE_FAULT)
-		goto bad_or_fault;
-
-	if (type == TYPE_LDST)
-		do_alignment_finish_ldst(addr, instr, regs, offset);
-
-	return 0;
-
-bad_or_fault:
-	if (type == TYPE_ERROR)
-		goto bad;
-	regs->UCreg_pc -= 4;
-	/*
-	 * We got a fault - fix it up, or die.
-	 */
-	do_bad_area(addr, error_code, regs);
-	return 0;
-
-bad:
-	/*
-	 * Oops, we didn't handle the instruction.
-	 * However, we must handle fpu instr firstly.
-	 */
-#ifdef CONFIG_UNICORE_FPU_F64
-	/* handle co.load/store */
-#define CODING_COLS                0xc0000000
-#define COLS_OFFSET_BITS(i)	(i & 0x1FF)
-#define COLS_L_BITS(i)		(i & (1<<24))
-#define COLS_FN_BITS(i)		((i>>14) & 31)
-	if ((instr & 0xe0000000) == CODING_COLS) {
-		unsigned int fn = COLS_FN_BITS(instr);
-		unsigned long val = 0;
-		if (COLS_L_BITS(instr)) {
-			get32t_unaligned_check(val, addr);
-			switch (fn) {
-#define ASM_MTF(n)	case n:						\
-			__asm__ __volatile__("MTF %0, F" __stringify(n)	\
-				: : "r"(val));				\
-			break;
-			ASM_MTF(0); ASM_MTF(1); ASM_MTF(2); ASM_MTF(3);
-			ASM_MTF(4); ASM_MTF(5); ASM_MTF(6); ASM_MTF(7);
-			ASM_MTF(8); ASM_MTF(9); ASM_MTF(10); ASM_MTF(11);
-			ASM_MTF(12); ASM_MTF(13); ASM_MTF(14); ASM_MTF(15);
-			ASM_MTF(16); ASM_MTF(17); ASM_MTF(18); ASM_MTF(19);
-			ASM_MTF(20); ASM_MTF(21); ASM_MTF(22); ASM_MTF(23);
-			ASM_MTF(24); ASM_MTF(25); ASM_MTF(26); ASM_MTF(27);
-			ASM_MTF(28); ASM_MTF(29); ASM_MTF(30); ASM_MTF(31);
-#undef ASM_MTF
-			}
-		} else {
-			switch (fn) {
-#define ASM_MFF(n)	case n:						\
-			__asm__ __volatile__("MFF %0, F" __stringify(n)	\
-				: : "r"(val));				\
-			break;
-			ASM_MFF(0); ASM_MFF(1); ASM_MFF(2); ASM_MFF(3);
-			ASM_MFF(4); ASM_MFF(5); ASM_MFF(6); ASM_MFF(7);
-			ASM_MFF(8); ASM_MFF(9); ASM_MFF(10); ASM_MFF(11);
-			ASM_MFF(12); ASM_MFF(13); ASM_MFF(14); ASM_MFF(15);
-			ASM_MFF(16); ASM_MFF(17); ASM_MFF(18); ASM_MFF(19);
-			ASM_MFF(20); ASM_MFF(21); ASM_MFF(22); ASM_MFF(23);
-			ASM_MFF(24); ASM_MFF(25); ASM_MFF(26); ASM_MFF(27);
-			ASM_MFF(28); ASM_MFF(29); ASM_MFF(30); ASM_MFF(31);
-#undef ASM_MFF
-			}
-			put32t_unaligned_check(val, addr);
-		}
-		return TYPE_COLS;
-	}
-fault:
-	return TYPE_FAULT;
-#endif
-	printk(KERN_ERR "Alignment trap: not handling instruction "
-	       "%08lx at [<%08lx>]\n", instr, instrptr);
-	return 1;
-}
-
-/*
- * This needs to be done after sysctl_init, otherwise sys/ will be
- * overwritten.  Actually, this shouldn't be in sys/ at all since
- * it isn't a sysctl, and it doesn't contain sysctl information.
- */
-static int __init alignment_init(void)
-{
-	hook_fault_code(1, do_alignment, SIGBUS, BUS_ADRALN,
-			"alignment exception");
-
-	return 0;
-}
-
-fs_initcall(alignment_init);
diff --git a/arch/unicore32/mm/cache-ucv2.S b/arch/unicore32/mm/cache-ucv2.S
deleted file mode 100644
index 2108837d6f4f..000000000000
--- a/arch/unicore32/mm/cache-ucv2.S
+++ /dev/null
@@ -1,209 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/mm/cache-ucv2.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- *  This is the "shell" of the UniCore-v2 processor support.
- */
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <asm/assembler.h>
-#include <asm/page.h>
-
-#include "proc-macros.S"
-
-/*
- *	__cpuc_flush_icache_all()
- *	__cpuc_flush_kern_all()
- *	__cpuc_flush_user_all()
- *
- *	Flush the entire cache.
- */
-ENTRY(__cpuc_flush_icache_all)
-	/*FALLTHROUGH*/
-ENTRY(__cpuc_flush_kern_all)
-	/*FALLTHROUGH*/
-ENTRY(__cpuc_flush_user_all)
-	mov	r0, #0
-	movc	p0.c5, r0, #14			@ Dcache flush all
-	nop8
-
-	mov	r0, #0
-	movc	p0.c5, r0, #20			@ Icache invalidate all
-	nop8
-
-	mov	pc, lr
-
-/*
- *	__cpuc_flush_user_range(start, end, flags)
- *
- *	Flush a range of TLB entries in the specified address space.
- *
- *	- start - start address (may not be aligned)
- *	- end   - end address (exclusive, may not be aligned)
- *	- flags	- vm_area_struct flags describing address space
- */
-ENTRY(__cpuc_flush_user_range)
-	cxor.a	r2, #0
-	beq	__cpuc_dma_flush_range
-
-#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE
-	andn	r0, r0, #CACHE_LINESIZE - 1	@ Safety check
-	sub	r1, r1, r0
-	csub.a	r1, #MAX_AREA_SIZE
-	bsg	2f
-
-	andn	r1, r1, #CACHE_LINESIZE - 1
-	add	r1, r1, #CACHE_LINESIZE
-
-101:	dcacheline_flush	r0, r11, r12
-
-	add	r0, r0, #CACHE_LINESIZE
-	sub.a	r1, r1, #CACHE_LINESIZE
-	bns	101b
-	b	3f
-#endif
-2:	mov	ip, #0
-	movc	p0.c5, ip, #14			@ Dcache flush all
-	nop8
-
-3:	mov	ip, #0
-	movc	p0.c5, ip, #20			@ Icache invalidate all
-	nop8
-
-	mov	pc, lr
-
-/*
- *	__cpuc_coherent_kern_range(start,end)
- *	__cpuc_coherent_user_range(start,end)
- *
- *	Ensure that the I and D caches are coherent within specified
- *	region.  This is typically used when code has been written to
- *	a memory region, and will be executed.
- *
- *	- start   - virtual start address of region
- *	- end     - virtual end address of region
- */
-ENTRY(__cpuc_coherent_kern_range)
-	/* FALLTHROUGH */
-ENTRY(__cpuc_coherent_user_range)
-#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE
-	andn	r0, r0, #CACHE_LINESIZE - 1	@ Safety check
-	sub	r1, r1, r0
-	csub.a	r1, #MAX_AREA_SIZE
-	bsg	2f
-
-	andn	r1, r1, #CACHE_LINESIZE - 1
-	add	r1, r1, #CACHE_LINESIZE
-
-	@ r0 va2pa r10
-	mov	r9, #PAGE_SZ
-	sub	r9, r9, #1			@ PAGE_MASK
-101:	va2pa	r0, r10, r11, r12, r13, 2f	@ r10 is PA
-	b	103f
-102:	cand.a	r0, r9
-	beq	101b
-
-103:	movc	p0.c5, r10, #11			@ Dcache clean line of R10
-	nop8
-
-	add	r0, r0, #CACHE_LINESIZE
-	add	r10, r10, #CACHE_LINESIZE
-	sub.a	r1, r1, #CACHE_LINESIZE
-	bns	102b
-	b	3f
-#endif
-2:	mov	ip, #0
-	movc	p0.c5, ip, #10			@ Dcache clean all
-	nop8
-
-3:	mov	ip, #0
-	movc	p0.c5, ip, #20			@ Icache invalidate all
-	nop8
-
-	mov	pc, lr
-
-/*
- *	__cpuc_flush_kern_dcache_area(void *addr, size_t size)
- *
- *	- addr	- kernel address
- *	- size	- region size
- */
-ENTRY(__cpuc_flush_kern_dcache_area)
-	mov	ip, #0
-	movc	p0.c5, ip, #14			@ Dcache flush all
-	nop8
-	mov	pc, lr
-
-/*
- *	__cpuc_dma_clean_range(start,end)
- *	- start   - virtual start address of region
- *	- end     - virtual end address of region
- */
-ENTRY(__cpuc_dma_clean_range)
-#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE
-	andn	r0, r0, #CACHE_LINESIZE - 1
-	sub	r1, r1, r0
-	andn	r1, r1, #CACHE_LINESIZE - 1
-	add	r1, r1, #CACHE_LINESIZE
-
-	csub.a	r1, #MAX_AREA_SIZE
-	bsg	2f
-
-	@ r0 va2pa r10
-	mov	r9, #PAGE_SZ
-	sub	r9, r9, #1			@ PAGE_MASK
-101:	va2pa	r0, r10, r11, r12, r13, 2f	@ r10 is PA
-	b	1f
-102:	cand.a	r0, r9
-	beq	101b
-
-1:	movc	p0.c5, r10, #11			@ Dcache clean line of R10
-	nop8
-	add	r0, r0, #CACHE_LINESIZE
-	add	r10, r10, #CACHE_LINESIZE
-	sub.a	r1, r1, #CACHE_LINESIZE
-	bns	102b
-	mov	pc, lr
-#endif
-2:	mov	ip, #0
-	movc	p0.c5, ip, #10			@ Dcache clean all
-	nop8
-
-	mov	pc, lr
-
-/*
- *	__cpuc_dma_inv_range(start,end)
- *	__cpuc_dma_flush_range(start,end)
- *	- start   - virtual start address of region
- *	- end     - virtual end address of region
- */
-__cpuc_dma_inv_range:
-	/* FALLTHROUGH */
-ENTRY(__cpuc_dma_flush_range)
-#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE
-	andn	r0, r0, #CACHE_LINESIZE - 1
-	sub	r1, r1, r0
-	andn	r1, r1, #CACHE_LINESIZE - 1
-	add	r1, r1, #CACHE_LINESIZE
-
-	csub.a	r1, #MAX_AREA_SIZE
-	bsg	2f
-
-	@ r0 va2pa r10
-101:	dcacheline_flush	r0, r11, r12
-
-	add	r0, r0, #CACHE_LINESIZE
-	sub.a	r1, r1, #CACHE_LINESIZE
-	bns	101b
-	mov	pc, lr
-#endif
-2:	mov	ip, #0
-	movc	p0.c5, ip, #14			@ Dcache flush all
-	nop8
-
-	mov	pc, lr
-
diff --git a/arch/unicore32/mm/extable.c b/arch/unicore32/mm/extable.c
deleted file mode 100644
index e53352b41c4a..000000000000
--- a/arch/unicore32/mm/extable.c
+++ /dev/null
@@ -1,21 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/mm/extable.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/extable.h>
-#include <linux/uaccess.h>
-
-int fixup_exception(struct pt_regs *regs)
-{
-	const struct exception_table_entry *fixup;
-
-	fixup = search_exception_tables(instruction_pointer(regs));
-	if (fixup)
-		regs->UCreg_pc = fixup->fixup;
-
-	return fixup != NULL;
-}
diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c
deleted file mode 100644
index 7654bddde133..000000000000
--- a/arch/unicore32/mm/fault.c
+++ /dev/null
@@ -1,481 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/mm/fault.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/extable.h>
-#include <linux/signal.h>
-#include <linux/mm.h>
-#include <linux/hardirq.h>
-#include <linux/init.h>
-#include <linux/kprobes.h>
-#include <linux/uaccess.h>
-#include <linux/page-flags.h>
-#include <linux/sched/signal.h>
-#include <linux/io.h>
-
-#include <asm/tlbflush.h>
-
-/*
- * Fault status register encodings.  We steal bit 31 for our own purposes.
- */
-#define FSR_LNX_PF		(1 << 31)
-
-static inline int fsr_fs(unsigned int fsr)
-{
-	/* xyabcde will be abcde+xy */
-	return (fsr & 31) + ((fsr & (3 << 5)) >> 5);
-}
-
-/*
- * This is useful to dump out the page tables associated with
- * 'addr' in mm 'mm'.
- */
-void show_pte(struct mm_struct *mm, unsigned long addr)
-{
-	pgd_t *pgd;
-
-	if (!mm)
-		mm = &init_mm;
-
-	printk(KERN_ALERT "pgd = %p\n", mm->pgd);
-	pgd = pgd_offset(mm, addr);
-	printk(KERN_ALERT "[%08lx] *pgd=%08lx", addr, pgd_val(*pgd));
-
-	do {
-		pmd_t *pmd;
-		pte_t *pte;
-
-		if (pgd_none(*pgd))
-			break;
-
-		if (pgd_bad(*pgd)) {
-			printk("(bad)");
-			break;
-		}
-
-		pmd = pmd_offset((pud_t *) pgd, addr);
-		if (PTRS_PER_PMD != 1)
-			printk(", *pmd=%08lx", pmd_val(*pmd));
-
-		if (pmd_none(*pmd))
-			break;
-
-		if (pmd_bad(*pmd)) {
-			printk("(bad)");
-			break;
-		}
-
-		/* We must not map this if we have highmem enabled */
-		if (PageHighMem(pfn_to_page(pmd_val(*pmd) >> PAGE_SHIFT)))
-			break;
-
-		pte = pte_offset_map(pmd, addr);
-		printk(", *pte=%08lx", pte_val(*pte));
-		pte_unmap(pte);
-	} while (0);
-
-	printk("\n");
-}
-
-/*
- * Oops.  The kernel tried to access some page that wasn't present.
- */
-static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr,
-		unsigned int fsr, struct pt_regs *regs)
-{
-	/*
-	 * Are we prepared to handle this kernel fault?
-	 */
-	if (fixup_exception(regs))
-		return;
-
-	/*
-	 * No handler, we'll have to terminate things with extreme prejudice.
-	 */
-	bust_spinlocks(1);
-	printk(KERN_ALERT
-	       "Unable to handle kernel %s at virtual address %08lx\n",
-	       (addr < PAGE_SIZE) ? "NULL pointer dereference" :
-	       "paging request", addr);
-
-	show_pte(mm, addr);
-	die("Oops", regs, fsr);
-	bust_spinlocks(0);
-	do_exit(SIGKILL);
-}
-
-/*
- * Something tried to access memory that isn't in our memory map..
- * User mode accesses just cause a SIGSEGV
- */
-static void __do_user_fault(unsigned long addr, unsigned int fsr,
-			    unsigned int sig, int code,	struct pt_regs *regs)
-{
-	struct task_struct *tsk = current;
-
-	tsk->thread.address = addr;
-	tsk->thread.error_code = fsr;
-	tsk->thread.trap_no = 14;
-	force_sig_fault(sig, code, (void __user *)addr);
-}
-
-void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
-{
-	struct task_struct *tsk = current;
-	struct mm_struct *mm = tsk->active_mm;
-
-	/*
-	 * If we are in kernel mode at this point, we
-	 * have no context to handle this fault with.
-	 */
-	if (user_mode(regs))
-		__do_user_fault(addr, fsr, SIGSEGV, SEGV_MAPERR, regs);
-	else
-		__do_kernel_fault(mm, addr, fsr, regs);
-}
-
-#define VM_FAULT_BADMAP		0x010000
-#define VM_FAULT_BADACCESS	0x020000
-
-/*
- * Check that the permissions on the VMA allow for the fault which occurred.
- * If we encountered a write fault, we must have write permission, otherwise
- * we allow any permission.
- */
-static inline bool access_error(unsigned int fsr, struct vm_area_struct *vma)
-{
-	unsigned int mask = VM_ACCESS_FLAGS;
-
-	if (!(fsr ^ 0x12))	/* write? */
-		mask = VM_WRITE;
-	if (fsr & FSR_LNX_PF)
-		mask = VM_EXEC;
-
-	return vma->vm_flags & mask ? false : true;
-}
-
-static vm_fault_t __do_pf(struct mm_struct *mm, unsigned long addr,
-		unsigned int fsr, unsigned int flags, struct task_struct *tsk)
-{
-	struct vm_area_struct *vma;
-	vm_fault_t fault;
-
-	vma = find_vma(mm, addr);
-	fault = VM_FAULT_BADMAP;
-	if (unlikely(!vma))
-		goto out;
-	if (unlikely(vma->vm_start > addr))
-		goto check_stack;
-
-	/*
-	 * Ok, we have a good vm_area for this
-	 * memory access, so we can handle it.
-	 */
-good_area:
-	if (access_error(fsr, vma)) {
-		fault = VM_FAULT_BADACCESS;
-		goto out;
-	}
-
-	/*
-	 * If for any reason at all we couldn't handle the fault, make
-	 * sure we exit gracefully rather than endlessly redo the fault.
-	 */
-	fault = handle_mm_fault(vma, addr & PAGE_MASK, flags);
-	return fault;
-
-check_stack:
-	if (vma->vm_flags & VM_GROWSDOWN && !expand_stack(vma, addr))
-		goto good_area;
-out:
-	return fault;
-}
-
-static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
-{
-	struct task_struct *tsk;
-	struct mm_struct *mm;
-	int sig, code;
-	vm_fault_t fault;
-	unsigned int flags = FAULT_FLAG_DEFAULT;
-
-	tsk = current;
-	mm = tsk->mm;
-
-	/*
-	 * If we're in an interrupt or have no user
-	 * context, we must not take the fault..
-	 */
-	if (faulthandler_disabled() || !mm)
-		goto no_context;
-
-	if (user_mode(regs))
-		flags |= FAULT_FLAG_USER;
-	if (!(fsr ^ 0x12))
-		flags |= FAULT_FLAG_WRITE;
-
-	/*
-	 * As per x86, we may deadlock here.  However, since the kernel only
-	 * validly references user space from well defined areas of the code,
-	 * we can bug out early if this is from code which shouldn't.
-	 */
-	if (!mmap_read_trylock(mm)) {
-		if (!user_mode(regs)
-		    && !search_exception_tables(regs->UCreg_pc))
-			goto no_context;
-retry:
-		mmap_read_lock(mm);
-	} else {
-		/*
-		 * The above down_read_trylock() might have succeeded in
-		 * which case, we'll have missed the might_sleep() from
-		 * down_read()
-		 */
-		might_sleep();
-#ifdef CONFIG_DEBUG_VM
-		if (!user_mode(regs) &&
-		    !search_exception_tables(regs->UCreg_pc))
-			goto no_context;
-#endif
-	}
-
-	fault = __do_pf(mm, addr, fsr, flags, tsk);
-
-	/* If we need to retry but a fatal signal is pending, handle the
-	 * signal first. We do not need to release the mmap_lock because
-	 * it would already be released in __lock_page_or_retry in
-	 * mm/filemap.c. */
-	if (fault_signal_pending(fault, regs))
-		return 0;
-
-	if (!(fault & VM_FAULT_ERROR) && (flags & FAULT_FLAG_ALLOW_RETRY)) {
-		if (fault & VM_FAULT_MAJOR)
-			tsk->maj_flt++;
-		else
-			tsk->min_flt++;
-		if (fault & VM_FAULT_RETRY) {
-			flags |= FAULT_FLAG_TRIED;
-			goto retry;
-		}
-	}
-
-	mmap_read_unlock(mm);
-
-	/*
-	 * Handle the "normal" case first - VM_FAULT_MAJOR
-	 */
-	if (likely(!(fault &
-	       (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
-		return 0;
-
-	/*
-	 * If we are in kernel mode at this point, we
-	 * have no context to handle this fault with.
-	 */
-	if (!user_mode(regs))
-		goto no_context;
-
-	if (fault & VM_FAULT_OOM) {
-		/*
-		 * We ran out of memory, call the OOM killer, and return to
-		 * userspace (which will retry the fault, or kill us if we
-		 * got oom-killed)
-		 */
-		pagefault_out_of_memory();
-		return 0;
-	}
-
-	if (fault & VM_FAULT_SIGBUS) {
-		/*
-		 * We had some memory, but were unable to
-		 * successfully fix up this page fault.
-		 */
-		sig = SIGBUS;
-		code = BUS_ADRERR;
-	} else {
-		/*
-		 * Something tried to access memory that
-		 * isn't in our memory map..
-		 */
-		sig = SIGSEGV;
-		code = fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR;
-	}
-
-	__do_user_fault(addr, fsr, sig, code, regs);
-	return 0;
-
-no_context:
-	__do_kernel_fault(mm, addr, fsr, regs);
-	return 0;
-}
-
-/*
- * First Level Translation Fault Handler
- *
- * We enter here because the first level page table doesn't contain
- * a valid entry for the address.
- *
- * If the address is in kernel space (>= TASK_SIZE), then we are
- * probably faulting in the vmalloc() area.
- *
- * If the init_task's first level page tables contains the relevant
- * entry, we copy the it to this task.  If not, we send the process
- * a signal, fixup the exception, or oops the kernel.
- *
- * NOTE! We MUST NOT take any locks for this case. We may be in an
- * interrupt or a critical region, and should only copy the information
- * from the master page table, nothing more.
- */
-static int do_ifault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
-{
-	unsigned int index;
-	pgd_t *pgd, *pgd_k;
-	pmd_t *pmd, *pmd_k;
-
-	if (addr < TASK_SIZE)
-		return do_pf(addr, fsr, regs);
-
-	if (user_mode(regs))
-		goto bad_area;
-
-	index = pgd_index(addr);
-
-	pgd = cpu_get_pgd() + index;
-	pgd_k = init_mm.pgd + index;
-
-	if (pgd_none(*pgd_k))
-		goto bad_area;
-
-	pmd_k = pmd_offset((pud_t *) pgd_k, addr);
-	pmd = pmd_offset((pud_t *) pgd, addr);
-
-	if (pmd_none(*pmd_k))
-		goto bad_area;
-
-	set_pmd(pmd, *pmd_k);
-	flush_pmd_entry(pmd);
-	return 0;
-
-bad_area:
-	do_bad_area(addr, fsr, regs);
-	return 0;
-}
-
-/*
- * This abort handler always returns "fault".
- */
-static int do_bad(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
-{
-	return 1;
-}
-
-static int do_good(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
-{
-	unsigned int res1, res2;
-
-	printk("dabt exception but no error!\n");
-
-	__asm__ __volatile__(
-			"mff %0,f0\n"
-			"mff %1,f1\n"
-			: "=r"(res1), "=r"(res2)
-			:
-			: "memory");
-
-	printk(KERN_EMERG "r0 :%08x  r1 :%08x\n", res1, res2);
-	panic("shut up\n");
-	return 0;
-}
-
-static struct fsr_info {
-	int (*fn) (unsigned long addr, unsigned int fsr, struct pt_regs *regs);
-	int sig;
-	int code;
-	const char *name;
-} fsr_info[] = {
-	/*
-	 * The following are the standard Unicore-I and UniCore-II aborts.
-	 */
-	{ do_good,	SIGBUS,  0,		"no error"		},
-	{ do_bad,	SIGBUS,  BUS_ADRALN,	"alignment exception"	},
-	{ do_bad,	SIGBUS,  BUS_OBJERR,	"external exception"	},
-	{ do_bad,	SIGBUS,  0,		"burst operation"	},
-	{ do_bad,	SIGBUS,  0,		"unknown 00100"		},
-	{ do_ifault,	SIGSEGV, SEGV_MAPERR,	"2nd level pt non-exist"},
-	{ do_bad,	SIGBUS,  0,		"2nd lvl large pt non-exist" },
-	{ do_bad,	SIGBUS,  0,		"invalid pte"		},
-	{ do_pf,	SIGSEGV, SEGV_MAPERR,	"page miss"		},
-	{ do_bad,	SIGBUS,  0,		"middle page miss"	},
-	{ do_bad,	SIGBUS,	 0,		"large page miss"	},
-	{ do_pf,	SIGSEGV, SEGV_MAPERR,	"super page (section) miss" },
-	{ do_bad,	SIGBUS,  0,		"unknown 01100"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 01101"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 01110"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 01111"		},
-	{ do_bad,	SIGBUS,  0,		"addr: up 3G or IO"	},
-	{ do_pf,	SIGSEGV, SEGV_ACCERR,	"read unreadable addr"	},
-	{ do_pf,	SIGSEGV, SEGV_ACCERR,	"write unwriteable addr"},
-	{ do_pf,	SIGSEGV, SEGV_ACCERR,	"exec unexecutable addr"},
-	{ do_bad,	SIGBUS,  0,		"unknown 10100"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 10101"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 10110"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 10111"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 11000"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 11001"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 11010"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 11011"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 11100"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 11101"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 11110"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 11111"		}
-};
-
-void __init hook_fault_code(int nr,
-		int (*fn) (unsigned long, unsigned int, struct pt_regs *),
-		int sig, int code, const char *name)
-{
-	if (nr < 0 || nr >= ARRAY_SIZE(fsr_info))
-		BUG();
-
-	fsr_info[nr].fn   = fn;
-	fsr_info[nr].sig  = sig;
-	fsr_info[nr].code = code;
-	fsr_info[nr].name = name;
-}
-
-/*
- * Dispatch a data abort to the relevant handler.
- */
-asmlinkage void do_DataAbort(unsigned long addr, unsigned int fsr,
-			struct pt_regs *regs)
-{
-	const struct fsr_info *inf = fsr_info + fsr_fs(fsr);
-
-	if (!inf->fn(addr, fsr & ~FSR_LNX_PF, regs))
-		return;
-
-	printk(KERN_ALERT "Unhandled fault: %s (0x%03x) at 0x%08lx\n",
-	       inf->name, fsr, addr);
-
-	uc32_notify_die("", regs, inf->sig, inf->code, (void __user *)addr,
-			fsr, 0);
-}
-
-asmlinkage void do_PrefetchAbort(unsigned long addr,
-			unsigned int ifsr, struct pt_regs *regs)
-{
-	const struct fsr_info *inf = fsr_info + fsr_fs(ifsr);
-
-	if (!inf->fn(addr, ifsr | FSR_LNX_PF, regs))
-		return;
-
-	printk(KERN_ALERT "Unhandled prefetch abort: %s (0x%03x) at 0x%08lx\n",
-	       inf->name, ifsr, addr);
-
-	uc32_notify_die("", regs, inf->sig, inf->code, (void __user *)addr,
-			ifsr, 0);
-}
diff --git a/arch/unicore32/mm/flush.c b/arch/unicore32/mm/flush.c
deleted file mode 100644
index 65954f8d89a2..000000000000
--- a/arch/unicore32/mm/flush.c
+++ /dev/null
@@ -1,94 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/mm/flush.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-
-void flush_cache_mm(struct mm_struct *mm)
-{
-}
-
-void flush_cache_range(struct vm_area_struct *vma, unsigned long start,
-		unsigned long end)
-{
-	if (vma->vm_flags & VM_EXEC)
-		__flush_icache_all();
-}
-
-void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr,
-		unsigned long pfn)
-{
-}
-
-static void flush_ptrace_access(struct vm_area_struct *vma, struct page *page,
-			 unsigned long uaddr, void *kaddr, unsigned long len)
-{
-	/* VIPT non-aliasing D-cache */
-	if (vma->vm_flags & VM_EXEC) {
-		unsigned long addr = (unsigned long)kaddr;
-
-		__cpuc_coherent_kern_range(addr, addr + len);
-	}
-}
-
-/*
- * Copy user data from/to a page which is mapped into a different
- * processes address space.  Really, we want to allow our "user
- * space" model to handle this.
- *
- * Note that this code needs to run on the current CPU.
- */
-void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
-		       unsigned long uaddr, void *dst, const void *src,
-		       unsigned long len)
-{
-	memcpy(dst, src, len);
-	flush_ptrace_access(vma, page, uaddr, dst, len);
-}
-
-void __flush_dcache_page(struct address_space *mapping, struct page *page)
-{
-	/*
-	 * Writeback any data associated with the kernel mapping of this
-	 * page.  This ensures that data in the physical page is mutually
-	 * coherent with the kernels mapping.
-	 */
-	__cpuc_flush_kern_dcache_area(page_address(page), PAGE_SIZE);
-}
-
-/*
- * Ensure cache coherency between kernel mapping and userspace mapping
- * of this page.
- */
-void flush_dcache_page(struct page *page)
-{
-	struct address_space *mapping;
-
-	/*
-	 * The zero page is never written to, so never has any dirty
-	 * cache lines, and therefore never needs to be flushed.
-	 */
-	if (page == ZERO_PAGE(0))
-		return;
-
-	mapping = page_mapping_file(page);
-
-	if (mapping && !mapping_mapped(mapping))
-		clear_bit(PG_dcache_clean, &page->flags);
-	else {
-		__flush_dcache_page(mapping, page);
-		if (mapping)
-			__flush_icache_all();
-		set_bit(PG_dcache_clean, &page->flags);
-	}
-}
-EXPORT_SYMBOL(flush_dcache_page);
diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c
deleted file mode 100644
index 52425d383cea..000000000000
--- a/arch/unicore32/mm/init.c
+++ /dev/null
@@ -1,261 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  linux/arch/unicore32/mm/init.c
- *
- *  Copyright (C) 2010 GUAN Xue-tao
- */
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/swap.h>
-#include <linux/init.h>
-#include <linux/memblock.h>
-#include <linux/mman.h>
-#include <linux/nodemask.h>
-#include <linux/initrd.h>
-#include <linux/highmem.h>
-#include <linux/gfp.h>
-#include <linux/sort.h>
-#include <linux/dma-mapping.h>
-#include <linux/export.h>
-
-#include <asm/sections.h>
-#include <asm/setup.h>
-#include <linux/sizes.h>
-#include <asm/tlb.h>
-#include <asm/memblock.h>
-#include <mach/map.h>
-
-#include "mm.h"
-
-/*
- * This keeps memory configuration data used by a couple memory
- * initialization functions, as well as show_mem() for the skipping
- * of holes in the memory map.  It is populated by uc32_add_memory().
- */
-struct meminfo meminfo;
-
-static void __init find_limits(unsigned long *min, unsigned long *max_low,
-	unsigned long *max_high)
-{
-	struct meminfo *mi = &meminfo;
-	int i;
-
-	*min = -1UL;
-	*max_low = *max_high = 0;
-
-	for_each_bank(i, mi) {
-		struct membank *bank = &mi->bank[i];
-		unsigned long start, end;
-
-		start = bank_pfn_start(bank);
-		end = bank_pfn_end(bank);
-
-		if (*min > start)
-			*min = start;
-		if (*max_high < end)
-			*max_high = end;
-		if (bank->highmem)
-			continue;
-		if (*max_low < end)
-			*max_low = end;
-	}
-}
-
-static void __init uc32_bootmem_free(unsigned long max_low)
-{
-	unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
-
-	max_zone_pfn[ZONE_DMA] = max_low;
-	max_zone_pfn[ZONE_NORMAL] = max_low;
-
-	/*
-	 * Adjust the sizes according to any special requirements for
-	 * this machine type.
-	 * This might lower ZONE_DMA limit.
-	 */
-	arch_adjust_zones(max_zone_pfn);
-
-	free_area_init(max_zone_pfn);
-}
-
-int pfn_valid(unsigned long pfn)
-{
-	return memblock_is_memory(pfn << PAGE_SHIFT);
-}
-EXPORT_SYMBOL(pfn_valid);
-
-static void uc32_memory_present(void)
-{
-}
-
-static int __init meminfo_cmp(const void *_a, const void *_b)
-{
-	const struct membank *a = _a, *b = _b;
-	long cmp = bank_pfn_start(a) - bank_pfn_start(b);
-	return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
-}
-
-void __init uc32_memblock_init(struct meminfo *mi)
-{
-	int i;
-
-	sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]),
-		meminfo_cmp, NULL);
-
-	for (i = 0; i < mi->nr_banks; i++)
-		memblock_add(mi->bank[i].start, mi->bank[i].size);
-
-	/* Register the kernel text, kernel data and initrd with memblock. */
-	memblock_reserve(__pa(_text), _end - _text);
-
-#ifdef CONFIG_BLK_DEV_INITRD
-	if (!phys_initrd_size) {
-		phys_initrd_start = 0x01000000;
-		phys_initrd_size = SZ_8M;
-	}
-
-	if (phys_initrd_size) {
-		memblock_reserve(phys_initrd_start, phys_initrd_size);
-
-		/* Now convert initrd to virtual addresses */
-		initrd_start = __phys_to_virt(phys_initrd_start);
-		initrd_end = initrd_start + phys_initrd_size;
-	}
-#endif
-
-	uc32_mm_memblock_reserve();
-
-	memblock_allow_resize();
-	memblock_dump_all();
-}
-
-void __init bootmem_init(void)
-{
-	unsigned long min, max_low, max_high;
-
-	max_low = max_high = 0;
-
-	find_limits(&min, &max_low, &max_high);
-
-	node_set_online(0);
-
-	/*
-	 * Sparsemem tries to allocate bootmem in memory_present(),
-	 * so must be done after the fixed reservations
-	 */
-	uc32_memory_present();
-
-	/*
-	 * sparse_init() needs the bootmem allocator up and running.
-	 */
-	sparse_init();
-
-	/*
-	 * Now free the memory - free_area_init needs
-	 * the sparse mem_map arrays initialized by sparse_init()
-	 * for memmap_init_zone(), otherwise all PFNs are invalid.
-	 */
-	uc32_bootmem_free(max_low);
-
-	high_memory = __va((max_low << PAGE_SHIFT) - 1) + 1;
-
-	/*
-	 * This doesn't seem to be used by the Linux memory manager any
-	 * more, but is used by ll_rw_block.  If we can get rid of it, we
-	 * also get rid of some of the stuff above as well.
-	 *
-	 * Note: max_low_pfn and max_pfn reflect the number of _pages_ in
-	 * the system, not the maximum PFN.
-	 */
-	max_low_pfn = max_low - PHYS_PFN_OFFSET;
-	max_pfn = max_high - PHYS_PFN_OFFSET;
-}
-
-static inline void
-free_memmap(unsigned long start_pfn, unsigned long end_pfn)
-{
-	struct page *start_pg, *end_pg;
-	unsigned long pg, pgend;
-
-	/*
-	 * Convert start_pfn/end_pfn to a struct page pointer.
-	 */
-	start_pg = pfn_to_page(start_pfn - 1) + 1;
-	end_pg = pfn_to_page(end_pfn);
-
-	/*
-	 * Convert to physical addresses, and
-	 * round start upwards and end downwards.
-	 */
-	pg = PAGE_ALIGN(__pa(start_pg));
-	pgend = __pa(end_pg) & PAGE_MASK;
-
-	/*
-	 * If there are free pages between these,
-	 * free the section of the memmap array.
-	 */
-	if (pg < pgend)
-		memblock_free(pg, pgend - pg);
-}
-
-/*
- * The mem_map array can get very big.  Free the unused area of the memory map.
- */
-static void __init free_unused_memmap(struct meminfo *mi)
-{
-	unsigned long bank_start, prev_bank_end = 0;
-	unsigned int i;
-
-	/*
-	 * This relies on each bank being in address order.
-	 * The banks are sorted previously in bootmem_init().
-	 */
-	for_each_bank(i, mi) {
-		struct membank *bank = &mi->bank[i];
-
-		bank_start = bank_pfn_start(bank);
-
-		/*
-		 * If we had a previous bank, and there is a space
-		 * between the current bank and the previous, free it.
-		 */
-		if (prev_bank_end && prev_bank_end < bank_start)
-			free_memmap(prev_bank_end, bank_start);
-
-		/*
-		 * Align up here since the VM subsystem insists that the
-		 * memmap entries are valid from the bank end aligned to
-		 * MAX_ORDER_NR_PAGES.
-		 */
-		prev_bank_end = ALIGN(bank_pfn_end(bank), MAX_ORDER_NR_PAGES);
-	}
-}
-
-/*
- * mem_init() marks the free areas in the mem_map and tells us how much
- * memory is free.  This is done after various parts of the system have
- * claimed their memory after the kernel image.
- */
-void __init mem_init(void)
-{
-	max_mapnr   = pfn_to_page(max_pfn + PHYS_PFN_OFFSET) - mem_map;
-
-	free_unused_memmap(&meminfo);
-
-	/* this will put all unused low memory onto the freelists */
-	memblock_free_all();
-
-	mem_init_print_info(NULL);
-
-	BUILD_BUG_ON(TASK_SIZE				> MODULES_VADDR);
-	BUG_ON(TASK_SIZE				> MODULES_VADDR);
-
-	if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) {
-		/*
-		 * On a machine this small we won't get
-		 * anywhere without overcommit, so turn
-		 * it on by default.
-		 */
-		sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
-	}
-}
diff --git a/arch/unicore32/mm/ioremap.c b/arch/unicore32/mm/ioremap.c
deleted file mode 100644
index 46a64bd6156a..000000000000
--- a/arch/unicore32/mm/ioremap.c
+++ /dev/null
@@ -1,242 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/mm/ioremap.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- * Re-map IO memory to kernel address space so that we can access it.
- *
- * This allows a driver to remap an arbitrary region of bus memory into
- * virtual space.  One should *only* use readl, writel, memcpy_toio and
- * so on with such remapped areas.
- *
- * Because UniCore only has a 32-bit address space we can't address the
- * whole of the (physical) PCI space at once.  PCI huge-mode addressing
- * allows us to circumvent this restriction by splitting PCI space into
- * two 2GB chunks and mapping only one at a time into processor memory.
- * We use MMU protection domains to trap any attempt to access the bank
- * that is not currently mapped.  (This isn't fully implemented yet.)
- */
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-#include <linux/io.h>
-
-#include <asm/cputype.h>
-#include <asm/cacheflush.h>
-#include <asm/mmu_context.h>
-#include <asm/pgalloc.h>
-#include <asm/tlbflush.h>
-#include <linux/sizes.h>
-
-#include <mach/map.h>
-#include "mm.h"
-
-/*
- * Used by ioremap() and iounmap() code to mark (super)section-mapped
- * I/O regions in vm_struct->flags field.
- */
-#define VM_UNICORE_SECTION_MAPPING	0x80000000
-
-int ioremap_page(unsigned long virt, unsigned long phys,
-		 const struct mem_type *mtype)
-{
-	return ioremap_page_range(virt, virt + PAGE_SIZE, phys,
-				  __pgprot(mtype->prot_pte));
-}
-EXPORT_SYMBOL(ioremap_page);
-
-/*
- * Section support is unsafe on SMP - If you iounmap and ioremap a region,
- * the other CPUs will not see this change until their next context switch.
- * Meanwhile, (eg) if an interrupt comes in on one of those other CPUs
- * which requires the new ioremap'd region to be referenced, the CPU will
- * reference the _old_ region.
- *
- * Note that get_vm_area_caller() allocates a guard 4K page, so we need to
- * mask the size back to 4MB aligned or we will overflow in the loop below.
- */
-static void unmap_area_sections(unsigned long virt, unsigned long size)
-{
-	unsigned long addr = virt, end = virt + (size & ~(SZ_4M - 1));
-	pgd_t *pgd;
-
-	flush_cache_vunmap(addr, end);
-	pgd = pgd_offset_k(addr);
-	do {
-		pmd_t pmd, *pmdp = pmd_offset((pud_t *)pgd, addr);
-
-		pmd = *pmdp;
-		if (!pmd_none(pmd)) {
-			/*
-			 * Clear the PMD from the page table, and
-			 * increment the kvm sequence so others
-			 * notice this change.
-			 *
-			 * Note: this is still racy on SMP machines.
-			 */
-			pmd_clear(pmdp);
-
-			/*
-			 * Free the page table, if there was one.
-			 */
-			if ((pmd_val(pmd) & PMD_TYPE_MASK) == PMD_TYPE_TABLE)
-				pte_free_kernel(&init_mm, pmd_page_vaddr(pmd));
-		}
-
-		addr += PGDIR_SIZE;
-		pgd++;
-	} while (addr < end);
-
-	flush_tlb_kernel_range(virt, end);
-}
-
-static int
-remap_area_sections(unsigned long virt, unsigned long pfn,
-		    size_t size, const struct mem_type *type)
-{
-	unsigned long addr = virt, end = virt + size;
-	pgd_t *pgd;
-
-	/*
-	 * Remove and free any PTE-based mapping, and
-	 * sync the current kernel mapping.
-	 */
-	unmap_area_sections(virt, size);
-
-	pgd = pgd_offset_k(addr);
-	do {
-		pmd_t *pmd = pmd_offset((pud_t *)pgd, addr);
-
-		set_pmd(pmd, __pmd(__pfn_to_phys(pfn) | type->prot_sect));
-		pfn += SZ_4M >> PAGE_SHIFT;
-		flush_pmd_entry(pmd);
-
-		addr += PGDIR_SIZE;
-		pgd++;
-	} while (addr < end);
-
-	return 0;
-}
-
-void __iomem *__uc32_ioremap_pfn_caller(unsigned long pfn,
-	unsigned long offset, size_t size, unsigned int mtype, void *caller)
-{
-	const struct mem_type *type;
-	int err;
-	unsigned long addr;
-	struct vm_struct *area;
-
-	/*
-	 * High mappings must be section aligned
-	 */
-	if (pfn >= 0x100000 && (__pfn_to_phys(pfn) & ~SECTION_MASK))
-		return NULL;
-
-	/*
-	 * Don't allow RAM to be mapped
-	 */
-	if (pfn_valid(pfn)) {
-		WARN(1, "BUG: Your driver calls ioremap() on\n"
-			"system memory.  This leads to architecturally\n"
-			"unpredictable behaviour, and ioremap() will fail in\n"
-			"the next kernel release. Please fix your driver.\n");
-		return NULL;
-	}
-
-	type = get_mem_type(mtype);
-	if (!type)
-		return NULL;
-
-	/*
-	 * Page align the mapping size, taking account of any offset.
-	 */
-	size = PAGE_ALIGN(offset + size);
-
-	area = get_vm_area_caller(size, VM_IOREMAP, caller);
-	if (!area)
-		return NULL;
-	addr = (unsigned long)area->addr;
-
-	if (!((__pfn_to_phys(pfn) | size | addr) & ~PMD_MASK)) {
-		area->flags |= VM_UNICORE_SECTION_MAPPING;
-		err = remap_area_sections(addr, pfn, size, type);
-	} else
-		err = ioremap_page_range(addr, addr + size, __pfn_to_phys(pfn),
-					 __pgprot(type->prot_pte));
-
-	if (err) {
-		vunmap((void *)addr);
-		return NULL;
-	}
-
-	flush_cache_vmap(addr, addr + size);
-	return (void __iomem *) (offset + addr);
-}
-
-void __iomem *__uc32_ioremap_caller(unsigned long phys_addr, size_t size,
-	unsigned int mtype, void *caller)
-{
-	unsigned long last_addr;
-	unsigned long offset = phys_addr & ~PAGE_MASK;
-	unsigned long pfn = __phys_to_pfn(phys_addr);
-
-	/*
-	 * Don't allow wraparound or zero size
-	 */
-	last_addr = phys_addr + size - 1;
-	if (!size || last_addr < phys_addr)
-		return NULL;
-
-	return __uc32_ioremap_pfn_caller(pfn, offset, size, mtype, caller);
-}
-
-/*
- * Remap an arbitrary physical address space into the kernel virtual
- * address space. Needed when the kernel wants to access high addresses
- * directly.
- *
- * NOTE! We need to allow non-page-aligned mappings too: we will obviously
- * have to convert them into an offset in a page-aligned mapping, but the
- * caller shouldn't need to know that small detail.
- */
-void __iomem *
-__uc32_ioremap_pfn(unsigned long pfn, unsigned long offset, size_t size,
-		  unsigned int mtype)
-{
-	return __uc32_ioremap_pfn_caller(pfn, offset, size, mtype,
-			__builtin_return_address(0));
-}
-EXPORT_SYMBOL(__uc32_ioremap_pfn);
-
-void __iomem *
-__uc32_ioremap(unsigned long phys_addr, size_t size)
-{
-	return __uc32_ioremap_caller(phys_addr, size, MT_DEVICE,
-			__builtin_return_address(0));
-}
-EXPORT_SYMBOL(__uc32_ioremap);
-
-void __uc32_iounmap(volatile void __iomem *io_addr)
-{
-	void *addr = (void *)(PAGE_MASK & (unsigned long)io_addr);
-	struct vm_struct *vm;
-
-	/*
-	 * If this is a section based mapping we need to handle it
-	 * specially as the VM subsystem does not know how to handle
-	 * such a beast. We need the lock here b/c we need to clear
-	 * all the mappings before the area can be reclaimed
-	 * by someone else.
-	 */
-	vm = find_vm_area(addr);
-	if (vm && (vm->flags & VM_IOREMAP) &&
-		(vm->flags & VM_UNICORE_SECTION_MAPPING))
-		unmap_area_sections((unsigned long)vm->addr, vm->size);
-
-	vunmap(addr);
-}
-EXPORT_SYMBOL(__uc32_iounmap);
diff --git a/arch/unicore32/mm/mm.h b/arch/unicore32/mm/mm.h
deleted file mode 100644
index f157f5d249ab..000000000000
--- a/arch/unicore32/mm/mm.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/mm/mm.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <asm/hwdef-copro.h>
-
-/* the upper-most page table pointer */
-extern pmd_t *top_pmd;
-extern int sysctl_overcommit_memory;
-
-#define TOP_PTE(x)	pte_offset_kernel(top_pmd, x)
-
-struct mem_type {
-	unsigned int prot_pte;
-	unsigned int prot_l1;
-	unsigned int prot_sect;
-};
-
-const struct mem_type *get_mem_type(unsigned int type);
-
-extern void __flush_dcache_page(struct address_space *, struct page *);
-extern void hook_fault_code(int nr, int (*fn)
-		(unsigned long, unsigned int, struct pt_regs *),
-		int sig, int code, const char *name);
-
-void __init bootmem_init(void);
-void uc32_mm_memblock_reserve(void);
diff --git a/arch/unicore32/mm/mmu.c b/arch/unicore32/mm/mmu.c
deleted file mode 100644
index 183d5b056814..000000000000
--- a/arch/unicore32/mm/mmu.c
+++ /dev/null
@@ -1,513 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/mm/mmu.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/mman.h>
-#include <linux/nodemask.h>
-#include <linux/memblock.h>
-#include <linux/fs.h>
-#include <linux/io.h>
-
-#include <asm/cputype.h>
-#include <asm/sections.h>
-#include <asm/setup.h>
-#include <linux/sizes.h>
-#include <asm/tlb.h>
-#include <asm/memblock.h>
-
-#include <mach/map.h>
-
-#include "mm.h"
-
-/*
- * empty_zero_page is a special page that is used for
- * zero-initialized data and COW.
- */
-struct page *empty_zero_page;
-EXPORT_SYMBOL(empty_zero_page);
-
-/*
- * The pmd table for the upper-most set of pages.
- */
-pmd_t *top_pmd;
-
-pgprot_t pgprot_user;
-EXPORT_SYMBOL(pgprot_user);
-
-pgprot_t pgprot_kernel;
-EXPORT_SYMBOL(pgprot_kernel);
-
-static int __init noalign_setup(char *__unused)
-{
-	cr_alignment &= ~CR_A;
-	cr_no_alignment &= ~CR_A;
-	set_cr(cr_alignment);
-	return 1;
-}
-__setup("noalign", noalign_setup);
-
-void adjust_cr(unsigned long mask, unsigned long set)
-{
-	unsigned long flags;
-
-	mask &= ~CR_A;
-
-	set &= mask;
-
-	local_irq_save(flags);
-
-	cr_no_alignment = (cr_no_alignment & ~mask) | set;
-	cr_alignment = (cr_alignment & ~mask) | set;
-
-	set_cr((get_cr() & ~mask) | set);
-
-	local_irq_restore(flags);
-}
-
-struct map_desc {
-	unsigned long virtual;
-	unsigned long pfn;
-	unsigned long length;
-	unsigned int type;
-};
-
-#define PROT_PTE_DEVICE		(PTE_PRESENT | PTE_YOUNG |	\
-				PTE_DIRTY | PTE_READ | PTE_WRITE)
-#define PROT_SECT_DEVICE	(PMD_TYPE_SECT | PMD_PRESENT |	\
-				PMD_SECT_READ | PMD_SECT_WRITE)
-
-static struct mem_type mem_types[] = {
-	[MT_DEVICE] = {		  /* Strongly ordered */
-		.prot_pte	= PROT_PTE_DEVICE,
-		.prot_l1	= PMD_TYPE_TABLE | PMD_PRESENT,
-		.prot_sect	= PROT_SECT_DEVICE,
-	},
-	/*
-	 * MT_KUSER: pte for vecpage -- cacheable,
-	 *       and sect for unigfx mmap -- noncacheable
-	 */
-	[MT_KUSER] = {
-		.prot_pte  = PTE_PRESENT | PTE_YOUNG | PTE_DIRTY |
-				PTE_CACHEABLE | PTE_READ | PTE_EXEC,
-		.prot_l1   = PMD_TYPE_TABLE | PMD_PRESENT,
-		.prot_sect = PROT_SECT_DEVICE,
-	},
-	[MT_HIGH_VECTORS] = {
-		.prot_pte  = PTE_PRESENT | PTE_YOUNG | PTE_DIRTY |
-				PTE_CACHEABLE | PTE_READ | PTE_WRITE |
-				PTE_EXEC,
-		.prot_l1   = PMD_TYPE_TABLE | PMD_PRESENT,
-	},
-	[MT_MEMORY] = {
-		.prot_pte  = PTE_PRESENT | PTE_YOUNG | PTE_DIRTY |
-				PTE_WRITE | PTE_EXEC,
-		.prot_l1   = PMD_TYPE_TABLE | PMD_PRESENT,
-		.prot_sect = PMD_TYPE_SECT | PMD_PRESENT | PMD_SECT_CACHEABLE |
-				PMD_SECT_READ | PMD_SECT_WRITE | PMD_SECT_EXEC,
-	},
-	[MT_ROM] = {
-		.prot_sect = PMD_TYPE_SECT | PMD_PRESENT | PMD_SECT_CACHEABLE |
-				PMD_SECT_READ,
-	},
-};
-
-const struct mem_type *get_mem_type(unsigned int type)
-{
-	return type < ARRAY_SIZE(mem_types) ? &mem_types[type] : NULL;
-}
-EXPORT_SYMBOL(get_mem_type);
-
-/*
- * Adjust the PMD section entries according to the CPU in use.
- */
-static void __init build_mem_type_table(void)
-{
-	pgprot_user   = __pgprot(PTE_PRESENT | PTE_YOUNG | PTE_CACHEABLE);
-	pgprot_kernel = __pgprot(PTE_PRESENT | PTE_YOUNG |
-				 PTE_DIRTY | PTE_READ | PTE_WRITE |
-				 PTE_EXEC | PTE_CACHEABLE);
-}
-
-#define vectors_base()	(vectors_high() ? 0xffff0000 : 0)
-
-static pte_t * __init early_pte_alloc(pmd_t *pmd, unsigned long addr,
-		unsigned long prot)
-{
-	if (pmd_none(*pmd)) {
-		size_t size = PTRS_PER_PTE * sizeof(pte_t);
-		pte_t *pte = memblock_alloc(size, size);
-
-		if (!pte)
-			panic("%s: Failed to allocate %zu bytes align=%zx\n",
-			      __func__, size, size);
-
-		__pmd_populate(pmd, __pa(pte) | prot);
-	}
-	BUG_ON(pmd_bad(*pmd));
-	return pte_offset_kernel(pmd, addr);
-}
-
-static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr,
-				  unsigned long end, unsigned long pfn,
-				  const struct mem_type *type)
-{
-	pte_t *pte = early_pte_alloc(pmd, addr, type->prot_l1);
-	do {
-		set_pte(pte, pfn_pte(pfn, __pgprot(type->prot_pte)));
-		pfn++;
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-}
-
-static void __init alloc_init_section(pgd_t *pgd, unsigned long addr,
-				      unsigned long end, unsigned long phys,
-				      const struct mem_type *type)
-{
-	pmd_t *pmd = pmd_offset((pud_t *)pgd, addr);
-
-	/*
-	 * Try a section mapping - end, addr and phys must all be aligned
-	 * to a section boundary.
-	 */
-	if (((addr | end | phys) & ~SECTION_MASK) == 0) {
-		pmd_t *p = pmd;
-
-		do {
-			set_pmd(pmd, __pmd(phys | type->prot_sect));
-			phys += SECTION_SIZE;
-		} while (pmd++, addr += SECTION_SIZE, addr != end);
-
-		flush_pmd_entry(p);
-	} else {
-		/*
-		 * No need to loop; pte's aren't interested in the
-		 * individual L1 entries.
-		 */
-		alloc_init_pte(pmd, addr, end, __phys_to_pfn(phys), type);
-	}
-}
-
-/*
- * Create the page directory entries and any necessary
- * page tables for the mapping specified by `md'.  We
- * are able to cope here with varying sizes and address
- * offsets, and we take full advantage of sections.
- */
-static void __init create_mapping(struct map_desc *md)
-{
-	unsigned long phys, addr, length, end;
-	const struct mem_type *type;
-	pgd_t *pgd;
-
-	if (md->virtual != vectors_base() && md->virtual < TASK_SIZE) {
-		printk(KERN_WARNING "BUG: not creating mapping for "
-		       "0x%08llx at 0x%08lx in user region\n",
-		       __pfn_to_phys((u64)md->pfn), md->virtual);
-		return;
-	}
-
-	if ((md->type == MT_DEVICE || md->type == MT_ROM) &&
-	    md->virtual >= PAGE_OFFSET && md->virtual < VMALLOC_END) {
-		printk(KERN_WARNING "BUG: mapping for 0x%08llx at 0x%08lx "
-		       "overlaps vmalloc space\n",
-		       __pfn_to_phys((u64)md->pfn), md->virtual);
-	}
-
-	type = &mem_types[md->type];
-
-	addr = md->virtual & PAGE_MASK;
-	phys = (unsigned long)__pfn_to_phys(md->pfn);
-	length = PAGE_ALIGN(md->length + (md->virtual & ~PAGE_MASK));
-
-	if (type->prot_l1 == 0 && ((addr | phys | length) & ~SECTION_MASK)) {
-		printk(KERN_WARNING "BUG: map for 0x%08lx at 0x%08lx can not "
-		       "be mapped using pages, ignoring.\n",
-		       __pfn_to_phys(md->pfn), addr);
-		return;
-	}
-
-	pgd = pgd_offset_k(addr);
-	end = addr + length;
-	do {
-		unsigned long next = pgd_addr_end(addr, end);
-
-		alloc_init_section(pgd, addr, next, phys, type);
-
-		phys += next - addr;
-		addr = next;
-	} while (pgd++, addr != end);
-}
-
-static void * __initdata vmalloc_min = (void *)(VMALLOC_END - SZ_128M);
-
-/*
- * vmalloc=size forces the vmalloc area to be exactly 'size'
- * bytes. This can be used to increase (or decrease) the vmalloc
- * area - the default is 128m.
- */
-static int __init early_vmalloc(char *arg)
-{
-	unsigned long vmalloc_reserve = memparse(arg, NULL);
-
-	if (vmalloc_reserve < SZ_16M) {
-		vmalloc_reserve = SZ_16M;
-		printk(KERN_WARNING
-			"vmalloc area too small, limiting to %luMB\n",
-			vmalloc_reserve >> 20);
-	}
-
-	if (vmalloc_reserve > VMALLOC_END - (PAGE_OFFSET + SZ_32M)) {
-		vmalloc_reserve = VMALLOC_END - (PAGE_OFFSET + SZ_32M);
-		printk(KERN_WARNING
-			"vmalloc area is too big, limiting to %luMB\n",
-			vmalloc_reserve >> 20);
-	}
-
-	vmalloc_min = (void *)(VMALLOC_END - vmalloc_reserve);
-	return 0;
-}
-early_param("vmalloc", early_vmalloc);
-
-static phys_addr_t lowmem_limit __initdata = SZ_1G;
-
-static void __init sanity_check_meminfo(void)
-{
-	int i, j;
-
-	lowmem_limit = __pa(vmalloc_min - 1) + 1;
-	memblock_set_current_limit(lowmem_limit);
-
-	for (i = 0, j = 0; i < meminfo.nr_banks; i++) {
-		struct membank *bank = &meminfo.bank[j];
-		*bank = meminfo.bank[i];
-		j++;
-	}
-	meminfo.nr_banks = j;
-}
-
-static inline void prepare_page_table(void)
-{
-	unsigned long addr;
-	phys_addr_t end;
-
-	/*
-	 * Clear out all the mappings below the kernel image.
-	 */
-	for (addr = 0; addr < MODULES_VADDR; addr += PGDIR_SIZE)
-		pmd_clear(pmd_off_k(addr));
-
-	for ( ; addr < PAGE_OFFSET; addr += PGDIR_SIZE)
-		pmd_clear(pmd_off_k(addr));
-
-	/*
-	 * Find the end of the first block of lowmem.
-	 */
-	end = memblock.memory.regions[0].base + memblock.memory.regions[0].size;
-	if (end >= lowmem_limit)
-		end = lowmem_limit;
-
-	/*
-	 * Clear out all the kernel space mappings, except for the first
-	 * memory bank, up to the end of the vmalloc region.
-	 */
-	for (addr = __phys_to_virt(end);
-	     addr < VMALLOC_END; addr += PGDIR_SIZE)
-		pmd_clear(pmd_off_k(addr));
-}
-
-/*
- * Reserve the special regions of memory
- */
-void __init uc32_mm_memblock_reserve(void)
-{
-	/*
-	 * Reserve the page tables.  These are already in use,
-	 * and can only be in node 0.
-	 */
-	memblock_reserve(__pa(swapper_pg_dir), PTRS_PER_PGD * sizeof(pgd_t));
-}
-
-/*
- * Set up device the mappings.  Since we clear out the page tables for all
- * mappings above VMALLOC_END, we will remove any debug device mappings.
- * This means you have to be careful how you debug this function, or any
- * called function.  This means you can't use any function or debugging
- * method which may touch any device, otherwise the kernel _will_ crash.
- */
-static void __init devicemaps_init(void)
-{
-	struct map_desc map;
-	unsigned long addr;
-	void *vectors;
-
-	/*
-	 * Allocate the vector page early.
-	 */
-	vectors = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
-	if (!vectors)
-		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
-		      __func__, PAGE_SIZE, PAGE_SIZE);
-
-	for (addr = VMALLOC_END; addr; addr += PGDIR_SIZE)
-		pmd_clear(pmd_off_k(addr));
-
-	/*
-	 * Create a mapping for the machine vectors at the high-vectors
-	 * location (0xffff0000).  If we aren't using high-vectors, also
-	 * create a mapping at the low-vectors virtual address.
-	 */
-	map.pfn = __phys_to_pfn(virt_to_phys(vectors));
-	map.virtual = VECTORS_BASE;
-	map.length = PAGE_SIZE;
-	map.type = MT_HIGH_VECTORS;
-	create_mapping(&map);
-
-	/*
-	 * Create a mapping for the kuser page at the special
-	 * location (0xbfff0000) to the same vectors location.
-	 */
-	map.pfn = __phys_to_pfn(virt_to_phys(vectors));
-	map.virtual = KUSER_VECPAGE_BASE;
-	map.length = PAGE_SIZE;
-	map.type = MT_KUSER;
-	create_mapping(&map);
-
-	/*
-	 * Finally flush the caches and tlb to ensure that we're in a
-	 * consistent state wrt the writebuffer.  This also ensures that
-	 * any write-allocated cache lines in the vector page are written
-	 * back.  After this point, we can start to touch devices again.
-	 */
-	local_flush_tlb_all();
-	flush_cache_all();
-}
-
-static void __init map_lowmem(void)
-{
-	struct memblock_region *reg;
-
-	/* Map all the lowmem memory banks. */
-	for_each_memblock(memory, reg) {
-		phys_addr_t start = reg->base;
-		phys_addr_t end = start + reg->size;
-		struct map_desc map;
-
-		if (end > lowmem_limit)
-			end = lowmem_limit;
-		if (start >= end)
-			break;
-
-		map.pfn = __phys_to_pfn(start);
-		map.virtual = __phys_to_virt(start);
-		map.length = end - start;
-		map.type = MT_MEMORY;
-
-		create_mapping(&map);
-	}
-}
-
-/*
- * paging_init() sets up the page tables, initialises the zone memory
- * maps, and sets up the zero page, bad page and bad page tables.
- */
-void __init paging_init(void)
-{
-	void *zero_page;
-
-	build_mem_type_table();
-	sanity_check_meminfo();
-	prepare_page_table();
-	map_lowmem();
-	devicemaps_init();
-
-	top_pmd = pmd_off_k(0xffff0000);
-
-	/* allocate the zero page. */
-	zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
-	if (!zero_page)
-		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
-		      __func__, PAGE_SIZE, PAGE_SIZE);
-
-	bootmem_init();
-
-	empty_zero_page = virt_to_page(zero_page);
-	__flush_dcache_page(NULL, empty_zero_page);
-}
-
-/*
- * In order to soft-boot, we need to insert a 1:1 mapping in place of
- * the user-mode pages.  This will then ensure that we have predictable
- * results when turning the mmu off
- */
-void setup_mm_for_reboot(void)
-{
-	unsigned long base_pmdval;
-	pgd_t *pgd;
-	int i;
-
-	/*
-	 * We need to access to user-mode page tables here. For kernel threads
-	 * we don't have any user-mode mappings so we use the context that we
-	 * "borrowed".
-	 */
-	pgd = current->active_mm->pgd;
-
-	base_pmdval = PMD_SECT_WRITE | PMD_SECT_READ | PMD_TYPE_SECT;
-
-	for (i = 0; i < FIRST_USER_PGD_NR + USER_PTRS_PER_PGD; i++, pgd++) {
-		unsigned long pmdval = (i << PGDIR_SHIFT) | base_pmdval;
-		pmd_t *pmd;
-
-		pmd = pmd_off(pgd, i << PGDIR_SHIFT);
-		set_pmd(pmd, __pmd(pmdval));
-		flush_pmd_entry(pmd);
-	}
-
-	local_flush_tlb_all();
-}
-
-/*
- * Take care of architecture specific things when placing a new PTE into
- * a page table, or changing an existing PTE.  Basically, there are two
- * things that we need to take care of:
- *
- *  1. If PG_dcache_clean is not set for the page, we need to ensure
- *     that any cache entries for the kernels virtual memory
- *     range are written back to the page.
- *  2. If we have multiple shared mappings of the same space in
- *     an object, we need to deal with the cache aliasing issues.
- *
- * Note that the pte lock will be held.
- */
-void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr,
-	pte_t *ptep)
-{
-	unsigned long pfn = pte_pfn(*ptep);
-	struct address_space *mapping;
-	struct page *page;
-
-	if (!pfn_valid(pfn))
-		return;
-
-	/*
-	 * The zero page is never written to, so never has any dirty
-	 * cache lines, and therefore never needs to be flushed.
-	 */
-	page = pfn_to_page(pfn);
-	if (page == ZERO_PAGE(0))
-		return;
-
-	mapping = page_mapping_file(page);
-	if (!test_and_set_bit(PG_dcache_clean, &page->flags))
-		__flush_dcache_page(mapping, page);
-	if (mapping)
-		if (vma->vm_flags & VM_EXEC)
-			__flush_icache_all();
-}
diff --git a/arch/unicore32/mm/pgd.c b/arch/unicore32/mm/pgd.c
deleted file mode 100644
index f01c73e04836..000000000000
--- a/arch/unicore32/mm/pgd.c
+++ /dev/null
@@ -1,102 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/mm/pgd.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/mm.h>
-#include <linux/gfp.h>
-#include <linux/highmem.h>
-
-#include <asm/pgalloc.h>
-#include <asm/page.h>
-#include <asm/tlbflush.h>
-
-#include "mm.h"
-
-#define FIRST_KERNEL_PGD_NR	(FIRST_USER_PGD_NR + USER_PTRS_PER_PGD)
-
-/*
- * need to get a 4k page for level 1
- */
-pgd_t *get_pgd_slow(struct mm_struct *mm)
-{
-	pgd_t *new_pgd, *init_pgd;
-	pmd_t *new_pmd, *init_pmd;
-	pte_t *new_pte, *init_pte;
-
-	new_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, 0);
-	if (!new_pgd)
-		goto no_pgd;
-
-	memset(new_pgd, 0, FIRST_KERNEL_PGD_NR * sizeof(pgd_t));
-
-	/*
-	 * Copy over the kernel and IO PGD entries
-	 */
-	init_pgd = pgd_offset_k(0);
-	memcpy(new_pgd + FIRST_KERNEL_PGD_NR, init_pgd + FIRST_KERNEL_PGD_NR,
-		       (PTRS_PER_PGD - FIRST_KERNEL_PGD_NR) * sizeof(pgd_t));
-
-	clean_dcache_area(new_pgd, PTRS_PER_PGD * sizeof(pgd_t));
-
-	if (!vectors_high()) {
-		/*
-		 * On UniCore, first page must always be allocated since it
-		 * contains the machine vectors.
-		 */
-		new_pmd = pmd_alloc(mm, (pud_t *)new_pgd, 0);
-		if (!new_pmd)
-			goto no_pmd;
-
-		new_pte = pte_alloc_map(mm, new_pmd, 0);
-		if (!new_pte)
-			goto no_pte;
-
-		init_pmd = pmd_offset((pud_t *)init_pgd, 0);
-		init_pte = pte_offset_map(init_pmd, 0);
-		set_pte(new_pte, *init_pte);
-		pte_unmap(init_pte);
-		pte_unmap(new_pte);
-	}
-
-	return new_pgd;
-
-no_pte:
-	pmd_free(mm, new_pmd);
-	mm_dec_nr_pmds(mm);
-no_pmd:
-	free_pages((unsigned long)new_pgd, 0);
-no_pgd:
-	return NULL;
-}
-
-void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd)
-{
-	pmd_t *pmd;
-	pgtable_t pte;
-
-	if (!pgd)
-		return;
-
-	/* pgd is always present and good */
-	pmd = pmd_off(pgd, 0);
-	if (pmd_none(*pmd))
-		goto free;
-	if (pmd_bad(*pmd)) {
-		pmd_ERROR(*pmd);
-		pmd_clear(pmd);
-		goto free;
-	}
-
-	pte = pmd_pgtable(*pmd);
-	pmd_clear(pmd);
-	pte_free(mm, pte);
-	mm_dec_nr_ptes(mm);
-	pmd_free(mm, pmd);
-	mm_dec_nr_pmds(mm);
-free:
-	free_pages((unsigned long) pgd, 0);
-}
diff --git a/arch/unicore32/mm/proc-macros.S b/arch/unicore32/mm/proc-macros.S
deleted file mode 100644
index 3b0ae7d5bd80..000000000000
--- a/arch/unicore32/mm/proc-macros.S
+++ /dev/null
@@ -1,142 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/mm/proc-macros.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- * We need constants.h for:
- *  VMA_VM_MM
- *  VMA_VM_FLAGS
- *  VM_EXEC
- */
-#include <generated/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/memory.h>
-
-/*
- * the cache line sizes of the I and D cache are the same
- */
-#define CACHE_LINESIZE	32
-
-/*
- * This is the maximum size of an area which will be invalidated
- * using the single invalidate entry instructions.  Anything larger
- * than this, and we go for the whole cache.
- *
- * This value should be chosen such that we choose the cheapest
- * alternative.
- */
-#ifdef CONFIG_CPU_UCV2
-#define MAX_AREA_SIZE	0x800		/* 64 cache line */
-#endif
-
-/*
- * vma_vm_mm - get mm pointer from vma pointer (vma->vm_mm)
- */
-	.macro	vma_vm_mm, rd, rn
-	ldw	\rd, [\rn+], #VMA_VM_MM
-	.endm
-
-/*
- * vma_vm_flags - get vma->vm_flags
- */
-	.macro	vma_vm_flags, rd, rn
-	ldw	\rd, [\rn+], #VMA_VM_FLAGS
-	.endm
-
-	.macro	tsk_mm, rd, rn
-	ldw	\rd, [\rn+], #TI_TASK
-	ldw	\rd, [\rd+], #TSK_ACTIVE_MM
-	.endm
-
-/*
- * act_mm - get current->active_mm
- */
-	.macro	act_mm, rd
-	andn	\rd, sp, #8128
-	andn	\rd, \rd, #63
-	ldw	\rd, [\rd+], #TI_TASK
-	ldw	\rd, [\rd+], #TSK_ACTIVE_MM
-	.endm
-
-/*
- * mmid - get context id from mm pointer (mm->context.id)
- */
-	.macro	mmid, rd, rn
-	ldw	\rd, [\rn+], #MM_CONTEXT_ID
-	.endm
-
-/*
- * mask_asid - mask the ASID from the context ID
- */
-	.macro	asid, rd, rn
-	and	\rd, \rn, #255
-	.endm
-
-	.macro	crval, clear, mmuset, ucset
-	.word	\clear
-	.word	\mmuset
-	.endm
-
-#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE
-/*
- * va2pa va, pa, tbl, msk, off, err
- *	This macro is used to translate virtual address to its physical address.
- *
- *	va: virtual address
- *	pa: physical address, result is stored in this register
- *	tbl, msk, off:	temp registers, will be destroyed
- *	err: jump to error label if the physical address not exist
- * NOTE: all regs must be different
- */
-	.macro	va2pa, va, pa, tbl, msk, off, err=990f
-	movc	\pa, p0.c2, #0
-	mov	\off, \va >> #22		@ off <- index of 1st page table
-	adr	\tbl, 910f			@ tbl <- table of 1st page table
-900:						@ ---- handle 1, 2 page table
-	add	\pa, \pa, #PAGE_OFFSET		@ pa <- virt addr of page table
-	ldw	\pa, [\pa+], \off << #2		@ pa <- the content of pt
-	cand.a	\pa, #4				@ test exist bit
-	beq	\err				@ if not exist
-	and	\off, \pa, #3			@ off <- the last 2 bits
-	add	\tbl, \tbl, \off << #3		@ cmove table pointer
-	ldw	\msk, [\tbl+], #0		@ get the mask
-	ldw	pc, [\tbl+], #4
-930:						@ ---- handle 2nd page table
-	and	\pa, \pa, \msk			@ pa <- phys addr of 2nd pt
-	mov	\off, \va << #10
-	cntlo	\tbl, \msk			@ use tbl as temp reg
-	mov	\off, \off >> \tbl
-	mov	\off, \off >> #2		@ off <- index of 2nd pt
-	adr	\tbl, 920f			@ tbl <- table of 2nd pt
-	b	900b
-910:						@ 1st level page table
-	.word	0xfffff000, 930b		@ second level page table
-	.word	0xfffffc00, 930b		@ second level large page table
-	.word	0x00000000, \err		@ invalid
-	.word	0xffc00000, 980f		@ super page
-
-920:						@ 2nd level page table
-	.word	0xfffff000, 980f		@ page
-	.word	0xffffc000, 980f		@ middle page
-	.word	0xffff0000, 980f		@ large page
-	.word	0x00000000, \err		@ invalid
-980:
-	andn	\tbl, \va, \msk
-	and	\pa, \pa, \msk
-	or	\pa, \pa, \tbl
-990:
-	.endm
-#endif
-
-	.macro dcacheline_flush, addr, t1, t2
-	mov	\t1, \addr << #20
-	ldw	\t2, =_stext			@ _stext must ALIGN(4096)
-	add	\t2, \t2, \t1 >> #20
-	ldw	\t1, [\t2+], #0x0000
-	ldw	\t1, [\t2+], #0x1000
-	ldw	\t1, [\t2+], #0x2000
-	ldw	\t1, [\t2+], #0x3000
-	.endm
diff --git a/arch/unicore32/mm/proc-syms.c b/arch/unicore32/mm/proc-syms.c
deleted file mode 100644
index 6c081616fc3c..000000000000
--- a/arch/unicore32/mm/proc-syms.c
+++ /dev/null
@@ -1,19 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/mm/proc-syms.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/module.h>
-#include <linux/mm.h>
-
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-#include <asm/page.h>
-
-EXPORT_SYMBOL(cpu_dcache_clean_area);
-EXPORT_SYMBOL(cpu_set_pte);
-
-EXPORT_SYMBOL(__cpuc_coherent_kern_range);
diff --git a/arch/unicore32/mm/proc-ucv2.S b/arch/unicore32/mm/proc-ucv2.S
deleted file mode 100644
index 18f8c4fb21a0..000000000000
--- a/arch/unicore32/mm/proc-ucv2.S
+++ /dev/null
@@ -1,131 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/mm/proc-ucv2.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/init.h>
-#include <linux/linkage.h>
-#include <linux/pgtable.h>
-#include <asm/assembler.h>
-#include <asm/hwcap.h>
-#include <asm/pgtable-hwdef.h>
-
-#include "proc-macros.S"
-
-ENTRY(cpu_proc_fin)
-	stm.w	(lr), [sp-]
-	mov	ip, #PSR_R_BIT | PSR_I_BIT | PRIV_MODE
-	mov.a	asr, ip
-	b.l	__cpuc_flush_kern_all
-	ldm.w	(pc), [sp]+
-
-/*
- *	cpu_reset(loc)
- *
- *	Perform a soft reset of the system.  Put the CPU into the
- *	same state as it would be if it had been reset, and branch
- *	to what would be the reset vector.
- *
- *	- loc   - location to jump to for soft reset
- */
-	.align	5
-ENTRY(cpu_reset)
-	mov	ip, #0
-	movc	p0.c5, ip, #28			@ Cache invalidate all
-	nop8
-
-	movc	p0.c6, ip, #6			@ TLB invalidate all
-	nop8
-
-	movc	ip, p0.c1, #0			@ ctrl register
-	or	ip, ip, #0x2000			@ vector base address
-	andn	ip, ip, #0x000f			@ ............idam
-	movc	p0.c1, ip, #0			@ disable caches and mmu
-	nop
-	mov	pc, r0				@ jump to loc
-	nop8
-
-/*
- *	cpu_do_idle()
- *
- *	Idle the processor (eg, wait for interrupt).
- *
- *	IRQs are already disabled.
- */
-ENTRY(cpu_do_idle)
-	mov	r0, #0				@ PCI address
-	.rept	8
-	ldw	r1, [r0]
-	.endr
-	mov	pc, lr
-
-ENTRY(cpu_dcache_clean_area)
-#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE
-	csub.a	r1, #MAX_AREA_SIZE
-	bsg	101f
-	mov	r9, #PAGE_SZ
-	sub	r9, r9, #1			@ PAGE_MASK
-1:	va2pa	r0, r10, r11, r12, r13		@ r10 is PA
-	b	3f
-2:	cand.a	r0, r9
-	beq	1b
-3:	movc	p0.c5, r10, #11			@ clean D entry
-	nop8
-	add	r0, r0, #CACHE_LINESIZE
-	add	r10, r10, #CACHE_LINESIZE
-	sub.a	r1, r1, #CACHE_LINESIZE
-	bua	2b
-	mov	pc, lr
-#endif
-101:	mov	ip, #0
-	movc	p0.c5, ip, #10			@ Dcache clean all
-	nop8
-
-	mov	pc, lr
-
-/*
- *	cpu_do_switch_mm(pgd_phys)
- *
- *	Set the translation table base pointer to be pgd_phys
- *
- *	- pgd_phys - physical address of new pgd
- *
- *	It is assumed that:
- *	- we are not using split page tables
- */
-	.align	5
-ENTRY(cpu_do_switch_mm)
-	movc	p0.c2, r0, #0			@ update page table ptr
-	nop8
-
-	movc	p0.c6, ip, #6			@ TLB invalidate all
-	nop8
-
-	mov	pc, lr
-
-/*
- *	cpu_set_pte(ptep, pte)
- *
- *	Set a level 2 translation table entry.
- *
- *	- ptep  - pointer to level 2 translation table entry
- *	- pte   - PTE value to store
- */
-	.align	5
-ENTRY(cpu_set_pte)
-	stw	r1, [r0]
-#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE
-	sub	r2, r0, #PAGE_OFFSET
-	movc	p0.c5, r2, #11				@ Dcache clean line
-	nop8
-#else
-	mov	ip, #0
-	movc	p0.c5, ip, #10				@ Dcache clean all
-	nop8
-	@dcacheline_flush	r0, r2, ip
-#endif
-	mov	pc, lr
-
diff --git a/arch/unicore32/mm/tlb-ucv2.S b/arch/unicore32/mm/tlb-ucv2.S
deleted file mode 100644
index 0ce9c6b6f1db..000000000000
--- a/arch/unicore32/mm/tlb-ucv2.S
+++ /dev/null
@@ -1,86 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/mm/tlb-ucv2.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/init.h>
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <asm/page.h>
-#include <asm/tlbflush.h>
-#include "proc-macros.S"
-
-/*
- *	__cpu_flush_user_tlb_range(start, end, vma)
- *
- *	Invalidate a range of TLB entries in the specified address space.
- *
- *	- start - start address (may not be aligned)
- *	- end   - end address (exclusive, may not be aligned)
- *	- vma   - vma_struct describing address range
- */
-ENTRY(__cpu_flush_user_tlb_range)
-#ifndef	CONFIG_CPU_TLB_SINGLE_ENTRY_DISABLE
-	mov	r0, r0 >> #PAGE_SHIFT		@ align address
-	mov	r0, r0 << #PAGE_SHIFT
-	vma_vm_flags r2, r2			@ get vma->vm_flags
-1:
-	movc	p0.c6, r0, #3
-	nop8
-
-	cand.a	r2, #VM_EXEC			@ Executable area ?
-	beq	2f
-
-	movc	p0.c6, r0, #5
-	nop8
-2:
-	add	r0, r0, #PAGE_SZ
-	csub.a	r0, r1
-	beb	1b
-#else
-	movc	p0.c6, r0, #2
-	nop8
-
-	cand.a	r2, #VM_EXEC			@ Executable area ?
-	beq	2f
-
-	movc	p0.c6, r0, #4
-	nop8
-2:
-#endif
-	mov	pc, lr
-
-/*
- *	__cpu_flush_kern_tlb_range(start,end)
- *
- *	Invalidate a range of kernel TLB entries
- *
- *	- start - start address (may not be aligned)
- *	- end   - end address (exclusive, may not be aligned)
- */
-ENTRY(__cpu_flush_kern_tlb_range)
-#ifndef	CONFIG_CPU_TLB_SINGLE_ENTRY_DISABLE
-	mov	r0, r0 >> #PAGE_SHIFT		@ align address
-	mov	r0, r0 << #PAGE_SHIFT
-1:
-	movc	p0.c6, r0, #3
-	nop8
-
-	movc	p0.c6, r0, #5
-	nop8
-
-	add	r0, r0, #PAGE_SZ
-	csub.a	r0, r1
-	beb	1b
-#else
-	movc	p0.c6, r0, #2
-	nop8
-
-	movc	p0.c6, r0, #4
-	nop8
-#endif
-	mov	pc, lr
-
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 491f1347bf43..e7b78d5ae1ab 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -26,7 +26,7 @@ int C_A_D = 1;
 struct pid *cad_pid;
 EXPORT_SYMBOL(cad_pid);
 
-#if defined(CONFIG_ARM) || defined(CONFIG_UNICORE32)
+#if defined(CONFIG_ARM)
 #define DEFAULT_REBOOT_MODE		= REBOOT_HARD
 #else
 #define DEFAULT_REBOOT_MODE
-- 
cgit v1.2.3


From bba1dc0b55ac462d24ed1228ad49800c238cd6d7 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 29 Jun 2020 21:33:39 -0700
Subject: bpf: Remove redundant synchronize_rcu.

bpf_free_used_maps() or close(map_fd) will trigger map_free callback.
bpf_free_used_maps() is called after bpf prog is no longer executing:
bpf_prog_put->call_rcu->bpf_prog_free->bpf_free_used_maps.
Hence there is no need to call synchronize_rcu() to protect map elements.

Note that hash_of_maps and array_of_maps update/delete inner maps via
sys_bpf() that calls maybe_wait_bpf_programs() and synchronize_rcu().

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Link: https://lore.kernel.org/bpf/20200630043343.53195-2-alexei.starovoitov@gmail.com
---
 kernel/bpf/arraymap.c         | 9 ---------
 kernel/bpf/hashtab.c          | 8 +++-----
 kernel/bpf/lpm_trie.c         | 5 -----
 kernel/bpf/queue_stack_maps.c | 7 -------
 kernel/bpf/reuseport_array.c  | 2 --
 kernel/bpf/ringbuf.c          | 7 -------
 kernel/bpf/stackmap.c         | 3 ---
 7 files changed, 3 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index ec5cd11032aa..c66e8273fccd 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -386,13 +386,6 @@ static void array_map_free(struct bpf_map *map)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 
-	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
-	 * so the programs (can be more than one that used this map) were
-	 * disconnected from events. Wait for outstanding programs to complete
-	 * and free the array
-	 */
-	synchronize_rcu();
-
 	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
 		bpf_array_free_percpu(array);
 
@@ -546,8 +539,6 @@ static void fd_array_map_free(struct bpf_map *map)
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	int i;
 
-	synchronize_rcu();
-
 	/* make sure it's empty */
 	for (i = 0; i < array->map.max_entries; i++)
 		BUG_ON(array->ptrs[i] != NULL);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index acd06081d81d..d4378d7d442b 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1290,12 +1290,10 @@ static void htab_map_free(struct bpf_map *map)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 
-	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
-	 * so the programs (can be more than one that used this map) were
-	 * disconnected from events. Wait for outstanding critical sections in
-	 * these programs to complete
+	/* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
+	 * bpf_free_used_maps() is called after bpf prog is no longer executing.
+	 * There is no need to synchronize_rcu() here to protect map elements.
 	 */
-	synchronize_rcu();
 
 	/* some of free_htab_elem() callbacks for elements of this map may
 	 * not have executed. Wait for them.
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 1abd4e3f906d..44474bf3ab7a 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -589,11 +589,6 @@ static void trie_free(struct bpf_map *map)
 	struct lpm_trie_node __rcu **slot;
 	struct lpm_trie_node *node;
 
-	/* Wait for outstanding programs to complete
-	 * update/lookup/delete/get_next_key and free the trie.
-	 */
-	synchronize_rcu();
-
 	/* Always start at the root and walk down to a node that has no
 	 * children. Then free that node, nullify its reference in the parent
 	 * and start over.
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 80c66a6d7c54..44184f82916a 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -101,13 +101,6 @@ static void queue_stack_map_free(struct bpf_map *map)
 {
 	struct bpf_queue_stack *qs = bpf_queue_stack(map);
 
-	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
-	 * so the programs (can be more than one that used this map) were
-	 * disconnected from events. Wait for outstanding critical sections in
-	 * these programs to complete
-	 */
-	synchronize_rcu();
-
 	bpf_map_area_free(qs);
 }
 
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index a09922f656e4..3625c4fcc65c 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -96,8 +96,6 @@ static void reuseport_array_free(struct bpf_map *map)
 	struct sock *sk;
 	u32 i;
 
-	synchronize_rcu();
-
 	/*
 	 * ops->map_*_elem() will not be able to access this
 	 * array now. Hence, this function only races with
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index dbf37aff4827..13a8d3967e07 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -215,13 +215,6 @@ static void ringbuf_map_free(struct bpf_map *map)
 {
 	struct bpf_ringbuf_map *rb_map;
 
-	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
-	 * so the programs (can be more than one that used this map) were
-	 * disconnected from events. Wait for outstanding critical sections in
-	 * these programs to complete
-	 */
-	synchronize_rcu();
-
 	rb_map = container_of(map, struct bpf_ringbuf_map, map);
 	bpf_ringbuf_free(rb_map->rb);
 	kfree(rb_map);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 27dc9b1b08a5..071f98d0f7c6 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -604,9 +604,6 @@ static void stack_map_free(struct bpf_map *map)
 {
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
 
-	/* wait for bpf programs to complete before freeing stack map */
-	synchronize_rcu();
-
 	bpf_map_area_free(smap->elems);
 	pcpu_freelist_destroy(&smap->freelist);
 	bpf_map_area_free(smap);
-- 
cgit v1.2.3


From d141b8bc5773cbbaf5b8530f08f94fc10fff9e8c Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Mon, 29 Jun 2020 23:28:43 -0700
Subject: perf: Expose get/put_callchain_entry()

Sanitize and expose get/put_callchain_entry(). This would be used by bpf
stack map.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200630062846.664389-2-songliubraving@fb.com
---
 include/linux/perf_event.h |  2 ++
 kernel/events/callchain.c  | 13 ++++++-------
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b4bb32082342..00ab5efa3833 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1244,6 +1244,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs);
 extern int get_callchain_buffers(int max_stack);
 extern void put_callchain_buffers(void);
+extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
+extern void put_callchain_entry(int rctx);
 
 extern int sysctl_perf_event_max_stack;
 extern int sysctl_perf_event_max_contexts_per_stack;
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 334d48b16c36..c6ce894e4ce9 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -149,7 +149,7 @@ void put_callchain_buffers(void)
 	}
 }
 
-static struct perf_callchain_entry *get_callchain_entry(int *rctx)
+struct perf_callchain_entry *get_callchain_entry(int *rctx)
 {
 	int cpu;
 	struct callchain_cpus_entries *entries;
@@ -159,8 +159,10 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
 		return NULL;
 
 	entries = rcu_dereference(callchain_cpus_entries);
-	if (!entries)
+	if (!entries) {
+		put_recursion_context(this_cpu_ptr(callchain_recursion), *rctx);
 		return NULL;
+	}
 
 	cpu = smp_processor_id();
 
@@ -168,7 +170,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
 		(*rctx * perf_callchain_entry__sizeof()));
 }
 
-static void
+void
 put_callchain_entry(int rctx)
 {
 	put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
@@ -183,11 +185,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 	int rctx;
 
 	entry = get_callchain_entry(&rctx);
-	if (rctx == -1)
-		return NULL;
-
 	if (!entry)
-		goto exit_put;
+		return NULL;
 
 	ctx.entry     = entry;
 	ctx.max_stack = max_stack;
-- 
cgit v1.2.3


From fa28dcb82a38f8e3993b0fae9106b1a80b59e4f0 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Mon, 29 Jun 2020 23:28:44 -0700
Subject: bpf: Introduce helper bpf_get_task_stack()

Introduce helper bpf_get_task_stack(), which dumps stack trace of given
task. This is different to bpf_get_stack(), which gets stack track of
current task. One potential use case of bpf_get_task_stack() is to call
it from bpf_iter__task and dump all /proc/<pid>/stack to a seq_file.

bpf_get_task_stack() uses stack_trace_save_tsk() instead of
get_perf_callchain() for kernel stack. The benefit of this choice is that
stack_trace_save_tsk() doesn't require changes in arch/. The downside of
using stack_trace_save_tsk() is that stack_trace_save_tsk() dumps the
stack trace to unsigned long array. For 32-bit systems, we need to
translate it to u64 array.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200630062846.664389-3-songliubraving@fb.com
---
 include/linux/bpf.h            |  1 +
 include/uapi/linux/bpf.h       | 37 +++++++++++++++++++-
 kernel/bpf/stackmap.c          | 77 +++++++++++++++++++++++++++++++++++++++---
 kernel/bpf/verifier.c          |  4 ++-
 kernel/trace/bpf_trace.c       |  2 ++
 scripts/bpf_helpers_doc.py     |  2 ++
 tools/include/uapi/linux/bpf.h | 37 +++++++++++++++++++-
 7 files changed, 153 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3d2ade703a35..0cd7f6884c5c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1627,6 +1627,7 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
 extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
 extern const struct bpf_func_proto bpf_get_stack_proto;
+extern const struct bpf_func_proto bpf_get_task_stack_proto;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
 extern const struct bpf_func_proto bpf_sock_hash_update_proto;
 extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0cb8ec948816..da9bf35a26f8 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3285,6 +3285,39 @@ union bpf_attr {
  *		Dynamically cast a *sk* pointer to a *udp6_sock* pointer.
  *	Return
  *		*sk* if casting is valid, or NULL otherwise.
+ *
+ * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags)
+ *	Description
+ *		Return a user or a kernel stack in bpf program provided buffer.
+ *		To achieve this, the helper needs *task*, which is a valid
+ *		pointer to struct task_struct. To store the stacktrace, the
+ *		bpf program provides *buf* with	a nonnegative *size*.
+ *
+ *		The last argument, *flags*, holds the number of stack frames to
+ *		skip (from 0 to 255), masked with
+ *		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ *		the following flags:
+ *
+ *		**BPF_F_USER_STACK**
+ *			Collect a user space stack instead of a kernel stack.
+ *		**BPF_F_USER_BUILD_ID**
+ *			Collect buildid+offset instead of ips for user stack,
+ *			only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ *		**bpf_get_task_stack**\ () can collect up to
+ *		**PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
+ *		to sufficient large buffer size. Note that
+ *		this limit can be controlled with the **sysctl** program, and
+ *		that it should be manually increased in order to profile long
+ *		user stacks (such as stacks for Java programs). To do so, use:
+ *
+ *		::
+ *
+ *			# sysctl kernel.perf_event_max_stack=<new value>
+ *	Return
+ *		A non-negative value equal to or less than *size* on success,
+ *		or a negative error in case of failure.
+ *
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3427,7 +3460,9 @@ union bpf_attr {
 	FN(skc_to_tcp_sock),		\
 	FN(skc_to_tcp_timewait_sock),	\
 	FN(skc_to_tcp_request_sock),	\
-	FN(skc_to_udp6_sock),
+	FN(skc_to_udp6_sock),		\
+	FN(get_task_stack),		\
+	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 071f98d0f7c6..5ad72ab2276b 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -348,6 +348,40 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 	}
 }
 
+static struct perf_callchain_entry *
+get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
+{
+	struct perf_callchain_entry *entry;
+	int rctx;
+
+	entry = get_callchain_entry(&rctx);
+
+	if (!entry)
+		return NULL;
+
+	entry->nr = init_nr +
+		stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr),
+				     sysctl_perf_event_max_stack - init_nr, 0);
+
+	/* stack_trace_save_tsk() works on unsigned long array, while
+	 * perf_callchain_entry uses u64 array. For 32-bit systems, it is
+	 * necessary to fix this mismatch.
+	 */
+	if (__BITS_PER_LONG != 64) {
+		unsigned long *from = (unsigned long *) entry->ip;
+		u64 *to = entry->ip;
+		int i;
+
+		/* copy data from the end to avoid using extra buffer */
+		for (i = entry->nr - 1; i >= (int)init_nr; i--)
+			to[i] = (u64)(from[i]);
+	}
+
+	put_callchain_entry(rctx);
+
+	return entry;
+}
+
 BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	   u64, flags)
 {
@@ -448,8 +482,8 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
-BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
-	   u64, flags)
+static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
+			    void *buf, u32 size, u64 flags)
 {
 	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
 	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
@@ -471,13 +505,22 @@ BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
 	if (unlikely(size % elem_size))
 		goto clear;
 
+	/* cannot get valid user stack for task without user_mode regs */
+	if (task && user && !user_mode(regs))
+		goto err_fault;
+
 	num_elem = size / elem_size;
 	if (sysctl_perf_event_max_stack < num_elem)
 		init_nr = 0;
 	else
 		init_nr = sysctl_perf_event_max_stack - num_elem;
-	trace = get_perf_callchain(regs, init_nr, kernel, user,
-				   sysctl_perf_event_max_stack, false, false);
+
+	if (kernel && task)
+		trace = get_callchain_entry_for_task(task, init_nr);
+	else
+		trace = get_perf_callchain(regs, init_nr, kernel, user,
+					   sysctl_perf_event_max_stack,
+					   false, false);
 	if (unlikely(!trace))
 		goto err_fault;
 
@@ -505,6 +548,12 @@ clear:
 	return err;
 }
 
+BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
+	   u64, flags)
+{
+	return __bpf_get_stack(regs, NULL, buf, size, flags);
+}
+
 const struct bpf_func_proto bpf_get_stack_proto = {
 	.func		= bpf_get_stack,
 	.gpl_only	= true,
@@ -515,6 +564,26 @@ const struct bpf_func_proto bpf_get_stack_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
+	   u32, size, u64, flags)
+{
+	struct pt_regs *regs = task_pt_regs(task);
+
+	return __bpf_get_stack(regs, task, buf, size, flags);
+}
+
+static int bpf_get_task_stack_btf_ids[5];
+const struct bpf_func_proto bpf_get_task_stack_proto = {
+	.func		= bpf_get_task_stack,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+	.btf_id		= bpf_get_task_stack_btf_ids,
+};
+
 /* Called from eBPF program */
 static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
 {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7de98906ddf4..b608185e1ffd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4864,7 +4864,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	if (err)
 		return err;
 
-	if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) {
+	if ((func_id == BPF_FUNC_get_stack ||
+	     func_id == BPF_FUNC_get_task_stack) &&
+	    !env->prog->has_callchain_buf) {
 		const char *err_str;
 
 #ifdef CONFIG_PERF_EVENTS
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 5d59dda5f661..977ba3b6f6c6 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1137,6 +1137,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_ringbuf_query_proto;
 	case BPF_FUNC_jiffies64:
 		return &bpf_jiffies64_proto;
+	case BPF_FUNC_get_task_stack:
+		return &bpf_get_task_stack_proto;
 	default:
 		return NULL;
 	}
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index 6bab40ff442e..6843376733df 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -426,6 +426,7 @@ class PrinterHelpers(Printer):
             'struct tcp_timewait_sock',
             'struct tcp_request_sock',
             'struct udp6_sock',
+            'struct task_struct',
 
             'struct __sk_buff',
             'struct sk_msg_md',
@@ -468,6 +469,7 @@ class PrinterHelpers(Printer):
             'struct tcp_timewait_sock',
             'struct tcp_request_sock',
             'struct udp6_sock',
+            'struct task_struct',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 0cb8ec948816..da9bf35a26f8 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3285,6 +3285,39 @@ union bpf_attr {
  *		Dynamically cast a *sk* pointer to a *udp6_sock* pointer.
  *	Return
  *		*sk* if casting is valid, or NULL otherwise.
+ *
+ * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags)
+ *	Description
+ *		Return a user or a kernel stack in bpf program provided buffer.
+ *		To achieve this, the helper needs *task*, which is a valid
+ *		pointer to struct task_struct. To store the stacktrace, the
+ *		bpf program provides *buf* with	a nonnegative *size*.
+ *
+ *		The last argument, *flags*, holds the number of stack frames to
+ *		skip (from 0 to 255), masked with
+ *		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ *		the following flags:
+ *
+ *		**BPF_F_USER_STACK**
+ *			Collect a user space stack instead of a kernel stack.
+ *		**BPF_F_USER_BUILD_ID**
+ *			Collect buildid+offset instead of ips for user stack,
+ *			only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ *		**bpf_get_task_stack**\ () can collect up to
+ *		**PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
+ *		to sufficient large buffer size. Note that
+ *		this limit can be controlled with the **sysctl** program, and
+ *		that it should be manually increased in order to profile long
+ *		user stacks (such as stacks for Java programs). To do so, use:
+ *
+ *		::
+ *
+ *			# sysctl kernel.perf_event_max_stack=<new value>
+ *	Return
+ *		A non-negative value equal to or less than *size* on success,
+ *		or a negative error in case of failure.
+ *
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3427,7 +3460,9 @@ union bpf_attr {
 	FN(skc_to_tcp_sock),		\
 	FN(skc_to_tcp_timewait_sock),	\
 	FN(skc_to_tcp_request_sock),	\
-	FN(skc_to_udp6_sock),
+	FN(skc_to_udp6_sock),		\
+	FN(get_task_stack),		\
+	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
-- 
cgit v1.2.3


From 2df6bb5493f83c7e818f66c384ea337c1b3da228 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Mon, 29 Jun 2020 23:28:45 -0700
Subject: bpf: Allow %pB in bpf_seq_printf() and bpf_trace_printk()

This makes it easy to dump stack trace in text.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200630062846.664389-4-songliubraving@fb.com
---
 kernel/trace/bpf_trace.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 977ba3b6f6c6..1d874d8e4384 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -376,7 +376,7 @@ static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
 
 /*
  * Only limited trace_printk() conversion specifiers allowed:
- * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pks %pus %s
+ * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pB %pks %pus %s
  */
 BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
 	   u64, arg2, u64, arg3)
@@ -420,6 +420,11 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
 				goto fmt_str;
 			}
 
+			if (fmt[i + 1] == 'B') {
+				i++;
+				goto fmt_next;
+			}
+
 			/* disallow any further format extensions */
 			if (fmt[i + 1] != 0 &&
 			    !isspace(fmt[i + 1]) &&
@@ -636,7 +641,8 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
 		if (fmt[i] == 'p') {
 			if (fmt[i + 1] == 0 ||
 			    fmt[i + 1] == 'K' ||
-			    fmt[i + 1] == 'x') {
+			    fmt[i + 1] == 'x' ||
+			    fmt[i + 1] == 'B') {
 				/* just kernel pointers */
 				params[fmt_cnt] = args[fmt_cnt];
 				fmt_cnt++;
-- 
cgit v1.2.3


From 58fbc3c63275c6255a10ae151cc3882d9190c4e0 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 30 Jun 2020 08:59:26 -0400
Subject: ring-buffer: Consolidate add_timestamp to remove some branches

Reorganize a little the logic to handle adding the absolute time stamp,
extended and forced time stamps, in such a way to remove a branch or two.
This is just a micro optimization.

Also add before and after time stamps to the rb_event_info structure to
display those values in the rb_check_timestamps() code, if something were to
go wrong.

Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 139 ++++++++++++++++++++++++---------------------
 1 file changed, 73 insertions(+), 66 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 888bc9177937..ce125cbe98a5 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -416,6 +416,8 @@ struct rb_irq_work {
 struct rb_event_info {
 	u64			ts;
 	u64			delta;
+	u64			before;
+	u64			after;
 	unsigned long		length;
 	struct buffer_page	*tail_page;
 	int			add_timestamp;
@@ -2619,6 +2621,33 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
 static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
 				     struct ring_buffer_event *event);
 
+#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+static inline bool sched_clock_stable(void)
+{
+	return true;
+}
+#endif
+
+static noinline void
+rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+		   struct rb_event_info *info)
+{
+	u64 write_stamp;
+
+	WARN_ONCE(info->delta > (1ULL << 59),
+		  KERN_WARNING "Delta way too big! %llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s",
+		  (unsigned long long)info->delta,
+		  (unsigned long long)info->ts,
+		  (unsigned long long)info->before,
+		  (unsigned long long)info->after,
+		  (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0),
+		  sched_clock_stable() ? "" :
+		  "If you just came from a suspend/resume,\n"
+		  "please switch to the trace global clock:\n"
+		  "  echo global > /sys/kernel/debug/tracing/trace_clock\n"
+		  "or add trace_clock=global to the kernel command line\n");
+}
+
 /**
  * rb_update_event - update event type and data
  * @cpu_buffer: The per cpu buffer of the @event
@@ -2646,6 +2675,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
 		bool abs = info->add_timestamp &
 			(RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE);
 
+		rb_check_timestamp(cpu_buffer, info);
 		event = rb_add_time_stamp(event, abs ? info->delta : delta, abs);
 		length -= RB_LEN_TIME_EXTEND;
 		delta = 0;
@@ -2692,13 +2722,6 @@ static unsigned rb_calculate_event_length(unsigned length)
 	return length;
 }
 
-#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-static inline bool sched_clock_stable(void)
-{
-	return true;
-}
-#endif
-
 static __always_inline bool
 rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
 		   struct ring_buffer_event *event)
@@ -3092,24 +3115,6 @@ int ring_buffer_unlock_commit(struct trace_buffer *buffer,
 }
 EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
 
-static noinline void
-rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
-		   struct rb_event_info *info)
-{
-	u64 write_stamp;
-
-	WARN_ONCE(info->delta > (1ULL << 59),
-		  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
-		  (unsigned long long)info->delta,
-		  (unsigned long long)info->ts,
-		  (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0),
-		  sched_clock_stable() ? "" :
-		  "If you just came from a suspend/resume,\n"
-		  "please switch to the trace global clock:\n"
-		  "  echo global > /sys/kernel/debug/tracing/trace_clock\n"
-		  "or add trace_clock=global to the kernel command line\n");
-}
-
 static struct ring_buffer_event *
 __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		  struct rb_event_info *info)
@@ -3117,7 +3122,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	struct ring_buffer_event *event;
 	struct buffer_page *tail_page;
 	unsigned long tail, write, w;
-	u64 before, after;
 	bool a_ok;
 	bool b_ok;
 
@@ -3126,40 +3130,31 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 
  /*A*/	w = local_read(&tail_page->write) & RB_WRITE_MASK;
 	barrier();
-	b_ok = rb_time_read(&cpu_buffer->before_stamp, &before);
-	a_ok = rb_time_read(&cpu_buffer->write_stamp, &after);
+	b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
+	a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
 	barrier();
 	info->ts = rb_time_stamp(cpu_buffer->buffer);
 
-	if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
+	if ((info->add_timestamp & RB_ADD_STAMP_ABSOLUTE)) {
 		info->delta = info->ts;
-		info->add_timestamp = RB_ADD_STAMP_ABSOLUTE;
 	} else {
-		info->delta = info->ts - after;
-	}
-
-	if (likely(a_ok && b_ok)) {
-		if (unlikely(test_time_stamp(info->delta))) {
-			rb_check_timestamp(cpu_buffer, info);
-			info->add_timestamp |= RB_ADD_STAMP_EXTEND;
+		/*
+		 * If interrupting an event time update, we may need an
+		 * absolute timestamp.
+		 * Don't bother if this is the start of a new page (w == 0).
+		 */
+		if (unlikely(!a_ok || !b_ok || (info->before != info->after && w))) {
+			info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
+			info->length += RB_LEN_TIME_EXTEND;
+		} else {
+			info->delta = info->ts - info->after;
+			if (unlikely(test_time_stamp(info->delta))) {
+				info->add_timestamp |= RB_ADD_STAMP_EXTEND;
+				info->length += RB_LEN_TIME_EXTEND;
+			}
 		}
 	}
 
-	/*
-	 * If interrupting an event time update, we may need an absolute timestamp.
-	 * Don't bother if this is the start of a new page (w == 0).
-	 */
-	if (unlikely(!a_ok || !b_ok || (before != after && w)))
-		info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
-
-	/*
-	 * If the time delta since the last event is too big to
-	 * hold in the time field of the event, then we append a
-	 * TIME EXTEND event ahead of the data event.
-	 */
-	if (unlikely(info->add_timestamp))
-		info->length += RB_LEN_TIME_EXTEND;
-
  /*B*/	rb_time_set(&cpu_buffer->before_stamp, info->ts);
 
  /*C*/	write = local_add_return(info->length, &tail_page->write);
@@ -3173,10 +3168,11 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	if (unlikely(write > BUF_PAGE_SIZE)) {
 		if (tail != w) {
 			/* before and after may now different, fix it up*/
-			b_ok = rb_time_read(&cpu_buffer->before_stamp, &before);
-			a_ok = rb_time_read(&cpu_buffer->write_stamp, &after);
-			if (a_ok && b_ok && before != after)
-				(void)rb_time_cmpxchg(&cpu_buffer->before_stamp, before, after);
+			b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
+			a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+			if (a_ok && b_ok && info->before != info->after)
+				(void)rb_time_cmpxchg(&cpu_buffer->before_stamp,
+						      info->before, info->after);
 		}
 		return rb_move_tail(cpu_buffer, tail, info);
 	}
@@ -3193,7 +3189,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		if (likely(!(info->add_timestamp &
 			     (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
 			/* This did not interrupt any time update */
-			info->delta = info->ts - after;
+			info->delta = info->ts - info->after;
 		else
 			/* Just use full timestamp for inerrupting event */
 			info->delta = info->ts;
@@ -3201,31 +3197,33 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		if (unlikely(info->ts != save_before)) {
 			/* SLOW PATH - Interrupted between C and E */
 
-			a_ok = rb_time_read(&cpu_buffer->write_stamp, &after);
+			a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
 			RB_WARN_ON(cpu_buffer, !a_ok);
 
 			/* Write stamp must only go forward */
-			if (save_before > after) {
+			if (save_before > info->after) {
 				/*
 				 * We do not care about the result, only that
 				 * it gets updated atomically.
 				 */
-				(void)rb_time_cmpxchg(&cpu_buffer->write_stamp, after, save_before);
+				(void)rb_time_cmpxchg(&cpu_buffer->write_stamp,
+						      info->after, save_before);
 			}
 		}
 	} else {
 		u64 ts;
 		/* SLOW PATH - Interrupted between A and C */
-		a_ok = rb_time_read(&cpu_buffer->write_stamp, &after);
+		a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
 		/* Was interrupted before here, write_stamp must be valid */
 		RB_WARN_ON(cpu_buffer, !a_ok);
 		ts = rb_time_stamp(cpu_buffer->buffer);
 		barrier();
  /*E*/		if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
-		    after < ts) {
+		    info->after < ts) {
 			/* Nothing came after this event between C and E */
-			info->delta = ts - after;
-			(void)rb_time_cmpxchg(&cpu_buffer->write_stamp, after, info->ts);
+			info->delta = ts - info->after;
+			(void)rb_time_cmpxchg(&cpu_buffer->write_stamp,
+					      info->after, info->ts);
 			info->ts = ts;
 		} else {
 			/*
@@ -3277,6 +3275,7 @@ rb_reserve_next_event(struct trace_buffer *buffer,
 	struct ring_buffer_event *event;
 	struct rb_event_info info;
 	int nr_loops = 0;
+	int add_ts_default;
 
 	rb_start_commit(cpu_buffer);
 	/* The commit page can not change after this */
@@ -3297,8 +3296,16 @@ rb_reserve_next_event(struct trace_buffer *buffer,
 #endif
 
 	info.length = rb_calculate_event_length(length);
+
+	if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
+		add_ts_default = RB_ADD_STAMP_ABSOLUTE;
+		info.length += RB_LEN_TIME_EXTEND;
+	} else {
+		add_ts_default = RB_ADD_STAMP_NONE;
+	}
+
  again:
-	info.add_timestamp = RB_ADD_STAMP_NONE;
+	info.add_timestamp = add_ts_default;
 	info.delta = 0;
 
 	/*
@@ -3316,7 +3323,7 @@ rb_reserve_next_event(struct trace_buffer *buffer,
 	event = __rb_reserve_next(cpu_buffer, &info);
 
 	if (unlikely(PTR_ERR(event) == -EAGAIN)) {
-		if (info.add_timestamp)
+		if (info.add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND))
 			info.length -= RB_LEN_TIME_EXTEND;
 		goto again;
 	}
-- 
cgit v1.2.3


From 74e879373b377f15d4ecb45bf8316b77e8badc49 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 30 Jun 2020 12:47:56 -0400
Subject: ring-buffer: Move the add_timestamp into its own function

Make a helper function rb_add_timestamp() that moves the adding of the
extended time stamps into its own function. Also, remove the noinline and
inline for the functions it calls, as recent benchmarks appear they do not
make a difference (just let gcc decide).

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ce125cbe98a5..a30ca7ec2200 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2596,8 +2596,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	return NULL;
 }
 
-/* Slow path, do not inline */
-static noinline struct ring_buffer_event *
+/* Slow path */
+static struct ring_buffer_event *
 rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
 {
 	if (abs)
@@ -2628,7 +2628,7 @@ static inline bool sched_clock_stable(void)
 }
 #endif
 
-static noinline void
+static void
 rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
 		   struct rb_event_info *info)
 {
@@ -2648,6 +2648,21 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
 		  "or add trace_clock=global to the kernel command line\n");
 }
 
+static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+				      struct ring_buffer_event **event,
+				      struct rb_event_info *info,
+				      u64 *delta,
+				      unsigned int *length)
+{
+	bool abs = info->add_timestamp &
+		(RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE);
+
+	rb_check_timestamp(cpu_buffer, info);
+	*event = rb_add_time_stamp(*event, info->delta, abs);
+	*length -= RB_LEN_TIME_EXTEND;
+	*delta = 0;
+}
+
 /**
  * rb_update_event - update event type and data
  * @cpu_buffer: The per cpu buffer of the @event
@@ -2671,15 +2686,8 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
 	 * If we need to add a timestamp, then we
 	 * add it to the start of the reserved space.
 	 */
-	if (unlikely(info->add_timestamp)) {
-		bool abs = info->add_timestamp &
-			(RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE);
-
-		rb_check_timestamp(cpu_buffer, info);
-		event = rb_add_time_stamp(event, abs ? info->delta : delta, abs);
-		length -= RB_LEN_TIME_EXTEND;
-		delta = 0;
-	}
+	if (unlikely(info->add_timestamp))
+		rb_add_timestamp(cpu_buffer, &event, info, &delta, &length);
 
 	event->time_delta = delta;
 	length -= RB_EVNT_HDR_SIZE;
-- 
cgit v1.2.3


From bbeba3e58f040a4297a5ba88ebf6e2b16adc3657 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 30 Jun 2020 13:05:29 -0400
Subject: ring-buffer: Call trace_clock_local() directly for RETPOLINE kernels

After doing some benchmarks and examining the code, I found that the ring
buffer clock calls were quite expensive, and noticed that it uses
retpolines. This is because the ring buffer clock is programmable, and can
be set. But in most cases it simply uses the fastest ns unit clock which is
the trace_clock_local(). For RETPOLINE builds, checking if the ring buffer
clock is set to trace_clock_local() and then calling it directly has brought
the time of an event on my i7 box from an average of 93 nanoseconds an event
down to 83 nanoseconds an event, and the minimum time from 81 nanoseconds to
68 nanoseconds!

Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a30ca7ec2200..2bb96ee80120 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -970,8 +970,16 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
 
 static inline u64 rb_time_stamp(struct trace_buffer *buffer)
 {
+	u64 ts;
+
+	/* Skip retpolines :-( */
+	if (IS_ENABLED(CONFIG_RETPOLINE) && likely(buffer->clock == trace_clock_local))
+		ts = trace_clock_local();
+	else
+		ts = buffer->clock();
+
 	/* shift to debug/test normalization and TIME_EXTENTS */
-	return buffer->clock() << DEBUG_SHIFT;
+	return ts << DEBUG_SHIFT;
 }
 
 u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu)
-- 
cgit v1.2.3


From 29ce24519c0692ca7d998d7444a9e016a4c44fa7 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 1 Jul 2020 13:10:19 -0400
Subject: ring-buffer: Do not trigger a WARN if clock going backwards is
 detected

After tweaking the ring buffer to be a bit faster, a warning is triggering
on one of my machines, and causing my tests to fail. This warning is caused
when the delta (current time stamp minus previous time stamp), is larger
than the max time held by the ring buffer (59 bits).

If the clock were to go backwards slightly, this would then easily trigger
this warning. The machine that it triggered on, the clock did go backwards
by around 450 nanoseconds, and this happened after a recalibration of the
TSC clock. Now that the ring buffer is faster, it detects this, and the
delta that is used larger than the max, the warning is triggered and my test
fails.

To handle the clock going backwards, look at the saved before and after time
stamps. If they are the same, it means that the current event did not
interrupt another event, and that those timestamp are of a previous event
that was recorded. If the max delta is triggered, look at those time stamps,
make sure they are the same, then use them to compare with the current
timestamp. If the current timestamp is less than the before/after time
stamps, then that means the clock being used went backward.

Print out a message that this has happened, but do not warn about it (and
only print the message once).

Still do the warning if the delta is indeed larger than what can be used.

Also remove the unneeded KERN_WARNING from the WARN_ONCE() print.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2bb96ee80120..c3a2e7509527 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2642,8 +2642,7 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
 {
 	u64 write_stamp;
 
-	WARN_ONCE(info->delta > (1ULL << 59),
-		  KERN_WARNING "Delta way too big! %llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s",
+	WARN_ONCE(1, "Delta way too big! %llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s",
 		  (unsigned long long)info->delta,
 		  (unsigned long long)info->ts,
 		  (unsigned long long)info->before,
@@ -2665,7 +2664,26 @@ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
 	bool abs = info->add_timestamp &
 		(RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE);
 
-	rb_check_timestamp(cpu_buffer, info);
+	if (unlikely(info->delta > (1ULL << 59))) {
+		/* did the clock go backwards */
+		if (info->before == info->after && info->before > info->ts) {
+			/* not interrupted */
+			static int once;
+
+			/*
+			 * This is possible with a recalibrating of the TSC.
+			 * Do not produce a call stack, but just report it.
+			 */
+			if (!once) {
+				once++;
+				pr_warn("Ring buffer clock went backwards: %llu -> %llu\n",
+					info->before, info->ts);
+			}
+		} else
+			rb_check_timestamp(cpu_buffer, info);
+		if (!abs)
+			info->delta = 0;
+	}
 	*event = rb_add_time_stamp(*event, info->delta, abs);
 	*length -= RB_LEN_TIME_EXTEND;
 	*delta = 0;
-- 
cgit v1.2.3


From 10dd8573b09e84b81539d939d55ebdb6a36c5f3a Mon Sep 17 00:00:00 2001
From: Quentin Perret <qperret@google.com>
Date: Mon, 29 Jun 2020 13:54:59 +0530
Subject: cpufreq: Register governors at core_initcall

Currently, most CPUFreq governors are registered at the core_initcall
time when the given governor is the default one, and the module_init
time otherwise.

In preparation for letting users specify the default governor on the
kernel command line, change all of them to be registered at the
core_initcall unconditionally, as it is already the case for the
schedutil and performance governors. This will allow us to assume
that builtin governors have been registered before the built-in
CPUFreq drivers probe.

And since all governors have similar init/exit patterns now, introduce
two new macros, cpufreq_governor_{init,exit}(), to factorize the code.

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Quentin Perret <qperret@google.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
[ rjw: Changelog ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/powerpc/platforms/cell/cpufreq_spudemand.c | 26 ++-----------------------
 drivers/cpufreq/cpufreq_conservative.c          | 22 +++++----------------
 drivers/cpufreq/cpufreq_ondemand.c              | 24 ++++++-----------------
 drivers/cpufreq/cpufreq_performance.c           | 14 ++-----------
 drivers/cpufreq/cpufreq_powersave.c             | 18 +++--------------
 drivers/cpufreq/cpufreq_userspace.c             | 18 +++--------------
 include/linux/cpufreq.h                         | 14 +++++++++++++
 kernel/sched/cpufreq_schedutil.c                |  6 +-----
 8 files changed, 36 insertions(+), 106 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/platforms/cell/cpufreq_spudemand.c b/arch/powerpc/platforms/cell/cpufreq_spudemand.c
index 55b31eadb3c8..ca7849e113d7 100644
--- a/arch/powerpc/platforms/cell/cpufreq_spudemand.c
+++ b/arch/powerpc/platforms/cell/cpufreq_spudemand.c
@@ -126,30 +126,8 @@ static struct cpufreq_governor spu_governor = {
 	.stop = spu_gov_stop,
 	.owner = THIS_MODULE,
 };
-
-/*
- * module init and destoy
- */
-
-static int __init spu_gov_init(void)
-{
-	int ret;
-
-	ret = cpufreq_register_governor(&spu_governor);
-	if (ret)
-		printk(KERN_ERR "registration of governor failed\n");
-	return ret;
-}
-
-static void __exit spu_gov_exit(void)
-{
-	cpufreq_unregister_governor(&spu_governor);
-}
-
-
-module_init(spu_gov_init);
-module_exit(spu_gov_exit);
+cpufreq_governor_init(spu_governor);
+cpufreq_governor_exit(spu_governor);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Christian Krafft <krafft@de.ibm.com>");
-
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 737ff3b9c2c0..aa39ff31ec9f 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -322,17 +322,7 @@ static struct dbs_governor cs_governor = {
 	.start = cs_start,
 };
 
-#define CPU_FREQ_GOV_CONSERVATIVE	(&cs_governor.gov)
-
-static int __init cpufreq_gov_dbs_init(void)
-{
-	return cpufreq_register_governor(CPU_FREQ_GOV_CONSERVATIVE);
-}
-
-static void __exit cpufreq_gov_dbs_exit(void)
-{
-	cpufreq_unregister_governor(CPU_FREQ_GOV_CONSERVATIVE);
-}
+#define CPU_FREQ_GOV_CONSERVATIVE	(cs_governor.gov)
 
 MODULE_AUTHOR("Alexander Clouter <alex@digriz.org.uk>");
 MODULE_DESCRIPTION("'cpufreq_conservative' - A dynamic cpufreq governor for "
@@ -343,11 +333,9 @@ MODULE_LICENSE("GPL");
 #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
 struct cpufreq_governor *cpufreq_default_governor(void)
 {
-	return CPU_FREQ_GOV_CONSERVATIVE;
+	return &CPU_FREQ_GOV_CONSERVATIVE;
 }
-
-core_initcall(cpufreq_gov_dbs_init);
-#else
-module_init(cpufreq_gov_dbs_init);
 #endif
-module_exit(cpufreq_gov_dbs_exit);
+
+cpufreq_governor_init(CPU_FREQ_GOV_CONSERVATIVE);
+cpufreq_governor_exit(CPU_FREQ_GOV_CONSERVATIVE);
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 82a4d37ddecb..ac361a8b1d3b 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -408,7 +408,7 @@ static struct dbs_governor od_dbs_gov = {
 	.start = od_start,
 };
 
-#define CPU_FREQ_GOV_ONDEMAND	(&od_dbs_gov.gov)
+#define CPU_FREQ_GOV_ONDEMAND	(od_dbs_gov.gov)
 
 static void od_set_powersave_bias(unsigned int powersave_bias)
 {
@@ -429,7 +429,7 @@ static void od_set_powersave_bias(unsigned int powersave_bias)
 			continue;
 
 		policy = cpufreq_cpu_get_raw(cpu);
-		if (!policy || policy->governor != CPU_FREQ_GOV_ONDEMAND)
+		if (!policy || policy->governor != &CPU_FREQ_GOV_ONDEMAND)
 			continue;
 
 		policy_dbs = policy->governor_data;
@@ -461,16 +461,6 @@ void od_unregister_powersave_bias_handler(void)
 }
 EXPORT_SYMBOL_GPL(od_unregister_powersave_bias_handler);
 
-static int __init cpufreq_gov_dbs_init(void)
-{
-	return cpufreq_register_governor(CPU_FREQ_GOV_ONDEMAND);
-}
-
-static void __exit cpufreq_gov_dbs_exit(void)
-{
-	cpufreq_unregister_governor(CPU_FREQ_GOV_ONDEMAND);
-}
-
 MODULE_AUTHOR("Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>");
 MODULE_AUTHOR("Alexey Starikovskiy <alexey.y.starikovskiy@intel.com>");
 MODULE_DESCRIPTION("'cpufreq_ondemand' - A dynamic cpufreq governor for "
@@ -480,11 +470,9 @@ MODULE_LICENSE("GPL");
 #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
 struct cpufreq_governor *cpufreq_default_governor(void)
 {
-	return CPU_FREQ_GOV_ONDEMAND;
+	return &CPU_FREQ_GOV_ONDEMAND;
 }
-
-core_initcall(cpufreq_gov_dbs_init);
-#else
-module_init(cpufreq_gov_dbs_init);
 #endif
-module_exit(cpufreq_gov_dbs_exit);
+
+cpufreq_governor_init(CPU_FREQ_GOV_ONDEMAND);
+cpufreq_governor_exit(CPU_FREQ_GOV_ONDEMAND);
diff --git a/drivers/cpufreq/cpufreq_performance.c b/drivers/cpufreq/cpufreq_performance.c
index def9afe0f5b8..71c1d9aba772 100644
--- a/drivers/cpufreq/cpufreq_performance.c
+++ b/drivers/cpufreq/cpufreq_performance.c
@@ -23,16 +23,6 @@ static struct cpufreq_governor cpufreq_gov_performance = {
 	.limits		= cpufreq_gov_performance_limits,
 };
 
-static int __init cpufreq_gov_performance_init(void)
-{
-	return cpufreq_register_governor(&cpufreq_gov_performance);
-}
-
-static void __exit cpufreq_gov_performance_exit(void)
-{
-	cpufreq_unregister_governor(&cpufreq_gov_performance);
-}
-
 #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE
 struct cpufreq_governor *cpufreq_default_governor(void)
 {
@@ -50,5 +40,5 @@ MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
 MODULE_DESCRIPTION("CPUfreq policy governor 'performance'");
 MODULE_LICENSE("GPL");
 
-core_initcall(cpufreq_gov_performance_init);
-module_exit(cpufreq_gov_performance_exit);
+cpufreq_governor_init(cpufreq_gov_performance);
+cpufreq_governor_exit(cpufreq_gov_performance);
diff --git a/drivers/cpufreq/cpufreq_powersave.c b/drivers/cpufreq/cpufreq_powersave.c
index 1ae66019eb83..7749522355b5 100644
--- a/drivers/cpufreq/cpufreq_powersave.c
+++ b/drivers/cpufreq/cpufreq_powersave.c
@@ -23,16 +23,6 @@ static struct cpufreq_governor cpufreq_gov_powersave = {
 	.owner		= THIS_MODULE,
 };
 
-static int __init cpufreq_gov_powersave_init(void)
-{
-	return cpufreq_register_governor(&cpufreq_gov_powersave);
-}
-
-static void __exit cpufreq_gov_powersave_exit(void)
-{
-	cpufreq_unregister_governor(&cpufreq_gov_powersave);
-}
-
 MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
 MODULE_DESCRIPTION("CPUfreq policy governor 'powersave'");
 MODULE_LICENSE("GPL");
@@ -42,9 +32,7 @@ struct cpufreq_governor *cpufreq_default_governor(void)
 {
 	return &cpufreq_gov_powersave;
 }
-
-core_initcall(cpufreq_gov_powersave_init);
-#else
-module_init(cpufreq_gov_powersave_init);
 #endif
-module_exit(cpufreq_gov_powersave_exit);
+
+cpufreq_governor_init(cpufreq_gov_powersave);
+cpufreq_governor_exit(cpufreq_gov_powersave);
diff --git a/drivers/cpufreq/cpufreq_userspace.c b/drivers/cpufreq/cpufreq_userspace.c
index b43e7cd502c5..50a4d7846580 100644
--- a/drivers/cpufreq/cpufreq_userspace.c
+++ b/drivers/cpufreq/cpufreq_userspace.c
@@ -126,16 +126,6 @@ static struct cpufreq_governor cpufreq_gov_userspace = {
 	.owner		= THIS_MODULE,
 };
 
-static int __init cpufreq_gov_userspace_init(void)
-{
-	return cpufreq_register_governor(&cpufreq_gov_userspace);
-}
-
-static void __exit cpufreq_gov_userspace_exit(void)
-{
-	cpufreq_unregister_governor(&cpufreq_gov_userspace);
-}
-
 MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>, "
 		"Russell King <rmk@arm.linux.org.uk>");
 MODULE_DESCRIPTION("CPUfreq policy governor 'userspace'");
@@ -146,9 +136,7 @@ struct cpufreq_governor *cpufreq_default_governor(void)
 {
 	return &cpufreq_gov_userspace;
 }
-
-core_initcall(cpufreq_gov_userspace_init);
-#else
-module_init(cpufreq_gov_userspace_init);
 #endif
-module_exit(cpufreq_gov_userspace_exit);
+
+cpufreq_governor_init(cpufreq_gov_userspace);
+cpufreq_governor_exit(cpufreq_gov_userspace);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 3494f6763597..e62b022cb07e 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -577,6 +577,20 @@ unsigned int cpufreq_policy_transition_delay_us(struct cpufreq_policy *policy);
 int cpufreq_register_governor(struct cpufreq_governor *governor);
 void cpufreq_unregister_governor(struct cpufreq_governor *governor);
 
+#define cpufreq_governor_init(__governor)			\
+static int __init __governor##_init(void)			\
+{								\
+	return cpufreq_register_governor(&__governor);	\
+}								\
+core_initcall(__governor##_init)
+
+#define cpufreq_governor_exit(__governor)			\
+static void __exit __governor##_exit(void)			\
+{								\
+	return cpufreq_unregister_governor(&__governor);	\
+}								\
+module_exit(__governor##_exit)
+
 struct cpufreq_governor *cpufreq_default_governor(void);
 struct cpufreq_governor *cpufreq_fallback_governor(void);
 
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 7fbaee24c824..402a09af9f43 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -909,11 +909,7 @@ struct cpufreq_governor *cpufreq_default_governor(void)
 }
 #endif
 
-static int __init sugov_register(void)
-{
-	return cpufreq_register_governor(&schedutil_gov);
-}
-core_initcall(sugov_register);
+cpufreq_governor_init(schedutil_gov);
 
 #ifdef CONFIG_ENERGY_MODEL
 extern bool sched_energy_update;
-- 
cgit v1.2.3


From 1d50e5d0c5052446cb85a3bf11fe8ba4e8d770ca Mon Sep 17 00:00:00 2001
From: Bhupesh Sharma <bhsharma@redhat.com>
Date: Thu, 14 May 2020 00:22:36 +0530
Subject: crash_core, vmcoreinfo: Append 'MAX_PHYSMEM_BITS' to vmcoreinfo

Right now user-space tools like 'makedumpfile' and 'crash' need to rely
on a best-guess method of determining value of 'MAX_PHYSMEM_BITS'
supported by underlying kernel.

This value is used in user-space code to calculate the bit-space
required to store a section for SPARESMEM (similar to the existing
calculation method used in the kernel implementation):

  #define SECTIONS_SHIFT    (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)

Now, regressions have been reported in user-space utilities
like 'makedumpfile' and 'crash' on arm64, with the recently added
kernel support for 52-bit physical address space, as there is
no clear method of determining this value in user-space
(other than reading kernel CONFIG flags).

As per suggestion from makedumpfile maintainer (Kazu), it makes more
sense to append 'MAX_PHYSMEM_BITS' to vmcoreinfo in the core code itself
rather than in arch-specific code, so that the user-space code for other
archs can also benefit from this addition to the vmcoreinfo and use it
as a standard way of determining 'SECTIONS_SHIFT' value in user-land.

A reference 'makedumpfile' implementation which reads the
'MAX_PHYSMEM_BITS' value from vmcoreinfo in a arch-independent fashion
is available here:

While at it also update vmcoreinfo documentation for 'MAX_PHYSMEM_BITS'
variable being added to vmcoreinfo.

'MAX_PHYSMEM_BITS' defines the maximum supported physical address
space memory.

Signed-off-by: Bhupesh Sharma <bhsharma@redhat.com>
Tested-by: John Donnelly <john.p.donnelly@oracle.com>
Acked-by: Dave Young <dyoung@redhat.com>
Cc: Boris Petkov <bp@alien8.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: James Morse <james.morse@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Dave Anderson <anderson@redhat.com>
Cc: Kazuhito Hagio <k-hagio@ab.jp.nec.com>
Cc: x86@kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Cc: kexec@lists.infradead.org
Link: https://lore.kernel.org/r/1589395957-24628-2-git-send-email-bhsharma@redhat.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 Documentation/admin-guide/kdump/vmcoreinfo.rst | 5 +++++
 kernel/crash_core.c                            | 1 +
 2 files changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
index e4ee8b2db604..2a632020f809 100644
--- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
+++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
@@ -93,6 +93,11 @@ It exists in the sparse memory mapping model, and it is also somewhat
 similar to the mem_map variable, both of them are used to translate an
 address.
 
+MAX_PHYSMEM_BITS
+----------------
+
+Defines the maximum supported physical address space memory.
+
 page
 ----
 
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 9f1557b98468..18175687133a 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -413,6 +413,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
 	VMCOREINFO_STRUCT_SIZE(mem_section);
 	VMCOREINFO_OFFSET(mem_section, section_mem_map);
+	VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS);
 #endif
 	VMCOREINFO_STRUCT_SIZE(page);
 	VMCOREINFO_STRUCT_SIZE(pglist_data);
-- 
cgit v1.2.3


From 046cc3dd9a2507b8e111714807ab8bf15ea5bb70 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Thu, 2 Jul 2020 19:45:37 -0700
Subject: bpf: Fix build without CONFIG_STACKTRACE

Without CONFIG_STACKTRACE stack_trace_save_tsk() is not defined. Let
get_callchain_entry_for_task() to always return NULL in such cases.

Fixes: fa28dcb82a38 ("bpf: Introduce helper bpf_get_task_stack()")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200703024537.79971-1-songliubraving@fb.com
---
 kernel/bpf/stackmap.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 5ad72ab2276b..a6c361ed7937 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -351,6 +351,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 static struct perf_callchain_entry *
 get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
 {
+#ifdef CONFIG_STACKTRACE
 	struct perf_callchain_entry *entry;
 	int rctx;
 
@@ -380,6 +381,9 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
 	put_callchain_entry(rctx);
 
 	return entry;
+#else /* CONFIG_STACKTRACE */
+	return NULL;
+#endif
 }
 
 BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
-- 
cgit v1.2.3


From 8fa88a88d573093868565a1afba43b5ae5b3a316 Mon Sep 17 00:00:00 2001
From: Valentin Schneider <valentin.schneider@arm.com>
Date: Fri, 3 Jul 2020 16:56:45 +0100
Subject: genirq: Remove preflow handler support

That was put in place for sparc64, and blackfin also used it for some time;
sparc64 no longer uses those, and blackfin is dead.

As there are no more users, remove preflow handlers.

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20200703155645.29703-3-valentin.schneider@arm.com
---
 include/linux/irqdesc.h    | 15 ---------------
 include/linux/irqhandler.h |  1 -
 kernel/irq/Kconfig         |  4 ----
 kernel/irq/chip.c          | 13 -------------
 4 files changed, 33 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 8f2820c5e69e..5745491303e0 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -22,7 +22,6 @@ struct pt_regs;
  * @irq_common_data:	per irq and chip data passed down to chip functions
  * @kstat_irqs:		irq stats per cpu
  * @handle_irq:		highlevel irq-events handler
- * @preflow_handler:	handler called before the flow handler (currently used by sparc)
  * @action:		the irq action chain
  * @status_use_accessors: status information
  * @core_internal_state__do_not_mess_with_it: core internal status information
@@ -58,9 +57,6 @@ struct irq_desc {
 	struct irq_data		irq_data;
 	unsigned int __percpu	*kstat_irqs;
 	irq_flow_handler_t	handle_irq;
-#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
-	irq_preflow_handler_t	preflow_handler;
-#endif
 	struct irqaction	*action;	/* IRQ action list */
 	unsigned int		status_use_accessors;
 	unsigned int		core_internal_state__do_not_mess_with_it;
@@ -268,15 +264,4 @@ irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class,
 	}
 }
 
-#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
-static inline void
-__irq_set_preflow_handler(unsigned int irq, irq_preflow_handler_t handler)
-{
-	struct irq_desc *desc;
-
-	desc = irq_to_desc(irq);
-	desc->preflow_handler = handler;
-}
-#endif
-
 #endif
diff --git a/include/linux/irqhandler.h b/include/linux/irqhandler.h
index 1e6f4e7123d6..c30f454a9518 100644
--- a/include/linux/irqhandler.h
+++ b/include/linux/irqhandler.h
@@ -10,6 +10,5 @@
 struct irq_desc;
 struct irq_data;
 typedef	void (*irq_flow_handler_t)(struct irq_desc *desc);
-typedef	void (*irq_preflow_handler_t)(struct irq_data *data);
 
 #endif
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 20512252ecc9..10a5aff4eecc 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -51,10 +51,6 @@ config GENERIC_IRQ_INJECTION
 config HARDIRQS_SW_RESEND
        bool
 
-# Preflow handler support for fasteoi (sparc64)
-config IRQ_PREFLOW_FASTEOI
-       bool
-
 # Edge style eoi based handler (cell)
 config IRQ_EDGE_EOI_HANDLER
        bool
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 41e7e37a0928..75bbaa8b38f1 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -656,16 +656,6 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(handle_level_irq);
 
-#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
-static inline void preflow_handler(struct irq_desc *desc)
-{
-	if (desc->preflow_handler)
-		desc->preflow_handler(&desc->irq_data);
-}
-#else
-static inline void preflow_handler(struct irq_desc *desc) { }
-#endif
-
 static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip)
 {
 	if (!(desc->istate & IRQS_ONESHOT)) {
@@ -721,7 +711,6 @@ void handle_fasteoi_irq(struct irq_desc *desc)
 	if (desc->istate & IRQS_ONESHOT)
 		mask_irq(desc);
 
-	preflow_handler(desc);
 	handle_irq_event(desc);
 
 	cond_unmask_eoi_irq(desc, chip);
@@ -1231,7 +1220,6 @@ void handle_fasteoi_ack_irq(struct irq_desc *desc)
 	/* Start handling the irq */
 	desc->irq_data.chip->irq_ack(&desc->irq_data);
 
-	preflow_handler(desc);
 	handle_irq_event(desc);
 
 	cond_unmask_eoi_irq(desc, chip);
@@ -1281,7 +1269,6 @@ void handle_fasteoi_mask_irq(struct irq_desc *desc)
 	if (desc->istate & IRQS_ONESHOT)
 		mask_irq(desc);
 
-	preflow_handler(desc);
 	handle_irq_event(desc);
 
 	cond_unmask_eoi_irq(desc, chip);
-- 
cgit v1.2.3


From 5fec25f2cb959cb5f189d7f6127bee3efc782530 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 24 Jun 2020 16:34:57 -0500
Subject: umh: Capture the pid in umh_pipe_setup

The pid in struct subprocess_info is only used by umh_clean_and_save_pid to
write the pid into umh_info.

Instead always capture the pid on struct umh_info in umh_pipe_setup, removing
code that is specific to user mode drivers from the common user path of
user mode helpers.

v1: https://lkml.kernel.org/r/87h7uygf9i.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/875zb97iix.fsf_-_@x220.int.ebiederm.org
Link: https://lkml.kernel.org/r/20200702164140.4468-1-ebiederm@xmission.com
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/umh.h | 1 -
 kernel/umh.c        | 5 ++---
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/umh.h b/include/linux/umh.h
index 0c08de356d0d..aae16a0ebd0f 100644
--- a/include/linux/umh.h
+++ b/include/linux/umh.h
@@ -25,7 +25,6 @@ struct subprocess_info {
 	struct file *file;
 	int wait;
 	int retval;
-	pid_t pid;
 	int (*init)(struct subprocess_info *info, struct cred *new);
 	void (*cleanup)(struct subprocess_info *info);
 	void *data;
diff --git a/kernel/umh.c b/kernel/umh.c
index 79f139a7ca03..c2a582b3a2bf 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -102,7 +102,6 @@ static int call_usermodehelper_exec_async(void *data)
 
 	commit_creds(new);
 
-	sub_info->pid = task_pid_nr(current);
 	if (sub_info->file) {
 		retval = do_execve_file(sub_info->file,
 					sub_info->argv, sub_info->envp);
@@ -468,6 +467,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
 
 	umh_info->pipe_to_umh = to_umh[1];
 	umh_info->pipe_from_umh = from_umh[0];
+	umh_info->pid = task_pid_nr(current);
 	return 0;
 }
 
@@ -476,13 +476,12 @@ static void umh_clean_and_save_pid(struct subprocess_info *info)
 	struct umh_info *umh_info = info->data;
 
 	/* cleanup if umh_pipe_setup() was successful but exec failed */
-	if (info->pid && info->retval) {
+	if (info->retval) {
 		fput(umh_info->pipe_to_umh);
 		fput(umh_info->pipe_from_umh);
 	}
 
 	argv_free(info->argv);
-	umh_info->pid = info->pid;
 }
 
 /**
-- 
cgit v1.2.3


From b044fa2ae50d52d8c9f9d130055c2aea032e7475 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 25 Jun 2020 10:04:25 -0500
Subject: umh: Move setting PF_UMH into umh_pipe_setup

I am separating the code specific to user mode drivers from the code
for ordinary user space helpers.  Move setting of PF_UMH from
call_usermodehelper_exec_async which is core user mode helper code
into umh_pipe_setup which is user mode driver code.

The code is equally as easy to write in one location as the other and
the movement minimizes the impact of the user mode driver code on the
core of the user mode helper code.

Setting PF_UMH unconditionally is harmless as an action will only
happen if it is paired with an entry on umh_list.

v1: https://lkml.kernel.org/r/87bll6gf8t.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/87zh8l63xs.fsf_-_@x220.int.ebiederm.org
Link: https://lkml.kernel.org/r/20200702164140.4468-2-ebiederm@xmission.com
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/umh.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/umh.c b/kernel/umh.c
index c2a582b3a2bf..e6b9d6636850 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -102,12 +102,10 @@ static int call_usermodehelper_exec_async(void *data)
 
 	commit_creds(new);
 
-	if (sub_info->file) {
+	if (sub_info->file)
 		retval = do_execve_file(sub_info->file,
 					sub_info->argv, sub_info->envp);
-		if (!retval)
-			current->flags |= PF_UMH;
-	} else
+	else
 		retval = do_execve(getname_kernel(sub_info->path),
 				   (const char __user *const __user *)sub_info->argv,
 				   (const char __user *const __user *)sub_info->envp);
@@ -468,6 +466,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
 	umh_info->pipe_to_umh = to_umh[1];
 	umh_info->pipe_from_umh = from_umh[0];
 	umh_info->pid = task_pid_nr(current);
+	current->flags |= PF_UMH;
 	return 0;
 }
 
-- 
cgit v1.2.3


From 3a171042aeab8702391c311d84633a6c267d566c Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 25 Jun 2020 10:10:03 -0500
Subject: umh: Rename the user mode driver helpers for clarity

Now that the functionality of umh_setup_pipe and
umh_clean_and_save_pid has changed their names are too specific and
don't make much sense.  Instead name them  umd_setup and umd_cleanup
for the functional role in setting up user mode drivers.

v1: https://lkml.kernel.org/r/875zbegf82.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/87tuyt63x3.fsf_-_@x220.int.ebiederm.org
Link: https://lkml.kernel.org/r/20200702164140.4468-3-ebiederm@xmission.com
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/umh.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/umh.c b/kernel/umh.c
index e6b9d6636850..26c3d493f168 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -429,7 +429,7 @@ struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
 	return sub_info;
 }
 
-static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
+static int umd_setup(struct subprocess_info *info, struct cred *new)
 {
 	struct umh_info *umh_info = info->data;
 	struct file *from_umh[2];
@@ -470,11 +470,11 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
 	return 0;
 }
 
-static void umh_clean_and_save_pid(struct subprocess_info *info)
+static void umd_cleanup(struct subprocess_info *info)
 {
 	struct umh_info *umh_info = info->data;
 
-	/* cleanup if umh_pipe_setup() was successful but exec failed */
+	/* cleanup if umh_setup() was successful but exec failed */
 	if (info->retval) {
 		fput(umh_info->pipe_to_umh);
 		fput(umh_info->pipe_from_umh);
@@ -520,8 +520,8 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
 	}
 
 	err = -ENOMEM;
-	sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup,
-						  umh_clean_and_save_pid, info);
+	sub_info = call_usermodehelper_setup_file(file, umd_setup, umd_cleanup,
+						  info);
 	if (!sub_info)
 		goto out;
 
-- 
cgit v1.2.3


From 21d598280675c463ea1b264fab06e9614aacd1e1 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 24 Jun 2020 17:01:18 -0500
Subject: umh: Remove call_usermodehelper_setup_file.

The only caller of call_usermodehelper_setup_file is fork_usermode_blob.
In fork_usermode_blob replace call_usermodehelper_setup_file with
call_usermodehelper_setup and delete fork_usermodehelper_setup_file.

For this to work the argv_free is moved from umh_clean_and_save_pid
to fork_usermode_blob.

v1: https://lkml.kernel.org/r/87zh8qf0mp.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/87o8p163u1.fsf_-_@x220.int.ebiederm.org
Link: https://lkml.kernel.org/r/20200702164140.4468-4-ebiederm@xmission.com
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/umh.h |  3 ---
 kernel/umh.c        | 42 +++++++++++-------------------------------
 2 files changed, 11 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/umh.h b/include/linux/umh.h
index aae16a0ebd0f..de08af00c68a 100644
--- a/include/linux/umh.h
+++ b/include/linux/umh.h
@@ -39,9 +39,6 @@ call_usermodehelper_setup(const char *path, char **argv, char **envp,
 			  int (*init)(struct subprocess_info *info, struct cred *new),
 			  void (*cleanup)(struct subprocess_info *), void *data);
 
-struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
-			  int (*init)(struct subprocess_info *info, struct cred *new),
-			  void (*cleanup)(struct subprocess_info *), void *data);
 struct umh_info {
 	const char *cmdline;
 	struct file *pipe_to_umh;
diff --git a/kernel/umh.c b/kernel/umh.c
index 26c3d493f168..b8fa9b99b366 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -402,33 +402,6 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
 }
 EXPORT_SYMBOL(call_usermodehelper_setup);
 
-struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
-		int (*init)(struct subprocess_info *info, struct cred *new),
-		void (*cleanup)(struct subprocess_info *info), void *data)
-{
-	struct subprocess_info *sub_info;
-	struct umh_info *info = data;
-	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
-
-	sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL);
-	if (!sub_info)
-		return NULL;
-
-	sub_info->argv = argv_split(GFP_KERNEL, cmdline, NULL);
-	if (!sub_info->argv) {
-		kfree(sub_info);
-		return NULL;
-	}
-
-	INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
-	sub_info->path = "none";
-	sub_info->file = file;
-	sub_info->init = init;
-	sub_info->cleanup = cleanup;
-	sub_info->data = data;
-	return sub_info;
-}
-
 static int umd_setup(struct subprocess_info *info, struct cred *new)
 {
 	struct umh_info *umh_info = info->data;
@@ -479,8 +452,6 @@ static void umd_cleanup(struct subprocess_info *info)
 		fput(umh_info->pipe_to_umh);
 		fput(umh_info->pipe_from_umh);
 	}
-
-	argv_free(info->argv);
 }
 
 /**
@@ -501,7 +472,9 @@ static void umd_cleanup(struct subprocess_info *info)
  */
 int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
 {
+	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
 	struct subprocess_info *sub_info;
+	char **argv = NULL;
 	struct file *file;
 	ssize_t written;
 	loff_t pos = 0;
@@ -520,11 +493,16 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
 	}
 
 	err = -ENOMEM;
-	sub_info = call_usermodehelper_setup_file(file, umd_setup, umd_cleanup,
-						  info);
+	argv = argv_split(GFP_KERNEL, cmdline, NULL);
+	if (!argv)
+		goto out;
+
+	sub_info = call_usermodehelper_setup("none", argv, NULL, GFP_KERNEL,
+					     umd_setup, umd_cleanup, info);
 	if (!sub_info)
 		goto out;
 
+	sub_info->file = file;
 	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
 	if (!err) {
 		mutex_lock(&umh_list_lock);
@@ -532,6 +510,8 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
 		mutex_unlock(&umh_list_lock);
 	}
 out:
+	if (argv)
+		argv_free(argv);
 	fput(file);
 	return err;
 }
-- 
cgit v1.2.3


From 884c5e683b67dbc52892e24c29eed864f330ec08 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 26 Jun 2020 12:23:00 -0500
Subject: umh: Separate the user mode driver and the user mode helper support

This makes it clear which code is part of the core user mode
helper support and which code is needed to implement user mode
drivers.

This makes the kernel smaller for everyone who does not use a usermode
driver.

v1: https://lkml.kernel.org/r/87tuyyf0ln.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/87imf963s6.fsf_-_@x220.int.ebiederm.org
Link: https://lkml.kernel.org/r/20200702164140.4468-5-ebiederm@xmission.com
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/bpfilter.h        |   2 +-
 include/linux/sched.h           |   8 ---
 include/linux/umh.h             |  10 ---
 include/linux/usermode_driver.h |  30 +++++++++
 kernel/Makefile                 |   1 +
 kernel/exit.c                   |   1 +
 kernel/umh.c                    | 139 --------------------------------------
 kernel/usermode_driver.c        | 146 ++++++++++++++++++++++++++++++++++++++++
 8 files changed, 179 insertions(+), 158 deletions(-)
 create mode 100644 include/linux/usermode_driver.h
 create mode 100644 kernel/usermode_driver.c

(limited to 'kernel')

diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
index d815622cd31e..d6d6206052a6 100644
--- a/include/linux/bpfilter.h
+++ b/include/linux/bpfilter.h
@@ -3,7 +3,7 @@
 #define _LINUX_BPFILTER_H
 
 #include <uapi/linux/bpfilter.h>
-#include <linux/umh.h>
+#include <linux/usermode_driver.h>
 
 struct sock;
 int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b62e6aaf28f0..59d1e92bb88e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2020,14 +2020,6 @@ static inline void rseq_execve(struct task_struct *t)
 
 #endif
 
-void __exit_umh(struct task_struct *tsk);
-
-static inline void exit_umh(struct task_struct *tsk)
-{
-	if (unlikely(tsk->flags & PF_UMH))
-		__exit_umh(tsk);
-}
-
 #ifdef CONFIG_DEBUG_RSEQ
 
 void rseq_syscall(struct pt_regs *regs);
diff --git a/include/linux/umh.h b/include/linux/umh.h
index de08af00c68a..73173c4a07e5 100644
--- a/include/linux/umh.h
+++ b/include/linux/umh.h
@@ -39,16 +39,6 @@ call_usermodehelper_setup(const char *path, char **argv, char **envp,
 			  int (*init)(struct subprocess_info *info, struct cred *new),
 			  void (*cleanup)(struct subprocess_info *), void *data);
 
-struct umh_info {
-	const char *cmdline;
-	struct file *pipe_to_umh;
-	struct file *pipe_from_umh;
-	struct list_head list;
-	void (*cleanup)(struct umh_info *info);
-	pid_t pid;
-};
-int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
-
 extern int
 call_usermodehelper_exec(struct subprocess_info *info, int wait);
 
diff --git a/include/linux/usermode_driver.h b/include/linux/usermode_driver.h
new file mode 100644
index 000000000000..c5f6dc950227
--- /dev/null
+++ b/include/linux/usermode_driver.h
@@ -0,0 +1,30 @@
+#ifndef __LINUX_USERMODE_DRIVER_H__
+#define __LINUX_USERMODE_DRIVER_H__
+
+#include <linux/umh.h>
+
+#ifdef CONFIG_BPFILTER
+void __exit_umh(struct task_struct *tsk);
+
+static inline void exit_umh(struct task_struct *tsk)
+{
+	if (unlikely(tsk->flags & PF_UMH))
+		__exit_umh(tsk);
+}
+#else
+static inline void exit_umh(struct task_struct *tsk)
+{
+}
+#endif
+
+struct umh_info {
+	const char *cmdline;
+	struct file *pipe_to_umh;
+	struct file *pipe_from_umh;
+	struct list_head list;
+	void (*cleanup)(struct umh_info *info);
+	pid_t pid;
+};
+int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
+
+#endif /* __LINUX_USERMODE_DRIVER_H__ */
diff --git a/kernel/Makefile b/kernel/Makefile
index f3218bc5ec69..43928759893a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,7 @@ obj-y     = fork.o exec_domain.o panic.o \
 	    notifier.o ksysfs.o cred.o reboot.o \
 	    async.o range.o smpboot.o ucount.o
 
+obj-$(CONFIG_BPFILTER) += usermode_driver.o
 obj-$(CONFIG_MODULES) += kmod.o
 obj-$(CONFIG_MULTIUSER) += groups.o
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 727150f28103..a081deea52ca 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -63,6 +63,7 @@
 #include <linux/random.h>
 #include <linux/rcuwait.h>
 #include <linux/compat.h>
+#include <linux/usermode_driver.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
diff --git a/kernel/umh.c b/kernel/umh.c
index b8fa9b99b366..3e4e453d45c8 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -26,8 +26,6 @@
 #include <linux/ptrace.h>
 #include <linux/async.h>
 #include <linux/uaccess.h>
-#include <linux/shmem_fs.h>
-#include <linux/pipe_fs_i.h>
 
 #include <trace/events/module.h>
 
@@ -38,8 +36,6 @@ static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
 static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
 static DEFINE_SPINLOCK(umh_sysctl_lock);
 static DECLARE_RWSEM(umhelper_sem);
-static LIST_HEAD(umh_list);
-static DEFINE_MUTEX(umh_list_lock);
 
 static void call_usermodehelper_freeinfo(struct subprocess_info *info)
 {
@@ -402,121 +398,6 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
 }
 EXPORT_SYMBOL(call_usermodehelper_setup);
 
-static int umd_setup(struct subprocess_info *info, struct cred *new)
-{
-	struct umh_info *umh_info = info->data;
-	struct file *from_umh[2];
-	struct file *to_umh[2];
-	int err;
-
-	/* create pipe to send data to umh */
-	err = create_pipe_files(to_umh, 0);
-	if (err)
-		return err;
-	err = replace_fd(0, to_umh[0], 0);
-	fput(to_umh[0]);
-	if (err < 0) {
-		fput(to_umh[1]);
-		return err;
-	}
-
-	/* create pipe to receive data from umh */
-	err = create_pipe_files(from_umh, 0);
-	if (err) {
-		fput(to_umh[1]);
-		replace_fd(0, NULL, 0);
-		return err;
-	}
-	err = replace_fd(1, from_umh[1], 0);
-	fput(from_umh[1]);
-	if (err < 0) {
-		fput(to_umh[1]);
-		replace_fd(0, NULL, 0);
-		fput(from_umh[0]);
-		return err;
-	}
-
-	umh_info->pipe_to_umh = to_umh[1];
-	umh_info->pipe_from_umh = from_umh[0];
-	umh_info->pid = task_pid_nr(current);
-	current->flags |= PF_UMH;
-	return 0;
-}
-
-static void umd_cleanup(struct subprocess_info *info)
-{
-	struct umh_info *umh_info = info->data;
-
-	/* cleanup if umh_setup() was successful but exec failed */
-	if (info->retval) {
-		fput(umh_info->pipe_to_umh);
-		fput(umh_info->pipe_from_umh);
-	}
-}
-
-/**
- * fork_usermode_blob - fork a blob of bytes as a usermode process
- * @data: a blob of bytes that can be do_execv-ed as a file
- * @len: length of the blob
- * @info: information about usermode process (shouldn't be NULL)
- *
- * If info->cmdline is set it will be used as command line for the
- * user process, else "usermodehelper" is used.
- *
- * Returns either negative error or zero which indicates success
- * in executing a blob of bytes as a usermode process. In such
- * case 'struct umh_info *info' is populated with two pipes
- * and a pid of the process. The caller is responsible for health
- * check of the user process, killing it via pid, and closing the
- * pipes when user process is no longer needed.
- */
-int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
-{
-	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
-	struct subprocess_info *sub_info;
-	char **argv = NULL;
-	struct file *file;
-	ssize_t written;
-	loff_t pos = 0;
-	int err;
-
-	file = shmem_kernel_file_setup("", len, 0);
-	if (IS_ERR(file))
-		return PTR_ERR(file);
-
-	written = kernel_write(file, data, len, &pos);
-	if (written != len) {
-		err = written;
-		if (err >= 0)
-			err = -ENOMEM;
-		goto out;
-	}
-
-	err = -ENOMEM;
-	argv = argv_split(GFP_KERNEL, cmdline, NULL);
-	if (!argv)
-		goto out;
-
-	sub_info = call_usermodehelper_setup("none", argv, NULL, GFP_KERNEL,
-					     umd_setup, umd_cleanup, info);
-	if (!sub_info)
-		goto out;
-
-	sub_info->file = file;
-	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
-	if (!err) {
-		mutex_lock(&umh_list_lock);
-		list_add(&info->list, &umh_list);
-		mutex_unlock(&umh_list_lock);
-	}
-out:
-	if (argv)
-		argv_free(argv);
-	fput(file);
-	return err;
-}
-EXPORT_SYMBOL_GPL(fork_usermode_blob);
-
 /**
  * call_usermodehelper_exec - start a usermode application
  * @sub_info: information about the subprocessa
@@ -678,26 +559,6 @@ static int proc_cap_handler(struct ctl_table *table, int write,
 	return 0;
 }
 
-void __exit_umh(struct task_struct *tsk)
-{
-	struct umh_info *info;
-	pid_t pid = tsk->pid;
-
-	mutex_lock(&umh_list_lock);
-	list_for_each_entry(info, &umh_list, list) {
-		if (info->pid == pid) {
-			list_del(&info->list);
-			mutex_unlock(&umh_list_lock);
-			goto out;
-		}
-	}
-	mutex_unlock(&umh_list_lock);
-	return;
-out:
-	if (info->cleanup)
-		info->cleanup(info);
-}
-
 struct ctl_table usermodehelper_table[] = {
 	{
 		.procname	= "bset",
diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
new file mode 100644
index 000000000000..5b05863af855
--- /dev/null
+++ b/kernel/usermode_driver.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * umd - User mode driver support
+ */
+#include <linux/shmem_fs.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/usermode_driver.h>
+
+static LIST_HEAD(umh_list);
+static DEFINE_MUTEX(umh_list_lock);
+
+static int umd_setup(struct subprocess_info *info, struct cred *new)
+{
+	struct umh_info *umh_info = info->data;
+	struct file *from_umh[2];
+	struct file *to_umh[2];
+	int err;
+
+	/* create pipe to send data to umh */
+	err = create_pipe_files(to_umh, 0);
+	if (err)
+		return err;
+	err = replace_fd(0, to_umh[0], 0);
+	fput(to_umh[0]);
+	if (err < 0) {
+		fput(to_umh[1]);
+		return err;
+	}
+
+	/* create pipe to receive data from umh */
+	err = create_pipe_files(from_umh, 0);
+	if (err) {
+		fput(to_umh[1]);
+		replace_fd(0, NULL, 0);
+		return err;
+	}
+	err = replace_fd(1, from_umh[1], 0);
+	fput(from_umh[1]);
+	if (err < 0) {
+		fput(to_umh[1]);
+		replace_fd(0, NULL, 0);
+		fput(from_umh[0]);
+		return err;
+	}
+
+	umh_info->pipe_to_umh = to_umh[1];
+	umh_info->pipe_from_umh = from_umh[0];
+	umh_info->pid = task_pid_nr(current);
+	current->flags |= PF_UMH;
+	return 0;
+}
+
+static void umd_cleanup(struct subprocess_info *info)
+{
+	struct umh_info *umh_info = info->data;
+
+	/* cleanup if umh_setup() was successful but exec failed */
+	if (info->retval) {
+		fput(umh_info->pipe_to_umh);
+		fput(umh_info->pipe_from_umh);
+	}
+}
+
+/**
+ * fork_usermode_blob - fork a blob of bytes as a usermode process
+ * @data: a blob of bytes that can be do_execv-ed as a file
+ * @len: length of the blob
+ * @info: information about usermode process (shouldn't be NULL)
+ *
+ * If info->cmdline is set it will be used as command line for the
+ * user process, else "usermodehelper" is used.
+ *
+ * Returns either negative error or zero which indicates success
+ * in executing a blob of bytes as a usermode process. In such
+ * case 'struct umh_info *info' is populated with two pipes
+ * and a pid of the process. The caller is responsible for health
+ * check of the user process, killing it via pid, and closing the
+ * pipes when user process is no longer needed.
+ */
+int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
+{
+	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
+	struct subprocess_info *sub_info;
+	char **argv = NULL;
+	struct file *file;
+	ssize_t written;
+	loff_t pos = 0;
+	int err;
+
+	file = shmem_kernel_file_setup("", len, 0);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	written = kernel_write(file, data, len, &pos);
+	if (written != len) {
+		err = written;
+		if (err >= 0)
+			err = -ENOMEM;
+		goto out;
+	}
+
+	err = -ENOMEM;
+	argv = argv_split(GFP_KERNEL, cmdline, NULL);
+	if (!argv)
+		goto out;
+
+	sub_info = call_usermodehelper_setup("none", argv, NULL, GFP_KERNEL,
+					     umd_setup, umd_cleanup, info);
+	if (!sub_info)
+		goto out;
+
+	sub_info->file = file;
+	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
+	if (!err) {
+		mutex_lock(&umh_list_lock);
+		list_add(&info->list, &umh_list);
+		mutex_unlock(&umh_list_lock);
+	}
+out:
+	if (argv)
+		argv_free(argv);
+	fput(file);
+	return err;
+}
+EXPORT_SYMBOL_GPL(fork_usermode_blob);
+
+void __exit_umh(struct task_struct *tsk)
+{
+	struct umh_info *info;
+	pid_t pid = tsk->pid;
+
+	mutex_lock(&umh_list_lock);
+	list_for_each_entry(info, &umh_list, list) {
+		if (info->pid == pid) {
+			list_del(&info->list);
+			mutex_unlock(&umh_list_lock);
+			goto out;
+		}
+	}
+	mutex_unlock(&umh_list_lock);
+	return;
+out:
+	if (info->cleanup)
+		info->cleanup(info);
+}
+
-- 
cgit v1.2.3


From 74be2d3b80af1bb264c3b9905b52c15efc03c0fe Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 26 Jun 2020 11:16:06 -0500
Subject: umd: For clarity rename umh_info umd_info

This structure is only used for user mode drivers so change
the prefix from umh to umd to make that clear.

v1: https://lkml.kernel.org/r/87o8p6f0kw.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/878sg563po.fsf_-_@x220.int.ebiederm.org
Link: https://lkml.kernel.org/r/20200702164140.4468-6-ebiederm@xmission.com
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/bpfilter.h        |  2 +-
 include/linux/usermode_driver.h |  6 +++---
 kernel/usermode_driver.c        | 20 ++++++++++----------
 net/ipv4/bpfilter/sockopt.c     |  2 +-
 4 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
index d6d6206052a6..ec9972d822e0 100644
--- a/include/linux/bpfilter.h
+++ b/include/linux/bpfilter.h
@@ -11,7 +11,7 @@ int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
 int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
 			    int __user *optlen);
 struct bpfilter_umh_ops {
-	struct umh_info info;
+	struct umd_info info;
 	/* since ip_getsockopt() can run in parallel, serialize access to umh */
 	struct mutex lock;
 	int (*sockopt)(struct sock *sk, int optname,
diff --git a/include/linux/usermode_driver.h b/include/linux/usermode_driver.h
index c5f6dc950227..7131ea611bab 100644
--- a/include/linux/usermode_driver.h
+++ b/include/linux/usermode_driver.h
@@ -17,14 +17,14 @@ static inline void exit_umh(struct task_struct *tsk)
 }
 #endif
 
-struct umh_info {
+struct umd_info {
 	const char *cmdline;
 	struct file *pipe_to_umh;
 	struct file *pipe_from_umh;
 	struct list_head list;
-	void (*cleanup)(struct umh_info *info);
+	void (*cleanup)(struct umd_info *info);
 	pid_t pid;
 };
-int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
+int fork_usermode_blob(void *data, size_t len, struct umd_info *info);
 
 #endif /* __LINUX_USERMODE_DRIVER_H__ */
diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
index 5b05863af855..e73550e946d6 100644
--- a/kernel/usermode_driver.c
+++ b/kernel/usermode_driver.c
@@ -11,7 +11,7 @@ static DEFINE_MUTEX(umh_list_lock);
 
 static int umd_setup(struct subprocess_info *info, struct cred *new)
 {
-	struct umh_info *umh_info = info->data;
+	struct umd_info *umd_info = info->data;
 	struct file *from_umh[2];
 	struct file *to_umh[2];
 	int err;
@@ -43,21 +43,21 @@ static int umd_setup(struct subprocess_info *info, struct cred *new)
 		return err;
 	}
 
-	umh_info->pipe_to_umh = to_umh[1];
-	umh_info->pipe_from_umh = from_umh[0];
-	umh_info->pid = task_pid_nr(current);
+	umd_info->pipe_to_umh = to_umh[1];
+	umd_info->pipe_from_umh = from_umh[0];
+	umd_info->pid = task_pid_nr(current);
 	current->flags |= PF_UMH;
 	return 0;
 }
 
 static void umd_cleanup(struct subprocess_info *info)
 {
-	struct umh_info *umh_info = info->data;
+	struct umd_info *umd_info = info->data;
 
 	/* cleanup if umh_setup() was successful but exec failed */
 	if (info->retval) {
-		fput(umh_info->pipe_to_umh);
-		fput(umh_info->pipe_from_umh);
+		fput(umd_info->pipe_to_umh);
+		fput(umd_info->pipe_from_umh);
 	}
 }
 
@@ -72,12 +72,12 @@ static void umd_cleanup(struct subprocess_info *info)
  *
  * Returns either negative error or zero which indicates success
  * in executing a blob of bytes as a usermode process. In such
- * case 'struct umh_info *info' is populated with two pipes
+ * case 'struct umd_info *info' is populated with two pipes
  * and a pid of the process. The caller is responsible for health
  * check of the user process, killing it via pid, and closing the
  * pipes when user process is no longer needed.
  */
-int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
+int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
 {
 	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
 	struct subprocess_info *sub_info;
@@ -126,7 +126,7 @@ EXPORT_SYMBOL_GPL(fork_usermode_blob);
 
 void __exit_umh(struct task_struct *tsk)
 {
-	struct umh_info *info;
+	struct umd_info *info;
 	pid_t pid = tsk->pid;
 
 	mutex_lock(&umh_list_lock);
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
index 0480918bfc7c..c0dbcc86fcdb 100644
--- a/net/ipv4/bpfilter/sockopt.c
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -12,7 +12,7 @@
 struct bpfilter_umh_ops bpfilter_ops;
 EXPORT_SYMBOL_GPL(bpfilter_ops);
 
-static void bpfilter_umh_cleanup(struct umh_info *info)
+static void bpfilter_umh_cleanup(struct umd_info *info)
 {
 	mutex_lock(&bpfilter_ops.lock);
 	bpfilter_ops.stop = true;
-- 
cgit v1.2.3


From 1199c6c3da5197e9924a906b9de71b8d0ac62a01 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 25 Jun 2020 11:38:08 -0500
Subject: umd: Rename umd_info.cmdline umd_info.driver_name

The only thing supplied in the cmdline today is the driver name so
rename the field to clarify the code.

As this value is always supplied stop trying to handle the case of
a NULL cmdline.

Additionally since we now have a name we can count on use the
driver_name any place where the code is looking for a name
of the binary.

v1: https://lkml.kernel.org/r/87imfef0k3.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/87366d63os.fsf_-_@x220.int.ebiederm.org
Link: https://lkml.kernel.org/r/20200702164140.4468-7-ebiederm@xmission.com
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/usermode_driver.h |  2 +-
 kernel/usermode_driver.c        | 11 ++++-------
 net/ipv4/bpfilter/sockopt.c     |  2 +-
 3 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/usermode_driver.h b/include/linux/usermode_driver.h
index 7131ea611bab..48cf25e3145d 100644
--- a/include/linux/usermode_driver.h
+++ b/include/linux/usermode_driver.h
@@ -18,7 +18,7 @@ static inline void exit_umh(struct task_struct *tsk)
 #endif
 
 struct umd_info {
-	const char *cmdline;
+	const char *driver_name;
 	struct file *pipe_to_umh;
 	struct file *pipe_from_umh;
 	struct list_head list;
diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
index e73550e946d6..46d60d855e93 100644
--- a/kernel/usermode_driver.c
+++ b/kernel/usermode_driver.c
@@ -67,9 +67,6 @@ static void umd_cleanup(struct subprocess_info *info)
  * @len: length of the blob
  * @info: information about usermode process (shouldn't be NULL)
  *
- * If info->cmdline is set it will be used as command line for the
- * user process, else "usermodehelper" is used.
- *
  * Returns either negative error or zero which indicates success
  * in executing a blob of bytes as a usermode process. In such
  * case 'struct umd_info *info' is populated with two pipes
@@ -79,7 +76,6 @@ static void umd_cleanup(struct subprocess_info *info)
  */
 int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
 {
-	const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper";
 	struct subprocess_info *sub_info;
 	char **argv = NULL;
 	struct file *file;
@@ -87,7 +83,7 @@ int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
 	loff_t pos = 0;
 	int err;
 
-	file = shmem_kernel_file_setup("", len, 0);
+	file = shmem_kernel_file_setup(info->driver_name, len, 0);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
@@ -100,11 +96,12 @@ int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
 	}
 
 	err = -ENOMEM;
-	argv = argv_split(GFP_KERNEL, cmdline, NULL);
+	argv = argv_split(GFP_KERNEL, info->driver_name, NULL);
 	if (!argv)
 		goto out;
 
-	sub_info = call_usermodehelper_setup("none", argv, NULL, GFP_KERNEL,
+	sub_info = call_usermodehelper_setup(info->driver_name, argv, NULL,
+					     GFP_KERNEL,
 					     umd_setup, umd_cleanup, info);
 	if (!sub_info)
 		goto out;
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
index c0dbcc86fcdb..5050de28333d 100644
--- a/net/ipv4/bpfilter/sockopt.c
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -70,7 +70,7 @@ static int __init bpfilter_sockopt_init(void)
 {
 	mutex_init(&bpfilter_ops.lock);
 	bpfilter_ops.stop = true;
-	bpfilter_ops.info.cmdline = "bpfilter_umh";
+	bpfilter_ops.info.driver_name = "bpfilter_umh";
 	bpfilter_ops.info.cleanup = &bpfilter_umh_cleanup;
 
 	return 0;
-- 
cgit v1.2.3


From e2dc9bf3f5275ca372001541e5f26af572976e65 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 25 Jun 2020 13:12:59 -0500
Subject: umd: Transform fork_usermode_blob into fork_usermode_driver

Instead of loading a binary blob into a temporary file with
shmem_kernel_file_setup load a binary blob into a temporary tmpfs
filesystem.  This means that the blob can be stored in an init section
and discared, and it means the binary blob will have a filename so can
be executed normally.

The only tricky thing about this code is that in the helper function
blob_to_mnt __fput_sync is used.  That is because a file can not be
executed if it is still open for write, and the ordinary delayed close
for kernel threads does not happen soon enough, which causes the
following exec to fail.  The function umd_load_blob is not called with
any locks so this should be safe.

Executing the blob normally winds up correcting several problems with
the user mode driver code discovered by Tetsuo Handa[1].  By passing
an ordinary filename into the exec, it is no longer necessary to
figure out how to turn a O_RDWR file descriptor into a properly
referende counted O_EXEC file descriptor that forbids all writes.  For
path based LSMs there are no new special cases.

[1] https://lore.kernel.org/linux-fsdevel/2a8775b4-1dd5-9d5c-aa42-9872445e0942@i-love.sakura.ne.jp/
v1: https://lkml.kernel.org/r/87d05mf0j9.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/87wo3p4p35.fsf_-_@x220.int.ebiederm.org
Link: https://lkml.kernel.org/r/20200702164140.4468-8-ebiederm@xmission.com
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/usermode_driver.h |   6 +-
 kernel/usermode_driver.c        | 126 +++++++++++++++++++++++++++++++---------
 net/bpfilter/bpfilter_kern.c    |  14 ++++-
 3 files changed, 113 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/usermode_driver.h b/include/linux/usermode_driver.h
index 48cf25e3145d..97c919b7147c 100644
--- a/include/linux/usermode_driver.h
+++ b/include/linux/usermode_driver.h
@@ -2,6 +2,7 @@
 #define __LINUX_USERMODE_DRIVER_H__
 
 #include <linux/umh.h>
+#include <linux/path.h>
 
 #ifdef CONFIG_BPFILTER
 void __exit_umh(struct task_struct *tsk);
@@ -23,8 +24,11 @@ struct umd_info {
 	struct file *pipe_from_umh;
 	struct list_head list;
 	void (*cleanup)(struct umd_info *info);
+	struct path wd;
 	pid_t pid;
 };
-int fork_usermode_blob(void *data, size_t len, struct umd_info *info);
+int umd_load_blob(struct umd_info *info, const void *data, size_t len);
+int umd_unload_blob(struct umd_info *info);
+int fork_usermode_driver(struct umd_info *info);
 
 #endif /* __LINUX_USERMODE_DRIVER_H__ */
diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
index 46d60d855e93..a86798759f83 100644
--- a/kernel/usermode_driver.c
+++ b/kernel/usermode_driver.c
@@ -4,11 +4,98 @@
  */
 #include <linux/shmem_fs.h>
 #include <linux/pipe_fs_i.h>
+#include <linux/mount.h>
+#include <linux/fs_struct.h>
+#include <linux/task_work.h>
 #include <linux/usermode_driver.h>
 
 static LIST_HEAD(umh_list);
 static DEFINE_MUTEX(umh_list_lock);
 
+static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name)
+{
+	struct file_system_type *type;
+	struct vfsmount *mnt;
+	struct file *file;
+	ssize_t written;
+	loff_t pos = 0;
+
+	type = get_fs_type("tmpfs");
+	if (!type)
+		return ERR_PTR(-ENODEV);
+
+	mnt = kern_mount(type);
+	put_filesystem(type);
+	if (IS_ERR(mnt))
+		return mnt;
+
+	file = file_open_root(mnt->mnt_root, mnt, name, O_CREAT | O_WRONLY, 0700);
+	if (IS_ERR(file)) {
+		mntput(mnt);
+		return ERR_CAST(file);
+	}
+
+	written = kernel_write(file, data, len, &pos);
+	if (written != len) {
+		int err = written;
+		if (err >= 0)
+			err = -ENOMEM;
+		filp_close(file, NULL);
+		mntput(mnt);
+		return ERR_PTR(err);
+	}
+
+	fput(file);
+
+	/* Flush delayed fput so exec can open the file read-only */
+	flush_delayed_fput();
+	task_work_run();
+	return mnt;
+}
+
+/**
+ * umd_load_blob - Remember a blob of bytes for fork_usermode_driver
+ * @info: information about usermode driver
+ * @data: a blob of bytes that can be executed as a file
+ * @len:  The lentgh of the blob
+ *
+ */
+int umd_load_blob(struct umd_info *info, const void *data, size_t len)
+{
+	struct vfsmount *mnt;
+
+	if (WARN_ON_ONCE(info->wd.dentry || info->wd.mnt))
+		return -EBUSY;
+
+	mnt = blob_to_mnt(data, len, info->driver_name);
+	if (IS_ERR(mnt))
+		return PTR_ERR(mnt);
+
+	info->wd.mnt = mnt;
+	info->wd.dentry = mnt->mnt_root;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(umd_load_blob);
+
+/**
+ * umd_unload_blob - Disassociate @info from a previously loaded blob
+ * @info: information about usermode driver
+ *
+ */
+int umd_unload_blob(struct umd_info *info)
+{
+	if (WARN_ON_ONCE(!info->wd.mnt ||
+			 !info->wd.dentry ||
+			 info->wd.mnt->mnt_root != info->wd.dentry))
+		return -EINVAL;
+
+	kern_unmount(info->wd.mnt);
+	info->wd.mnt = NULL;
+	info->wd.dentry = NULL;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(umd_unload_blob);
+
 static int umd_setup(struct subprocess_info *info, struct cred *new)
 {
 	struct umd_info *umd_info = info->data;
@@ -43,6 +130,7 @@ static int umd_setup(struct subprocess_info *info, struct cred *new)
 		return err;
 	}
 
+	set_fs_pwd(current->fs, &umd_info->wd);
 	umd_info->pipe_to_umh = to_umh[1];
 	umd_info->pipe_from_umh = from_umh[0];
 	umd_info->pid = task_pid_nr(current);
@@ -62,39 +150,21 @@ static void umd_cleanup(struct subprocess_info *info)
 }
 
 /**
- * fork_usermode_blob - fork a blob of bytes as a usermode process
- * @data: a blob of bytes that can be do_execv-ed as a file
- * @len: length of the blob
- * @info: information about usermode process (shouldn't be NULL)
+ * fork_usermode_driver - fork a usermode driver
+ * @info: information about usermode driver (shouldn't be NULL)
  *
- * Returns either negative error or zero which indicates success
- * in executing a blob of bytes as a usermode process. In such
- * case 'struct umd_info *info' is populated with two pipes
- * and a pid of the process. The caller is responsible for health
- * check of the user process, killing it via pid, and closing the
- * pipes when user process is no longer needed.
+ * Returns either negative error or zero which indicates success in
+ * executing a usermode driver. In such case 'struct umd_info *info'
+ * is populated with two pipes and a pid of the process. The caller is
+ * responsible for health check of the user process, killing it via
+ * pid, and closing the pipes when user process is no longer needed.
  */
-int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
+int fork_usermode_driver(struct umd_info *info)
 {
 	struct subprocess_info *sub_info;
 	char **argv = NULL;
-	struct file *file;
-	ssize_t written;
-	loff_t pos = 0;
 	int err;
 
-	file = shmem_kernel_file_setup(info->driver_name, len, 0);
-	if (IS_ERR(file))
-		return PTR_ERR(file);
-
-	written = kernel_write(file, data, len, &pos);
-	if (written != len) {
-		err = written;
-		if (err >= 0)
-			err = -ENOMEM;
-		goto out;
-	}
-
 	err = -ENOMEM;
 	argv = argv_split(GFP_KERNEL, info->driver_name, NULL);
 	if (!argv)
@@ -106,7 +176,6 @@ int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
 	if (!sub_info)
 		goto out;
 
-	sub_info->file = file;
 	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
 	if (!err) {
 		mutex_lock(&umh_list_lock);
@@ -116,10 +185,9 @@ int fork_usermode_blob(void *data, size_t len, struct umd_info *info)
 out:
 	if (argv)
 		argv_free(argv);
-	fput(file);
 	return err;
 }
-EXPORT_SYMBOL_GPL(fork_usermode_blob);
+EXPORT_SYMBOL_GPL(fork_usermode_driver);
 
 void __exit_umh(struct task_struct *tsk)
 {
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
index c0f0990f30b6..28883b00609d 100644
--- a/net/bpfilter/bpfilter_kern.c
+++ b/net/bpfilter/bpfilter_kern.c
@@ -77,9 +77,7 @@ static int start_umh(void)
 	int err;
 
 	/* fork usermode process */
-	err = fork_usermode_blob(&bpfilter_umh_start,
-				 &bpfilter_umh_end - &bpfilter_umh_start,
-				 &bpfilter_ops.info);
+	err = fork_usermode_driver(&bpfilter_ops.info);
 	if (err)
 		return err;
 	bpfilter_ops.stop = false;
@@ -98,6 +96,12 @@ static int __init load_umh(void)
 {
 	int err;
 
+	err = umd_load_blob(&bpfilter_ops.info,
+			    &bpfilter_umh_start,
+			    &bpfilter_umh_end - &bpfilter_umh_start);
+	if (err)
+		return err;
+
 	mutex_lock(&bpfilter_ops.lock);
 	if (!bpfilter_ops.stop) {
 		err = -EFAULT;
@@ -110,6 +114,8 @@ static int __init load_umh(void)
 	}
 out:
 	mutex_unlock(&bpfilter_ops.lock);
+	if (err)
+		umd_unload_blob(&bpfilter_ops.info);
 	return err;
 }
 
@@ -122,6 +128,8 @@ static void __exit fini_umh(void)
 		bpfilter_ops.sockopt = NULL;
 	}
 	mutex_unlock(&bpfilter_ops.lock);
+
+	umd_unload_blob(&bpfilter_ops.info);
 }
 module_init(load_umh);
 module_exit(fini_umh);
-- 
cgit v1.2.3


From 55e6074e3fa67e1fb9ec140904db7e6cae6eda4b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 25 Jun 2020 13:52:50 -0500
Subject: umh: Stop calling do_execve_file

With the user mode driver code changed to not set subprocess_info.file
there are no more users of subproces_info.file.  Remove this field
from struct subprocess_info and remove the only user in
call_usermodehelper_exec_async that would call do_execve_file instead
of do_execve if file was set.

v1: https://lkml.kernel.org/r/877dvuf0i7.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/87r1tx4p2a.fsf_-_@x220.int.ebiederm.org
Link: https://lkml.kernel.org/r/20200702164140.4468-9-ebiederm@xmission.com
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/umh.h |  1 -
 kernel/umh.c        | 10 +++-------
 2 files changed, 3 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/umh.h b/include/linux/umh.h
index 73173c4a07e5..244aff638220 100644
--- a/include/linux/umh.h
+++ b/include/linux/umh.h
@@ -22,7 +22,6 @@ struct subprocess_info {
 	const char *path;
 	char **argv;
 	char **envp;
-	struct file *file;
 	int wait;
 	int retval;
 	int (*init)(struct subprocess_info *info, struct cred *new);
diff --git a/kernel/umh.c b/kernel/umh.c
index 3e4e453d45c8..6ca2096298b9 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -98,13 +98,9 @@ static int call_usermodehelper_exec_async(void *data)
 
 	commit_creds(new);
 
-	if (sub_info->file)
-		retval = do_execve_file(sub_info->file,
-					sub_info->argv, sub_info->envp);
-	else
-		retval = do_execve(getname_kernel(sub_info->path),
-				   (const char __user *const __user *)sub_info->argv,
-				   (const char __user *const __user *)sub_info->envp);
+	retval = do_execve(getname_kernel(sub_info->path),
+			   (const char __user *const __user *)sub_info->argv,
+			   (const char __user *const __user *)sub_info->envp);
 out:
 	sub_info->retval = retval;
 	/*
-- 
cgit v1.2.3


From 1c340ead18ee4b4a84357abdef6d4f39ee08328b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 25 Jun 2020 16:48:26 -0500
Subject: umd: Track user space drivers with struct pid

Use struct pid instead of user space pid values that are prone to wrap
araound.

In addition track the entire thread group instead of just the first
thread that is started by exec.  There are no multi-threaded user mode
drivers today but there is nothing preclucing user drivers from being
multi-threaded, so it is just a good idea to track the entire process.

Take a reference count on the tgid's in question to make it possible
to remove exit_umh in a future change.

As a struct pid is available directly use kill_pid_info.

The prior process signalling code was iffy in using a userspace pid
known to be in the initial pid namespace and then looking up it's task
in whatever the current pid namespace is.  It worked only because
kernel threads always run in the initial pid namespace.

As the tgid is now refcounted verify the tgid is NULL at the start of
fork_usermode_driver to avoid the possibility of silent pid leaks.

v1: https://lkml.kernel.org/r/87mu4qdlv2.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/a70l4oy8.fsf_-_@x220.int.ebiederm.org
Link: https://lkml.kernel.org/r/20200702164140.4468-12-ebiederm@xmission.com
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/usermode_driver.h |  2 +-
 kernel/exit.c                   |  3 ++-
 kernel/usermode_driver.c        | 15 ++++++++++-----
 net/bpfilter/bpfilter_kern.c    | 13 +++++--------
 net/ipv4/bpfilter/sockopt.c     |  3 ++-
 5 files changed, 20 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/usermode_driver.h b/include/linux/usermode_driver.h
index 97c919b7147c..45adbffb31d9 100644
--- a/include/linux/usermode_driver.h
+++ b/include/linux/usermode_driver.h
@@ -25,7 +25,7 @@ struct umd_info {
 	struct list_head list;
 	void (*cleanup)(struct umd_info *info);
 	struct path wd;
-	pid_t pid;
+	struct pid *tgid;
 };
 int umd_load_blob(struct umd_info *info, const void *data, size_t len);
 int umd_unload_blob(struct umd_info *info);
diff --git a/kernel/exit.c b/kernel/exit.c
index a081deea52ca..d3294b611df1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -805,7 +805,8 @@ void __noreturn do_exit(long code)
 	exit_task_namespaces(tsk);
 	exit_task_work(tsk);
 	exit_thread(tsk);
-	exit_umh(tsk);
+	if (group_dead)
+		exit_umh(tsk);
 
 	/*
 	 * Flush inherited counters to the parent - before the parent
diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
index a86798759f83..f77f8d7ce9e3 100644
--- a/kernel/usermode_driver.c
+++ b/kernel/usermode_driver.c
@@ -133,7 +133,7 @@ static int umd_setup(struct subprocess_info *info, struct cred *new)
 	set_fs_pwd(current->fs, &umd_info->wd);
 	umd_info->pipe_to_umh = to_umh[1];
 	umd_info->pipe_from_umh = from_umh[0];
-	umd_info->pid = task_pid_nr(current);
+	umd_info->tgid = get_pid(task_tgid(current));
 	current->flags |= PF_UMH;
 	return 0;
 }
@@ -146,6 +146,8 @@ static void umd_cleanup(struct subprocess_info *info)
 	if (info->retval) {
 		fput(umd_info->pipe_to_umh);
 		fput(umd_info->pipe_from_umh);
+		put_pid(umd_info->tgid);
+		umd_info->tgid = NULL;
 	}
 }
 
@@ -155,9 +157,9 @@ static void umd_cleanup(struct subprocess_info *info)
  *
  * Returns either negative error or zero which indicates success in
  * executing a usermode driver. In such case 'struct umd_info *info'
- * is populated with two pipes and a pid of the process. The caller is
+ * is populated with two pipes and a tgid of the process. The caller is
  * responsible for health check of the user process, killing it via
- * pid, and closing the pipes when user process is no longer needed.
+ * tgid, and closing the pipes when user process is no longer needed.
  */
 int fork_usermode_driver(struct umd_info *info)
 {
@@ -165,6 +167,9 @@ int fork_usermode_driver(struct umd_info *info)
 	char **argv = NULL;
 	int err;
 
+	if (WARN_ON_ONCE(info->tgid))
+		return -EBUSY;
+
 	err = -ENOMEM;
 	argv = argv_split(GFP_KERNEL, info->driver_name, NULL);
 	if (!argv)
@@ -192,11 +197,11 @@ EXPORT_SYMBOL_GPL(fork_usermode_driver);
 void __exit_umh(struct task_struct *tsk)
 {
 	struct umd_info *info;
-	pid_t pid = tsk->pid;
+	struct pid *tgid = task_tgid(tsk);
 
 	mutex_lock(&umh_list_lock);
 	list_for_each_entry(info, &umh_list, list) {
-		if (info->pid == pid) {
+		if (info->tgid == tgid) {
 			list_del(&info->list);
 			mutex_unlock(&umh_list_lock);
 			goto out;
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
index 28883b00609d..08ea77c2b137 100644
--- a/net/bpfilter/bpfilter_kern.c
+++ b/net/bpfilter/bpfilter_kern.c
@@ -15,16 +15,13 @@ extern char bpfilter_umh_end;
 
 static void shutdown_umh(void)
 {
-	struct task_struct *tsk;
+	struct umd_info *info = &bpfilter_ops.info;
+	struct pid *tgid = info->tgid;
 
 	if (bpfilter_ops.stop)
 		return;
 
-	tsk = get_pid_task(find_vpid(bpfilter_ops.info.pid), PIDTYPE_PID);
-	if (tsk) {
-		send_sig(SIGKILL, tsk, 1);
-		put_task_struct(tsk);
-	}
+	kill_pid(tgid, SIGKILL, 1);
 }
 
 static void __stop_umh(void)
@@ -48,7 +45,7 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname,
 	req.cmd = optname;
 	req.addr = (long __force __user)optval;
 	req.len = optlen;
-	if (!bpfilter_ops.info.pid)
+	if (!bpfilter_ops.info.tgid)
 		goto out;
 	n = __kernel_write(bpfilter_ops.info.pipe_to_umh, &req, sizeof(req),
 			   &pos);
@@ -81,7 +78,7 @@ static int start_umh(void)
 	if (err)
 		return err;
 	bpfilter_ops.stop = false;
-	pr_info("Loaded bpfilter_umh pid %d\n", bpfilter_ops.info.pid);
+	pr_info("Loaded bpfilter_umh pid %d\n", pid_nr(bpfilter_ops.info.tgid));
 
 	/* health check that usermode process started correctly */
 	if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, 0) != 0) {
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
index 5050de28333d..56cbc43145f6 100644
--- a/net/ipv4/bpfilter/sockopt.c
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -18,7 +18,8 @@ static void bpfilter_umh_cleanup(struct umd_info *info)
 	bpfilter_ops.stop = true;
 	fput(info->pipe_to_umh);
 	fput(info->pipe_from_umh);
-	info->pid = 0;
+	put_pid(info->tgid);
+	info->tgid = NULL;
 	mutex_unlock(&bpfilter_ops.lock);
 }
 
-- 
cgit v1.2.3


From ff2a91127b374c75ae024b31d22f23ad49d16eb4 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Sun, 24 May 2020 20:57:00 +0200
Subject: fork: remove do_fork()

Now that all architectures have been switched to use _do_fork() and the new
struct kernel_clone_args calling convention we can remove the legacy
do_fork() helper completely. The calling convention used to be brittle and
do_fork() didn't buy us anything. The only calling convention accepted
should be based on struct kernel_clone_args going forward. It's cleaner and
uniform.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 include/linux/sched/task.h |  1 -
 kernel/fork.c              | 25 +------------------------
 2 files changed, 1 insertion(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index ddce0ea515d1..9f03c44941fb 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -96,7 +96,6 @@ extern void exit_files(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
 
 extern long _do_fork(struct kernel_clone_args *kargs);
-extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
 struct mm_struct *copy_init_mm(void);
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
diff --git a/kernel/fork.c b/kernel/fork.c
index 9875aeb2ba41..0fd7eb1b38f9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2493,29 +2493,6 @@ long _do_fork(struct kernel_clone_args *args)
 	return nr;
 }
 
-#ifndef CONFIG_HAVE_COPY_THREAD_TLS
-/* For compatibility with architectures that call do_fork directly rather than
- * using the syscall entry points below. */
-long do_fork(unsigned long clone_flags,
-	      unsigned long stack_start,
-	      unsigned long stack_size,
-	      int __user *parent_tidptr,
-	      int __user *child_tidptr)
-{
-	struct kernel_clone_args args = {
-		.flags		= (lower_32_bits(clone_flags) & ~CSIGNAL),
-		.pidfd		= parent_tidptr,
-		.child_tid	= child_tidptr,
-		.parent_tid	= parent_tidptr,
-		.exit_signal	= (lower_32_bits(clone_flags) & CSIGNAL),
-		.stack		= stack_start,
-		.stack_size	= stack_size,
-	};
-
-	return _do_fork(&args);
-}
-#endif
-
 /*
  * Create a kernel thread.
  */
@@ -2923,7 +2900,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
 /*
  * unshare allows a process to 'unshare' part of the process
  * context which was originally shared using clone.  copy_*
- * functions used by do_fork() cannot be used here directly
+ * functions used by _do_fork() cannot be used here directly
  * because they modify an inactive task_struct that is being
  * constructed. Here we are modifying the current, active,
  * task_struct.
-- 
cgit v1.2.3


From 140c8180eb7c7cbda399f64474788b86db72db32 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Sun, 24 May 2020 23:34:20 +0200
Subject: arch: remove HAVE_COPY_THREAD_TLS

All architectures support copy_thread_tls() now, so remove the legacy
copy_thread() function and the HAVE_COPY_THREAD_TLS config option. Everyone
uses the same process creation calling convention based on
copy_thread_tls() and struct kernel_clone_args. This will make it easier to
maintain the core process creation code under kernel/, simplifies the
callpaths and makes the identical for all architectures.

Cc: linux-arch@vger.kernel.org
Acked-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Acked-by: Greentime Hu <green.hu@gmail.com>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 arch/Kconfig               |  7 -------
 arch/alpha/Kconfig         |  1 -
 arch/arc/Kconfig           |  1 -
 arch/arm/Kconfig           |  1 -
 arch/arm64/Kconfig         |  1 -
 arch/c6x/Kconfig           |  1 -
 arch/csky/Kconfig          |  1 -
 arch/h8300/Kconfig         |  1 -
 arch/hexagon/Kconfig       |  1 -
 arch/ia64/Kconfig          |  1 -
 arch/m68k/Kconfig          |  1 -
 arch/microblaze/Kconfig    |  1 -
 arch/mips/Kconfig          |  1 -
 arch/nds32/Kconfig         |  1 -
 arch/nios2/Kconfig         |  1 -
 arch/openrisc/Kconfig      |  1 -
 arch/parisc/Kconfig        |  1 -
 arch/powerpc/Kconfig       |  1 -
 arch/riscv/Kconfig         |  1 -
 arch/s390/Kconfig          |  1 -
 arch/sh/Kconfig            |  1 -
 arch/sparc/Kconfig         |  1 -
 arch/um/Kconfig            |  1 -
 arch/unicore32/Kconfig     |  1 -
 arch/x86/Kconfig           |  1 -
 arch/xtensa/Kconfig        |  1 -
 include/linux/sched/task.h | 15 +--------------
 kernel/fork.c              |  9 ---------
 28 files changed, 1 insertion(+), 55 deletions(-)

(limited to 'kernel')

diff --git a/arch/Kconfig b/arch/Kconfig
index 8cc35dc556c7..943aac2f3ebe 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -754,13 +754,6 @@ config ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
 	depends on MMU
 	select ARCH_HAS_ELF_RANDOMIZE
 
-config HAVE_COPY_THREAD_TLS
-	bool
-	help
-	  Architecture provides copy_thread_tls to accept tls argument via
-	  normal C parameter passing, rather than extracting the syscall
-	  argument from pt_regs.
-
 config HAVE_STACK_VALIDATION
 	bool
 	help
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index b01515c6b2ed..10862c5a8c76 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -38,7 +38,6 @@ config ALPHA
 	select OLD_SIGSUSPEND
 	select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67
 	select MMU_GATHER_NO_RANGE
-	select HAVE_COPY_THREAD_TLS
 	help
 	  The Alpha is a 64-bit general-purpose processor designed and
 	  marketed by the Digital Equipment Corporation of blessed memory,
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index fddc70029727..1fa0b98ed9ce 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -29,7 +29,6 @@ config ARC
 	select GENERIC_SMP_IDLE_THREAD
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_TRACEHOOK
-	select HAVE_COPY_THREAD_TLS
 	select HAVE_DEBUG_STACKOVERFLOW
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_FUTEX_CMPXCHG if FUTEX
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 2ac74904a3ce..445b5ed693f0 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -72,7 +72,6 @@ config ARM
 	select HAVE_ARM_SMCCC if CPU_V7
 	select HAVE_EBPF_JIT if !CPU_ENDIAN_BE32
 	select HAVE_CONTEXT_TRACKING
-	select HAVE_COPY_THREAD_TLS
 	select HAVE_C_RECORDMCOUNT
 	select HAVE_DEBUG_KMEMLEAK if !XIP_KERNEL
 	select HAVE_DMA_CONTIGUOUS if MMU
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a4a094bedcb2..de93e965727d 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -148,7 +148,6 @@ config ARM64
 	select HAVE_CMPXCHG_DOUBLE
 	select HAVE_CMPXCHG_LOCAL
 	select HAVE_CONTEXT_TRACKING
-	select HAVE_COPY_THREAD_TLS
 	select HAVE_DEBUG_BUGVERBOSE
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DMA_CONTIGUOUS
diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig
index 9cde76a5928e..6444ebfd06a6 100644
--- a/arch/c6x/Kconfig
+++ b/arch/c6x/Kconfig
@@ -22,7 +22,6 @@ config C6X
 	select GENERIC_CLOCKEVENTS
 	select MODULES_USE_ELF_RELA
 	select MMU_GATHER_NO_RANGE if MMU
-	select HAVE_COPY_THREAD_TLS
 
 config MMU
 	def_bool n
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index bd31ab12f77d..902f1142d550 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -38,7 +38,6 @@ config CSKY
 	select GX6605S_TIMER if CPU_CK610
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_AUDITSYSCALL
-	select HAVE_COPY_THREAD_TLS
 	select HAVE_DEBUG_BUGVERBOSE
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index de0eb417a0b9..d11666d538fe 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -26,7 +26,6 @@ config H8300
 	select HAVE_ARCH_HASH
 	select CPU_NO_EFFICIENT_FFS
 	select UACCESS_MEMCPY
-	select HAVE_COPY_THREAD_TLS
 
 config CPU_BIG_ENDIAN
 	def_bool y
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 19bc2f2ee331..667cfc511cf9 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -31,7 +31,6 @@ config HEXAGON
 	select GENERIC_CLOCKEVENTS_BROADCAST
 	select MODULES_USE_ELF_RELA
 	select GENERIC_CPU_DEVICES
-	select HAVE_COPY_THREAD_TLS
 	help
 	  Qualcomm Hexagon is a processor architecture designed for high
 	  performance and low power across a wide variety of applications.
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 1b6034b89a04..1fa2fe2ef053 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -55,7 +55,6 @@ config IA64
 	select HAVE_ARCH_AUDITSYSCALL
 	select NEED_DMA_MAP_STATE
 	select NEED_SG_DMA_LENGTH
-	select HAVE_COPY_THREAD_TLS
 	select NUMA if !FLATMEM
 	default y
 	help
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 6ad6cdac74b3..6663f1741798 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -14,7 +14,6 @@ config M68K
 	select HAVE_AOUT if MMU
 	select HAVE_ASM_MODVERSIONS
 	select HAVE_DEBUG_BUGVERBOSE
-	select HAVE_COPY_THREAD_TLS
 	select GENERIC_IRQ_SHOW
 	select GENERIC_ATOMIC64
 	select HAVE_UID16
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index e3a211a41880..d262ac0c8714 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -46,7 +46,6 @@ config MICROBLAZE
 	select CPU_NO_EFFICIENT_FFS
 	select MMU_GATHER_NO_RANGE if MMU
 	select SPARSE_IRQ
-	select HAVE_COPY_THREAD_TLS
 
 # Endianness selection
 choice
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 6fee1a133e9d..ca92c3ed2dc5 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -51,7 +51,6 @@ config MIPS
 	select HAVE_CBPF_JIT if !64BIT && !CPU_MICROMIPS
 	select HAVE_CONTEXT_TRACKING
 	select HAVE_TIF_NOHZ
-	select HAVE_COPY_THREAD_TLS
 	select HAVE_C_RECORDMCOUNT
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DEBUG_STACKOVERFLOW
diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig
index 7b6eaca81cce..e30298e99e1b 100644
--- a/arch/nds32/Kconfig
+++ b/arch/nds32/Kconfig
@@ -48,7 +48,6 @@ config NDS32
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_DYNAMIC_FTRACE
-	select HAVE_COPY_THREAD_TLS
 	help
 	  Andes(nds32) Linux support.
 
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index f9a05957a883..c6645141bb2a 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -27,7 +27,6 @@ config NIOS2
 	select USB_ARCH_HAS_HCD if USB_SUPPORT
 	select CPU_NO_EFFICIENT_FFS
 	select MMU_GATHER_NO_RANGE if MMU
-	select HAVE_COPY_THREAD_TLS
 
 config GENERIC_CSUM
 	def_bool y
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index 8588996165ae..7e94fe37cb2f 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -16,7 +16,6 @@ config OPENRISC
 	select HANDLE_DOMAIN_IRQ
 	select GPIOLIB
 	select HAVE_ARCH_TRACEHOOK
-	select HAVE_COPY_THREAD_TLS
 	select SPARSE_IRQ
 	select GENERIC_IRQ_CHIP
 	select GENERIC_IRQ_PROBE
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 8e4c3708773d..2667eeb6c6f1 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -62,7 +62,6 @@ config PARISC
 	select HAVE_FTRACE_MCOUNT_RECORD if HAVE_DYNAMIC_FTRACE
 	select HAVE_KPROBES_ON_FTRACE
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS
-	select HAVE_COPY_THREAD_TLS
 
 	help
 	  The PA-RISC microprocessor is designed by Hewlett-Packard and used
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 9fa23eb320ff..3b262d87e9c4 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -186,7 +186,6 @@ config PPC
 	select HAVE_STACKPROTECTOR		if PPC32 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2)
 	select HAVE_CONTEXT_TRACKING		if PPC64
 	select HAVE_TIF_NOHZ			if PPC64
-	select HAVE_COPY_THREAD_TLS
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DEBUG_STACKOVERFLOW
 	select HAVE_DYNAMIC_FTRACE
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 128192e14ff2..f6a3a2bea3d8 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -52,7 +52,6 @@ config RISCV
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ASM_MODVERSIONS
-	select HAVE_COPY_THREAD_TLS
 	select HAVE_DMA_CONTIGUOUS if MMU
 	select HAVE_EBPF_JIT if MMU
 	select HAVE_FUTEX_CMPXCHG if FUTEX
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index c7d7ede6300c..959969759453 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -136,7 +136,6 @@ config S390
 	select HAVE_EBPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES
 	select HAVE_CMPXCHG_DOUBLE
 	select HAVE_CMPXCHG_LOCAL
-	select HAVE_COPY_THREAD_TLS
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_DYNAMIC_FTRACE
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index e10118d61ce7..9fc2b010e938 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -70,7 +70,6 @@ config SUPERH
 	select ARCH_HIBERNATION_POSSIBLE if MMU
 	select SPARSE_IRQ
 	select HAVE_STACKPROTECTOR
-	select HAVE_COPY_THREAD_TLS
 	help
 	  The SuperH is a RISC processor targeted for use in embedded systems
 	  and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 66213c0cb557..5bf2dc163540 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -48,7 +48,6 @@ config SPARC
 	select LOCKDEP_SMALL if LOCKDEP
 	select NEED_DMA_MAP_STATE
 	select NEED_SG_DMA_LENGTH
-	select HAVE_COPY_THREAD_TLS
 
 config SPARC32
 	def_bool !64BIT
diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index 9318dc6d1a0c..ef69be17ff70 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -14,7 +14,6 @@ config UML
 	select HAVE_FUTEX_CMPXCHG if FUTEX
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DEBUG_BUGVERBOSE
-	select HAVE_COPY_THREAD_TLS
 	select GENERIC_IRQ_SHOW
 	select GENERIC_CPU_DEVICES
 	select GENERIC_CLOCKEVENTS
diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig
index 01451cf500d2..11ba1839d198 100644
--- a/arch/unicore32/Kconfig
+++ b/arch/unicore32/Kconfig
@@ -22,7 +22,6 @@ config UNICORE32
 	select MODULES_USE_ELF_REL
 	select NEED_DMA_MAP_STATE
 	select MMU_GATHER_NO_RANGE if MMU
-	select HAVE_COPY_THREAD_TLS
 	help
 	  UniCore-32 is 32-bit Instruction Set Architecture,
 	  including a series of low-power-consumption RISC chip
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6a0cc524882d..214b8bf39bbe 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -161,7 +161,6 @@ config X86
 	select HAVE_CMPXCHG_DOUBLE
 	select HAVE_CMPXCHG_LOCAL
 	select HAVE_CONTEXT_TRACKING		if X86_64
-	select HAVE_COPY_THREAD_TLS
 	select HAVE_C_RECORDMCOUNT
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DMA_CONTIGUOUS
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 3a9f1e80394a..b71ba910d92f 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -24,7 +24,6 @@ config XTENSA
 	select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL
 	select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL
 	select HAVE_ARCH_TRACEHOOK
-	select HAVE_COPY_THREAD_TLS
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_EXIT_THREAD
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 9f03c44941fb..77cbe14c3034 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -65,22 +65,9 @@ extern void fork_init(void);
 
 extern void release_task(struct task_struct * p);
 
-#ifdef CONFIG_HAVE_COPY_THREAD_TLS
 extern int copy_thread_tls(unsigned long, unsigned long, unsigned long,
 			struct task_struct *, unsigned long);
-#else
-extern int copy_thread(unsigned long, unsigned long, unsigned long,
-			struct task_struct *);
-
-/* Architectures that haven't opted into copy_thread_tls get the tls argument
- * via pt_regs, so ignore the tls argument passed via C. */
-static inline int copy_thread_tls(
-		unsigned long clone_flags, unsigned long sp, unsigned long arg,
-		struct task_struct *p, unsigned long tls)
-{
-	return copy_thread(clone_flags, sp, arg, p);
-}
-#endif
+
 extern void flush_thread(void);
 
 #ifdef CONFIG_HAVE_EXIT_THREAD
diff --git a/kernel/fork.c b/kernel/fork.c
index 0fd7eb1b38f9..8e52e16a1b5e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2577,15 +2577,6 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
 
 #ifdef __ARCH_WANT_SYS_CLONE3
 
-/*
- * copy_thread implementations handle CLONE_SETTLS by reading the TLS value from
- * the registers containing the syscall arguments for clone. This doesn't work
- * with clone3 since the TLS value is passed in clone_args instead.
- */
-#ifndef CONFIG_HAVE_COPY_THREAD_TLS
-#error clone3 requires copy_thread_tls support in arch
-#endif
-
 noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
 					      struct clone_args __user *uargs,
 					      size_t usize)
-- 
cgit v1.2.3


From 714acdbd1c94e7e3ab90f6b6938f1ccb27b662f0 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 11 Jun 2020 11:04:15 +0200
Subject: arch: rename copy_thread_tls() back to copy_thread()

Now that HAVE_COPY_THREAD_TLS has been removed, rename copy_thread_tls()
back simply copy_thread(). It's a simpler name, and doesn't imply that only
tls is copied here. This finishes an outstanding chunk of internal process
creation work since we've added clone3().

Cc: linux-arch@vger.kernel.org
Acked-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>A
Acked-by: Stafford Horne <shorne@gmail.com>
Acked-by: Greentime Hu <green.hu@gmail.com>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>A
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 arch/alpha/kernel/process.c      | 6 +++---
 arch/arc/kernel/process.c        | 5 +++--
 arch/arm/kernel/process.c        | 5 ++---
 arch/arm64/kernel/process.c      | 2 +-
 arch/c6x/kernel/process.c        | 6 +++---
 arch/csky/kernel/process.c       | 2 +-
 arch/h8300/kernel/process.c      | 5 ++---
 arch/hexagon/kernel/process.c    | 4 ++--
 arch/ia64/kernel/process.c       | 7 +++----
 arch/m68k/kernel/process.c       | 5 ++---
 arch/microblaze/kernel/process.c | 4 ++--
 arch/mips/kernel/process.c       | 5 +++--
 arch/nds32/kernel/process.c      | 5 ++---
 arch/nios2/kernel/process.c      | 4 ++--
 arch/openrisc/kernel/process.c   | 6 +++---
 arch/parisc/kernel/process.c     | 2 +-
 arch/powerpc/kernel/process.c    | 2 +-
 arch/riscv/kernel/process.c      | 4 ++--
 arch/s390/kernel/process.c       | 4 ++--
 arch/sh/kernel/process_32.c      | 4 ++--
 arch/sparc/kernel/process.c      | 6 +++---
 arch/sparc/kernel/process_32.c   | 5 ++---
 arch/sparc/kernel/process_64.c   | 5 ++---
 arch/um/kernel/process.c         | 2 +-
 arch/unicore32/kernel/process.c  | 5 ++---
 arch/x86/kernel/process.c        | 4 ++--
 arch/x86/kernel/unwind_frame.c   | 2 +-
 arch/xtensa/kernel/process.c     | 2 +-
 include/linux/sched/task.h       | 4 ++--
 kernel/fork.c                    | 3 +--
 30 files changed, 59 insertions(+), 66 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index dfdb6b6ba61c..7462a7911002 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -233,9 +233,9 @@ release_thread(struct task_struct *dead_task)
 /*
  * Copy architecture-specific thread state
  */
-int copy_thread_tls(unsigned long clone_flags, unsigned long usp,
-		    unsigned long kthread_arg, struct task_struct *p,
-		    unsigned long tls)
+int copy_thread(unsigned long clone_flags, unsigned long usp,
+		unsigned long kthread_arg, struct task_struct *p,
+		unsigned long tls)
 {
 	extern void ret_from_fork(void);
 	extern void ret_from_kernel_thread(void);
diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c
index 8c8e5172fecd..105420c23c8b 100644
--- a/arch/arc/kernel/process.c
+++ b/arch/arc/kernel/process.c
@@ -173,8 +173,9 @@ asmlinkage void ret_from_fork(void);
  * |    user_r25    |
  * ------------------  <===== END of PAGE
  */
-int copy_thread_tls(unsigned long clone_flags, unsigned long usp,
-	unsigned long kthread_arg, struct task_struct *p, unsigned long tls)
+int copy_thread(unsigned long clone_flags, unsigned long usp,
+		unsigned long kthread_arg, struct task_struct *p,
+		unsigned long tls)
 {
 	struct pt_regs *c_regs;        /* child's pt_regs */
 	unsigned long *childksp;       /* to unwind out of __switch_to() */
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 58eaa1f60e16..3395be186c7d 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -225,9 +225,8 @@ void release_thread(struct task_struct *dead_task)
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
-int
-copy_thread_tls(unsigned long clone_flags, unsigned long stack_start,
-	    unsigned long stk_sz, struct task_struct *p, unsigned long tls)
+int copy_thread(unsigned long clone_flags, unsigned long stack_start,
+		unsigned long stk_sz, struct task_struct *p, unsigned long tls)
 {
 	struct thread_info *thread = task_thread_info(p);
 	struct pt_regs *childregs = task_pt_regs(p);
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 6089638c7d43..84ec630b8ab5 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -375,7 +375,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 
 asmlinkage void ret_from_fork(void) asm("ret_from_fork");
 
-int copy_thread_tls(unsigned long clone_flags, unsigned long stack_start,
+int copy_thread(unsigned long clone_flags, unsigned long stack_start,
 		unsigned long stk_sz, struct task_struct *p, unsigned long tls)
 {
 	struct pt_regs *childregs = task_pt_regs(p);
diff --git a/arch/c6x/kernel/process.c b/arch/c6x/kernel/process.c
index afa3ea9a93aa..9f4fd6a40a10 100644
--- a/arch/c6x/kernel/process.c
+++ b/arch/c6x/kernel/process.c
@@ -104,9 +104,9 @@ void start_thread(struct pt_regs *regs, unsigned int pc, unsigned long usp)
 /*
  * Copy a new thread context in its stack.
  */
-int copy_thread_tls(unsigned long clone_flags, unsigned long usp,
-		    unsigned long ustk_size, struct task_struct *p,
-		    unsigned long tls)
+int copy_thread(unsigned long clone_flags, unsigned long usp,
+		unsigned long ustk_size, struct task_struct *p,
+		unsigned long tls)
 {
 	struct pt_regs *childregs;
 
diff --git a/arch/csky/kernel/process.c b/arch/csky/kernel/process.c
index 8b3fad062ab2..28cfeaaf902a 100644
--- a/arch/csky/kernel/process.c
+++ b/arch/csky/kernel/process.c
@@ -40,7 +40,7 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
 	return sw->r15;
 }
 
-int copy_thread_tls(unsigned long clone_flags,
+int copy_thread(unsigned long clone_flags,
 		unsigned long usp,
 		unsigned long kthread_arg,
 		struct task_struct *p,
diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c
index ae23de4dcf42..83ce3caf7313 100644
--- a/arch/h8300/kernel/process.c
+++ b/arch/h8300/kernel/process.c
@@ -105,9 +105,8 @@ void flush_thread(void)
 {
 }
 
-int copy_thread_tls(unsigned long clone_flags, unsigned long usp,
-		    unsigned long topstk, struct task_struct *p,
-		    unsigned long tls)
+int copy_thread(unsigned long clone_flags, unsigned long usp,
+		unsigned long topstk, struct task_struct *p, unsigned long tls)
 {
 	struct pt_regs *childregs;
 
diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c
index d756f9556dd7..d294e71d11d8 100644
--- a/arch/hexagon/kernel/process.c
+++ b/arch/hexagon/kernel/process.c
@@ -50,8 +50,8 @@ void arch_cpu_idle(void)
 /*
  * Copy architecture-specific thread state
  */
-int copy_thread_tls(unsigned long clone_flags, unsigned long usp,
-		    unsigned long arg, struct task_struct *p, unsigned long tls)
+int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
+		struct task_struct *p, unsigned long tls)
 {
 	struct thread_info *ti = task_thread_info(p);
 	struct hexagon_switch_stack *ss;
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 416dca619da5..9cedb56c3fda 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -311,7 +311,7 @@ ia64_load_extra (struct task_struct *task)
  *	<clone syscall>	        <some kernel call frames>
  *	sys_clone		   :
  *	_do_fork		_do_fork
- *	copy_thread_tls		copy_thread_tls
+ *	copy_thread		copy_thread
  *
  * This means that the stack layout is as follows:
  *
@@ -333,9 +333,8 @@ ia64_load_extra (struct task_struct *task)
  * so there is nothing to worry about.
  */
 int
-copy_thread_tls(unsigned long clone_flags, unsigned long user_stack_base,
-		    unsigned long user_stack_size, struct task_struct *p,
-		    unsigned long tls)
+copy_thread(unsigned long clone_flags, unsigned long user_stack_base,
+	    unsigned long user_stack_size, struct task_struct *p, unsigned long tls)
 {
 	extern char ia64_ret_from_clone;
 	struct switch_stack *child_stack, *stack;
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index 0608439ba452..6492a2c54dbc 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -138,9 +138,8 @@ asmlinkage int m68k_clone3(struct pt_regs *regs)
 	return sys_clone3((struct clone_args __user *)regs->d1, regs->d2);
 }
 
-int copy_thread_tls(unsigned long clone_flags, unsigned long usp,
-		    unsigned long arg, struct task_struct *p,
-		    unsigned long tls)
+int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
+		struct task_struct *p, unsigned long tls)
 {
 	struct fork_frame {
 		struct switch_stack sw;
diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c
index c2ca9c326510..6cabeab9e2ba 100644
--- a/arch/microblaze/kernel/process.c
+++ b/arch/microblaze/kernel/process.c
@@ -54,8 +54,8 @@ void flush_thread(void)
 {
 }
 
-int copy_thread_tls(unsigned long clone_flags, unsigned long usp,
-		    unsigned long arg, struct task_struct *p, unsigned long tls)
+int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
+		struct task_struct *p, unsigned long tls)
 {
 	struct pt_regs *childregs = task_pt_regs(p);
 	struct thread_info *ti = task_thread_info(p);
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index ff5320b79100..f5dc316a826a 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -119,8 +119,9 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 /*
  * Copy architecture-specific thread state
  */
-int copy_thread_tls(unsigned long clone_flags, unsigned long usp,
-	unsigned long kthread_arg, struct task_struct *p, unsigned long tls)
+int copy_thread(unsigned long clone_flags, unsigned long usp,
+		unsigned long kthread_arg, struct task_struct *p,
+		unsigned long tls)
 {
 	struct thread_info *ti = task_thread_info(p);
 	struct pt_regs *childregs, *regs = current_pt_regs();
diff --git a/arch/nds32/kernel/process.c b/arch/nds32/kernel/process.c
index 7dbb1bf64165..e85bbbadc0e7 100644
--- a/arch/nds32/kernel/process.c
+++ b/arch/nds32/kernel/process.c
@@ -149,9 +149,8 @@ void flush_thread(void)
 DEFINE_PER_CPU(struct task_struct *, __entry_task);
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
-int copy_thread_tls(unsigned long clone_flags, unsigned long stack_start,
-		    unsigned long stk_sz, struct task_struct *p,
-		    unsigned long tls)
+int copy_thread(unsigned long clone_flags, unsigned long stack_start,
+		unsigned long stk_sz, struct task_struct *p, unsigned long tls)
 {
 	struct pt_regs *childregs = task_pt_regs(p);
 
diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c
index 3dde4d6d8fbe..0a42ab8e4c32 100644
--- a/arch/nios2/kernel/process.c
+++ b/arch/nios2/kernel/process.c
@@ -100,8 +100,8 @@ void flush_thread(void)
 {
 }
 
-int copy_thread_tls(unsigned long clone_flags, unsigned long usp,
-		    unsigned long arg, struct task_struct *p, unsigned long tls)
+int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
+		struct task_struct *p, unsigned long tls)
 {
 	struct pt_regs *childregs = task_pt_regs(p);
 	struct pt_regs *regs;
diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c
index d7010e72450c..848f74c2c47a 100644
--- a/arch/openrisc/kernel/process.c
+++ b/arch/openrisc/kernel/process.c
@@ -116,7 +116,7 @@ void release_thread(struct task_struct *dead_task)
 extern asmlinkage void ret_from_fork(void);
 
 /*
- * copy_thread_tls
+ * copy_thread
  * @clone_flags: flags
  * @usp: user stack pointer or fn for kernel thread
  * @arg: arg to fn for kernel thread; always NULL for userspace thread
@@ -147,8 +147,8 @@ extern asmlinkage void ret_from_fork(void);
  */
 
 int
-copy_thread_tls(unsigned long clone_flags, unsigned long usp,
-		unsigned long arg, struct task_struct *p, unsigned long tls)
+copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
+	    struct task_struct *p, unsigned long tls)
 {
 	struct pt_regs *userregs;
 	struct pt_regs *kregs;
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index b7abb12edd3a..de6299ff1530 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -208,7 +208,7 @@ arch_initcall(parisc_idle_init);
  * Copy architecture-specific thread state
  */
 int
-copy_thread_tls(unsigned long clone_flags, unsigned long usp,
+copy_thread(unsigned long clone_flags, unsigned long usp,
 	    unsigned long kthread_arg, struct task_struct *p, unsigned long tls)
 {
 	struct pt_regs *cregs = &(p->thread.regs);
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 4650b9bb217f..794b754deec2 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1593,7 +1593,7 @@ static void setup_ksp_vsid(struct task_struct *p, unsigned long sp)
 /*
  * Copy architecture-specific thread state
  */
-int copy_thread_tls(unsigned long clone_flags, unsigned long usp,
+int copy_thread(unsigned long clone_flags, unsigned long usp,
 		unsigned long kthread_arg, struct task_struct *p,
 		unsigned long tls)
 {
diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
index 824d117cf202..31f39442df72 100644
--- a/arch/riscv/kernel/process.c
+++ b/arch/riscv/kernel/process.c
@@ -101,8 +101,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 	return 0;
 }
 
-int copy_thread_tls(unsigned long clone_flags, unsigned long usp,
-	unsigned long arg, struct task_struct *p, unsigned long tls)
+int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
+		struct task_struct *p, unsigned long tls)
 {
 	struct pt_regs *childregs = task_pt_regs(p);
 
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index eb6e23ad15a2..b06dec1267d0 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -80,8 +80,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 	return 0;
 }
 
-int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp,
-		    unsigned long arg, struct task_struct *p, unsigned long tls)
+int copy_thread(unsigned long clone_flags, unsigned long new_stackp,
+		unsigned long arg, struct task_struct *p, unsigned long tls)
 {
 	struct fake_frame
 	{
diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c
index 537a82d80616..b0fefd8f53a6 100644
--- a/arch/sh/kernel/process_32.c
+++ b/arch/sh/kernel/process_32.c
@@ -115,8 +115,8 @@ EXPORT_SYMBOL(dump_fpu);
 asmlinkage void ret_from_fork(void);
 asmlinkage void ret_from_kernel_thread(void);
 
-int copy_thread_tls(unsigned long clone_flags, unsigned long usp,
-		    unsigned long arg, struct task_struct *p, unsigned long tls)
+int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
+		struct task_struct *p, unsigned long tls)
 {
 	struct thread_info *ti = task_thread_info(p);
 	struct pt_regs *childregs;
diff --git a/arch/sparc/kernel/process.c b/arch/sparc/kernel/process.c
index 8bbe62d77b77..5234b5ccc0b9 100644
--- a/arch/sparc/kernel/process.c
+++ b/arch/sparc/kernel/process.c
@@ -28,7 +28,7 @@ asmlinkage long sparc_fork(struct pt_regs *regs)
 	ret = _do_fork(&args);
 
 	/* If we get an error and potentially restart the system
-	 * call, we're screwed because copy_thread_tls() clobbered
+	 * call, we're screwed because copy_thread() clobbered
 	 * the parent's %o1.  So detect that case and restore it
 	 * here.
 	 */
@@ -53,7 +53,7 @@ asmlinkage long sparc_vfork(struct pt_regs *regs)
 	ret = _do_fork(&args);
 
 	/* If we get an error and potentially restart the system
-	 * call, we're screwed because copy_thread_tls() clobbered
+	 * call, we're screwed because copy_thread() clobbered
 	 * the parent's %o1.  So detect that case and restore it
 	 * here.
 	 */
@@ -99,7 +99,7 @@ asmlinkage long sparc_clone(struct pt_regs *regs)
 	ret = _do_fork(&args);
 
 	/* If we get an error and potentially restart the system
-	 * call, we're screwed because copy_thread_tls() clobbered
+	 * call, we're screwed because copy_thread() clobbered
 	 * the parent's %o1.  So detect that case and restore it
 	 * here.
 	 */
diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c
index 3e1f7b639e9a..bd123f1de2e7 100644
--- a/arch/sparc/kernel/process_32.c
+++ b/arch/sparc/kernel/process_32.c
@@ -273,9 +273,8 @@ clone_stackframe(struct sparc_stackf __user *dst,
 extern void ret_from_fork(void);
 extern void ret_from_kernel_thread(void);
 
-int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
-		    unsigned long arg, struct task_struct *p,
-		    unsigned long tls)
+int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
+		struct task_struct *p, unsigned long tls)
 {
 	struct thread_info *ti = task_thread_info(p);
 	struct pt_regs *childregs, *regs = current_pt_regs();
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index 278bf287c4be..04ef19b88632 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -577,9 +577,8 @@ barf:
  * Parent -->  %o0 == childs  pid, %o1 == 0
  * Child  -->  %o0 == parents pid, %o1 == 1
  */
-int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
-		    unsigned long arg, struct task_struct *p,
-		    unsigned long tls)
+int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
+		struct task_struct *p, unsigned long tls)
 {
 	struct thread_info *t = task_thread_info(p);
 	struct pt_regs *regs = current_pt_regs();
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index e3a2cf92a373..26b5e243d3fc 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -152,7 +152,7 @@ void fork_handler(void)
 	userspace(&current->thread.regs.regs, current_thread_info()->aux_fp_regs);
 }
 
-int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+int copy_thread(unsigned long clone_flags, unsigned long sp,
 		unsigned long arg, struct task_struct * p, unsigned long tls)
 {
 	void (*handler)(void);
diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c
index 49a305565a53..d5ae5a971c19 100644
--- a/arch/unicore32/kernel/process.c
+++ b/arch/unicore32/kernel/process.c
@@ -219,9 +219,8 @@ void release_thread(struct task_struct *dead_task)
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread");
 
-int copy_thread_tls(unsigned long clone_flags, unsigned long stack_start,
-		    unsigned long stk_sz, struct task_struct *p,
-		    unsigned long tls)
+int copy_thread(unsigned long clone_flags, unsigned long stack_start,
+		unsigned long stk_sz, struct task_struct *p, unsigned long tls)
 {
 	struct thread_info *thread = task_thread_info(p);
 	struct pt_regs *childregs = task_pt_regs(p);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index f362ce0d5ac0..b35cd50ed0dc 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -121,8 +121,8 @@ static int set_new_tls(struct task_struct *p, unsigned long tls)
 		return do_set_thread_area_64(p, ARCH_SET_FS, tls);
 }
 
-int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
-		    unsigned long arg, struct task_struct *p, unsigned long tls)
+int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
+		struct task_struct *p, unsigned long tls)
 {
 	struct inactive_task_frame *frame;
 	struct fork_frame *fork_frame;
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 722a85f3b2dd..3070fd6561be 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -269,7 +269,7 @@ bool unwind_next_frame(struct unwind_state *state)
 		/*
 		 * kthreads (other than the boot CPU's idle thread) have some
 		 * partial regs at the end of their stack which were placed
-		 * there by copy_thread_tls().  But the regs don't have any
+		 * there by copy_thread().  But the regs don't have any
 		 * useful information, so we can skip them.
 		 *
 		 * This user_mode() check is slightly broader than a PF_KTHREAD
diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c
index b7fe6f443b42..397a7de56377 100644
--- a/arch/xtensa/kernel/process.c
+++ b/arch/xtensa/kernel/process.c
@@ -201,7 +201,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
  * involved.  Much simpler to just not copy those live frames across.
  */
 
-int copy_thread_tls(unsigned long clone_flags, unsigned long usp_thread_fn,
+int copy_thread(unsigned long clone_flags, unsigned long usp_thread_fn,
 		unsigned long thread_fn_arg, struct task_struct *p,
 		unsigned long tls)
 {
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 77cbe14c3034..b6253f2ea96a 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -65,8 +65,8 @@ extern void fork_init(void);
 
 extern void release_task(struct task_struct * p);
 
-extern int copy_thread_tls(unsigned long, unsigned long, unsigned long,
-			struct task_struct *, unsigned long);
+extern int copy_thread(unsigned long, unsigned long, unsigned long,
+		       struct task_struct *, unsigned long);
 
 extern void flush_thread(void);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 8e52e16a1b5e..790841eb0a21 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2104,8 +2104,7 @@ static __latent_entropy struct task_struct *copy_process(
 	retval = copy_io(clone_flags, p);
 	if (retval)
 		goto bad_fork_cleanup_namespaces;
-	retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p,
-				 args->tls);
+	retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls);
 	if (retval)
 		goto bad_fork_cleanup_io;
 
-- 
cgit v1.2.3


From 38fd525a4c61e7ecdc9ad4dcbf7b767d0a007962 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 1 Jul 2020 07:30:06 -0500
Subject: exit: Factor thread_group_exited out of pidfd_poll

Create an independent helper thread_group_exited which returns true
when all threads have passed exit_notify in do_exit.  AKA all of the
threads are at least zombies and might be dead or completely gone.

Create this helper by taking the logic out of pidfd_poll where it is
already tested, and adding a READ_ONCE on the read of
task->exit_state.

I will be changing the user mode driver code to use this same logic
to know when a user mode driver needs to be restarted.

Place the new helper thread_group_exited in kernel/exit.c and
EXPORT it so it can be used by modules.

Link: https://lkml.kernel.org/r/20200702164140.4468-13-ebiederm@xmission.com
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/sched/signal.h |  2 ++
 kernel/exit.c                | 24 ++++++++++++++++++++++++
 kernel/fork.c                |  6 +-----
 3 files changed, 27 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 0ee5e696c5d8..1bad18a1d8ba 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -674,6 +674,8 @@ static inline int thread_group_empty(struct task_struct *p)
 #define delay_group_leader(p) \
 		(thread_group_leader(p) && !thread_group_empty(p))
 
+extern bool thread_group_exited(struct pid *pid);
+
 extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
 							unsigned long *flags);
 
diff --git a/kernel/exit.c b/kernel/exit.c
index d3294b611df1..dee246c0866f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1713,6 +1713,30 @@ Efault:
 }
 #endif
 
+/**
+ * thread_group_exited - check that a thread group has exited
+ * @pid: tgid of thread group to be checked.
+ *
+ * Test if the thread group represented by tgid has exited (all
+ * threads are zombies, dead or completely gone).
+ *
+ * Return: true if the thread group has exited. false otherwise.
+ */
+bool thread_group_exited(struct pid *pid)
+{
+	struct task_struct *task;
+	bool exited;
+
+	rcu_read_lock();
+	task = pid_task(pid, PIDTYPE_PID);
+	exited = !task ||
+		(READ_ONCE(task->exit_state) && thread_group_empty(task));
+	rcu_read_unlock();
+
+	return exited;
+}
+EXPORT_SYMBOL(thread_group_exited);
+
 __weak void abort(void)
 {
 	BUG();
diff --git a/kernel/fork.c b/kernel/fork.c
index 142b23645d82..bf215af7a904 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1787,22 +1787,18 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
  */
 static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
 {
-	struct task_struct *task;
 	struct pid *pid = file->private_data;
 	__poll_t poll_flags = 0;
 
 	poll_wait(file, &pid->wait_pidfd, pts);
 
-	rcu_read_lock();
-	task = pid_task(pid, PIDTYPE_PID);
 	/*
 	 * Inform pollers only when the whole thread group exits.
 	 * If the thread group leader exits before all other threads in the
 	 * group, then poll(2) should block, similar to the wait(2) family.
 	 */
-	if (!task || (task->exit_state && thread_group_empty(task)))
+	if (thread_group_exited(pid))
 		poll_flags = EPOLLIN | EPOLLRDNORM;
-	rcu_read_unlock();
 
 	return poll_flags;
 }
-- 
cgit v1.2.3


From 8c2f52663973e643c617663d826e2b0daa008b38 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 25 Jun 2020 17:40:40 -0500
Subject: umd: Remove exit_umh

The bpfilter code no longer uses the umd_info.cleanup callback.  This
callback is what exit_umh exists to call.  So remove exit_umh and all
of it's associated booking.

v1: https://lkml.kernel.org/r/87bll6dlte.fsf_-_@x220.int.ebiederm.org
v2: https://lkml.kernel.org/r/87y2o53abg.fsf_-_@x220.int.ebiederm.org
Link: https://lkml.kernel.org/r/20200702164140.4468-15-ebiederm@xmission.com
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/sched.h           |  1 -
 include/linux/usermode_driver.h | 16 ----------------
 kernel/exit.c                   |  3 ---
 kernel/usermode_driver.c        | 28 ----------------------------
 4 files changed, 48 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 59d1e92bb88e..edb2020875ad 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1511,7 +1511,6 @@ extern struct pid *cad_pid;
 #define PF_KTHREAD		0x00200000	/* I am a kernel thread */
 #define PF_RANDOMIZE		0x00400000	/* Randomize virtual address space */
 #define PF_SWAPWRITE		0x00800000	/* Allowed to write to swap */
-#define PF_UMH			0x02000000	/* I'm an Usermodehelper process */
 #define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_mask */
 #define PF_MCE_EARLY		0x08000000      /* Early kill for mce process policy */
 #define PF_MEMALLOC_NOCMA	0x10000000	/* All allocation request will have _GFP_MOVABLE cleared */
diff --git a/include/linux/usermode_driver.h b/include/linux/usermode_driver.h
index 45adbffb31d9..073a9e0ec07d 100644
--- a/include/linux/usermode_driver.h
+++ b/include/linux/usermode_driver.h
@@ -4,26 +4,10 @@
 #include <linux/umh.h>
 #include <linux/path.h>
 
-#ifdef CONFIG_BPFILTER
-void __exit_umh(struct task_struct *tsk);
-
-static inline void exit_umh(struct task_struct *tsk)
-{
-	if (unlikely(tsk->flags & PF_UMH))
-		__exit_umh(tsk);
-}
-#else
-static inline void exit_umh(struct task_struct *tsk)
-{
-}
-#endif
-
 struct umd_info {
 	const char *driver_name;
 	struct file *pipe_to_umh;
 	struct file *pipe_from_umh;
-	struct list_head list;
-	void (*cleanup)(struct umd_info *info);
 	struct path wd;
 	struct pid *tgid;
 };
diff --git a/kernel/exit.c b/kernel/exit.c
index dee246c0866f..39226a018ed7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -63,7 +63,6 @@
 #include <linux/random.h>
 #include <linux/rcuwait.h>
 #include <linux/compat.h>
-#include <linux/usermode_driver.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
@@ -805,8 +804,6 @@ void __noreturn do_exit(long code)
 	exit_task_namespaces(tsk);
 	exit_task_work(tsk);
 	exit_thread(tsk);
-	if (group_dead)
-		exit_umh(tsk);
 
 	/*
 	 * Flush inherited counters to the parent - before the parent
diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
index f77f8d7ce9e3..cd136f86f799 100644
--- a/kernel/usermode_driver.c
+++ b/kernel/usermode_driver.c
@@ -9,9 +9,6 @@
 #include <linux/task_work.h>
 #include <linux/usermode_driver.h>
 
-static LIST_HEAD(umh_list);
-static DEFINE_MUTEX(umh_list_lock);
-
 static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name)
 {
 	struct file_system_type *type;
@@ -134,7 +131,6 @@ static int umd_setup(struct subprocess_info *info, struct cred *new)
 	umd_info->pipe_to_umh = to_umh[1];
 	umd_info->pipe_from_umh = from_umh[0];
 	umd_info->tgid = get_pid(task_tgid(current));
-	current->flags |= PF_UMH;
 	return 0;
 }
 
@@ -182,11 +178,6 @@ int fork_usermode_driver(struct umd_info *info)
 		goto out;
 
 	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
-	if (!err) {
-		mutex_lock(&umh_list_lock);
-		list_add(&info->list, &umh_list);
-		mutex_unlock(&umh_list_lock);
-	}
 out:
 	if (argv)
 		argv_free(argv);
@@ -194,23 +185,4 @@ out:
 }
 EXPORT_SYMBOL_GPL(fork_usermode_driver);
 
-void __exit_umh(struct task_struct *tsk)
-{
-	struct umd_info *info;
-	struct pid *tgid = task_tgid(tsk);
-
-	mutex_lock(&umh_list_lock);
-	list_for_each_entry(info, &umh_list, list) {
-		if (info->tgid == tgid) {
-			list_del(&info->list);
-			mutex_unlock(&umh_list_lock);
-			goto out;
-		}
-	}
-	mutex_unlock(&umh_list_lock);
-	return;
-out:
-	if (info->cleanup)
-		info->cleanup(info);
-}
 
-- 
cgit v1.2.3


From 33c326014fe69304244868cc793c2c77be533125 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 29 Jun 2020 08:28:33 -0500
Subject: umd: Stop using split_argv

There is exactly one argument so there is nothing to split.  All
split_argv does now is cause confusion and avoid the need for a cast
when passing a "const char *" string to call_usermodehelper_setup.

So avoid confusion and the possibility of an odd driver name causing
problems by just using a fixed argv array with a cast in the call to
call_usermodehelper_setup.

v1: https://lkml.kernel.org/r/87sged3a9n.fsf_-_@x220.int.ebiederm.org
Link: https://lkml.kernel.org/r/20200702164140.4468-16-ebiederm@xmission.com
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/usermode_driver.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
index cd136f86f799..0b35212ffc3d 100644
--- a/kernel/usermode_driver.c
+++ b/kernel/usermode_driver.c
@@ -160,27 +160,21 @@ static void umd_cleanup(struct subprocess_info *info)
 int fork_usermode_driver(struct umd_info *info)
 {
 	struct subprocess_info *sub_info;
-	char **argv = NULL;
+	const char *argv[] = { info->driver_name, NULL };
 	int err;
 
 	if (WARN_ON_ONCE(info->tgid))
 		return -EBUSY;
 
 	err = -ENOMEM;
-	argv = argv_split(GFP_KERNEL, info->driver_name, NULL);
-	if (!argv)
-		goto out;
-
-	sub_info = call_usermodehelper_setup(info->driver_name, argv, NULL,
-					     GFP_KERNEL,
+	sub_info = call_usermodehelper_setup(info->driver_name,
+					     (char **)argv, NULL, GFP_KERNEL,
 					     umd_setup, umd_cleanup, info);
 	if (!sub_info)
 		goto out;
 
 	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
 out:
-	if (argv)
-		argv_free(argv);
 	return err;
 }
 EXPORT_SYMBOL_GPL(fork_usermode_driver);
-- 
cgit v1.2.3


From f5836749c9c04a10decd2742845ad4870965fdef Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Mon, 6 Jul 2020 16:01:25 -0700
Subject: bpf: Add BPF_CGROUP_INET_SOCK_RELEASE hook

Sometimes it's handy to know when the socket gets freed. In
particular, we'd like to try to use a smarter allocation of
ports for bpf_bind and explore the possibility of limiting
the number of SOCK_DGRAM sockets the process can have.

Implement BPF_CGROUP_INET_SOCK_RELEASE hook that triggers on
inet socket release. It triggers only for userspace sockets
(not in-kernel ones) and therefore has the same semantics as
the existing BPF_CGROUP_INET_SOCK_CREATE.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200706230128.4073544-2-sdf@google.com
---
 include/linux/bpf-cgroup.h | 4 ++++
 include/uapi/linux/bpf.h   | 1 +
 kernel/bpf/syscall.c       | 3 +++
 net/core/filter.c          | 1 +
 net/ipv4/af_inet.c         | 3 +++
 5 files changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index c66c545e161a..2c6f26670acc 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -210,6 +210,9 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk)				       \
 	BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE)
 
+#define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk)			       \
+	BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_RELEASE)
+
 #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk)				       \
 	BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND)
 
@@ -401,6 +404,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; })
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index da9bf35a26f8..548a749aebb3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -226,6 +226,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_INET4_GETSOCKNAME,
 	BPF_CGROUP_INET6_GETSOCKNAME,
 	BPF_XDP_DEVMAP,
+	BPF_CGROUP_INET_SOCK_RELEASE,
 	__MAX_BPF_ATTACH_TYPE
 };
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8da159936bab..156f51ffada2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1981,6 +1981,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
 	case BPF_PROG_TYPE_CGROUP_SOCK:
 		switch (expected_attach_type) {
 		case BPF_CGROUP_INET_SOCK_CREATE:
+		case BPF_CGROUP_INET_SOCK_RELEASE:
 		case BPF_CGROUP_INET4_POST_BIND:
 		case BPF_CGROUP_INET6_POST_BIND:
 			return 0;
@@ -2779,6 +2780,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
 		return BPF_PROG_TYPE_CGROUP_SKB;
 		break;
 	case BPF_CGROUP_INET_SOCK_CREATE:
+	case BPF_CGROUP_INET_SOCK_RELEASE:
 	case BPF_CGROUP_INET4_POST_BIND:
 	case BPF_CGROUP_INET6_POST_BIND:
 		return BPF_PROG_TYPE_CGROUP_SOCK;
@@ -2929,6 +2931,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_INET_INGRESS:
 	case BPF_CGROUP_INET_EGRESS:
 	case BPF_CGROUP_INET_SOCK_CREATE:
+	case BPF_CGROUP_INET_SOCK_RELEASE:
 	case BPF_CGROUP_INET4_BIND:
 	case BPF_CGROUP_INET6_BIND:
 	case BPF_CGROUP_INET4_POST_BIND:
diff --git a/net/core/filter.c b/net/core/filter.c
index c5e696e6c315..ddcc0d6209e1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6890,6 +6890,7 @@ static bool __sock_filter_check_attach_type(int off,
 	case offsetof(struct bpf_sock, priority):
 		switch (attach_type) {
 		case BPF_CGROUP_INET_SOCK_CREATE:
+		case BPF_CGROUP_INET_SOCK_RELEASE:
 			goto full_access;
 		default:
 			return false;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index ea6ed6d487ed..ff141d630bdf 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -411,6 +411,9 @@ int inet_release(struct socket *sock)
 	if (sk) {
 		long timeout;
 
+		if (!sk->sk_kern_sock)
+			BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk);
+
 		/* Applications forget to leave groups before exiting */
 		ip_mc_drop_socket(sk);
 
-- 
cgit v1.2.3


From 42815808f1791eb53c3a05fb78c0a8642ecf8467 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Mon, 6 Jul 2020 17:49:09 +0200
Subject: timens: make vdso_join_timens() always succeed

As discussed on-list (cf. [1]), in order to make setns() support time
namespaces when attaching to multiple namespaces at once properly we
need to tweak vdso_join_timens() to always succeed. So switch
vdso_join_timens() to using a read lock and replacing
mmap_write_lock_killable() to mmap_read_lock() as we discussed.

Last cycle setns() was changed to support attaching to multiple namespaces
atomically. This requires all namespaces to have a point of no return where
they can't fail anymore. Specifically, <namespace-type>_install() is
allowed to perform permission checks and install the namespace into the new
struct nsset that it has been given but it is not allowed to make visible
changes to the affected task. Once <namespace-type>_install() returns
anything that the given namespace type requires to be setup in addition
needs to ideally be done in a function that can't fail or if it fails the
failure is not fatal. For time namespaces the relevant functions that fall
into this category are timens_set_vvar_page() and vdso_join_timens().
Currently the latter can fail but doesn't need to. With this we can go on
to implement a timens_commit() helper in a follow up patch to be used by
setns().

[1]: https://lore.kernel.org/lkml/20200611110221.pgd3r5qkjrjmfqa2@wittgenstein

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: Andrei Vagin <avagin@gmail.com>
Cc: Will Deacon <will@kernel.org>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Dmitry Safonov <dima@arista.com>
Cc: linux-arm-kernel@lists.infradead.org
Link: https://lore.kernel.org/r/20200706154912.3248030-2-christian.brauner@ubuntu.com
---
 arch/x86/entry/vdso/vma.c |  5 ++---
 kernel/time/namespace.c   | 10 ++--------
 2 files changed, 4 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index ea7c1f0b79df..9185cb1d13b9 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -144,8 +144,7 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
 	struct mm_struct *mm = task->mm;
 	struct vm_area_struct *vma;
 
-	if (mmap_write_lock_killable(mm))
-		return -EINTR;
+	mmap_read_lock(mm);
 
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		unsigned long size = vma->vm_end - vma->vm_start;
@@ -154,7 +153,7 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
 			zap_page_range(vma, vma->vm_start, size);
 	}
 
-	mmap_write_unlock(mm);
+	mmap_read_unlock(mm);
 	return 0;
 }
 #else
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 5d9fc22d836a..e5af6fe87af8 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -284,7 +284,6 @@ static int timens_install(struct nsset *nsset, struct ns_common *new)
 {
 	struct nsproxy *nsproxy = nsset->nsproxy;
 	struct time_namespace *ns = to_time_ns(new);
-	int err;
 
 	if (!current_is_single_threaded())
 		return -EUSERS;
@@ -295,9 +294,7 @@ static int timens_install(struct nsset *nsset, struct ns_common *new)
 
 	timens_set_vvar_page(current, ns);
 
-	err = vdso_join_timens(current, ns);
-	if (err)
-		return err;
+	vdso_join_timens(current, ns);
 
 	get_time_ns(ns);
 	put_time_ns(nsproxy->time_ns);
@@ -313,7 +310,6 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
 {
 	struct ns_common *nsc = &nsproxy->time_ns_for_children->ns;
 	struct time_namespace *ns = to_time_ns(nsc);
-	int err;
 
 	/* create_new_namespaces() already incremented the ref counter */
 	if (nsproxy->time_ns == nsproxy->time_ns_for_children)
@@ -321,9 +317,7 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
 
 	timens_set_vvar_page(tsk, ns);
 
-	err = vdso_join_timens(tsk, ns);
-	if (err)
-		return err;
+	vdso_join_timens(tsk, ns);
 
 	get_time_ns(ns);
 	put_time_ns(nsproxy->time_ns);
-- 
cgit v1.2.3


From 5cfea9a106bbb1af919ab94456fdd02209984070 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Mon, 6 Jul 2020 17:49:10 +0200
Subject: timens: add timens_commit() helper

Wrap the calls to timens_set_vvar_page() and vdso_join_timens() in
timens_on_fork() and timens_install() in a new timens_commit() helper.
We'll use this helper in a follow-up patch in nsproxy too.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: Andrei Vagin <avagin@gmail.com>
Cc: Will Deacon <will@kernel.org>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Dmitry Safonov <dima@arista.com>
Cc: linux-arm-kernel@lists.infradead.org
Link: https://lore.kernel.org/r/20200706154912.3248030-3-christian.brauner@ubuntu.com
---
 kernel/time/namespace.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index e5af6fe87af8..aa7b90aac2a7 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -280,6 +280,12 @@ static void timens_put(struct ns_common *ns)
 	put_time_ns(to_time_ns(ns));
 }
 
+static void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
+{
+	timens_set_vvar_page(tsk, ns);
+	vdso_join_timens(tsk, ns);
+}
+
 static int timens_install(struct nsset *nsset, struct ns_common *new)
 {
 	struct nsproxy *nsproxy = nsset->nsproxy;
@@ -292,9 +298,8 @@ static int timens_install(struct nsset *nsset, struct ns_common *new)
 	    !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
 
-	timens_set_vvar_page(current, ns);
 
-	vdso_join_timens(current, ns);
+	timens_commit(current, ns);
 
 	get_time_ns(ns);
 	put_time_ns(nsproxy->time_ns);
@@ -315,14 +320,12 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
 	if (nsproxy->time_ns == nsproxy->time_ns_for_children)
 		return 0;
 
-	timens_set_vvar_page(tsk, ns);
-
-	vdso_join_timens(tsk, ns);
-
 	get_time_ns(ns);
 	put_time_ns(nsproxy->time_ns);
 	nsproxy->time_ns = ns;
 
+	timens_commit(tsk, ns);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 76c12881a38aaa83e1eb4ce2fada36c3a732bad4 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Mon, 6 Jul 2020 17:49:11 +0200
Subject: nsproxy: support CLONE_NEWTIME with setns()

So far setns() was missing time namespace support. This was partially due
to it simply not being implemented but also because vdso_join_timens()
could still fail which made switching to multiple namespaces atomically
problematic. This is now fixed so support CLONE_NEWTIME with setns()

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: Andrei Vagin <avagin@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Serge Hallyn <serge@hallyn.com>
Cc: Dmitry Safonov <dima@arista.com>
Link: https://lore.kernel.org/r/20200706154912.3248030-4-christian.brauner@ubuntu.com
---
 include/linux/time_namespace.h |  6 ++++++
 kernel/nsproxy.c               | 21 +++++++++++++++++++--
 kernel/time/namespace.c        |  5 +----
 3 files changed, 26 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h
index 824d54e057eb..5b6031385db0 100644
--- a/include/linux/time_namespace.h
+++ b/include/linux/time_namespace.h
@@ -33,6 +33,7 @@ extern struct time_namespace init_time_ns;
 #ifdef CONFIG_TIME_NS
 extern int vdso_join_timens(struct task_struct *task,
 			    struct time_namespace *ns);
+extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns);
 
 static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
 {
@@ -96,6 +97,11 @@ static inline int vdso_join_timens(struct task_struct *task,
 	return 0;
 }
 
+static inline void timens_commit(struct task_struct *tsk,
+				 struct time_namespace *ns)
+{
+}
+
 static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
 {
 	return NULL;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index cd356630a311..12dd41b39a7f 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -262,8 +262,8 @@ void exit_task_namespaces(struct task_struct *p)
 static int check_setns_flags(unsigned long flags)
 {
 	if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
-				 CLONE_NEWNET | CLONE_NEWUSER | CLONE_NEWPID |
-				 CLONE_NEWCGROUP)))
+				 CLONE_NEWNET | CLONE_NEWTIME | CLONE_NEWUSER |
+				 CLONE_NEWPID | CLONE_NEWCGROUP)))
 		return -EINVAL;
 
 #ifndef CONFIG_USER_NS
@@ -290,6 +290,10 @@ static int check_setns_flags(unsigned long flags)
 	if (flags & CLONE_NEWNET)
 		return -EINVAL;
 #endif
+#ifndef CONFIG_TIME_NS
+	if (flags & CLONE_NEWTIME)
+		return -EINVAL;
+#endif
 
 	return 0;
 }
@@ -464,6 +468,14 @@ static int validate_nsset(struct nsset *nsset, struct pid *pid)
 	}
 #endif
 
+#ifdef CONFIG_TIME_NS
+	if (flags & CLONE_NEWTIME) {
+		ret = validate_ns(nsset, &nsp->time_ns->ns);
+		if (ret)
+			goto out;
+	}
+#endif
+
 out:
 	if (pid_ns)
 		put_pid_ns(pid_ns);
@@ -507,6 +519,11 @@ static void commit_nsset(struct nsset *nsset)
 		exit_sem(me);
 #endif
 
+#ifdef CONFIG_TIME_NS
+	if (flags & CLONE_NEWTIME)
+		timens_commit(me, nsset->nsproxy->time_ns);
+#endif
+
 	/* transfer ownership */
 	switch_task_namespaces(me, nsset->nsproxy);
 	nsset->nsproxy = NULL;
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index aa7b90aac2a7..afc65e6be33e 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -280,7 +280,7 @@ static void timens_put(struct ns_common *ns)
 	put_time_ns(to_time_ns(ns));
 }
 
-static void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
+void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
 {
 	timens_set_vvar_page(tsk, ns);
 	vdso_join_timens(tsk, ns);
@@ -298,9 +298,6 @@ static int timens_install(struct nsset *nsset, struct ns_common *new)
 	    !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
 
-
-	timens_commit(current, ns);
-
 	get_time_ns(ns);
 	put_time_ns(nsproxy->time_ns);
 	nsproxy->time_ns = ns;
-- 
cgit v1.2.3


From ff9ff926889dd8026b4ba55266a010c27f68604f Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:21 -0700
Subject: perf/core: Factor out functions to allocate/free the task_ctx_data

The method to allocate/free the task_ctx_data is going to be changed in
the following patch. Currently, the task_ctx_data is allocated/freed in
several different places. To avoid repeatedly modifying the same codes
in several different places, alloc_task_ctx_data() and
free_task_ctx_data() are factored out to allocate/free the
task_ctx_data. The modification only needs to be applied once.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1593780569-62993-16-git-send-email-kan.liang@linux.intel.com
---
 kernel/events/core.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9b8f92500833..75090403f942 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1238,12 +1238,22 @@ static void get_ctx(struct perf_event_context *ctx)
 	refcount_inc(&ctx->refcount);
 }
 
+static void *alloc_task_ctx_data(struct pmu *pmu)
+{
+	return kzalloc(pmu->task_ctx_size, GFP_KERNEL);
+}
+
+static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
+{
+	kfree(task_ctx_data);
+}
+
 static void free_ctx(struct rcu_head *head)
 {
 	struct perf_event_context *ctx;
 
 	ctx = container_of(head, struct perf_event_context, rcu_head);
-	kfree(ctx->task_ctx_data);
+	free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
 	kfree(ctx);
 }
 
@@ -4471,7 +4481,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
 		goto errout;
 
 	if (event->attach_state & PERF_ATTACH_TASK_DATA) {
-		task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
+		task_ctx_data = alloc_task_ctx_data(pmu);
 		if (!task_ctx_data) {
 			err = -ENOMEM;
 			goto errout;
@@ -4529,11 +4539,11 @@ retry:
 		}
 	}
 
-	kfree(task_ctx_data);
+	free_task_ctx_data(pmu, task_ctx_data);
 	return ctx;
 
 errout:
-	kfree(task_ctx_data);
+	free_task_ctx_data(pmu, task_ctx_data);
 	return ERR_PTR(err);
 }
 
@@ -12497,8 +12507,7 @@ inherit_event(struct perf_event *parent_event,
 	    !child_ctx->task_ctx_data) {
 		struct pmu *pmu = child_event->pmu;
 
-		child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,
-						   GFP_KERNEL);
+		child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
 		if (!child_ctx->task_ctx_data) {
 			free_event(child_event);
 			return ERR_PTR(-ENOMEM);
-- 
cgit v1.2.3


From 217c2a633ebb36f1cc6d249f4ef2e4a809d46818 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:22 -0700
Subject: perf/core: Use kmem_cache to allocate the PMU specific data

Currently, the PMU specific data task_ctx_data is allocated by the
function kzalloc() in the perf generic code. When there is no specific
alignment requirement for the task_ctx_data, the method works well for
now. However, there will be a problem once a specific alignment
requirement is introduced in future features, e.g., the Architecture LBR
XSAVE feature requires 64-byte alignment. If the specific alignment
requirement is not fulfilled, the XSAVE family of instructions will fail
to save/restore the xstate to/from the task_ctx_data.

The function kzalloc() itself only guarantees a natural alignment. A
new method to allocate the task_ctx_data has to be introduced, which
has to meet the requirements as below:
- must be a generic method can be used by different architectures,
  because the allocation of the task_ctx_data is implemented in the
  perf generic code;
- must be an alignment-guarantee method (The alignment requirement is
  not changed after the boot);
- must be able to allocate/free a buffer (smaller than a page size)
  dynamically;
- should not cause extra CPU overhead or space overhead.

Several options were considered as below:
- One option is to allocate a larger buffer for task_ctx_data. E.g.,
    ptr = kmalloc(size + alignment, GFP_KERNEL);
    ptr &= ~(alignment - 1);
  This option causes space overhead.
- Another option is to allocate the task_ctx_data in the PMU specific
  code. To do so, several function pointers have to be added. As a
  result, both the generic structure and the PMU specific structure
  will become bigger. Besides, extra function calls are added when
  allocating/freeing the buffer. This option will increase both the
  space overhead and CPU overhead.
- The third option is to use a kmem_cache to allocate a buffer for the
  task_ctx_data. The kmem_cache can be created with a specific alignment
  requirement by the PMU at boot time. A new pointer for kmem_cache has
  to be added in the generic struct pmu, which would be used to
  dynamically allocate a buffer for the task_ctx_data at run time.
  Although the new pointer is added to the struct pmu, the existing
  variable task_ctx_size is not required anymore. The size of the
  generic structure is kept the same.

The third option which meets all the aforementioned requirements is used
to replace kzalloc() for the PMU specific data allocation. A later patch
will remove the kzalloc() method and the related variables.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1593780569-62993-17-git-send-email-kan.liang@linux.intel.com
---
 include/linux/perf_event.h | 5 +++++
 kernel/events/core.c       | 8 +++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 46fe5cfb5163..09915ae06d28 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -424,6 +424,11 @@ struct pmu {
 	 */
 	size_t				task_ctx_size;
 
+	/*
+	 * Kmem cache of PMU specific data
+	 */
+	struct kmem_cache		*task_ctx_cache;
+
 	/*
 	 * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data)
 	 * can be synchronized using this function. See Intel LBR callstack support
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 75090403f942..30d9b3182369 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1240,12 +1240,18 @@ static void get_ctx(struct perf_event_context *ctx)
 
 static void *alloc_task_ctx_data(struct pmu *pmu)
 {
+	if (pmu->task_ctx_cache)
+		return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
+
 	return kzalloc(pmu->task_ctx_size, GFP_KERNEL);
 }
 
 static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
 {
-	kfree(task_ctx_data);
+	if (pmu->task_ctx_cache && task_ctx_data)
+		kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
+	else
+		kfree(task_ctx_data);
 }
 
 static void free_ctx(struct rcu_head *head)
-- 
cgit v1.2.3


From 5a09928d339f3cf0973991ddc3a2798825c84c99 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:24 -0700
Subject: perf/x86: Remove task_ctx_size

A new kmem_cache method has replaced the kzalloc() to allocate the PMU
specific data. The task_ctx_size is not required anymore.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1593780569-62993-19-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/core.c      | 1 -
 arch/x86/events/intel/lbr.c | 1 -
 include/linux/perf_event.h  | 4 ----
 kernel/events/core.c        | 4 +---
 4 files changed, 1 insertion(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index d740c861724c..6b1228ae007d 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2371,7 +2371,6 @@ static struct pmu pmu = {
 
 	.event_idx		= x86_pmu_event_idx,
 	.sched_task		= x86_pmu_sched_task,
-	.task_ctx_size          = sizeof(struct x86_perf_task_context),
 	.swap_task_ctx		= x86_pmu_swap_task_ctx,
 	.check_period		= x86_pmu_check_period,
 
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index e784c1d485ca..3ad528996d1c 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -1672,7 +1672,6 @@ void __init intel_pmu_arch_lbr_init(void)
 
 	size = sizeof(struct x86_perf_task_context_arch_lbr) +
 	       lbr_nr * sizeof(struct lbr_entry);
-	x86_get_pmu()->task_ctx_size = size;
 	x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0);
 
 	x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 09915ae06d28..3b22db08b6fb 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -419,10 +419,6 @@ struct pmu {
 	 */
 	void (*sched_task)		(struct perf_event_context *ctx,
 					bool sched_in);
-	/*
-	 * PMU specific data size
-	 */
-	size_t				task_ctx_size;
 
 	/*
 	 * Kmem cache of PMU specific data
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 30d9b3182369..7c436d705fbd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1243,15 +1243,13 @@ static void *alloc_task_ctx_data(struct pmu *pmu)
 	if (pmu->task_ctx_cache)
 		return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
 
-	return kzalloc(pmu->task_ctx_size, GFP_KERNEL);
+	return NULL;
 }
 
 static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
 {
 	if (pmu->task_ctx_cache && task_ctx_data)
 		kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
-	else
-		kfree(task_ctx_data);
 }
 
 static void free_ctx(struct rcu_head *head)
-- 
cgit v1.2.3


From 85c2ce9104eb93517db2037699471c517e81f9b4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 30 Jun 2020 16:49:05 +0200
Subject: sched, vmlinux.lds: Increase STRUCT_ALIGNMENT to 64 bytes for GCC-4.9

For some mysterious reason GCC-4.9 has a 64 byte section alignment for
structures, all other GCC versions (and Clang) tested (including 4.8
and 5.0) are fine with the 32 bytes alignment.

Getting this right is important for the new SCHED_DATA macro that
creates an explicitly ordered array of 'struct sched_class' in the
linker script and expect pointer arithmetic to work.

Fixes: c3a340f7e7ea ("sched: Have sched_class_highest define by vmlinux.lds.h")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200630144905.GX4817@hirez.programming.kicks-ass.net
---
 include/asm-generic/vmlinux.lds.h | 18 +++++++++++-------
 kernel/sched/sched.h              |  3 ++-
 2 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 66fb84c3dc7e..3ceb4b7279ec 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -108,6 +108,17 @@
 #define SBSS_MAIN .sbss
 #endif
 
+/*
+ * GCC 4.5 and later have a 32 bytes section alignment for structures.
+ * Except GCC 4.9, that feels the need to align on 64 bytes.
+ */
+#if __GNUC__ == 4 && __GNUC_MINOR__ == 9
+#define STRUCT_ALIGNMENT 64
+#else
+#define STRUCT_ALIGNMENT 32
+#endif
+#define STRUCT_ALIGN() . = ALIGN(STRUCT_ALIGNMENT)
+
 /*
  * The order of the sched class addresses are important, as they are
  * used to determine the order of the priority of each sched class in
@@ -123,13 +134,6 @@
 	*(__stop_sched_class)			\
 	__end_sched_classes = .;
 
-/*
- * Align to a 32 byte boundary equal to the
- * alignment gcc 4.5 uses for a struct
- */
-#define STRUCT_ALIGNMENT 32
-#define STRUCT_ALIGN() . = ALIGN(STRUCT_ALIGNMENT)
-
 /* The actual configuration determine if the init/exit sections
  * are handled as text/data or they can be discarded (which
  * often happens at runtime)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5aa6661ecaf1..9bef2dd01247 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -67,6 +67,7 @@
 #include <linux/tsacct_kern.h>
 
 #include <asm/tlb.h>
+#include <asm-generic/vmlinux.lds.h>
 
 #ifdef CONFIG_PARAVIRT
 # include <asm/paravirt.h>
@@ -1810,7 +1811,7 @@ struct sched_class {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	void (*task_change_group)(struct task_struct *p, int type);
 #endif
-} __aligned(32); /* STRUCT_ALIGN(), vmlinux.lds.h */
+} __aligned(STRUCT_ALIGNMENT); /* STRUCT_ALIGN(), vmlinux.lds.h */
 
 static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
-- 
cgit v1.2.3


From d81ae8aac85ca2e307d273f6dc7863a721bf054e Mon Sep 17 00:00:00 2001
From: Qais Yousef <qais.yousef@arm.com>
Date: Tue, 30 Jun 2020 12:21:22 +0100
Subject: sched/uclamp: Fix initialization of struct uclamp_rq

struct uclamp_rq was zeroed out entirely in assumption that in the first
call to uclamp_rq_inc() they'd be initialized correctly in accordance to
default settings.

But when next patch introduces a static key to skip
uclamp_rq_{inc,dec}() until userspace opts in to use uclamp, schedutil
will fail to perform any frequency changes because the
rq->uclamp[UCLAMP_MAX].value is zeroed at init and stays as such. Which
means all rqs are capped to 0 by default.

Fix it by making sure we do proper initialization at init without
relying on uclamp_rq_inc() doing it later.

Fixes: 69842cba9ace ("sched/uclamp: Add CPU's clamp buckets refcounting")
Signed-off-by: Qais Yousef <qais.yousef@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Link: https://lkml.kernel.org/r/20200630112123.12076-2-qais.yousef@arm.com
---
 kernel/sched/core.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 15c980af63db..9605db70e671 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1239,6 +1239,20 @@ static void uclamp_fork(struct task_struct *p)
 	}
 }
 
+static void __init init_uclamp_rq(struct rq *rq)
+{
+	enum uclamp_id clamp_id;
+	struct uclamp_rq *uc_rq = rq->uclamp;
+
+	for_each_clamp_id(clamp_id) {
+		uc_rq[clamp_id] = (struct uclamp_rq) {
+			.value = uclamp_none(clamp_id)
+		};
+	}
+
+	rq->uclamp_flags = 0;
+}
+
 static void __init init_uclamp(void)
 {
 	struct uclamp_se uc_max = {};
@@ -1247,11 +1261,8 @@ static void __init init_uclamp(void)
 
 	mutex_init(&uclamp_mutex);
 
-	for_each_possible_cpu(cpu) {
-		memset(&cpu_rq(cpu)->uclamp, 0,
-				sizeof(struct uclamp_rq)*UCLAMP_CNT);
-		cpu_rq(cpu)->uclamp_flags = 0;
-	}
+	for_each_possible_cpu(cpu)
+		init_uclamp_rq(cpu_rq(cpu));
 
 	for_each_clamp_id(clamp_id) {
 		uclamp_se_set(&init_task.uclamp_req[clamp_id],
-- 
cgit v1.2.3


From 46609ce227039fd192e0ecc7d940bed587fd2c78 Mon Sep 17 00:00:00 2001
From: Qais Yousef <qais.yousef@arm.com>
Date: Tue, 30 Jun 2020 12:21:23 +0100
Subject: sched/uclamp: Protect uclamp fast path code with static key

There is a report that when uclamp is enabled, a netperf UDP test
regresses compared to a kernel compiled without uclamp.

https://lore.kernel.org/lkml/20200529100806.GA3070@suse.de/

While investigating the root cause, there were no sign that the uclamp
code is doing anything particularly expensive but could suffer from bad
cache behavior under certain circumstances that are yet to be
understood.

https://lore.kernel.org/lkml/20200616110824.dgkkbyapn3io6wik@e107158-lin/

To reduce the pressure on the fast path anyway, add a static key that is
by default will skip executing uclamp logic in the
enqueue/dequeue_task() fast path until it's needed.

As soon as the user start using util clamp by:

	1. Changing uclamp value of a task with sched_setattr()
	2. Modifying the default sysctl_sched_util_clamp_{min, max}
	3. Modifying the default cpu.uclamp.{min, max} value in cgroup

We flip the static key now that the user has opted to use util clamp.
Effectively re-introducing uclamp logic in the enqueue/dequeue_task()
fast path. It stays on from that point forward until the next reboot.

This should help minimize the effect of util clamp on workloads that
don't need it but still allow distros to ship their kernels with uclamp
compiled in by default.

SCHED_WARN_ON() in uclamp_rq_dec_id() was removed since now we can end
up with unbalanced call to uclamp_rq_dec_id() if we flip the key while
a task is running in the rq. Since we know it is harmless we just
quietly return if we attempt a uclamp_rq_dec_id() when
rq->uclamp[].bucket[].tasks is 0.

In schedutil, we introduce a new uclamp_is_enabled() helper which takes
the static key into account to ensure RT boosting behavior is retained.

The following results demonstrates how this helps on 2 Sockets Xeon E5
2x10-Cores system.

                                   nouclamp                 uclamp      uclamp-static-key
Hmean     send-64         162.43 (   0.00%)      157.84 *  -2.82%*      163.39 *   0.59%*
Hmean     send-128        324.71 (   0.00%)      314.78 *  -3.06%*      326.18 *   0.45%*
Hmean     send-256        641.55 (   0.00%)      628.67 *  -2.01%*      648.12 *   1.02%*
Hmean     send-1024      2525.28 (   0.00%)     2448.26 *  -3.05%*     2543.73 *   0.73%*
Hmean     send-2048      4836.14 (   0.00%)     4712.08 *  -2.57%*     4867.69 *   0.65%*
Hmean     send-3312      7540.83 (   0.00%)     7425.45 *  -1.53%*     7621.06 *   1.06%*
Hmean     send-4096      9124.53 (   0.00%)     8948.82 *  -1.93%*     9276.25 *   1.66%*
Hmean     send-8192     15589.67 (   0.00%)    15486.35 *  -0.66%*    15819.98 *   1.48%*
Hmean     send-16384    26386.47 (   0.00%)    25752.25 *  -2.40%*    26773.74 *   1.47%*

The perf diff between nouclamp and uclamp-static-key when uclamp is
disabled in the fast path:

     8.73%     -1.55%  [kernel.kallsyms]        [k] try_to_wake_up
     0.07%     +0.04%  [kernel.kallsyms]        [k] deactivate_task
     0.13%     -0.02%  [kernel.kallsyms]        [k] activate_task

The diff between nouclamp and uclamp-static-key when uclamp is enabled
in the fast path:

     8.73%     -0.72%  [kernel.kallsyms]        [k] try_to_wake_up
     0.13%     +0.39%  [kernel.kallsyms]        [k] activate_task
     0.07%     +0.38%  [kernel.kallsyms]        [k] deactivate_task

Fixes: 69842cba9ace ("sched/uclamp: Add CPU's clamp buckets refcounting")
Reported-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Qais Yousef <qais.yousef@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Link: https://lkml.kernel.org/r/20200630112123.12076-3-qais.yousef@arm.com
---
 kernel/sched/core.c              | 74 +++++++++++++++++++++++++++++++++++++++-
 kernel/sched/cpufreq_schedutil.c |  2 +-
 kernel/sched/sched.h             | 47 +++++++++++++++++++++++--
 3 files changed, 119 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9605db70e671..4cf30e4de653 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -796,6 +796,26 @@ unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
 /* All clamps are required to be less or equal than these values */
 static struct uclamp_se uclamp_default[UCLAMP_CNT];
 
+/*
+ * This static key is used to reduce the uclamp overhead in the fast path. It
+ * primarily disables the call to uclamp_rq_{inc, dec}() in
+ * enqueue/dequeue_task().
+ *
+ * This allows users to continue to enable uclamp in their kernel config with
+ * minimum uclamp overhead in the fast path.
+ *
+ * As soon as userspace modifies any of the uclamp knobs, the static key is
+ * enabled, since we have an actual users that make use of uclamp
+ * functionality.
+ *
+ * The knobs that would enable this static key are:
+ *
+ *   * A task modifying its uclamp value with sched_setattr().
+ *   * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
+ *   * An admin modifying the cgroup cpu.uclamp.{min, max}
+ */
+DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
+
 /* Integer rounded range for each bucket */
 #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
 
@@ -992,10 +1012,38 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
 
 	lockdep_assert_held(&rq->lock);
 
+	/*
+	 * If sched_uclamp_used was enabled after task @p was enqueued,
+	 * we could end up with unbalanced call to uclamp_rq_dec_id().
+	 *
+	 * In this case the uc_se->active flag should be false since no uclamp
+	 * accounting was performed at enqueue time and we can just return
+	 * here.
+	 *
+	 * Need to be careful of the following enqeueue/dequeue ordering
+	 * problem too
+	 *
+	 *	enqueue(taskA)
+	 *	// sched_uclamp_used gets enabled
+	 *	enqueue(taskB)
+	 *	dequeue(taskA)
+	 *	// Must not decrement bukcet->tasks here
+	 *	dequeue(taskB)
+	 *
+	 * where we could end up with stale data in uc_se and
+	 * bucket[uc_se->bucket_id].
+	 *
+	 * The following check here eliminates the possibility of such race.
+	 */
+	if (unlikely(!uc_se->active))
+		return;
+
 	bucket = &uc_rq->bucket[uc_se->bucket_id];
+
 	SCHED_WARN_ON(!bucket->tasks);
 	if (likely(bucket->tasks))
 		bucket->tasks--;
+
 	uc_se->active = false;
 
 	/*
@@ -1023,6 +1071,15 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
 {
 	enum uclamp_id clamp_id;
 
+	/*
+	 * Avoid any overhead until uclamp is actually used by the userspace.
+	 *
+	 * The condition is constructed such that a NOP is generated when
+	 * sched_uclamp_used is disabled.
+	 */
+	if (!static_branch_unlikely(&sched_uclamp_used))
+		return;
+
 	if (unlikely(!p->sched_class->uclamp_enabled))
 		return;
 
@@ -1038,6 +1095,15 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
 {
 	enum uclamp_id clamp_id;
 
+	/*
+	 * Avoid any overhead until uclamp is actually used by the userspace.
+	 *
+	 * The condition is constructed such that a NOP is generated when
+	 * sched_uclamp_used is disabled.
+	 */
+	if (!static_branch_unlikely(&sched_uclamp_used))
+		return;
+
 	if (unlikely(!p->sched_class->uclamp_enabled))
 		return;
 
@@ -1146,8 +1212,10 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
 		update_root_tg = true;
 	}
 
-	if (update_root_tg)
+	if (update_root_tg) {
+		static_branch_enable(&sched_uclamp_used);
 		uclamp_update_root_tg();
+	}
 
 	/*
 	 * We update all RUNNABLE tasks only when task groups are in use.
@@ -1212,6 +1280,8 @@ static void __setscheduler_uclamp(struct task_struct *p,
 	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
 		return;
 
+	static_branch_enable(&sched_uclamp_used);
+
 	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
 		uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
 			      attr->sched_util_min, true);
@@ -7436,6 +7506,8 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
 	if (req.ret)
 		return req.ret;
 
+	static_branch_enable(&sched_uclamp_used);
+
 	mutex_lock(&uclamp_mutex);
 	rcu_read_lock();
 
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 7fbaee24c824..dc6835bc6490 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -210,7 +210,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
 	unsigned long dl_util, util, irq;
 	struct rq *rq = cpu_rq(cpu);
 
-	if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) &&
+	if (!uclamp_is_used() &&
 	    type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
 		return max;
 	}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9bef2dd01247..b1432f608061 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -878,6 +878,8 @@ struct uclamp_rq {
 	unsigned int value;
 	struct uclamp_bucket bucket[UCLAMP_BUCKETS];
 };
+
+DECLARE_STATIC_KEY_FALSE(sched_uclamp_used);
 #endif /* CONFIG_UCLAMP_TASK */
 
 /*
@@ -2365,12 +2367,35 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
 #ifdef CONFIG_UCLAMP_TASK
 unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
 
+/**
+ * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values.
+ * @rq:		The rq to clamp against. Must not be NULL.
+ * @util:	The util value to clamp.
+ * @p:		The task to clamp against. Can be NULL if you want to clamp
+ *		against @rq only.
+ *
+ * Clamps the passed @util to the max(@rq, @p) effective uclamp values.
+ *
+ * If sched_uclamp_used static key is disabled, then just return the util
+ * without any clamping since uclamp aggregation at the rq level in the fast
+ * path is disabled, rendering this operation a NOP.
+ *
+ * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It
+ * will return the correct effective uclamp value of the task even if the
+ * static key is disabled.
+ */
 static __always_inline
 unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
 				  struct task_struct *p)
 {
-	unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
-	unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
+	unsigned long min_util;
+	unsigned long max_util;
+
+	if (!static_branch_likely(&sched_uclamp_used))
+		return util;
+
+	min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
+	max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
 
 	if (p) {
 		min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
@@ -2387,6 +2412,19 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
 
 	return clamp(util, min_util, max_util);
 }
+
+/*
+ * When uclamp is compiled in, the aggregation at rq level is 'turned off'
+ * by default in the fast path and only gets turned on once userspace performs
+ * an operation that requires it.
+ *
+ * Returns true if userspace opted-in to use uclamp and aggregation at rq level
+ * hence is active.
+ */
+static inline bool uclamp_is_used(void)
+{
+	return static_branch_likely(&sched_uclamp_used);
+}
 #else /* CONFIG_UCLAMP_TASK */
 static inline
 unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
@@ -2394,6 +2432,11 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
 {
 	return util;
 }
+
+static inline bool uclamp_is_used(void)
+{
+	return false;
+}
 #endif /* CONFIG_UCLAMP_TASK */
 
 #ifdef arch_scale_freq_capacity
-- 
cgit v1.2.3


From 9d246053a69196c7c27068870e9b4b66ac536f68 Mon Sep 17 00:00:00 2001
From: Phil Auld <pauld@redhat.com>
Date: Mon, 29 Jun 2020 15:23:03 -0400
Subject: sched: Add a tracepoint to track rq->nr_running

Add a bare tracepoint trace_sched_update_nr_running_tp which tracks
->nr_running CPU's rq. This is used to accurately trace this data and
provide a visualization of scheduler imbalances in, for example, the
form of a heat map.  The tracepoint is accessed by loading an external
kernel module. An example module (forked from Qais' module and including
the pelt related tracepoints) can be found at:

  https://github.com/auldp/tracepoints-helpers.git

A script to turn the trace-cmd report output into a heatmap plot can be
found at:

  https://github.com/jirvoz/plot-nr-running

The tracepoints are added to add_nr_running() and sub_nr_running() which
are in kernel/sched/sched.h. In order to avoid CREATE_TRACE_POINTS in
the header a wrapper call is used and the trace/events/sched.h include
is moved before sched.h in kernel/sched/core.

Signed-off-by: Phil Auld <pauld@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200629192303.GC120228@lorien.usersys.redhat.com
---
 include/linux/sched.h        |  1 +
 include/trace/events/sched.h |  4 ++++
 kernel/sched/core.c          | 13 +++++++++----
 kernel/sched/fair.c          |  8 ++++++--
 kernel/sched/pelt.c          |  2 --
 kernel/sched/sched.h         | 10 ++++++++++
 6 files changed, 30 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 683372943093..12b10ce51a08 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2044,6 +2044,7 @@ const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq);
 const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq);
 
 int sched_trace_rq_cpu(struct rq *rq);
+int sched_trace_rq_nr_running(struct rq *rq);
 
 const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
 
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 04f9a4c7b0d9..0d5ff0958d48 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -642,6 +642,10 @@ DECLARE_TRACE(sched_util_est_se_tp,
 	TP_PROTO(struct sched_entity *se),
 	TP_ARGS(se));
 
+DECLARE_TRACE(sched_update_nr_running_tp,
+	TP_PROTO(struct rq *rq, int change),
+	TP_ARGS(rq, change));
+
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4cf30e4de653..ff0519551188 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6,6 +6,10 @@
  *
  *  Copyright (C) 1991-2002  Linus Torvalds
  */
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+#undef CREATE_TRACE_POINTS
+
 #include "sched.h"
 
 #include <linux/nospec.h>
@@ -23,9 +27,6 @@
 #include "pelt.h"
 #include "smp.h"
 
-#define CREATE_TRACE_POINTS
-#include <trace/events/sched.h>
-
 /*
  * Export tracepoints that act as a bare tracehook (ie: have no trace event
  * associated with them) to allow external modules to probe them.
@@ -38,6 +39,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
@@ -8195,4 +8197,7 @@ const u32 sched_prio_to_wmult[40] = {
  /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
 
-#undef CREATE_TRACE_POINTS
+void call_trace_sched_update_nr_running(struct rq *rq, int count)
+{
+        trace_sched_update_nr_running_tp(rq, count);
+}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6fab1d17c575..3213cb247aff 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -22,8 +22,6 @@
  */
 #include "sched.h"
 
-#include <trace/events/sched.h>
-
 /*
  * Targeted preemption latency for CPU-bound tasks:
  *
@@ -11296,3 +11294,9 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
 #endif
 }
 EXPORT_SYMBOL_GPL(sched_trace_rd_span);
+
+int sched_trace_rq_nr_running(struct rq *rq)
+{
+        return rq ? rq->nr_running : -1;
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 11bea3b08115..2c613e1cff3a 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -28,8 +28,6 @@
 #include "sched.h"
 #include "pelt.h"
 
-#include <trace/events/sched.h>
-
 /*
  * Approximate:
  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b1432f608061..65b72e0487bf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -76,6 +76,8 @@
 #include "cpupri.h"
 #include "cpudeadline.h"
 
+#include <trace/events/sched.h>
+
 #ifdef CONFIG_SCHED_DEBUG
 # define SCHED_WARN_ON(x)	WARN_ONCE(x, #x)
 #else
@@ -97,6 +99,7 @@ extern atomic_long_t calc_load_tasks;
 extern void calc_global_load_tick(struct rq *this_rq);
 extern long calc_load_fold_active(struct rq *this_rq, long adjust);
 
+extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
 /*
  * Helpers for converting nanosecond timing to jiffy resolution
  */
@@ -1973,6 +1976,9 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
 	unsigned prev_nr = rq->nr_running;
 
 	rq->nr_running = prev_nr + count;
+	if (trace_sched_update_nr_running_tp_enabled()) {
+		call_trace_sched_update_nr_running(rq, count);
+	}
 
 #ifdef CONFIG_SMP
 	if (prev_nr < 2 && rq->nr_running >= 2) {
@@ -1987,6 +1993,10 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
 static inline void sub_nr_running(struct rq *rq, unsigned count)
 {
 	rq->nr_running -= count;
+	if (trace_sched_update_nr_running_tp_enabled()) {
+		call_trace_sched_update_nr_running(rq, count);
+	}
+
 	/* Check if we still need preemption */
 	sched_update_tick_dependency(rq);
 }
-- 
cgit v1.2.3


From 05eee619ed61c8cd89633954d38c4e5653086845 Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@oracle.com>
Date: Wed, 23 Oct 2019 19:16:22 +0800
Subject: x86/kvm: Add "nopvspin" parameter to disable PV spinlocks

There are cases where a guest tries to switch spinlocks to bare metal
behavior (e.g. by setting "xen_nopvspin" on XEN platform and
"hv_nopvspin" on HYPER_V).

That feature is missed on KVM, add a new parameter "nopvspin" to disable
PV spinlocks for KVM guest.

The new 'nopvspin' parameter will also replace Xen and Hyper-V specific
parameters in future patches.

Define variable nopvsin as global because it will be used in future
patches as above.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@oracle.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krcmar <rkrcmar@redhat.com>
Cc: Sean Christopherson <sean.j.christopherson@intel.com>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Wanpeng Li <wanpengli@tencent.com>
Cc: Jim Mattson <jmattson@google.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/admin-guide/kernel-parameters.txt |  5 ++++
 arch/x86/include/asm/qspinlock.h                |  1 +
 arch/x86/kernel/kvm.c                           | 39 ++++++++++++++++++++-----
 kernel/locking/qspinlock.c                      |  7 +++++
 4 files changed, 45 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index fb95fad81c79..6a8934ffdaf6 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5739,6 +5739,11 @@
 			as generic guest with no PV drivers. Currently support
 			XEN HVM, KVM, HYPER_V and VMWARE guest.
 
+	nopvspin	[X86,KVM]
+			Disables the qspinlock slow path using PV optimizations
+			which allow the hypervisor to 'idle' the guest on lock
+			contention.
+
 	xirc2ps_cs=	[NET,PCMCIA]
 			Format:
 			<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index 444d6fd9a6d8..d86ab942219c 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -32,6 +32,7 @@ extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
 extern void __pv_init_lock_hash(void);
 extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
 extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock);
+extern bool nopvspin;
 
 #define	queued_spin_unlock queued_spin_unlock
 /**
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 4ef21a87f1d3..d9995931ea18 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -871,18 +871,36 @@ asm(
  */
 void __init kvm_spinlock_init(void)
 {
-	/* Does host kernel support KVM_FEATURE_PV_UNHALT? */
-	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
+	/*
+	 * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an
+	 * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is
+	 * preferred over native qspinlock when vCPU is preempted.
+	 */
+	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) {
+		pr_info("PV spinlocks disabled, no host support\n");
 		return;
+	}
 
+	/*
+	 * Disable PV spinlocks and use native qspinlock when dedicated pCPUs
+	 * are available.
+	 */
 	if (kvm_para_has_hint(KVM_HINTS_REALTIME)) {
-		static_branch_disable(&virt_spin_lock_key);
-		return;
+		pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n");
+		goto out;
 	}
 
-	/* Don't use the pvqspinlock code if there is only 1 vCPU. */
-	if (num_possible_cpus() == 1)
-		return;
+	if (num_possible_cpus() == 1) {
+		pr_info("PV spinlocks disabled, single CPU\n");
+		goto out;
+	}
+
+	if (nopvspin) {
+		pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n");
+		goto out;
+	}
+
+	pr_info("PV spinlocks enabled\n");
 
 	__pv_init_lock_hash();
 	pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
@@ -895,6 +913,13 @@ void __init kvm_spinlock_init(void)
 		pv_ops.lock.vcpu_is_preempted =
 			PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
 	}
+	/*
+	 * When PV spinlock is enabled which is preferred over
+	 * virt_spin_lock(), virt_spin_lock_key's value is meaningless.
+	 * Just disable it anyway.
+	 */
+out:
+	static_branch_disable(&virt_spin_lock_key);
 }
 
 #endif	/* CONFIG_PARAVIRT_SPINLOCKS */
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index b9515fcc9b29..cbff6ba53d56 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -581,4 +581,11 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
 #include "qspinlock_paravirt.h"
 #include "qspinlock.c"
 
+bool nopvspin __initdata;
+static __init int parse_nopvspin(char *arg)
+{
+	nopvspin = true;
+	return 0;
+}
+early_param("nopvspin", parse_nopvspin);
 #endif
-- 
cgit v1.2.3


From d7481b24b816b8c3955a9eaf01b97e2bd7f61a37 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Fri, 3 Jul 2020 12:56:19 -0400
Subject: audit: issue CWD record to accompany LSM_AUDIT_DATA_* records

The LSM_AUDIT_DATA_* records for PATH, FILE, IOCTL_OP, DENTRY and INODE
are incomplete without the task context of the AUDIT Current Working
Directory record.  Add it.

This record addition can't use audit_dummy_context to determine whether
or not to store the record information since the LSM_AUDIT_DATA_*
records are initiated by various LSMs independent of any audit rules.
context->in_syscall is used to determine if it was called in user
context like audit_getname.

Please see the upstream issue
https://github.com/linux-audit/audit-kernel/issues/96

Adapted from Vladis Dronov's v2 patch.

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/audit.h |  9 ++++++++-
 kernel/auditsc.c      | 17 +++++++++++++++--
 security/lsm_audit.c  |  5 +++++
 3 files changed, 28 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index b5478c64bc69..523f77494847 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -292,7 +292,7 @@ extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1,
 extern void __audit_syscall_exit(int ret_success, long ret_value);
 extern struct filename *__audit_reusename(const __user char *uptr);
 extern void __audit_getname(struct filename *name);
-
+extern void __audit_getcwd(void);
 extern void __audit_inode(struct filename *name, const struct dentry *dentry,
 				unsigned int flags);
 extern void __audit_file(const struct file *);
@@ -351,6 +351,11 @@ static inline void audit_getname(struct filename *name)
 	if (unlikely(!audit_dummy_context()))
 		__audit_getname(name);
 }
+static inline void audit_getcwd(void)
+{
+	if (unlikely(audit_context()))
+		__audit_getcwd();
+}
 static inline void audit_inode(struct filename *name,
 				const struct dentry *dentry,
 				unsigned int aflags) {
@@ -579,6 +584,8 @@ static inline struct filename *audit_reusename(const __user char *name)
 }
 static inline void audit_getname(struct filename *name)
 { }
+static inline void audit_getcwd(void)
+{ }
 static inline void audit_inode(struct filename *name,
 				const struct dentry *dentry,
 				unsigned int aflags)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index eae1a599ffe3..6884b50069d1 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1891,6 +1891,20 @@ __audit_reusename(const __user char *uptr)
 	return NULL;
 }
 
+inline void _audit_getcwd(struct audit_context *context)
+{
+	if (!context->pwd.dentry)
+		get_fs_pwd(current->fs, &context->pwd);
+}
+
+void __audit_getcwd(void)
+{
+	struct audit_context *context = audit_context();
+
+	if (context->in_syscall)
+		_audit_getcwd(context);
+}
+
 /**
  * __audit_getname - add a name to the list
  * @name: name to add
@@ -1915,8 +1929,7 @@ void __audit_getname(struct filename *name)
 	name->aname = n;
 	name->refcnt++;
 
-	if (!context->pwd.dentry)
-		get_fs_pwd(current->fs, &context->pwd);
+	_audit_getcwd(context);
 }
 
 static inline int audit_copy_fcaps(struct audit_names *name,
diff --git a/security/lsm_audit.c b/security/lsm_audit.c
index 2d2bf49016f4..7c555621c2bd 100644
--- a/security/lsm_audit.c
+++ b/security/lsm_audit.c
@@ -241,6 +241,7 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 			audit_log_untrustedstring(ab, inode->i_sb->s_id);
 			audit_log_format(ab, " ino=%lu", inode->i_ino);
 		}
+		audit_getcwd();
 		break;
 	}
 	case LSM_AUDIT_DATA_FILE: {
@@ -254,6 +255,7 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 			audit_log_untrustedstring(ab, inode->i_sb->s_id);
 			audit_log_format(ab, " ino=%lu", inode->i_ino);
 		}
+		audit_getcwd();
 		break;
 	}
 	case LSM_AUDIT_DATA_IOCTL_OP: {
@@ -269,6 +271,7 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 		}
 
 		audit_log_format(ab, " ioctlcmd=0x%hx", a->u.op->cmd);
+		audit_getcwd();
 		break;
 	}
 	case LSM_AUDIT_DATA_DENTRY: {
@@ -283,6 +286,7 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 			audit_log_untrustedstring(ab, inode->i_sb->s_id);
 			audit_log_format(ab, " ino=%lu", inode->i_ino);
 		}
+		audit_getcwd();
 		break;
 	}
 	case LSM_AUDIT_DATA_INODE: {
@@ -300,6 +304,7 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 		audit_log_format(ab, " dev=");
 		audit_log_untrustedstring(ab, inode->i_sb->s_id);
 		audit_log_format(ab, " ino=%lu", inode->i_ino);
+		audit_getcwd();
 		break;
 	}
 	case LSM_AUDIT_DATA_TASK: {
-- 
cgit v1.2.3


From 746cf3459f118592a72ef42e7551777ff17b1684 Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@linux.alibaba.com>
Date: Fri, 3 Jul 2020 10:06:09 +0800
Subject: tracing: Simplify defining of the next event id

The value to be used and compared in trace_search_list() is "last + 1".
Let's just define next to be "last + 1" instead of doing the addition
each time.

Link: https://lkml.kernel.org/r/20200703020612.12930-2-richard.weiyang@linux.alibaba.com

Signed-off-by: Wei Yang <richard.weiyang@linux.alibaba.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 73976de7f8cc..a35232d61601 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -675,11 +675,11 @@ static LIST_HEAD(ftrace_event_list);
 static int trace_search_list(struct list_head **list)
 {
 	struct trace_event *e;
-	int last = __TRACE_LAST_TYPE;
+	int next = __TRACE_LAST_TYPE + 1;
 
 	if (list_empty(&ftrace_event_list)) {
 		*list = &ftrace_event_list;
-		return last + 1;
+		return next;
 	}
 
 	/*
@@ -687,17 +687,17 @@ static int trace_search_list(struct list_head **list)
 	 * lets see if somebody freed one.
 	 */
 	list_for_each_entry(e, &ftrace_event_list, list) {
-		if (e->type != last + 1)
+		if (e->type != next)
 			break;
-		last++;
+		next++;
 	}
 
 	/* Did we used up all 65 thousand events??? */
-	if ((last + 1) > TRACE_EVENT_TYPE_MAX)
+	if (next > TRACE_EVENT_TYPE_MAX)
 		return 0;
 
 	*list = &e->list;
-	return last + 1;
+	return next;
 }
 
 void trace_event_read_lock(void)
-- 
cgit v1.2.3


From 36b8aacf2a483ba40fbe91c830d314e0bc133044 Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@linux.alibaba.com>
Date: Fri, 3 Jul 2020 10:06:10 +0800
Subject: tracing: Save one trace_event->type by using __TRACE_LAST_TYPE

Static defined trace_event->type stops at (__TRACE_LAST_TYPE - 1) and
dynamic trace_event->type starts from (__TRACE_LAST_TYPE + 1).

To save one trace_event->type index, let's use __TRACE_LAST_TYPE.

Link: https://lkml.kernel.org/r/20200703020612.12930-3-richard.weiyang@linux.alibaba.com

Signed-off-by: Wei Yang <richard.weiyang@linux.alibaba.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index a35232d61601..4d1893564912 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -20,7 +20,7 @@ DECLARE_RWSEM(trace_event_sem);
 
 static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
-static int next_event_type = __TRACE_LAST_TYPE + 1;
+static int next_event_type = __TRACE_LAST_TYPE;
 
 enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
 {
@@ -675,7 +675,7 @@ static LIST_HEAD(ftrace_event_list);
 static int trace_search_list(struct list_head **list)
 {
 	struct trace_event *e;
-	int next = __TRACE_LAST_TYPE + 1;
+	int next = __TRACE_LAST_TYPE;
 
 	if (list_empty(&ftrace_event_list)) {
 		*list = &ftrace_event_list;
-- 
cgit v1.2.3


From 248591f5d257a19c1cba9ab9da3536bfbc2f0149 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Wed, 24 Jun 2020 13:32:46 +0200
Subject: kcsan: Make KCSAN compatible with new IRQ state tracking

The new IRQ state tracking code does not honor lockdep_off(), and as
such we should again permit tracing by using non-raw functions in
core.c. Update the lockdep_off() comment in report.c, to reflect the
fact there is still a potential risk of deadlock due to using printk()
from scheduler code.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20200624113246.GA170324@elver.google.com
---
 kernel/kcsan/core.c   | 5 ++---
 kernel/kcsan/report.c | 9 +++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 15f67949d11e..732623c30359 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -397,8 +397,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
 	}
 
 	if (!kcsan_interrupt_watcher)
-		/* Use raw to avoid lockdep recursion via IRQ flags tracing. */
-		raw_local_irq_save(irq_flags);
+		local_irq_save(irq_flags);
 
 	watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write);
 	if (watchpoint == NULL) {
@@ -539,7 +538,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
 	kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS);
 out_unlock:
 	if (!kcsan_interrupt_watcher)
-		raw_local_irq_restore(irq_flags);
+		local_irq_restore(irq_flags);
 out:
 	user_access_restore(ua_flags);
 }
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index ac5f8345bae9..6b2fb1a6d8cd 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -606,10 +606,11 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type,
 		goto out;
 
 	/*
-	 * With TRACE_IRQFLAGS, lockdep's IRQ trace state becomes corrupted if
-	 * we do not turn off lockdep here; this could happen due to recursion
-	 * into lockdep via KCSAN if we detect a race in utilities used by
-	 * lockdep.
+	 * Because we may generate reports when we're in scheduler code, the use
+	 * of printk() could deadlock. Until such time that all printing code
+	 * called in print_report() is scheduler-safe, accept the risk, and just
+	 * get our message out. As such, also disable lockdep to hide the
+	 * warning, and avoid disabling lockdep for the rest of the kernel.
 	 */
 	lockdep_off();
 
-- 
cgit v1.2.3


From 859d069ee1ddd87862e1d6a356a82ed417dbeb67 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 27 May 2020 15:00:57 +0200
Subject: lockdep: Prepare for NMI IRQ state tracking

There is no reason not to always, accurately, track IRQ state.

This change also makes IRQ state tracking ignore lockdep_off().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20200623083721.155449112@infradead.org
---
 kernel/locking/lockdep.c | 46 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 29a8de4c50b9..d595623c4b34 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -395,7 +395,7 @@ void lockdep_init_task(struct task_struct *task)
 
 static __always_inline void lockdep_recursion_finish(void)
 {
-	if (WARN_ON_ONCE(--current->lockdep_recursion))
+	if (WARN_ON_ONCE((--current->lockdep_recursion) & LOCKDEP_RECURSION_MASK))
 		current->lockdep_recursion = 0;
 }
 
@@ -3646,7 +3646,16 @@ static void __trace_hardirqs_on_caller(void)
  */
 void lockdep_hardirqs_on_prepare(unsigned long ip)
 {
-	if (unlikely(!debug_locks || current->lockdep_recursion))
+	if (unlikely(!debug_locks))
+		return;
+
+	/*
+	 * NMIs do not (and cannot) track lock dependencies, nothing to do.
+	 */
+	if (unlikely(in_nmi()))
+		return;
+
+	if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK))
 		return;
 
 	if (unlikely(current->hardirqs_enabled)) {
@@ -3692,7 +3701,27 @@ void noinstr lockdep_hardirqs_on(unsigned long ip)
 {
 	struct task_struct *curr = current;
 
-	if (unlikely(!debug_locks || curr->lockdep_recursion))
+	if (unlikely(!debug_locks))
+		return;
+
+	/*
+	 * NMIs can happen in the middle of local_irq_{en,dis}able() where the
+	 * tracking state and hardware state are out of sync.
+	 *
+	 * NMIs must save lockdep_hardirqs_enabled() to restore IRQ state from,
+	 * and not rely on hardware state like normal interrupts.
+	 */
+	if (unlikely(in_nmi())) {
+		/*
+		 * Skip:
+		 *  - recursion check, because NMI can hit lockdep;
+		 *  - hardware state check, because above;
+		 *  - chain_key check, see lockdep_hardirqs_on_prepare().
+		 */
+		goto skip_checks;
+	}
+
+	if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK))
 		return;
 
 	if (curr->hardirqs_enabled) {
@@ -3720,6 +3749,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip)
 	DEBUG_LOCKS_WARN_ON(current->hardirq_chain_key !=
 			    current->curr_chain_key);
 
+skip_checks:
 	/* we'll do an OFF -> ON transition: */
 	curr->hardirqs_enabled = 1;
 	curr->hardirq_enable_ip = ip;
@@ -3735,7 +3765,15 @@ void noinstr lockdep_hardirqs_off(unsigned long ip)
 {
 	struct task_struct *curr = current;
 
-	if (unlikely(!debug_locks || curr->lockdep_recursion))
+	if (unlikely(!debug_locks))
+		return;
+
+	/*
+	 * Matching lockdep_hardirqs_on(), allow NMIs in the middle of lockdep;
+	 * they will restore the software state. This ensures the software
+	 * state is consistent inside NMIs as well.
+	 */
+	if (unlikely(!in_nmi() && (current->lockdep_recursion & LOCKDEP_RECURSION_MASK)))
 		return;
 
 	/*
-- 
cgit v1.2.3


From a21ee6055c30ce68c4e201c6496f0ed2a1936230 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 25 May 2020 12:22:41 +0200
Subject: lockdep: Change hardirq{s_enabled,_context} to per-cpu variables

Currently all IRQ-tracking state is in task_struct, this means that
task_struct needs to be defined before we use it.

Especially for lockdep_assert_irq*() this can lead to header-hell.

Move the hardirq state into per-cpu variables to avoid the task_struct
dependency.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20200623083721.512673481@infradead.org
---
 include/linux/irqflags.h | 19 ++++++++++++-------
 include/linux/lockdep.h  | 34 ++++++++++++++++++----------------
 include/linux/sched.h    |  2 --
 kernel/fork.c            |  4 +---
 kernel/locking/lockdep.c | 30 +++++++++++++++---------------
 kernel/softirq.c         |  6 ++++++
 6 files changed, 52 insertions(+), 43 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 6384d2813ded..255444fe4609 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -14,6 +14,7 @@
 
 #include <linux/typecheck.h>
 #include <asm/irqflags.h>
+#include <asm/percpu.h>
 
 /* Currently lockdep_softirqs_on/off is used only by lockdep */
 #ifdef CONFIG_PROVE_LOCKING
@@ -31,18 +32,22 @@
 #endif
 
 #ifdef CONFIG_TRACE_IRQFLAGS
+
+DECLARE_PER_CPU(int, hardirqs_enabled);
+DECLARE_PER_CPU(int, hardirq_context);
+
   extern void trace_hardirqs_on_prepare(void);
   extern void trace_hardirqs_off_finish(void);
   extern void trace_hardirqs_on(void);
   extern void trace_hardirqs_off(void);
-# define lockdep_hardirq_context(p)	((p)->hardirq_context)
+# define lockdep_hardirq_context(p)	(this_cpu_read(hardirq_context))
 # define lockdep_softirq_context(p)	((p)->softirq_context)
-# define lockdep_hardirqs_enabled(p)	((p)->hardirqs_enabled)
+# define lockdep_hardirqs_enabled(p)	(this_cpu_read(hardirqs_enabled))
 # define lockdep_softirqs_enabled(p)	((p)->softirqs_enabled)
-# define lockdep_hardirq_enter()		\
-do {						\
-	if (!current->hardirq_context++)	\
-		current->hardirq_threaded = 0;	\
+# define lockdep_hardirq_enter()			\
+do {							\
+	if (this_cpu_inc_return(hardirq_context) == 1)	\
+		current->hardirq_threaded = 0;		\
 } while (0)
 # define lockdep_hardirq_threaded()		\
 do {						\
@@ -50,7 +55,7 @@ do {						\
 } while (0)
 # define lockdep_hardirq_exit()			\
 do {						\
-	current->hardirq_context--;		\
+	this_cpu_dec(hardirq_context);		\
 } while (0)
 # define lockdep_softirq_enter()		\
 do {						\
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 3b73cf84f77d..be6cb17a8879 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -11,6 +11,7 @@
 #define __LINUX_LOCKDEP_H
 
 #include <linux/lockdep_types.h>
+#include <asm/percpu.h>
 
 struct task_struct;
 
@@ -529,28 +530,29 @@ do {									\
 	lock_release(&(lock)->dep_map, _THIS_IP_);			\
 } while (0)
 
-#define lockdep_assert_irqs_enabled()	do {				\
-		WARN_ONCE(debug_locks && !current->lockdep_recursion &&	\
-			  !current->hardirqs_enabled,			\
-			  "IRQs not enabled as expected\n");		\
-	} while (0)
+DECLARE_PER_CPU(int, hardirqs_enabled);
+DECLARE_PER_CPU(int, hardirq_context);
 
-#define lockdep_assert_irqs_disabled()	do {				\
-		WARN_ONCE(debug_locks && !current->lockdep_recursion &&	\
-			  current->hardirqs_enabled,			\
-			  "IRQs not disabled as expected\n");		\
-	} while (0)
+#define lockdep_assert_irqs_enabled()					\
+do {									\
+	WARN_ON_ONCE(debug_locks && !this_cpu_read(hardirqs_enabled));	\
+} while (0)
 
-#define lockdep_assert_in_irq() do {					\
-		WARN_ONCE(debug_locks && !current->lockdep_recursion &&	\
-			  !current->hardirq_context,			\
-			  "Not in hardirq as expected\n");		\
-	} while (0)
+#define lockdep_assert_irqs_disabled()					\
+do {									\
+	WARN_ON_ONCE(debug_locks && this_cpu_read(hardirqs_enabled));	\
+} while (0)
+
+#define lockdep_assert_in_irq()						\
+do {									\
+	WARN_ON_ONCE(debug_locks && !this_cpu_read(hardirq_context));	\
+} while (0)
 
 #else
 # define might_lock(lock) do { } while (0)
 # define might_lock_read(lock) do { } while (0)
 # define might_lock_nested(lock, subclass) do { } while (0)
+
 # define lockdep_assert_irqs_enabled() do { } while (0)
 # define lockdep_assert_irqs_disabled() do { } while (0)
 # define lockdep_assert_in_irq() do { } while (0)
@@ -560,7 +562,7 @@ do {									\
 
 # define lockdep_assert_RT_in_threaded_ctx() do {			\
 		WARN_ONCE(debug_locks && !current->lockdep_recursion &&	\
-			  current->hardirq_context &&			\
+			  lockdep_hardirq_context(current) &&		\
 			  !(current->hardirq_threaded || current->irq_config),	\
 			  "Not in threaded context on PREEMPT_RT as expected\n");	\
 } while (0)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 692e327d7455..3903a9500926 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -990,8 +990,6 @@ struct task_struct {
 	unsigned long			hardirq_disable_ip;
 	unsigned int			hardirq_enable_event;
 	unsigned int			hardirq_disable_event;
-	int				hardirqs_enabled;
-	int				hardirq_context;
 	u64				hardirq_chain_key;
 	unsigned long			softirq_disable_ip;
 	unsigned long			softirq_enable_ip;
diff --git a/kernel/fork.c b/kernel/fork.c
index efc5493203ae..70d9d0a4de2a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1954,8 +1954,8 @@ static __latent_entropy struct task_struct *copy_process(
 
 	rt_mutex_init_task(p);
 
+	lockdep_assert_irqs_enabled();
 #ifdef CONFIG_PROVE_LOCKING
-	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
 	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
 #endif
 	retval = -EAGAIN;
@@ -2036,7 +2036,6 @@ static __latent_entropy struct task_struct *copy_process(
 #endif
 #ifdef CONFIG_TRACE_IRQFLAGS
 	p->irq_events = 0;
-	p->hardirqs_enabled = 0;
 	p->hardirq_enable_ip = 0;
 	p->hardirq_enable_event = 0;
 	p->hardirq_disable_ip = _THIS_IP_;
@@ -2046,7 +2045,6 @@ static __latent_entropy struct task_struct *copy_process(
 	p->softirq_enable_event = 0;
 	p->softirq_disable_ip = 0;
 	p->softirq_disable_event = 0;
-	p->hardirq_context = 0;
 	p->softirq_context = 0;
 #endif
 
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index d595623c4b34..ab4ffbe0e9e9 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2062,9 +2062,9 @@ print_bad_irq_dependency(struct task_struct *curr,
 	pr_warn("-----------------------------------------------------\n");
 	pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
 		curr->comm, task_pid_nr(curr),
-		curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
+		lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
 		curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT,
-		curr->hardirqs_enabled,
+		lockdep_hardirqs_enabled(curr),
 		curr->softirqs_enabled);
 	print_lock(next);
 
@@ -3658,7 +3658,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip)
 	if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK))
 		return;
 
-	if (unlikely(current->hardirqs_enabled)) {
+	if (unlikely(lockdep_hardirqs_enabled(current))) {
 		/*
 		 * Neither irq nor preemption are disabled here
 		 * so this is racy by nature but losing one hit
@@ -3686,7 +3686,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip)
 	 * Can't allow enabling interrupts while in an interrupt handler,
 	 * that's general bad form and such. Recursion, limited stack etc..
 	 */
-	if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
+	if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context(current)))
 		return;
 
 	current->hardirq_chain_key = current->curr_chain_key;
@@ -3724,7 +3724,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip)
 	if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK))
 		return;
 
-	if (curr->hardirqs_enabled) {
+	if (lockdep_hardirqs_enabled(curr)) {
 		/*
 		 * Neither irq nor preemption are disabled here
 		 * so this is racy by nature but losing one hit
@@ -3751,7 +3751,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip)
 
 skip_checks:
 	/* we'll do an OFF -> ON transition: */
-	curr->hardirqs_enabled = 1;
+	this_cpu_write(hardirqs_enabled, 1);
 	curr->hardirq_enable_ip = ip;
 	curr->hardirq_enable_event = ++curr->irq_events;
 	debug_atomic_inc(hardirqs_on_events);
@@ -3783,11 +3783,11 @@ void noinstr lockdep_hardirqs_off(unsigned long ip)
 	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
 		return;
 
-	if (curr->hardirqs_enabled) {
+	if (lockdep_hardirqs_enabled(curr)) {
 		/*
 		 * We have done an ON -> OFF transition:
 		 */
-		curr->hardirqs_enabled = 0;
+		this_cpu_write(hardirqs_enabled, 0);
 		curr->hardirq_disable_ip = ip;
 		curr->hardirq_disable_event = ++curr->irq_events;
 		debug_atomic_inc(hardirqs_off_events);
@@ -3832,7 +3832,7 @@ void lockdep_softirqs_on(unsigned long ip)
 	 * usage bit for all held locks, if hardirqs are
 	 * enabled too:
 	 */
-	if (curr->hardirqs_enabled)
+	if (lockdep_hardirqs_enabled(curr))
 		mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ);
 	lockdep_recursion_finish();
 }
@@ -3881,7 +3881,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
 	 */
 	if (!hlock->trylock) {
 		if (hlock->read) {
-			if (curr->hardirq_context)
+			if (lockdep_hardirq_context(curr))
 				if (!mark_lock(curr, hlock,
 						LOCK_USED_IN_HARDIRQ_READ))
 					return 0;
@@ -3890,7 +3890,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
 						LOCK_USED_IN_SOFTIRQ_READ))
 					return 0;
 		} else {
-			if (curr->hardirq_context)
+			if (lockdep_hardirq_context(curr))
 				if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
 					return 0;
 			if (curr->softirq_context)
@@ -3928,7 +3928,7 @@ lock_used:
 
 static inline unsigned int task_irq_context(struct task_struct *task)
 {
-	return LOCK_CHAIN_HARDIRQ_CONTEXT * !!task->hardirq_context +
+	return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context(task) +
 	       LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context;
 }
 
@@ -4021,7 +4021,7 @@ static inline short task_wait_context(struct task_struct *curr)
 	 * Set appropriate wait type for the context; for IRQs we have to take
 	 * into account force_irqthread as that is implied by PREEMPT_RT.
 	 */
-	if (curr->hardirq_context) {
+	if (lockdep_hardirq_context(curr)) {
 		/*
 		 * Check if force_irqthreads will run us threaded.
 		 */
@@ -4864,11 +4864,11 @@ static void check_flags(unsigned long flags)
 		return;
 
 	if (irqs_disabled_flags(flags)) {
-		if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) {
+		if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled(current))) {
 			printk("possible reason: unannotated irqs-off.\n");
 		}
 	} else {
-		if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) {
+		if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled(current))) {
 			printk("possible reason: unannotated irqs-on.\n");
 		}
 	}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index c4201b7f42b1..342c53feaa7a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -107,6 +107,12 @@ static bool ksoftirqd_running(unsigned long pending)
  * where hardirqs are disabled legitimately:
  */
 #ifdef CONFIG_TRACE_IRQFLAGS
+
+DEFINE_PER_CPU(int, hardirqs_enabled);
+DEFINE_PER_CPU(int, hardirq_context);
+EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled);
+EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context);
+
 void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
 {
 	unsigned long flags;
-- 
cgit v1.2.3


From f9ad4a5f3f20bee022b1bdde94e5ece6dc0b0edc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 27 May 2020 13:03:26 +0200
Subject: lockdep: Remove lockdep_hardirq{s_enabled,_context}() argument

Now that the macros use per-cpu data, we no longer need the argument.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20200623083721.571835311@infradead.org
---
 arch/x86/entry/common.c        |  2 +-
 include/linux/irqflags.h       |  8 ++++----
 include/linux/lockdep.h        |  2 +-
 kernel/locking/lockdep.c       | 30 +++++++++++++++---------------
 kernel/softirq.c               |  2 +-
 tools/include/linux/irqflags.h |  4 ++--
 6 files changed, 24 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 63c607dd6c52..4ea640363f5d 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -758,7 +758,7 @@ noinstr void idtentry_exit_user(struct pt_regs *regs)
 
 noinstr bool idtentry_enter_nmi(struct pt_regs *regs)
 {
-	bool irq_state = lockdep_hardirqs_enabled(current);
+	bool irq_state = lockdep_hardirqs_enabled();
 
 	__nmi_enter();
 	lockdep_hardirqs_off(CALLER_ADDR0);
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 255444fe4609..5811ee8a5cd8 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -40,9 +40,9 @@ DECLARE_PER_CPU(int, hardirq_context);
   extern void trace_hardirqs_off_finish(void);
   extern void trace_hardirqs_on(void);
   extern void trace_hardirqs_off(void);
-# define lockdep_hardirq_context(p)	(this_cpu_read(hardirq_context))
+# define lockdep_hardirq_context()	(this_cpu_read(hardirq_context))
 # define lockdep_softirq_context(p)	((p)->softirq_context)
-# define lockdep_hardirqs_enabled(p)	(this_cpu_read(hardirqs_enabled))
+# define lockdep_hardirqs_enabled()	(this_cpu_read(hardirqs_enabled))
 # define lockdep_softirqs_enabled(p)	((p)->softirqs_enabled)
 # define lockdep_hardirq_enter()			\
 do {							\
@@ -109,9 +109,9 @@ do {						\
 # define trace_hardirqs_off_finish()		do { } while (0)
 # define trace_hardirqs_on()		do { } while (0)
 # define trace_hardirqs_off()		do { } while (0)
-# define lockdep_hardirq_context(p)	0
+# define lockdep_hardirq_context()	0
 # define lockdep_softirq_context(p)	0
-# define lockdep_hardirqs_enabled(p)	0
+# define lockdep_hardirqs_enabled()	0
 # define lockdep_softirqs_enabled(p)	0
 # define lockdep_hardirq_enter()	do { } while (0)
 # define lockdep_hardirq_threaded()	do { } while (0)
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index be6cb17a8879..fd04b9e96091 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -562,7 +562,7 @@ do {									\
 
 # define lockdep_assert_RT_in_threaded_ctx() do {			\
 		WARN_ONCE(debug_locks && !current->lockdep_recursion &&	\
-			  lockdep_hardirq_context(current) &&		\
+			  lockdep_hardirq_context() &&			\
 			  !(current->hardirq_threaded || current->irq_config),	\
 			  "Not in threaded context on PREEMPT_RT as expected\n");	\
 } while (0)
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index ab4ffbe0e9e9..c9ea05edce25 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2062,9 +2062,9 @@ print_bad_irq_dependency(struct task_struct *curr,
 	pr_warn("-----------------------------------------------------\n");
 	pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
 		curr->comm, task_pid_nr(curr),
-		lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
+		lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT,
 		curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT,
-		lockdep_hardirqs_enabled(curr),
+		lockdep_hardirqs_enabled(),
 		curr->softirqs_enabled);
 	print_lock(next);
 
@@ -3331,9 +3331,9 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
 
 	pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
 		curr->comm, task_pid_nr(curr),
-		lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
+		lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT,
 		lockdep_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
-		lockdep_hardirqs_enabled(curr),
+		lockdep_hardirqs_enabled(),
 		lockdep_softirqs_enabled(curr));
 	print_lock(this);
 
@@ -3658,7 +3658,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip)
 	if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK))
 		return;
 
-	if (unlikely(lockdep_hardirqs_enabled(current))) {
+	if (unlikely(lockdep_hardirqs_enabled())) {
 		/*
 		 * Neither irq nor preemption are disabled here
 		 * so this is racy by nature but losing one hit
@@ -3686,7 +3686,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip)
 	 * Can't allow enabling interrupts while in an interrupt handler,
 	 * that's general bad form and such. Recursion, limited stack etc..
 	 */
-	if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context(current)))
+	if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context()))
 		return;
 
 	current->hardirq_chain_key = current->curr_chain_key;
@@ -3724,7 +3724,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip)
 	if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK))
 		return;
 
-	if (lockdep_hardirqs_enabled(curr)) {
+	if (lockdep_hardirqs_enabled()) {
 		/*
 		 * Neither irq nor preemption are disabled here
 		 * so this is racy by nature but losing one hit
@@ -3783,7 +3783,7 @@ void noinstr lockdep_hardirqs_off(unsigned long ip)
 	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
 		return;
 
-	if (lockdep_hardirqs_enabled(curr)) {
+	if (lockdep_hardirqs_enabled()) {
 		/*
 		 * We have done an ON -> OFF transition:
 		 */
@@ -3832,7 +3832,7 @@ void lockdep_softirqs_on(unsigned long ip)
 	 * usage bit for all held locks, if hardirqs are
 	 * enabled too:
 	 */
-	if (lockdep_hardirqs_enabled(curr))
+	if (lockdep_hardirqs_enabled())
 		mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ);
 	lockdep_recursion_finish();
 }
@@ -3881,7 +3881,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
 	 */
 	if (!hlock->trylock) {
 		if (hlock->read) {
-			if (lockdep_hardirq_context(curr))
+			if (lockdep_hardirq_context())
 				if (!mark_lock(curr, hlock,
 						LOCK_USED_IN_HARDIRQ_READ))
 					return 0;
@@ -3890,7 +3890,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
 						LOCK_USED_IN_SOFTIRQ_READ))
 					return 0;
 		} else {
-			if (lockdep_hardirq_context(curr))
+			if (lockdep_hardirq_context())
 				if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
 					return 0;
 			if (curr->softirq_context)
@@ -3928,7 +3928,7 @@ lock_used:
 
 static inline unsigned int task_irq_context(struct task_struct *task)
 {
-	return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context(task) +
+	return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context() +
 	       LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context;
 }
 
@@ -4021,7 +4021,7 @@ static inline short task_wait_context(struct task_struct *curr)
 	 * Set appropriate wait type for the context; for IRQs we have to take
 	 * into account force_irqthread as that is implied by PREEMPT_RT.
 	 */
-	if (lockdep_hardirq_context(curr)) {
+	if (lockdep_hardirq_context()) {
 		/*
 		 * Check if force_irqthreads will run us threaded.
 		 */
@@ -4864,11 +4864,11 @@ static void check_flags(unsigned long flags)
 		return;
 
 	if (irqs_disabled_flags(flags)) {
-		if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled(current))) {
+		if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled())) {
 			printk("possible reason: unannotated irqs-off.\n");
 		}
 	} else {
-		if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled(current))) {
+		if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled())) {
 			printk("possible reason: unannotated irqs-on.\n");
 		}
 	}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 342c53feaa7a..5e9aaa648a74 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -230,7 +230,7 @@ static inline bool lockdep_softirq_start(void)
 {
 	bool in_hardirq = false;
 
-	if (lockdep_hardirq_context(current)) {
+	if (lockdep_hardirq_context()) {
 		in_hardirq = true;
 		lockdep_hardirq_exit();
 	}
diff --git a/tools/include/linux/irqflags.h b/tools/include/linux/irqflags.h
index 67e01bbadbfe..501262aee8ff 100644
--- a/tools/include/linux/irqflags.h
+++ b/tools/include/linux/irqflags.h
@@ -2,9 +2,9 @@
 #ifndef _LIBLOCKDEP_LINUX_TRACE_IRQFLAGS_H_
 #define _LIBLOCKDEP_LINUX_TRACE_IRQFLAGS_H_
 
-# define lockdep_hardirq_context(p)	0
+# define lockdep_hardirq_context()	0
 # define lockdep_softirq_context(p)	0
-# define lockdep_hardirqs_enabled(p)	0
+# define lockdep_hardirqs_enabled()	0
 # define lockdep_softirqs_enabled(p)	0
 # define lockdep_hardirq_enter()	do { } while (0)
 # define lockdep_hardirq_exit()		do { } while (0)
-- 
cgit v1.2.3


From c818c03b661cd769e035e41673d5543ba2ebda64 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 13 May 2020 14:11:26 -0700
Subject: seccomp: Report number of loaded filters in /proc/$pid/status

A common question asked when debugging seccomp filters is "how many
filters are attached to your process?" Provide a way to easily answer
this question through /proc/$pid/status with a "Seccomp_filters" line.

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 fs/proc/array.c         | 2 ++
 include/linux/seccomp.h | 2 ++
 init/init_task.c        | 3 +++
 kernel/seccomp.c        | 3 +++
 4 files changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 55ecbeb3a721..65ec2029fa80 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -341,6 +341,8 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
 	seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p));
 #ifdef CONFIG_SECCOMP
 	seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
+	seq_put_decimal_ull(m, "\nSeccomp_filters:\t",
+			    atomic_read(&p->seccomp.filter_count));
 #endif
 	seq_puts(m, "\nSpeculation_Store_Bypass:\t");
 	switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) {
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 4192369b8418..2ec2720f83cc 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -13,6 +13,7 @@
 #ifdef CONFIG_SECCOMP
 
 #include <linux/thread_info.h>
+#include <linux/atomic.h>
 #include <asm/seccomp.h>
 
 struct seccomp_filter;
@@ -29,6 +30,7 @@ struct seccomp_filter;
  */
 struct seccomp {
 	int mode;
+	atomic_t filter_count;
 	struct seccomp_filter *filter;
 };
 
diff --git a/init/init_task.c b/init/init_task.c
index 15089d15010a..a3eb3847e1f4 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -204,6 +204,9 @@ struct task_struct init_task
 #ifdef CONFIG_SECURITY
 	.security	= NULL,
 #endif
+#ifdef CONFIG_SECCOMP
+	.seccomp	= { .filter_count = ATOMIC_INIT(0) },
+#endif
 };
 EXPORT_SYMBOL(init_task);
 
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index d653d8426de9..f387e5004c29 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -398,6 +398,8 @@ static inline void seccomp_sync_threads(unsigned long flags)
 		put_seccomp_filter(thread);
 		smp_store_release(&thread->seccomp.filter,
 				  caller->seccomp.filter);
+		atomic_set(&thread->seccomp.filter_count,
+			   atomic_read(&thread->seccomp.filter_count));
 
 		/*
 		 * Don't let an unprivileged task work around
@@ -544,6 +546,7 @@ static long seccomp_attach_filter(unsigned int flags,
 	 */
 	filter->prev = current->seccomp.filter;
 	current->seccomp.filter = filter;
+	atomic_inc(&current->seccomp.filter_count);
 
 	/* Now that the new filter is in place, synchronize to all threads. */
 	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
-- 
cgit v1.2.3


From 9f87dcf14b82b05ff0e26970439b372ae135de0c Mon Sep 17 00:00:00 2001
From: Sargun Dhillon <sargun@sargun.me>
Date: Mon, 1 Jun 2020 04:25:32 -0700
Subject: seccomp: Add find_notification helper

This adds a helper which can iterate through a seccomp_filter to
find a notification matching an ID. It removes several replicated
chunks of code.

Signed-off-by: Sargun Dhillon <sargun@sargun.me>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: Tycho Andersen <tycho@tycho.ws>
Cc: Matt Denton <mpdenton@google.com>
Cc: Kees Cook <keescook@google.com>,
Cc: Jann Horn <jannh@google.com>,
Cc: Robert Sesek <rsesek@google.com>,
Cc: Chris Palmer <palmer@google.com>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
Cc: Tycho Andersen <tycho@tycho.ws>
Link: https://lore.kernel.org/r/20200601112532.150158-1-sargun@sargun.me
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 kernel/seccomp.c | 53 +++++++++++++++++++++++++++--------------------------
 1 file changed, 27 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index f387e5004c29..e711d697afb6 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -41,6 +41,7 @@
 #include <linux/tracehook.h>
 #include <linux/uaccess.h>
 #include <linux/anon_inodes.h>
+#include <linux/lockdep.h>
 
 enum notify_state {
 	SECCOMP_NOTIFY_INIT,
@@ -1024,6 +1025,23 @@ static int seccomp_notify_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+/* must be called with notif_lock held */
+static inline struct seccomp_knotif *
+find_notification(struct seccomp_filter *filter, u64 id)
+{
+	struct seccomp_knotif *cur;
+
+	lockdep_assert_held(&filter->notify_lock);
+
+	list_for_each_entry(cur, &filter->notif->notifications, list) {
+		if (cur->id == id)
+			return cur;
+	}
+
+	return NULL;
+}
+
+
 static long seccomp_notify_recv(struct seccomp_filter *filter,
 				void __user *buf)
 {
@@ -1081,15 +1099,8 @@ out:
 		 * may have died when we released the lock, so we need to make
 		 * sure it's still around.
 		 */
-		knotif = NULL;
 		mutex_lock(&filter->notify_lock);
-		list_for_each_entry(cur, &filter->notif->notifications, list) {
-			if (cur->id == unotif.id) {
-				knotif = cur;
-				break;
-			}
-		}
-
+		knotif = find_notification(filter, unotif.id);
 		if (knotif) {
 			knotif->state = SECCOMP_NOTIFY_INIT;
 			up(&filter->notif->request);
@@ -1104,7 +1115,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
 				void __user *buf)
 {
 	struct seccomp_notif_resp resp = {};
-	struct seccomp_knotif *knotif = NULL, *cur;
+	struct seccomp_knotif *knotif;
 	long ret;
 
 	if (copy_from_user(&resp, buf, sizeof(resp)))
@@ -1121,13 +1132,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
 	if (ret < 0)
 		return ret;
 
-	list_for_each_entry(cur, &filter->notif->notifications, list) {
-		if (cur->id == resp.id) {
-			knotif = cur;
-			break;
-		}
-	}
-
+	knotif = find_notification(filter, resp.id);
 	if (!knotif) {
 		ret = -ENOENT;
 		goto out;
@@ -1153,7 +1158,7 @@ out:
 static long seccomp_notify_id_valid(struct seccomp_filter *filter,
 				    void __user *buf)
 {
-	struct seccomp_knotif *knotif = NULL;
+	struct seccomp_knotif *knotif;
 	u64 id;
 	long ret;
 
@@ -1164,16 +1169,12 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter,
 	if (ret < 0)
 		return ret;
 
-	ret = -ENOENT;
-	list_for_each_entry(knotif, &filter->notif->notifications, list) {
-		if (knotif->id == id) {
-			if (knotif->state == SECCOMP_NOTIFY_SENT)
-				ret = 0;
-			goto out;
-		}
-	}
+	knotif = find_notification(filter, id);
+	if (knotif && knotif->state == SECCOMP_NOTIFY_SENT)
+		ret = 0;
+	else
+		ret = -ENOENT;
 
-out:
 	mutex_unlock(&filter->notify_lock);
 	return ret;
 }
-- 
cgit v1.2.3


From b707ddee11d1dc4518ab7f1aa5e7af9ceaa23317 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Sun, 31 May 2020 13:50:28 +0200
Subject: seccomp: rename "usage" to "refs" and document

Naming the lifetime counter of a seccomp filter "usage" suggests a
little too strongly that its about tasks that are using this filter
while it also tracks other references such as the user notifier or
ptrace. This also updates the documentation to note this fact.

We'll be introducing an actual usage counter in a follow-up patch.

Cc: Tycho Andersen <tycho@tycho.ws>
Cc: Kees Cook <keescook@chromium.org>
Cc: Matt Denton <mpdenton@google.com>
Cc: Sargun Dhillon <sargun@sargun.me>
Cc: Jann Horn <jannh@google.com>
Cc: Chris Palmer <palmer@google.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Robert Sesek <rsesek@google.com>
Cc: Jeffrey Vander Stoep <jeffv@google.com>
Cc: Linux Containers <containers@lists.linux-foundation.org>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Link: https://lore.kernel.org/r/20200531115031.391515-1-christian.brauner@ubuntu.com
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 kernel/seccomp.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e711d697afb6..d4dd3344e312 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -107,10 +107,11 @@ struct notification {
 /**
  * struct seccomp_filter - container for seccomp BPF programs
  *
- * @usage: reference count to manage the object lifetime.
- *         get/put helpers should be used when accessing an instance
- *         outside of a lifetime-guarded section.  In general, this
- *         is only needed for handling filters shared across tasks.
+ * @refs: Reference count to manage the object lifetime.
+ *	  A filter's reference count is incremented for each directly
+ *	  attached task, once for the dependent filter, and if
+ *	  requested for the user notifier. When @refs reaches zero,
+ *	  the filter can be freed.
  * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
  * @prev: points to a previously installed, or inherited, filter
  * @prog: the BPF program to evaluate
@@ -125,10 +126,10 @@ struct notification {
  * how namespaces work.
  *
  * seccomp_filter objects should never be modified after being attached
- * to a task_struct (other than @usage).
+ * to a task_struct (other than @refs).
  */
 struct seccomp_filter {
-	refcount_t usage;
+	refcount_t refs;
 	bool log;
 	struct seccomp_filter *prev;
 	struct bpf_prog *prog;
@@ -464,7 +465,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 		return ERR_PTR(ret);
 	}
 
-	refcount_set(&sfilter->usage, 1);
+	refcount_set(&sfilter->refs, 1);
 
 	return sfilter;
 }
@@ -558,7 +559,7 @@ static long seccomp_attach_filter(unsigned int flags,
 
 static void __get_seccomp_filter(struct seccomp_filter *filter)
 {
-	refcount_inc(&filter->usage);
+	refcount_inc(&filter->refs);
 }
 
 /* get_seccomp_filter - increments the reference count of the filter on @tsk */
@@ -581,7 +582,7 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter)
 static void __put_seccomp_filter(struct seccomp_filter *orig)
 {
 	/* Clean up single-reference branches iteratively. */
-	while (orig && refcount_dec_and_test(&orig->usage)) {
+	while (orig && refcount_dec_and_test(&orig->refs)) {
 		struct seccomp_filter *freeme = orig;
 		orig = orig->prev;
 		seccomp_filter_free(freeme);
-- 
cgit v1.2.3


From 3a15fb6ed92cb32b0a83f406aa4a96f28c9adbc3 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Sun, 31 May 2020 13:50:29 +0200
Subject: seccomp: release filter after task is fully dead

The seccomp filter used to be released in free_task() which is called
asynchronously via call_rcu() and assorted mechanisms. Since we need
to inform tasks waiting on the seccomp notifier when a filter goes empty
we will notify them as soon as a task has been marked fully dead in
release_task(). To not split seccomp cleanup into two parts, move
filter release out of free_task() and into release_task() after we've
unhashed struct task from struct pid, exited signals, and unlinked it
from the threadgroups' thread list. We'll put the empty filter
notification infrastructure into it in a follow up patch.

This also renames put_seccomp_filter() to seccomp_filter_release() which
is a more descriptive name of what we're doing here especially once
we've added the empty filter notification mechanism in there.

We're also NULL-ing the task's filter tree entrypoint which seems
cleaner than leaving a dangling pointer in there. Note that this shouldn't
need any memory barriers since we're calling this when the task is in
release_task() which means it's EXIT_DEAD. So it can't modify its seccomp
filters anymore. You can also see this from the point where we're calling
seccomp_filter_release(). It's after __exit_signal() and at this point,
tsk->sighand will already have been NULLed which is required for
thread-sync and filter installation alike.

Cc: Tycho Andersen <tycho@tycho.ws>
Cc: Kees Cook <keescook@chromium.org>
Cc: Matt Denton <mpdenton@google.com>
Cc: Sargun Dhillon <sargun@sargun.me>
Cc: Jann Horn <jannh@google.com>
Cc: Chris Palmer <palmer@google.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Robert Sesek <rsesek@google.com>
Cc: Jeffrey Vander Stoep <jeffv@google.com>
Cc: Linux Containers <containers@lists.linux-foundation.org>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Link: https://lore.kernel.org/r/20200531115031.391515-2-christian.brauner@ubuntu.com
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/seccomp.h |  4 ++--
 kernel/exit.c           |  1 +
 kernel/fork.c           |  1 -
 kernel/seccomp.c        | 62 +++++++++++++++++++++++++++++--------------------
 4 files changed, 40 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 2ec2720f83cc..babcd6c02d09 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -84,10 +84,10 @@ static inline int seccomp_mode(struct seccomp *s)
 #endif /* CONFIG_SECCOMP */
 
 #ifdef CONFIG_SECCOMP_FILTER
-extern void put_seccomp_filter(struct task_struct *tsk);
+extern void seccomp_filter_release(struct task_struct *tsk);
 extern void get_seccomp_filter(struct task_struct *tsk);
 #else  /* CONFIG_SECCOMP_FILTER */
-static inline void put_seccomp_filter(struct task_struct *tsk)
+static inline void seccomp_filter_release(struct task_struct *tsk)
 {
 	return;
 }
diff --git a/kernel/exit.c b/kernel/exit.c
index 727150f28103..00d77e5ba700 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -217,6 +217,7 @@ repeat:
 	}
 
 	write_unlock_irq(&tasklist_lock);
+	seccomp_filter_release(p);
 	proc_flush_pid(thread_pid);
 	put_pid(thread_pid);
 	release_thread(p);
diff --git a/kernel/fork.c b/kernel/fork.c
index 142b23645d82..c51a9cd824c5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -473,7 +473,6 @@ void free_task(struct task_struct *tsk)
 #endif
 	rt_mutex_debug_task_free(tsk);
 	ftrace_graph_exit_task(tsk);
-	put_seccomp_filter(tsk);
 	arch_release_task_struct(tsk);
 	if (tsk->flags & PF_KTHREAD)
 		free_kthread_struct(tsk);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index d4dd3344e312..0ca6d5243427 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -368,6 +368,42 @@ static inline pid_t seccomp_can_sync_threads(void)
 	return 0;
 }
 
+static inline void seccomp_filter_free(struct seccomp_filter *filter)
+{
+	if (filter) {
+		bpf_prog_destroy(filter->prog);
+		kfree(filter);
+	}
+}
+
+static void __put_seccomp_filter(struct seccomp_filter *orig)
+{
+	/* Clean up single-reference branches iteratively. */
+	while (orig && refcount_dec_and_test(&orig->refs)) {
+		struct seccomp_filter *freeme = orig;
+		orig = orig->prev;
+		seccomp_filter_free(freeme);
+	}
+}
+
+/**
+ * seccomp_filter_release - Detach the task from its filter tree
+ *			    and drop its reference count during
+ *			    exit.
+ *
+ * This function should only be called when the task is exiting as
+ * it detaches it from its filter tree. As such, READ_ONCE() and
+ * barriers are not needed here, as would normally be needed.
+ */
+void seccomp_filter_release(struct task_struct *tsk)
+{
+	struct seccomp_filter *orig = tsk->seccomp.filter;
+
+	/* Detach task from its filter tree. */
+	tsk->seccomp.filter = NULL;
+	__put_seccomp_filter(orig);
+}
+
 /**
  * seccomp_sync_threads: sets all threads to use current's filter
  *
@@ -397,7 +433,7 @@ static inline void seccomp_sync_threads(unsigned long flags)
 		 * current's path will hold a reference.  (This also
 		 * allows a put before the assignment.)
 		 */
-		put_seccomp_filter(thread);
+		__put_seccomp_filter(thread->seccomp.filter);
 		smp_store_release(&thread->seccomp.filter,
 				  caller->seccomp.filter);
 		atomic_set(&thread->seccomp.filter_count,
@@ -571,30 +607,6 @@ void get_seccomp_filter(struct task_struct *tsk)
 	__get_seccomp_filter(orig);
 }
 
-static inline void seccomp_filter_free(struct seccomp_filter *filter)
-{
-	if (filter) {
-		bpf_prog_destroy(filter->prog);
-		kfree(filter);
-	}
-}
-
-static void __put_seccomp_filter(struct seccomp_filter *orig)
-{
-	/* Clean up single-reference branches iteratively. */
-	while (orig && refcount_dec_and_test(&orig->refs)) {
-		struct seccomp_filter *freeme = orig;
-		orig = orig->prev;
-		seccomp_filter_free(freeme);
-	}
-}
-
-/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
-void put_seccomp_filter(struct task_struct *tsk)
-{
-	__put_seccomp_filter(tsk->seccomp.filter);
-}
-
 static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason)
 {
 	clear_siginfo(info);
-- 
cgit v1.2.3


From 76194c4e830d570d9e369d637bb907591d2b3111 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Mon, 1 Jun 2020 11:50:07 -0700
Subject: seccomp: Lift wait_queue into struct seccomp_filter

Lift the wait_queue from struct notification into struct seccomp_filter.
This is cleaner overall and lets us avoid having to take the notifier
mutex in the future for EPOLLHUP notifications since we need to neither
read nor modify the notifier specific aspects of the seccomp filter. In
the exit path I'd very much like to avoid having to take the notifier mutex
for each filter in the task's filter hierarchy.

Cc: Tycho Andersen <tycho@tycho.ws>
Cc: Kees Cook <keescook@chromium.org>
Cc: Matt Denton <mpdenton@google.com>
Cc: Sargun Dhillon <sargun@sargun.me>
Cc: Jann Horn <jannh@google.com>
Cc: Chris Palmer <palmer@google.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Robert Sesek <rsesek@google.com>
Cc: Jeffrey Vander Stoep <jeffv@google.com>
Cc: Linux Containers <containers@lists.linux-foundation.org>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 kernel/seccomp.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 0ca6d5243427..39c1c687d8ad 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -95,13 +95,11 @@ struct seccomp_knotif {
  *           filter->notify_lock.
  * @next_id: The id of the next request.
  * @notifications: A list of struct seccomp_knotif elements.
- * @wqh: A wait queue for poll.
  */
 struct notification {
 	struct semaphore request;
 	u64 next_id;
 	struct list_head notifications;
-	wait_queue_head_t wqh;
 };
 
 /**
@@ -117,6 +115,7 @@ struct notification {
  * @prog: the BPF program to evaluate
  * @notif: the struct that holds all notification related information
  * @notify_lock: A lock for all notification-related accesses.
+ * @wqh: A wait queue for poll if a notifier is in use.
  *
  * seccomp_filter objects are organized in a tree linked via the @prev
  * pointer.  For any task, it appears to be a singly-linked list starting
@@ -135,6 +134,7 @@ struct seccomp_filter {
 	struct bpf_prog *prog;
 	struct notification *notif;
 	struct mutex notify_lock;
+	wait_queue_head_t wqh;
 };
 
 /* Limit any path through the tree to 256KB worth of instructions. */
@@ -502,6 +502,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 	}
 
 	refcount_set(&sfilter->refs, 1);
+	init_waitqueue_head(&sfilter->wqh);
 
 	return sfilter;
 }
@@ -774,7 +775,7 @@ static int seccomp_do_user_notification(int this_syscall,
 	list_add(&n.list, &match->notif->notifications);
 
 	up(&match->notif->request);
-	wake_up_poll(&match->notif->wqh, EPOLLIN | EPOLLRDNORM);
+	wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
 	mutex_unlock(&match->notify_lock);
 
 	/*
@@ -1098,7 +1099,7 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
 	unotif.data = *(knotif->data);
 
 	knotif->state = SECCOMP_NOTIFY_SENT;
-	wake_up_poll(&filter->notif->wqh, EPOLLOUT | EPOLLWRNORM);
+	wake_up_poll(&filter->wqh, EPOLLOUT | EPOLLWRNORM);
 	ret = 0;
 out:
 	mutex_unlock(&filter->notify_lock);
@@ -1217,7 +1218,7 @@ static __poll_t seccomp_notify_poll(struct file *file,
 	__poll_t ret = 0;
 	struct seccomp_knotif *cur;
 
-	poll_wait(file, &filter->notif->wqh, poll_tab);
+	poll_wait(file, &filter->wqh, poll_tab);
 
 	if (mutex_lock_interruptible(&filter->notify_lock) < 0)
 		return EPOLLERR;
@@ -1261,7 +1262,6 @@ static struct file *init_listener(struct seccomp_filter *filter)
 	sema_init(&filter->notif->request, 0);
 	filter->notif->next_id = get_random_u64();
 	INIT_LIST_HEAD(&filter->notif->notifications);
-	init_waitqueue_head(&filter->notif->wqh);
 
 	ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops,
 				 filter, O_RDWR);
-- 
cgit v1.2.3


From 99cdb8b9a57393b5978e7a6310a2cba511dd179b Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Sun, 31 May 2020 13:50:30 +0200
Subject: seccomp: notify about unused filter

We've been making heavy use of the seccomp notifier to intercept and
handle certain syscalls for containers. This patch allows a syscall
supervisor listening on a given notifier to be notified when a seccomp
filter has become unused.

A container is often managed by a singleton supervisor process the
so-called "monitor". This monitor process has an event loop which has
various event handlers registered. If the user specified a seccomp
profile that included a notifier for various syscalls then we also
register a seccomp notify even handler. For any container using a
separate pid namespace the lifecycle of the seccomp notifier is bound to
the init process of the pid namespace, i.e. when the init process exits
the filter must be unused.

If a new process attaches to a container we force it to assume a seccomp
profile. This can either be the same seccomp profile as the container
was started with or a modified one. If the attaching process makes use
of the seccomp notifier we will register a new seccomp notifier handler
in the monitor's event loop. However, when the attaching process exits
we can't simply delete the handler since other child processes could've
been created (daemons spawned etc.) that have inherited the seccomp
filter and so we need to keep the seccomp notifier fd alive in the event
loop. But this is problematic since we don't get a notification when the
seccomp filter has become unused and so we currently never remove the
seccomp notifier fd from the event loop and just keep accumulating fds
in the event loop. We've had this issue for a while but it has recently
become more pressing as more and larger users make use of this.

To fix this, we introduce a new "users" reference counter that tracks any
tasks and dependent filters making use of a filter. When a notifier is
registered waiting tasks will be notified that the filter is now empty
by receiving a (E)POLLHUP event.

The concept in this patch introduces is the same as for signal_struct,
i.e. reference counting for life-cycle management is decoupled from
reference counting taks using the object. There's probably some trickery
possible but the second counter is just the correct way of doing this
IMHO and has precedence.

Cc: Tycho Andersen <tycho@tycho.ws>
Cc: Kees Cook <keescook@chromium.org>
Cc: Matt Denton <mpdenton@google.com>
Cc: Sargun Dhillon <sargun@sargun.me>
Cc: Jann Horn <jannh@google.com>
Cc: Chris Palmer <palmer@google.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Robert Sesek <rsesek@google.com>
Cc: Jeffrey Vander Stoep <jeffv@google.com>
Cc: Linux Containers <containers@lists.linux-foundation.org>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Link: https://lore.kernel.org/r/20200531115031.391515-3-christian.brauner@ubuntu.com
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 kernel/seccomp.c | 44 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 39c1c687d8ad..ba30bffa9301 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -110,6 +110,14 @@ struct notification {
  *	  attached task, once for the dependent filter, and if
  *	  requested for the user notifier. When @refs reaches zero,
  *	  the filter can be freed.
+ * @users: A filter's @users count is incremented for each directly
+ *         attached task (filter installation, fork(), thread_sync),
+ *	   and once for the dependent filter (tracked in filter->prev).
+ *	   When it reaches zero it indicates that no direct or indirect
+ *	   users of that filter exist. No new tasks can get associated with
+ *	   this filter after reaching 0. The @users count is always smaller
+ *	   or equal to @refs. Hence, reaching 0 for @users does not mean
+ *	   the filter can be freed.
  * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
  * @prev: points to a previously installed, or inherited, filter
  * @prog: the BPF program to evaluate
@@ -129,6 +137,7 @@ struct notification {
  */
 struct seccomp_filter {
 	refcount_t refs;
+	refcount_t users;
 	bool log;
 	struct seccomp_filter *prev;
 	struct bpf_prog *prog;
@@ -376,6 +385,15 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter)
 	}
 }
 
+static void __seccomp_filter_orphan(struct seccomp_filter *orig)
+{
+	while (orig && refcount_dec_and_test(&orig->users)) {
+		if (waitqueue_active(&orig->wqh))
+			wake_up_poll(&orig->wqh, EPOLLHUP);
+		orig = orig->prev;
+	}
+}
+
 static void __put_seccomp_filter(struct seccomp_filter *orig)
 {
 	/* Clean up single-reference branches iteratively. */
@@ -386,10 +404,18 @@ static void __put_seccomp_filter(struct seccomp_filter *orig)
 	}
 }
 
+static void __seccomp_filter_release(struct seccomp_filter *orig)
+{
+	/* Notify about any unused filters in the task's former filter tree. */
+	__seccomp_filter_orphan(orig);
+	/* Finally drop all references to the task's former tree. */
+	__put_seccomp_filter(orig);
+}
+
 /**
- * seccomp_filter_release - Detach the task from its filter tree
- *			    and drop its reference count during
- *			    exit.
+ * seccomp_filter_release - Detach the task from its filter tree,
+ *			    drop its reference count, and notify
+ *			    about unused filters
  *
  * This function should only be called when the task is exiting as
  * it detaches it from its filter tree. As such, READ_ONCE() and
@@ -401,7 +427,7 @@ void seccomp_filter_release(struct task_struct *tsk)
 
 	/* Detach task from its filter tree. */
 	tsk->seccomp.filter = NULL;
-	__put_seccomp_filter(orig);
+	__seccomp_filter_release(orig);
 }
 
 /**
@@ -428,12 +454,15 @@ static inline void seccomp_sync_threads(unsigned long flags)
 
 		/* Get a task reference for the new leaf node. */
 		get_seccomp_filter(caller);
+
 		/*
 		 * Drop the task reference to the shared ancestor since
 		 * current's path will hold a reference.  (This also
 		 * allows a put before the assignment.)
 		 */
-		__put_seccomp_filter(thread->seccomp.filter);
+		__seccomp_filter_release(thread->seccomp.filter);
+
+		/* Make our new filter tree visible. */
 		smp_store_release(&thread->seccomp.filter,
 				  caller->seccomp.filter);
 		atomic_set(&thread->seccomp.filter_count,
@@ -502,6 +531,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 	}
 
 	refcount_set(&sfilter->refs, 1);
+	refcount_set(&sfilter->users, 1);
 	init_waitqueue_head(&sfilter->wqh);
 
 	return sfilter;
@@ -606,6 +636,7 @@ void get_seccomp_filter(struct task_struct *tsk)
 	if (!orig)
 		return;
 	__get_seccomp_filter(orig);
+	refcount_inc(&orig->users);
 }
 
 static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason)
@@ -1234,6 +1265,9 @@ static __poll_t seccomp_notify_poll(struct file *file,
 
 	mutex_unlock(&filter->notify_lock);
 
+	if (refcount_read(&filter->users) == 0)
+		ret |= EPOLLHUP;
+
 	return ret;
 }
 
-- 
cgit v1.2.3


From e68f9d49dda1744d548426c8b4335a8d693a36d0 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 15 Jun 2020 22:02:56 -0700
Subject: seccomp: Use pr_fmt

Avoid open-coding "seccomp: " prefixes for pr_*() calls.

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 kernel/seccomp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index ba30bffa9301..5f0e3f3a7a5d 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -13,6 +13,7 @@
  * Mode 2 allows user-defined system call filters in the form
  *        of Berkeley Packet Filters/Linux Socket Filters.
  */
+#define pr_fmt(fmt) "seccomp: " fmt
 
 #include <linux/refcount.h>
 #include <linux/audit.h>
@@ -1873,7 +1874,7 @@ static int __init seccomp_sysctl_init(void)
 
 	hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table);
 	if (!hdr)
-		pr_warn("seccomp: sysctl registration failed\n");
+		pr_warn("sysctl registration failed\n");
 	else
 		kmemleak_not_leak(hdr);
 
-- 
cgit v1.2.3


From 47e33c05f9f07cac3de833e531bcac9ae052c7ca Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 15 Jun 2020 15:42:46 -0700
Subject: seccomp: Fix ioctl number for SECCOMP_IOCTL_NOTIF_ID_VALID

When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced it had the wrong
direction flag set. While this isn't a big deal as nothing currently
enforces these bits in the kernel, it should be defined correctly. Fix
the define and provide support for the old command until it is no longer
needed for backward compatibility.

Fixes: 6a21cc50f0c7 ("seccomp: add a return code to trap to userspace")
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/uapi/linux/seccomp.h                  | 3 ++-
 kernel/seccomp.c                              | 9 +++++++++
 tools/testing/selftests/seccomp/seccomp_bpf.c | 2 +-
 3 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index c1735455bc53..965290f7dcc2 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -123,5 +123,6 @@ struct seccomp_notif_resp {
 #define SECCOMP_IOCTL_NOTIF_RECV	SECCOMP_IOWR(0, struct seccomp_notif)
 #define SECCOMP_IOCTL_NOTIF_SEND	SECCOMP_IOWR(1,	\
 						struct seccomp_notif_resp)
-#define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOR(2, __u64)
+#define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOW(2, __u64)
+
 #endif /* _UAPI_LINUX_SECCOMP_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 5f0e3f3a7a5d..0ed57e8c49d0 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -44,6 +44,14 @@
 #include <linux/anon_inodes.h>
 #include <linux/lockdep.h>
 
+/*
+ * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the
+ * wrong direction flag in the ioctl number. This is the broken one,
+ * which the kernel needs to keep supporting until all userspaces stop
+ * using the wrong command number.
+ */
+#define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR	SECCOMP_IOR(2, __u64)
+
 enum notify_state {
 	SECCOMP_NOTIFY_INIT,
 	SECCOMP_NOTIFY_SENT,
@@ -1236,6 +1244,7 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
 		return seccomp_notify_recv(filter, buf);
 	case SECCOMP_IOCTL_NOTIF_SEND:
 		return seccomp_notify_send(filter, buf);
+	case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
 	case SECCOMP_IOCTL_NOTIF_ID_VALID:
 		return seccomp_notify_id_valid(filter, buf);
 	default:
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 43884b6625fc..61f9ac200001 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -186,7 +186,7 @@ struct seccomp_metadata {
 #define SECCOMP_IOCTL_NOTIF_RECV	SECCOMP_IOWR(0, struct seccomp_notif)
 #define SECCOMP_IOCTL_NOTIF_SEND	SECCOMP_IOWR(1,	\
 						struct seccomp_notif_resp)
-#define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOR(2, __u64)
+#define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOW(2, __u64)
 
 struct seccomp_notif {
 	__u64 id;
-- 
cgit v1.2.3


From fe4bfff86ec54773df3db79e8112e3b0f820c799 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 19 Jun 2020 12:20:15 -0700
Subject: seccomp: Use -1 marker for end of mode 1 syscall list

The terminator for the mode 1 syscalls list was a 0, but that could be
a valid syscall number (e.g. x86_64 __NR_read). By luck, __NR_read was
listed first and the loop construct would not test it, so there was no
bug. However, this is fragile. Replace the terminator with -1 instead,
and make the variable name for mode 1 syscall lists more descriptive.

Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Will Drewry <wad@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 arch/mips/include/asm/seccomp.h |  4 ++--
 include/asm-generic/seccomp.h   |  2 +-
 kernel/seccomp.c                | 10 +++++-----
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/arch/mips/include/asm/seccomp.h b/arch/mips/include/asm/seccomp.h
index e383d7e27b93..aa809589a181 100644
--- a/arch/mips/include/asm/seccomp.h
+++ b/arch/mips/include/asm/seccomp.h
@@ -9,12 +9,12 @@ static inline const int *get_compat_mode1_syscalls(void)
 	static const int syscalls_O32[] = {
 		__NR_O32_Linux + 3, __NR_O32_Linux + 4,
 		__NR_O32_Linux + 1, __NR_O32_Linux + 193,
-		0, /* null terminated */
+		-1, /* negative terminated */
 	};
 	static const int syscalls_N32[] = {
 		__NR_N32_Linux + 0, __NR_N32_Linux + 1,
 		__NR_N32_Linux + 58, __NR_N32_Linux + 211,
-		0, /* null terminated */
+		-1, /* negative terminated */
 	};
 
 	if (IS_ENABLED(CONFIG_MIPS32_O32) && test_thread_flag(TIF_32BIT_REGS))
diff --git a/include/asm-generic/seccomp.h b/include/asm-generic/seccomp.h
index 1321ac7821d7..6b6f42bc58f9 100644
--- a/include/asm-generic/seccomp.h
+++ b/include/asm-generic/seccomp.h
@@ -33,7 +33,7 @@ static inline const int *get_compat_mode1_syscalls(void)
 	static const int mode1_syscalls_32[] = {
 		__NR_seccomp_read_32, __NR_seccomp_write_32,
 		__NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,
-		0, /* null terminated */
+		-1, /* negative terminated */
 	};
 	return mode1_syscalls_32;
 }
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 0ed57e8c49d0..866a432cd746 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -742,20 +742,20 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
  */
 static const int mode1_syscalls[] = {
 	__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
-	0, /* null terminated */
+	-1, /* negative terminated */
 };
 
 static void __secure_computing_strict(int this_syscall)
 {
-	const int *syscall_whitelist = mode1_syscalls;
+	const int *allowed_syscalls = mode1_syscalls;
 #ifdef CONFIG_COMPAT
 	if (in_compat_syscall())
-		syscall_whitelist = get_compat_mode1_syscalls();
+		allowed_syscalls = get_compat_mode1_syscalls();
 #endif
 	do {
-		if (*syscall_whitelist == this_syscall)
+		if (*allowed_syscalls == this_syscall)
 			return;
-	} while (*++syscall_whitelist);
+	} while (*++allowed_syscalls != -1);
 
 #ifdef SECCOMP_DEBUG
 	dump_stack();
-- 
cgit v1.2.3


From bc885f1ab6de0d38c6956a71b0126543b64875b0 Mon Sep 17 00:00:00 2001
From: Bruno Meneguele <bmeneg@redhat.com>
Date: Fri, 10 Jul 2020 14:44:23 -0300
Subject: doc:kmsg: explicitly state the return value in case of SEEK_CUR

The commit 625d3449788f ("Revert "kernel/printk: add kmsg SEEK_CUR
handling"") reverted a change done to the return value in case a SEEK_CUR
operation was performed for kmsg buffer based on the fact that different
userspace apps were handling the new return value (-ESPIPE) in different
ways, breaking them.

At the same time -ESPIPE was the wrong decision because kmsg /does support/
seek() but doesn't follow the "normal" behavior userspace is used to.
Because of that and also considering the time -EINVAL has been used, it was
decided to keep this way to avoid more userspace breakage.

This patch adds an official statement to the kmsg documentation pointing to
the current return value for SEEK_CUR, -EINVAL, thus userspace libraries
and apps can refer to it for a definitive guide on what to expect.

Signed-off-by: Bruno Meneguele <bmeneg@redhat.com>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20200710174423.10480-1-bmeneg@redhat.com
---
 Documentation/ABI/testing/dev-kmsg | 11 +++++++++++
 kernel/printk/printk.c             |  8 ++++++++
 2 files changed, 19 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/ABI/testing/dev-kmsg b/Documentation/ABI/testing/dev-kmsg
index 1e6c28b1942b..a917efc289a2 100644
--- a/Documentation/ABI/testing/dev-kmsg
+++ b/Documentation/ABI/testing/dev-kmsg
@@ -61,6 +61,17 @@ Description:	The /dev/kmsg character device node provides userspace access
 		SEEK_CUR is not supported, returning -ESPIPE (invalid seek) to
 		errno whenever requested.
 
+		Other seek operations or offsets are not supported because of
+		the special behavior this device has. The device allows to read
+		or write only whole variable length messages (records) that are
+		stored in a ring buffer.
+
+		Because of the non-standard behavior also the error values are
+		non-standard. -ESPIPE is returned for non-zero offset. -EINVAL
+		is returned for other operations, e.g. SEEK_CUR. This behavior
+		and values are historical and could not be modified without the
+		risk of breaking userspace.
+
 		The output format consists of a prefix carrying the syslog
 		prefix including priority and facility, the 64 bit message
 		sequence number and the monotonic timestamp in microseconds,
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 8c14835be46c..5f6eca65dd9a 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -943,6 +943,14 @@ out:
 	return ret;
 }
 
+/*
+ * Be careful when modifying this function!!!
+ *
+ * Only few operations are supported because the device works only with the
+ * entire variable length messages (records). Non-standard values are
+ * returned in the other cases and has been this way for quite some time.
+ * User space applications might depend on this behavior.
+ */
 static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct devkmsg_user *user = file->private_data;
-- 
cgit v1.2.3


From c9a0f3b85e09dd16665b639cb884490410619434 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Sat, 11 Jul 2020 23:53:24 +0200
Subject: bpf: Resolve BTF IDs in vmlinux image

Using BTF_ID_LIST macro to define lists for several helpers
using BTF arguments.

And running resolve_btfids on vmlinux elf object during linking,
so the .BTF_ids section gets the IDs resolved.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200711215329.41165-5-jolsa@kernel.org
---
 Makefile                 | 3 ++-
 kernel/bpf/stackmap.c    | 5 ++++-
 kernel/trace/bpf_trace.c | 9 +++++++--
 net/core/filter.c        | 9 +++++++--
 scripts/link-vmlinux.sh  | 6 ++++++
 5 files changed, 26 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/Makefile b/Makefile
index 017e775b3288..462f1a75fcb3 100644
--- a/Makefile
+++ b/Makefile
@@ -448,6 +448,7 @@ OBJSIZE		= $(CROSS_COMPILE)size
 STRIP		= $(CROSS_COMPILE)strip
 endif
 PAHOLE		= pahole
+RESOLVE_BTFIDS	= $(objtree)/tools/bpf/resolve_btfids/resolve_btfids
 LEX		= flex
 YACC		= bison
 AWK		= awk
@@ -510,7 +511,7 @@ GCC_PLUGINS_CFLAGS :=
 CLANG_FLAGS :=
 
 export ARCH SRCARCH CONFIG_SHELL BASH HOSTCC KBUILD_HOSTCFLAGS CROSS_COMPILE LD CC
-export CPP AR NM STRIP OBJCOPY OBJDUMP OBJSIZE READELF PAHOLE LEX YACC AWK INSTALLKERNEL
+export CPP AR NM STRIP OBJCOPY OBJDUMP OBJSIZE READELF PAHOLE RESOLVE_BTFIDS LEX YACC AWK INSTALLKERNEL
 export PERL PYTHON PYTHON3 CHECK CHECKFLAGS MAKE UTS_MACHINE HOSTCXX
 export KGZIP KBZIP2 KLZOP LZMA LZ4 XZ
 export KBUILD_HOSTCXXFLAGS KBUILD_HOSTLDFLAGS KBUILD_HOSTLDLIBS LDFLAGS_MODULE
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index a6c361ed7937..48d8e739975f 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -9,6 +9,7 @@
 #include <linux/elf.h>
 #include <linux/pagemap.h>
 #include <linux/irq_work.h>
+#include <linux/btf_ids.h>
 #include "percpu_freelist.h"
 
 #define STACK_CREATE_FLAG_MASK					\
@@ -576,7 +577,9 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
 	return __bpf_get_stack(regs, task, buf, size, flags);
 }
 
-static int bpf_get_task_stack_btf_ids[5];
+BTF_ID_LIST(bpf_get_task_stack_btf_ids)
+BTF_ID(struct, task_struct)
+
 const struct bpf_func_proto bpf_get_task_stack_proto = {
 	.func		= bpf_get_task_stack,
 	.gpl_only	= false,
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index e0b7775039ab..e178e8e32b33 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -13,6 +13,7 @@
 #include <linux/kprobes.h>
 #include <linux/syscalls.h>
 #include <linux/error-injection.h>
+#include <linux/btf_ids.h>
 
 #include <asm/tlb.h>
 
@@ -710,7 +711,9 @@ out:
 	return err;
 }
 
-static int bpf_seq_printf_btf_ids[5];
+BTF_ID_LIST(bpf_seq_printf_btf_ids)
+BTF_ID(struct, seq_file)
+
 static const struct bpf_func_proto bpf_seq_printf_proto = {
 	.func		= bpf_seq_printf,
 	.gpl_only	= true,
@@ -728,7 +731,9 @@ BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len)
 	return seq_write(m, data, len) ? -EOVERFLOW : 0;
 }
 
-static int bpf_seq_write_btf_ids[5];
+BTF_ID_LIST(bpf_seq_write_btf_ids)
+BTF_ID(struct, seq_file)
+
 static const struct bpf_func_proto bpf_seq_write_proto = {
 	.func		= bpf_seq_write,
 	.gpl_only	= true,
diff --git a/net/core/filter.c b/net/core/filter.c
index ddcc0d6209e1..4e572441e64a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -75,6 +75,7 @@
 #include <net/ipv6_stubs.h>
 #include <net/bpf_sk_storage.h>
 #include <net/transp_v6.h>
+#include <linux/btf_ids.h>
 
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
@@ -3779,7 +3780,9 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = {
 	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
-static int bpf_skb_output_btf_ids[5];
+BTF_ID_LIST(bpf_skb_output_btf_ids)
+BTF_ID(struct, sk_buff)
+
 const struct bpf_func_proto bpf_skb_output_proto = {
 	.func		= bpf_skb_event_output,
 	.gpl_only	= true,
@@ -4173,7 +4176,9 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
 	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
-static int bpf_xdp_output_btf_ids[5];
+BTF_ID_LIST(bpf_xdp_output_btf_ids)
+BTF_ID(struct, xdp_buff)
+
 const struct bpf_func_proto bpf_xdp_output_proto = {
 	.func		= bpf_xdp_event_output,
 	.gpl_only	= true,
diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh
index 92dd745906f4..e26f02dbedee 100755
--- a/scripts/link-vmlinux.sh
+++ b/scripts/link-vmlinux.sh
@@ -336,6 +336,12 @@ fi
 
 vmlinux_link vmlinux "${kallsymso}" ${btf_vmlinux_bin_o}
 
+# fill in BTF IDs
+if [ -n "${CONFIG_DEBUG_INFO_BTF}" ]; then
+info BTFIDS vmlinux
+${RESOLVE_BTFIDS} vmlinux
+fi
+
 if [ -n "${CONFIG_BUILDTIME_TABLE_SORT}" ]; then
 	info SORTTAB vmlinux
 	if ! sorttable vmlinux; then
-- 
cgit v1.2.3


From 138b9a0511c789f2451ff1d80e7fd3f9eef3a9e3 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Sat, 11 Jul 2020 23:53:25 +0200
Subject: bpf: Remove btf_id helpers resolving

Now when we moved the helpers btf_id arrays into .BTF_ids section,
we can remove the code that resolve those IDs in runtime.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200711215329.41165-6-jolsa@kernel.org
---
 kernel/bpf/btf.c | 89 ++++----------------------------------------------------
 1 file changed, 5 insertions(+), 84 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 4c3007f428b1..71140b73ae3c 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -4079,96 +4079,17 @@ error:
 	return -EINVAL;
 }
 
-static int __btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn,
-				   int arg)
-{
-	char fnname[KSYM_SYMBOL_LEN + 4] = "btf_";
-	const struct btf_param *args;
-	const struct btf_type *t;
-	const char *tname, *sym;
-	u32 btf_id, i;
-
-	if (IS_ERR(btf_vmlinux)) {
-		bpf_log(log, "btf_vmlinux is malformed\n");
-		return -EINVAL;
-	}
-
-	sym = kallsyms_lookup((long)fn, NULL, NULL, NULL, fnname + 4);
-	if (!sym) {
-		bpf_log(log, "kernel doesn't have kallsyms\n");
-		return -EFAULT;
-	}
-
-	for (i = 1; i <= btf_vmlinux->nr_types; i++) {
-		t = btf_type_by_id(btf_vmlinux, i);
-		if (BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF)
-			continue;
-		tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
-		if (!strcmp(tname, fnname))
-			break;
-	}
-	if (i > btf_vmlinux->nr_types) {
-		bpf_log(log, "helper %s type is not found\n", fnname);
-		return -ENOENT;
-	}
-
-	t = btf_type_by_id(btf_vmlinux, t->type);
-	if (!btf_type_is_ptr(t))
-		return -EFAULT;
-	t = btf_type_by_id(btf_vmlinux, t->type);
-	if (!btf_type_is_func_proto(t))
-		return -EFAULT;
-
-	args = (const struct btf_param *)(t + 1);
-	if (arg >= btf_type_vlen(t)) {
-		bpf_log(log, "bpf helper %s doesn't have %d-th argument\n",
-			fnname, arg);
-		return -EINVAL;
-	}
-
-	t = btf_type_by_id(btf_vmlinux, args[arg].type);
-	if (!btf_type_is_ptr(t) || !t->type) {
-		/* anything but the pointer to struct is a helper config bug */
-		bpf_log(log, "ARG_PTR_TO_BTF is misconfigured\n");
-		return -EFAULT;
-	}
-	btf_id = t->type;
-	t = btf_type_by_id(btf_vmlinux, t->type);
-	/* skip modifiers */
-	while (btf_type_is_modifier(t)) {
-		btf_id = t->type;
-		t = btf_type_by_id(btf_vmlinux, t->type);
-	}
-	if (!btf_type_is_struct(t)) {
-		bpf_log(log, "ARG_PTR_TO_BTF is not a struct\n");
-		return -EFAULT;
-	}
-	bpf_log(log, "helper %s arg%d has btf_id %d struct %s\n", fnname + 4,
-		arg, btf_id, __btf_name_by_offset(btf_vmlinux, t->name_off));
-	return btf_id;
-}
-
 int btf_resolve_helper_id(struct bpf_verifier_log *log,
 			  const struct bpf_func_proto *fn, int arg)
 {
-	int *btf_id = &fn->btf_id[arg];
-	int ret;
+	int id;
 
 	if (fn->arg_type[arg] != ARG_PTR_TO_BTF_ID)
 		return -EINVAL;
-
-	ret = READ_ONCE(*btf_id);
-	if (ret)
-		return ret;
-	/* ok to race the search. The result is the same */
-	ret = __btf_resolve_helper_id(log, fn->func, arg);
-	if (!ret) {
-		/* Function argument cannot be type 'void' */
-		bpf_log(log, "BTF resolution bug\n");
-		return -EFAULT;
-	}
-	WRITE_ONCE(*btf_id, ret);
-	return ret;
+	id = fn->btf_id[arg];
+	if (!id || id > btf_vmlinux->nr_types)
+		return -EINVAL;
+	return id;
 }
 
 static int __get_type_size(struct btf *btf, u32 btf_id,
-- 
cgit v1.2.3


From 49f4e6720748c778795df62d222d0200b61345be Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Sat, 11 Jul 2020 23:53:26 +0200
Subject: bpf: Use BTF_ID to resolve bpf_ctx_convert struct

This way the ID is resolved during compile time,
and we can remove the runtime name search.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200711215329.41165-7-jolsa@kernel.org
---
 kernel/bpf/btf.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 71140b73ae3c..a710e3ee1f18 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -18,6 +18,7 @@
 #include <linux/sort.h>
 #include <linux/bpf_verifier.h>
 #include <linux/btf.h>
+#include <linux/btf_ids.h>
 #include <linux/skmsg.h>
 #include <linux/perf_event.h>
 #include <net/sock.h>
@@ -3621,12 +3622,15 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
 	return kern_ctx_type->type;
 }
 
+BTF_ID_LIST(bpf_ctx_convert_btf_id)
+BTF_ID(struct, bpf_ctx_convert)
+
 struct btf *btf_parse_vmlinux(void)
 {
 	struct btf_verifier_env *env = NULL;
 	struct bpf_verifier_log *log;
 	struct btf *btf = NULL;
-	int err, btf_id;
+	int err;
 
 	env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
 	if (!env)
@@ -3659,14 +3663,8 @@ struct btf *btf_parse_vmlinux(void)
 	if (err)
 		goto errout;
 
-	/* find struct bpf_ctx_convert for type checking later */
-	btf_id = btf_find_by_name_kind(btf, "bpf_ctx_convert", BTF_KIND_STRUCT);
-	if (btf_id < 0) {
-		err = btf_id;
-		goto errout;
-	}
 	/* btf_parse_vmlinux() runs under bpf_verifier_lock */
-	bpf_ctx_convert.t = btf_type_by_id(btf, btf_id);
+	bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
 
 	/* find bpf map structs for map_ptr access checking */
 	err = btf_vmlinux_map_ids_init(btf, log);
-- 
cgit v1.2.3


From 4969f8a073977123504609d7310b42a588297aa4 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 9 Jun 2020 16:21:38 -0700
Subject: pidfd: Add missing sock updates for pidfd_getfd()

The sock counting (sock_update_netprioidx() and sock_update_classid())
was missing from pidfd's implementation of received fd installation. Add
a call to the new __receive_sock() helper.

Cc: Christian Brauner <christian.brauner@ubuntu.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Sargun Dhillon <sargun@sargun.me>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: netdev@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: stable@vger.kernel.org
Fixes: 8649c322f75c ("pid: Implement pidfd_getfd syscall")
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 kernel/pid.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index f1496b757162..ee58530d1aca 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -42,6 +42,7 @@
 #include <linux/sched/signal.h>
 #include <linux/sched/task.h>
 #include <linux/idr.h>
+#include <net/sock.h>
 
 struct pid init_struct_pid = {
 	.count		= REFCOUNT_INIT(1),
@@ -642,10 +643,12 @@ static int pidfd_getfd(struct pid *pid, int fd)
 	}
 
 	ret = get_unused_fd_flags(O_CLOEXEC);
-	if (ret < 0)
+	if (ret < 0) {
 		fput(file);
-	else
+	} else {
+		__receive_sock(file);
 		fd_install(ret, file);
+	}
 
 	return ret;
 }
-- 
cgit v1.2.3


From 910d2f16ac90463a1f5b03d53246c443e2b354b9 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 9 Jun 2020 16:21:38 -0700
Subject: pidfd: Replace open-coded receive_fd()

Replace the open-coded version of receive_fd() with a call to the
new helper.

Thanks to Vamshi K Sthambamkadi <vamshi.k.sthambamkadi@gmail.com> for
catching a missed fput() in an earlier version of this patch.

Cc: Christoph Hellwig <hch@lst.de>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: netdev@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Reviewed-by: Sargun Dhillon <sargun@sargun.me>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 kernel/pid.c | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index ee58530d1aca..da5aea5f04fa 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -636,19 +636,8 @@ static int pidfd_getfd(struct pid *pid, int fd)
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
-	ret = security_file_receive(file);
-	if (ret) {
-		fput(file);
-		return ret;
-	}
-
-	ret = get_unused_fd_flags(O_CLOEXEC);
-	if (ret < 0) {
-		fput(file);
-	} else {
-		__receive_sock(file);
-		fd_install(ret, file);
-	}
+	ret = receive_fd(file, O_CLOEXEC);
+	fput(file);
 
 	return ret;
 }
-- 
cgit v1.2.3


From ac5a72ea5c8989871e61f6bb0852e0f91de51ebe Mon Sep 17 00:00:00 2001
From: Alan Maguire <alan.maguire@oracle.com>
Date: Mon, 13 Jul 2020 12:52:33 +0100
Subject: bpf: Use dedicated bpf_trace_printk event instead of trace_printk()

The bpf helper bpf_trace_printk() uses trace_printk() under the hood.
This leads to an alarming warning message originating from trace
buffer allocation which occurs the first time a program using
bpf_trace_printk() is loaded.

We can instead create a trace event for bpf_trace_printk() and enable
it in-kernel when/if we encounter a program using the
bpf_trace_printk() helper.  With this approach, trace_printk()
is not used directly and no warning message appears.

This work was started by Steven (see Link) and finished by Alan; added
Steven's Signed-off-by with his permission.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/r/20200628194334.6238b933@oasis.local.home
Link: https://lore.kernel.org/bpf/1594641154-18897-2-git-send-email-alan.maguire@oracle.com
---
 kernel/trace/Makefile    |  2 ++
 kernel/trace/bpf_trace.c | 42 +++++++++++++++++++++++++++++++++++++-----
 kernel/trace/bpf_trace.h | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 73 insertions(+), 5 deletions(-)
 create mode 100644 kernel/trace/bpf_trace.h

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 6575bb0a0434..aeba5ee7325a 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -31,6 +31,8 @@ ifdef CONFIG_GCOV_PROFILE_FTRACE
 GCOV_PROFILE := y
 endif
 
+CFLAGS_bpf_trace.o := -I$(src)
+
 CFLAGS_trace_benchmark.o := -I$(src)
 CFLAGS_trace_events_filter.o := -I$(src)
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index e178e8e32b33..3cc0dcb60ca2 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -11,6 +11,7 @@
 #include <linux/uaccess.h>
 #include <linux/ctype.h>
 #include <linux/kprobes.h>
+#include <linux/spinlock.h>
 #include <linux/syscalls.h>
 #include <linux/error-injection.h>
 #include <linux/btf_ids.h>
@@ -20,6 +21,9 @@
 #include "trace_probe.h"
 #include "trace.h"
 
+#define CREATE_TRACE_POINTS
+#include "bpf_trace.h"
+
 #define bpf_event_rcu_dereference(p)					\
 	rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex))
 
@@ -375,6 +379,30 @@ static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
 	}
 }
 
+static DEFINE_RAW_SPINLOCK(trace_printk_lock);
+
+#define BPF_TRACE_PRINTK_SIZE   1024
+
+static inline __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...)
+{
+	static char buf[BPF_TRACE_PRINTK_SIZE];
+	unsigned long flags;
+	va_list ap;
+	int ret;
+
+	raw_spin_lock_irqsave(&trace_printk_lock, flags);
+	va_start(ap, fmt);
+	ret = vsnprintf(buf, sizeof(buf), fmt, ap);
+	va_end(ap);
+	/* vsnprintf() will not append null for zero-length strings */
+	if (ret == 0)
+		buf[0] = '\0';
+	trace_bpf_trace_printk(buf);
+	raw_spin_unlock_irqrestore(&trace_printk_lock, flags);
+
+	return ret;
+}
+
 /*
  * Only limited trace_printk() conversion specifiers allowed:
  * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pB %pks %pus %s
@@ -484,8 +512,7 @@ fmt_next:
  */
 #define __BPF_TP_EMIT()	__BPF_ARG3_TP()
 #define __BPF_TP(...)							\
-	__trace_printk(0 /* Fake ip */,					\
-		       fmt, ##__VA_ARGS__)
+	bpf_do_trace_printk(fmt, ##__VA_ARGS__)
 
 #define __BPF_ARG1_TP(...)						\
 	((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64))	\
@@ -522,10 +549,15 @@ static const struct bpf_func_proto bpf_trace_printk_proto = {
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
 {
 	/*
-	 * this program might be calling bpf_trace_printk,
-	 * so allocate per-cpu printk buffers
+	 * This program might be calling bpf_trace_printk,
+	 * so enable the associated bpf_trace/bpf_trace_printk event.
+	 * Repeat this each time as it is possible a user has
+	 * disabled bpf_trace_printk events.  By loading a program
+	 * calling bpf_trace_printk() however the user has expressed
+	 * the intent to see such events.
 	 */
-	trace_printk_init_buffers();
+	if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1))
+		pr_warn_ratelimited("could not enable bpf_trace_printk events");
 
 	return &bpf_trace_printk_proto;
 }
diff --git a/kernel/trace/bpf_trace.h b/kernel/trace/bpf_trace.h
new file mode 100644
index 000000000000..9acbc11ac7bb
--- /dev/null
+++ b/kernel/trace/bpf_trace.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM bpf_trace
+
+#if !defined(_TRACE_BPF_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+
+#define _TRACE_BPF_TRACE_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(bpf_trace_printk,
+
+	TP_PROTO(const char *bpf_string),
+
+	TP_ARGS(bpf_string),
+
+	TP_STRUCT__entry(
+		__string(bpf_string, bpf_string)
+	),
+
+	TP_fast_assign(
+		__assign_str(bpf_string, bpf_string);
+	),
+
+	TP_printk("%s", __get_str(bpf_string))
+);
+
+#endif /* _TRACE_BPF_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE bpf_trace
+
+#include <trace/define_trace.h>
-- 
cgit v1.2.3


From 6ada7ba2fa7710eead191b69fd948569b05d0f61 Mon Sep 17 00:00:00 2001
From: Xiang Chen <chenxiang66@hisilicon.com>
Date: Mon, 22 Jun 2020 17:09:13 +0800
Subject: PM: hibernate: fix white space in a few places

In hibernate.c, some places lack of spaces while some places have
redundant spaces. So fix them.

Signed-off-by: Xiang Chen <chenxiang66@hisilicon.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/hibernate.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 02ec716a4927..5714f51ba9f8 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1062,7 +1062,7 @@ power_attr(disk);
 static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr,
 			   char *buf)
 {
-	return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device),
+	return sprintf(buf, "%d:%d\n", MAJOR(swsusp_resume_device),
 		       MINOR(swsusp_resume_device));
 }
 
@@ -1162,7 +1162,7 @@ static ssize_t reserved_size_store(struct kobject *kobj,
 
 power_attr(reserved_size);
 
-static struct attribute * g[] = {
+static struct attribute *g[] = {
 	&disk_attr.attr,
 	&resume_offset_attr.attr,
 	&resume_attr.attr,
@@ -1190,7 +1190,7 @@ static int __init resume_setup(char *str)
 	if (noresume)
 		return 1;
 
-	strncpy( resume_file, str, 255 );
+	strncpy(resume_file, str, 255);
 	return 1;
 }
 
-- 
cgit v1.2.3


From 02d7f4005e942f75c63302d54a659e673edd00df Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 10 Jul 2020 22:12:10 +0300
Subject: PM: sleep: spread "const char *" correctness

Fixed string literals can be referred to as "const char *".

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
[ rjw: Minor subject edit ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/power.h    | 2 +-
 kernel/power/snapshot.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/power.h b/kernel/power/power.h
index ba2094db6294..32fc89ac96c3 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -32,7 +32,7 @@ static inline int init_header_complete(struct swsusp_info *info)
 	return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE);
 }
 
-static inline char *check_image_kernel(struct swsusp_info *info)
+static inline const char *check_image_kernel(struct swsusp_info *info)
 {
 	return arch_hibernation_header_restore(info) ?
 			"architecture specific data" : NULL;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 881128b9351e..cef154261fe2 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -2023,7 +2023,7 @@ static int init_header_complete(struct swsusp_info *info)
 	return 0;
 }
 
-static char *check_image_kernel(struct swsusp_info *info)
+static const char *check_image_kernel(struct swsusp_info *info)
 {
 	if (info->version_code != LINUX_VERSION_CODE)
 		return "kernel version";
@@ -2176,7 +2176,7 @@ static void mark_unsafe_pages(struct memory_bitmap *bm)
 
 static int check_header(struct swsusp_info *info)
 {
-	char *reason;
+	const char *reason;
 
 	reason = check_image_kernel(info);
 	if (!reason && info->num_physpages != get_num_physpages())
-- 
cgit v1.2.3


From 7cf97b12545503992020796c74bd84078eb39299 Mon Sep 17 00:00:00 2001
From: Sargun Dhillon <sargun@sargun.me>
Date: Tue, 2 Jun 2020 18:10:43 -0700
Subject: seccomp: Introduce addfd ioctl to seccomp user notifier

The current SECCOMP_RET_USER_NOTIF API allows for syscall supervision over
an fd. It is often used in settings where a supervising task emulates
syscalls on behalf of a supervised task in userspace, either to further
restrict the supervisee's syscall abilities or to circumvent kernel
enforced restrictions the supervisor deems safe to lift (e.g. actually
performing a mount(2) for an unprivileged container).

While SECCOMP_RET_USER_NOTIF allows for the interception of any syscall,
only a certain subset of syscalls could be correctly emulated. Over the
last few development cycles, the set of syscalls which can't be emulated
has been reduced due to the addition of pidfd_getfd(2). With this we are
now able to, for example, intercept syscalls that require the supervisor
to operate on file descriptors of the supervisee such as connect(2).

However, syscalls that cause new file descriptors to be installed can not
currently be correctly emulated since there is no way for the supervisor
to inject file descriptors into the supervisee. This patch adds a
new addfd ioctl to remove this restriction by allowing the supervisor to
install file descriptors into the intercepted task. By implementing this
feature via seccomp the supervisor effectively instructs the supervisee
to install a set of file descriptors into its own file descriptor table
during the intercepted syscall. This way it is possible to intercept
syscalls such as open() or accept(), and install (or replace, like
dup2(2)) the supervisor's resulting fd into the supervisee. One
replacement use-case would be to redirect the stdout and stderr of a
supervisee into log file descriptors opened by the supervisor.

The ioctl handling is based on the discussions[1] of how Extensible
Arguments should interact with ioctls. Instead of building size into
the addfd structure, make it a function of the ioctl command (which
is how sizes are normally passed to ioctls). To support forward and
backward compatibility, just mask out the direction and size, and match
everything. The size (and any future direction) checks are done along
with copy_struct_from_user() logic.

As a note, the seccomp_notif_addfd structure is laid out based on 8-byte
alignment without requiring packing as there have been packing issues
with uapi highlighted before[2][3]. Although we could overload the
newfd field and use -1 to indicate that it is not to be used, doing
so requires changing the size of the fd field, and introduces struct
packing complexity.

[1]: https://lore.kernel.org/lkml/87o8w9bcaf.fsf@mid.deneb.enyo.de/
[2]: https://lore.kernel.org/lkml/a328b91d-fd8f-4f27-b3c2-91a9c45f18c0@rasmusvillemoes.dk/
[3]: https://lore.kernel.org/lkml/20200612104629.GA15814@ircssh-2.c.rugged-nimbus-611.internal

Cc: Christoph Hellwig <hch@lst.de>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
Cc: Tycho Andersen <tycho@tycho.ws>
Cc: Jann Horn <jannh@google.com>
Cc: Robert Sesek <rsesek@google.com>
Cc: Chris Palmer <palmer@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Suggested-by: Matt Denton <mpdenton@google.com>
Link: https://lore.kernel.org/r/20200603011044.7972-4-sargun@sargun.me
Signed-off-by: Sargun Dhillon <sargun@sargun.me>
Reviewed-by: Will Drewry <wad@chromium.org>
Co-developed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/seccomp.h      |   4 +
 include/uapi/linux/seccomp.h |  22 ++++++
 kernel/seccomp.c             | 175 ++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 199 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index babcd6c02d09..881c90b6aa25 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -10,6 +10,10 @@
 					 SECCOMP_FILTER_FLAG_NEW_LISTENER | \
 					 SECCOMP_FILTER_FLAG_TSYNC_ESRCH)
 
+/* sizeof() the first published struct seccomp_notif_addfd */
+#define SECCOMP_NOTIFY_ADDFD_SIZE_VER0 24
+#define SECCOMP_NOTIFY_ADDFD_SIZE_LATEST SECCOMP_NOTIFY_ADDFD_SIZE_VER0
+
 #ifdef CONFIG_SECCOMP
 
 #include <linux/thread_info.h>
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 965290f7dcc2..6ba18b82a02e 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -113,6 +113,25 @@ struct seccomp_notif_resp {
 	__u32 flags;
 };
 
+/* valid flags for seccomp_notif_addfd */
+#define SECCOMP_ADDFD_FLAG_SETFD	(1UL << 0) /* Specify remote fd */
+
+/**
+ * struct seccomp_notif_addfd
+ * @id: The ID of the seccomp notification
+ * @flags: SECCOMP_ADDFD_FLAG_*
+ * @srcfd: The local fd number
+ * @newfd: Optional remote FD number if SETFD option is set, otherwise 0.
+ * @newfd_flags: The O_* flags the remote FD should have applied
+ */
+struct seccomp_notif_addfd {
+	__u64 id;
+	__u32 flags;
+	__u32 srcfd;
+	__u32 newfd;
+	__u32 newfd_flags;
+};
+
 #define SECCOMP_IOC_MAGIC		'!'
 #define SECCOMP_IO(nr)			_IO(SECCOMP_IOC_MAGIC, nr)
 #define SECCOMP_IOR(nr, type)		_IOR(SECCOMP_IOC_MAGIC, nr, type)
@@ -124,5 +143,8 @@ struct seccomp_notif_resp {
 #define SECCOMP_IOCTL_NOTIF_SEND	SECCOMP_IOWR(1,	\
 						struct seccomp_notif_resp)
 #define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOW(2, __u64)
+/* On success, the return value is the remote process's added fd number */
+#define SECCOMP_IOCTL_NOTIF_ADDFD	SECCOMP_IOW(3, \
+						struct seccomp_notif_addfd)
 
 #endif /* _UAPI_LINUX_SECCOMP_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 866a432cd746..3ee59ce0a323 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -87,10 +87,42 @@ struct seccomp_knotif {
 	long val;
 	u32 flags;
 
-	/* Signals when this has entered SECCOMP_NOTIFY_REPLIED */
+	/*
+	 * Signals when this has changed states, such as the listener
+	 * dying, a new seccomp addfd message, or changing to REPLIED
+	 */
 	struct completion ready;
 
 	struct list_head list;
+
+	/* outstanding addfd requests */
+	struct list_head addfd;
+};
+
+/**
+ * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages
+ *
+ * @file: A reference to the file to install in the other task
+ * @fd: The fd number to install it at. If the fd number is -1, it means the
+ *      installing process should allocate the fd as normal.
+ * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC
+ *         is allowed.
+ * @ret: The return value of the installing process. It is set to the fd num
+ *       upon success (>= 0).
+ * @completion: Indicates that the installing process has completed fd
+ *              installation, or gone away (either due to successful
+ *              reply, or signal)
+ *
+ */
+struct seccomp_kaddfd {
+	struct file *file;
+	int fd;
+	unsigned int flags;
+
+	/* To only be set on reply */
+	int ret;
+	struct completion completion;
+	struct list_head list;
 };
 
 /**
@@ -793,6 +825,17 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
 	return filter->notif->next_id++;
 }
 
+static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
+{
+	/*
+	 * Remove the notification, and reset the list pointers, indicating
+	 * that it has been handled.
+	 */
+	list_del_init(&addfd->list);
+	addfd->ret = receive_fd_replace(addfd->fd, addfd->file, addfd->flags);
+	complete(&addfd->completion);
+}
+
 static int seccomp_do_user_notification(int this_syscall,
 					struct seccomp_filter *match,
 					const struct seccomp_data *sd)
@@ -801,6 +844,7 @@ static int seccomp_do_user_notification(int this_syscall,
 	u32 flags = 0;
 	long ret = 0;
 	struct seccomp_knotif n = {};
+	struct seccomp_kaddfd *addfd, *tmp;
 
 	mutex_lock(&match->notify_lock);
 	err = -ENOSYS;
@@ -813,6 +857,7 @@ static int seccomp_do_user_notification(int this_syscall,
 	n.id = seccomp_next_notify_id(match);
 	init_completion(&n.ready);
 	list_add(&n.list, &match->notif->notifications);
+	INIT_LIST_HEAD(&n.addfd);
 
 	up(&match->notif->request);
 	wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
@@ -821,17 +866,34 @@ static int seccomp_do_user_notification(int this_syscall,
 	/*
 	 * This is where we wait for a reply from userspace.
 	 */
+wait:
 	err = wait_for_completion_interruptible(&n.ready);
 	mutex_lock(&match->notify_lock);
 	if (err == 0) {
+		/* Check if we were woken up by a addfd message */
+		addfd = list_first_entry_or_null(&n.addfd,
+						 struct seccomp_kaddfd, list);
+		if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) {
+			seccomp_handle_addfd(addfd);
+			mutex_unlock(&match->notify_lock);
+			goto wait;
+		}
 		ret = n.val;
 		err = n.error;
 		flags = n.flags;
 	}
 
+	/* If there were any pending addfd calls, clear them out */
+	list_for_each_entry_safe(addfd, tmp, &n.addfd, list) {
+		/* The process went away before we got a chance to handle it */
+		addfd->ret = -ESRCH;
+		list_del_init(&addfd->list);
+		complete(&addfd->completion);
+	}
+
 	/*
 	 * Note that it's possible the listener died in between the time when
-	 * we were notified of a respons (or a signal) and when we were able to
+	 * we were notified of a response (or a signal) and when we were able to
 	 * re-acquire the lock, so only delete from the list if the
 	 * notification actually exists.
 	 *
@@ -1069,6 +1131,11 @@ static int seccomp_notify_release(struct inode *inode, struct file *file)
 		knotif->error = -ENOSYS;
 		knotif->val = 0;
 
+		/*
+		 * We do not need to wake up any pending addfd messages, as
+		 * the notifier will do that for us, as this just looks
+		 * like a standard reply.
+		 */
 		complete(&knotif->ready);
 	}
 
@@ -1233,12 +1300,109 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter,
 	return ret;
 }
 
+static long seccomp_notify_addfd(struct seccomp_filter *filter,
+				 struct seccomp_notif_addfd __user *uaddfd,
+				 unsigned int size)
+{
+	struct seccomp_notif_addfd addfd;
+	struct seccomp_knotif *knotif;
+	struct seccomp_kaddfd kaddfd;
+	int ret;
+
+	BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0);
+	BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST);
+
+	if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 || size >= PAGE_SIZE)
+		return -EINVAL;
+
+	ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size);
+	if (ret)
+		return ret;
+
+	if (addfd.newfd_flags & ~O_CLOEXEC)
+		return -EINVAL;
+
+	if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD)
+		return -EINVAL;
+
+	if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD))
+		return -EINVAL;
+
+	kaddfd.file = fget(addfd.srcfd);
+	if (!kaddfd.file)
+		return -EBADF;
+
+	kaddfd.flags = addfd.newfd_flags;
+	kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ?
+		    addfd.newfd : -1;
+	init_completion(&kaddfd.completion);
+
+	ret = mutex_lock_interruptible(&filter->notify_lock);
+	if (ret < 0)
+		goto out;
+
+	knotif = find_notification(filter, addfd.id);
+	if (!knotif) {
+		ret = -ENOENT;
+		goto out_unlock;
+	}
+
+	/*
+	 * We do not want to allow for FD injection to occur before the
+	 * notification has been picked up by a userspace handler, or after
+	 * the notification has been replied to.
+	 */
+	if (knotif->state != SECCOMP_NOTIFY_SENT) {
+		ret = -EINPROGRESS;
+		goto out_unlock;
+	}
+
+	list_add(&kaddfd.list, &knotif->addfd);
+	complete(&knotif->ready);
+	mutex_unlock(&filter->notify_lock);
+
+	/* Now we wait for it to be processed or be interrupted */
+	ret = wait_for_completion_interruptible(&kaddfd.completion);
+	if (ret == 0) {
+		/*
+		 * We had a successful completion. The other side has already
+		 * removed us from the addfd queue, and
+		 * wait_for_completion_interruptible has a memory barrier upon
+		 * success that lets us read this value directly without
+		 * locking.
+		 */
+		ret = kaddfd.ret;
+		goto out;
+	}
+
+	mutex_lock(&filter->notify_lock);
+	/*
+	 * Even though we were woken up by a signal and not a successful
+	 * completion, a completion may have happened in the mean time.
+	 *
+	 * We need to check again if the addfd request has been handled,
+	 * and if not, we will remove it from the queue.
+	 */
+	if (list_empty(&kaddfd.list))
+		ret = kaddfd.ret;
+	else
+		list_del(&kaddfd.list);
+
+out_unlock:
+	mutex_unlock(&filter->notify_lock);
+out:
+	fput(kaddfd.file);
+
+	return ret;
+}
+
 static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
 				 unsigned long arg)
 {
 	struct seccomp_filter *filter = file->private_data;
 	void __user *buf = (void __user *)arg;
 
+	/* Fixed-size ioctls */
 	switch (cmd) {
 	case SECCOMP_IOCTL_NOTIF_RECV:
 		return seccomp_notify_recv(filter, buf);
@@ -1247,6 +1411,13 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
 	case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
 	case SECCOMP_IOCTL_NOTIF_ID_VALID:
 		return seccomp_notify_id_valid(filter, buf);
+	}
+
+	/* Extensible Argument ioctls */
+#define EA_IOCTL(cmd)	((cmd) & ~(IOC_INOUT | IOCSIZE_MASK))
+	switch (EA_IOCTL(cmd)) {
+	case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD):
+		return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd));
 	default:
 		return -EINVAL;
 	}
-- 
cgit v1.2.3


From d3fa60d7bfdc9a0ff1a524bdef96b3db1fd62022 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 8 Jul 2020 09:45:11 +0200
Subject: dma-mapping: move the remaining DMA API calls out of line

For a long time the DMA API has been implemented inline in dma-mapping.h,
but the function bodies can be quite large.  Move them all out of line.

This also removes all the dma_direct_* exports as those are just
implementation details and should never be used by drivers directly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 include/linux/dma-direct.h  |  58 +++++++++++
 include/linux/dma-mapping.h | 247 ++++----------------------------------------
 kernel/dma/direct.c         |   9 --
 kernel/dma/mapping.c        | 164 +++++++++++++++++++++++++++++
 4 files changed, 244 insertions(+), 234 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index ab2e20cba951..2b045c509146 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -87,4 +87,62 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
 		unsigned long attrs);
 int dma_direct_supported(struct device *dev, u64 mask);
 bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr);
+dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
+		unsigned long offset, size_t size, enum dma_data_direction dir,
+		unsigned long attrs);
+int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
+		enum dma_data_direction dir, unsigned long attrs);
+dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
+		size_t size, enum dma_data_direction dir, unsigned long attrs);
+
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
+    defined(CONFIG_SWIOTLB)
+void dma_direct_sync_single_for_device(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir);
+void dma_direct_sync_sg_for_device(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir);
+#else
+static inline void dma_direct_sync_single_for_device(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+}
+static inline void dma_direct_sync_sg_for_device(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+}
+#endif
+
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
+    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
+    defined(CONFIG_SWIOTLB)
+void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
+		size_t size, enum dma_data_direction dir, unsigned long attrs);
+void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
+		int nents, enum dma_data_direction dir, unsigned long attrs);
+void dma_direct_sync_single_for_cpu(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir);
+void dma_direct_sync_sg_for_cpu(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir);
+#else
+static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
+		size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+}
+static inline void dma_direct_unmap_sg(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir,
+		unsigned long attrs)
+{
+}
+static inline void dma_direct_sync_single_for_cpu(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+}
+static inline void dma_direct_sync_sg_for_cpu(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+}
+#endif
+
+size_t dma_direct_max_mapping_size(struct device *dev);
+
 #endif /* _LINUX_DMA_DIRECT_H */
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index a33ed3954ed4..bd0a6f5ee445 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -188,73 +188,6 @@ static inline int dma_mmap_from_global_coherent(struct vm_area_struct *vma,
 }
 #endif /* CONFIG_DMA_DECLARE_COHERENT */
 
-static inline bool dma_is_direct(const struct dma_map_ops *ops)
-{
-	return likely(!ops);
-}
-
-/*
- * All the dma_direct_* declarations are here just for the indirect call bypass,
- * and must not be used directly drivers!
- */
-dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
-		unsigned long offset, size_t size, enum dma_data_direction dir,
-		unsigned long attrs);
-int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
-		enum dma_data_direction dir, unsigned long attrs);
-dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
-		size_t size, enum dma_data_direction dir, unsigned long attrs);
-
-#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
-    defined(CONFIG_SWIOTLB)
-void dma_direct_sync_single_for_device(struct device *dev,
-		dma_addr_t addr, size_t size, enum dma_data_direction dir);
-void dma_direct_sync_sg_for_device(struct device *dev,
-		struct scatterlist *sgl, int nents, enum dma_data_direction dir);
-#else
-static inline void dma_direct_sync_single_for_device(struct device *dev,
-		dma_addr_t addr, size_t size, enum dma_data_direction dir)
-{
-}
-static inline void dma_direct_sync_sg_for_device(struct device *dev,
-		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
-{
-}
-#endif
-
-#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
-    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
-    defined(CONFIG_SWIOTLB)
-void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
-		size_t size, enum dma_data_direction dir, unsigned long attrs);
-void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
-		int nents, enum dma_data_direction dir, unsigned long attrs);
-void dma_direct_sync_single_for_cpu(struct device *dev,
-		dma_addr_t addr, size_t size, enum dma_data_direction dir);
-void dma_direct_sync_sg_for_cpu(struct device *dev,
-		struct scatterlist *sgl, int nents, enum dma_data_direction dir);
-#else
-static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
-		size_t size, enum dma_data_direction dir, unsigned long attrs)
-{
-}
-static inline void dma_direct_unmap_sg(struct device *dev,
-		struct scatterlist *sgl, int nents, enum dma_data_direction dir,
-		unsigned long attrs)
-{
-}
-static inline void dma_direct_sync_single_for_cpu(struct device *dev,
-		dma_addr_t addr, size_t size, enum dma_data_direction dir)
-{
-}
-static inline void dma_direct_sync_sg_for_cpu(struct device *dev,
-		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
-{
-}
-#endif
-
-size_t dma_direct_max_mapping_size(struct device *dev);
-
 #ifdef CONFIG_HAS_DMA
 #include <asm/dma-mapping.h>
 
@@ -271,164 +204,6 @@ static inline void set_dma_ops(struct device *dev,
 	dev->dma_ops = dma_ops;
 }
 
-static inline dma_addr_t dma_map_page_attrs(struct device *dev,
-		struct page *page, size_t offset, size_t size,
-		enum dma_data_direction dir, unsigned long attrs)
-{
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-	dma_addr_t addr;
-
-	BUG_ON(!valid_dma_direction(dir));
-	if (dma_is_direct(ops))
-		addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
-	else
-		addr = ops->map_page(dev, page, offset, size, dir, attrs);
-	debug_dma_map_page(dev, page, offset, size, dir, addr);
-
-	return addr;
-}
-
-static inline void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr,
-		size_t size, enum dma_data_direction dir, unsigned long attrs)
-{
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-
-	BUG_ON(!valid_dma_direction(dir));
-	if (dma_is_direct(ops))
-		dma_direct_unmap_page(dev, addr, size, dir, attrs);
-	else if (ops->unmap_page)
-		ops->unmap_page(dev, addr, size, dir, attrs);
-	debug_dma_unmap_page(dev, addr, size, dir);
-}
-
-/*
- * dma_maps_sg_attrs returns 0 on error and > 0 on success.
- * It should never return a value < 0.
- */
-static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
-				   int nents, enum dma_data_direction dir,
-				   unsigned long attrs)
-{
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-	int ents;
-
-	BUG_ON(!valid_dma_direction(dir));
-	if (dma_is_direct(ops))
-		ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
-	else
-		ents = ops->map_sg(dev, sg, nents, dir, attrs);
-	BUG_ON(ents < 0);
-	debug_dma_map_sg(dev, sg, nents, ents, dir);
-
-	return ents;
-}
-
-static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
-				      int nents, enum dma_data_direction dir,
-				      unsigned long attrs)
-{
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-
-	BUG_ON(!valid_dma_direction(dir));
-	debug_dma_unmap_sg(dev, sg, nents, dir);
-	if (dma_is_direct(ops))
-		dma_direct_unmap_sg(dev, sg, nents, dir, attrs);
-	else if (ops->unmap_sg)
-		ops->unmap_sg(dev, sg, nents, dir, attrs);
-}
-
-static inline dma_addr_t dma_map_resource(struct device *dev,
-					  phys_addr_t phys_addr,
-					  size_t size,
-					  enum dma_data_direction dir,
-					  unsigned long attrs)
-{
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-	dma_addr_t addr = DMA_MAPPING_ERROR;
-
-	BUG_ON(!valid_dma_direction(dir));
-
-	/* Don't allow RAM to be mapped */
-	if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr))))
-		return DMA_MAPPING_ERROR;
-
-	if (dma_is_direct(ops))
-		addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
-	else if (ops->map_resource)
-		addr = ops->map_resource(dev, phys_addr, size, dir, attrs);
-
-	debug_dma_map_resource(dev, phys_addr, size, dir, addr);
-	return addr;
-}
-
-static inline void dma_unmap_resource(struct device *dev, dma_addr_t addr,
-				      size_t size, enum dma_data_direction dir,
-				      unsigned long attrs)
-{
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-
-	BUG_ON(!valid_dma_direction(dir));
-	if (!dma_is_direct(ops) && ops->unmap_resource)
-		ops->unmap_resource(dev, addr, size, dir, attrs);
-	debug_dma_unmap_resource(dev, addr, size, dir);
-}
-
-static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
-					   size_t size,
-					   enum dma_data_direction dir)
-{
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-
-	BUG_ON(!valid_dma_direction(dir));
-	if (dma_is_direct(ops))
-		dma_direct_sync_single_for_cpu(dev, addr, size, dir);
-	else if (ops->sync_single_for_cpu)
-		ops->sync_single_for_cpu(dev, addr, size, dir);
-	debug_dma_sync_single_for_cpu(dev, addr, size, dir);
-}
-
-static inline void dma_sync_single_for_device(struct device *dev,
-					      dma_addr_t addr, size_t size,
-					      enum dma_data_direction dir)
-{
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-
-	BUG_ON(!valid_dma_direction(dir));
-	if (dma_is_direct(ops))
-		dma_direct_sync_single_for_device(dev, addr, size, dir);
-	else if (ops->sync_single_for_device)
-		ops->sync_single_for_device(dev, addr, size, dir);
-	debug_dma_sync_single_for_device(dev, addr, size, dir);
-}
-
-static inline void
-dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
-		    int nelems, enum dma_data_direction dir)
-{
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-
-	BUG_ON(!valid_dma_direction(dir));
-	if (dma_is_direct(ops))
-		dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir);
-	else if (ops->sync_sg_for_cpu)
-		ops->sync_sg_for_cpu(dev, sg, nelems, dir);
-	debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
-}
-
-static inline void
-dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
-		       int nelems, enum dma_data_direction dir)
-{
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-
-	BUG_ON(!valid_dma_direction(dir));
-	if (dma_is_direct(ops))
-		dma_direct_sync_sg_for_device(dev, sg, nelems, dir);
-	else if (ops->sync_sg_for_device)
-		ops->sync_sg_for_device(dev, sg, nelems, dir);
-	debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
-
-}
 
 static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
@@ -439,6 +214,28 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 	return 0;
 }
 
+dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
+		size_t offset, size_t size, enum dma_data_direction dir,
+		unsigned long attrs);
+void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
+		enum dma_data_direction dir, unsigned long attrs);
+int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
+		enum dma_data_direction dir, unsigned long attrs);
+void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
+				      int nents, enum dma_data_direction dir,
+				      unsigned long attrs);
+dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
+		size_t size, enum dma_data_direction dir, unsigned long attrs);
+void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
+		enum dma_data_direction dir, unsigned long attrs);
+void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
+		enum dma_data_direction dir);
+void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
+		size_t size, enum dma_data_direction dir);
+void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+		    int nelems, enum dma_data_direction dir);
+void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+		       int nelems, enum dma_data_direction dir);
 void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		gfp_t flag, unsigned long attrs);
 void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 67f060b86a73..e545d14fccee 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -315,7 +315,6 @@ void dma_direct_sync_single_for_device(struct device *dev,
 	if (!dev_is_dma_coherent(dev))
 		arch_sync_dma_for_device(paddr, size, dir);
 }
-EXPORT_SYMBOL(dma_direct_sync_single_for_device);
 
 void dma_direct_sync_sg_for_device(struct device *dev,
 		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
@@ -335,7 +334,6 @@ void dma_direct_sync_sg_for_device(struct device *dev,
 					dir);
 	}
 }
-EXPORT_SYMBOL(dma_direct_sync_sg_for_device);
 #endif
 
 #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
@@ -354,7 +352,6 @@ void dma_direct_sync_single_for_cpu(struct device *dev,
 	if (unlikely(is_swiotlb_buffer(paddr)))
 		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
 }
-EXPORT_SYMBOL(dma_direct_sync_single_for_cpu);
 
 void dma_direct_sync_sg_for_cpu(struct device *dev,
 		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
@@ -376,7 +373,6 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
 	if (!dev_is_dma_coherent(dev))
 		arch_sync_dma_for_cpu_all();
 }
-EXPORT_SYMBOL(dma_direct_sync_sg_for_cpu);
 
 void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
 		size_t size, enum dma_data_direction dir, unsigned long attrs)
@@ -389,7 +385,6 @@ void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
 	if (unlikely(is_swiotlb_buffer(phys)))
 		swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);
 }
-EXPORT_SYMBOL(dma_direct_unmap_page);
 
 void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
 		int nents, enum dma_data_direction dir, unsigned long attrs)
@@ -401,7 +396,6 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
 		dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
 			     attrs);
 }
-EXPORT_SYMBOL(dma_direct_unmap_sg);
 #endif
 
 dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
@@ -428,7 +422,6 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
 		arch_sync_dma_for_device(phys, size, dir);
 	return dma_addr;
 }
-EXPORT_SYMBOL(dma_direct_map_page);
 
 int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 		enum dma_data_direction dir, unsigned long attrs)
@@ -450,7 +443,6 @@ out_unmap:
 	dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
 	return 0;
 }
-EXPORT_SYMBOL(dma_direct_map_sg);
 
 dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
 		size_t size, enum dma_data_direction dir, unsigned long attrs)
@@ -467,7 +459,6 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
 
 	return dma_addr;
 }
-EXPORT_SYMBOL(dma_direct_map_resource);
 
 int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt,
 		void *cpu_addr, dma_addr_t dma_addr, size_t size,
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index a8c18c9a796f..b53953024512 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -105,6 +105,170 @@ void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
 }
 EXPORT_SYMBOL(dmam_alloc_attrs);
 
+static inline bool dma_is_direct(const struct dma_map_ops *ops)
+{
+	return likely(!ops);
+}
+
+dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
+		size_t offset, size_t size, enum dma_data_direction dir,
+		unsigned long attrs)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+	dma_addr_t addr;
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (dma_is_direct(ops))
+		addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
+	else
+		addr = ops->map_page(dev, page, offset, size, dir, attrs);
+	debug_dma_map_page(dev, page, offset, size, dir, addr);
+
+	return addr;
+}
+EXPORT_SYMBOL(dma_map_page_attrs);
+
+void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
+		enum dma_data_direction dir, unsigned long attrs)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (dma_is_direct(ops))
+		dma_direct_unmap_page(dev, addr, size, dir, attrs);
+	else if (ops->unmap_page)
+		ops->unmap_page(dev, addr, size, dir, attrs);
+	debug_dma_unmap_page(dev, addr, size, dir);
+}
+EXPORT_SYMBOL(dma_unmap_page_attrs);
+
+/*
+ * dma_maps_sg_attrs returns 0 on error and > 0 on success.
+ * It should never return a value < 0.
+ */
+int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
+		enum dma_data_direction dir, unsigned long attrs)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+	int ents;
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (dma_is_direct(ops))
+		ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
+	else
+		ents = ops->map_sg(dev, sg, nents, dir, attrs);
+	BUG_ON(ents < 0);
+	debug_dma_map_sg(dev, sg, nents, ents, dir);
+
+	return ents;
+}
+EXPORT_SYMBOL(dma_map_sg_attrs);
+
+void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
+				      int nents, enum dma_data_direction dir,
+				      unsigned long attrs)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!valid_dma_direction(dir));
+	debug_dma_unmap_sg(dev, sg, nents, dir);
+	if (dma_is_direct(ops))
+		dma_direct_unmap_sg(dev, sg, nents, dir, attrs);
+	else if (ops->unmap_sg)
+		ops->unmap_sg(dev, sg, nents, dir, attrs);
+}
+EXPORT_SYMBOL(dma_unmap_sg_attrs);
+
+dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
+		size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+	dma_addr_t addr = DMA_MAPPING_ERROR;
+
+	BUG_ON(!valid_dma_direction(dir));
+
+	/* Don't allow RAM to be mapped */
+	if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr))))
+		return DMA_MAPPING_ERROR;
+
+	if (dma_is_direct(ops))
+		addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
+	else if (ops->map_resource)
+		addr = ops->map_resource(dev, phys_addr, size, dir, attrs);
+
+	debug_dma_map_resource(dev, phys_addr, size, dir, addr);
+	return addr;
+}
+EXPORT_SYMBOL(dma_map_resource);
+
+void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
+		enum dma_data_direction dir, unsigned long attrs)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (!dma_is_direct(ops) && ops->unmap_resource)
+		ops->unmap_resource(dev, addr, size, dir, attrs);
+	debug_dma_unmap_resource(dev, addr, size, dir);
+}
+EXPORT_SYMBOL(dma_unmap_resource);
+
+void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
+		enum dma_data_direction dir)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (dma_is_direct(ops))
+		dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+	else if (ops->sync_single_for_cpu)
+		ops->sync_single_for_cpu(dev, addr, size, dir);
+	debug_dma_sync_single_for_cpu(dev, addr, size, dir);
+}
+EXPORT_SYMBOL(dma_sync_single_for_cpu);
+
+void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
+		size_t size, enum dma_data_direction dir)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (dma_is_direct(ops))
+		dma_direct_sync_single_for_device(dev, addr, size, dir);
+	else if (ops->sync_single_for_device)
+		ops->sync_single_for_device(dev, addr, size, dir);
+	debug_dma_sync_single_for_device(dev, addr, size, dir);
+}
+EXPORT_SYMBOL(dma_sync_single_for_device);
+
+void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+		    int nelems, enum dma_data_direction dir)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (dma_is_direct(ops))
+		dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir);
+	else if (ops->sync_sg_for_cpu)
+		ops->sync_sg_for_cpu(dev, sg, nelems, dir);
+	debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
+}
+EXPORT_SYMBOL(dma_sync_sg_for_cpu);
+
+void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+		       int nelems, enum dma_data_direction dir)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (dma_is_direct(ops))
+		dma_direct_sync_sg_for_device(dev, sg, nelems, dir);
+	else if (ops->sync_sg_for_device)
+		ops->sync_sg_for_device(dev, sg, nelems, dir);
+	debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
+}
+EXPORT_SYMBOL(dma_sync_sg_for_device);
+
 /*
  * Create scatter-list for the already allocated DMA buffer.
  */
-- 
cgit v1.2.3


From b4174173005972f8f6497883d08d87e0aba1b604 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 8 Jul 2020 09:47:08 +0200
Subject: dma-mapping: inline the fast path dma-direct calls

Inline the single page map/unmap/sync dma-direct calls into the now
out of line generic wrappers.  This restores the behavior of a single
function call that we had before moving the generic calls out of line.
Besides the dma-mapping callers there are just a few callers in IOMMU
drivers that have a bypass mode, and more of those are going to be
switched to the generic bypass soon.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 include/linux/dma-direct.h | 92 ++++++++++++++++++++++++++++++++++------------
 kernel/dma/direct.c        | 65 --------------------------------
 2 files changed, 69 insertions(+), 88 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 2b045c509146..5a3ce2a24794 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -1,10 +1,16 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Internals of the DMA direct mapping implementation.  Only for use by the
+ * DMA mapping code and IOMMU drivers.
+ */
 #ifndef _LINUX_DMA_DIRECT_H
 #define _LINUX_DMA_DIRECT_H 1
 
 #include <linux/dma-mapping.h>
+#include <linux/dma-noncoherent.h>
 #include <linux/memblock.h> /* for min_low_pfn */
 #include <linux/mem_encrypt.h>
+#include <linux/swiotlb.h>
 
 extern unsigned int zone_dma_bits;
 
@@ -87,25 +93,17 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
 		unsigned long attrs);
 int dma_direct_supported(struct device *dev, u64 mask);
 bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr);
-dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
-		unsigned long offset, size_t size, enum dma_data_direction dir,
-		unsigned long attrs);
 int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 		enum dma_data_direction dir, unsigned long attrs);
 dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
 		size_t size, enum dma_data_direction dir, unsigned long attrs);
+size_t dma_direct_max_mapping_size(struct device *dev);
 
 #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
     defined(CONFIG_SWIOTLB)
-void dma_direct_sync_single_for_device(struct device *dev,
-		dma_addr_t addr, size_t size, enum dma_data_direction dir);
-void dma_direct_sync_sg_for_device(struct device *dev,
-		struct scatterlist *sgl, int nents, enum dma_data_direction dir);
+void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
+		int nents, enum dma_data_direction dir);
 #else
-static inline void dma_direct_sync_single_for_device(struct device *dev,
-		dma_addr_t addr, size_t size, enum dma_data_direction dir)
-{
-}
 static inline void dma_direct_sync_sg_for_device(struct device *dev,
 		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
 {
@@ -115,34 +113,82 @@ static inline void dma_direct_sync_sg_for_device(struct device *dev,
 #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
     defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
     defined(CONFIG_SWIOTLB)
-void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
-		size_t size, enum dma_data_direction dir, unsigned long attrs);
 void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
 		int nents, enum dma_data_direction dir, unsigned long attrs);
-void dma_direct_sync_single_for_cpu(struct device *dev,
-		dma_addr_t addr, size_t size, enum dma_data_direction dir);
 void dma_direct_sync_sg_for_cpu(struct device *dev,
 		struct scatterlist *sgl, int nents, enum dma_data_direction dir);
 #else
-static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
-		size_t size, enum dma_data_direction dir, unsigned long attrs)
-{
-}
 static inline void dma_direct_unmap_sg(struct device *dev,
 		struct scatterlist *sgl, int nents, enum dma_data_direction dir,
 		unsigned long attrs)
 {
 }
+static inline void dma_direct_sync_sg_for_cpu(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+}
+#endif
+
+static inline void dma_direct_sync_single_for_device(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+	phys_addr_t paddr = dma_to_phys(dev, addr);
+
+	if (unlikely(is_swiotlb_buffer(paddr)))
+		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE);
+
+	if (!dev_is_dma_coherent(dev))
+		arch_sync_dma_for_device(paddr, size, dir);
+}
+
 static inline void dma_direct_sync_single_for_cpu(struct device *dev,
 		dma_addr_t addr, size_t size, enum dma_data_direction dir)
 {
+	phys_addr_t paddr = dma_to_phys(dev, addr);
+
+	if (!dev_is_dma_coherent(dev)) {
+		arch_sync_dma_for_cpu(paddr, size, dir);
+		arch_sync_dma_for_cpu_all();
+	}
+
+	if (unlikely(is_swiotlb_buffer(paddr)))
+		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
 }
-static inline void dma_direct_sync_sg_for_cpu(struct device *dev,
-		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+
+static inline dma_addr_t dma_direct_map_page(struct device *dev,
+		struct page *page, unsigned long offset, size_t size,
+		enum dma_data_direction dir, unsigned long attrs)
 {
+	phys_addr_t phys = page_to_phys(page) + offset;
+	dma_addr_t dma_addr = phys_to_dma(dev, phys);
+
+	if (unlikely(swiotlb_force == SWIOTLB_FORCE))
+		return swiotlb_map(dev, phys, size, dir, attrs);
+
+	if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
+		if (swiotlb_force != SWIOTLB_NO_FORCE)
+			return swiotlb_map(dev, phys, size, dir, attrs);
+
+		dev_WARN_ONCE(dev, 1,
+			     "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
+			     &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
+		return DMA_MAPPING_ERROR;
+	}
+
+	if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		arch_sync_dma_for_device(phys, size, dir);
+	return dma_addr;
 }
-#endif
 
-size_t dma_direct_max_mapping_size(struct device *dev);
+static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
+		size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+	phys_addr_t phys = dma_to_phys(dev, addr);
 
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+
+	if (unlikely(is_swiotlb_buffer(phys)))
+		swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);
+}
 #endif /* _LINUX_DMA_DIRECT_H */
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index e545d14fccee..bb0041e99659 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -10,11 +10,9 @@
 #include <linux/dma-direct.h>
 #include <linux/scatterlist.h>
 #include <linux/dma-contiguous.h>
-#include <linux/dma-noncoherent.h>
 #include <linux/pfn.h>
 #include <linux/vmalloc.h>
 #include <linux/set_memory.h>
-#include <linux/swiotlb.h>
 
 /*
  * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use it
@@ -304,18 +302,6 @@ void dma_direct_free(struct device *dev, size_t size,
 
 #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
     defined(CONFIG_SWIOTLB)
-void dma_direct_sync_single_for_device(struct device *dev,
-		dma_addr_t addr, size_t size, enum dma_data_direction dir)
-{
-	phys_addr_t paddr = dma_to_phys(dev, addr);
-
-	if (unlikely(is_swiotlb_buffer(paddr)))
-		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE);
-
-	if (!dev_is_dma_coherent(dev))
-		arch_sync_dma_for_device(paddr, size, dir);
-}
-
 void dma_direct_sync_sg_for_device(struct device *dev,
 		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
 {
@@ -339,20 +325,6 @@ void dma_direct_sync_sg_for_device(struct device *dev,
 #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
     defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
     defined(CONFIG_SWIOTLB)
-void dma_direct_sync_single_for_cpu(struct device *dev,
-		dma_addr_t addr, size_t size, enum dma_data_direction dir)
-{
-	phys_addr_t paddr = dma_to_phys(dev, addr);
-
-	if (!dev_is_dma_coherent(dev)) {
-		arch_sync_dma_for_cpu(paddr, size, dir);
-		arch_sync_dma_for_cpu_all();
-	}
-
-	if (unlikely(is_swiotlb_buffer(paddr)))
-		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
-}
-
 void dma_direct_sync_sg_for_cpu(struct device *dev,
 		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
 {
@@ -374,18 +346,6 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
 		arch_sync_dma_for_cpu_all();
 }
 
-void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
-		size_t size, enum dma_data_direction dir, unsigned long attrs)
-{
-	phys_addr_t phys = dma_to_phys(dev, addr);
-
-	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
-		dma_direct_sync_single_for_cpu(dev, addr, size, dir);
-
-	if (unlikely(is_swiotlb_buffer(phys)))
-		swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);
-}
-
 void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
 		int nents, enum dma_data_direction dir, unsigned long attrs)
 {
@@ -398,31 +358,6 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
 }
 #endif
 
-dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
-		unsigned long offset, size_t size, enum dma_data_direction dir,
-		unsigned long attrs)
-{
-	phys_addr_t phys = page_to_phys(page) + offset;
-	dma_addr_t dma_addr = phys_to_dma(dev, phys);
-
-	if (unlikely(swiotlb_force == SWIOTLB_FORCE))
-		return swiotlb_map(dev, phys, size, dir, attrs);
-
-	if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
-		if (swiotlb_force != SWIOTLB_NO_FORCE)
-			return swiotlb_map(dev, phys, size, dir, attrs);
-
-		dev_WARN_ONCE(dev, 1,
-			     "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
-			     &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
-		return DMA_MAPPING_ERROR;
-	}
-
-	if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
-		arch_sync_dma_for_device(phys, size, dir);
-	return dma_addr;
-}
-
 int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 		enum dma_data_direction dir, unsigned long attrs)
 {
-- 
cgit v1.2.3


From 9b74ebb2b0f259474da65fa0178c657e5fa5c640 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 14 Jul 2020 15:56:34 +0200
Subject: cpumap: Use non-locked version __ptr_ring_consume_batched

Commit 77361825bb01 ("bpf: cpumap use ptr_ring_consume_batched") changed
away from using single frame ptr_ring dequeue (__ptr_ring_consume) to
consume a batched, but it uses a locked version, which as the comment
explain isn't needed.

Change to use the non-locked version __ptr_ring_consume_batched.

Fixes: 77361825bb01 ("bpf: cpumap use ptr_ring_consume_batched")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/a9c7d06f9a009e282209f0c8c7b2c5d9b9ad60b9.1594734381.git.lorenzo@kernel.org
---
 kernel/bpf/cpumap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index bd8658055c16..323c91c4fab0 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -259,7 +259,7 @@ static int cpu_map_kthread_run(void *data)
 		 * kthread CPU pinned. Lockless access to ptr_ring
 		 * consume side valid as no-resize allowed of queue.
 		 */
-		n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH);
+		n = __ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH);
 
 		for (i = 0; i < n; i++) {
 			void *f = frames[i];
-- 
cgit v1.2.3


From 644bfe51fa49c22244d24e896cd3fe3ee2f2cfd1 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Tue, 14 Jul 2020 15:56:37 +0200
Subject: cpumap: Formalize map value as a named struct

As it has been already done for devmap, introduce 'struct bpf_cpumap_val'
to formalize the expected values that can be passed in for a CPUMAP.
Update cpumap code to use the struct.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Link: https://lore.kernel.org/bpf/754f950674665dae6139c061d28c1d982aaf4170.1594734381.git.lorenzo@kernel.org
---
 include/uapi/linux/bpf.h       |  9 +++++++++
 kernel/bpf/cpumap.c            | 28 +++++++++++++++-------------
 tools/include/uapi/linux/bpf.h |  9 +++++++++
 3 files changed, 33 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 5e386389913a..109623527358 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3849,6 +3849,15 @@ struct bpf_devmap_val {
 	} bpf_prog;
 };
 
+/* CPUMAP map-value layout
+ *
+ * The struct data-layout of map-value is a configuration interface.
+ * New members can only be added to the end of this structure.
+ */
+struct bpf_cpumap_val {
+	__u32 qsize;	/* queue size to remote target CPU */
+};
+
 enum sk_action {
 	SK_DROP = 0,
 	SK_PASS,
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 323c91c4fab0..ff48dc00e8d0 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -52,7 +52,6 @@ struct xdp_bulk_queue {
 struct bpf_cpu_map_entry {
 	u32 cpu;    /* kthread CPU and map index */
 	int map_id; /* Back reference to map */
-	u32 qsize;  /* Queue size placeholder for map lookup */
 
 	/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
 	struct xdp_bulk_queue __percpu *bulkq;
@@ -62,10 +61,13 @@ struct bpf_cpu_map_entry {
 	/* Queue with potential multi-producers, and single-consumer kthread */
 	struct ptr_ring *queue;
 	struct task_struct *kthread;
-	struct work_struct kthread_stop_wq;
+
+	struct bpf_cpumap_val value;
 
 	atomic_t refcnt; /* Control when this struct can be free'ed */
 	struct rcu_head rcu;
+
+	struct work_struct kthread_stop_wq;
 };
 
 struct bpf_cpu_map {
@@ -307,8 +309,8 @@ static int cpu_map_kthread_run(void *data)
 	return 0;
 }
 
-static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
-						       int map_id)
+static struct bpf_cpu_map_entry *
+__cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id)
 {
 	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 	struct bpf_cpu_map_entry *rcpu;
@@ -338,13 +340,13 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
 	if (!rcpu->queue)
 		goto free_bulkq;
 
-	err = ptr_ring_init(rcpu->queue, qsize, gfp);
+	err = ptr_ring_init(rcpu->queue, value->qsize, gfp);
 	if (err)
 		goto free_queue;
 
 	rcpu->cpu    = cpu;
 	rcpu->map_id = map_id;
-	rcpu->qsize  = qsize;
+	rcpu->value.qsize  = value->qsize;
 
 	/* Setup kthread */
 	rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
@@ -437,12 +439,12 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
 			       u64 map_flags)
 {
 	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	struct bpf_cpumap_val cpumap_value = {};
 	struct bpf_cpu_map_entry *rcpu;
-
 	/* Array index key correspond to CPU number */
 	u32 key_cpu = *(u32 *)key;
-	/* Value is the queue size */
-	u32 qsize = *(u32 *)value;
+
+	memcpy(&cpumap_value, value, map->value_size);
 
 	if (unlikely(map_flags > BPF_EXIST))
 		return -EINVAL;
@@ -450,18 +452,18 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
 		return -E2BIG;
 	if (unlikely(map_flags == BPF_NOEXIST))
 		return -EEXIST;
-	if (unlikely(qsize > 16384)) /* sanity limit on qsize */
+	if (unlikely(cpumap_value.qsize > 16384)) /* sanity limit on qsize */
 		return -EOVERFLOW;
 
 	/* Make sure CPU is a valid possible cpu */
 	if (key_cpu >= nr_cpumask_bits || !cpu_possible(key_cpu))
 		return -ENODEV;
 
-	if (qsize == 0) {
+	if (cpumap_value.qsize == 0) {
 		rcpu = NULL; /* Same as deleting */
 	} else {
 		/* Updating qsize cause re-allocation of bpf_cpu_map_entry */
-		rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
+		rcpu = __cpu_map_entry_alloc(&cpumap_value, key_cpu, map->id);
 		if (!rcpu)
 			return -ENOMEM;
 		rcpu->cmap = cmap;
@@ -523,7 +525,7 @@ static void *cpu_map_lookup_elem(struct bpf_map *map, void *key)
 	struct bpf_cpu_map_entry *rcpu =
 		__cpu_map_lookup_elem(map, *(u32 *)key);
 
-	return rcpu ? &rcpu->qsize : NULL;
+	return rcpu ? &rcpu->value : NULL;
 }
 
 static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 5e386389913a..109623527358 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3849,6 +3849,15 @@ struct bpf_devmap_val {
 	} bpf_prog;
 };
 
+/* CPUMAP map-value layout
+ *
+ * The struct data-layout of map-value is a configuration interface.
+ * New members can only be added to the end of this structure.
+ */
+struct bpf_cpumap_val {
+	__u32 qsize;	/* queue size to remote target CPU */
+};
+
 enum sk_action {
 	SK_DROP = 0,
 	SK_PASS,
-- 
cgit v1.2.3


From 9216477449f33cdbc9c9a99d49f500b7fbb81702 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Tue, 14 Jul 2020 15:56:38 +0200
Subject: bpf: cpumap: Add the possibility to attach an eBPF program to cpumap

Introduce the capability to attach an eBPF program to cpumap entries.
The idea behind this feature is to add the possibility to define on
which CPU run the eBPF program if the underlying hw does not support
RSS. Current supported verdicts are XDP_DROP and XDP_PASS.

This patch has been tested on Marvell ESPRESSObin using xdp_redirect_cpu
sample available in the kernel tree to identify possible performance
regressions. Results show there are no observable differences in
packet-per-second:

$./xdp_redirect_cpu --progname xdp_cpu_map0 --dev eth0 --cpu 1
rx: 354.8 Kpps
rx: 356.0 Kpps
rx: 356.8 Kpps
rx: 356.3 Kpps
rx: 356.6 Kpps
rx: 356.6 Kpps
rx: 356.7 Kpps
rx: 355.8 Kpps
rx: 356.8 Kpps
rx: 356.8 Kpps

Co-developed-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Link: https://lore.kernel.org/bpf/5c9febdf903d810b3415732e5cd98491d7d9067a.1594734381.git.lorenzo@kernel.org
---
 include/linux/bpf.h            |   6 ++
 include/net/xdp.h              |   5 ++
 include/trace/events/xdp.h     |  14 +++--
 include/uapi/linux/bpf.h       |   5 ++
 kernel/bpf/cpumap.c            | 121 ++++++++++++++++++++++++++++++++++++-----
 net/core/dev.c                 |   9 +++
 tools/include/uapi/linux/bpf.h |   5 ++
 7 files changed, 148 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c67c88ad35f8..54ad426dbea1 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1272,6 +1272,7 @@ struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
 void __cpu_map_flush(void);
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
+bool cpu_map_prog_allowed(struct bpf_map *map);
 
 /* Return map's numa specified by userspace */
 static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
@@ -1432,6 +1433,11 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
 	return 0;
 }
 
+static inline bool cpu_map_prog_allowed(struct bpf_map *map)
+{
+	return false;
+}
+
 static inline struct bpf_prog *bpf_prog_get_type_path(const char *name,
 				enum bpf_prog_type type)
 {
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 5b383c450858..83b9e0142b52 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -98,6 +98,11 @@ struct xdp_frame {
 	struct net_device *dev_rx; /* used by cpumap */
 };
 
+struct xdp_cpumap_stats {
+	unsigned int pass;
+	unsigned int drop;
+};
+
 /* Clear kernel pointers in xdp_frame */
 static inline void xdp_scrub_frame(struct xdp_frame *frame)
 {
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index b73d3e141323..e2c99f5bee39 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -177,9 +177,9 @@ DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err,
 TRACE_EVENT(xdp_cpumap_kthread,
 
 	TP_PROTO(int map_id, unsigned int processed,  unsigned int drops,
-		 int sched),
+		 int sched, struct xdp_cpumap_stats *xdp_stats),
 
-	TP_ARGS(map_id, processed, drops, sched),
+	TP_ARGS(map_id, processed, drops, sched, xdp_stats),
 
 	TP_STRUCT__entry(
 		__field(int, map_id)
@@ -188,6 +188,8 @@ TRACE_EVENT(xdp_cpumap_kthread,
 		__field(unsigned int, drops)
 		__field(unsigned int, processed)
 		__field(int, sched)
+		__field(unsigned int, xdp_pass)
+		__field(unsigned int, xdp_drop)
 	),
 
 	TP_fast_assign(
@@ -197,16 +199,20 @@ TRACE_EVENT(xdp_cpumap_kthread,
 		__entry->drops		= drops;
 		__entry->processed	= processed;
 		__entry->sched	= sched;
+		__entry->xdp_pass	= xdp_stats->pass;
+		__entry->xdp_drop	= xdp_stats->drop;
 	),
 
 	TP_printk("kthread"
 		  " cpu=%d map_id=%d action=%s"
 		  " processed=%u drops=%u"
-		  " sched=%d",
+		  " sched=%d"
+		  " xdp_pass=%u xdp_drop=%u",
 		  __entry->cpu, __entry->map_id,
 		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
 		  __entry->processed, __entry->drops,
-		  __entry->sched)
+		  __entry->sched,
+		  __entry->xdp_pass, __entry->xdp_drop)
 );
 
 TRACE_EVENT(xdp_cpumap_enqueue,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 109623527358..c010b57fce3f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -227,6 +227,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_INET6_GETSOCKNAME,
 	BPF_XDP_DEVMAP,
 	BPF_CGROUP_INET_SOCK_RELEASE,
+	BPF_XDP_CPUMAP,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -3856,6 +3857,10 @@ struct bpf_devmap_val {
  */
 struct bpf_cpumap_val {
 	__u32 qsize;	/* queue size to remote target CPU */
+	union {
+		int   fd;	/* prog fd on map write */
+		__u32 id;	/* prog id on map read */
+	} bpf_prog;
 };
 
 enum sk_action {
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index ff48dc00e8d0..b3a8aea81ee5 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -63,6 +63,7 @@ struct bpf_cpu_map_entry {
 	struct task_struct *kthread;
 
 	struct bpf_cpumap_val value;
+	struct bpf_prog *prog;
 
 	atomic_t refcnt; /* Control when this struct can be free'ed */
 	struct rcu_head rcu;
@@ -82,6 +83,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq);
 
 static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 {
+	u32 value_size = attr->value_size;
 	struct bpf_cpu_map *cmap;
 	int err = -ENOMEM;
 	u64 cost;
@@ -92,7 +94,9 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+	    (value_size != offsetofend(struct bpf_cpumap_val, qsize) &&
+	     value_size != offsetofend(struct bpf_cpumap_val, bpf_prog.fd)) ||
+	    attr->map_flags & ~BPF_F_NUMA_NODE)
 		return ERR_PTR(-EINVAL);
 
 	cmap = kzalloc(sizeof(*cmap), GFP_USER);
@@ -214,6 +218,8 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
 static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
 {
 	if (atomic_dec_and_test(&rcpu->refcnt)) {
+		if (rcpu->prog)
+			bpf_prog_put(rcpu->prog);
 		/* The queue should be empty at this point */
 		__cpu_map_ring_cleanup(rcpu->queue);
 		ptr_ring_cleanup(rcpu->queue, NULL);
@@ -222,6 +228,62 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
 	}
 }
 
+static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
+				    void **frames, int n,
+				    struct xdp_cpumap_stats *stats)
+{
+	struct xdp_rxq_info rxq;
+	struct xdp_buff xdp;
+	int i, nframes = 0;
+
+	if (!rcpu->prog)
+		return n;
+
+	rcu_read_lock();
+
+	xdp_set_return_frame_no_direct();
+	xdp.rxq = &rxq;
+
+	for (i = 0; i < n; i++) {
+		struct xdp_frame *xdpf = frames[i];
+		u32 act;
+		int err;
+
+		rxq.dev = xdpf->dev_rx;
+		rxq.mem = xdpf->mem;
+		/* TODO: report queue_index to xdp_rxq_info */
+
+		xdp_convert_frame_to_buff(xdpf, &xdp);
+
+		act = bpf_prog_run_xdp(rcpu->prog, &xdp);
+		switch (act) {
+		case XDP_PASS:
+			err = xdp_update_frame_from_buff(&xdp, xdpf);
+			if (err < 0) {
+				xdp_return_frame(xdpf);
+				stats->drop++;
+			} else {
+				frames[nframes++] = xdpf;
+				stats->pass++;
+			}
+			break;
+		default:
+			bpf_warn_invalid_xdp_action(act);
+			/* fallthrough */
+		case XDP_DROP:
+			xdp_return_frame(xdpf);
+			stats->drop++;
+			break;
+		}
+	}
+
+	xdp_clear_return_frame_no_direct();
+
+	rcu_read_unlock();
+
+	return nframes;
+}
+
 #define CPUMAP_BATCH 8
 
 static int cpu_map_kthread_run(void *data)
@@ -236,11 +298,12 @@ static int cpu_map_kthread_run(void *data)
 	 * kthread_stop signal until queue is empty.
 	 */
 	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
+		struct xdp_cpumap_stats stats = {}; /* zero stats */
+		gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
 		unsigned int drops = 0, sched = 0;
 		void *frames[CPUMAP_BATCH];
 		void *skbs[CPUMAP_BATCH];
-		gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
-		int i, n, m;
+		int i, n, m, nframes;
 
 		/* Release CPU reschedule checks */
 		if (__ptr_ring_empty(rcpu->queue)) {
@@ -261,8 +324,8 @@ static int cpu_map_kthread_run(void *data)
 		 * kthread CPU pinned. Lockless access to ptr_ring
 		 * consume side valid as no-resize allowed of queue.
 		 */
-		n = __ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH);
-
+		n = __ptr_ring_consume_batched(rcpu->queue, frames,
+					       CPUMAP_BATCH);
 		for (i = 0; i < n; i++) {
 			void *f = frames[i];
 			struct page *page = virt_to_page(f);
@@ -274,15 +337,19 @@ static int cpu_map_kthread_run(void *data)
 			prefetchw(page);
 		}
 
-		m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs);
-		if (unlikely(m == 0)) {
-			for (i = 0; i < n; i++)
-				skbs[i] = NULL; /* effect: xdp_return_frame */
-			drops = n;
+		/* Support running another XDP prog on this CPU */
+		nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, n, &stats);
+		if (nframes) {
+			m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs);
+			if (unlikely(m == 0)) {
+				for (i = 0; i < nframes; i++)
+					skbs[i] = NULL; /* effect: xdp_return_frame */
+				drops += nframes;
+			}
 		}
 
 		local_bh_disable();
-		for (i = 0; i < n; i++) {
+		for (i = 0; i < nframes; i++) {
 			struct xdp_frame *xdpf = frames[i];
 			struct sk_buff *skb = skbs[i];
 			int ret;
@@ -299,7 +366,7 @@ static int cpu_map_kthread_run(void *data)
 				drops++;
 		}
 		/* Feedback loop via tracepoint */
-		trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched);
+		trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched, &stats);
 
 		local_bh_enable(); /* resched point, may call do_softirq() */
 	}
@@ -309,13 +376,38 @@ static int cpu_map_kthread_run(void *data)
 	return 0;
 }
 
+bool cpu_map_prog_allowed(struct bpf_map *map)
+{
+	return map->map_type == BPF_MAP_TYPE_CPUMAP &&
+	       map->value_size != offsetofend(struct bpf_cpumap_val, qsize);
+}
+
+static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
+{
+	struct bpf_prog *prog;
+
+	prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	if (prog->expected_attach_type != BPF_XDP_CPUMAP) {
+		bpf_prog_put(prog);
+		return -EINVAL;
+	}
+
+	rcpu->value.bpf_prog.id = prog->aux->id;
+	rcpu->prog = prog;
+
+	return 0;
+}
+
 static struct bpf_cpu_map_entry *
 __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id)
 {
+	int numa, err, i, fd = value->bpf_prog.fd;
 	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 	struct bpf_cpu_map_entry *rcpu;
 	struct xdp_bulk_queue *bq;
-	int numa, err, i;
 
 	/* Have map->numa_node, but choose node of redirect target CPU */
 	numa = cpu_to_node(cpu);
@@ -357,6 +449,9 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id)
 	get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
 	get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
 
+	if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd))
+		goto free_ptr_ring;
+
 	/* Make sure kthread runs on a single CPU */
 	kthread_bind(rcpu->kthread, cpu);
 	wake_up_process(rcpu->kthread);
diff --git a/net/core/dev.c b/net/core/dev.c
index b61075828358..b820527f0a8d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5448,6 +5448,8 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 		for (i = 0; i < new->aux->used_map_cnt; i++) {
 			if (dev_map_can_have_prog(new->aux->used_maps[i]))
 				return -EINVAL;
+			if (cpu_map_prog_allowed(new->aux->used_maps[i]))
+				return -EINVAL;
 		}
 	}
 
@@ -8875,6 +8877,13 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 			return -EINVAL;
 		}
 
+		if (prog->expected_attach_type == BPF_XDP_CPUMAP) {
+			NL_SET_ERR_MSG(extack,
+				       "BPF_XDP_CPUMAP programs can not be attached to a device");
+			bpf_prog_put(prog);
+			return -EINVAL;
+		}
+
 		/* prog->aux->id may be 0 for orphaned device-bound progs */
 		if (prog->aux->id && prog->aux->id == prog_id) {
 			bpf_prog_put(prog);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 109623527358..c010b57fce3f 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -227,6 +227,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_INET6_GETSOCKNAME,
 	BPF_XDP_DEVMAP,
 	BPF_CGROUP_INET_SOCK_RELEASE,
+	BPF_XDP_CPUMAP,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -3856,6 +3857,10 @@ struct bpf_devmap_val {
  */
 struct bpf_cpumap_val {
 	__u32 qsize;	/* queue size to remote target CPU */
+	union {
+		int   fd;	/* prog fd on map write */
+		__u32 id;	/* prog id on map read */
+	} bpf_prog;
 };
 
 enum sk_action {
-- 
cgit v1.2.3


From 28b1520ebf81ced970141640d90279ac7b9f1f9a Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Tue, 14 Jul 2020 15:56:39 +0200
Subject: bpf: cpumap: Implement XDP_REDIRECT for eBPF programs attached to map
 entries

Introduce XDP_REDIRECT support for eBPF programs attached to cpumap
entries.
This patch has been tested on Marvell ESPRESSObin using a modified
version of xdp_redirect_cpu sample in order to attach a XDP program
to CPUMAP entries to perform a redirect on the mvneta interface.
In particular the following scenario has been tested:

rq (cpu0) --> mvneta - XDP_REDIRECT (cpu0) --> CPUMAP - XDP_REDIRECT (cpu1) --> mvneta

$./xdp_redirect_cpu -p xdp_cpu_map0 -d eth0 -c 1 -e xdp_redirect \
	-f xdp_redirect_kern.o -m tx_port -r eth0

tx: 285.2 Kpps rx: 285.2 Kpps

Attaching a simple XDP program on eth0 to perform XDP_TX gives
comparable results:

tx: 288.4 Kpps rx: 288.4 Kpps

Co-developed-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Link: https://lore.kernel.org/bpf/2cf8373a731867af302b00c4ff16c122630c4980.1594734381.git.lorenzo@kernel.org
---
 include/net/xdp.h          |  1 +
 include/trace/events/xdp.h |  6 ++++--
 kernel/bpf/cpumap.c        | 17 +++++++++++++++--
 3 files changed, 20 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 83b9e0142b52..5be0d4d65b94 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -99,6 +99,7 @@ struct xdp_frame {
 };
 
 struct xdp_cpumap_stats {
+	unsigned int redirect;
 	unsigned int pass;
 	unsigned int drop;
 };
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index e2c99f5bee39..cd24e8a59529 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -190,6 +190,7 @@ TRACE_EVENT(xdp_cpumap_kthread,
 		__field(int, sched)
 		__field(unsigned int, xdp_pass)
 		__field(unsigned int, xdp_drop)
+		__field(unsigned int, xdp_redirect)
 	),
 
 	TP_fast_assign(
@@ -201,18 +202,19 @@ TRACE_EVENT(xdp_cpumap_kthread,
 		__entry->sched	= sched;
 		__entry->xdp_pass	= xdp_stats->pass;
 		__entry->xdp_drop	= xdp_stats->drop;
+		__entry->xdp_redirect	= xdp_stats->redirect;
 	),
 
 	TP_printk("kthread"
 		  " cpu=%d map_id=%d action=%s"
 		  " processed=%u drops=%u"
 		  " sched=%d"
-		  " xdp_pass=%u xdp_drop=%u",
+		  " xdp_pass=%u xdp_drop=%u xdp_redirect=%u",
 		  __entry->cpu, __entry->map_id,
 		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
 		  __entry->processed, __entry->drops,
 		  __entry->sched,
-		  __entry->xdp_pass, __entry->xdp_drop)
+		  __entry->xdp_pass, __entry->xdp_drop, __entry->xdp_redirect)
 );
 
 TRACE_EVENT(xdp_cpumap_enqueue,
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index b3a8aea81ee5..4c95d0615ca2 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -239,7 +239,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
 	if (!rcpu->prog)
 		return n;
 
-	rcu_read_lock();
+	rcu_read_lock_bh();
 
 	xdp_set_return_frame_no_direct();
 	xdp.rxq = &rxq;
@@ -267,6 +267,16 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
 				stats->pass++;
 			}
 			break;
+		case XDP_REDIRECT:
+			err = xdp_do_redirect(xdpf->dev_rx, &xdp,
+					      rcpu->prog);
+			if (unlikely(err)) {
+				xdp_return_frame(xdpf);
+				stats->drop++;
+			} else {
+				stats->redirect++;
+			}
+			break;
 		default:
 			bpf_warn_invalid_xdp_action(act);
 			/* fallthrough */
@@ -277,9 +287,12 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
 		}
 	}
 
+	if (stats->redirect)
+		xdp_do_flush_map();
+
 	xdp_clear_return_frame_no_direct();
 
-	rcu_read_unlock();
+	rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
 
 	return nframes;
 }
-- 
cgit v1.2.3


From 3f649ab728cda8038259d8f14492fe400fbab911 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 3 Jun 2020 13:09:38 -0700
Subject: treewide: Remove uninitialized_var() usage

Using uninitialized_var() is dangerous as it papers over real bugs[1]
(or can in the future), and suppresses unrelated compiler warnings
(e.g. "unused variable"). If the compiler thinks it is uninitialized,
either simply initialize the variable or make compiler changes.

In preparation for removing[2] the[3] macro[4], remove all remaining
needless uses with the following script:

git grep '\buninitialized_var\b' | cut -d: -f1 | sort -u | \
	xargs perl -pi -e \
		's/\buninitialized_var\(([^\)]+)\)/\1/g;
		 s:\s*/\* (GCC be quiet|to make compiler happy) \*/$::g;'

drivers/video/fbdev/riva/riva_hw.c was manually tweaked to avoid
pathological white-space.

No outstanding warnings were found building allmodconfig with GCC 9.3.0
for x86_64, i386, arm64, arm, powerpc, powerpc64le, s390x, mips, sparc64,
alpha, and m68k.

[1] https://lore.kernel.org/lkml/20200603174714.192027-1-glider@google.com/
[2] https://lore.kernel.org/lkml/CA+55aFw+Vbj0i=1TGqCR5vQkCzWJ0QxK6CernOU6eedsudAixw@mail.gmail.com/
[3] https://lore.kernel.org/lkml/CA+55aFwgbgqhbp1fkxvRKEpzyR5J8n1vKT1VZdz9knmPuXhOeg@mail.gmail.com/
[4] https://lore.kernel.org/lkml/CA+55aFz2500WfbKXAx8s67wrm9=yVJu65TpLgN_ybYNv0VEOKA@mail.gmail.com/

Reviewed-by: Leon Romanovsky <leonro@mellanox.com> # drivers/infiniband and mlx4/mlx5
Acked-by: Jason Gunthorpe <jgg@mellanox.com> # IB
Acked-by: Kalle Valo <kvalo@codeaurora.org> # wireless drivers
Reviewed-by: Chao Yu <yuchao0@huawei.com> # erofs
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 arch/arm/mach-sa1100/assabet.c                      |  2 +-
 arch/arm/mm/alignment.c                             |  2 +-
 arch/ia64/kernel/process.c                          |  2 +-
 arch/ia64/mm/discontig.c                            |  2 +-
 arch/ia64/mm/tlb.c                                  |  2 +-
 arch/mips/lib/dump_tlb.c                            |  2 +-
 arch/mips/mm/init.c                                 |  2 +-
 arch/mips/mm/tlb-r4k.c                              |  6 +++---
 arch/powerpc/kvm/book3s_64_mmu_radix.c              |  2 +-
 arch/powerpc/kvm/powerpc.c                          |  2 +-
 arch/powerpc/platforms/52xx/mpc52xx_pic.c           |  2 +-
 arch/s390/kernel/smp.c                              |  2 +-
 arch/x86/kernel/quirks.c                            | 10 +++++-----
 arch/x86/kvm/mmu/mmu.c                              |  2 +-
 arch/x86/kvm/mmu/paging_tmpl.h                      |  2 +-
 arch/x86/kvm/x86.c                                  |  2 +-
 block/blk-merge.c                                   |  2 +-
 drivers/acpi/acpi_pad.c                             |  2 +-
 drivers/ata/libata-scsi.c                           |  2 +-
 drivers/atm/zatm.c                                  |  2 +-
 drivers/block/drbd/drbd_nl.c                        |  6 +++---
 drivers/block/rbd.c                                 |  2 +-
 drivers/clk/clk-gate.c                              |  2 +-
 drivers/firewire/ohci.c                             | 14 +++++++-------
 drivers/gpu/drm/bridge/sil-sii8620.c                |  2 +-
 drivers/gpu/drm/drm_edid.c                          |  2 +-
 drivers/gpu/drm/exynos/exynos_drm_dsi.c             |  6 +++---
 drivers/gpu/drm/i915/display/intel_fbc.c            |  2 +-
 drivers/gpu/drm/i915/gt/intel_lrc.c                 |  2 +-
 drivers/gpu/drm/i915/intel_uncore.c                 |  2 +-
 drivers/gpu/drm/rockchip/dw-mipi-dsi-rockchip.c     |  4 ++--
 drivers/i2c/busses/i2c-rk3x.c                       |  2 +-
 drivers/ide/ide-acpi.c                              |  2 +-
 drivers/ide/ide-atapi.c                             |  2 +-
 drivers/ide/ide-io-std.c                            |  4 ++--
 drivers/ide/ide-io.c                                |  8 ++++----
 drivers/ide/ide-sysfs.c                             |  2 +-
 drivers/ide/umc8672.c                               |  2 +-
 drivers/idle/intel_idle.c                           |  2 +-
 drivers/infiniband/core/uverbs_cmd.c                |  4 ++--
 drivers/infiniband/hw/cxgb4/cm.c                    |  2 +-
 drivers/infiniband/hw/cxgb4/cq.c                    |  2 +-
 drivers/infiniband/hw/mlx4/qp.c                     |  6 +++---
 drivers/infiniband/hw/mlx5/cq.c                     |  6 +++---
 drivers/infiniband/hw/mlx5/devx.c                   |  2 +-
 drivers/infiniband/hw/mlx5/wr.c                     |  2 +-
 drivers/infiniband/hw/mthca/mthca_qp.c              | 10 +++++-----
 drivers/infiniband/sw/siw/siw_qp_rx.c               |  2 +-
 drivers/input/serio/serio_raw.c                     |  2 +-
 drivers/iommu/intel/iommu.c                         |  2 +-
 drivers/md/dm-io.c                                  |  2 +-
 drivers/md/dm-ioctl.c                               |  2 +-
 drivers/md/dm-snap-persistent.c                     |  2 +-
 drivers/md/dm-table.c                               |  2 +-
 drivers/md/dm-writecache.c                          |  2 +-
 drivers/md/raid5.c                                  |  2 +-
 drivers/media/dvb-frontends/rtl2832.c               |  2 +-
 drivers/media/tuners/qt1010.c                       |  4 ++--
 drivers/media/usb/gspca/vicam.c                     |  2 +-
 drivers/media/usb/uvc/uvc_video.c                   |  8 ++++----
 drivers/memstick/host/jmb38x_ms.c                   |  2 +-
 drivers/memstick/host/tifm_ms.c                     |  2 +-
 drivers/mmc/host/sdhci.c                            |  2 +-
 drivers/mtd/nand/raw/nand_ecc.c                     |  2 +-
 drivers/mtd/nand/raw/s3c2410.c                      |  2 +-
 drivers/mtd/parsers/afs.c                           |  4 ++--
 drivers/mtd/ubi/eba.c                               |  2 +-
 drivers/net/can/janz-ican3.c                        |  2 +-
 drivers/net/ethernet/broadcom/bnx2.c                |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c |  4 ++--
 drivers/net/ethernet/neterion/s2io.c                |  2 +-
 drivers/net/ethernet/qlogic/qla3xxx.c               |  2 +-
 drivers/net/ethernet/sun/cassini.c                  |  2 +-
 drivers/net/ethernet/sun/niu.c                      |  6 +++---
 drivers/net/wan/z85230.c                            |  2 +-
 drivers/net/wireless/ath/ath10k/core.c              |  2 +-
 drivers/net/wireless/ath/ath6kl/init.c              |  2 +-
 drivers/net/wireless/ath/ath9k/init.c               |  2 +-
 drivers/net/wireless/broadcom/b43/debugfs.c         |  2 +-
 drivers/net/wireless/broadcom/b43/dma.c             |  2 +-
 drivers/net/wireless/broadcom/b43/lo.c              |  2 +-
 drivers/net/wireless/broadcom/b43/phy_n.c           |  2 +-
 drivers/net/wireless/broadcom/b43/xmit.c            | 12 ++++++------
 drivers/net/wireless/broadcom/b43legacy/debugfs.c   |  2 +-
 drivers/net/wireless/broadcom/b43legacy/main.c      |  2 +-
 drivers/net/wireless/intel/iwlegacy/3945.c          |  2 +-
 drivers/net/wireless/intel/iwlegacy/4965-mac.c      |  2 +-
 drivers/net/wireless/realtek/rtlwifi/rtl8192cu/hw.c |  4 ++--
 drivers/pci/pcie/aer.c                              |  2 +-
 drivers/platform/x86/hdaps.c                        |  4 ++--
 drivers/scsi/dc395x.c                               |  2 +-
 drivers/scsi/pm8001/pm8001_hwi.c                    |  2 +-
 drivers/scsi/pm8001/pm80xx_hwi.c                    |  2 +-
 drivers/ssb/driver_chipcommon.c                     |  4 ++--
 drivers/tty/cyclades.c                              |  2 +-
 drivers/tty/isicom.c                                |  2 +-
 drivers/usb/musb/cppi_dma.c                         |  2 +-
 drivers/usb/storage/sddr55.c                        |  4 ++--
 drivers/vhost/net.c                                 |  6 +++---
 drivers/video/fbdev/matrox/matroxfb_maven.c         |  6 +++---
 drivers/video/fbdev/pm3fb.c                         |  6 +++---
 drivers/video/fbdev/riva/riva_hw.c                  |  3 +--
 drivers/virtio/virtio_ring.c                        |  6 +++---
 fs/afs/dir.c                                        |  2 +-
 fs/afs/security.c                                   |  2 +-
 fs/dlm/netlink.c                                    |  2 +-
 fs/erofs/data.c                                     |  4 ++--
 fs/erofs/zdata.c                                    |  2 +-
 fs/fat/dir.c                                        |  2 +-
 fs/fuse/control.c                                   |  4 ++--
 fs/fuse/cuse.c                                      |  2 +-
 fs/fuse/file.c                                      |  2 +-
 fs/gfs2/aops.c                                      |  2 +-
 fs/gfs2/bmap.c                                      |  2 +-
 fs/gfs2/lops.c                                      |  2 +-
 fs/hfsplus/unicode.c                                |  2 +-
 fs/isofs/namei.c                                    |  4 ++--
 fs/jffs2/erase.c                                    |  2 +-
 fs/nfsd/nfsctl.c                                    |  2 +-
 fs/ocfs2/alloc.c                                    |  4 ++--
 fs/ocfs2/dir.c                                      | 14 +++++++-------
 fs/ocfs2/extent_map.c                               |  4 ++--
 fs/ocfs2/namei.c                                    |  2 +-
 fs/ocfs2/refcounttree.c                             |  2 +-
 fs/ocfs2/xattr.c                                    |  2 +-
 fs/omfs/file.c                                      |  2 +-
 fs/overlayfs/copy_up.c                              |  2 +-
 fs/ubifs/commit.c                                   |  6 +++---
 fs/ubifs/dir.c                                      |  2 +-
 fs/ubifs/file.c                                     |  4 ++--
 fs/ubifs/journal.c                                  |  4 ++--
 fs/ubifs/lpt.c                                      |  2 +-
 fs/ubifs/tnc.c                                      |  6 +++---
 fs/ubifs/tnc_misc.c                                 |  4 ++--
 fs/udf/balloc.c                                     |  2 +-
 fs/xfs/xfs_bmap_util.c                              |  2 +-
 include/net/flow_offload.h                          |  2 +-
 kernel/async.c                                      |  4 ++--
 kernel/audit.c                                      |  2 +-
 kernel/debug/kdb/kdb_io.c                           |  2 +-
 kernel/dma/debug.c                                  |  2 +-
 kernel/events/core.c                                |  2 +-
 kernel/events/uprobes.c                             |  2 +-
 kernel/exit.c                                       |  2 +-
 kernel/futex.c                                      | 14 +++++++-------
 kernel/locking/lockdep.c                            | 16 ++++++++--------
 kernel/trace/ring_buffer.c                          |  2 +-
 lib/radix-tree.c                                    |  2 +-
 lib/test_lockup.c                                   |  2 +-
 mm/frontswap.c                                      |  2 +-
 mm/ksm.c                                            |  2 +-
 mm/memcontrol.c                                     |  2 +-
 mm/memory.c                                         |  2 +-
 mm/mempolicy.c                                      |  4 ++--
 mm/page_alloc.c                                     |  2 +-
 mm/percpu.c                                         |  2 +-
 mm/slub.c                                           |  4 ++--
 mm/swap.c                                           |  4 ++--
 net/dccp/options.c                                  |  2 +-
 net/ipv4/netfilter/nf_socket_ipv4.c                 |  6 +++---
 net/ipv6/ip6_flowlabel.c                            |  2 +-
 net/ipv6/netfilter/nf_socket_ipv6.c                 |  2 +-
 net/netfilter/nf_conntrack_ftp.c                    |  2 +-
 net/netfilter/nfnetlink_log.c                       |  2 +-
 net/netfilter/nfnetlink_queue.c                     |  4 ++--
 net/sched/cls_flow.c                                |  2 +-
 net/sched/sch_cake.c                                |  2 +-
 net/sched/sch_cbq.c                                 |  2 +-
 net/sched/sch_fq_codel.c                            |  2 +-
 net/sched/sch_fq_pie.c                              |  2 +-
 net/sched/sch_hfsc.c                                |  2 +-
 net/sched/sch_htb.c                                 |  2 +-
 net/sched/sch_sfq.c                                 |  2 +-
 net/sunrpc/svcsock.c                                |  4 ++--
 net/sunrpc/xprtsock.c                               | 10 +++++-----
 net/tls/tls_sw.c                                    |  2 +-
 sound/core/control_compat.c                         |  2 +-
 sound/isa/sb/sb16_csp.c                             |  2 +-
 sound/usb/endpoint.c                                |  2 +-
 179 files changed, 278 insertions(+), 279 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/mach-sa1100/assabet.c b/arch/arm/mach-sa1100/assabet.c
index aa265ede5730..2012fa8c28cf 100644
--- a/arch/arm/mach-sa1100/assabet.c
+++ b/arch/arm/mach-sa1100/assabet.c
@@ -652,7 +652,7 @@ static void __init map_sa1100_gpio_regs( void )
  */
 static void __init get_assabet_scr(void)
 {
-	unsigned long uninitialized_var(scr), i;
+	unsigned long scr, i;
 
 	GPDR |= 0x3fc;			/* Configure GPIO 9:2 as outputs */
 	GPSR = 0x3fc;			/* Write 0xFF to GPIO 9:2 */
diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c
index 81a627e6e1c5..f4bfc1cac91a 100644
--- a/arch/arm/mm/alignment.c
+++ b/arch/arm/mm/alignment.c
@@ -799,7 +799,7 @@ static int alignment_get_thumb(struct pt_regs *regs, u16 *ip, u16 *inst)
 static int
 do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 {
-	union offset_union uninitialized_var(offset);
+	union offset_union offset;
 	unsigned long instrptr;
 	int (*handler)(unsigned long addr, u32 instr, struct pt_regs *regs);
 	unsigned int type;
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 96dfb9e4b16f..da55b41ae33e 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -445,7 +445,7 @@ static void
 do_copy_task_regs (struct task_struct *task, struct unw_frame_info *info, void *arg)
 {
 	unsigned long mask, sp, nat_bits = 0, ar_rnat, urbs_end, cfm;
-	unsigned long uninitialized_var(ip);	/* GCC be quiet */
+	unsigned long ip;
 	elf_greg_t *dst = arg;
 	struct pt_regs *pt;
 	char nat;
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index dd8284bcbf16..da810ca234da 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -180,7 +180,7 @@ static void *per_cpu_node_setup(void *cpu_data, int node)
 void __init setup_per_cpu_areas(void)
 {
 	struct pcpu_alloc_info *ai;
-	struct pcpu_group_info *uninitialized_var(gi);
+	struct pcpu_group_info *gi;
 	unsigned int *cpu_map;
 	void *base;
 	unsigned long base_offset;
diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c
index 72cc568bc841..71c19918e387 100644
--- a/arch/ia64/mm/tlb.c
+++ b/arch/ia64/mm/tlb.c
@@ -369,7 +369,7 @@ EXPORT_SYMBOL(flush_tlb_range);
 
 void ia64_tlb_init(void)
 {
-	ia64_ptce_info_t uninitialized_var(ptce_info); /* GCC be quiet */
+	ia64_ptce_info_t ptce_info;
 	u64 tr_pgbits;
 	long status;
 	pal_vm_info_1_u_t vm_info_1;
diff --git a/arch/mips/lib/dump_tlb.c b/arch/mips/lib/dump_tlb.c
index 5a418ba5e75f..4256423632c4 100644
--- a/arch/mips/lib/dump_tlb.c
+++ b/arch/mips/lib/dump_tlb.c
@@ -79,7 +79,7 @@ static void dump_tlb(int first, int last)
 	unsigned int pagemask, guestctl1 = 0, c0, c1, i;
 	unsigned long asidmask = cpu_asid_mask(&current_cpu_data);
 	int asidwidth = DIV_ROUND_UP(ilog2(asidmask) + 1, 4);
-	unsigned long uninitialized_var(s_mmid);
+	unsigned long s_mmid;
 #ifdef CONFIG_32BIT
 	bool xpa = cpu_has_xpa && (read_c0_pagegrain() & PG_ELPA);
 	int pwidth = xpa ? 11 : 8;
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index 336b58173dc7..6c7bbfe35ba3 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -83,7 +83,7 @@ void setup_zero_pages(void)
 static void *__kmap_pgprot(struct page *page, unsigned long addr, pgprot_t prot)
 {
 	enum fixed_addresses idx;
-	unsigned int uninitialized_var(old_mmid);
+	unsigned int old_mmid;
 	unsigned long vaddr, flags, entrylo;
 	unsigned long old_ctx;
 	pte_t pte;
diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c
index 6677dcb72580..38e2894d5fa3 100644
--- a/arch/mips/mm/tlb-r4k.c
+++ b/arch/mips/mm/tlb-r4k.c
@@ -119,7 +119,7 @@ void local_flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 		if (size <= (current_cpu_data.tlbsizeftlbsets ?
 			     current_cpu_data.tlbsize / 8 :
 			     current_cpu_data.tlbsize / 2)) {
-			unsigned long old_entryhi, uninitialized_var(old_mmid);
+			unsigned long old_entryhi, old_mmid;
 			int newpid = cpu_asid(cpu, mm);
 
 			old_entryhi = read_c0_entryhi();
@@ -213,7 +213,7 @@ void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
 	int cpu = smp_processor_id();
 
 	if (cpu_context(cpu, vma->vm_mm) != 0) {
-		unsigned long uninitialized_var(old_mmid);
+		unsigned long old_mmid;
 		unsigned long flags, old_entryhi;
 		int idx;
 
@@ -382,7 +382,7 @@ void add_wired_entry(unsigned long entrylo0, unsigned long entrylo1,
 #ifdef CONFIG_XPA
 	panic("Broken for XPA kernels");
 #else
-	unsigned int uninitialized_var(old_mmid);
+	unsigned int old_mmid;
 	unsigned long flags;
 	unsigned long wired;
 	unsigned long old_pagemask;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 6a73714759ba..777aa5625d5f 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -33,7 +33,7 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
 					      gva_t eaddr, void *to, void *from,
 					      unsigned long n)
 {
-	int uninitialized_var(old_pid), old_lpid;
+	int old_pid, old_lpid;
 	unsigned long quadrant, ret = n;
 	bool is_load = !!to;
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index dd7d141e33e8..aaa7b62f2f82 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -1110,7 +1110,7 @@ static inline u32 dp_to_sp(u64 fprd)
 static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;
-	u64 uninitialized_var(gpr);
+	u64 gpr;
 
 	if (run->mmio.len > sizeof(gpr)) {
 		printk(KERN_ERR "bad MMIO length: %d\n", run->mmio.len);
diff --git a/arch/powerpc/platforms/52xx/mpc52xx_pic.c b/arch/powerpc/platforms/52xx/mpc52xx_pic.c
index fc98912f42cf..76a8102bdb98 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_pic.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_pic.c
@@ -340,7 +340,7 @@ static int mpc52xx_irqhost_map(struct irq_domain *h, unsigned int virq,
 {
 	int l1irq;
 	int l2irq;
-	struct irq_chip *uninitialized_var(irqchip);
+	struct irq_chip *irqchip;
 	void *hndlr;
 	int type;
 	u32 reg;
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index e6be63ff162a..c2181471ac63 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -146,7 +146,7 @@ static int pcpu_sigp_retry(struct pcpu *pcpu, u8 order, u32 parm)
 
 static inline int pcpu_stopped(struct pcpu *pcpu)
 {
-	u32 uninitialized_var(status);
+	u32 status;
 
 	if (__pcpu_sigp(pcpu->address, SIGP_SENSE,
 			0, &status) != SIGP_CC_STATUS_STORED)
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 896d74cb5081..1b10717c9321 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -95,7 +95,7 @@ static void ich_force_hpet_resume(void)
 static void ich_force_enable_hpet(struct pci_dev *dev)
 {
 	u32 val;
-	u32 uninitialized_var(rcba);
+	u32 rcba;
 	int err = 0;
 
 	if (hpet_address || force_hpet_address)
@@ -185,7 +185,7 @@ static void hpet_print_force_info(void)
 static void old_ich_force_hpet_resume(void)
 {
 	u32 val;
-	u32 uninitialized_var(gen_cntl);
+	u32 gen_cntl;
 
 	if (!force_hpet_address || !cached_dev)
 		return;
@@ -207,7 +207,7 @@ static void old_ich_force_hpet_resume(void)
 static void old_ich_force_enable_hpet(struct pci_dev *dev)
 {
 	u32 val;
-	u32 uninitialized_var(gen_cntl);
+	u32 gen_cntl;
 
 	if (hpet_address || force_hpet_address)
 		return;
@@ -298,7 +298,7 @@ static void vt8237_force_hpet_resume(void)
 
 static void vt8237_force_enable_hpet(struct pci_dev *dev)
 {
-	u32 uninitialized_var(val);
+	u32 val;
 
 	if (hpet_address || force_hpet_address)
 		return;
@@ -429,7 +429,7 @@ static void nvidia_force_hpet_resume(void)
 
 static void nvidia_force_enable_hpet(struct pci_dev *dev)
 {
-	u32 uninitialized_var(val);
+	u32 val;
 
 	if (hpet_address || force_hpet_address)
 		return;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 76817d13c86e..deafcced65d2 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1986,7 +1986,7 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 			 unsigned long data)
 {
 	u64 *sptep;
-	struct rmap_iterator uninitialized_var(iter);
+	struct rmap_iterator iter;
 	int young = 0;
 
 	for_each_rmap_spte(rmap_head, &iter, sptep)
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index bd70ece1ef8b..275564a0ebdb 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -314,7 +314,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 {
 	int ret;
 	pt_element_t pte;
-	pt_element_t __user *uninitialized_var(ptep_user);
+	pt_element_t __user *ptep_user;
 	gfn_t table_gfn;
 	u64 pt_access, pte_access;
 	unsigned index, accessed_dirty, pte_pkey;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3b92db412335..598d5be960c9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9927,7 +9927,7 @@ void kvm_arch_sync_events(struct kvm *kvm)
 int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
 {
 	int i, r;
-	unsigned long hva, uninitialized_var(old_npages);
+	unsigned long hva, old_npages;
 	struct kvm_memslots *slots = kvm_memslots(kvm);
 	struct kvm_memory_slot *slot;
 
diff --git a/block/blk-merge.c b/block/blk-merge.c
index f0b0bae075a0..006402edef6b 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -473,7 +473,7 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
 			     struct scatterlist *sglist,
 			     struct scatterlist **sg)
 {
-	struct bio_vec uninitialized_var(bvec), bvprv = { NULL };
+	struct bio_vec bvec, bvprv = { NULL };
 	struct bvec_iter iter;
 	int nsegs = 0;
 	bool new_bio = false;
diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c
index e7dc0133f817..6cc4c92d9ff9 100644
--- a/drivers/acpi/acpi_pad.c
+++ b/drivers/acpi/acpi_pad.c
@@ -88,7 +88,7 @@ static void round_robin_cpu(unsigned int tsk_index)
 	cpumask_var_t tmp;
 	int cpu;
 	unsigned long min_weight = -1;
-	unsigned long uninitialized_var(preferred_cpu);
+	unsigned long preferred_cpu;
 
 	if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
 		return;
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 46336084b1a9..ec233208585b 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -93,7 +93,7 @@ static ssize_t ata_scsi_park_show(struct device *device,
 	struct ata_link *link;
 	struct ata_device *dev;
 	unsigned long now;
-	unsigned int uninitialized_var(msecs);
+	unsigned int msecs;
 	int rc = 0;
 
 	ap = ata_shost_to_port(sdev->host);
diff --git a/drivers/atm/zatm.c b/drivers/atm/zatm.c
index 57f97b95a453..165eebe06e39 100644
--- a/drivers/atm/zatm.c
+++ b/drivers/atm/zatm.c
@@ -940,7 +940,7 @@ static int open_tx_first(struct atm_vcc *vcc)
 	    vcc->qos.txtp.max_pcr >= ATM_OC3_PCR);
 	if (unlimited && zatm_dev->ubr != -1) zatm_vcc->shaper = zatm_dev->ubr;
 	else {
-		int uninitialized_var(pcr);
+		int pcr;
 
 		if (unlimited) vcc->qos.txtp.max_sdu = ATM_MAX_AAL5_PDU;
 		if ((zatm_vcc->shaper = alloc_shaper(vcc->dev,&pcr,
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index da4a3ebe04ef..c0017cc51ecc 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -3423,7 +3423,7 @@ int drbd_adm_dump_devices(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct nlattr *resource_filter;
 	struct drbd_resource *resource;
-	struct drbd_device *uninitialized_var(device);
+	struct drbd_device *device;
 	int minor, err, retcode;
 	struct drbd_genlmsghdr *dh;
 	struct device_info device_info;
@@ -3512,7 +3512,7 @@ int drbd_adm_dump_connections(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct nlattr *resource_filter;
 	struct drbd_resource *resource = NULL, *next_resource;
-	struct drbd_connection *uninitialized_var(connection);
+	struct drbd_connection *connection;
 	int err = 0, retcode;
 	struct drbd_genlmsghdr *dh;
 	struct connection_info connection_info;
@@ -3674,7 +3674,7 @@ int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct nlattr *resource_filter;
 	struct drbd_resource *resource;
-	struct drbd_device *uninitialized_var(device);
+	struct drbd_device *device;
 	struct drbd_peer_device *peer_device = NULL;
 	int minor, err, retcode;
 	struct drbd_genlmsghdr *dh;
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 4f61e9209461..d9c0e7d154f9 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1993,7 +1993,7 @@ static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
 	struct ceph_osd_data *osd_data;
 	u64 objno;
-	u8 state, new_state, uninitialized_var(current_state);
+	u8 state, new_state, current_state;
 	bool has_current_state;
 	void *p;
 
diff --git a/drivers/clk/clk-gate.c b/drivers/clk/clk-gate.c
index 2ca1f2ac38a6..070dc47e95a1 100644
--- a/drivers/clk/clk-gate.c
+++ b/drivers/clk/clk-gate.c
@@ -56,7 +56,7 @@ static void clk_gate_endisable(struct clk_hw *hw, int enable)
 {
 	struct clk_gate *gate = to_clk_gate(hw);
 	int set = gate->flags & CLK_GATE_SET_TO_DISABLE ? 1 : 0;
-	unsigned long uninitialized_var(flags);
+	unsigned long flags;
 	u32 reg;
 
 	set ^= enable;
diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c
index 54fdc39cd0bc..7dde21b18b04 100644
--- a/drivers/firewire/ohci.c
+++ b/drivers/firewire/ohci.c
@@ -1099,7 +1099,7 @@ static void context_tasklet(unsigned long data)
 static int context_add_buffer(struct context *ctx)
 {
 	struct descriptor_buffer *desc;
-	dma_addr_t uninitialized_var(bus_addr);
+	dma_addr_t bus_addr;
 	int offset;
 
 	/*
@@ -1289,7 +1289,7 @@ static int at_context_queue_packet(struct context *ctx,
 				   struct fw_packet *packet)
 {
 	struct fw_ohci *ohci = ctx->ohci;
-	dma_addr_t d_bus, uninitialized_var(payload_bus);
+	dma_addr_t d_bus, payload_bus;
 	struct driver_data *driver_data;
 	struct descriptor *d, *last;
 	__le32 *header;
@@ -2445,7 +2445,7 @@ static int ohci_set_config_rom(struct fw_card *card,
 {
 	struct fw_ohci *ohci;
 	__be32 *next_config_rom;
-	dma_addr_t uninitialized_var(next_config_rom_bus);
+	dma_addr_t next_config_rom_bus;
 
 	ohci = fw_ohci(card);
 
@@ -2933,10 +2933,10 @@ static struct fw_iso_context *ohci_allocate_iso_context(struct fw_card *card,
 				int type, int channel, size_t header_size)
 {
 	struct fw_ohci *ohci = fw_ohci(card);
-	struct iso_context *uninitialized_var(ctx);
-	descriptor_callback_t uninitialized_var(callback);
-	u64 *uninitialized_var(channels);
-	u32 *uninitialized_var(mask), uninitialized_var(regs);
+	struct iso_context *ctx;
+	descriptor_callback_t callback;
+	u64 *channels;
+	u32 *mask, regs;
 	int index, ret = -EBUSY;
 
 	spin_lock_irq(&ohci->lock);
diff --git a/drivers/gpu/drm/bridge/sil-sii8620.c b/drivers/gpu/drm/bridge/sil-sii8620.c
index 92acd336aa89..6cd8e012de5d 100644
--- a/drivers/gpu/drm/bridge/sil-sii8620.c
+++ b/drivers/gpu/drm/bridge/sil-sii8620.c
@@ -986,7 +986,7 @@ static void sii8620_set_auto_zone(struct sii8620 *ctx)
 
 static void sii8620_stop_video(struct sii8620 *ctx)
 {
-	u8 uninitialized_var(val);
+	u8 val;
 
 	sii8620_write_seq_static(ctx,
 		REG_TPI_INTR_EN, 0,
diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index fed653f13c26..b98fa573e706 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -3051,7 +3051,7 @@ static int drm_cvt_modes(struct drm_connector *connector,
 	const u8 empty[3] = { 0, 0, 0 };
 
 	for (i = 0; i < 4; i++) {
-		int uninitialized_var(width), height;
+		int width, height;
 		cvt = &(timing->data.other_data.data.cvt[i]);
 
 		if (!memcmp(cvt->code, empty, 3))
diff --git a/drivers/gpu/drm/exynos/exynos_drm_dsi.c b/drivers/gpu/drm/exynos/exynos_drm_dsi.c
index ee96a95fb6be..7a6f6df5e954 100644
--- a/drivers/gpu/drm/exynos/exynos_drm_dsi.c
+++ b/drivers/gpu/drm/exynos/exynos_drm_dsi.c
@@ -547,9 +547,9 @@ static unsigned long exynos_dsi_pll_find_pms(struct exynos_dsi *dsi,
 	unsigned long best_freq = 0;
 	u32 min_delta = 0xffffffff;
 	u8 p_min, p_max;
-	u8 _p, uninitialized_var(best_p);
-	u16 _m, uninitialized_var(best_m);
-	u8 _s, uninitialized_var(best_s);
+	u8 _p, best_p;
+	u16 _m, best_m;
+	u8 _s, best_s;
 
 	p_min = DIV_ROUND_UP(fin, (12 * MHZ));
 	p_max = fin / (6 * MHZ);
diff --git a/drivers/gpu/drm/i915/display/intel_fbc.c b/drivers/gpu/drm/i915/display/intel_fbc.c
index 1c26673acb2d..c3a83c06c1b5 100644
--- a/drivers/gpu/drm/i915/display/intel_fbc.c
+++ b/drivers/gpu/drm/i915/display/intel_fbc.c
@@ -474,7 +474,7 @@ static int intel_fbc_alloc_cfb(struct drm_i915_private *dev_priv,
 			       unsigned int size, unsigned int fb_cpp)
 {
 	struct intel_fbc *fbc = &dev_priv->fbc;
-	struct drm_mm_node *uninitialized_var(compressed_llb);
+	struct drm_mm_node *compressed_llb;
 	int ret;
 
 	drm_WARN_ON(&dev_priv->drm,
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 7c3d8ef4a47c..234bf45c290b 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -1103,7 +1103,7 @@ static struct i915_request *
 __unwind_incomplete_requests(struct intel_engine_cs *engine)
 {
 	struct i915_request *rq, *rn, *active = NULL;
-	struct list_head *uninitialized_var(pl);
+	struct list_head *pl;
 	int prio = I915_PRIORITY_INVALID;
 
 	lockdep_assert_held(&engine->active.lock);
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index a61cb8ca4d50..c8fd2bcb17ee 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1991,7 +1991,7 @@ int __intel_wait_for_register_fw(struct intel_uncore *uncore,
 				 unsigned int slow_timeout_ms,
 				 u32 *out_value)
 {
-	u32 uninitialized_var(reg_value);
+	u32 reg_value;
 #define done (((reg_value = intel_uncore_read_fw(uncore, reg)) & mask) == value)
 	int ret;
 
diff --git a/drivers/gpu/drm/rockchip/dw-mipi-dsi-rockchip.c b/drivers/gpu/drm/rockchip/dw-mipi-dsi-rockchip.c
index 3feff0c45b3f..542dcf7eddd6 100644
--- a/drivers/gpu/drm/rockchip/dw-mipi-dsi-rockchip.c
+++ b/drivers/gpu/drm/rockchip/dw-mipi-dsi-rockchip.c
@@ -517,8 +517,8 @@ dw_mipi_dsi_get_lane_mbps(void *priv_data, const struct drm_display_mode *mode,
 	unsigned long best_freq = 0;
 	unsigned long fvco_min, fvco_max, fin, fout;
 	unsigned int min_prediv, max_prediv;
-	unsigned int _prediv, uninitialized_var(best_prediv);
-	unsigned long _fbdiv, uninitialized_var(best_fbdiv);
+	unsigned int _prediv, best_prediv;
+	unsigned long _fbdiv, best_fbdiv;
 	unsigned long min_delta = ULONG_MAX;
 
 	dsi->format = format;
diff --git a/drivers/i2c/busses/i2c-rk3x.c b/drivers/i2c/busses/i2c-rk3x.c
index bc698240c4aa..15324bfbc6cb 100644
--- a/drivers/i2c/busses/i2c-rk3x.c
+++ b/drivers/i2c/busses/i2c-rk3x.c
@@ -415,7 +415,7 @@ static void rk3x_i2c_handle_read(struct rk3x_i2c *i2c, unsigned int ipd)
 {
 	unsigned int i;
 	unsigned int len = i2c->msg->len - i2c->processed;
-	u32 uninitialized_var(val);
+	u32 val;
 	u8 byte;
 
 	/* we only care for MBRF here. */
diff --git a/drivers/ide/ide-acpi.c b/drivers/ide/ide-acpi.c
index 7d4e5c08f133..05e18d658141 100644
--- a/drivers/ide/ide-acpi.c
+++ b/drivers/ide/ide-acpi.c
@@ -180,7 +180,7 @@ err:
 static acpi_handle ide_acpi_hwif_get_handle(ide_hwif_t *hwif)
 {
 	struct device		*dev = hwif->gendev.parent;
-	acpi_handle		uninitialized_var(dev_handle);
+	acpi_handle		dev_handle;
 	u64			pcidevfn;
 	acpi_handle		chan_handle;
 	int			err;
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 80bc3bf82f4d..2162bc80f09e 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -609,7 +609,7 @@ static int ide_delayed_transfer_pc(ide_drive_t *drive)
 
 static ide_startstop_t ide_transfer_pc(ide_drive_t *drive)
 {
-	struct ide_atapi_pc *uninitialized_var(pc);
+	struct ide_atapi_pc *pc;
 	ide_hwif_t *hwif = drive->hwif;
 	struct request *rq = hwif->rq;
 	ide_expiry_t *expiry;
diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c
index 18c20a7aa0ce..94bdcf1ea186 100644
--- a/drivers/ide/ide-io-std.c
+++ b/drivers/ide/ide-io-std.c
@@ -173,7 +173,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
 	u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0;
 
 	if (io_32bit) {
-		unsigned long uninitialized_var(flags);
+		unsigned long flags;
 
 		if ((io_32bit & 2) && !mmio) {
 			local_irq_save(flags);
@@ -217,7 +217,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
 	u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0;
 
 	if (io_32bit) {
-		unsigned long uninitialized_var(flags);
+		unsigned long flags;
 
 		if ((io_32bit & 2) && !mmio) {
 			local_irq_save(flags);
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index c31f1d2b3b07..1a53c7a75224 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -622,12 +622,12 @@ static int drive_is_ready(ide_drive_t *drive)
 void ide_timer_expiry (struct timer_list *t)
 {
 	ide_hwif_t	*hwif = from_timer(hwif, t, timer);
-	ide_drive_t	*uninitialized_var(drive);
+	ide_drive_t	*drive;
 	ide_handler_t	*handler;
 	unsigned long	flags;
 	int		wait = -1;
 	int		plug_device = 0;
-	struct request	*uninitialized_var(rq_in_flight);
+	struct request	*rq_in_flight;
 
 	spin_lock_irqsave(&hwif->lock, flags);
 
@@ -780,13 +780,13 @@ irqreturn_t ide_intr (int irq, void *dev_id)
 {
 	ide_hwif_t *hwif = (ide_hwif_t *)dev_id;
 	struct ide_host *host = hwif->host;
-	ide_drive_t *uninitialized_var(drive);
+	ide_drive_t *drive;
 	ide_handler_t *handler;
 	unsigned long flags;
 	ide_startstop_t startstop;
 	irqreturn_t irq_ret = IRQ_NONE;
 	int plug_device = 0;
-	struct request *uninitialized_var(rq_in_flight);
+	struct request *rq_in_flight;
 
 	if (host->host_flags & IDE_HFLAG_SERIALIZE) {
 		if (hwif != host->cur_port)
diff --git a/drivers/ide/ide-sysfs.c b/drivers/ide/ide-sysfs.c
index b9dfeb2e8bd6..c08a8a0916e2 100644
--- a/drivers/ide/ide-sysfs.c
+++ b/drivers/ide/ide-sysfs.c
@@ -131,7 +131,7 @@ static struct device_attribute *ide_port_attrs[] = {
 
 int ide_sysfs_register_port(ide_hwif_t *hwif)
 {
-	int i, uninitialized_var(rc);
+	int i, rc;
 
 	for (i = 0; ide_port_attrs[i]; i++) {
 		rc = device_create_file(hwif->portdev, ide_port_attrs[i]);
diff --git a/drivers/ide/umc8672.c b/drivers/ide/umc8672.c
index 870e235e30af..cf996f788292 100644
--- a/drivers/ide/umc8672.c
+++ b/drivers/ide/umc8672.c
@@ -108,7 +108,7 @@ static void umc_set_speeds(u8 speeds[])
 static void umc_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
 {
 	ide_hwif_t *mate = hwif->mate;
-	unsigned long uninitialized_var(flags);
+	unsigned long flags;
 	const u8 pio = drive->pio_mode - XFER_PIO_0;
 
 	printk("%s: setting umc8672 to PIO mode%d (speed %d)\n",
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index f4495841bf68..aae53e650638 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -132,7 +132,7 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev,
 	struct cpuidle_state *state = &drv->states[index];
 	unsigned long eax = flg2MWAIT(state->flags);
 	unsigned long ecx = 1; /* break on interrupt flag */
-	bool uninitialized_var(tick);
+	bool tick;
 	int cpu = smp_processor_id();
 
 	/*
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index b48b3f6e632d..76e7ec0f0775 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -1584,7 +1584,7 @@ static int ib_uverbs_open_qp(struct uverbs_attr_bundle *attrs)
 	struct ib_uverbs_create_qp_resp resp;
 	struct ib_uqp_object           *obj;
 	struct ib_xrcd		       *xrcd;
-	struct ib_uobject	       *uninitialized_var(xrcd_uobj);
+	struct ib_uobject	       *xrcd_uobj;
 	struct ib_qp                   *qp;
 	struct ib_qp_open_attr          attr = {};
 	int ret;
@@ -3406,7 +3406,7 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs,
 	struct ib_usrq_object           *obj;
 	struct ib_pd                    *pd;
 	struct ib_srq                   *srq;
-	struct ib_uobject               *uninitialized_var(xrcd_uobj);
+	struct ib_uobject               *xrcd_uobj;
 	struct ib_srq_init_attr          attr;
 	int ret;
 	struct ib_device *ib_dev;
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 30e08bcc9afb..77bc02a9228e 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -3282,7 +3282,7 @@ static int get_lladdr(struct net_device *dev, struct in6_addr *addr,
 
 static int pick_local_ip6addrs(struct c4iw_dev *dev, struct iw_cm_id *cm_id)
 {
-	struct in6_addr uninitialized_var(addr);
+	struct in6_addr addr;
 	struct sockaddr_in6 *la6 = (struct sockaddr_in6 *)&cm_id->m_local_addr;
 	struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)&cm_id->m_remote_addr;
 
diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c
index b1bb61c65f4f..352b8af1998a 100644
--- a/drivers/infiniband/hw/cxgb4/cq.c
+++ b/drivers/infiniband/hw/cxgb4/cq.c
@@ -754,7 +754,7 @@ skip_cqe:
 static int __c4iw_poll_cq_one(struct c4iw_cq *chp, struct c4iw_qp *qhp,
 			      struct ib_wc *wc, struct c4iw_srq *srq)
 {
-	struct t4_cqe uninitialized_var(cqe);
+	struct t4_cqe cqe;
 	struct t4_wq *wq = qhp ? &qhp->wq : NULL;
 	u32 credit = 0;
 	u8 cqe_flushed;
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index cf51e3cbd969..f9ca6e000a81 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -3541,11 +3541,11 @@ static int _mlx4_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 	int nreq;
 	int err = 0;
 	unsigned ind;
-	int uninitialized_var(size);
-	unsigned uninitialized_var(seglen);
+	int size;
+	unsigned seglen;
 	__be32 dummy;
 	__be32 *lso_wqe;
-	__be32 uninitialized_var(lso_hdr_sz);
+	__be32 lso_hdr_sz;
 	__be32 blh;
 	int i;
 	struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 0c18cb6a2f14..0133ebb8d740 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -925,8 +925,8 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
 	struct mlx5_ib_cq *cq = to_mcq(ibcq);
 	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
-	int uninitialized_var(index);
-	int uninitialized_var(inlen);
+	int index;
+	int inlen;
 	u32 *cqb = NULL;
 	void *cqc;
 	int cqe_size;
@@ -1246,7 +1246,7 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
 	__be64 *pas;
 	int page_shift;
 	int inlen;
-	int uninitialized_var(cqe_size);
+	int cqe_size;
 	unsigned long flags;
 
 	if (!MLX5_CAP_GEN(dev->mdev, cq_resize)) {
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 9454a66c12cc..655ea9c984e1 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -2536,7 +2536,7 @@ static ssize_t devx_async_event_read(struct file *filp, char __user *buf,
 {
 	struct devx_async_event_file *ev_file = filp->private_data;
 	struct devx_event_subscription *event_sub;
-	struct devx_async_event_data *uninitialized_var(event);
+	struct devx_async_event_data *event;
 	int ret = 0;
 	size_t eventsz;
 	bool omit_data;
diff --git a/drivers/infiniband/hw/mlx5/wr.c b/drivers/infiniband/hw/mlx5/wr.c
index 2c6df1c43b55..bc35dbe4855b 100644
--- a/drivers/infiniband/hw/mlx5/wr.c
+++ b/drivers/infiniband/hw/mlx5/wr.c
@@ -1249,7 +1249,7 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 	struct mlx5_wqe_xrc_seg *xrc;
 	struct mlx5_bf *bf;
 	void *cur_edge;
-	int uninitialized_var(size);
+	int size;
 	unsigned long flags;
 	unsigned int idx;
 	int err = 0;
diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c
index d04c245359eb..c6e95d0d760a 100644
--- a/drivers/infiniband/hw/mthca/mthca_qp.c
+++ b/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -1639,8 +1639,8 @@ int mthca_tavor_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 	 * without initializing f0 and size0, and they are in fact
 	 * never used uninitialized.
 	 */
-	int uninitialized_var(size0);
-	u32 uninitialized_var(f0);
+	int size0;
+	u32 f0;
 	int ind;
 	u8 op0 = 0;
 
@@ -1835,7 +1835,7 @@ int mthca_tavor_post_receive(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
 	 * without initializing size0, and it is in fact never used
 	 * uninitialized.
 	 */
-	int uninitialized_var(size0);
+	int size0;
 	int ind;
 	void *wqe;
 	void *prev_wqe;
@@ -1943,8 +1943,8 @@ int mthca_arbel_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 	 * without initializing f0 and size0, and they are in fact
 	 * never used uninitialized.
 	 */
-	int uninitialized_var(size0);
-	u32 uninitialized_var(f0);
+	int size0;
+	u32 f0;
 	int ind;
 	u8 op0 = 0;
 
diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c
index 7271d705f4b0..857be5a7d0bd 100644
--- a/drivers/infiniband/sw/siw/siw_qp_rx.c
+++ b/drivers/infiniband/sw/siw/siw_qp_rx.c
@@ -333,7 +333,7 @@ static struct siw_wqe *siw_rqe_get(struct siw_qp *qp)
 	struct siw_srq *srq;
 	struct siw_wqe *wqe = NULL;
 	bool srq_event = false;
-	unsigned long uninitialized_var(flags);
+	unsigned long flags;
 
 	srq = qp->srq;
 	if (srq) {
diff --git a/drivers/input/serio/serio_raw.c b/drivers/input/serio/serio_raw.c
index e9647ebff187..1e4770094415 100644
--- a/drivers/input/serio/serio_raw.c
+++ b/drivers/input/serio/serio_raw.c
@@ -159,7 +159,7 @@ static ssize_t serio_raw_read(struct file *file, char __user *buffer,
 {
 	struct serio_raw_client *client = file->private_data;
 	struct serio_raw *serio_raw = client->serio_raw;
-	char uninitialized_var(c);
+	char c;
 	ssize_t read = 0;
 	int error;
 
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index d759e7234e98..c29fd0991857 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -2236,7 +2236,7 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
 			    unsigned long nr_pages, int prot)
 {
 	struct dma_pte *first_pte = NULL, *pte = NULL;
-	phys_addr_t uninitialized_var(pteval);
+	phys_addr_t pteval;
 	unsigned long sg_res = 0;
 	unsigned int largepage_lvl = 0;
 	unsigned long lvl_pages = 0;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 81ffc59d05c9..4312007d2d34 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -306,7 +306,7 @@ static void do_region(int op, int op_flags, unsigned region,
 	struct request_queue *q = bdev_get_queue(where->bdev);
 	unsigned short logical_block_size = queue_logical_block_size(q);
 	sector_t num_sectors;
-	unsigned int uninitialized_var(special_cmd_max_sectors);
+	unsigned int special_cmd_max_sectors;
 
 	/*
 	 * Reject unsupported discard and write same requests.
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 489935d5f22d..056d891a32a9 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1844,7 +1844,7 @@ static int ctl_ioctl(struct file *file, uint command, struct dm_ioctl __user *us
 	int ioctl_flags;
 	int param_flags;
 	unsigned int cmd;
-	struct dm_ioctl *uninitialized_var(param);
+	struct dm_ioctl *param;
 	ioctl_fn fn = NULL;
 	size_t input_param_size;
 	struct dm_ioctl param_kernel;
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 963d3774c93e..247089c2be25 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -613,7 +613,7 @@ static int persistent_read_metadata(struct dm_exception_store *store,
 						    chunk_t old, chunk_t new),
 				    void *callback_context)
 {
-	int r, uninitialized_var(new_snapshot);
+	int r, new_snapshot;
 	struct pstore *ps = get_info(store);
 
 	/*
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 8277b959e00b..89d7cda07640 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -642,7 +642,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table,
 	 */
 	unsigned short remaining = 0;
 
-	struct dm_target *uninitialized_var(ti);
+	struct dm_target *ti;
 	struct queue_limits ti_limits;
 	unsigned i;
 
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index 30505d70f423..ec83edff25bf 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -1752,7 +1752,7 @@ static void writecache_writeback(struct work_struct *work)
 {
 	struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
 	struct blk_plug plug;
-	struct wc_entry *f, *uninitialized_var(g), *e = NULL;
+	struct wc_entry *f, *g, *e = NULL;
 	struct rb_node *node, *next_node;
 	struct list_head skipped;
 	struct writeback_list wbl;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ab8067f9ce8c..401b366af076 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2601,7 +2601,7 @@ static void raid5_end_write_request(struct bio *bi)
 	struct stripe_head *sh = bi->bi_private;
 	struct r5conf *conf = sh->raid_conf;
 	int disks = sh->disks, i;
-	struct md_rdev *uninitialized_var(rdev);
+	struct md_rdev *rdev;
 	sector_t first_bad;
 	int bad_sectors;
 	int replacement = 0;
diff --git a/drivers/media/dvb-frontends/rtl2832.c b/drivers/media/dvb-frontends/rtl2832.c
index 6ec277421390..e5bffaaeed38 100644
--- a/drivers/media/dvb-frontends/rtl2832.c
+++ b/drivers/media/dvb-frontends/rtl2832.c
@@ -640,7 +640,7 @@ static int rtl2832_read_status(struct dvb_frontend *fe, enum fe_status *status)
 	struct i2c_client *client = dev->client;
 	struct dtv_frontend_properties *c = &fe->dtv_property_cache;
 	int ret;
-	u32 uninitialized_var(tmp);
+	u32 tmp;
 	u8 u8tmp, buf[2];
 	u16 u16tmp;
 
diff --git a/drivers/media/tuners/qt1010.c b/drivers/media/tuners/qt1010.c
index 85bbdd4ecdbb..e48faf942830 100644
--- a/drivers/media/tuners/qt1010.c
+++ b/drivers/media/tuners/qt1010.c
@@ -215,7 +215,7 @@ static int qt1010_set_params(struct dvb_frontend *fe)
 static int qt1010_init_meas1(struct qt1010_priv *priv,
 			     u8 oper, u8 reg, u8 reg_init_val, u8 *retval)
 {
-	u8 i, val1, uninitialized_var(val2);
+	u8 i, val1, val2;
 	int err;
 
 	qt1010_i2c_oper_t i2c_data[] = {
@@ -250,7 +250,7 @@ static int qt1010_init_meas1(struct qt1010_priv *priv,
 static int qt1010_init_meas2(struct qt1010_priv *priv,
 			    u8 reg_init_val, u8 *retval)
 {
-	u8 i, uninitialized_var(val);
+	u8 i, val;
 	int err;
 	qt1010_i2c_oper_t i2c_data[] = {
 		{ QT1010_WR, 0x07, reg_init_val },
diff --git a/drivers/media/usb/gspca/vicam.c b/drivers/media/usb/gspca/vicam.c
index 179b2ec3df57..d98343fd33fe 100644
--- a/drivers/media/usb/gspca/vicam.c
+++ b/drivers/media/usb/gspca/vicam.c
@@ -225,7 +225,7 @@ static int sd_init(struct gspca_dev *gspca_dev)
 {
 	int ret;
 	const struct ihex_binrec *rec;
-	const struct firmware *uninitialized_var(fw);
+	const struct firmware *fw;
 	u8 *firmware_buf;
 
 	ret = request_ihex_firmware(&fw, VICAM_FIRMWARE,
diff --git a/drivers/media/usb/uvc/uvc_video.c b/drivers/media/usb/uvc/uvc_video.c
index 8fa77a81dd7f..a65d5353a441 100644
--- a/drivers/media/usb/uvc/uvc_video.c
+++ b/drivers/media/usb/uvc/uvc_video.c
@@ -765,9 +765,9 @@ static void uvc_video_stats_decode(struct uvc_streaming *stream,
 	unsigned int header_size;
 	bool has_pts = false;
 	bool has_scr = false;
-	u16 uninitialized_var(scr_sof);
-	u32 uninitialized_var(scr_stc);
-	u32 uninitialized_var(pts);
+	u16 scr_sof;
+	u32 scr_stc;
+	u32 pts;
 
 	if (stream->stats.stream.nb_frames == 0 &&
 	    stream->stats.frame.nb_packets == 0)
@@ -1828,7 +1828,7 @@ static int uvc_video_start_transfer(struct uvc_streaming *stream,
 		struct usb_host_endpoint *best_ep = NULL;
 		unsigned int best_psize = UINT_MAX;
 		unsigned int bandwidth;
-		unsigned int uninitialized_var(altsetting);
+		unsigned int altsetting;
 		int intfnum = stream->intfnum;
 
 		/* Isochronous endpoint, select the alternate setting. */
diff --git a/drivers/memstick/host/jmb38x_ms.c b/drivers/memstick/host/jmb38x_ms.c
index 0a9c5ddf2f59..383091517ed7 100644
--- a/drivers/memstick/host/jmb38x_ms.c
+++ b/drivers/memstick/host/jmb38x_ms.c
@@ -314,7 +314,7 @@ static int jmb38x_ms_transfer_data(struct jmb38x_ms_host *host)
 	}
 
 	while (length) {
-		unsigned int uninitialized_var(p_off);
+		unsigned int p_off;
 
 		if (host->req->long_data) {
 			pg = nth_page(sg_page(&host->req->sg),
diff --git a/drivers/memstick/host/tifm_ms.c b/drivers/memstick/host/tifm_ms.c
index 5b966b54d6e9..fc35c7404429 100644
--- a/drivers/memstick/host/tifm_ms.c
+++ b/drivers/memstick/host/tifm_ms.c
@@ -198,7 +198,7 @@ static unsigned int tifm_ms_transfer_data(struct tifm_ms *host)
 		host->block_pos);
 
 	while (length) {
-		unsigned int uninitialized_var(p_off);
+		unsigned int p_off;
 
 		if (host->req->long_data) {
 			pg = nth_page(sg_page(&host->req->sg),
diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c
index 37b1158c1c0c..1ee866a38794 100644
--- a/drivers/mmc/host/sdhci.c
+++ b/drivers/mmc/host/sdhci.c
@@ -493,7 +493,7 @@ static void sdhci_read_block_pio(struct sdhci_host *host)
 {
 	unsigned long flags;
 	size_t blksize, len, chunk;
-	u32 uninitialized_var(scratch);
+	u32 scratch;
 	u8 *buf;
 
 	DBG("PIO reading\n");
diff --git a/drivers/mtd/nand/raw/nand_ecc.c b/drivers/mtd/nand/raw/nand_ecc.c
index 09fdced659f5..b6a46b1b7781 100644
--- a/drivers/mtd/nand/raw/nand_ecc.c
+++ b/drivers/mtd/nand/raw/nand_ecc.c
@@ -131,7 +131,7 @@ void __nand_calculate_ecc(const unsigned char *buf, unsigned int eccsize,
 	/* rp0..rp15..rp17 are the various accumulated parities (per byte) */
 	uint32_t rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7;
 	uint32_t rp8, rp9, rp10, rp11, rp12, rp13, rp14, rp15, rp16;
-	uint32_t uninitialized_var(rp17);	/* to make compiler happy */
+	uint32_t rp17;
 	uint32_t par;		/* the cumulative parity for all data */
 	uint32_t tmppar;	/* the cumulative parity for this iteration;
 				   for rp12, rp14 and rp16 at the end of the
diff --git a/drivers/mtd/nand/raw/s3c2410.c b/drivers/mtd/nand/raw/s3c2410.c
index f86dff311464..d0dd0c446e4d 100644
--- a/drivers/mtd/nand/raw/s3c2410.c
+++ b/drivers/mtd/nand/raw/s3c2410.c
@@ -291,7 +291,7 @@ static int s3c2410_nand_setrate(struct s3c2410_nand_info *info)
 	int tacls_max = (info->cpu_type == TYPE_S3C2412) ? 8 : 4;
 	int tacls, twrph0, twrph1;
 	unsigned long clkrate = clk_get_rate(info->clk);
-	unsigned long uninitialized_var(set), cfg, uninitialized_var(mask);
+	unsigned long set, cfg, mask;
 	unsigned long flags;
 
 	/* calculate the timing information for the controller */
diff --git a/drivers/mtd/parsers/afs.c b/drivers/mtd/parsers/afs.c
index 752b6cf005f7..980e332bdac4 100644
--- a/drivers/mtd/parsers/afs.c
+++ b/drivers/mtd/parsers/afs.c
@@ -126,8 +126,8 @@ static int afs_parse_v1_partition(struct mtd_info *mtd,
 	 * Static checks cannot see that we bail out if we have an error
 	 * reading the footer.
 	 */
-	u_int uninitialized_var(iis_ptr);
-	u_int uninitialized_var(img_ptr);
+	u_int iis_ptr;
+	u_int img_ptr;
 	u_int ptr;
 	size_t sz;
 	int ret;
diff --git a/drivers/mtd/ubi/eba.c b/drivers/mtd/ubi/eba.c
index 5133e1be5331..0edecfdbd01f 100644
--- a/drivers/mtd/ubi/eba.c
+++ b/drivers/mtd/ubi/eba.c
@@ -599,7 +599,7 @@ int ubi_eba_read_leb(struct ubi_device *ubi, struct ubi_volume *vol, int lnum,
 	int err, pnum, scrub = 0, vol_id = vol->vol_id;
 	struct ubi_vid_io_buf *vidb;
 	struct ubi_vid_hdr *vid_hdr;
-	uint32_t uninitialized_var(crc);
+	uint32_t crc;
 
 	err = leb_read_lock(ubi, vol_id, lnum);
 	if (err)
diff --git a/drivers/net/can/janz-ican3.c b/drivers/net/can/janz-ican3.c
index a761092e6ac9..f929db893957 100644
--- a/drivers/net/can/janz-ican3.c
+++ b/drivers/net/can/janz-ican3.c
@@ -1451,7 +1451,7 @@ static int ican3_napi(struct napi_struct *napi, int budget)
 
 	/* process all communication messages */
 	while (true) {
-		struct ican3_msg uninitialized_var(msg);
+		struct ican3_msg msg;
 		ret = ican3_recv_msg(mod, &msg);
 		if (ret)
 			break;
diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c
index e1c236cab2a7..c8cc14eadbb4 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -1455,7 +1455,7 @@ bnx2_test_and_disable_2g5(struct bnx2 *bp)
 static void
 bnx2_enable_forced_2g5(struct bnx2 *bp)
 {
-	u32 uninitialized_var(bmcr);
+	u32 bmcr;
 	int err;
 
 	if (!(bp->phy_flags & BNX2_PHY_FLAG_2_5G_CAPABLE))
@@ -1499,7 +1499,7 @@ bnx2_enable_forced_2g5(struct bnx2 *bp)
 static void
 bnx2_disable_forced_2g5(struct bnx2 *bp)
 {
-	u32 uninitialized_var(bmcr);
+	u32 bmcr;
 	int err;
 
 	if (!(bp->phy_flags & BNX2_PHY_FLAG_2_5G_CAPABLE))
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
index 5ddd18639a1e..c410a0ce35c8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
@@ -529,8 +529,8 @@ static int req_pages_handler(struct notifier_block *nb,
 
 int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot)
 {
-	u16 uninitialized_var(func_id);
-	s32 uninitialized_var(npages);
+	u16 func_id;
+	s32 npages;
 	int err;
 
 	err = mlx5_cmd_query_pages(dev, &func_id, &npages, boot);
diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c
index 67e62603fe3b..15b8b1bf8163 100644
--- a/drivers/net/ethernet/neterion/s2io.c
+++ b/drivers/net/ethernet/neterion/s2io.c
@@ -7276,7 +7276,7 @@ static int rx_osm_handler(struct ring_info *ring_data, struct RxD_t * rxdp)
 	int ring_no = ring_data->ring_no;
 	u16 l3_csum, l4_csum;
 	unsigned long long err = rxdp->Control_1 & RXD_T_CODE;
-	struct lro *uninitialized_var(lro);
+	struct lro *lro;
 	u8 err_mask;
 	struct swStat *swstats = &sp->mac_control.stats_info->sw_stat;
 
diff --git a/drivers/net/ethernet/qlogic/qla3xxx.c b/drivers/net/ethernet/qlogic/qla3xxx.c
index 0fade19e00d4..0d0e38debbc2 100644
--- a/drivers/net/ethernet/qlogic/qla3xxx.c
+++ b/drivers/net/ethernet/qlogic/qla3xxx.c
@@ -3769,7 +3769,7 @@ static int ql3xxx_probe(struct pci_dev *pdev,
 	struct net_device *ndev = NULL;
 	struct ql3_adapter *qdev = NULL;
 	static int cards_found;
-	int uninitialized_var(pci_using_dac), err;
+	int pci_using_dac, err;
 
 	err = pci_enable_device(pdev);
 	if (err) {
diff --git a/drivers/net/ethernet/sun/cassini.c b/drivers/net/ethernet/sun/cassini.c
index debd3c3fa6fb..015fdb851cdb 100644
--- a/drivers/net/ethernet/sun/cassini.c
+++ b/drivers/net/ethernet/sun/cassini.c
@@ -2271,7 +2271,7 @@ static int cas_rx_ringN(struct cas *cp, int ring, int budget)
 	drops = 0;
 	while (1) {
 		struct cas_rx_comp *rxc = rxcs + entry;
-		struct sk_buff *uninitialized_var(skb);
+		struct sk_buff *skb;
 		int type, len;
 		u64 words[4];
 		int i, dring;
diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c
index 9a5004f674c7..1b697e4cd7dc 100644
--- a/drivers/net/ethernet/sun/niu.c
+++ b/drivers/net/ethernet/sun/niu.c
@@ -429,7 +429,7 @@ static int serdes_init_niu_1g_serdes(struct niu *np)
 	struct niu_link_config *lp = &np->link_config;
 	u16 pll_cfg, pll_sts;
 	int max_retry = 100;
-	u64 uninitialized_var(sig), mask, val;
+	u64 sig, mask, val;
 	u32 tx_cfg, rx_cfg;
 	unsigned long i;
 	int err;
@@ -526,7 +526,7 @@ static int serdes_init_niu_10g_serdes(struct niu *np)
 	struct niu_link_config *lp = &np->link_config;
 	u32 tx_cfg, rx_cfg, pll_cfg, pll_sts;
 	int max_retry = 100;
-	u64 uninitialized_var(sig), mask, val;
+	u64 sig, mask, val;
 	unsigned long i;
 	int err;
 
@@ -714,7 +714,7 @@ static int esr_write_glue0(struct niu *np, unsigned long chan, u32 val)
 
 static int esr_reset(struct niu *np)
 {
-	u32 uninitialized_var(reset);
+	u32 reset;
 	int err;
 
 	err = mdio_write(np, np->port, NIU_ESR_DEV_ADDR,
diff --git a/drivers/net/wan/z85230.c b/drivers/net/wan/z85230.c
index 7ad3d24195ba..138930c66ad2 100644
--- a/drivers/net/wan/z85230.c
+++ b/drivers/net/wan/z85230.c
@@ -702,7 +702,7 @@ EXPORT_SYMBOL(z8530_nop);
 irqreturn_t z8530_interrupt(int irq, void *dev_id)
 {
 	struct z8530_dev *dev=dev_id;
-	u8 uninitialized_var(intr);
+	u8 intr;
 	static volatile int locker=0;
 	int work=0;
 	struct z8530_irqhandler *irqs;
diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c
index 22b6937ac225..340ce327ac14 100644
--- a/drivers/net/wireless/ath/ath10k/core.c
+++ b/drivers/net/wireless/ath/ath10k/core.c
@@ -2240,7 +2240,7 @@ static int ath10k_init_uart(struct ath10k *ar)
 
 static int ath10k_init_hw_params(struct ath10k *ar)
 {
-	const struct ath10k_hw_params *uninitialized_var(hw_params);
+	const struct ath10k_hw_params *hw_params;
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(ath10k_hw_params_list); i++) {
diff --git a/drivers/net/wireless/ath/ath6kl/init.c b/drivers/net/wireless/ath/ath6kl/init.c
index aa1c71a76ef7..811fad6d60c0 100644
--- a/drivers/net/wireless/ath/ath6kl/init.c
+++ b/drivers/net/wireless/ath/ath6kl/init.c
@@ -1575,7 +1575,7 @@ static int ath6kl_init_upload(struct ath6kl *ar)
 
 int ath6kl_init_hw_params(struct ath6kl *ar)
 {
-	const struct ath6kl_hw *uninitialized_var(hw);
+	const struct ath6kl_hw *hw;
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(hw_list); i++) {
diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c
index 289a2444d534..4d72cd7daaa2 100644
--- a/drivers/net/wireless/ath/ath9k/init.c
+++ b/drivers/net/wireless/ath/ath9k/init.c
@@ -230,7 +230,7 @@ static unsigned int ath9k_reg_rmw(void *hw_priv, u32 reg_offset, u32 set, u32 cl
 	struct ath_hw *ah = hw_priv;
 	struct ath_common *common = ath9k_hw_common(ah);
 	struct ath_softc *sc = (struct ath_softc *) common->priv;
-	unsigned long uninitialized_var(flags);
+	unsigned long flags;
 	u32 val;
 
 	if (NR_CPUS > 1 && ah->config.serialize_regmode == SER_REG_MODE_ON) {
diff --git a/drivers/net/wireless/broadcom/b43/debugfs.c b/drivers/net/wireless/broadcom/b43/debugfs.c
index dc1819ca52ac..89a25aefb327 100644
--- a/drivers/net/wireless/broadcom/b43/debugfs.c
+++ b/drivers/net/wireless/broadcom/b43/debugfs.c
@@ -493,7 +493,7 @@ static ssize_t b43_debugfs_read(struct file *file, char __user *userbuf,
 	struct b43_wldev *dev;
 	struct b43_debugfs_fops *dfops;
 	struct b43_dfs_file *dfile;
-	ssize_t uninitialized_var(ret);
+	ssize_t ret;
 	char *buf;
 	const size_t bufsize = 1024 * 16; /* 16 kiB buffer */
 	const size_t buforder = get_order(bufsize);
diff --git a/drivers/net/wireless/broadcom/b43/dma.c b/drivers/net/wireless/broadcom/b43/dma.c
index 9733c64bf978..ca671fc13116 100644
--- a/drivers/net/wireless/broadcom/b43/dma.c
+++ b/drivers/net/wireless/broadcom/b43/dma.c
@@ -37,7 +37,7 @@
 static u32 b43_dma_address(struct b43_dma *dma, dma_addr_t dmaaddr,
 			   enum b43_addrtype addrtype)
 {
-	u32 uninitialized_var(addr);
+	u32 addr;
 
 	switch (addrtype) {
 	case B43_DMA_ADDR_LOW:
diff --git a/drivers/net/wireless/broadcom/b43/lo.c b/drivers/net/wireless/broadcom/b43/lo.c
index 5d97cf06eceb..338b6545a1e7 100644
--- a/drivers/net/wireless/broadcom/b43/lo.c
+++ b/drivers/net/wireless/broadcom/b43/lo.c
@@ -729,7 +729,7 @@ struct b43_lo_calib *b43_calibrate_lo_setting(struct b43_wldev *dev,
 	};
 	int max_rx_gain;
 	struct b43_lo_calib *cal;
-	struct lo_g_saved_values uninitialized_var(saved_regs);
+	struct lo_g_saved_values saved_regs;
 	/* Values from the "TXCTL Register and Value Table" */
 	u16 txctl_reg;
 	u16 txctl_value;
diff --git a/drivers/net/wireless/broadcom/b43/phy_n.c b/drivers/net/wireless/broadcom/b43/phy_n.c
index 46db91846007..39de18d3ce91 100644
--- a/drivers/net/wireless/broadcom/b43/phy_n.c
+++ b/drivers/net/wireless/broadcom/b43/phy_n.c
@@ -5643,7 +5643,7 @@ static int b43_nphy_rev2_cal_rx_iq(struct b43_wldev *dev,
 	u8 rfctl[2];
 	u8 afectl_core;
 	u16 tmp[6];
-	u16 uninitialized_var(cur_hpf1), uninitialized_var(cur_hpf2), cur_lna;
+	u16 cur_hpf1, cur_hpf2, cur_lna;
 	u32 real, imag;
 	enum nl80211_band band;
 
diff --git a/drivers/net/wireless/broadcom/b43/xmit.c b/drivers/net/wireless/broadcom/b43/xmit.c
index 55babc6d1091..7651b1bdb592 100644
--- a/drivers/net/wireless/broadcom/b43/xmit.c
+++ b/drivers/net/wireless/broadcom/b43/xmit.c
@@ -422,10 +422,10 @@ int b43_generate_txhdr(struct b43_wldev *dev,
 	if ((rates[0].flags & IEEE80211_TX_RC_USE_RTS_CTS) ||
 	    (rates[0].flags & IEEE80211_TX_RC_USE_CTS_PROTECT)) {
 		unsigned int len;
-		struct ieee80211_hdr *uninitialized_var(hdr);
+		struct ieee80211_hdr *hdr;
 		int rts_rate, rts_rate_fb;
 		int rts_rate_ofdm, rts_rate_fb_ofdm;
-		struct b43_plcp_hdr6 *uninitialized_var(plcp);
+		struct b43_plcp_hdr6 *plcp;
 		struct ieee80211_rate *rts_cts_rate;
 
 		rts_cts_rate = ieee80211_get_rts_cts_rate(dev->wl->hw, info);
@@ -436,7 +436,7 @@ int b43_generate_txhdr(struct b43_wldev *dev,
 		rts_rate_fb_ofdm = b43_is_ofdm_rate(rts_rate_fb);
 
 		if (rates[0].flags & IEEE80211_TX_RC_USE_CTS_PROTECT) {
-			struct ieee80211_cts *uninitialized_var(cts);
+			struct ieee80211_cts *cts;
 
 			switch (dev->fw.hdr_format) {
 			case B43_FW_HDR_598:
@@ -458,7 +458,7 @@ int b43_generate_txhdr(struct b43_wldev *dev,
 			mac_ctl |= B43_TXH_MAC_SENDCTS;
 			len = sizeof(struct ieee80211_cts);
 		} else {
-			struct ieee80211_rts *uninitialized_var(rts);
+			struct ieee80211_rts *rts;
 
 			switch (dev->fw.hdr_format) {
 			case B43_FW_HDR_598:
@@ -637,8 +637,8 @@ void b43_rx(struct b43_wldev *dev, struct sk_buff *skb, const void *_rxhdr)
 	const struct b43_rxhdr_fw4 *rxhdr = _rxhdr;
 	__le16 fctl;
 	u16 phystat0, phystat3;
-	u16 uninitialized_var(chanstat), uninitialized_var(mactime);
-	u32 uninitialized_var(macstat);
+	u16 chanstat, mactime;
+	u32 macstat;
 	u16 chanid;
 	int padding, rate_idx;
 
diff --git a/drivers/net/wireless/broadcom/b43legacy/debugfs.c b/drivers/net/wireless/broadcom/b43legacy/debugfs.c
index fa133dfb2ecb..e7e4293c01f2 100644
--- a/drivers/net/wireless/broadcom/b43legacy/debugfs.c
+++ b/drivers/net/wireless/broadcom/b43legacy/debugfs.c
@@ -190,7 +190,7 @@ static ssize_t b43legacy_debugfs_read(struct file *file, char __user *userbuf,
 	struct b43legacy_wldev *dev;
 	struct b43legacy_debugfs_fops *dfops;
 	struct b43legacy_dfs_file *dfile;
-	ssize_t uninitialized_var(ret);
+	ssize_t ret;
 	char *buf;
 	const size_t bufsize = 1024 * 16; /* 16 KiB buffer */
 	const size_t buforder = get_order(bufsize);
diff --git a/drivers/net/wireless/broadcom/b43legacy/main.c b/drivers/net/wireless/broadcom/b43legacy/main.c
index 5208a39fd6f7..220c11d34c23 100644
--- a/drivers/net/wireless/broadcom/b43legacy/main.c
+++ b/drivers/net/wireless/broadcom/b43legacy/main.c
@@ -2580,7 +2580,7 @@ static void b43legacy_put_phy_into_reset(struct b43legacy_wldev *dev)
 static int b43legacy_switch_phymode(struct b43legacy_wl *wl,
 				      unsigned int new_mode)
 {
-	struct b43legacy_wldev *uninitialized_var(up_dev);
+	struct b43legacy_wldev *up_dev;
 	struct b43legacy_wldev *down_dev;
 	int err;
 	bool gmode = false;
diff --git a/drivers/net/wireless/intel/iwlegacy/3945.c b/drivers/net/wireless/intel/iwlegacy/3945.c
index 2ac494f5ae22..fd63eba47ba2 100644
--- a/drivers/net/wireless/intel/iwlegacy/3945.c
+++ b/drivers/net/wireless/intel/iwlegacy/3945.c
@@ -2100,7 +2100,7 @@ il3945_txpower_set_from_eeprom(struct il_priv *il)
 
 		/* set tx power value for all OFDM rates */
 		for (rate_idx = 0; rate_idx < IL_OFDM_RATES; rate_idx++) {
-			s32 uninitialized_var(power_idx);
+			s32 power_idx;
 			int rc;
 
 			/* use channel group's clip-power table,
diff --git a/drivers/net/wireless/intel/iwlegacy/4965-mac.c b/drivers/net/wireless/intel/iwlegacy/4965-mac.c
index da6d4202611c..a159d1d18c2c 100644
--- a/drivers/net/wireless/intel/iwlegacy/4965-mac.c
+++ b/drivers/net/wireless/intel/iwlegacy/4965-mac.c
@@ -2769,7 +2769,7 @@ il4965_hdl_tx(struct il_priv *il, struct il_rx_buf *rxb)
 	struct ieee80211_tx_info *info;
 	struct il4965_tx_resp *tx_resp = (void *)&pkt->u.raw[0];
 	u32 status = le32_to_cpu(tx_resp->u.status);
-	int uninitialized_var(tid);
+	int tid;
 	int sta_id;
 	int freed;
 	u8 *qc = NULL;
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192cu/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192cu/hw.c
index 5b071b70bc08..0ae9cfc65272 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8192cu/hw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192cu/hw.c
@@ -618,8 +618,8 @@ static void _rtl92cu_init_chipn_two_out_ep_priority(struct ieee80211_hw *hw,
 						     u8 queue_sel)
 {
 	u16 beq, bkq, viq, voq, mgtq, hiq;
-	u16 uninitialized_var(valuehi);
-	u16 uninitialized_var(valuelow);
+	u16 valuehi;
+	u16 valuelow;
 
 	switch (queue_sel) {
 	case (TX_SELE_HQ | TX_SELE_LQ):
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 3acf56683915..14af4c97c626 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -1138,7 +1138,7 @@ static irqreturn_t aer_isr(int irq, void *context)
 {
 	struct pcie_device *dev = (struct pcie_device *)context;
 	struct aer_rpc *rpc = get_service_data(dev);
-	struct aer_err_source uninitialized_var(e_src);
+	struct aer_err_source e_src;
 
 	if (kfifo_is_empty(&rpc->aer_fifo))
 		return IRQ_NONE;
diff --git a/drivers/platform/x86/hdaps.c b/drivers/platform/x86/hdaps.c
index 04c4da6692d7..a72270932ec3 100644
--- a/drivers/platform/x86/hdaps.c
+++ b/drivers/platform/x86/hdaps.c
@@ -365,7 +365,7 @@ static ssize_t hdaps_variance_show(struct device *dev,
 static ssize_t hdaps_temp1_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
-	u8 uninitialized_var(temp);
+	u8 temp;
 	int ret;
 
 	ret = hdaps_readb_one(HDAPS_PORT_TEMP1, &temp);
@@ -378,7 +378,7 @@ static ssize_t hdaps_temp1_show(struct device *dev,
 static ssize_t hdaps_temp2_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
-	u8 uninitialized_var(temp);
+	u8 temp;
 	int ret;
 
 	ret = hdaps_readb_one(HDAPS_PORT_TEMP2, &temp);
diff --git a/drivers/scsi/dc395x.c b/drivers/scsi/dc395x.c
index e95f5b3bef4d..37c6cc374079 100644
--- a/drivers/scsi/dc395x.c
+++ b/drivers/scsi/dc395x.c
@@ -4126,7 +4126,7 @@ static int adapter_sg_tables_alloc(struct AdapterCtlBlk *acb)
 	const unsigned srbs_per_page = PAGE_SIZE/SEGMENTX_LEN;
 	int srb_idx = 0;
 	unsigned i = 0;
-	struct SGentry *uninitialized_var(ptr);
+	struct SGentry *ptr;
 
 	for (i = 0; i < DC395x_MAX_SRB_CNT; i++)
 		acb->srb_array[i].segment_x = NULL;
diff --git a/drivers/scsi/pm8001/pm8001_hwi.c b/drivers/scsi/pm8001/pm8001_hwi.c
index fb9848e1d481..0b4499210b95 100644
--- a/drivers/scsi/pm8001/pm8001_hwi.c
+++ b/drivers/scsi/pm8001/pm8001_hwi.c
@@ -4202,7 +4202,7 @@ static int process_oq(struct pm8001_hba_info *pm8001_ha, u8 vec)
 {
 	struct outbound_queue_table *circularQ;
 	void *pMsg1 = NULL;
-	u8 uninitialized_var(bc);
+	u8 bc;
 	u32 ret = MPI_IO_STATUS_FAIL;
 	unsigned long flags;
 
diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c
index 4d205ebaee87..05c944a3bdca 100644
--- a/drivers/scsi/pm8001/pm80xx_hwi.c
+++ b/drivers/scsi/pm8001/pm80xx_hwi.c
@@ -4182,7 +4182,7 @@ static int process_oq(struct pm8001_hba_info *pm8001_ha, u8 vec)
 {
 	struct outbound_queue_table *circularQ;
 	void *pMsg1 = NULL;
-	u8 uninitialized_var(bc);
+	u8 bc;
 	u32 ret = MPI_IO_STATUS_FAIL;
 	unsigned long flags;
 	u32 regval;
diff --git a/drivers/ssb/driver_chipcommon.c b/drivers/ssb/driver_chipcommon.c
index 3861cb659cb9..6c647ba4ba0b 100644
--- a/drivers/ssb/driver_chipcommon.c
+++ b/drivers/ssb/driver_chipcommon.c
@@ -119,7 +119,7 @@ void ssb_chipco_set_clockmode(struct ssb_chipcommon *cc,
 static enum ssb_clksrc chipco_pctl_get_slowclksrc(struct ssb_chipcommon *cc)
 {
 	struct ssb_bus *bus = cc->dev->bus;
-	u32 uninitialized_var(tmp);
+	u32 tmp;
 
 	if (cc->dev->id.revision < 6) {
 		if (bus->bustype == SSB_BUSTYPE_SSB ||
@@ -149,7 +149,7 @@ static enum ssb_clksrc chipco_pctl_get_slowclksrc(struct ssb_chipcommon *cc)
 /* Get maximum or minimum (depending on get_max flag) slowclock frequency. */
 static int chipco_pctl_clockfreqlimit(struct ssb_chipcommon *cc, int get_max)
 {
-	int uninitialized_var(limit);
+	int limit;
 	enum ssb_clksrc clocksrc;
 	int divisor = 1;
 	u32 tmp;
diff --git a/drivers/tty/cyclades.c b/drivers/tty/cyclades.c
index a6aabfd6e2da..097266342e5e 100644
--- a/drivers/tty/cyclades.c
+++ b/drivers/tty/cyclades.c
@@ -3643,7 +3643,7 @@ static int cy_pci_probe(struct pci_dev *pdev,
 	struct cyclades_card *card;
 	void __iomem *addr0 = NULL, *addr2 = NULL;
 	char *card_name = NULL;
-	u32 uninitialized_var(mailbox);
+	u32 mailbox;
 	unsigned int device_id, nchan = 0, card_no, i, j;
 	unsigned char plx_ver;
 	int retval, irq;
diff --git a/drivers/tty/isicom.c b/drivers/tty/isicom.c
index fc38f96475bf..3b2f9fb01aa0 100644
--- a/drivers/tty/isicom.c
+++ b/drivers/tty/isicom.c
@@ -1514,7 +1514,7 @@ static unsigned int card_count;
 static int isicom_probe(struct pci_dev *pdev,
 	const struct pci_device_id *ent)
 {
-	unsigned int uninitialized_var(signature), index;
+	unsigned int signature, index;
 	int retval = -EPERM;
 	struct isi_board *board = NULL;
 
diff --git a/drivers/usb/musb/cppi_dma.c b/drivers/usb/musb/cppi_dma.c
index b4d6d9bb3239..c545b27ea568 100644
--- a/drivers/usb/musb/cppi_dma.c
+++ b/drivers/usb/musb/cppi_dma.c
@@ -1146,7 +1146,7 @@ irqreturn_t cppi_interrupt(int irq, void *dev_id)
 	struct musb_hw_ep	*hw_ep = NULL;
 	u32			rx, tx;
 	int			i, index;
-	unsigned long		uninitialized_var(flags);
+	unsigned long		flags;
 
 	cppi = container_of(musb->dma_controller, struct cppi, controller);
 	if (cppi->irq)
diff --git a/drivers/usb/storage/sddr55.c b/drivers/usb/storage/sddr55.c
index ba955d65eb0e..c8a988d2cfdd 100644
--- a/drivers/usb/storage/sddr55.c
+++ b/drivers/usb/storage/sddr55.c
@@ -554,8 +554,8 @@ static int sddr55_reset(struct us_data *us)
 
 static unsigned long sddr55_get_capacity(struct us_data *us) {
 
-	unsigned char uninitialized_var(manufacturerID);
-	unsigned char uninitialized_var(deviceID);
+	unsigned char manufacturerID;
+	unsigned char deviceID;
 	int result;
 	struct sddr55_card_info *info = (struct sddr55_card_info *)us->extra;
 
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e992decfec53..eea902b83afe 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -862,7 +862,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 	struct tun_msg_ctl ctl;
 	size_t len, total_len = 0;
 	int err;
-	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
+	struct vhost_net_ubuf_ref *ubufs;
 	bool zcopy_used;
 	int sent_pkts = 0;
 
@@ -1042,7 +1042,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 	/* len is always initialized before use since we are always called with
 	 * datalen > 0.
 	 */
-	u32 uninitialized_var(len);
+	u32 len;
 
 	while (datalen > 0 && headcount < quota) {
 		if (unlikely(seg >= UIO_MAXIOV)) {
@@ -1099,7 +1099,7 @@ static void handle_rx(struct vhost_net *net)
 {
 	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
 	struct vhost_virtqueue *vq = &nvq->vq;
-	unsigned uninitialized_var(in), log;
+	unsigned in, log;
 	struct vhost_log *vq_log;
 	struct msghdr msg = {
 		.msg_name = NULL,
diff --git a/drivers/video/fbdev/matrox/matroxfb_maven.c b/drivers/video/fbdev/matrox/matroxfb_maven.c
index eda893b7a2e9..9a98c4a6ba33 100644
--- a/drivers/video/fbdev/matrox/matroxfb_maven.c
+++ b/drivers/video/fbdev/matrox/matroxfb_maven.c
@@ -300,7 +300,7 @@ static int matroxfb_mavenclock(const struct matrox_pll_ctl *ctl,
 		unsigned int* in, unsigned int* feed, unsigned int* post,
 		unsigned int* htotal2) {
 	unsigned int fvco;
-	unsigned int uninitialized_var(p);
+	unsigned int p;
 
 	fvco = matroxfb_PLL_mavenclock(&maven1000_pll, ctl, htotal, vtotal, in, feed, &p, htotal2);
 	if (!fvco)
@@ -732,8 +732,8 @@ static int maven_find_exact_clocks(unsigned int ht, unsigned int vt,
 
 	for (x = 0; x < 8; x++) {
 		unsigned int c;
-		unsigned int uninitialized_var(a), uninitialized_var(b),
-			     uninitialized_var(h2);
+		unsigned int a, b,
+			     h2;
 		unsigned int h = ht + 2 + x;
 
 		if (!matroxfb_mavenclock((m->mode == MATROXFB_OUTPUT_MODE_PAL) ? &maven_PAL : &maven_NTSC, h, vt, &a, &b, &c, &h2)) {
diff --git a/drivers/video/fbdev/pm3fb.c b/drivers/video/fbdev/pm3fb.c
index 7497bd36334c..a8faf46adeb1 100644
--- a/drivers/video/fbdev/pm3fb.c
+++ b/drivers/video/fbdev/pm3fb.c
@@ -821,9 +821,9 @@ static void pm3fb_write_mode(struct fb_info *info)
 
 	wmb();
 	{
-		unsigned char uninitialized_var(m);	/* ClkPreScale */
-		unsigned char uninitialized_var(n);	/* ClkFeedBackScale */
-		unsigned char uninitialized_var(p);	/* ClkPostScale */
+		unsigned char m;	/* ClkPreScale */
+		unsigned char n;	/* ClkFeedBackScale */
+		unsigned char p;	/* ClkPostScale */
 		unsigned long pixclock = PICOS2KHZ(info->var.pixclock);
 
 		(void)pm3fb_calculate_clock(pixclock, &m, &n, &p);
diff --git a/drivers/video/fbdev/riva/riva_hw.c b/drivers/video/fbdev/riva/riva_hw.c
index 08c9ee46978e..4168ac464565 100644
--- a/drivers/video/fbdev/riva/riva_hw.c
+++ b/drivers/video/fbdev/riva/riva_hw.c
@@ -1245,8 +1245,7 @@ int CalcStateExt
 )
 {
     int pixelDepth;
-    int uninitialized_var(VClk),uninitialized_var(m),
-        uninitialized_var(n),	uninitialized_var(p);
+    int VClk, m, n, p;
 
     /*
      * Save mode parameters.
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 58b96baa8d48..a2de775801af 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -424,7 +424,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
 	struct vring_virtqueue *vq = to_vvq(_vq);
 	struct scatterlist *sg;
 	struct vring_desc *desc;
-	unsigned int i, n, avail, descs_used, uninitialized_var(prev), err_idx;
+	unsigned int i, n, avail, descs_used, prev, err_idx;
 	int head;
 	bool indirect;
 
@@ -1101,8 +1101,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
 	struct vring_packed_desc *desc;
 	struct scatterlist *sg;
 	unsigned int i, n, c, descs_used, err_idx;
-	__le16 uninitialized_var(head_flags), flags;
-	u16 head, id, uninitialized_var(prev), curr, avail_used_flags;
+	__le16 head_flags, flags;
+	u16 head, id, prev, curr, avail_used_flags;
 
 	START_USE(vq);
 
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 96757f3abd74..1d2e61e0ab04 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1053,7 +1053,7 @@ static int afs_d_revalidate_rcu(struct dentry *dentry)
 static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 {
 	struct afs_vnode *vnode, *dir;
-	struct afs_fid uninitialized_var(fid);
+	struct afs_fid fid;
 	struct dentry *parent;
 	struct inode *inode;
 	struct key *key;
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 90d852704328..9cf3102f370c 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -399,7 +399,7 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key,
 int afs_permission(struct inode *inode, int mask)
 {
 	struct afs_vnode *vnode = AFS_FS_I(inode);
-	afs_access_t uninitialized_var(access);
+	afs_access_t access;
 	struct key *key;
 	int ret = 0;
 
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index e7f550327d5d..e338c407cb75 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -113,7 +113,7 @@ static void fill_data(struct dlm_lock_data *data, struct dlm_lkb *lkb)
 
 void dlm_timeout_warn(struct dlm_lkb *lkb)
 {
-	struct sk_buff *uninitialized_var(send_skb);
+	struct sk_buff *send_skb;
 	struct dlm_lock_data *data;
 	size_t size;
 	int rv;
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 64b56c7df023..d0542151e8c4 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -265,7 +265,7 @@ submit_bio_out:
  */
 static int erofs_raw_access_readpage(struct file *file, struct page *page)
 {
-	erofs_off_t uninitialized_var(last_block);
+	erofs_off_t last_block;
 	struct bio *bio;
 
 	trace_erofs_readpage(page, true);
@@ -282,7 +282,7 @@ static int erofs_raw_access_readpage(struct file *file, struct page *page)
 
 static void erofs_raw_access_readahead(struct readahead_control *rac)
 {
-	erofs_off_t uninitialized_var(last_block);
+	erofs_off_t last_block;
 	struct bio *bio = NULL;
 	struct page *page;
 
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index be50a4d9d273..24a26aaf847f 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -1161,7 +1161,7 @@ static void z_erofs_submit_queue(struct super_block *sb,
 	struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
 	void *bi_private;
 	/* since bio will be NULL, no need to initialize last_index */
-	pgoff_t uninitialized_var(last_index);
+	pgoff_t last_index;
 	unsigned int nr_bios = 0;
 	struct bio *bio = NULL;
 
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index b4ddf48fa444..c4a274285858 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -1284,7 +1284,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
 	struct super_block *sb = dir->i_sb;
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 	struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */
-	struct msdos_dir_entry *uninitialized_var(de);
+	struct msdos_dir_entry *de;
 	int err, free_slots, i, nr_bhs;
 	loff_t pos, i_pos;
 
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index c23f6f243ad4..a1303ad303ba 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -120,7 +120,7 @@ static ssize_t fuse_conn_max_background_write(struct file *file,
 					      const char __user *buf,
 					      size_t count, loff_t *ppos)
 {
-	unsigned uninitialized_var(val);
+	unsigned val;
 	ssize_t ret;
 
 	ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
@@ -162,7 +162,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
 						    const char __user *buf,
 						    size_t count, loff_t *ppos)
 {
-	unsigned uninitialized_var(val);
+	unsigned val;
 	struct fuse_conn *fc;
 	ssize_t ret;
 
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 030f094910c3..2cc17816d7b1 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -270,7 +270,7 @@ static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp)
 static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo)
 {
 	char *end = p + len;
-	char *uninitialized_var(key), *uninitialized_var(val);
+	char *key, *val;
 	int rc;
 
 	while (true) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index e573b0cd2737..32301fe626ba 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2963,7 +2963,7 @@ static void fuse_register_polled_file(struct fuse_conn *fc,
 {
 	spin_lock(&fc->lock);
 	if (RB_EMPTY_NODE(&ff->polled_node)) {
-		struct rb_node **link, *uninitialized_var(parent);
+		struct rb_node **link, *parent;
 
 		link = fuse_find_polled_node(fc, ff->kh, &parent);
 		BUG_ON(*link);
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 72c9560f4467..baa17c781870 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -335,7 +335,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
 	int done = 0;
 	struct pagevec pvec;
 	int nr_pages;
-	pgoff_t uninitialized_var(writeback_index);
+	pgoff_t writeback_index;
 	pgoff_t index;
 	pgoff_t end;
 	pgoff_t done_index;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6306eaae378b..8dfe09f52cbc 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1761,7 +1761,7 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
 	u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift;
 	__u16 start_list[GFS2_MAX_META_HEIGHT];
 	__u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL;
-	unsigned int start_aligned, uninitialized_var(end_aligned);
+	unsigned int start_aligned, end_aligned;
 	unsigned int strip_h = ip->i_height - 1;
 	u32 btotal = 0;
 	int ret, state;
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index cb2a11b458c6..ed1da4323967 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -419,7 +419,7 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd,
 			      struct page *page)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
-	struct gfs2_log_header_host uninitialized_var(lh);
+	struct gfs2_log_header_host lh;
 	void *kaddr = kmap_atomic(page);
 	unsigned int offset;
 	bool ret = false;
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index c8d1b2be7854..73342c925a4b 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -398,7 +398,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str)
 	astr = str->name;
 	len = str->len;
 	while (len > 0) {
-		int uninitialized_var(dsize);
+		int dsize;
 		size = asc2unichar(sb, astr, len, &c);
 		astr += size;
 		len -= size;
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index cac468f04820..402769881c32 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -152,8 +152,8 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry,
 struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 {
 	int found;
-	unsigned long uninitialized_var(block);
-	unsigned long uninitialized_var(offset);
+	unsigned long block;
+	unsigned long offset;
 	struct inode *inode;
 	struct page *page;
 
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 83b8f06b4a64..7e9abdb89712 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -401,7 +401,7 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
 {
 	size_t retlen;
 	int ret;
-	uint32_t uninitialized_var(bad_offset);
+	uint32_t bad_offset;
 
 	switch (jffs2_block_check_erase(c, jeb, &bad_offset)) {
 	case -EAGAIN:	goto refile;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index b68e96681522..bb1804bab1e1 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -351,7 +351,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
 static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
 {
 	char *dname, *path;
-	int uninitialized_var(maxsize);
+	int maxsize;
 	char *mesg = buf;
 	int len;
 	struct auth_domain *dom;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 2f834add165b..4c1b90442d6f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -4707,7 +4707,7 @@ int ocfs2_insert_extent(handle_t *handle,
 			struct ocfs2_alloc_context *meta_ac)
 {
 	int status;
-	int uninitialized_var(free_records);
+	int free_records;
 	struct buffer_head *last_eb_bh = NULL;
 	struct ocfs2_insert_type insert = {0, };
 	struct ocfs2_extent_rec rec;
@@ -7051,7 +7051,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 	int need_free = 0;
 	u32 bit_off, num;
 	handle_t *handle;
-	u64 uninitialized_var(block);
+	u64 block;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 5761060d2ba8..bdfba9db558a 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -848,9 +848,9 @@ static int ocfs2_dx_dir_lookup(struct inode *inode,
 			       u64 *ret_phys_blkno)
 {
 	int ret = 0;
-	unsigned int cend, uninitialized_var(clen);
-	u32 uninitialized_var(cpos);
-	u64 uninitialized_var(blkno);
+	unsigned int cend, clen;
+	u32 cpos;
+	u64 blkno;
 	u32 name_hash = hinfo->major_hash;
 
 	ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno,
@@ -894,7 +894,7 @@ static int ocfs2_dx_dir_search(const char *name, int namelen,
 			       struct ocfs2_dir_lookup_result *res)
 {
 	int ret, i, found;
-	u64 uninitialized_var(phys);
+	u64 phys;
 	struct buffer_head *dx_leaf_bh = NULL;
 	struct ocfs2_dx_leaf *dx_leaf;
 	struct ocfs2_dx_entry *dx_entry = NULL;
@@ -4393,9 +4393,9 @@ out:
 int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
 {
 	int ret;
-	unsigned int uninitialized_var(clen);
-	u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos);
-	u64 uninitialized_var(blkno);
+	unsigned int clen;
+	u32 major_hash = UINT_MAX, p_cpos, cpos;
+	u64 blkno;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
 	struct buffer_head *dx_root_bh = NULL;
 	struct ocfs2_dx_root_block *dx_root;
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index a94852af5510..7b93e9c766f6 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -403,7 +403,7 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
 {
 	int i, ret, tree_height, len;
 	struct ocfs2_dinode *di;
-	struct ocfs2_extent_block *uninitialized_var(eb);
+	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *el;
 	struct ocfs2_extent_rec *rec;
 	struct buffer_head *eb_bh = NULL;
@@ -599,7 +599,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 		       unsigned int *extent_flags)
 {
 	int ret;
-	unsigned int uninitialized_var(hole_len), flags = 0;
+	unsigned int hole_len, flags = 0;
 	struct buffer_head *di_bh = NULL;
 	struct ocfs2_extent_rec rec;
 
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 5381020aaa9a..c46bf7f581a1 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -2498,7 +2498,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
 	struct buffer_head *new_di_bh = NULL;
 	struct ocfs2_alloc_context *inode_ac = NULL;
 	struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
-	u64 uninitialized_var(di_blkno), suballoc_loc;
+	u64 di_blkno, suballoc_loc;
 	u16 suballoc_bit;
 
 	status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index cfb77f70c888..3b397fa9c9e8 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -1063,7 +1063,7 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
 				  struct buffer_head **ret_bh)
 {
 	int ret = 0, i, found;
-	u32 low_cpos, uninitialized_var(cpos_end);
+	u32 low_cpos, cpos_end;
 	struct ocfs2_extent_list *el;
 	struct ocfs2_extent_rec *rec = NULL;
 	struct ocfs2_extent_block *eb = NULL;
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 90c830e3758e..9ccd19d8f7b1 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1211,7 +1211,7 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 	struct ocfs2_xattr_value_root *xv;
 	size_t size;
 	int ret = -ENODATA, name_offset, name_len, i;
-	int uninitialized_var(block_off);
+	int block_off;
 
 	xs->bucket = ocfs2_xattr_bucket_new(inode);
 	if (!xs->bucket) {
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index d7b5f09d298c..2c7b70ee1388 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -220,7 +220,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
 	struct buffer_head *bh;
 	sector_t next, offset;
 	int ret;
-	u64 uninitialized_var(new_block);
+	u64 new_block;
 	u32 max_extents;
 	int extent_count;
 	struct omfs_extent *oe;
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 79dd052c7dbf..8e1c308524a2 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -787,7 +787,7 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
 	struct path upperpath, datapath;
 	int err;
 	char *capability = NULL;
-	ssize_t uninitialized_var(cap_size);
+	ssize_t cap_size;
 
 	ovl_path_upper(c->dentry, &upperpath);
 	if (WARN_ON(upperpath.dentry == NULL))
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index ad292c5a43a9..b5cdac9b0368 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -552,11 +552,11 @@ out:
  */
 int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
 {
-	int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt;
+	int lnum, offs, len, err = 0, last_level, child_cnt;
 	int first = 1, iip;
 	struct ubifs_debug_info *d = c->dbg;
-	union ubifs_key uninitialized_var(lower_key), upper_key, l_key, u_key;
-	unsigned long long uninitialized_var(last_sqnum);
+	union ubifs_key lower_key, upper_key, l_key, u_key;
+	unsigned long long last_sqnum;
 	struct ubifs_idx_node *idx;
 	struct list_head list;
 	struct idx_node *i;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ef85ec167a84..9d042942d8b2 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1260,7 +1260,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
 			.dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
 	struct timespec64 time;
-	unsigned int uninitialized_var(saved_nlink);
+	unsigned int saved_nlink;
 	struct fscrypt_name old_nm, new_nm;
 
 	/*
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 49fe062ce45e..b77d1637bbbc 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -222,7 +222,7 @@ static int write_begin_slow(struct address_space *mapping,
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 	pgoff_t index = pos >> PAGE_SHIFT;
 	struct ubifs_budget_req req = { .new_page = 1 };
-	int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
+	int err, appending = !!(pos + len > inode->i_size);
 	struct page *page;
 
 	dbg_gen("ino %lu, pos %llu, len %u, i_size %lld",
@@ -426,7 +426,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 	struct ubifs_inode *ui = ubifs_inode(inode);
 	pgoff_t index = pos >> PAGE_SHIFT;
-	int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
+	int err, appending = !!(pos + len > inode->i_size);
 	int skipped_read = 0;
 	struct page *page;
 
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index e5ec1afe1c66..2e6264318bd9 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -1222,7 +1222,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
 	int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ;
 	int last_reference = !!(new_inode && new_inode->i_nlink == 0);
 	int move = (old_dir != new_dir);
-	struct ubifs_inode *uninitialized_var(new_ui);
+	struct ubifs_inode *new_ui;
 	u8 hash_old_dir[UBIFS_HASH_ARR_SZ];
 	u8 hash_new_dir[UBIFS_HASH_ARR_SZ];
 	u8 hash_new_inode[UBIFS_HASH_ARR_SZ];
@@ -1507,7 +1507,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
 	union ubifs_key key, to_key;
 	struct ubifs_ino_node *ino;
 	struct ubifs_trun_node *trun;
-	struct ubifs_data_node *uninitialized_var(dn);
+	struct ubifs_data_node *dn;
 	int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode);
 	struct ubifs_inode *ui = ubifs_inode(inode);
 	ino_t inum = inode->i_ino;
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index e21abf250951..6e0a153b7194 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -275,7 +275,7 @@ uint32_t ubifs_unpack_bits(const struct ubifs_info *c, uint8_t **addr, int *pos,
 	const int k = 32 - nrbits;
 	uint8_t *p = *addr;
 	int b = *pos;
-	uint32_t uninitialized_var(val);
+	uint32_t val;
 	const int bytes = (nrbits + b + 7) >> 3;
 
 	ubifs_assert(c, nrbits > 0);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e8e7b0e9532e..f609f6cdde70 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -892,7 +892,7 @@ static int fallible_resolve_collision(struct ubifs_info *c,
 				      int adding)
 {
 	struct ubifs_znode *o_znode = NULL, *znode = *zn;
-	int uninitialized_var(o_n), err, cmp, unsure = 0, nn = *n;
+	int o_n, err, cmp, unsure = 0, nn = *n;
 
 	cmp = fallible_matches_name(c, &znode->zbranch[nn], nm);
 	if (unlikely(cmp < 0))
@@ -1514,8 +1514,8 @@ out:
  */
 int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu)
 {
-	int n, err = 0, lnum = -1, uninitialized_var(offs);
-	int uninitialized_var(len);
+	int n, err = 0, lnum = -1, offs;
+	int len;
 	unsigned int block = key_block(c, &bu->key);
 	struct ubifs_znode *znode;
 
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
index 49cb34c3f324..ccaf94ea5be3 100644
--- a/fs/ubifs/tnc_misc.c
+++ b/fs/ubifs/tnc_misc.c
@@ -126,8 +126,8 @@ int ubifs_search_zbranch(const struct ubifs_info *c,
 			 const struct ubifs_znode *znode,
 			 const union ubifs_key *key, int *n)
 {
-	int beg = 0, end = znode->child_cnt, uninitialized_var(mid);
-	int uninitialized_var(cmp);
+	int beg = 0, end = znode->child_cnt, mid;
+	int cmp;
 	const struct ubifs_zbranch *zbr = &znode->zbranch[0];
 
 	ubifs_assert(c, end > beg);
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 02f03fadb75b..8e597db4d971 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -564,7 +564,7 @@ static udf_pblk_t udf_table_new_block(struct super_block *sb,
 	udf_pblk_t newblock = 0;
 	uint32_t adsize;
 	uint32_t elen, goal_elen = 0;
-	struct kernel_lb_addr eloc, uninitialized_var(goal_eloc);
+	struct kernel_lb_addr eloc, goal_eloc;
 	struct extent_position epos, goal_epos;
 	int8_t etype;
 	struct udf_inode_info *iinfo = UDF_I(table);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index f37f5cc4b19f..30525861c596 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -126,7 +126,7 @@ xfs_bmap_rtalloc(
 	 * pick an extent that will space things out in the rt area.
 	 */
 	if (ap->eof && ap->offset == 0) {
-		xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
+		xfs_rtblock_t rtx; /* realtime extent no */
 
 		error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
 		if (error)
diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 6315324b9dc2..56e4afb2e729 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -305,7 +305,7 @@ flow_action_mixed_hw_stats_check(const struct flow_action *action,
 				 struct netlink_ext_ack *extack)
 {
 	const struct flow_action_entry *action_entry;
-	u8 uninitialized_var(last_hw_stats);
+	u8 last_hw_stats;
 	int i;
 
 	if (flow_offload_has_one_action(action))
diff --git a/kernel/async.c b/kernel/async.c
index 4f9c1d614016..33258e6e20f8 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -111,7 +111,7 @@ static void async_run_entry_fn(struct work_struct *work)
 	struct async_entry *entry =
 		container_of(work, struct async_entry, work);
 	unsigned long flags;
-	ktime_t uninitialized_var(calltime), delta, rettime;
+	ktime_t calltime, delta, rettime;
 
 	/* 1) run (and print duration) */
 	if (initcall_debug && system_state < SYSTEM_RUNNING) {
@@ -287,7 +287,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
  */
 void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
 {
-	ktime_t uninitialized_var(starttime), delta, endtime;
+	ktime_t starttime, delta, endtime;
 
 	if (initcall_debug && system_state < SYSTEM_RUNNING) {
 		pr_debug("async_waiting @ %i\n", task_pid_nr(current));
diff --git a/kernel/audit.c b/kernel/audit.c
index 8c201f414226..ec38479f9228 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1800,7 +1800,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
 {
 	struct audit_buffer *ab;
 	struct timespec64 t;
-	unsigned int uninitialized_var(serial);
+	unsigned int serial;
 
 	if (audit_initialized != AUDIT_INITIALIZED)
 		return NULL;
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 683a799618ad..9d847ab851db 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -591,7 +591,7 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
 	int this_cpu, old_cpu;
 	char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
 	char *moreprompt = "more> ";
-	unsigned long uninitialized_var(flags);
+	unsigned long flags;
 
 	/* Serialize kdb_printf if multiple cpus try to write at once.
 	 * But if any cpu goes recursive in kdb, just print the output,
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 36c962a86bf2..d628ab09d97b 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -882,7 +882,7 @@ static int device_dma_allocations(struct device *dev, struct dma_debug_entry **o
 static int dma_debug_device_change(struct notifier_block *nb, unsigned long action, void *data)
 {
 	struct device *dev = data;
-	struct dma_debug_entry *uninitialized_var(entry);
+	struct dma_debug_entry *entry;
 	int count;
 
 	if (dma_debug_disabled())
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 856d98c36f56..851fc5e0e24b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11483,7 +11483,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	struct perf_event *group_leader = NULL, *output_event = NULL;
 	struct perf_event *event, *sibling;
 	struct perf_event_attr attr;
-	struct perf_event_context *ctx, *uninitialized_var(gctx);
+	struct perf_event_context *ctx, *gctx;
 	struct file *event_file = NULL;
 	struct fd group = {NULL, 0};
 	struct task_struct *task = NULL;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index bb0862873dba..e84eb52b646b 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2189,7 +2189,7 @@ static void handle_swbp(struct pt_regs *regs)
 {
 	struct uprobe *uprobe;
 	unsigned long bp_vaddr;
-	int uninitialized_var(is_swbp);
+	int is_swbp;
 
 	bp_vaddr = uprobe_get_swbp_addr(regs);
 	if (bp_vaddr == get_trampoline_vaddr())
diff --git a/kernel/exit.c b/kernel/exit.c
index 727150f28103..7bcd571618dd 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -93,7 +93,7 @@ static void __exit_signal(struct task_struct *tsk)
 	struct signal_struct *sig = tsk->signal;
 	bool group_dead = thread_group_leader(tsk);
 	struct sighand_struct *sighand;
-	struct tty_struct *uninitialized_var(tty);
+	struct tty_struct *tty;
 	u64 utime, stime;
 
 	sighand = rcu_dereference_check(tsk->sighand,
diff --git a/kernel/futex.c b/kernel/futex.c
index e646661f6282..05e88562de68 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1326,7 +1326,7 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval,
 static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
 {
 	int err;
-	u32 uninitialized_var(curval);
+	u32 curval;
 
 	if (unlikely(should_fail_futex(true)))
 		return -EFAULT;
@@ -1496,7 +1496,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
  */
 static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
 {
-	u32 uninitialized_var(curval), newval;
+	u32 curval, newval;
 	struct task_struct *new_owner;
 	bool postunlock = false;
 	DEFINE_WAKE_Q(wake_q);
@@ -2370,7 +2370,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 				struct task_struct *argowner)
 {
 	struct futex_pi_state *pi_state = q->pi_state;
-	u32 uval, uninitialized_var(curval), newval;
+	u32 uval, curval, newval;
 	struct task_struct *oldowner, *newowner;
 	u32 newtid;
 	int ret, err = 0;
@@ -2996,7 +2996,7 @@ uaddr_faulted:
  */
 static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 {
-	u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
+	u32 curval, uval, vpid = task_pid_vnr(current);
 	union futex_key key = FUTEX_KEY_INIT;
 	struct futex_hash_bucket *hb;
 	struct futex_q *top_waiter;
@@ -3479,7 +3479,7 @@ err_unlock:
 static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
 			      bool pi, bool pending_op)
 {
-	u32 uval, uninitialized_var(nval), mval;
+	u32 uval, nval, mval;
 	int err;
 
 	/* Futex address must be 32bit aligned */
@@ -3609,7 +3609,7 @@ static void exit_robust_list(struct task_struct *curr)
 	struct robust_list_head __user *head = curr->robust_list;
 	struct robust_list __user *entry, *next_entry, *pending;
 	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
-	unsigned int uninitialized_var(next_pi);
+	unsigned int next_pi;
 	unsigned long futex_offset;
 	int rc;
 
@@ -3909,7 +3909,7 @@ static void compat_exit_robust_list(struct task_struct *curr)
 	struct compat_robust_list_head __user *head = curr->compat_robust_list;
 	struct robust_list __user *entry, *next_entry, *pending;
 	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
-	unsigned int uninitialized_var(next_pi);
+	unsigned int next_pi;
 	compat_uptr_t uentry, next_uentry, upending;
 	compat_long_t futex_offset;
 	int rc;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 29a8de4c50b9..84ed1d1d5013 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1723,7 +1723,7 @@ static int noop_count(struct lock_list *entry, void *data)
 static unsigned long __lockdep_count_forward_deps(struct lock_list *this)
 {
 	unsigned long  count = 0;
-	struct lock_list *uninitialized_var(target_entry);
+	struct lock_list *target_entry;
 
 	__bfs_forwards(this, (void *)&count, noop_count, &target_entry);
 
@@ -1749,7 +1749,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
 static unsigned long __lockdep_count_backward_deps(struct lock_list *this)
 {
 	unsigned long  count = 0;
-	struct lock_list *uninitialized_var(target_entry);
+	struct lock_list *target_entry;
 
 	__bfs_backwards(this, (void *)&count, noop_count, &target_entry);
 
@@ -1804,7 +1804,7 @@ check_noncircular(struct held_lock *src, struct held_lock *target,
 		  struct lock_trace **const trace)
 {
 	int ret;
-	struct lock_list *uninitialized_var(target_entry);
+	struct lock_list *target_entry;
 	struct lock_list src_entry = {
 		.class = hlock_class(src),
 		.parent = NULL,
@@ -1842,7 +1842,7 @@ static noinline int
 check_redundant(struct held_lock *src, struct held_lock *target)
 {
 	int ret;
-	struct lock_list *uninitialized_var(target_entry);
+	struct lock_list *target_entry;
 	struct lock_list src_entry = {
 		.class = hlock_class(src),
 		.parent = NULL,
@@ -2244,8 +2244,8 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
 {
 	unsigned long usage_mask = 0, forward_mask, backward_mask;
 	enum lock_usage_bit forward_bit = 0, backward_bit = 0;
-	struct lock_list *uninitialized_var(target_entry1);
-	struct lock_list *uninitialized_var(target_entry);
+	struct lock_list *target_entry1;
+	struct lock_list *target_entry;
 	struct lock_list this, that;
 	int ret;
 
@@ -3438,7 +3438,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
 {
 	int ret;
 	struct lock_list root;
-	struct lock_list *uninitialized_var(target_entry);
+	struct lock_list *target_entry;
 
 	root.parent = NULL;
 	root.class = hlock_class(this);
@@ -3465,7 +3465,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
 {
 	int ret;
 	struct lock_list root;
-	struct lock_list *uninitialized_var(target_entry);
+	struct lock_list *target_entry;
 
 	root.parent = NULL;
 	root.class = hlock_class(this);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 00867ff82412..f15471ce969e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -577,7 +577,7 @@ static void rb_wake_up_waiters(struct irq_work *work)
  */
 int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
 {
-	struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
+	struct ring_buffer_per_cpu *cpu_buffer;
 	DEFINE_WAIT(wait);
 	struct rb_irq_work *work;
 	int ret = 0;
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 34e406fe561f..8e4a3a4397f2 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -1029,7 +1029,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
 {
 	struct radix_tree_node *node, *parent;
 	unsigned long maxindex;
-	int uninitialized_var(offset);
+	int offset;
 
 	radix_tree_load_root(root, &node, &maxindex);
 	if (index > maxindex)
diff --git a/lib/test_lockup.c b/lib/test_lockup.c
index bd7c7ff39f6b..ff26f36d729f 100644
--- a/lib/test_lockup.c
+++ b/lib/test_lockup.c
@@ -168,7 +168,7 @@ static int master_cpu;
 
 static void test_lock(bool master, bool verbose)
 {
-	u64 uninitialized_var(wait_start);
+	u64 wait_start;
 
 	if (measure_lock_wait)
 		wait_start = local_clock();
diff --git a/mm/frontswap.c b/mm/frontswap.c
index bfa3a339253e..9d977b1fc016 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -446,7 +446,7 @@ static int __frontswap_shrink(unsigned long target_pages,
 void frontswap_shrink(unsigned long target_pages)
 {
 	unsigned long pages_to_unuse = 0;
-	int uninitialized_var(type), ret;
+	int type, ret;
 
 	/*
 	 * we don't want to hold swap_lock while doing a very
diff --git a/mm/ksm.c b/mm/ksm.c
index 4102034cd55a..5fb176d497ea 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2387,7 +2387,7 @@ next_mm:
 static void ksm_do_scan(unsigned int scan_npages)
 {
 	struct rmap_item *rmap_item;
-	struct page *uninitialized_var(page);
+	struct page *page;
 
 	while (scan_npages-- && likely(!freezing(current))) {
 		cond_resched();
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 19622328e4b5..c888d9b5c745 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1004,7 +1004,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				   struct mem_cgroup *prev,
 				   struct mem_cgroup_reclaim_cookie *reclaim)
 {
-	struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
+	struct mem_cgroup_reclaim_iter *iter;
 	struct cgroup_subsys_state *css = NULL;
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *pos = NULL;
diff --git a/mm/memory.c b/mm/memory.c
index 87ec87cdc1ff..5b5887b0e7f4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2205,7 +2205,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 {
 	pte_t *pte;
 	int err = 0;
-	spinlock_t *uninitialized_var(ptl);
+	spinlock_t *ptl;
 
 	if (create) {
 		pte = (mm == &init_mm) ?
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 381320671677..b9e85d467352 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1234,7 +1234,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 static struct page *new_page(struct page *page, unsigned long start)
 {
 	struct vm_area_struct *vma;
-	unsigned long uninitialized_var(address);
+	unsigned long address;
 
 	vma = find_vma(current->mm, start);
 	while (vma) {
@@ -1629,7 +1629,7 @@ static int kernel_get_mempolicy(int __user *policy,
 				unsigned long flags)
 {
 	int err;
-	int uninitialized_var(pval);
+	int pval;
 	nodemask_t nodes;
 
 	addr = untagged_addr(addr);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 48eb0f1410d4..b52a3a2a5edd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -961,7 +961,7 @@ static inline void __free_one_page(struct page *page,
 		int migratetype, bool report)
 {
 	struct capture_control *capc = task_capc(zone);
-	unsigned long uninitialized_var(buddy_pfn);
+	unsigned long buddy_pfn;
 	unsigned long combined_pfn;
 	unsigned int max_order;
 	struct page *buddy;
diff --git a/mm/percpu.c b/mm/percpu.c
index 696367b18222..b626766160ce 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -2513,7 +2513,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
 	const size_t static_size = __per_cpu_end - __per_cpu_start;
 	int nr_groups = 1, nr_units = 0;
 	size_t size_sum, min_unit_size, alloc_size;
-	int upa, max_upa, uninitialized_var(best_upa);	/* units_per_alloc */
+	int upa, max_upa, best_upa;	/* units_per_alloc */
 	int last_allocs, group, unit;
 	unsigned int cpu, tcpu;
 	struct pcpu_alloc_info *ai;
diff --git a/mm/slub.c b/mm/slub.c
index ef303070d175..f226d66408ee 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1218,7 +1218,7 @@ static noinline int free_debug_processing(
 	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
 	void *object = head;
 	int cnt = 0;
-	unsigned long uninitialized_var(flags);
+	unsigned long flags;
 	int ret = 0;
 
 	spin_lock_irqsave(&n->list_lock, flags);
@@ -2901,7 +2901,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 	struct page new;
 	unsigned long counters;
 	struct kmem_cache_node *n = NULL;
-	unsigned long uninitialized_var(flags);
+	unsigned long flags;
 
 	stat(s, FREE_SLOWPATH);
 
diff --git a/mm/swap.c b/mm/swap.c
index a82efc33411f..de257c0a89b1 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -830,8 +830,8 @@ void release_pages(struct page **pages, int nr)
 	LIST_HEAD(pages_to_free);
 	struct pglist_data *locked_pgdat = NULL;
 	struct lruvec *lruvec;
-	unsigned long uninitialized_var(flags);
-	unsigned int uninitialized_var(lock_batch);
+	unsigned long flags;
+	unsigned int lock_batch;
 
 	for (i = 0; i < nr; i++) {
 		struct page *page = pages[i];
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 3b42f5c6a63d..9fed0ae21e63 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -56,7 +56,7 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 					(dh->dccph_doff * 4);
 	struct dccp_options_received *opt_recv = &dp->dccps_options_received;
 	unsigned char opt, len;
-	unsigned char *uninitialized_var(value);
+	unsigned char *value;
 	u32 elapsed_time;
 	__be32 opt_val;
 	int rc;
diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c
index c94445b44d8c..2d42e4c35a20 100644
--- a/net/ipv4/netfilter/nf_socket_ipv4.c
+++ b/net/ipv4/netfilter/nf_socket_ipv4.c
@@ -84,11 +84,11 @@ nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff,
 struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb,
 				  const struct net_device *indev)
 {
-	__be32 uninitialized_var(daddr), uninitialized_var(saddr);
-	__be16 uninitialized_var(dport), uninitialized_var(sport);
+	__be32 daddr, saddr;
+	__be16 dport, sport;
 	const struct iphdr *iph = ip_hdr(skb);
 	struct sk_buff *data_skb = NULL;
-	u8 uninitialized_var(protocol);
+	u8 protocol;
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn const *ct;
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index ce4fbba4acce..73bb047e6037 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -535,7 +535,7 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq,
 
 int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen)
 {
-	int uninitialized_var(err);
+	int err;
 	struct net *net = sock_net(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct in6_flowlabel_req freq;
diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c
index b9df879c48d3..6fd54744cbc3 100644
--- a/net/ipv6/netfilter/nf_socket_ipv6.c
+++ b/net/ipv6/netfilter/nf_socket_ipv6.c
@@ -97,7 +97,7 @@ nf_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff,
 struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
 				  const struct net_device *indev)
 {
-	__be16 uninitialized_var(dport), uninitialized_var(sport);
+	__be16 dport, sport;
 	const struct in6_addr *daddr = NULL, *saddr = NULL;
 	struct ipv6hdr *iph = ipv6_hdr(skb);
 	struct sk_buff *data_skb = NULL;
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index 9eca90414bb7..b22801f97bce 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -382,7 +382,7 @@ static int help(struct sk_buff *skb,
 	int ret;
 	u32 seq;
 	int dir = CTINFO2DIR(ctinfo);
-	unsigned int uninitialized_var(matchlen), uninitialized_var(matchoff);
+	unsigned int matchlen, matchoff;
 	struct nf_ct_ftp_master *ct_ftp_info = nfct_help_data(ct);
 	struct nf_conntrack_expect *exp;
 	union nf_inet_addr *daddr;
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 0ba020ca38e6..f02992419850 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -689,7 +689,7 @@ nfulnl_log_packet(struct net *net,
 	struct nfnl_log_net *log = nfnl_log_pernet(net);
 	const struct nfnl_ct_hook *nfnl_ct = NULL;
 	struct nf_conn *ct = NULL;
-	enum ip_conntrack_info uninitialized_var(ctinfo);
+	enum ip_conntrack_info ctinfo;
 
 	if (li_user && li_user->type == NF_LOG_TYPE_ULOG)
 		li = li_user;
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 3243a31f6e82..dadfc06245a3 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -388,7 +388,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
 	struct net_device *indev;
 	struct net_device *outdev;
 	struct nf_conn *ct = NULL;
-	enum ip_conntrack_info uninitialized_var(ctinfo);
+	enum ip_conntrack_info ctinfo;
 	struct nfnl_ct_hook *nfnl_ct;
 	bool csum_verify;
 	char *secdata = NULL;
@@ -1168,7 +1168,7 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
 	struct nfqnl_instance *queue;
 	unsigned int verdict;
 	struct nf_queue_entry *entry;
-	enum ip_conntrack_info uninitialized_var(ctinfo);
+	enum ip_conntrack_info ctinfo;
 	struct nfnl_ct_hook *nfnl_ct;
 	struct nf_conn *ct = NULL;
 	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 80ae7b9fa90a..354c3cc90741 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -225,7 +225,7 @@ static u32 flow_get_skgid(const struct sk_buff *skb)
 
 static u32 flow_get_vlan_tag(const struct sk_buff *skb)
 {
-	u16 uninitialized_var(tag);
+	u16 tag;
 
 	if (vlan_get_tag(skb, &tag) < 0)
 		return 0;
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index ca813697728e..4f823a8223d8 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -1692,7 +1692,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 {
 	struct cake_sched_data *q = qdisc_priv(sch);
 	int len = qdisc_pkt_len(skb);
-	int uninitialized_var(ret);
+	int ret;
 	struct sk_buff *ack = NULL;
 	ktime_t now = ktime_get();
 	struct cake_tin_data *b;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 39b427dc7512..ce4519358106 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -360,7 +360,7 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	    struct sk_buff **to_free)
 {
 	struct cbq_sched_data *q = qdisc_priv(sch);
-	int uninitialized_var(ret);
+	int ret;
 	struct cbq_class *cl = cbq_classify(skb, sch, &ret);
 
 #ifdef CONFIG_NET_CLS_ACT
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 459a784056c0..985d5208f563 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -187,7 +187,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	struct fq_codel_sched_data *q = qdisc_priv(sch);
 	unsigned int idx, prev_backlog, prev_qlen;
 	struct fq_codel_flow *flow;
-	int uninitialized_var(ret);
+	int ret;
 	unsigned int pkt_len;
 	bool memory_limited;
 
diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
index fb760cee824e..4d307f17e084 100644
--- a/net/sched/sch_fq_pie.c
+++ b/net/sched/sch_fq_pie.c
@@ -130,7 +130,7 @@ static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 {
 	struct fq_pie_sched_data *q = qdisc_priv(sch);
 	struct fq_pie_flow *sel_flow;
-	int uninitialized_var(ret);
+	int ret;
 	u8 memory_limited = false;
 	u8 enqueue = false;
 	u32 pkt_len;
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 433f2190960f..92ad4115e473 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1533,7 +1533,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
 {
 	unsigned int len = qdisc_pkt_len(skb);
 	struct hfsc_class *cl;
-	int uninitialized_var(err);
+	int err;
 	bool first;
 
 	cl = hfsc_classify(skb, sch, &err);
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 8184c87da8be..6feab225b4ba 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -579,7 +579,7 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		       struct sk_buff **to_free)
 {
-	int uninitialized_var(ret);
+	int ret;
 	unsigned int len = qdisc_pkt_len(skb);
 	struct htb_sched *q = qdisc_priv(sch);
 	struct htb_class *cl = htb_classify(skb, sch, &ret);
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 5a6def5e4e6d..15f400dcb400 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -349,7 +349,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
 	unsigned int hash, dropped;
 	sfq_index x, qlen;
 	struct sfq_slot *slot;
-	int uninitialized_var(ret);
+	int ret;
 	struct sk_buff *head;
 	int delta;
 
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 5c4ec9386f81..b97947e9ca45 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -563,7 +563,7 @@ static int svc_udp_sendto(struct svc_rqst *rqstp)
 		.msg_control	= cmh,
 		.msg_controllen	= sizeof(buffer),
 	};
-	unsigned int uninitialized_var(sent);
+	unsigned int sent;
 	int err;
 
 	svc_udp_release_rqst(rqstp);
@@ -1080,7 +1080,7 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp)
 	struct msghdr msg = {
 		.msg_flags	= 0,
 	};
-	unsigned int uninitialized_var(sent);
+	unsigned int sent;
 	int err;
 
 	svc_tcp_release_rqst(rqstp);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 914508ea9b84..c57aef829403 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -496,8 +496,8 @@ xs_read_stream_request(struct sock_xprt *transport, struct msghdr *msg,
 		int flags, struct rpc_rqst *req)
 {
 	struct xdr_buf *buf = &req->rq_private_buf;
-	size_t want, uninitialized_var(read);
-	ssize_t uninitialized_var(ret);
+	size_t want, read;
+	ssize_t ret;
 
 	xs_read_header(transport, buf);
 
@@ -844,7 +844,7 @@ static int xs_local_send_request(struct rpc_rqst *req)
 	struct msghdr msg = {
 		.msg_flags	= XS_SENDMSG_FLAGS,
 	};
-	unsigned int uninitialized_var(sent);
+	unsigned int sent;
 	int status;
 
 	/* Close the stream if the previous transmission was incomplete */
@@ -915,7 +915,7 @@ static int xs_udp_send_request(struct rpc_rqst *req)
 		.msg_namelen	= xprt->addrlen,
 		.msg_flags	= XS_SENDMSG_FLAGS,
 	};
-	unsigned int uninitialized_var(sent);
+	unsigned int sent;
 	int status;
 
 	xs_pktdump("packet data:",
@@ -999,7 +999,7 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
 		.msg_flags	= XS_SENDMSG_FLAGS,
 	};
 	bool vm_wait = false;
-	unsigned int uninitialized_var(sent);
+	unsigned int sent;
 	int status;
 
 	/* Close the stream if the previous transmission was incomplete */
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 24f64bc0de18..710bd44eaa49 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -670,7 +670,7 @@ static int tls_push_record(struct sock *sk, int flags,
 	struct tls_prot_info *prot = &tls_ctx->prot_info;
 	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
 	struct tls_rec *rec = ctx->open_rec, *tmp = NULL;
-	u32 i, split_point, uninitialized_var(orig_end);
+	u32 i, split_point, orig_end;
 	struct sk_msg *msg_pl, *msg_en;
 	struct aead_request *req;
 	bool split;
diff --git a/sound/core/control_compat.c b/sound/core/control_compat.c
index d55be1db1a8a..02df1d7db9a1 100644
--- a/sound/core/control_compat.c
+++ b/sound/core/control_compat.c
@@ -223,7 +223,7 @@ static int copy_ctl_value_from_user(struct snd_card *card,
 {
 	struct snd_ctl_elem_value32 __user *data32 = userdata;
 	int i, type, size;
-	int uninitialized_var(count);
+	int count;
 	unsigned int indirect;
 
 	if (copy_from_user(&data->id, &data32->id, sizeof(data->id)))
diff --git a/sound/isa/sb/sb16_csp.c b/sound/isa/sb/sb16_csp.c
index 4ad0ff0c4508..270af863e198 100644
--- a/sound/isa/sb/sb16_csp.c
+++ b/sound/isa/sb/sb16_csp.c
@@ -102,7 +102,7 @@ static void info_read(struct snd_info_entry *entry, struct snd_info_buffer *buff
 int snd_sb_csp_new(struct snd_sb *chip, int device, struct snd_hwdep ** rhwdep)
 {
 	struct snd_sb_csp *p;
-	int uninitialized_var(version);
+	int version;
 	int err;
 	struct snd_hwdep *hw;
 
diff --git a/sound/usb/endpoint.c b/sound/usb/endpoint.c
index 9bea7d3f99f8..9468a2de8c9b 100644
--- a/sound/usb/endpoint.c
+++ b/sound/usb/endpoint.c
@@ -335,7 +335,7 @@ static void queue_pending_output_urbs(struct snd_usb_endpoint *ep)
 	while (test_bit(EP_FLAG_RUNNING, &ep->flags)) {
 
 		unsigned long flags;
-		struct snd_usb_packet_info *uninitialized_var(packet);
+		struct snd_usb_packet_info *packet;
 		struct snd_urb_ctx *ctx = NULL;
 		int err, i;
 
-- 
cgit v1.2.3


From 3d2e83a2a6a0657c1cf145fa6ba23620715d6c36 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 17 Jul 2020 16:05:41 +0200
Subject: timers: Preserve higher bits of expiration on index calculation

The higher bits of the timer expiration are cropped while calling
calc_index() due to the implicit cast from unsigned long to unsigned int.

This loss shouldn't have consequences on the current code since all the
computation to calculate the index is done on the lower 32 bits.

However to prepare for returning the actual bucket expiration from
calc_index() in order to properly fix base->next_expiry updates, the higher
bits need to be preserved.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20200717140551.29076-3-frederic@kernel.org
---
 kernel/time/timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index df1ff803acc4..bcdc3045138d 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -487,7 +487,7 @@ static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
  * Helper function to calculate the array index for a given expiry
  * time.
  */
-static inline unsigned calc_index(unsigned expires, unsigned lvl)
+static inline unsigned calc_index(unsigned long expires, unsigned lvl)
 {
 	expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
 	return LVL_OFFS(lvl) + (expires & LVL_MASK);
-- 
cgit v1.2.3


From 1f32cab0db4bdf6491eb4a60838f278e01c31698 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Fri, 17 Jul 2020 16:05:42 +0200
Subject: timers: Use only bucket expiry for base->next_expiry value

The bucket expiry time is the effective expriy time of timers and is
greater than or equal to the requested timer expiry time. This is due
to the guarantee that timers never expire early and the reduced expiry
granularity in the secondary wheel levels.

When a timer is enqueued, trigger_dyntick_cpu() checks whether the
timer is the new first timer. This check compares next_expiry with
the requested timer expiry value and not with the effective expiry
value of the bucket into which the timer was queued.

Storing the requested timer expiry value in base->next_expiry can lead
to base->clk going backwards if the requested timer expiry value is
smaller than base->clk. Commit 30c66fc30ee7 ("timer: Prevent base->clk
from moving backward") worked around this by preventing the store when
timer->expiry is before base->clk, but did not fix the underlying
problem.

Use the expiry value of the bucket into which the timer is queued to
do the new first timer check. This fixes the base->clk going backward
problem.

The workaround of commit 30c66fc30ee7 ("timer: Prevent base->clk from
moving backward") in trigger_dyntick_cpu() is not longer necessary as the
timers bucket expiry is guaranteed to be greater than or equal base->clk.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20200717140551.29076-4-frederic@kernel.org
---
 kernel/time/timer.c | 64 ++++++++++++++++++++++++++++-------------------------
 1 file changed, 34 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index bcdc3045138d..a7a3cf737411 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -487,35 +487,39 @@ static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
  * Helper function to calculate the array index for a given expiry
  * time.
  */
-static inline unsigned calc_index(unsigned long expires, unsigned lvl)
+static inline unsigned calc_index(unsigned long expires, unsigned lvl,
+				  unsigned long *bucket_expiry)
 {
 	expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
+	*bucket_expiry = expires << LVL_SHIFT(lvl);
 	return LVL_OFFS(lvl) + (expires & LVL_MASK);
 }
 
-static int calc_wheel_index(unsigned long expires, unsigned long clk)
+static int calc_wheel_index(unsigned long expires, unsigned long clk,
+			    unsigned long *bucket_expiry)
 {
 	unsigned long delta = expires - clk;
 	unsigned int idx;
 
 	if (delta < LVL_START(1)) {
-		idx = calc_index(expires, 0);
+		idx = calc_index(expires, 0, bucket_expiry);
 	} else if (delta < LVL_START(2)) {
-		idx = calc_index(expires, 1);
+		idx = calc_index(expires, 1, bucket_expiry);
 	} else if (delta < LVL_START(3)) {
-		idx = calc_index(expires, 2);
+		idx = calc_index(expires, 2, bucket_expiry);
 	} else if (delta < LVL_START(4)) {
-		idx = calc_index(expires, 3);
+		idx = calc_index(expires, 3, bucket_expiry);
 	} else if (delta < LVL_START(5)) {
-		idx = calc_index(expires, 4);
+		idx = calc_index(expires, 4, bucket_expiry);
 	} else if (delta < LVL_START(6)) {
-		idx = calc_index(expires, 5);
+		idx = calc_index(expires, 5, bucket_expiry);
 	} else if (delta < LVL_START(7)) {
-		idx = calc_index(expires, 6);
+		idx = calc_index(expires, 6, bucket_expiry);
 	} else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
-		idx = calc_index(expires, 7);
+		idx = calc_index(expires, 7, bucket_expiry);
 	} else if ((long) delta < 0) {
 		idx = clk & LVL_MASK;
+		*bucket_expiry = clk;
 	} else {
 		/*
 		 * Force expire obscene large timeouts to expire at the
@@ -524,7 +528,7 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk)
 		if (delta >= WHEEL_TIMEOUT_CUTOFF)
 			expires = clk + WHEEL_TIMEOUT_MAX;
 
-		idx = calc_index(expires, LVL_DEPTH - 1);
+		idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry);
 	}
 	return idx;
 }
@@ -544,16 +548,18 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
 }
 
 static void
-__internal_add_timer(struct timer_base *base, struct timer_list *timer)
+__internal_add_timer(struct timer_base *base, struct timer_list *timer,
+		     unsigned long *bucket_expiry)
 {
 	unsigned int idx;
 
-	idx = calc_wheel_index(timer->expires, base->clk);
+	idx = calc_wheel_index(timer->expires, base->clk, bucket_expiry);
 	enqueue_timer(base, timer, idx);
 }
 
 static void
-trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
+trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer,
+		    unsigned long bucket_expiry)
 {
 	if (!is_timers_nohz_active())
 		return;
@@ -576,31 +582,29 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
 	if (!base->is_idle)
 		return;
 
-	/* Check whether this is the new first expiring timer: */
-	if (time_after_eq(timer->expires, base->next_expiry))
+	/*
+	 * Check whether this is the new first expiring timer. The
+	 * effective expiry time of the timer is required here
+	 * (bucket_expiry) instead of timer->expires.
+	 */
+	if (time_after_eq(bucket_expiry, base->next_expiry))
 		return;
 
 	/*
 	 * Set the next expiry time and kick the CPU so it can reevaluate the
 	 * wheel:
 	 */
-	if (time_before(timer->expires, base->clk)) {
-		/*
-		 * Prevent from forward_timer_base() moving the base->clk
-		 * backward
-		 */
-		base->next_expiry = base->clk;
-	} else {
-		base->next_expiry = timer->expires;
-	}
+	base->next_expiry = bucket_expiry;
 	wake_up_nohz_cpu(base->cpu);
 }
 
 static void
 internal_add_timer(struct timer_base *base, struct timer_list *timer)
 {
-	__internal_add_timer(base, timer);
-	trigger_dyntick_cpu(base, timer);
+	unsigned long bucket_expiry;
+
+	__internal_add_timer(base, timer, &bucket_expiry);
+	trigger_dyntick_cpu(base, timer, bucket_expiry);
 }
 
 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
@@ -959,9 +963,9 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
 static inline int
 __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
 {
+	unsigned long clk = 0, flags, bucket_expiry;
 	struct timer_base *base, *new_base;
 	unsigned int idx = UINT_MAX;
-	unsigned long clk = 0, flags;
 	int ret = 0;
 
 	BUG_ON(!timer->function);
@@ -1000,7 +1004,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
 		}
 
 		clk = base->clk;
-		idx = calc_wheel_index(expires, clk);
+		idx = calc_wheel_index(expires, clk, &bucket_expiry);
 
 		/*
 		 * Retrieve and compare the array index of the pending
@@ -1059,7 +1063,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
 	 */
 	if (idx != UINT_MAX && clk == base->clk) {
 		enqueue_timer(base, timer, idx);
-		trigger_dyntick_cpu(base, timer);
+		trigger_dyntick_cpu(base, timer, bucket_expiry);
 	} else {
 		internal_add_timer(base, timer);
 	}
-- 
cgit v1.2.3


From 9a2b764b06c880678416d803d027f575ae40ec99 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 17 Jul 2020 16:05:43 +0200
Subject: timers: Move trigger_dyntick_cpu() to enqueue_timer()

Consolidate the code by calling trigger_dyntick_cpu() from
enqueue_timer() instead of calling it from all its callers.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lkml.kernel.org/r/20200717140551.29076-5-frederic@kernel.org
---
 kernel/time/timer.c | 61 ++++++++++++++++++++++-------------------------------
 1 file changed, 25 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index a7a3cf737411..2af08a169564 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -533,30 +533,6 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk,
 	return idx;
 }
 
-/*
- * Enqueue the timer into the hash bucket, mark it pending in
- * the bitmap and store the index in the timer flags.
- */
-static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
-			  unsigned int idx)
-{
-	hlist_add_head(&timer->entry, base->vectors + idx);
-	__set_bit(idx, base->pending_map);
-	timer_set_idx(timer, idx);
-
-	trace_timer_start(timer, timer->expires, timer->flags);
-}
-
-static void
-__internal_add_timer(struct timer_base *base, struct timer_list *timer,
-		     unsigned long *bucket_expiry)
-{
-	unsigned int idx;
-
-	idx = calc_wheel_index(timer->expires, base->clk, bucket_expiry);
-	enqueue_timer(base, timer, idx);
-}
-
 static void
 trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer,
 		    unsigned long bucket_expiry)
@@ -598,15 +574,31 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer,
 	wake_up_nohz_cpu(base->cpu);
 }
 
-static void
-internal_add_timer(struct timer_base *base, struct timer_list *timer)
+/*
+ * Enqueue the timer into the hash bucket, mark it pending in
+ * the bitmap, store the index in the timer flags then wake up
+ * the target CPU if needed.
+ */
+static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
+			  unsigned int idx, unsigned long bucket_expiry)
 {
-	unsigned long bucket_expiry;
+	hlist_add_head(&timer->entry, base->vectors + idx);
+	__set_bit(idx, base->pending_map);
+	timer_set_idx(timer, idx);
 
-	__internal_add_timer(base, timer, &bucket_expiry);
+	trace_timer_start(timer, timer->expires, timer->flags);
 	trigger_dyntick_cpu(base, timer, bucket_expiry);
 }
 
+static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
+{
+	unsigned long bucket_expiry;
+	unsigned int idx;
+
+	idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry);
+	enqueue_timer(base, timer, idx, bucket_expiry);
+}
+
 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
 
 static struct debug_obj_descr timer_debug_descr;
@@ -1057,16 +1049,13 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
 	/*
 	 * If 'idx' was calculated above and the base time did not advance
 	 * between calculating 'idx' and possibly switching the base, only
-	 * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise
-	 * we need to (re)calculate the wheel index via
-	 * internal_add_timer().
+	 * enqueue_timer() is required. Otherwise we need to (re)calculate
+	 * the wheel index via internal_add_timer().
 	 */
-	if (idx != UINT_MAX && clk == base->clk) {
-		enqueue_timer(base, timer, idx);
-		trigger_dyntick_cpu(base, timer, bucket_expiry);
-	} else {
+	if (idx != UINT_MAX && clk == base->clk)
+		enqueue_timer(base, timer, idx, bucket_expiry);
+	else
 		internal_add_timer(base, timer);
-	}
 
 out_unlock:
 	raw_spin_unlock_irqrestore(&base->lock, flags);
-- 
cgit v1.2.3


From 4468897211628865ee2392acb5ad281f74176f63 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 17 Jul 2020 16:05:44 +0200
Subject: timers: Add comments about calc_index() ceiling work

calc_index() adds 1 unit of the level granularity to the expiry passed
in parameter to ensure that the timer doesn't expire too early. Add a
comment to explain that and the resulting layout in the wheel.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lkml.kernel.org/r/20200717140551.29076-6-frederic@kernel.org
---
 kernel/time/timer.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2af08a169564..af1c08b0b168 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -156,7 +156,8 @@ EXPORT_SYMBOL(jiffies_64);
 
 /*
  * The time start value for each level to select the bucket at enqueue
- * time.
+ * time. We start from the last possible delta of the previous level
+ * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()).
  */
 #define LVL_START(n)	((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
 
@@ -490,6 +491,15 @@ static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
 static inline unsigned calc_index(unsigned long expires, unsigned lvl,
 				  unsigned long *bucket_expiry)
 {
+
+	/*
+	 * The timer wheel has to guarantee that a timer does not fire
+	 * early. Early expiry can happen due to:
+	 * - Timer is armed at the edge of a tick
+	 * - Truncation of the expiry time in the outer wheel levels
+	 *
+	 * Round up with level granularity to prevent this.
+	 */
 	expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
 	*bucket_expiry = expires << LVL_SHIFT(lvl);
 	return LVL_OFFS(lvl) + (expires & LVL_MASK);
-- 
cgit v1.2.3


From 001ec1b3925da0d51847c23fc0aa4129282db526 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 17 Jul 2020 16:05:45 +0200
Subject: timers: Optimize _next_timer_interrupt() level iteration

If a level has a timer that expires before reaching the next level, there
is no need to iterate further.

The next level is reached when the 3 lower bits of the current level are
cleared. If the next event happens before/during that, the next levels
won't provide an earlier expiration.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lkml.kernel.org/r/20200717140551.29076-7-frederic@kernel.org
---
 kernel/time/timer.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index af1c08b0b168..9abc41715fd2 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1526,6 +1526,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
 	clk = base->clk;
 	for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
 		int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
+		unsigned long lvl_clk = clk & LVL_CLK_MASK;
 
 		if (pos >= 0) {
 			unsigned long tmp = clk + (unsigned long) pos;
@@ -1533,6 +1534,13 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
 			tmp <<= LVL_SHIFT(lvl);
 			if (time_before(tmp, next))
 				next = tmp;
+
+			/*
+			 * If the next expiration happens before we reach
+			 * the next level, no need to check further.
+			 */
+			if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK))
+				break;
 		}
 		/*
 		 * Clock for the next level. If the current level clock lower
@@ -1570,7 +1578,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
 		 * So the simple check whether the lower bits of the current
 		 * level are 0 or not is sufficient for all cases.
 		 */
-		adj = clk & LVL_CLK_MASK ? 1 : 0;
+		adj = lvl_clk ? 1 : 0;
 		clk >>= LVL_CLK_SHIFT;
 		clk += adj;
 	}
-- 
cgit v1.2.3


From dc2a0f1fb2a06df09f5094f29aea56b763aa7cca Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 17 Jul 2020 16:05:46 +0200
Subject: timers: Always keep track of next expiry

So far next expiry was only tracked while the CPU was in nohz_idle mode
in order to cope with missing ticks that can't increment the base->clk
periodically anymore.

This logic is going to be expanded beyond nohz in order to spare timer
softirqs so do it unconditionally.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lkml.kernel.org/r/20200717140551.29076-8-frederic@kernel.org
---
 kernel/time/timer.c | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 9abc41715fd2..76fd9644638b 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -544,8 +544,7 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk,
 }
 
 static void
-trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer,
-		    unsigned long bucket_expiry)
+trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
 {
 	if (!is_timers_nohz_active())
 		return;
@@ -565,23 +564,8 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer,
 	 * timer is not deferrable. If the other CPU is on the way to idle
 	 * then it can't set base->is_idle as we hold the base lock:
 	 */
-	if (!base->is_idle)
-		return;
-
-	/*
-	 * Check whether this is the new first expiring timer. The
-	 * effective expiry time of the timer is required here
-	 * (bucket_expiry) instead of timer->expires.
-	 */
-	if (time_after_eq(bucket_expiry, base->next_expiry))
-		return;
-
-	/*
-	 * Set the next expiry time and kick the CPU so it can reevaluate the
-	 * wheel:
-	 */
-	base->next_expiry = bucket_expiry;
-	wake_up_nohz_cpu(base->cpu);
+	if (base->is_idle)
+		wake_up_nohz_cpu(base->cpu);
 }
 
 /*
@@ -592,12 +576,26 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer,
 static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
 			  unsigned int idx, unsigned long bucket_expiry)
 {
+
 	hlist_add_head(&timer->entry, base->vectors + idx);
 	__set_bit(idx, base->pending_map);
 	timer_set_idx(timer, idx);
 
 	trace_timer_start(timer, timer->expires, timer->flags);
-	trigger_dyntick_cpu(base, timer, bucket_expiry);
+
+	/*
+	 * Check whether this is the new first expiring timer. The
+	 * effective expiry time of the timer is required here
+	 * (bucket_expiry) instead of timer->expires.
+	 */
+	if (time_before(bucket_expiry, base->next_expiry)) {
+		/*
+		 * Set the next expiry time and kick the CPU so it
+		 * can reevaluate the wheel:
+		 */
+		base->next_expiry = bucket_expiry;
+		trigger_dyntick_cpu(base, timer);
+	}
 }
 
 static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
@@ -1493,7 +1491,6 @@ static int __collect_expired_timers(struct timer_base *base,
 	return levels;
 }
 
-#ifdef CONFIG_NO_HZ_COMMON
 /*
  * Find the next pending bucket of a level. Search from level start (@offset)
  * + @clk upwards and if nothing there, search from start of the level
@@ -1585,6 +1582,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
 	return next;
 }
 
+#ifdef CONFIG_NO_HZ_COMMON
 /*
  * Check, if the next hrtimer event is before the next timer wheel
  * event:
@@ -1790,6 +1788,7 @@ static inline void __run_timers(struct timer_base *base)
 
 		levels = collect_expired_timers(base, heads);
 		base->clk++;
+		base->next_expiry = __next_timer_interrupt(base);
 
 		while (levels--)
 			expire_timers(base, heads + levels);
@@ -2042,6 +2041,7 @@ static void __init init_timer_cpu(int cpu)
 		base->cpu = cpu;
 		raw_spin_lock_init(&base->lock);
 		base->clk = jiffies;
+		base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
 		timer_base_init_expiry_lock(base);
 	}
 }
-- 
cgit v1.2.3


From 90d52f65f303091be17b5f4ffab7090b2064b4a1 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 17 Jul 2020 16:05:47 +0200
Subject: timers: Reuse next expiry cache after nohz exit

Now that the next expiry it tracked unconditionally when a timer is added,
this information can be reused on a tick firing after exiting nohz.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lkml.kernel.org/r/20200717140551.29076-9-frederic@kernel.org
---
 kernel/time/timer.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 76fd9644638b..13f48ee708aa 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1706,13 +1706,11 @@ static int collect_expired_timers(struct timer_base *base,
 	 * the next expiring timer.
 	 */
 	if ((long)(now - base->clk) > 2) {
-		unsigned long next = __next_timer_interrupt(base);
-
 		/*
 		 * If the next timer is ahead of time forward to current
 		 * jiffies, otherwise forward to the next expiry time:
 		 */
-		if (time_after(next, now)) {
+		if (time_after(base->next_expiry, now)) {
 			/*
 			 * The call site will increment base->clk and then
 			 * terminate the expiry loop immediately.
@@ -1720,7 +1718,7 @@ static int collect_expired_timers(struct timer_base *base,
 			base->clk = now;
 			return 0;
 		}
-		base->clk = next;
+		base->clk = base->next_expiry;
 	}
 	return __collect_expired_timers(base, heads);
 }
-- 
cgit v1.2.3


From 1f8a4212dc83f8353843fabf6465fd918372fbbf Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 17 Jul 2020 16:05:48 +0200
Subject: timers: Expand clk forward logic beyond nohz

As for next_expiry, the base->clk catch up logic will be expanded beyond
NOHZ in order to avoid triggering useless softirqs.

If softirqs should only fire to expire pending timers, periodic base->clk
increments must be skippable for random amounts of time.  Therefore prepare
to catch-up with missing updates whenever an up-to-date base clock is
needed.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lkml.kernel.org/r/20200717140551.29076-10-frederic@kernel.org
---
 kernel/time/timer.c | 26 ++++----------------------
 1 file changed, 4 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 13f48ee708aa..1be92b53b75f 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -888,19 +888,12 @@ get_target_base(struct timer_base *base, unsigned tflags)
 
 static inline void forward_timer_base(struct timer_base *base)
 {
-#ifdef CONFIG_NO_HZ_COMMON
 	unsigned long jnow;
 
-	/*
-	 * We only forward the base when we are idle or have just come out of
-	 * idle (must_forward_clk logic), and have a delta between base clock
-	 * and jiffies. In the common case, run_timers will take care of it.
-	 */
-	if (likely(!base->must_forward_clk))
+	if (!base->must_forward_clk)
 		return;
 
 	jnow = READ_ONCE(jiffies);
-	base->must_forward_clk = base->is_idle;
 	if ((long)(jnow - base->clk) < 2)
 		return;
 
@@ -915,7 +908,6 @@ static inline void forward_timer_base(struct timer_base *base)
 			return;
 		base->clk = base->next_expiry;
 	}
-#endif
 }
 
 
@@ -1667,10 +1659,8 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 		 * logic is only maintained for the BASE_STD base, deferrable
 		 * timers may still see large granularity skew (by design).
 		 */
-		if ((expires - basem) > TICK_NSEC) {
-			base->must_forward_clk = true;
+		if ((expires - basem) > TICK_NSEC)
 			base->is_idle = true;
-		}
 	}
 	raw_spin_unlock(&base->lock);
 
@@ -1769,16 +1759,7 @@ static inline void __run_timers(struct timer_base *base)
 	/*
 	 * timer_base::must_forward_clk must be cleared before running
 	 * timers so that any timer functions that call mod_timer() will
-	 * not try to forward the base. Idle tracking / clock forwarding
-	 * logic is only used with BASE_STD timers.
-	 *
-	 * The must_forward_clk flag is cleared unconditionally also for
-	 * the deferrable base. The deferrable base is not affected by idle
-	 * tracking and never forwarded, so clearing the flag is a NOOP.
-	 *
-	 * The fact that the deferrable base is never forwarded can cause
-	 * large variations in granularity for deferrable timers, but they
-	 * can be deferred for long periods due to idle anyway.
+	 * not try to forward the base.
 	 */
 	base->must_forward_clk = false;
 
@@ -1791,6 +1772,7 @@ static inline void __run_timers(struct timer_base *base)
 		while (levels--)
 			expire_timers(base, heads + levels);
 	}
+	base->must_forward_clk = true;
 	raw_spin_unlock_irq(&base->lock);
 	timer_base_unlock_expiry(base);
 }
-- 
cgit v1.2.3


From d4f7dae87096dfe722bf32aa82076ece1063746c Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 17 Jul 2020 16:05:49 +0200
Subject: timers: Spare timer softirq until next expiry

Now that the core timer infrastructure doesn't depend anymore on
periodic base->clk increments, even when the CPU is not in NO_HZ mode,
timer softirqs can be skipped until there are timers to expire.

Some spurious softirqs can still remain since base->next_expiry doesn't
keep track of canceled timers but this still reduces the number of softirqs
significantly: ~15 times less for HZ=1000 and ~5 times less for HZ=100.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lkml.kernel.org/r/20200717140551.29076-11-frederic@kernel.org
---
 kernel/time/timer.c | 49 ++++++++-----------------------------------------
 1 file changed, 8 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 1be92b53b75f..4f78a7bff9e1 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1458,10 +1458,10 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
 	}
 }
 
-static int __collect_expired_timers(struct timer_base *base,
-				    struct hlist_head *heads)
+static int collect_expired_timers(struct timer_base *base,
+				  struct hlist_head *heads)
 {
-	unsigned long clk = base->clk;
+	unsigned long clk = base->clk = base->next_expiry;
 	struct hlist_head *vec;
 	int i, levels = 0;
 	unsigned int idx;
@@ -1684,40 +1684,6 @@ void timer_clear_idle(void)
 	 */
 	base->is_idle = false;
 }
-
-static int collect_expired_timers(struct timer_base *base,
-				  struct hlist_head *heads)
-{
-	unsigned long now = READ_ONCE(jiffies);
-
-	/*
-	 * NOHZ optimization. After a long idle sleep we need to forward the
-	 * base to current jiffies. Avoid a loop by searching the bitfield for
-	 * the next expiring timer.
-	 */
-	if ((long)(now - base->clk) > 2) {
-		/*
-		 * If the next timer is ahead of time forward to current
-		 * jiffies, otherwise forward to the next expiry time:
-		 */
-		if (time_after(base->next_expiry, now)) {
-			/*
-			 * The call site will increment base->clk and then
-			 * terminate the expiry loop immediately.
-			 */
-			base->clk = now;
-			return 0;
-		}
-		base->clk = base->next_expiry;
-	}
-	return __collect_expired_timers(base, heads);
-}
-#else
-static inline int collect_expired_timers(struct timer_base *base,
-					 struct hlist_head *heads)
-{
-	return __collect_expired_timers(base, heads);
-}
 #endif
 
 /*
@@ -1750,7 +1716,7 @@ static inline void __run_timers(struct timer_base *base)
 	struct hlist_head heads[LVL_DEPTH];
 	int levels;
 
-	if (!time_after_eq(jiffies, base->clk))
+	if (time_before(jiffies, base->next_expiry))
 		return;
 
 	timer_base_lock_expiry(base);
@@ -1763,7 +1729,8 @@ static inline void __run_timers(struct timer_base *base)
 	 */
 	base->must_forward_clk = false;
 
-	while (time_after_eq(jiffies, base->clk)) {
+	while (time_after_eq(jiffies, base->clk) &&
+	       time_after_eq(jiffies, base->next_expiry)) {
 
 		levels = collect_expired_timers(base, heads);
 		base->clk++;
@@ -1798,12 +1765,12 @@ void run_local_timers(void)
 
 	hrtimer_run_queues();
 	/* Raise the softirq only if required. */
-	if (time_before(jiffies, base->clk)) {
+	if (time_before(jiffies, base->next_expiry)) {
 		if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
 			return;
 		/* CPU is awake, so check the deferrable base. */
 		base++;
-		if (time_before(jiffies, base->clk))
+		if (time_before(jiffies, base->next_expiry))
 			return;
 	}
 	raise_softirq(TIMER_SOFTIRQ);
-- 
cgit v1.2.3


From 0975fb565b8b8f9e0c96d0de39fcb954833ea5e0 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 17 Jul 2020 16:05:50 +0200
Subject: timers: Remove must_forward_clk

There is no reason to keep this guard around. The code makes sure that
base->clk remains sane and won't be forwarded beyond jiffies nor set
backward.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lkml.kernel.org/r/20200717140551.29076-12-frederic@kernel.org
---
 kernel/time/timer.c | 22 ++++++----------------
 1 file changed, 6 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 4f78a7bff9e1..8b3fb52d8c47 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -205,7 +205,6 @@ struct timer_base {
 	unsigned long		next_expiry;
 	unsigned int		cpu;
 	bool			is_idle;
-	bool			must_forward_clk;
 	DECLARE_BITMAP(pending_map, WHEEL_SIZE);
 	struct hlist_head	vectors[WHEEL_SIZE];
 } ____cacheline_aligned;
@@ -888,12 +887,13 @@ get_target_base(struct timer_base *base, unsigned tflags)
 
 static inline void forward_timer_base(struct timer_base *base)
 {
-	unsigned long jnow;
+	unsigned long jnow = READ_ONCE(jiffies);
 
-	if (!base->must_forward_clk)
-		return;
-
-	jnow = READ_ONCE(jiffies);
+	/*
+	 * No need to forward if we are close enough below jiffies.
+	 * Also while executing timers, base->clk is 1 offset ahead
+	 * of jiffies to avoid endless requeuing to current jffies.
+	 */
 	if ((long)(jnow - base->clk) < 2)
 		return;
 
@@ -1722,16 +1722,8 @@ static inline void __run_timers(struct timer_base *base)
 	timer_base_lock_expiry(base);
 	raw_spin_lock_irq(&base->lock);
 
-	/*
-	 * timer_base::must_forward_clk must be cleared before running
-	 * timers so that any timer functions that call mod_timer() will
-	 * not try to forward the base.
-	 */
-	base->must_forward_clk = false;
-
 	while (time_after_eq(jiffies, base->clk) &&
 	       time_after_eq(jiffies, base->next_expiry)) {
-
 		levels = collect_expired_timers(base, heads);
 		base->clk++;
 		base->next_expiry = __next_timer_interrupt(base);
@@ -1739,7 +1731,6 @@ static inline void __run_timers(struct timer_base *base)
 		while (levels--)
 			expire_timers(base, heads + levels);
 	}
-	base->must_forward_clk = true;
 	raw_spin_unlock_irq(&base->lock);
 	timer_base_unlock_expiry(base);
 }
@@ -1935,7 +1926,6 @@ int timers_prepare_cpu(unsigned int cpu)
 		base->clk = jiffies;
 		base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
 		base->is_idle = false;
-		base->must_forward_clk = true;
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From 36cd28a4cdd05d47ccb62a2d86e8f37839cc879a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 17 Jul 2020 16:05:51 +0200
Subject: timers: Lower base clock forwarding threshold

There is nothing that prevents from forwarding the base clock if it's one
jiffy off. The reason for this arbitrary limit of two jiffies is historical
and does not longer exist.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lkml.kernel.org/r/20200717140551.29076-13-frederic@kernel.org
---
 kernel/time/timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 8b3fb52d8c47..77e21e98ec32 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -894,7 +894,7 @@ static inline void forward_timer_base(struct timer_base *base)
 	 * Also while executing timers, base->clk is 1 offset ahead
 	 * of jiffies to avoid endless requeuing to current jffies.
 	 */
-	if ((long)(jnow - base->clk) < 2)
+	if ((long)(jnow - base->clk) < 1)
 		return;
 
 	/*
-- 
cgit v1.2.3


From 9180bd467f9abdb44afde650d07e3b9dd66d837c Mon Sep 17 00:00:00 2001
From: André Almeida <andrealmeid@collabora.com>
Date: Thu, 2 Jul 2020 17:28:40 -0300
Subject: futex: Remove put_futex_key()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since 4b39f99c ("futex: Remove {get,drop}_futex_key_refs()"),
put_futex_key() is empty.

Remove all references for this function and the then redundant labels.

Signed-off-by: André Almeida <andrealmeid@collabora.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20200702202843.520764-2-andrealmeid@collabora.com
---
 kernel/futex.c | 61 ++++++++++++----------------------------------------------
 1 file changed, 12 insertions(+), 49 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index e646661f6282..bd9adfca5d51 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -677,10 +677,6 @@ out:
 	return err;
 }
 
-static inline void put_futex_key(union futex_key *key)
-{
-}
-
 /**
  * fault_in_user_writeable() - Fault in user address and verify RW access
  * @uaddr:	pointer to faulting user space address
@@ -1617,7 +1613,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 
 	/* Make sure we really have tasks to wakeup */
 	if (!hb_waiters_pending(hb))
-		goto out_put_key;
+		goto out;
 
 	spin_lock(&hb->lock);
 
@@ -1640,8 +1636,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 
 	spin_unlock(&hb->lock);
 	wake_up_q(&wake_q);
-out_put_key:
-	put_futex_key(&key);
 out:
 	return ret;
 }
@@ -1712,7 +1706,7 @@ retry:
 		goto out;
 	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
 	if (unlikely(ret != 0))
-		goto out_put_key1;
+		goto out;
 
 	hb1 = hash_futex(&key1);
 	hb2 = hash_futex(&key2);
@@ -1730,13 +1724,13 @@ retry_private:
 			 * an MMU, but we might get them from range checking
 			 */
 			ret = op_ret;
-			goto out_put_keys;
+			goto out;
 		}
 
 		if (op_ret == -EFAULT) {
 			ret = fault_in_user_writeable(uaddr2);
 			if (ret)
-				goto out_put_keys;
+				goto out;
 		}
 
 		if (!(flags & FLAGS_SHARED)) {
@@ -1744,8 +1738,6 @@ retry_private:
 			goto retry_private;
 		}
 
-		put_futex_key(&key2);
-		put_futex_key(&key1);
 		cond_resched();
 		goto retry;
 	}
@@ -1781,10 +1773,6 @@ retry_private:
 out_unlock:
 	double_unlock_hb(hb1, hb2);
 	wake_up_q(&wake_q);
-out_put_keys:
-	put_futex_key(&key2);
-out_put_key1:
-	put_futex_key(&key1);
 out:
 	return ret;
 }
@@ -1996,7 +1984,7 @@ retry:
 	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
 			    requeue_pi ? FUTEX_WRITE : FUTEX_READ);
 	if (unlikely(ret != 0))
-		goto out_put_key1;
+		goto out;
 
 	/*
 	 * The check above which compares uaddrs is not sufficient for
@@ -2004,7 +1992,7 @@ retry:
 	 */
 	if (requeue_pi && match_futex(&key1, &key2)) {
 		ret = -EINVAL;
-		goto out_put_keys;
+		goto out;
 	}
 
 	hb1 = hash_futex(&key1);
@@ -2025,13 +2013,11 @@ retry_private:
 
 			ret = get_user(curval, uaddr1);
 			if (ret)
-				goto out_put_keys;
+				goto out;
 
 			if (!(flags & FLAGS_SHARED))
 				goto retry_private;
 
-			put_futex_key(&key2);
-			put_futex_key(&key1);
 			goto retry;
 		}
 		if (curval != *cmpval) {
@@ -2090,8 +2076,6 @@ retry_private:
 		case -EFAULT:
 			double_unlock_hb(hb1, hb2);
 			hb_waiters_dec(hb2);
-			put_futex_key(&key2);
-			put_futex_key(&key1);
 			ret = fault_in_user_writeable(uaddr2);
 			if (!ret)
 				goto retry;
@@ -2106,8 +2090,6 @@ retry_private:
 			 */
 			double_unlock_hb(hb1, hb2);
 			hb_waiters_dec(hb2);
-			put_futex_key(&key2);
-			put_futex_key(&key1);
 			/*
 			 * Handle the case where the owner is in the middle of
 			 * exiting. Wait for the exit to complete otherwise
@@ -2217,10 +2199,6 @@ out_unlock:
 	wake_up_q(&wake_q);
 	hb_waiters_dec(hb2);
 
-out_put_keys:
-	put_futex_key(&key2);
-out_put_key1:
-	put_futex_key(&key1);
 out:
 	return ret ? ret : task_count;
 }
@@ -2697,7 +2675,6 @@ retry_private:
 		if (!(flags & FLAGS_SHARED))
 			goto retry_private;
 
-		put_futex_key(&q->key);
 		goto retry;
 	}
 
@@ -2707,8 +2684,6 @@ retry_private:
 	}
 
 out:
-	if (ret)
-		put_futex_key(&q->key);
 	return ret;
 }
 
@@ -2853,7 +2828,6 @@ retry_private:
 			 * - EAGAIN: The user space value changed.
 			 */
 			queue_unlock(hb);
-			put_futex_key(&q.key);
 			/*
 			 * Handle the case where the owner is in the middle of
 			 * exiting. Wait for the exit to complete otherwise
@@ -2961,13 +2935,11 @@ no_block:
 		put_pi_state(pi_state);
 	}
 
-	goto out_put_key;
+	goto out;
 
 out_unlock_put_key:
 	queue_unlock(hb);
 
-out_put_key:
-	put_futex_key(&q.key);
 out:
 	if (to) {
 		hrtimer_cancel(&to->timer);
@@ -2980,12 +2952,11 @@ uaddr_faulted:
 
 	ret = fault_in_user_writeable(uaddr);
 	if (ret)
-		goto out_put_key;
+		goto out;
 
 	if (!(flags & FLAGS_SHARED))
 		goto retry_private;
 
-	put_futex_key(&q.key);
 	goto retry;
 }
 
@@ -3114,16 +3085,13 @@ retry:
 out_unlock:
 	spin_unlock(&hb->lock);
 out_putkey:
-	put_futex_key(&key);
 	return ret;
 
 pi_retry:
-	put_futex_key(&key);
 	cond_resched();
 	goto retry;
 
 pi_faulted:
-	put_futex_key(&key);
 
 	ret = fault_in_user_writeable(uaddr);
 	if (!ret)
@@ -3265,7 +3233,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	 */
 	ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
 	if (ret)
-		goto out_key2;
+		goto out;
 
 	/*
 	 * The check above which compares uaddrs is not sufficient for
@@ -3274,7 +3242,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	if (match_futex(&q.key, &key2)) {
 		queue_unlock(hb);
 		ret = -EINVAL;
-		goto out_put_keys;
+		goto out;
 	}
 
 	/* Queue the futex_q, drop the hb lock, wait for wakeup. */
@@ -3284,7 +3252,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
 	spin_unlock(&hb->lock);
 	if (ret)
-		goto out_put_keys;
+		goto out;
 
 	/*
 	 * In order for us to be here, we know our q.key == key2, and since
@@ -3374,11 +3342,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 		ret = -EWOULDBLOCK;
 	}
 
-out_put_keys:
-	put_futex_key(&q.key);
-out_key2:
-	put_futex_key(&key2);
-
 out:
 	if (to) {
 		hrtimer_cancel(&to->timer);
-- 
cgit v1.2.3


From d7c5ed73b19c4640426d9c106f70ec2cb532034d Mon Sep 17 00:00:00 2001
From: André Almeida <andrealmeid@collabora.com>
Date: Thu, 2 Jul 2020 17:28:41 -0300
Subject: futex: Remove needless goto's
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As stated in the coding style documentation, "if there is no cleanup
needed then just return directly", instead of jumping to a label and
then returning.

Remove such goto's and replace with a return statement.  When there's a
ternary operator on the return value, replace it with the result of the
operation when it is logically possible to determine it by the control
flow.

Signed-off-by: André Almeida <andrealmeid@collabora.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20200702202843.520764-3-andrealmeid@collabora.com
---
 kernel/futex.c | 40 ++++++++++++++++------------------------
 1 file changed, 16 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index bd9adfca5d51..362fbca6d614 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1607,13 +1607,13 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 
 	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
 	if (unlikely(ret != 0))
-		goto out;
+		return ret;
 
 	hb = hash_futex(&key);
 
 	/* Make sure we really have tasks to wakeup */
 	if (!hb_waiters_pending(hb))
-		goto out;
+		return ret;
 
 	spin_lock(&hb->lock);
 
@@ -1636,7 +1636,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 
 	spin_unlock(&hb->lock);
 	wake_up_q(&wake_q);
-out:
 	return ret;
 }
 
@@ -1703,10 +1702,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
 retry:
 	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
 	if (unlikely(ret != 0))
-		goto out;
+		return ret;
 	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
 	if (unlikely(ret != 0))
-		goto out;
+		return ret;
 
 	hb1 = hash_futex(&key1);
 	hb2 = hash_futex(&key2);
@@ -1724,13 +1723,13 @@ retry_private:
 			 * an MMU, but we might get them from range checking
 			 */
 			ret = op_ret;
-			goto out;
+			return ret;
 		}
 
 		if (op_ret == -EFAULT) {
 			ret = fault_in_user_writeable(uaddr2);
 			if (ret)
-				goto out;
+				return ret;
 		}
 
 		if (!(flags & FLAGS_SHARED)) {
@@ -1773,7 +1772,6 @@ retry_private:
 out_unlock:
 	double_unlock_hb(hb1, hb2);
 	wake_up_q(&wake_q);
-out:
 	return ret;
 }
 
@@ -1980,20 +1978,18 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 retry:
 	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
 	if (unlikely(ret != 0))
-		goto out;
+		return ret;
 	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
 			    requeue_pi ? FUTEX_WRITE : FUTEX_READ);
 	if (unlikely(ret != 0))
-		goto out;
+		return ret;
 
 	/*
 	 * The check above which compares uaddrs is not sufficient for
 	 * shared futexes. We need to compare the keys:
 	 */
-	if (requeue_pi && match_futex(&key1, &key2)) {
-		ret = -EINVAL;
-		goto out;
-	}
+	if (requeue_pi && match_futex(&key1, &key2))
+		return -EINVAL;
 
 	hb1 = hash_futex(&key1);
 	hb2 = hash_futex(&key2);
@@ -2013,7 +2009,7 @@ retry_private:
 
 			ret = get_user(curval, uaddr1);
 			if (ret)
-				goto out;
+				return ret;
 
 			if (!(flags & FLAGS_SHARED))
 				goto retry_private;
@@ -2079,7 +2075,7 @@ retry_private:
 			ret = fault_in_user_writeable(uaddr2);
 			if (!ret)
 				goto retry;
-			goto out;
+			return ret;
 		case -EBUSY:
 		case -EAGAIN:
 			/*
@@ -2198,8 +2194,6 @@ out_unlock:
 	double_unlock_hb(hb1, hb2);
 	wake_up_q(&wake_q);
 	hb_waiters_dec(hb2);
-
-out:
 	return ret ? ret : task_count;
 }
 
@@ -2545,7 +2539,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 		 */
 		if (q->pi_state->owner != current)
 			ret = fixup_pi_state_owner(uaddr, q, current);
-		goto out;
+		return ret ? ret : locked;
 	}
 
 	/*
@@ -2558,7 +2552,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 	 */
 	if (q->pi_state->owner == current) {
 		ret = fixup_pi_state_owner(uaddr, q, NULL);
-		goto out;
+		return ret;
 	}
 
 	/*
@@ -2572,8 +2566,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 				q->pi_state->owner);
 	}
 
-out:
-	return ret ? ret : locked;
+	return ret;
 }
 
 /**
@@ -2670,7 +2663,7 @@ retry_private:
 
 		ret = get_user(uval, uaddr);
 		if (ret)
-			goto out;
+			return ret;
 
 		if (!(flags & FLAGS_SHARED))
 			goto retry_private;
@@ -2683,7 +2676,6 @@ retry_private:
 		ret = -EWOULDBLOCK;
 	}
 
-out:
 	return ret;
 }
 
-- 
cgit v1.2.3


From 9261308598ad28b9a8a2237d881833e9f217244e Mon Sep 17 00:00:00 2001
From: André Almeida <andrealmeid@collabora.com>
Date: Thu, 2 Jul 2020 17:28:43 -0300
Subject: futex: Consistently use fshared as boolean
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since fshared is only conveying true/false values, declare it as bool.

In get_futex_key() the usage of fshared can be restricted to the first part
of the function. If fshared is false the function is terminated early and
the subsequent code can use a constant 'true' instead of the variable.

Signed-off-by: André Almeida <andrealmeid@collabora.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20200702202843.520764-5-andrealmeid@collabora.com
---
 kernel/futex.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 362fbca6d614..cda91755b77d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -476,7 +476,7 @@ static u64 get_inode_sequence_number(struct inode *inode)
 /**
  * get_futex_key() - Get parameters which are the keys for a futex
  * @uaddr:	virtual address of the futex
- * @fshared:	0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
+ * @fshared:	false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
  * @key:	address where result is stored.
  * @rw:		mapping needs to be read/write (values: FUTEX_READ,
  *              FUTEX_WRITE)
@@ -500,8 +500,8 @@ static u64 get_inode_sequence_number(struct inode *inode)
  *
  * lock_page() might sleep, the caller should not hold a spinlock.
  */
-static int
-get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw)
+static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
+			 enum futex_access rw)
 {
 	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
@@ -538,7 +538,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a
 
 again:
 	/* Ignore any VERIFY_READ mapping (futex common case) */
-	if (unlikely(should_fail_futex(fshared)))
+	if (unlikely(should_fail_futex(true)))
 		return -EFAULT;
 
 	err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
@@ -626,7 +626,7 @@ again:
 		 * A RO anonymous page will never change and thus doesn't make
 		 * sense for futex operations.
 		 */
-		if (unlikely(should_fail_futex(fshared)) || ro) {
+		if (unlikely(should_fail_futex(true)) || ro) {
 			err = -EFAULT;
 			goto out;
 		}
-- 
cgit v1.2.3


From 9a71df495c3d29dab596bb590e73fd8b20106e2d Mon Sep 17 00:00:00 2001
From: André Almeida <andrealmeid@collabora.com>
Date: Thu, 2 Jul 2020 17:28:42 -0300
Subject: futex: Remove unused or redundant includes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since 82af7aca ("Removal of FUTEX_FD"), some includes related to file
operations aren't needed anymore. More investigation around the includes
showed that a lot of includes aren't required for compilation, possible
due to redundant includes. Simplify the code by removing unused
includes.

Signed-off-by: André Almeida <andrealmeid@collabora.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20200702202843.520764-4-andrealmeid@collabora.com
---
 kernel/futex.c | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index cda91755b77d..4616d4ad609d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -32,30 +32,13 @@
  *  "But they come in a choice of three flavours!"
  */
 #include <linux/compat.h>
-#include <linux/slab.h>
-#include <linux/poll.h>
-#include <linux/fs.h>
-#include <linux/file.h>
 #include <linux/jhash.h>
-#include <linux/init.h>
-#include <linux/futex.h>
-#include <linux/mount.h>
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
-#include <linux/signal.h>
-#include <linux/export.h>
-#include <linux/magic.h>
-#include <linux/pid.h>
-#include <linux/nsproxy.h>
-#include <linux/ptrace.h>
-#include <linux/sched/rt.h>
-#include <linux/sched/wake_q.h>
-#include <linux/sched/mm.h>
 #include <linux/hugetlb.h>
 #include <linux/freezer.h>
 #include <linux/memblock.h>
 #include <linux/fault-inject.h>
-#include <linux/refcount.h>
 
 #include <asm/futex.h>
 
-- 
cgit v1.2.3


From ce3aa9cc5109363099b7c4ac82e2c9768afcaf31 Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki <jakub@cloudflare.com>
Date: Fri, 17 Jul 2020 12:35:22 +0200
Subject: bpf, netns: Handle multiple link attachments

Extend the BPF netns link callbacks to rebuild (grow/shrink) or update the
prog_array at given position when link gets attached/updated/released.

This let's us lift the limit of having just one link attached for the new
attach type introduced by subsequent patch.

No functional changes intended.

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200717103536.397595-2-jakub@cloudflare.com
---
 include/linux/bpf.h        |  3 ++
 kernel/bpf/core.c          | 55 ++++++++++++++++++++++++++++
 kernel/bpf/net_namespace.c | 90 +++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 139 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 54ad426dbea1..c8c9eabcd106 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -928,6 +928,9 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs,
 
 void bpf_prog_array_delete_safe(struct bpf_prog_array *progs,
 				struct bpf_prog *old_prog);
+int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index);
+int bpf_prog_array_update_at(struct bpf_prog_array *array, int index,
+			     struct bpf_prog *prog);
 int bpf_prog_array_copy_info(struct bpf_prog_array *array,
 			     u32 *prog_ids, u32 request_cnt,
 			     u32 *prog_cnt);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 9df4cc9a2907..7be02e555ab9 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1958,6 +1958,61 @@ void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
 		}
 }
 
+/**
+ * bpf_prog_array_delete_safe_at() - Replaces the program at the given
+ *                                   index into the program array with
+ *                                   a dummy no-op program.
+ * @array: a bpf_prog_array
+ * @index: the index of the program to replace
+ *
+ * Skips over dummy programs, by not counting them, when calculating
+ * the the position of the program to replace.
+ *
+ * Return:
+ * * 0		- Success
+ * * -EINVAL	- Invalid index value. Must be a non-negative integer.
+ * * -ENOENT	- Index out of range
+ */
+int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index)
+{
+	return bpf_prog_array_update_at(array, index, &dummy_bpf_prog.prog);
+}
+
+/**
+ * bpf_prog_array_update_at() - Updates the program at the given index
+ *                              into the program array.
+ * @array: a bpf_prog_array
+ * @index: the index of the program to update
+ * @prog: the program to insert into the array
+ *
+ * Skips over dummy programs, by not counting them, when calculating
+ * the position of the program to update.
+ *
+ * Return:
+ * * 0		- Success
+ * * -EINVAL	- Invalid index value. Must be a non-negative integer.
+ * * -ENOENT	- Index out of range
+ */
+int bpf_prog_array_update_at(struct bpf_prog_array *array, int index,
+			     struct bpf_prog *prog)
+{
+	struct bpf_prog_array_item *item;
+
+	if (unlikely(index < 0))
+		return -EINVAL;
+
+	for (item = array->items; item->prog; item++) {
+		if (item->prog == &dummy_bpf_prog.prog)
+			continue;
+		if (!index) {
+			WRITE_ONCE(item->prog, prog);
+			return 0;
+		}
+		index--;
+	}
+	return -ENOENT;
+}
+
 int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 			struct bpf_prog *exclude_prog,
 			struct bpf_prog *include_prog,
diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c
index 310241ca7991..e9c8e26ac8f2 100644
--- a/kernel/bpf/net_namespace.c
+++ b/kernel/bpf/net_namespace.c
@@ -36,12 +36,50 @@ static void netns_bpf_run_array_detach(struct net *net,
 	bpf_prog_array_free(run_array);
 }
 
+static int link_index(struct net *net, enum netns_bpf_attach_type type,
+		      struct bpf_netns_link *link)
+{
+	struct bpf_netns_link *pos;
+	int i = 0;
+
+	list_for_each_entry(pos, &net->bpf.links[type], node) {
+		if (pos == link)
+			return i;
+		i++;
+	}
+	return -ENOENT;
+}
+
+static int link_count(struct net *net, enum netns_bpf_attach_type type)
+{
+	struct list_head *pos;
+	int i = 0;
+
+	list_for_each(pos, &net->bpf.links[type])
+		i++;
+	return i;
+}
+
+static void fill_prog_array(struct net *net, enum netns_bpf_attach_type type,
+			    struct bpf_prog_array *prog_array)
+{
+	struct bpf_netns_link *pos;
+	unsigned int i = 0;
+
+	list_for_each_entry(pos, &net->bpf.links[type], node) {
+		prog_array->items[i].prog = pos->link.prog;
+		i++;
+	}
+}
+
 static void bpf_netns_link_release(struct bpf_link *link)
 {
 	struct bpf_netns_link *net_link =
 		container_of(link, struct bpf_netns_link, link);
 	enum netns_bpf_attach_type type = net_link->netns_type;
+	struct bpf_prog_array *old_array, *new_array;
 	struct net *net;
+	int cnt, idx;
 
 	mutex_lock(&netns_bpf_mutex);
 
@@ -53,9 +91,27 @@ static void bpf_netns_link_release(struct bpf_link *link)
 	if (!net)
 		goto out_unlock;
 
-	netns_bpf_run_array_detach(net, type);
+	/* Remember link position in case of safe delete */
+	idx = link_index(net, type, net_link);
 	list_del(&net_link->node);
 
+	cnt = link_count(net, type);
+	if (!cnt) {
+		netns_bpf_run_array_detach(net, type);
+		goto out_unlock;
+	}
+
+	old_array = rcu_dereference_protected(net->bpf.run_array[type],
+					      lockdep_is_held(&netns_bpf_mutex));
+	new_array = bpf_prog_array_alloc(cnt, GFP_KERNEL);
+	if (!new_array) {
+		WARN_ON(bpf_prog_array_delete_safe_at(old_array, idx));
+		goto out_unlock;
+	}
+	fill_prog_array(net, type, new_array);
+	rcu_assign_pointer(net->bpf.run_array[type], new_array);
+	bpf_prog_array_free(old_array);
+
 out_unlock:
 	mutex_unlock(&netns_bpf_mutex);
 }
@@ -77,7 +133,7 @@ static int bpf_netns_link_update_prog(struct bpf_link *link,
 	enum netns_bpf_attach_type type = net_link->netns_type;
 	struct bpf_prog_array *run_array;
 	struct net *net;
-	int ret = 0;
+	int idx, ret;
 
 	if (old_prog && old_prog != link->prog)
 		return -EPERM;
@@ -95,7 +151,10 @@ static int bpf_netns_link_update_prog(struct bpf_link *link,
 
 	run_array = rcu_dereference_protected(net->bpf.run_array[type],
 					      lockdep_is_held(&netns_bpf_mutex));
-	WRITE_ONCE(run_array->items[0].prog, new_prog);
+	idx = link_index(net, type, net_link);
+	ret = bpf_prog_array_update_at(run_array, idx, new_prog);
+	if (ret)
+		goto out_unlock;
 
 	old_prog = xchg(&link->prog, new_prog);
 	bpf_prog_put(old_prog);
@@ -309,18 +368,28 @@ int netns_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
 	return ret;
 }
 
+static int netns_bpf_max_progs(enum netns_bpf_attach_type type)
+{
+	switch (type) {
+	case NETNS_BPF_FLOW_DISSECTOR:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
 static int netns_bpf_link_attach(struct net *net, struct bpf_link *link,
 				 enum netns_bpf_attach_type type)
 {
 	struct bpf_netns_link *net_link =
 		container_of(link, struct bpf_netns_link, link);
 	struct bpf_prog_array *run_array;
-	int err;
+	int cnt, err;
 
 	mutex_lock(&netns_bpf_mutex);
 
-	/* Allow attaching only one prog or link for now */
-	if (!list_empty(&net->bpf.links[type])) {
+	cnt = link_count(net, type);
+	if (cnt >= netns_bpf_max_progs(type)) {
 		err = -E2BIG;
 		goto out_unlock;
 	}
@@ -341,16 +410,19 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link,
 	if (err)
 		goto out_unlock;
 
-	run_array = bpf_prog_array_alloc(1, GFP_KERNEL);
+	run_array = bpf_prog_array_alloc(cnt + 1, GFP_KERNEL);
 	if (!run_array) {
 		err = -ENOMEM;
 		goto out_unlock;
 	}
-	run_array->items[0].prog = link->prog;
-	rcu_assign_pointer(net->bpf.run_array[type], run_array);
 
 	list_add_tail(&net_link->node, &net->bpf.links[type]);
 
+	fill_prog_array(net, type, run_array);
+	run_array = rcu_replace_pointer(net->bpf.run_array[type], run_array,
+					lockdep_is_held(&netns_bpf_mutex));
+	bpf_prog_array_free(run_array);
+
 out_unlock:
 	mutex_unlock(&netns_bpf_mutex);
 	return err;
-- 
cgit v1.2.3


From e9ddbb7707ff5891616240026062b8c1e29864ca Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki <jakub@cloudflare.com>
Date: Fri, 17 Jul 2020 12:35:23 +0200
Subject: bpf: Introduce SK_LOOKUP program type with a dedicated attach point

Add a new program type BPF_PROG_TYPE_SK_LOOKUP with a dedicated attach type
BPF_SK_LOOKUP. The new program kind is to be invoked by the transport layer
when looking up a listening socket for a new connection request for
connection oriented protocols, or when looking up an unconnected socket for
a packet for connection-less protocols.

When called, SK_LOOKUP BPF program can select a socket that will receive
the packet. This serves as a mechanism to overcome the limits of what
bind() API allows to express. Two use-cases driving this work are:

 (1) steer packets destined to an IP range, on fixed port to a socket

     192.0.2.0/24, port 80 -> NGINX socket

 (2) steer packets destined to an IP address, on any port to a socket

     198.51.100.1, any port -> L7 proxy socket

In its run-time context program receives information about the packet that
triggered the socket lookup. Namely IP version, L4 protocol identifier, and
address 4-tuple. Context can be further extended to include ingress
interface identifier.

To select a socket BPF program fetches it from a map holding socket
references, like SOCKMAP or SOCKHASH, and calls bpf_sk_assign(ctx, sk, ...)
helper to record the selection. Transport layer then uses the selected
socket as a result of socket lookup.

In its basic form, SK_LOOKUP acts as a filter and hence must return either
SK_PASS or SK_DROP. If the program returns with SK_PASS, transport should
look for a socket to receive the packet, or use the one selected by the
program if available, while SK_DROP informs the transport layer that the
lookup should fail.

This patch only enables the user to attach an SK_LOOKUP program to a
network namespace. Subsequent patches hook it up to run on local delivery
path in ipv4 and ipv6 stacks.

Suggested-by: Marek Majkowski <marek@cloudflare.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200717103536.397595-3-jakub@cloudflare.com
---
 include/linux/bpf-netns.h  |   3 +
 include/linux/bpf.h        |   1 +
 include/linux/bpf_types.h  |   2 +
 include/linux/filter.h     |  17 +++++
 include/uapi/linux/bpf.h   |  77 +++++++++++++++++++
 kernel/bpf/net_namespace.c |   5 ++
 kernel/bpf/syscall.c       |   9 +++
 kernel/bpf/verifier.c      |  13 +++-
 net/core/filter.c          | 180 +++++++++++++++++++++++++++++++++++++++++++++
 scripts/bpf_helpers_doc.py |   9 ++-
 10 files changed, 312 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf-netns.h b/include/linux/bpf-netns.h
index 47d5b0c708c9..722f799c1a2e 100644
--- a/include/linux/bpf-netns.h
+++ b/include/linux/bpf-netns.h
@@ -8,6 +8,7 @@
 enum netns_bpf_attach_type {
 	NETNS_BPF_INVALID = -1,
 	NETNS_BPF_FLOW_DISSECTOR = 0,
+	NETNS_BPF_SK_LOOKUP,
 	MAX_NETNS_BPF_ATTACH_TYPE
 };
 
@@ -17,6 +18,8 @@ to_netns_bpf_attach_type(enum bpf_attach_type attach_type)
 	switch (attach_type) {
 	case BPF_FLOW_DISSECTOR:
 		return NETNS_BPF_FLOW_DISSECTOR;
+	case BPF_SK_LOOKUP:
+		return NETNS_BPF_SK_LOOKUP;
 	default:
 		return NETNS_BPF_INVALID;
 	}
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c8c9eabcd106..adb16bdc5f0a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -249,6 +249,7 @@ enum bpf_arg_type {
 	ARG_PTR_TO_INT,		/* pointer to int */
 	ARG_PTR_TO_LONG,	/* pointer to long */
 	ARG_PTR_TO_SOCKET,	/* pointer to bpf_sock (fullsock) */
+	ARG_PTR_TO_SOCKET_OR_NULL,	/* pointer to bpf_sock (fullsock) or NULL */
 	ARG_PTR_TO_BTF_ID,	/* pointer to in-kernel struct */
 	ARG_PTR_TO_ALLOC_MEM,	/* pointer to dynamically allocated memory */
 	ARG_PTR_TO_ALLOC_MEM_OR_NULL,	/* pointer to dynamically allocated memory or NULL */
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index a18ae82a298a..a52a5688418e 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -64,6 +64,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2,
 #ifdef CONFIG_INET
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport,
 	      struct sk_reuseport_md, struct sk_reuseport_kern)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SK_LOOKUP, sk_lookup,
+	      struct bpf_sk_lookup, struct bpf_sk_lookup_kern)
 #endif
 #if defined(CONFIG_BPF_JIT)
 BPF_PROG_TYPE(BPF_PROG_TYPE_STRUCT_OPS, bpf_struct_ops,
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0b0144752d78..fa1ea12ad2cd 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1278,4 +1278,21 @@ struct bpf_sockopt_kern {
 	s32		retval;
 };
 
+struct bpf_sk_lookup_kern {
+	u16		family;
+	u16		protocol;
+	struct {
+		__be32 saddr;
+		__be32 daddr;
+	} v4;
+	struct {
+		const struct in6_addr *saddr;
+		const struct in6_addr *daddr;
+	} v6;
+	__be16		sport;
+	u16		dport;
+	struct sock	*selected_sk;
+	bool		no_reuseport;
+};
+
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7ac3992dacfe..54d0c886e3ba 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -189,6 +189,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_STRUCT_OPS,
 	BPF_PROG_TYPE_EXT,
 	BPF_PROG_TYPE_LSM,
+	BPF_PROG_TYPE_SK_LOOKUP,
 };
 
 enum bpf_attach_type {
@@ -228,6 +229,7 @@ enum bpf_attach_type {
 	BPF_XDP_DEVMAP,
 	BPF_CGROUP_INET_SOCK_RELEASE,
 	BPF_XDP_CPUMAP,
+	BPF_SK_LOOKUP,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -3069,6 +3071,10 @@ union bpf_attr {
  *
  * long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags)
  *	Description
+ *		Helper is overloaded depending on BPF program type. This
+ *		description applies to **BPF_PROG_TYPE_SCHED_CLS** and
+ *		**BPF_PROG_TYPE_SCHED_ACT** programs.
+ *
  *		Assign the *sk* to the *skb*. When combined with appropriate
  *		routing configuration to receive the packet towards the socket,
  *		will cause *skb* to be delivered to the specified socket.
@@ -3094,6 +3100,56 @@ union bpf_attr {
  *		**-ESOCKTNOSUPPORT** if the socket type is not supported
  *		(reuseport).
  *
+ * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags)
+ *	Description
+ *		Helper is overloaded depending on BPF program type. This
+ *		description applies to **BPF_PROG_TYPE_SK_LOOKUP** programs.
+ *
+ *		Select the *sk* as a result of a socket lookup.
+ *
+ *		For the operation to succeed passed socket must be compatible
+ *		with the packet description provided by the *ctx* object.
+ *
+ *		L4 protocol (**IPPROTO_TCP** or **IPPROTO_UDP**) must
+ *		be an exact match. While IP family (**AF_INET** or
+ *		**AF_INET6**) must be compatible, that is IPv6 sockets
+ *		that are not v6-only can be selected for IPv4 packets.
+ *
+ *		Only TCP listeners and UDP unconnected sockets can be
+ *		selected. *sk* can also be NULL to reset any previous
+ *		selection.
+ *
+ *		*flags* argument can combination of following values:
+ *
+ *		* **BPF_SK_LOOKUP_F_REPLACE** to override the previous
+ *		  socket selection, potentially done by a BPF program
+ *		  that ran before us.
+ *
+ *		* **BPF_SK_LOOKUP_F_NO_REUSEPORT** to skip
+ *		  load-balancing within reuseport group for the socket
+ *		  being selected.
+ *
+ *		On success *ctx->sk* will point to the selected socket.
+ *
+ *	Return
+ *		0 on success, or a negative errno in case of failure.
+ *
+ *		* **-EAFNOSUPPORT** if socket family (*sk->family*) is
+ *		  not compatible with packet family (*ctx->family*).
+ *
+ *		* **-EEXIST** if socket has been already selected,
+ *		  potentially by another program, and
+ *		  **BPF_SK_LOOKUP_F_REPLACE** flag was not specified.
+ *
+ *		* **-EINVAL** if unsupported flags were specified.
+ *
+ *		* **-EPROTOTYPE** if socket L4 protocol
+ *		  (*sk->protocol*) doesn't match packet protocol
+ *		  (*ctx->protocol*).
+ *
+ *		* **-ESOCKTNOSUPPORT** if socket is not in allowed
+ *		  state (TCP listening or UDP unconnected).
+ *
  * u64 bpf_ktime_get_boot_ns(void)
  * 	Description
  * 		Return the time elapsed since system boot, in nanoseconds.
@@ -3607,6 +3663,12 @@ enum {
 	BPF_RINGBUF_HDR_SZ		= 8,
 };
 
+/* BPF_FUNC_sk_assign flags in bpf_sk_lookup context. */
+enum {
+	BPF_SK_LOOKUP_F_REPLACE		= (1ULL << 0),
+	BPF_SK_LOOKUP_F_NO_REUSEPORT	= (1ULL << 1),
+};
+
 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {
 	BPF_ADJ_ROOM_NET,
@@ -4349,4 +4411,19 @@ struct bpf_pidns_info {
 	__u32 pid;
 	__u32 tgid;
 };
+
+/* User accessible data for SK_LOOKUP programs. Add new fields at the end. */
+struct bpf_sk_lookup {
+	__bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
+
+	__u32 family;		/* Protocol family (AF_INET, AF_INET6) */
+	__u32 protocol;		/* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */
+	__u32 remote_ip4;	/* Network byte order */
+	__u32 remote_ip6[4];	/* Network byte order */
+	__u32 remote_port;	/* Network byte order */
+	__u32 local_ip4;	/* Network byte order */
+	__u32 local_ip6[4];	/* Network byte order */
+	__u32 local_port;	/* Host byte order */
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c
index e9c8e26ac8f2..38b368bccda2 100644
--- a/kernel/bpf/net_namespace.c
+++ b/kernel/bpf/net_namespace.c
@@ -373,6 +373,8 @@ static int netns_bpf_max_progs(enum netns_bpf_attach_type type)
 	switch (type) {
 	case NETNS_BPF_FLOW_DISSECTOR:
 		return 1;
+	case NETNS_BPF_SK_LOOKUP:
+		return 64;
 	default:
 		return 0;
 	}
@@ -403,6 +405,9 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link,
 	case NETNS_BPF_FLOW_DISSECTOR:
 		err = flow_dissector_bpf_prog_attach_check(net, link->prog);
 		break;
+	case NETNS_BPF_SK_LOOKUP:
+		err = 0; /* nothing to check */
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7ea9dfbebd8c..d07417d17712 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2022,6 +2022,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
 		default:
 			return -EINVAL;
 		}
+	case BPF_PROG_TYPE_SK_LOOKUP:
+		if (expected_attach_type == BPF_SK_LOOKUP)
+			return 0;
+		return -EINVAL;
 	case BPF_PROG_TYPE_EXT:
 		if (expected_attach_type)
 			return -EINVAL;
@@ -2756,6 +2760,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 	case BPF_PROG_TYPE_CGROUP_SOCK:
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+	case BPF_PROG_TYPE_SK_LOOKUP:
 		return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
 	case BPF_PROG_TYPE_CGROUP_SKB:
 		if (!capable(CAP_NET_ADMIN))
@@ -2817,6 +2822,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
 		return BPF_PROG_TYPE_CGROUP_SOCKOPT;
 	case BPF_TRACE_ITER:
 		return BPF_PROG_TYPE_TRACING;
+	case BPF_SK_LOOKUP:
+		return BPF_PROG_TYPE_SK_LOOKUP;
 	default:
 		return BPF_PROG_TYPE_UNSPEC;
 	}
@@ -2953,6 +2960,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_LIRC_MODE2:
 		return lirc_prog_query(attr, uattr);
 	case BPF_FLOW_DISSECTOR:
+	case BPF_SK_LOOKUP:
 		return netns_bpf_prog_query(attr, uattr);
 	default:
 		return -EINVAL;
@@ -3891,6 +3899,7 @@ static int link_create(union bpf_attr *attr)
 		ret = tracing_bpf_link_attach(attr, prog);
 		break;
 	case BPF_PROG_TYPE_FLOW_DISSECTOR:
+	case BPF_PROG_TYPE_SK_LOOKUP:
 		ret = netns_bpf_link_create(attr, prog);
 		break;
 	default:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3c1efc9d08fd..9a6703bc3f36 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3878,10 +3878,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 			}
 			meta->ref_obj_id = reg->ref_obj_id;
 		}
-	} else if (arg_type == ARG_PTR_TO_SOCKET) {
+	} else if (arg_type == ARG_PTR_TO_SOCKET ||
+		   arg_type == ARG_PTR_TO_SOCKET_OR_NULL) {
 		expected_type = PTR_TO_SOCKET;
-		if (type != expected_type)
-			goto err_type;
+		if (!(register_is_null(reg) &&
+		      arg_type == ARG_PTR_TO_SOCKET_OR_NULL)) {
+			if (type != expected_type)
+				goto err_type;
+		}
 	} else if (arg_type == ARG_PTR_TO_BTF_ID) {
 		expected_type = PTR_TO_BTF_ID;
 		if (type != expected_type)
@@ -7354,6 +7358,9 @@ static int check_return_code(struct bpf_verifier_env *env)
 			return -ENOTSUPP;
 		}
 		break;
+	case BPF_PROG_TYPE_SK_LOOKUP:
+		range = tnum_range(SK_DROP, SK_PASS);
+		break;
 	case BPF_PROG_TYPE_EXT:
 		/* freplace program can return anything as its return value
 		 * depends on the to-be-replaced kernel func or bpf program.
diff --git a/net/core/filter.c b/net/core/filter.c
index bdd2382e655d..d099436b3ff5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -9229,6 +9229,186 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
 
 const struct bpf_prog_ops sk_reuseport_prog_ops = {
 };
+
+BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx,
+	   struct sock *, sk, u64, flags)
+{
+	if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE |
+			       BPF_SK_LOOKUP_F_NO_REUSEPORT)))
+		return -EINVAL;
+	if (unlikely(sk && sk_is_refcounted(sk)))
+		return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */
+	if (unlikely(sk && sk->sk_state == TCP_ESTABLISHED))
+		return -ESOCKTNOSUPPORT; /* reject connected sockets */
+
+	/* Check if socket is suitable for packet L3/L4 protocol */
+	if (sk && sk->sk_protocol != ctx->protocol)
+		return -EPROTOTYPE;
+	if (sk && sk->sk_family != ctx->family &&
+	    (sk->sk_family == AF_INET || ipv6_only_sock(sk)))
+		return -EAFNOSUPPORT;
+
+	if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE))
+		return -EEXIST;
+
+	/* Select socket as lookup result */
+	ctx->selected_sk = sk;
+	ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT;
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_sk_lookup_assign_proto = {
+	.func		= bpf_sk_lookup_assign,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_SOCKET_OR_NULL,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *
+sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_perf_event_output:
+		return &bpf_event_output_data_proto;
+	case BPF_FUNC_sk_assign:
+		return &bpf_sk_lookup_assign_proto;
+	case BPF_FUNC_sk_release:
+		return &bpf_sk_release_proto;
+	default:
+		return bpf_base_func_proto(func_id);
+	}
+}
+
+static bool sk_lookup_is_valid_access(int off, int size,
+				      enum bpf_access_type type,
+				      const struct bpf_prog *prog,
+				      struct bpf_insn_access_aux *info)
+{
+	if (off < 0 || off >= sizeof(struct bpf_sk_lookup))
+		return false;
+	if (off % size != 0)
+		return false;
+	if (type != BPF_READ)
+		return false;
+
+	switch (off) {
+	case offsetof(struct bpf_sk_lookup, sk):
+		info->reg_type = PTR_TO_SOCKET_OR_NULL;
+		return size == sizeof(__u64);
+
+	case bpf_ctx_range(struct bpf_sk_lookup, family):
+	case bpf_ctx_range(struct bpf_sk_lookup, protocol):
+	case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4):
+	case bpf_ctx_range(struct bpf_sk_lookup, local_ip4):
+	case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]):
+	case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]):
+	case bpf_ctx_range(struct bpf_sk_lookup, remote_port):
+	case bpf_ctx_range(struct bpf_sk_lookup, local_port):
+		bpf_ctx_record_field_size(info, sizeof(__u32));
+		return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32));
+
+	default:
+		return false;
+	}
+}
+
+static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type,
+					const struct bpf_insn *si,
+					struct bpf_insn *insn_buf,
+					struct bpf_prog *prog,
+					u32 *target_size)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (si->off) {
+	case offsetof(struct bpf_sk_lookup, sk):
+		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sk_lookup_kern, selected_sk));
+		break;
+
+	case offsetof(struct bpf_sk_lookup, family):
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+				      bpf_target_off(struct bpf_sk_lookup_kern,
+						     family, 2, target_size));
+		break;
+
+	case offsetof(struct bpf_sk_lookup, protocol):
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+				      bpf_target_off(struct bpf_sk_lookup_kern,
+						     protocol, 2, target_size));
+		break;
+
+	case offsetof(struct bpf_sk_lookup, remote_ip4):
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+				      bpf_target_off(struct bpf_sk_lookup_kern,
+						     v4.saddr, 4, target_size));
+		break;
+
+	case offsetof(struct bpf_sk_lookup, local_ip4):
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+				      bpf_target_off(struct bpf_sk_lookup_kern,
+						     v4.daddr, 4, target_size));
+		break;
+
+	case bpf_ctx_range_till(struct bpf_sk_lookup,
+				remote_ip6[0], remote_ip6[3]): {
+#if IS_ENABLED(CONFIG_IPV6)
+		int off = si->off;
+
+		off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]);
+		off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
+		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sk_lookup_kern, v6.saddr));
+		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
+#else
+		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+		break;
+	}
+	case bpf_ctx_range_till(struct bpf_sk_lookup,
+				local_ip6[0], local_ip6[3]): {
+#if IS_ENABLED(CONFIG_IPV6)
+		int off = si->off;
+
+		off -= offsetof(struct bpf_sk_lookup, local_ip6[0]);
+		off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
+		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sk_lookup_kern, v6.daddr));
+		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
+#else
+		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+		break;
+	}
+	case offsetof(struct bpf_sk_lookup, remote_port):
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+				      bpf_target_off(struct bpf_sk_lookup_kern,
+						     sport, 2, target_size));
+		break;
+
+	case offsetof(struct bpf_sk_lookup, local_port):
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+				      bpf_target_off(struct bpf_sk_lookup_kern,
+						     dport, 2, target_size));
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
+const struct bpf_prog_ops sk_lookup_prog_ops = {
+};
+
+const struct bpf_verifier_ops sk_lookup_verifier_ops = {
+	.get_func_proto		= sk_lookup_func_proto,
+	.is_valid_access	= sk_lookup_is_valid_access,
+	.convert_ctx_access	= sk_lookup_convert_ctx_access,
+};
+
 #endif /* CONFIG_INET */
 
 DEFINE_BPF_DISPATCHER(xdp)
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index 6843376733df..5bfa448b4704 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -404,6 +404,7 @@ class PrinterHelpers(Printer):
 
     type_fwds = [
             'struct bpf_fib_lookup',
+            'struct bpf_sk_lookup',
             'struct bpf_perf_event_data',
             'struct bpf_perf_event_value',
             'struct bpf_pidns_info',
@@ -450,6 +451,7 @@ class PrinterHelpers(Printer):
             'struct bpf_perf_event_data',
             'struct bpf_perf_event_value',
             'struct bpf_pidns_info',
+            'struct bpf_sk_lookup',
             'struct bpf_sock',
             'struct bpf_sock_addr',
             'struct bpf_sock_ops',
@@ -487,6 +489,11 @@ class PrinterHelpers(Printer):
             'struct sk_msg_buff': 'struct sk_msg_md',
             'struct xdp_buff': 'struct xdp_md',
     }
+    # Helpers overloaded for different context types.
+    overloaded_helpers = [
+        'bpf_get_socket_cookie',
+        'bpf_sk_assign',
+    ]
 
     def print_header(self):
         header = '''\
@@ -543,7 +550,7 @@ class PrinterHelpers(Printer):
         for i, a in enumerate(proto['args']):
             t = a['type']
             n = a['name']
-            if proto['name'] == 'bpf_get_socket_cookie' and i == 0:
+            if proto['name'] in self.overloaded_helpers and i == 0:
                     t = 'void'
                     n = 'ctx'
             one_arg = '{}{}'.format(comma, self.map_type(t))
-- 
cgit v1.2.3


From 1559b4aa1db443096af493c7d621dc156054babe Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki <jakub@cloudflare.com>
Date: Fri, 17 Jul 2020 12:35:25 +0200
Subject: inet: Run SK_LOOKUP BPF program on socket lookup

Run a BPF program before looking up a listening socket on the receive path.
Program selects a listening socket to yield as result of socket lookup by
calling bpf_sk_assign() helper and returning SK_PASS code. Program can
revert its decision by assigning a NULL socket with bpf_sk_assign().

Alternatively, BPF program can also fail the lookup by returning with
SK_DROP, or let the lookup continue as usual with SK_PASS on return, when
no socket has been selected with bpf_sk_assign().

This lets the user match packets with listening sockets freely at the last
possible point on the receive path, where we know that packets are destined
for local delivery after undergoing policing, filtering, and routing.

With BPF code selecting the socket, directing packets destined to an IP
range or to a port range to a single socket becomes possible.

In case multiple programs are attached, they are run in series in the order
in which they were attached. The end result is determined from return codes
of all the programs according to following rules:

 1. If any program returned SK_PASS and selected a valid socket, the socket
    is used as result of socket lookup.
 2. If more than one program returned SK_PASS and selected a socket,
    last selection takes effect.
 3. If any program returned SK_DROP, and no program returned SK_PASS and
    selected a socket, socket lookup fails with -ECONNREFUSED.
 4. If all programs returned SK_PASS and none of them selected a socket,
    socket lookup continues to htable-based lookup.

Suggested-by: Marek Majkowski <marek@cloudflare.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200717103536.397595-5-jakub@cloudflare.com
---
 include/linux/filter.h     | 91 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/net_namespace.c | 32 +++++++++++++++-
 net/core/filter.c          |  3 ++
 net/ipv4/inet_hashtables.c | 31 ++++++++++++++++
 4 files changed, 156 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index fa1ea12ad2cd..c4f54c216347 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1295,4 +1295,95 @@ struct bpf_sk_lookup_kern {
 	bool		no_reuseport;
 };
 
+extern struct static_key_false bpf_sk_lookup_enabled;
+
+/* Runners for BPF_SK_LOOKUP programs to invoke on socket lookup.
+ *
+ * Allowed return values for a BPF SK_LOOKUP program are SK_PASS and
+ * SK_DROP. Their meaning is as follows:
+ *
+ *  SK_PASS && ctx.selected_sk != NULL: use selected_sk as lookup result
+ *  SK_PASS && ctx.selected_sk == NULL: continue to htable-based socket lookup
+ *  SK_DROP                           : terminate lookup with -ECONNREFUSED
+ *
+ * This macro aggregates return values and selected sockets from
+ * multiple BPF programs according to following rules in order:
+ *
+ *  1. If any program returned SK_PASS and a non-NULL ctx.selected_sk,
+ *     macro result is SK_PASS and last ctx.selected_sk is used.
+ *  2. If any program returned SK_DROP return value,
+ *     macro result is SK_DROP.
+ *  3. Otherwise result is SK_PASS and ctx.selected_sk is NULL.
+ *
+ * Caller must ensure that the prog array is non-NULL, and that the
+ * array as well as the programs it contains remain valid.
+ */
+#define BPF_PROG_SK_LOOKUP_RUN_ARRAY(array, ctx, func)			\
+	({								\
+		struct bpf_sk_lookup_kern *_ctx = &(ctx);		\
+		struct bpf_prog_array_item *_item;			\
+		struct sock *_selected_sk = NULL;			\
+		bool _no_reuseport = false;				\
+		struct bpf_prog *_prog;					\
+		bool _all_pass = true;					\
+		u32 _ret;						\
+									\
+		migrate_disable();					\
+		_item = &(array)->items[0];				\
+		while ((_prog = READ_ONCE(_item->prog))) {		\
+			/* restore most recent selection */		\
+			_ctx->selected_sk = _selected_sk;		\
+			_ctx->no_reuseport = _no_reuseport;		\
+									\
+			_ret = func(_prog, _ctx);			\
+			if (_ret == SK_PASS && _ctx->selected_sk) {	\
+				/* remember last non-NULL socket */	\
+				_selected_sk = _ctx->selected_sk;	\
+				_no_reuseport = _ctx->no_reuseport;	\
+			} else if (_ret == SK_DROP && _all_pass) {	\
+				_all_pass = false;			\
+			}						\
+			_item++;					\
+		}							\
+		_ctx->selected_sk = _selected_sk;			\
+		_ctx->no_reuseport = _no_reuseport;			\
+		migrate_enable();					\
+		_all_pass || _selected_sk ? SK_PASS : SK_DROP;		\
+	 })
+
+static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol,
+					const __be32 saddr, const __be16 sport,
+					const __be32 daddr, const u16 dport,
+					struct sock **psk)
+{
+	struct bpf_prog_array *run_array;
+	struct sock *selected_sk = NULL;
+	bool no_reuseport = false;
+
+	rcu_read_lock();
+	run_array = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]);
+	if (run_array) {
+		struct bpf_sk_lookup_kern ctx = {
+			.family		= AF_INET,
+			.protocol	= protocol,
+			.v4.saddr	= saddr,
+			.v4.daddr	= daddr,
+			.sport		= sport,
+			.dport		= dport,
+		};
+		u32 act;
+
+		act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, BPF_PROG_RUN);
+		if (act == SK_PASS) {
+			selected_sk = ctx.selected_sk;
+			no_reuseport = ctx.no_reuseport;
+		} else {
+			selected_sk = ERR_PTR(-ECONNREFUSED);
+		}
+	}
+	rcu_read_unlock();
+	*psk = selected_sk;
+	return no_reuseport;
+}
+
 #endif /* __LINUX_FILTER_H__ */
diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c
index 38b368bccda2..4e1bcaa2c3cb 100644
--- a/kernel/bpf/net_namespace.c
+++ b/kernel/bpf/net_namespace.c
@@ -25,6 +25,28 @@ struct bpf_netns_link {
 /* Protects updates to netns_bpf */
 DEFINE_MUTEX(netns_bpf_mutex);
 
+static void netns_bpf_attach_type_unneed(enum netns_bpf_attach_type type)
+{
+	switch (type) {
+	case NETNS_BPF_SK_LOOKUP:
+		static_branch_dec(&bpf_sk_lookup_enabled);
+		break;
+	default:
+		break;
+	}
+}
+
+static void netns_bpf_attach_type_need(enum netns_bpf_attach_type type)
+{
+	switch (type) {
+	case NETNS_BPF_SK_LOOKUP:
+		static_branch_inc(&bpf_sk_lookup_enabled);
+		break;
+	default:
+		break;
+	}
+}
+
 /* Must be called with netns_bpf_mutex held. */
 static void netns_bpf_run_array_detach(struct net *net,
 				       enum netns_bpf_attach_type type)
@@ -91,6 +113,9 @@ static void bpf_netns_link_release(struct bpf_link *link)
 	if (!net)
 		goto out_unlock;
 
+	/* Mark attach point as unused */
+	netns_bpf_attach_type_unneed(type);
+
 	/* Remember link position in case of safe delete */
 	idx = link_index(net, type, net_link);
 	list_del(&net_link->node);
@@ -428,6 +453,9 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link,
 					lockdep_is_held(&netns_bpf_mutex));
 	bpf_prog_array_free(run_array);
 
+	/* Mark attach point as used */
+	netns_bpf_attach_type_need(type);
+
 out_unlock:
 	mutex_unlock(&netns_bpf_mutex);
 	return err;
@@ -503,8 +531,10 @@ static void __net_exit netns_bpf_pernet_pre_exit(struct net *net)
 	mutex_lock(&netns_bpf_mutex);
 	for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) {
 		netns_bpf_run_array_detach(net, type);
-		list_for_each_entry(net_link, &net->bpf.links[type], node)
+		list_for_each_entry(net_link, &net->bpf.links[type], node) {
 			net_link->net = NULL; /* auto-detach link */
+			netns_bpf_attach_type_unneed(type);
+		}
 		if (net->bpf.progs[type])
 			bpf_prog_put(net->bpf.progs[type]);
 	}
diff --git a/net/core/filter.c b/net/core/filter.c
index d099436b3ff5..2bd129b5ae74 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -9230,6 +9230,9 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
 const struct bpf_prog_ops sk_reuseport_prog_ops = {
 };
 
+DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled);
+EXPORT_SYMBOL(bpf_sk_lookup_enabled);
+
 BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx,
 	   struct sock *, sk, u64, flags)
 {
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ab64834837c8..4eb4cd8d20dd 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -299,6 +299,29 @@ static struct sock *inet_lhash2_lookup(struct net *net,
 	return result;
 }
 
+static inline struct sock *inet_lookup_run_bpf(struct net *net,
+					       struct inet_hashinfo *hashinfo,
+					       struct sk_buff *skb, int doff,
+					       __be32 saddr, __be16 sport,
+					       __be32 daddr, u16 hnum)
+{
+	struct sock *sk, *reuse_sk;
+	bool no_reuseport;
+
+	if (hashinfo != &tcp_hashinfo)
+		return NULL; /* only TCP is supported */
+
+	no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP,
+					    saddr, sport, daddr, hnum, &sk);
+	if (no_reuseport || IS_ERR_OR_NULL(sk))
+		return sk;
+
+	reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum);
+	if (reuse_sk)
+		sk = reuse_sk;
+	return sk;
+}
+
 struct sock *__inet_lookup_listener(struct net *net,
 				    struct inet_hashinfo *hashinfo,
 				    struct sk_buff *skb, int doff,
@@ -310,6 +333,14 @@ struct sock *__inet_lookup_listener(struct net *net,
 	struct sock *result = NULL;
 	unsigned int hash2;
 
+	/* Lookup redirect from BPF */
+	if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
+		result = inet_lookup_run_bpf(net, hashinfo, skb, doff,
+					     saddr, sport, daddr, hnum);
+		if (result)
+			goto done;
+	}
+
 	hash2 = ipv4_portaddr_hash(net, daddr, hnum);
 	ilb2 = inet_lhash2_bucket(hashinfo, hash2);
 
-- 
cgit v1.2.3


From 2f9237d4f6df49b74c51cdac555b0a9979d0c237 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 8 Jul 2020 09:30:00 +0200
Subject: dma-mapping: make support for dma ops optional

Avoid the overhead of the dma ops support for tiny builds that only
use the direct mapping.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/alpha/Kconfig               |  1 +
 arch/arm/Kconfig                 |  1 +
 arch/ia64/Kconfig                |  1 +
 arch/mips/Kconfig                |  1 +
 arch/parisc/Kconfig              |  1 +
 arch/powerpc/Kconfig             |  1 +
 arch/s390/Kconfig                |  1 +
 arch/sparc/Kconfig               |  1 +
 arch/x86/Kconfig                 |  1 +
 drivers/infiniband/core/device.c |  6 +++++-
 drivers/iommu/Kconfig            |  2 ++
 drivers/macintosh/macio_asic.c   |  4 ++--
 drivers/misc/mic/Kconfig         |  4 ++++
 drivers/vdpa/Kconfig             |  1 +
 drivers/xen/Kconfig              |  1 +
 include/linux/device.h           |  3 ++-
 include/linux/dma-mapping.h      | 12 +++++++++++-
 kernel/dma/Kconfig               |  4 ++++
 kernel/dma/Makefile              |  3 ++-
 19 files changed, 43 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 10862c5a8c76..9c5f06e8eb9b 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -7,6 +7,7 @@ config ALPHA
 	select ARCH_NO_PREEMPT
 	select ARCH_NO_SG_CHAIN
 	select ARCH_USE_CMPXCHG_LOCKREF
+	select DMA_OPS if PCI
 	select FORCE_PCI if !ALPHA_JENSEN
 	select PCI_DOMAINS if PCI
 	select PCI_SYSCALL if PCI
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 2ac74904a3ce..bee35b0187e4 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -41,6 +41,7 @@ config ARM
 	select CPU_PM if SUSPEND || CPU_IDLE
 	select DCACHE_WORD_ACCESS if HAVE_EFFICIENT_UNALIGNED_ACCESS
 	select DMA_DECLARE_COHERENT
+	select DMA_OPS
 	select DMA_REMAP if MMU
 	select EDAC_SUPPORT
 	select EDAC_ATOMIC_SCRUB
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 1fa2fe2ef053..5b4ec80bf586 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -192,6 +192,7 @@ config IA64_SGI_UV
 
 config IA64_HP_SBA_IOMMU
 	bool "HP SBA IOMMU support"
+	select DMA_OPS
 	default y
 	help
 	  Say Y here to add support for the SBA IOMMU found on HP zx1 and
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 6fee1a133e9d..8a458105e445 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -367,6 +367,7 @@ config MACH_JAZZ
 	select ARC_PROMLIB
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_MIGHT_HAVE_PC_SERIO
+	select DMA_OPS
 	select FW_ARC
 	select FW_ARC32
 	select ARCH_MAY_HAVE_PC_FDC
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 8e4c3708773d..38c1eafc1f1a 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -14,6 +14,7 @@ config PARISC
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
 	select ARCH_NO_SG_CHAIN
 	select ARCH_SUPPORTS_MEMORY_FAILURE
+	select DMA_OPS
 	select RTC_CLASS
 	select RTC_DRV_GENERIC
 	select INIT_ALL_POSSIBLE
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 9fa23eb320ff..e9b091d35872 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -151,6 +151,7 @@ config PPC
 	select BUILDTIME_TABLE_SORT
 	select CLONE_BACKWARDS
 	select DCACHE_WORD_ACCESS		if PPC64 && CPU_LITTLE_ENDIAN
+	select DMA_OPS				if PPC64
 	select DYNAMIC_FTRACE			if FUNCTION_TRACER
 	select EDAC_ATOMIC_SCRUB
 	select EDAC_SUPPORT
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index c7d7ede6300c..687fe23f61cc 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -113,6 +113,7 @@ config S390
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select BUILDTIME_TABLE_SORT
 	select CLONE_BACKWARDS2
+	select DMA_OPS if PCI
 	select DYNAMIC_FTRACE if FUNCTION_TRACER
 	select GENERIC_CLOCKEVENTS
 	select GENERIC_CPU_AUTOPROBE
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 5bf2dc163540..5db1faaaee31 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -15,6 +15,7 @@ config SPARC
 	default y
 	select ARCH_MIGHT_HAVE_PC_PARPORT if SPARC64 && PCI
 	select ARCH_MIGHT_HAVE_PC_SERIO
+	select DMA_OPS
 	select OF
 	select OF_PROMTREE
 	select HAVE_ASM_MODVERSIONS
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 883da0abf779..96ab92754158 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -909,6 +909,7 @@ config DMI
 
 config GART_IOMMU
 	bool "Old AMD GART IOMMU support"
+	select DMA_OPS
 	select IOMMU_HELPER
 	select SWIOTLB
 	depends on X86_64 && PCI && AMD_NB
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 905a2beaf885..2927a9d16eaa 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -1183,6 +1183,8 @@ static void setup_dma_device(struct ib_device *device)
 	struct device *parent = device->dev.parent;
 
 	WARN_ON_ONCE(device->dma_device);
+
+#ifdef CONFIG_DMA_OPS
 	if (device->dev.dma_ops) {
 		/*
 		 * The caller provided custom DMA operations. Copy the
@@ -1203,7 +1205,9 @@ static void setup_dma_device(struct ib_device *device)
 			else
 				WARN_ON_ONCE(true);
 		}
-	} else {
+	} else
+#endif /* CONFIG_DMA_OPS */
+	{
 		/*
 		 * The caller did not provide custom DMA operations. Use the
 		 * DMA mapping operations of the parent device.
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index b0f308cb7f7c..b622af72448f 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -97,6 +97,7 @@ config OF_IOMMU
 # IOMMU-agnostic DMA-mapping layer
 config IOMMU_DMA
 	bool
+	select DMA_OPS
 	select IOMMU_API
 	select IOMMU_IOVA
 	select IRQ_MSI_IOMMU
@@ -183,6 +184,7 @@ config DMAR_TABLE
 config INTEL_IOMMU
 	bool "Support for Intel IOMMU using DMA Remapping Devices"
 	depends on PCI_MSI && ACPI && (X86 || IA64)
+	select DMA_OPS
 	select IOMMU_API
 	select IOMMU_IOVA
 	select NEED_DMA_MAP_STATE
diff --git a/drivers/macintosh/macio_asic.c b/drivers/macintosh/macio_asic.c
index 92d142d2b75f..49af60bdac92 100644
--- a/drivers/macintosh/macio_asic.c
+++ b/drivers/macintosh/macio_asic.c
@@ -382,7 +382,7 @@ static struct macio_dev * macio_add_one_device(struct macio_chip *chip,
 	dma_set_max_seg_size(&dev->ofdev.dev, 65536);
 	dma_set_seg_boundary(&dev->ofdev.dev, 0xffffffff);
 
-#ifdef CONFIG_PCI
+#if defined(CONFIG_PCI) && defined(CONFIG_DMA_OPS)
 	/* Set the DMA ops to the ones from the PCI device, this could be
 	 * fishy if we didn't know that on PowerMac it's always direct ops
 	 * or iommu ops that will work fine
@@ -391,7 +391,7 @@ static struct macio_dev * macio_add_one_device(struct macio_chip *chip,
 	 */
 	dev->ofdev.dev.archdata = chip->lbus.pdev->dev.archdata;
 	dev->ofdev.dev.dma_ops = chip->lbus.pdev->dev.dma_ops;
-#endif /* CONFIG_PCI */
+#endif /* CONFIG_PCI && CONFIG_DMA_OPS */
 
 #ifdef DEBUG
 	printk("preparing mdev @%p, ofdev @%p, dev @%p, kobj @%p\n",
diff --git a/drivers/misc/mic/Kconfig b/drivers/misc/mic/Kconfig
index 8f201d019f5a..b9bb086785db 100644
--- a/drivers/misc/mic/Kconfig
+++ b/drivers/misc/mic/Kconfig
@@ -4,6 +4,7 @@ menu "Intel MIC & related support"
 config INTEL_MIC_BUS
 	tristate "Intel MIC Bus Driver"
 	depends on 64BIT && PCI && X86
+	select DMA_OPS
 	help
 	  This option is selected by any driver which registers a
 	  device or driver on the MIC Bus, such as CONFIG_INTEL_MIC_HOST,
@@ -19,6 +20,7 @@ config INTEL_MIC_BUS
 config SCIF_BUS
 	tristate "SCIF Bus Driver"
 	depends on 64BIT && PCI && X86
+	select DMA_OPS
 	help
 	  This option is selected by any driver which registers a
 	  device or driver on the SCIF Bus, such as CONFIG_INTEL_MIC_HOST
@@ -33,6 +35,7 @@ config SCIF_BUS
 
 config VOP_BUS
 	tristate "VOP Bus Driver"
+	select DMA_OPS
 	help
 	  This option is selected by any driver which registers a
 	  device or driver on the VOP Bus, such as CONFIG_INTEL_MIC_HOST
@@ -49,6 +52,7 @@ config INTEL_MIC_HOST
 	tristate "Intel MIC Host Driver"
 	depends on 64BIT && PCI && X86
 	depends on INTEL_MIC_BUS && SCIF_BUS && MIC_COSM && VOP_BUS
+	select DMA_OPS
 	help
 	  This enables Host Driver support for the Intel Many Integrated
 	  Core (MIC) family of PCIe form factor coprocessor devices that
diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
index 3e1ceb8e9f2b..d93a69b12f81 100644
--- a/drivers/vdpa/Kconfig
+++ b/drivers/vdpa/Kconfig
@@ -11,6 +11,7 @@ if VDPA
 config VDPA_SIM
 	tristate "vDPA device simulator"
 	depends on RUNTIME_TESTING_MENU && HAS_DMA
+	select DMA_OPS
 	select VHOST_RING
 	default n
 	help
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 727f11eb46b2..1d339ef92422 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -179,6 +179,7 @@ config XEN_GRANT_DMA_ALLOC
 
 config SWIOTLB_XEN
 	def_bool y
+	select DMA_OPS
 	select SWIOTLB
 
 config XEN_PCIDEV_BACKEND
diff --git a/include/linux/device.h b/include/linux/device.h
index 15460a5ac024..4c4af98321eb 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -568,8 +568,9 @@ struct device {
 #ifdef CONFIG_GENERIC_MSI_IRQ
 	struct list_head	msi_list;
 #endif
-
+#ifdef CONFIG_DMA_OPS
 	const struct dma_map_ops *dma_ops;
+#endif
 	u64		*dma_mask;	/* dma mask (if dma'able device) */
 	u64		coherent_dma_mask;/* Like dma_mask, but for
 					     alloc_coherent mappings as
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index bd0a6f5ee445..39da883c8619 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -191,6 +191,7 @@ static inline int dma_mmap_from_global_coherent(struct vm_area_struct *vma,
 #ifdef CONFIG_HAS_DMA
 #include <asm/dma-mapping.h>
 
+#ifdef CONFIG_DMA_OPS
 static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	if (dev->dma_ops)
@@ -203,7 +204,16 @@ static inline void set_dma_ops(struct device *dev,
 {
 	dev->dma_ops = dma_ops;
 }
-
+#else /* CONFIG_DMA_OPS */
+static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
+{
+	return NULL;
+}
+static inline void set_dma_ops(struct device *dev,
+			       const struct dma_map_ops *dma_ops)
+{
+}
+#endif /* CONFIG_DMA_OPS */
 
 static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 1da3f44f2565..5cfb2428593a 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -5,6 +5,9 @@ config HAS_DMA
 	depends on !NO_DMA
 	default y
 
+config DMA_OPS
+	bool
+
 config NEED_SG_DMA_LENGTH
 	bool
 
@@ -60,6 +63,7 @@ config DMA_NONCOHERENT_CACHE_SYNC
 config DMA_VIRT_OPS
 	bool
 	depends on HAS_DMA
+	select DMA_OPS
 
 config SWIOTLB
 	bool
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index 370f63344e9c..32c7c1942bbd 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
-obj-$(CONFIG_HAS_DMA)			+= mapping.o direct.o dummy.o
+obj-$(CONFIG_HAS_DMA)			+= mapping.o direct.o
+obj-$(CONFIG_DMA_OPS)			+= dummy.o
 obj-$(CONFIG_DMA_CMA)			+= contiguous.o
 obj-$(CONFIG_DMA_DECLARE_COHERENT)	+= coherent.o
 obj-$(CONFIG_DMA_VIRT_OPS)		+= virt.o
-- 
cgit v1.2.3


From d35834c64820c7ef397f8a244061d4450720540e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 23 Mar 2020 18:19:30 +0100
Subject: dma-mapping: add a dma_ops_bypass flag to struct device

Several IOMMU drivers have a bypass mode where they can use a direct
mapping if the devices DMA mask is large enough.  Add generic support
to the core dma-mapping code to do that to switch those drivers to
a common solution.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 include/linux/device.h |  8 ++++++
 kernel/dma/Kconfig     |  8 ++++++
 kernel/dma/mapping.c   | 74 +++++++++++++++++++++++++++++++++++---------------
 3 files changed, 68 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/device.h b/include/linux/device.h
index 4c4af98321eb..1f71acf37f78 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -523,6 +523,11 @@ struct dev_links_info {
  *		  sync_state() callback.
  * @dma_coherent: this particular device is dma coherent, even if the
  *		architecture supports non-coherent devices.
+ * @dma_ops_bypass: If set to %true then the dma_ops are bypassed for the
+ *		streaming DMA operations (->map_* / ->unmap_* / ->sync_*),
+ *		and optionall (if the coherent mask is large enough) also
+ *		for dma allocations.  This flag is managed by the dma ops
+ *		instance from ->dma_supported.
  *
  * At the lowest level, every device in a Linux system is represented by an
  * instance of struct device. The device structure contains the information
@@ -623,6 +628,9 @@ struct device {
     defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
 	bool			dma_coherent:1;
 #endif
+#ifdef CONFIG_DMA_OPS_BYPASS
+	bool			dma_ops_bypass : 1;
+#endif
 };
 
 static inline struct device *kobj_to_dev(struct kobject *kobj)
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 5cfb2428593a..f4770fcfa62b 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -8,6 +8,14 @@ config HAS_DMA
 config DMA_OPS
 	bool
 
+#
+# IOMMU drivers that can bypass the IOMMU code and optionally use the direct
+# mapping fast path should select this option and set the dma_ops_bypass
+# flag in struct device where applicable
+#
+config DMA_OPS_BYPASS
+	bool
+
 config NEED_SG_DMA_LENGTH
 	bool
 
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index b53953024512..0d129421e75f 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -105,9 +105,35 @@ void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
 }
 EXPORT_SYMBOL(dmam_alloc_attrs);
 
-static inline bool dma_is_direct(const struct dma_map_ops *ops)
+static bool dma_go_direct(struct device *dev, dma_addr_t mask,
+		const struct dma_map_ops *ops)
 {
-	return likely(!ops);
+	if (likely(!ops))
+		return true;
+#ifdef CONFIG_DMA_OPS_BYPASS
+	if (dev->dma_ops_bypass)
+		return min_not_zero(mask, dev->bus_dma_limit) >=
+			    dma_direct_get_required_mask(dev);
+#endif
+	return false;
+}
+
+
+/*
+ * Check if the devices uses a direct mapping for streaming DMA operations.
+ * This allows IOMMU drivers to set a bypass mode if the DMA mask is large
+ * enough.
+ */
+static inline bool dma_alloc_direct(struct device *dev,
+		const struct dma_map_ops *ops)
+{
+	return dma_go_direct(dev, dev->coherent_dma_mask, ops);
+}
+
+static inline bool dma_map_direct(struct device *dev,
+		const struct dma_map_ops *ops)
+{
+	return dma_go_direct(dev, *dev->dma_mask, ops);
 }
 
 dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
@@ -118,7 +144,7 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
 	dma_addr_t addr;
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (dma_is_direct(ops))
+	if (dma_map_direct(dev, ops))
 		addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
 	else
 		addr = ops->map_page(dev, page, offset, size, dir, attrs);
@@ -134,7 +160,7 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (dma_is_direct(ops))
+	if (dma_map_direct(dev, ops))
 		dma_direct_unmap_page(dev, addr, size, dir, attrs);
 	else if (ops->unmap_page)
 		ops->unmap_page(dev, addr, size, dir, attrs);
@@ -153,7 +179,7 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
 	int ents;
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (dma_is_direct(ops))
+	if (dma_map_direct(dev, ops))
 		ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
 	else
 		ents = ops->map_sg(dev, sg, nents, dir, attrs);
@@ -172,7 +198,7 @@ void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
 
 	BUG_ON(!valid_dma_direction(dir));
 	debug_dma_unmap_sg(dev, sg, nents, dir);
-	if (dma_is_direct(ops))
+	if (dma_map_direct(dev, ops))
 		dma_direct_unmap_sg(dev, sg, nents, dir, attrs);
 	else if (ops->unmap_sg)
 		ops->unmap_sg(dev, sg, nents, dir, attrs);
@@ -191,7 +217,7 @@ dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
 	if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr))))
 		return DMA_MAPPING_ERROR;
 
-	if (dma_is_direct(ops))
+	if (dma_map_direct(dev, ops))
 		addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
 	else if (ops->map_resource)
 		addr = ops->map_resource(dev, phys_addr, size, dir, attrs);
@@ -207,7 +233,7 @@ void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (!dma_is_direct(ops) && ops->unmap_resource)
+	if (!dma_map_direct(dev, ops) && ops->unmap_resource)
 		ops->unmap_resource(dev, addr, size, dir, attrs);
 	debug_dma_unmap_resource(dev, addr, size, dir);
 }
@@ -219,7 +245,7 @@ void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (dma_is_direct(ops))
+	if (dma_map_direct(dev, ops))
 		dma_direct_sync_single_for_cpu(dev, addr, size, dir);
 	else if (ops->sync_single_for_cpu)
 		ops->sync_single_for_cpu(dev, addr, size, dir);
@@ -233,7 +259,7 @@ void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (dma_is_direct(ops))
+	if (dma_map_direct(dev, ops))
 		dma_direct_sync_single_for_device(dev, addr, size, dir);
 	else if (ops->sync_single_for_device)
 		ops->sync_single_for_device(dev, addr, size, dir);
@@ -247,7 +273,7 @@ void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (dma_is_direct(ops))
+	if (dma_map_direct(dev, ops))
 		dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir);
 	else if (ops->sync_sg_for_cpu)
 		ops->sync_sg_for_cpu(dev, sg, nelems, dir);
@@ -261,7 +287,7 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (dma_is_direct(ops))
+	if (dma_map_direct(dev, ops))
 		dma_direct_sync_sg_for_device(dev, sg, nelems, dir);
 	else if (ops->sync_sg_for_device)
 		ops->sync_sg_for_device(dev, sg, nelems, dir);
@@ -302,7 +328,7 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt,
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
-	if (dma_is_direct(ops))
+	if (dma_alloc_direct(dev, ops))
 		return dma_direct_get_sgtable(dev, sgt, cpu_addr, dma_addr,
 				size, attrs);
 	if (!ops->get_sgtable)
@@ -372,7 +398,7 @@ bool dma_can_mmap(struct device *dev)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
-	if (dma_is_direct(ops))
+	if (dma_alloc_direct(dev, ops))
 		return dma_direct_can_mmap(dev);
 	return ops->mmap != NULL;
 }
@@ -397,7 +423,7 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
-	if (dma_is_direct(ops))
+	if (dma_alloc_direct(dev, ops))
 		return dma_direct_mmap(dev, vma, cpu_addr, dma_addr, size,
 				attrs);
 	if (!ops->mmap)
@@ -410,7 +436,7 @@ u64 dma_get_required_mask(struct device *dev)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
-	if (dma_is_direct(ops))
+	if (dma_alloc_direct(dev, ops))
 		return dma_direct_get_required_mask(dev);
 	if (ops->get_required_mask)
 		return ops->get_required_mask(dev);
@@ -441,7 +467,7 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
 	/* let the implementation decide on the zone to allocate from: */
 	flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
 
-	if (dma_is_direct(ops))
+	if (dma_alloc_direct(dev, ops))
 		cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs);
 	else if (ops->alloc)
 		cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
@@ -473,7 +499,7 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
 		return;
 
 	debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-	if (dma_is_direct(ops))
+	if (dma_alloc_direct(dev, ops))
 		dma_direct_free(dev, size, cpu_addr, dma_handle, attrs);
 	else if (ops->free)
 		ops->free(dev, size, cpu_addr, dma_handle, attrs);
@@ -484,7 +510,11 @@ int dma_supported(struct device *dev, u64 mask)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
-	if (dma_is_direct(ops))
+	/*
+	 * ->dma_supported sets the bypass flag, so we must always call
+	 * into the method here unless the device is truly direct mapped.
+	 */
+	if (!ops)
 		return dma_direct_supported(dev, mask);
 	if (!ops->dma_supported)
 		return 1;
@@ -540,7 +570,7 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 
 	BUG_ON(!valid_dma_direction(dir));
 
-	if (dma_is_direct(ops))
+	if (dma_alloc_direct(dev, ops))
 		arch_dma_cache_sync(dev, vaddr, size, dir);
 	else if (ops->cache_sync)
 		ops->cache_sync(dev, vaddr, size, dir);
@@ -552,7 +582,7 @@ size_t dma_max_mapping_size(struct device *dev)
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 	size_t size = SIZE_MAX;
 
-	if (dma_is_direct(ops))
+	if (dma_map_direct(dev, ops))
 		size = dma_direct_max_mapping_size(dev);
 	else if (ops && ops->max_mapping_size)
 		size = ops->max_mapping_size(dev);
@@ -565,7 +595,7 @@ bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
-	if (dma_is_direct(ops))
+	if (dma_map_direct(dev, ops))
 		return dma_direct_need_sync(dev, dma_addr);
 	return ops->sync_single_for_cpu || ops->sync_single_for_device;
 }
-- 
cgit v1.2.3


From 23efed6fa7519e29a01f12723b0cfa8118dc87cb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 16 Jul 2020 17:00:16 +0200
Subject: dma-debug: use named initializers for dir2name

Make dir2name a little more readable and maintainable by using
named initializers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
---
 kernel/dma/debug.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 36c962a86bf2..4dd0d9028ec8 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -144,8 +144,12 @@ static const char *type2name[] = {
 	[dma_debug_resource] = "resource",
 };
 
-static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE",
-				   "DMA_FROM_DEVICE", "DMA_NONE" };
+static const char *dir2name[] = {
+	[DMA_BIDIRECTIONAL]	= "DMA_BIDIRECTIONAL",
+	[DMA_TO_DEVICE]		= "DMA_TO_DEVICE",
+	[DMA_FROM_DEVICE]	= "DMA_FROM_DEVICE",
+	[DMA_NONE]		= "DMA_NONE",
+};
 
 /*
  * The access to some variables in this macro is racy. We can't use atomic_t
-- 
cgit v1.2.3


From 1caef81da05a84a40dbf02110e967ce6d1135ff6 Mon Sep 17 00:00:00 2001
From: Adrian Reber <areber@redhat.com>
Date: Sun, 19 Jul 2020 12:04:12 +0200
Subject: pid: use checkpoint_restore_ns_capable() for set_tid

Use the newly introduced capability CAP_CHECKPOINT_RESTORE to allow
using clone3() with set_tid set.

Signed-off-by: Adrian Reber <areber@redhat.com>
Signed-off-by: Nicolas Viennot <Nicolas.Viennot@twosigma.com>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Link: https://lore.kernel.org/r/20200719100418.2112740-3-areber@redhat.com
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 kernel/pid.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index f1496b757162..450d40469b1c 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -198,7 +198,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 			if (tid != 1 && !tmp->child_reaper)
 				goto out_free;
 			retval = -EPERM;
-			if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN))
+			if (!checkpoint_restore_ns_capable(tmp->user_ns))
 				goto out_free;
 			set_tid_size--;
 		}
-- 
cgit v1.2.3


From b9a3db92e1a1f281724e2f34b849d18d1a53bece Mon Sep 17 00:00:00 2001
From: Adrian Reber <areber@redhat.com>
Date: Sun, 19 Jul 2020 12:04:13 +0200
Subject: pid_namespace: use checkpoint_restore_ns_capable() for ns_last_pid

Use the newly introduced capability CAP_CHECKPOINT_RESTORE to allow
writing to ns_last_pid.

Signed-off-by: Adrian Reber <areber@redhat.com>
Signed-off-by: Nicolas Viennot <Nicolas.Viennot@twosigma.com>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Link: https://lore.kernel.org/r/20200719100418.2112740-4-areber@redhat.com
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 kernel/pid_namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 0e5ac162c3a8..ac135bd600eb 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -269,7 +269,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
 	struct ctl_table tmp = *table;
 	int ret, next;
 
-	if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
+	if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns))
 		return -EPERM;
 
 	/*
-- 
cgit v1.2.3


From ebd6de6812387a2db9a52842cfbe004da1dd3be8 Mon Sep 17 00:00:00 2001
From: Nicolas Viennot <Nicolas.Viennot@twosigma.com>
Date: Sun, 19 Jul 2020 12:04:15 +0200
Subject: prctl: Allow local CAP_CHECKPOINT_RESTORE to change /proc/self/exe

Originally, only a local CAP_SYS_ADMIN could change the exe link,
making it difficult for doing checkpoint/restore without CAP_SYS_ADMIN.
This commit adds CAP_CHECKPOINT_RESTORE in addition to CAP_SYS_ADMIN
for permitting changing the exe link.

The following describes the history of the /proc/self/exe permission
checks as it may be difficult to understand what decisions lead to this
point.

* [1] May 2012: This commit introduces the ability of changing
  /proc/self/exe if the user is CAP_SYS_RESOURCE capable.
  In the related discussion [2], no clear thread model is presented for
  what could happen if the /proc/self/exe changes multiple times, or why
  would the admin be at the mercy of userspace.

* [3] Oct 2014: This commit introduces a new API to change
  /proc/self/exe. The permission no longer checks for CAP_SYS_RESOURCE,
  but instead checks if the current user is root (uid=0) in its local
  namespace. In the related discussion [4] it is said that "Controlling
  exe_fd without privileges may turn out to be dangerous. At least
  things like tomoyo examine it for making policy decisions (see
  tomoyo_manager())."

* [5] Dec 2016: This commit removes the restriction to change
  /proc/self/exe at most once. The related discussion [6] informs that
  the audit subsystem relies on the exe symlink, presumably
  audit_log_d_path_exe() in kernel/audit.c.

* [7] May 2017: This commit changed the check from uid==0 to local
  CAP_SYS_ADMIN. No discussion.

* [8] July 2020: A PoC to spoof any program's /proc/self/exe via ptrace
  is demonstrated

Overall, the concrete points that were made to retain capability checks
around changing the exe symlink is that tomoyo_manager() and
audit_log_d_path_exe() uses the exe_file path.

Christian Brauner said that relying on /proc/<pid>/exe being immutable (or
guarded by caps) in a sake of security is a bit misleading. It can only
be used as a hint without any guarantees of what code is being executed
once execve() returns to userspace. Christian suggested that in the
future, we could call audit_log() or similar to inform the admin of all
exe link changes, instead of attempting to provide security guarantees
via permission checks. However, this proposed change requires the
understanding of the security implications in the tomoyo/audit subsystems.

[1] b32dfe377102 ("c/r: prctl: add ability to set new mm_struct::exe_file")
[2] https://lore.kernel.org/patchwork/patch/292515/
[3] f606b77f1a9e ("prctl: PR_SET_MM -- introduce PR_SET_MM_MAP operation")
[4] https://lore.kernel.org/patchwork/patch/479359/
[5] 3fb4afd9a504 ("prctl: remove one-shot limitation for changing exe link")
[6] https://lore.kernel.org/patchwork/patch/697304/
[7] 4d28df6152aa ("prctl: Allow local CAP_SYS_ADMIN changing exe_file")
[8] https://github.com/nviennot/run_as_exe

Signed-off-by: Nicolas Viennot <Nicolas.Viennot@twosigma.com>
Signed-off-by: Adrian Reber <areber@redhat.com>
Link: https://lore.kernel.org/r/20200719100418.2112740-6-areber@redhat.com
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 kernel/sys.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 00a96746e28a..a3f4ef0bbda3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2007,11 +2007,14 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
 
 	if (prctl_map.exe_fd != (u32)-1) {
 		/*
-		 * Make sure the caller has the rights to
-		 * change /proc/pid/exe link: only local sys admin should
-		 * be allowed to.
+		 * Check if the current user is checkpoint/restore capable.
+		 * At the time of this writing, it checks for CAP_SYS_ADMIN
+		 * or CAP_CHECKPOINT_RESTORE.
+		 * Note that a user with access to ptrace can masquerade an
+		 * arbitrary program as any executable, even setuid ones.
+		 * This may have implications in the tomoyo subsystem.
 		 */
-		if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+		if (!checkpoint_restore_ns_capable(current_user_ns()))
 			return -EINVAL;
 
 		error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
-- 
cgit v1.2.3


From 227175b2c914bfa4a9aa5b210c7cabd42c5858ac Mon Sep 17 00:00:00 2001
From: Nicolas Viennot <Nicolas.Viennot@twosigma.com>
Date: Sun, 19 Jul 2020 12:04:16 +0200
Subject: prctl: exe link permission error changed from -EINVAL to -EPERM

This brings consistency with the rest of the prctl() syscall where
-EPERM is returned when failing a capability check.

Signed-off-by: Nicolas Viennot <Nicolas.Viennot@twosigma.com>
Signed-off-by: Adrian Reber <areber@redhat.com>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
Link: https://lore.kernel.org/r/20200719100418.2112740-7-areber@redhat.com
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 kernel/sys.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index a3f4ef0bbda3..ca11af9d815d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2015,7 +2015,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
 		 * This may have implications in the tomoyo subsystem.
 		 */
 		if (!checkpoint_restore_ns_capable(current_user_ns()))
-			return -EINVAL;
+			return -EPERM;
 
 		error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
 		if (error)
-- 
cgit v1.2.3


From 1b86abc1c645ad5c9c7bf70910cb3ce73939d2d7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 16 Jul 2020 13:11:24 +0800
Subject: sched_clock: Expose struct clock_read_data

In order to support perf_event_mmap_page::cap_time features, an
architecture needs, aside from a userspace readable counter register,
to expose the exact clock data so that userspace can convert the
counter register into a correct timestamp.

Provide struct clock_read_data and two (seqcount) helpers so that
architectures (arm64 in specific) can expose the numbers to userspace.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Leo Yan <leo.yan@linaro.org>
Link: https://lore.kernel.org/r/20200716051130.4359-2-leo.yan@linaro.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/sched_clock.h | 28 ++++++++++++++++++++++++++++
 kernel/time/sched_clock.c   | 41 +++++++++++++----------------------------
 2 files changed, 41 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched_clock.h b/include/linux/sched_clock.h
index 0bb04a96a6d4..528718e4ed52 100644
--- a/include/linux/sched_clock.h
+++ b/include/linux/sched_clock.h
@@ -6,6 +6,34 @@
 #define LINUX_SCHED_CLOCK
 
 #ifdef CONFIG_GENERIC_SCHED_CLOCK
+/**
+ * struct clock_read_data - data required to read from sched_clock()
+ *
+ * @epoch_ns:		sched_clock() value at last update
+ * @epoch_cyc:		Clock cycle value at last update.
+ * @sched_clock_mask:   Bitmask for two's complement subtraction of non 64bit
+ *			clocks.
+ * @read_sched_clock:	Current clock source (or dummy source when suspended).
+ * @mult:		Multipler for scaled math conversion.
+ * @shift:		Shift value for scaled math conversion.
+ *
+ * Care must be taken when updating this structure; it is read by
+ * some very hot code paths. It occupies <=40 bytes and, when combined
+ * with the seqcount used to synchronize access, comfortably fits into
+ * a 64 byte cache line.
+ */
+struct clock_read_data {
+	u64 epoch_ns;
+	u64 epoch_cyc;
+	u64 sched_clock_mask;
+	u64 (*read_sched_clock)(void);
+	u32 mult;
+	u32 shift;
+};
+
+extern struct clock_read_data *sched_clock_read_begin(unsigned int *seq);
+extern int sched_clock_read_retry(unsigned int seq);
+
 extern void generic_sched_clock_init(void);
 
 extern void sched_clock_register(u64 (*read)(void), int bits,
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index fa3f800d7d76..0acaadc3156c 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -19,31 +19,6 @@
 
 #include "timekeeping.h"
 
-/**
- * struct clock_read_data - data required to read from sched_clock()
- *
- * @epoch_ns:		sched_clock() value at last update
- * @epoch_cyc:		Clock cycle value at last update.
- * @sched_clock_mask:   Bitmask for two's complement subtraction of non 64bit
- *			clocks.
- * @read_sched_clock:	Current clock source (or dummy source when suspended).
- * @mult:		Multipler for scaled math conversion.
- * @shift:		Shift value for scaled math conversion.
- *
- * Care must be taken when updating this structure; it is read by
- * some very hot code paths. It occupies <=40 bytes and, when combined
- * with the seqcount used to synchronize access, comfortably fits into
- * a 64 byte cache line.
- */
-struct clock_read_data {
-	u64 epoch_ns;
-	u64 epoch_cyc;
-	u64 sched_clock_mask;
-	u64 (*read_sched_clock)(void);
-	u32 mult;
-	u32 shift;
-};
-
 /**
  * struct clock_data - all data needed for sched_clock() (including
  *                     registration of a new clock source)
@@ -93,6 +68,17 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
 	return (cyc * mult) >> shift;
 }
 
+struct clock_read_data *sched_clock_read_begin(unsigned int *seq)
+{
+	*seq = raw_read_seqcount(&cd.seq);
+	return cd.read_data + (*seq & 1);
+}
+
+int sched_clock_read_retry(unsigned int seq)
+{
+	return read_seqcount_retry(&cd.seq, seq);
+}
+
 unsigned long long notrace sched_clock(void)
 {
 	u64 cyc, res;
@@ -100,13 +86,12 @@ unsigned long long notrace sched_clock(void)
 	struct clock_read_data *rd;
 
 	do {
-		seq = raw_read_seqcount(&cd.seq);
-		rd = cd.read_data + (seq & 1);
+		rd = sched_clock_read_begin(&seq);
 
 		cyc = (rd->read_sched_clock() - rd->epoch_cyc) &
 		      rd->sched_clock_mask;
 		res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift);
-	} while (read_seqcount_retry(&cd.seq, seq));
+	} while (sched_clock_read_retry(seq));
 
 	return res;
 }
-- 
cgit v1.2.3


From aadd6e5caaacd6feca9691ba30536e7de5a7d152 Mon Sep 17 00:00:00 2001
From: "Ahmed S. Darwish" <a.darwish@linutronix.de>
Date: Thu, 16 Jul 2020 13:11:25 +0800
Subject: time/sched_clock: Use raw_read_seqcount_latch()

sched_clock uses seqcount_t latching to switch between two storage
places protected by the sequence counter. This allows it to have
interruptible, NMI-safe, seqcount_t write side critical sections.

Since 7fc26327b756 ("seqlock: Introduce raw_read_seqcount_latch()"),
raw_read_seqcount_latch() became the standardized way for seqcount_t
latch read paths. Due to the dependent load, it also has one read
memory barrier less than the currently used raw_read_seqcount() API.

Use raw_read_seqcount_latch() for the seqcount_t latch read path.

Signed-off-by: Ahmed S. Darwish <a.darwish@linutronix.de>
Signed-off-by: Leo Yan <leo.yan@linaro.org>
Link: https://lkml.kernel.org/r/20200625085745.GD117543@hirez.programming.kicks-ass.net
Link: https://lkml.kernel.org/r/20200715092345.GA231464@debian-buster-darwi.lab.linutronix.de
Link: https://lore.kernel.org/r/20200716051130.4359-3-leo.yan@linaro.org
References: 1809bfa44e10 ("timers, sched/clock: Avoid deadlock during read from NMI")
Signed-off-by: Will Deacon <will@kernel.org>
---
 kernel/time/sched_clock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 0acaadc3156c..0deaf4b79fb4 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -70,7 +70,7 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
 
 struct clock_read_data *sched_clock_read_begin(unsigned int *seq)
 {
-	*seq = raw_read_seqcount(&cd.seq);
+	*seq = raw_read_seqcount_latch(&cd.seq);
 	return cd.read_data + (*seq & 1);
 }
 
-- 
cgit v1.2.3


From 4834177e633258fbf3c5754b1220f01c705b79eb Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.microsoft.com>
Date: Thu, 9 Jul 2020 01:19:11 -0500
Subject: ima: Support additional conditionals in the KEXEC_CMDLINE hook
 function

Take the properties of the kexec kernel's inode and the current task
ownership into consideration when matching a KEXEC_CMDLINE operation to
the rules in the IMA policy. This allows for some uniformity when
writing IMA policy rules for KEXEC_KERNEL_CHECK, KEXEC_INITRAMFS_CHECK,
and KEXEC_CMDLINE operations.

Prior to this patch, it was not possible to write a set of rules like
this:

 dont_measure func=KEXEC_KERNEL_CHECK obj_type=foo_t
 dont_measure func=KEXEC_INITRAMFS_CHECK obj_type=foo_t
 dont_measure func=KEXEC_CMDLINE obj_type=foo_t
 measure func=KEXEC_KERNEL_CHECK
 measure func=KEXEC_INITRAMFS_CHECK
 measure func=KEXEC_CMDLINE

The inode information associated with the kernel being loaded by a
kexec_kernel_load(2) syscall can now be included in the decision to
measure or not

Additonally, the uid, euid, and subj_* conditionals can also now be
used in KEXEC_CMDLINE rules. There was no technical reason as to why
those conditionals weren't being considered previously other than
ima_match_rules() didn't have a valid inode to use so it immediately
bailed out for KEXEC_CMDLINE operations rather than going through the
full list of conditional comparisons.

Signed-off-by: Tyler Hicks <tyhicks@linux.microsoft.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: kexec@lists.infradead.org
Reviewed-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 include/linux/ima.h                          |  4 ++--
 kernel/kexec_file.c                          |  2 +-
 security/integrity/ima/ima.h                 |  2 +-
 security/integrity/ima/ima_api.c             |  2 +-
 security/integrity/ima/ima_appraise.c        |  2 +-
 security/integrity/ima/ima_asymmetric_keys.c |  2 +-
 security/integrity/ima/ima_main.c            | 23 +++++++++++++++++------
 security/integrity/ima/ima_policy.c          | 17 ++++++-----------
 security/integrity/ima/ima_queue_keys.c      |  2 +-
 9 files changed, 31 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ima.h b/include/linux/ima.h
index 9164e1534ec9..d15100de6cdd 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -25,7 +25,7 @@ extern int ima_post_read_file(struct file *file, void *buf, loff_t size,
 			      enum kernel_read_file_id id);
 extern void ima_post_path_mknod(struct dentry *dentry);
 extern int ima_file_hash(struct file *file, char *buf, size_t buf_size);
-extern void ima_kexec_cmdline(const void *buf, int size);
+extern void ima_kexec_cmdline(int kernel_fd, const void *buf, int size);
 
 #ifdef CONFIG_IMA_KEXEC
 extern void ima_add_kexec_buffer(struct kimage *image);
@@ -103,7 +103,7 @@ static inline int ima_file_hash(struct file *file, char *buf, size_t buf_size)
 	return -EOPNOTSUPP;
 }
 
-static inline void ima_kexec_cmdline(const void *buf, int size) {}
+static inline void ima_kexec_cmdline(int kernel_fd, const void *buf, int size) {}
 #endif /* CONFIG_IMA */
 
 #ifndef CONFIG_IMA_KEXEC
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index bb05fd52de85..07df431c1f21 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -287,7 +287,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
 			goto out;
 		}
 
-		ima_kexec_cmdline(image->cmdline_buf,
+		ima_kexec_cmdline(kernel_fd, image->cmdline_buf,
 				  image->cmdline_buf_len - 1);
 	}
 
diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index ea7e77536f3c..576ae2c6d418 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -265,7 +265,7 @@ void ima_store_measurement(struct integrity_iint_cache *iint, struct file *file,
 			   struct evm_ima_xattr_data *xattr_value,
 			   int xattr_len, const struct modsig *modsig, int pcr,
 			   struct ima_template_desc *template_desc);
-void process_buffer_measurement(const void *buf, int size,
+void process_buffer_measurement(struct inode *inode, const void *buf, int size,
 				const char *eventname, enum ima_hooks func,
 				int pcr, const char *keyring);
 void ima_audit_measurement(struct integrity_iint_cache *iint,
diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c
index bf22de8b7ce0..4f39fb93f278 100644
--- a/security/integrity/ima/ima_api.c
+++ b/security/integrity/ima/ima_api.c
@@ -162,7 +162,7 @@ err_out:
 
 /**
  * ima_get_action - appraise & measure decision based on policy.
- * @inode: pointer to inode to measure
+ * @inode: pointer to the inode associated with the object being validated
  * @cred: pointer to credentials structure to validate
  * @secid: secid of the task being validated
  * @mask: contains the permission mask (MAY_READ, MAY_WRITE, MAY_EXEC,
diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c
index a9649b04b9f1..6c52bf7ea7f0 100644
--- a/security/integrity/ima/ima_appraise.c
+++ b/security/integrity/ima/ima_appraise.c
@@ -328,7 +328,7 @@ int ima_check_blacklist(struct integrity_iint_cache *iint,
 
 		rc = is_binary_blacklisted(digest, digestsize);
 		if ((rc == -EPERM) && (iint->flags & IMA_MEASURE))
-			process_buffer_measurement(digest, digestsize,
+			process_buffer_measurement(NULL, digest, digestsize,
 						   "blacklisted-hash", NONE,
 						   pcr, NULL);
 	}
diff --git a/security/integrity/ima/ima_asymmetric_keys.c b/security/integrity/ima/ima_asymmetric_keys.c
index aaae80c4e376..1c68c500c26f 100644
--- a/security/integrity/ima/ima_asymmetric_keys.c
+++ b/security/integrity/ima/ima_asymmetric_keys.c
@@ -58,7 +58,7 @@ void ima_post_key_create_or_update(struct key *keyring, struct key *key,
 	 * if the IMA policy is configured to measure a key linked
 	 * to the given keyring.
 	 */
-	process_buffer_measurement(payload, payload_len,
+	process_buffer_measurement(NULL, payload, payload_len,
 				   keyring->description, KEY_CHECK, 0,
 				   keyring->description);
 }
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 8351b2fd48e0..8a91711ca79b 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -726,6 +726,7 @@ int ima_load_data(enum kernel_load_data_id id)
 
 /*
  * process_buffer_measurement - Measure the buffer to ima log.
+ * @inode: inode associated with the object being measured (NULL for KEY_CHECK)
  * @buf: pointer to the buffer that needs to be added to the log.
  * @size: size of buffer(in bytes).
  * @eventname: event name to be used for the buffer entry.
@@ -735,7 +736,7 @@ int ima_load_data(enum kernel_load_data_id id)
  *
  * Based on policy, the buffer is measured into the ima log.
  */
-void process_buffer_measurement(const void *buf, int size,
+void process_buffer_measurement(struct inode *inode, const void *buf, int size,
 				const char *eventname, enum ima_hooks func,
 				int pcr, const char *keyring)
 {
@@ -768,7 +769,7 @@ void process_buffer_measurement(const void *buf, int size,
 	 */
 	if (func) {
 		security_task_getsecid(current, &secid);
-		action = ima_get_action(NULL, current_cred(), secid, 0, func,
+		action = ima_get_action(inode, current_cred(), secid, 0, func,
 					&pcr, &template, keyring);
 		if (!(action & IMA_MEASURE))
 			return;
@@ -823,16 +824,26 @@ out:
 
 /**
  * ima_kexec_cmdline - measure kexec cmdline boot args
+ * @kernel_fd: file descriptor of the kexec kernel being loaded
  * @buf: pointer to buffer
  * @size: size of buffer
  *
  * Buffers can only be measured, not appraised.
  */
-void ima_kexec_cmdline(const void *buf, int size)
+void ima_kexec_cmdline(int kernel_fd, const void *buf, int size)
 {
-	if (buf && size != 0)
-		process_buffer_measurement(buf, size, "kexec-cmdline",
-					   KEXEC_CMDLINE, 0, NULL);
+	struct fd f;
+
+	if (!buf || !size)
+		return;
+
+	f = fdget(kernel_fd);
+	if (!f.file)
+		return;
+
+	process_buffer_measurement(file_inode(f.file), buf, size,
+				   "kexec-cmdline", KEXEC_CMDLINE, 0, NULL);
+	fdput(f);
 }
 
 static int __init init_ima(void)
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index dcd1aaac4ff0..9284055ee13a 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -443,13 +443,9 @@ static bool ima_match_rules(struct ima_rule_entry *rule, struct inode *inode,
 {
 	int i;
 
-	if ((func == KEXEC_CMDLINE) || (func == KEY_CHECK)) {
-		if ((rule->flags & IMA_FUNC) && (rule->func == func)) {
-			if (func == KEY_CHECK)
-				return ima_match_keyring(rule, keyring, cred);
-			return true;
-		}
-		return false;
+	if (func == KEY_CHECK) {
+		return (rule->flags & IMA_FUNC) && (rule->func == func) &&
+		       ima_match_keyring(rule, keyring, cred);
 	}
 	if ((rule->flags & IMA_FUNC) &&
 	    (rule->func != func && func != POST_SETATTR))
@@ -1035,10 +1031,9 @@ static bool ima_validate_rule(struct ima_rule_entry *entry)
 		if (entry->action & ~(MEASURE | DONT_MEASURE))
 			return false;
 
-		if (entry->flags & ~(IMA_FUNC | IMA_PCR))
-			return false;
-
-		if (ima_rule_contains_lsm_cond(entry))
+		if (entry->flags & ~(IMA_FUNC | IMA_FSMAGIC | IMA_UID |
+				     IMA_FOWNER | IMA_FSUUID | IMA_EUID |
+				     IMA_PCR | IMA_FSNAME))
 			return false;
 
 		break;
diff --git a/security/integrity/ima/ima_queue_keys.c b/security/integrity/ima/ima_queue_keys.c
index 56ce24a18b66..69a8626a35c0 100644
--- a/security/integrity/ima/ima_queue_keys.c
+++ b/security/integrity/ima/ima_queue_keys.c
@@ -158,7 +158,7 @@ void ima_process_queued_keys(void)
 
 	list_for_each_entry_safe(entry, tmp, &ima_keys, list) {
 		if (!timer_expired)
-			process_buffer_measurement(entry->payload,
+			process_buffer_measurement(NULL, entry->payload,
 						   entry->payload_len,
 						   entry->keyring_name,
 						   KEY_CHECK, 0,
-- 
cgit v1.2.3


From be619f7f063a49c656f620a46af4f8ea3e759e91 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 13 Jul 2020 12:06:48 -0500
Subject: exec: Implement kernel_execve

To allow the kernel not to play games with set_fs to call exec
implement kernel_execve.  The function kernel_execve takes pointers
into kernel memory and copies the values pointed to onto the new
userspace stack.

The calls with arguments from kernel space of do_execve are replaced
with calls to kernel_execve.

The calls do_execve and do_execveat are made static as there are now
no callers outside of exec.

The comments that mention do_execve are updated to refer to
kernel_execve or execve depending on the circumstances.  In addition
to correcting the comments, this makes it easy to grep for do_execve
and verify it is not used.

Inspired-by: https://lkml.kernel.org/r/20200627072704.2447163-1-hch@lst.de
Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/87wo365ikj.fsf@x220.int.ebiederm.org
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 arch/x86/entry/entry_32.S      |  2 +-
 arch/x86/entry/entry_64.S      |  2 +-
 arch/x86/kernel/unwind_frame.c |  2 +-
 fs/exec.c                      | 88 +++++++++++++++++++++++++++++++++++++++++-
 include/linux/binfmts.h        |  9 +----
 init/main.c                    |  4 +-
 kernel/umh.c                   |  6 +--
 security/tomoyo/common.h       |  2 +-
 security/tomoyo/domain.c       |  4 +-
 security/tomoyo/tomoyo.c       |  4 +-
 10 files changed, 100 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 024d7d276cd4..8f4e085ee06d 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -854,7 +854,7 @@ SYM_CODE_START(ret_from_fork)
 	CALL_NOSPEC ebx
 	/*
 	 * A kernel thread is allowed to return here after successfully
-	 * calling do_execve().  Exit to userspace to complete the execve()
+	 * calling kernel_execve().  Exit to userspace to complete the execve()
 	 * syscall.
 	 */
 	movl	$0, PT_EAX(%esp)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index d2a00c97e53f..73c7e255256b 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -293,7 +293,7 @@ SYM_CODE_START(ret_from_fork)
 	CALL_NOSPEC rbx
 	/*
 	 * A kernel thread is allowed to return here after successfully
-	 * calling do_execve().  Exit to userspace to complete the execve()
+	 * calling kernel_execve().  Exit to userspace to complete the execve()
 	 * syscall.
 	 */
 	movq	$0, RAX(%rsp)
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 722a85f3b2dd..e40b4942157f 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -275,7 +275,7 @@ bool unwind_next_frame(struct unwind_state *state)
 		 * This user_mode() check is slightly broader than a PF_KTHREAD
 		 * check because it also catches the awkward situation where a
 		 * newly forked kthread transitions into a user task by calling
-		 * do_execve(), which eventually clears PF_KTHREAD.
+		 * kernel_execve(), which eventually clears PF_KTHREAD.
 		 */
 		if (!user_mode(regs))
 			goto the_end;
diff --git a/fs/exec.c b/fs/exec.c
index f8135dc149b3..3698252719a3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -448,6 +448,23 @@ static int count(struct user_arg_ptr argv, int max)
 	return i;
 }
 
+static int count_strings_kernel(const char *const *argv)
+{
+	int i;
+
+	if (!argv)
+		return 0;
+
+	for (i = 0; argv[i]; ++i) {
+		if (i >= MAX_ARG_STRINGS)
+			return -E2BIG;
+		if (fatal_signal_pending(current))
+			return -ERESTARTNOHAND;
+		cond_resched();
+	}
+	return i;
+}
+
 static int bprm_stack_limits(struct linux_binprm *bprm)
 {
 	unsigned long limit, ptr_size;
@@ -624,6 +641,20 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
 }
 EXPORT_SYMBOL(copy_string_kernel);
 
+static int copy_strings_kernel(int argc, const char *const *argv,
+			       struct linux_binprm *bprm)
+{
+	while (argc-- > 0) {
+		int ret = copy_string_kernel(argv[argc], bprm);
+		if (ret < 0)
+			return ret;
+		if (fatal_signal_pending(current))
+			return -ERESTARTNOHAND;
+		cond_resched();
+	}
+	return 0;
+}
+
 #ifdef CONFIG_MMU
 
 /*
@@ -1991,7 +2022,60 @@ out_ret:
 	return retval;
 }
 
-int do_execve(struct filename *filename,
+int kernel_execve(const char *kernel_filename,
+		  const char *const *argv, const char *const *envp)
+{
+	struct filename *filename;
+	struct linux_binprm *bprm;
+	int fd = AT_FDCWD;
+	int retval;
+
+	filename = getname_kernel(kernel_filename);
+	if (IS_ERR(filename))
+		return PTR_ERR(filename);
+
+	bprm = alloc_bprm(fd, filename);
+	if (IS_ERR(bprm)) {
+		retval = PTR_ERR(bprm);
+		goto out_ret;
+	}
+
+	retval = count_strings_kernel(argv);
+	if (retval < 0)
+		goto out_free;
+	bprm->argc = retval;
+
+	retval = count_strings_kernel(envp);
+	if (retval < 0)
+		goto out_free;
+	bprm->envc = retval;
+
+	retval = bprm_stack_limits(bprm);
+	if (retval < 0)
+		goto out_free;
+
+	retval = copy_string_kernel(bprm->filename, bprm);
+	if (retval < 0)
+		goto out_free;
+	bprm->exec = bprm->p;
+
+	retval = copy_strings_kernel(bprm->envc, envp, bprm);
+	if (retval < 0)
+		goto out_free;
+
+	retval = copy_strings_kernel(bprm->argc, argv, bprm);
+	if (retval < 0)
+		goto out_free;
+
+	retval = bprm_execve(bprm, fd, filename, 0);
+out_free:
+	free_bprm(bprm);
+out_ret:
+	putname(filename);
+	return retval;
+}
+
+static int do_execve(struct filename *filename,
 	const char __user *const __user *__argv,
 	const char __user *const __user *__envp)
 {
@@ -2000,7 +2084,7 @@ int do_execve(struct filename *filename,
 	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
 }
 
-int do_execveat(int fd, struct filename *filename,
+static int do_execveat(int fd, struct filename *filename,
 		const char __user *const __user *__argv,
 		const char __user *const __user *__envp,
 		int flags)
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 8e9e1b0c8eb8..0571701ab1c5 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -135,12 +135,7 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm);
 extern void set_binfmt(struct linux_binfmt *new);
 extern ssize_t read_code(struct file *, unsigned long, loff_t, size_t);
 
-extern int do_execve(struct filename *,
-		     const char __user * const __user *,
-		     const char __user * const __user *);
-extern int do_execveat(int, struct filename *,
-		       const char __user * const __user *,
-		       const char __user * const __user *,
-		       int);
+int kernel_execve(const char *filename,
+		  const char *const *argv, const char *const *envp);
 
 #endif /* _LINUX_BINFMTS_H */
diff --git a/init/main.c b/init/main.c
index 0ead83e86b5a..78ccec5c28f3 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1329,9 +1329,7 @@ static int run_init_process(const char *init_filename)
 	pr_debug("  with environment:\n");
 	for (p = envp_init; *p; p++)
 		pr_debug("    %s\n", *p);
-	return do_execve(getname_kernel(init_filename),
-		(const char __user *const __user *)argv_init,
-		(const char __user *const __user *)envp_init);
+	return kernel_execve(init_filename, argv_init, envp_init);
 }
 
 static int try_to_run_init_process(const char *init_filename)
diff --git a/kernel/umh.c b/kernel/umh.c
index 6ca2096298b9..a25433f9cd9a 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -98,9 +98,9 @@ static int call_usermodehelper_exec_async(void *data)
 
 	commit_creds(new);
 
-	retval = do_execve(getname_kernel(sub_info->path),
-			   (const char __user *const __user *)sub_info->argv,
-			   (const char __user *const __user *)sub_info->envp);
+	retval = kernel_execve(sub_info->path,
+			       (const char *const *)sub_info->argv,
+			       (const char *const *)sub_info->envp);
 out:
 	sub_info->retval = retval;
 	/*
diff --git a/security/tomoyo/common.h b/security/tomoyo/common.h
index 050473df5809..85246b9df7ca 100644
--- a/security/tomoyo/common.h
+++ b/security/tomoyo/common.h
@@ -425,7 +425,7 @@ struct tomoyo_request_info {
 	struct tomoyo_obj_info *obj;
 	/*
 	 * For holding parameters specific to execve() request.
-	 * NULL if not dealing do_execve().
+	 * NULL if not dealing execve().
 	 */
 	struct tomoyo_execve *ee;
 	struct tomoyo_domain_info *domain;
diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c
index 7869d6a9980b..53b3e1f5f227 100644
--- a/security/tomoyo/domain.c
+++ b/security/tomoyo/domain.c
@@ -767,7 +767,7 @@ retry:
 
 	/*
 	 * Check for domain transition preference if "file execute" matched.
-	 * If preference is given, make do_execve() fail if domain transition
+	 * If preference is given, make execve() fail if domain transition
 	 * has failed, for domain transition preference should be used with
 	 * destination domain defined.
 	 */
@@ -810,7 +810,7 @@ force_reset_domain:
 		snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "<%s>",
 			 candidate->name);
 		/*
-		 * Make do_execve() fail if domain transition across namespaces
+		 * Make execve() fail if domain transition across namespaces
 		 * has failed.
 		 */
 		reject_on_transition_failure = true;
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index f9adddc42ac8..1f3cd432d830 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -93,7 +93,7 @@ static int tomoyo_bprm_check_security(struct linux_binprm *bprm)
 	struct tomoyo_task *s = tomoyo_task(current);
 
 	/*
-	 * Execute permission is checked against pathname passed to do_execve()
+	 * Execute permission is checked against pathname passed to execve()
 	 * using current domain.
 	 */
 	if (!s->old_domain_info) {
@@ -307,7 +307,7 @@ static int tomoyo_file_fcntl(struct file *file, unsigned int cmd,
  */
 static int tomoyo_file_open(struct file *f)
 {
-	/* Don't check read permission here if called from do_execve(). */
+	/* Don't check read permission here if called from execve(). */
 	if (current->in_execve)
 		return 0;
 	return tomoyo_check_open_permission(tomoyo_domain(), &f->f_path,
-- 
cgit v1.2.3


From f1d9b23cabc61e58509164c3c3132556476491d2 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Mon, 13 Jul 2020 15:51:59 -0400
Subject: audit: purge audit_log_string from the intra-kernel audit API

audit_log_string() was inteded to be an internal audit function and
since there are only two internal uses, remove them.  Purge all external
uses of it by restructuring code to use an existing audit_log_format()
or using audit_log_format().

Please see the upstream issue
https://github.com/linux-audit/audit-kernel/issues/84

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/audit.h     |  5 -----
 kernel/audit.c            |  4 ++--
 security/apparmor/audit.c | 10 ++++------
 security/apparmor/file.c  | 25 +++++++------------------
 security/apparmor/ipc.c   | 46 +++++++++++++++++++++++-----------------------
 security/apparmor/net.c   | 14 ++++++++------
 security/lsm_audit.c      |  4 ++--
 7 files changed, 46 insertions(+), 62 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 523f77494847..b3d859831a31 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -694,9 +694,4 @@ static inline bool audit_loginuid_set(struct task_struct *tsk)
 	return uid_valid(audit_get_loginuid(tsk));
 }
 
-static inline void audit_log_string(struct audit_buffer *ab, const char *buf)
-{
-	audit_log_n_string(ab, buf, strlen(buf));
-}
-
 #endif
diff --git a/kernel/audit.c b/kernel/audit.c
index 8c201f414226..a2f3e34aa724 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -2080,13 +2080,13 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
 	/* We will allow 11 spaces for ' (deleted)' to be appended */
 	pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
 	if (!pathname) {
-		audit_log_string(ab, "<no_memory>");
+		audit_log_format(ab, "\"<no_memory>\"");
 		return;
 	}
 	p = d_path(path, pathname, PATH_MAX+11);
 	if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
 		/* FIXME: can we save some information here? */
-		audit_log_string(ab, "<too_long>");
+		audit_log_format(ab, "\"<too_long>\"");
 	} else
 		audit_log_untrustedstring(ab, p);
 	kfree(pathname);
diff --git a/security/apparmor/audit.c b/security/apparmor/audit.c
index 597732503815..f7e97c7e80f3 100644
--- a/security/apparmor/audit.c
+++ b/security/apparmor/audit.c
@@ -57,18 +57,16 @@ static void audit_pre(struct audit_buffer *ab, void *ca)
 	struct common_audit_data *sa = ca;
 
 	if (aa_g_audit_header) {
-		audit_log_format(ab, "apparmor=");
-		audit_log_string(ab, aa_audit_type[aad(sa)->type]);
+		audit_log_format(ab, "apparmor=\"%s\"",
+				 aa_audit_type[aad(sa)->type]);
 	}
 
 	if (aad(sa)->op) {
-		audit_log_format(ab, " operation=");
-		audit_log_string(ab, aad(sa)->op);
+		audit_log_format(ab, " operation=\"%s\"", aad(sa)->op);
 	}
 
 	if (aad(sa)->info) {
-		audit_log_format(ab, " info=");
-		audit_log_string(ab, aad(sa)->info);
+		audit_log_format(ab, " info=\"%s\"", aad(sa)->info);
 		if (aad(sa)->error)
 			audit_log_format(ab, " error=%d", aad(sa)->error);
 	}
diff --git a/security/apparmor/file.c b/security/apparmor/file.c
index 9a2d14b7c9f8..92acf9a49405 100644
--- a/security/apparmor/file.c
+++ b/security/apparmor/file.c
@@ -34,20 +34,6 @@ static u32 map_mask_to_chr_mask(u32 mask)
 	return m;
 }
 
-/**
- * audit_file_mask - convert mask to permission string
- * @buffer: buffer to write string to (NOT NULL)
- * @mask: permission mask to convert
- */
-static void audit_file_mask(struct audit_buffer *ab, u32 mask)
-{
-	char str[10];
-
-	aa_perm_mask_to_str(str, sizeof(str), aa_file_perm_chrs,
-			    map_mask_to_chr_mask(mask));
-	audit_log_string(ab, str);
-}
-
 /**
  * file_audit_cb - call back for file specific audit fields
  * @ab: audit_buffer  (NOT NULL)
@@ -57,14 +43,17 @@ static void file_audit_cb(struct audit_buffer *ab, void *va)
 {
 	struct common_audit_data *sa = va;
 	kuid_t fsuid = current_fsuid();
+	char str[10];
 
 	if (aad(sa)->request & AA_AUDIT_FILE_MASK) {
-		audit_log_format(ab, " requested_mask=");
-		audit_file_mask(ab, aad(sa)->request);
+		aa_perm_mask_to_str(str, sizeof(str), aa_file_perm_chrs,
+				    map_mask_to_chr_mask(aad(sa)->request));
+		audit_log_format(ab, " requested_mask=\"%s\"", str);
 	}
 	if (aad(sa)->denied & AA_AUDIT_FILE_MASK) {
-		audit_log_format(ab, " denied_mask=");
-		audit_file_mask(ab, aad(sa)->denied);
+		aa_perm_mask_to_str(str, sizeof(str), aa_file_perm_chrs,
+				    map_mask_to_chr_mask(aad(sa)->denied));
+		audit_log_format(ab, " denied_mask=\"%s\"", str);
 	}
 	if (aad(sa)->request & AA_AUDIT_FILE_MASK) {
 		audit_log_format(ab, " fsuid=%d",
diff --git a/security/apparmor/ipc.c b/security/apparmor/ipc.c
index 4ecedffbdd33..fe36d112aad9 100644
--- a/security/apparmor/ipc.c
+++ b/security/apparmor/ipc.c
@@ -20,25 +20,23 @@
 
 /**
  * audit_ptrace_mask - convert mask to permission string
- * @buffer: buffer to write string to (NOT NULL)
  * @mask: permission mask to convert
+ *
+ * Returns: pointer to static string
  */
-static void audit_ptrace_mask(struct audit_buffer *ab, u32 mask)
+static const char *audit_ptrace_mask(u32 mask)
 {
 	switch (mask) {
 	case MAY_READ:
-		audit_log_string(ab, "read");
-		break;
+		return "read";
 	case MAY_WRITE:
-		audit_log_string(ab, "trace");
-		break;
+		return "trace";
 	case AA_MAY_BE_READ:
-		audit_log_string(ab, "readby");
-		break;
+		return "readby";
 	case AA_MAY_BE_TRACED:
-		audit_log_string(ab, "tracedby");
-		break;
+		return "tracedby";
 	}
+	return "";
 }
 
 /* call back to audit ptrace fields */
@@ -47,12 +45,12 @@ static void audit_ptrace_cb(struct audit_buffer *ab, void *va)
 	struct common_audit_data *sa = va;
 
 	if (aad(sa)->request & AA_PTRACE_PERM_MASK) {
-		audit_log_format(ab, " requested_mask=");
-		audit_ptrace_mask(ab, aad(sa)->request);
+		audit_log_format(ab, " requested_mask=\"%s\"",
+				 audit_ptrace_mask(aad(sa)->request));
 
 		if (aad(sa)->denied & AA_PTRACE_PERM_MASK) {
-			audit_log_format(ab, " denied_mask=");
-			audit_ptrace_mask(ab, aad(sa)->denied);
+			audit_log_format(ab, " denied_mask=\"%s\"",
+					 audit_ptrace_mask(aad(sa)->denied));
 		}
 	}
 	audit_log_format(ab, " peer=");
@@ -142,16 +140,18 @@ static inline int map_signal_num(int sig)
 }
 
 /**
- * audit_file_mask - convert mask to permission string
- * @buffer: buffer to write string to (NOT NULL)
+ * audit_signal_mask - convert mask to permission string
  * @mask: permission mask to convert
+ *
+ * Returns: pointer to static string
  */
-static void audit_signal_mask(struct audit_buffer *ab, u32 mask)
+static const char *audit_signal_mask(u32 mask)
 {
 	if (mask & MAY_READ)
-		audit_log_string(ab, "receive");
+		return "receive";
 	if (mask & MAY_WRITE)
-		audit_log_string(ab, "send");
+		return "send";
+	return "";
 }
 
 /**
@@ -164,11 +164,11 @@ static void audit_signal_cb(struct audit_buffer *ab, void *va)
 	struct common_audit_data *sa = va;
 
 	if (aad(sa)->request & AA_SIGNAL_PERM_MASK) {
-		audit_log_format(ab, " requested_mask=");
-		audit_signal_mask(ab, aad(sa)->request);
+		audit_log_format(ab, " requested_mask=\"%s\"",
+				 audit_signal_mask(aad(sa)->request));
 		if (aad(sa)->denied & AA_SIGNAL_PERM_MASK) {
-			audit_log_format(ab, " denied_mask=");
-			audit_signal_mask(ab, aad(sa)->denied);
+			audit_log_format(ab, " denied_mask=\"%s\"",
+					 audit_signal_mask(aad(sa)->denied));
 		}
 	}
 	if (aad(sa)->signal == SIGUNKNOWN)
diff --git a/security/apparmor/net.c b/security/apparmor/net.c
index d8afc39f663a..fa0e85568450 100644
--- a/security/apparmor/net.c
+++ b/security/apparmor/net.c
@@ -72,16 +72,18 @@ void audit_net_cb(struct audit_buffer *ab, void *va)
 {
 	struct common_audit_data *sa = va;
 
-	audit_log_format(ab, " family=");
 	if (address_family_names[sa->u.net->family])
-		audit_log_string(ab, address_family_names[sa->u.net->family]);
+		audit_log_format(ab, " family=\"%s\"",
+				 address_family_names[sa->u.net->family]);
 	else
-		audit_log_format(ab, "\"unknown(%d)\"", sa->u.net->family);
-	audit_log_format(ab, " sock_type=");
+		audit_log_format(ab, " family=\"unknown(%d)\"",
+				 sa->u.net->family);
 	if (sock_type_names[aad(sa)->net.type])
-		audit_log_string(ab, sock_type_names[aad(sa)->net.type]);
+		audit_log_format(ab, " sock_type=\"%s\"",
+				 sock_type_names[aad(sa)->net.type]);
 	else
-		audit_log_format(ab, "\"unknown(%d)\"", aad(sa)->net.type);
+		audit_log_format(ab, " sock_type=\"unknown(%d)\"",
+				 aad(sa)->net.type);
 	audit_log_format(ab, " protocol=%d", aad(sa)->net.protocol);
 
 	if (aad(sa)->request & NET_PERMS_MASK) {
diff --git a/security/lsm_audit.c b/security/lsm_audit.c
index 7c555621c2bd..53d0d183db8f 100644
--- a/security/lsm_audit.c
+++ b/security/lsm_audit.c
@@ -432,8 +432,8 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 				 a->u.ibendport->port);
 		break;
 	case LSM_AUDIT_DATA_LOCKDOWN:
-		audit_log_format(ab, " lockdown_reason=");
-		audit_log_string(ab, lockdown_reasons[a->u.reason]);
+		audit_log_format(ab, " lockdown_reason=\"%s\"",
+				 lockdown_reasons[a->u.reason]);
 		break;
 	} /* switch (a->type) */
 }
-- 
cgit v1.2.3


From b43870c74f3fdf0cd06bf5f1b7a5ed70a2cd4ed2 Mon Sep 17 00:00:00 2001
From: Max Englander <max.englander@gmail.com>
Date: Sat, 4 Jul 2020 15:15:28 +0000
Subject: audit: report audit wait metric in audit status reply

In environments where the preservation of audit events and predictable
usage of system memory are prioritized, admins may use a combination of
--backlog_wait_time and -b options at the risk of degraded performance
resulting from backlog waiting. In some cases, this risk may be
preferred to lost events or unbounded memory usage. Ideally, this risk
can be mitigated by making adjustments when backlog waiting is detected.

However, detection can be difficult using the currently available
metrics. For example, an admin attempting to debug degraded performance
may falsely believe a full backlog indicates backlog waiting. It may
turn out the backlog frequently fills up but drains quickly.

To make it easier to reliably track degraded performance to backlog
waiting, this patch makes the following changes:

Add a new field backlog_wait_time_total to the audit status reply.
Initialize this field to zero. Add to this field the total time spent
by the current task on scheduled timeouts while the backlog limit is
exceeded. Reset field to zero upon request via AUDIT_SET.

Tested on Ubuntu 18.04 using complementary changes to the
audit-userspace and audit-testsuite:
- https://github.com/linux-audit/audit-userspace/pull/134
- https://github.com/linux-audit/audit-testsuite/pull/97

Signed-off-by: Max Englander <max.englander@gmail.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/uapi/linux/audit.h | 18 +++++++++++-------
 kernel/audit.c             | 35 +++++++++++++++++++++++++----------
 2 files changed, 36 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 9b6a973f4cc3..cd2d8279a5e4 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -333,14 +333,15 @@ enum {
 };
 
 /* Status symbols */
-				/* Mask values */
-#define AUDIT_STATUS_ENABLED		0x0001
-#define AUDIT_STATUS_FAILURE		0x0002
-#define AUDIT_STATUS_PID		0x0004
+						/* Mask values */
+#define AUDIT_STATUS_ENABLED			0x0001
+#define AUDIT_STATUS_FAILURE			0x0002
+#define AUDIT_STATUS_PID			0x0004
 #define AUDIT_STATUS_RATE_LIMIT		0x0008
-#define AUDIT_STATUS_BACKLOG_LIMIT	0x0010
-#define AUDIT_STATUS_BACKLOG_WAIT_TIME	0x0020
-#define AUDIT_STATUS_LOST		0x0040
+#define AUDIT_STATUS_BACKLOG_LIMIT		0x0010
+#define AUDIT_STATUS_BACKLOG_WAIT_TIME		0x0020
+#define AUDIT_STATUS_LOST			0x0040
+#define AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL	0x0080
 
 #define AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT	0x00000001
 #define AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME	0x00000002
@@ -467,6 +468,9 @@ struct audit_status {
 		__u32	feature_bitmap;	/* bitmap of kernel audit features */
 	};
 	__u32		backlog_wait_time;/* message queue wait timeout */
+	__u32           backlog_wait_time_actual;/* time spent waiting while
+						  * message limit exceeded
+						  */
 };
 
 struct audit_features {
diff --git a/kernel/audit.c b/kernel/audit.c
index a2f3e34aa724..d72663ac248c 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -136,6 +136,11 @@ u32		audit_sig_sid = 0;
 */
 static atomic_t	audit_lost = ATOMIC_INIT(0);
 
+/* Monotonically increasing sum of time the kernel has spent
+ * waiting while the backlog limit is exceeded.
+ */
+static atomic_t audit_backlog_wait_time_actual = ATOMIC_INIT(0);
+
 /* Hash for inode-based rules */
 struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
 
@@ -1201,17 +1206,18 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	case AUDIT_GET: {
 		struct audit_status	s;
 		memset(&s, 0, sizeof(s));
-		s.enabled		= audit_enabled;
-		s.failure		= audit_failure;
+		s.enabled		   = audit_enabled;
+		s.failure		   = audit_failure;
 		/* NOTE: use pid_vnr() so the PID is relative to the current
 		 *       namespace */
-		s.pid			= auditd_pid_vnr();
-		s.rate_limit		= audit_rate_limit;
-		s.backlog_limit		= audit_backlog_limit;
-		s.lost			= atomic_read(&audit_lost);
-		s.backlog		= skb_queue_len(&audit_queue);
-		s.feature_bitmap	= AUDIT_FEATURE_BITMAP_ALL;
-		s.backlog_wait_time	= audit_backlog_wait_time;
+		s.pid			   = auditd_pid_vnr();
+		s.rate_limit		   = audit_rate_limit;
+		s.backlog_limit		   = audit_backlog_limit;
+		s.lost			   = atomic_read(&audit_lost);
+		s.backlog		   = skb_queue_len(&audit_queue);
+		s.feature_bitmap	   = AUDIT_FEATURE_BITMAP_ALL;
+		s.backlog_wait_time	   = audit_backlog_wait_time;
+		s.backlog_wait_time_actual = atomic_read(&audit_backlog_wait_time_actual);
 		audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
 		break;
 	}
@@ -1315,6 +1321,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			audit_log_config_change("lost", 0, lost, 1);
 			return lost;
 		}
+		if (s.mask == AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) {
+			u32 actual = atomic_xchg(&audit_backlog_wait_time_actual, 0);
+
+			audit_log_config_change("backlog_wait_time_actual", 0, actual, 1);
+			return actual;
+		}
 		break;
 	}
 	case AUDIT_GET_FEATURE:
@@ -1826,12 +1838,15 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
 			/* sleep if we are allowed and we haven't exhausted our
 			 * backlog wait limit */
 			if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) {
+				long rtime = stime;
+
 				DECLARE_WAITQUEUE(wait, current);
 
 				add_wait_queue_exclusive(&audit_backlog_wait,
 							 &wait);
 				set_current_state(TASK_UNINTERRUPTIBLE);
-				stime = schedule_timeout(stime);
+				stime = schedule_timeout(rtime);
+				atomic_add(rtime - stime, &audit_backlog_wait_time_actual);
 				remove_wait_queue(&audit_backlog_wait, &wait);
 			} else {
 				if (audit_rate_check() && printk_ratelimit())
-- 
cgit v1.2.3


From 343ead287dde0064c430ee1db477726f52e0269c Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki <jakub@cloudflare.com>
Date: Tue, 21 Jul 2020 12:07:16 +0200
Subject: bpf, netns: Fix build without CONFIG_INET

When CONFIG_NET is set but CONFIG_INET isn't, build fails with:

  ld: kernel/bpf/net_namespace.o: in function `netns_bpf_attach_type_unneed':
  kernel/bpf/net_namespace.c:32: undefined reference to `bpf_sk_lookup_enabled'
  ld: kernel/bpf/net_namespace.o: in function `netns_bpf_attach_type_need':
  kernel/bpf/net_namespace.c:43: undefined reference to `bpf_sk_lookup_enabled'

This is because without CONFIG_INET bpf_sk_lookup_enabled symbol is not
available. Wrap references to bpf_sk_lookup_enabled with preprocessor
conditionals.

Fixes: 1559b4aa1db4 ("inet: Run SK_LOOKUP BPF program on socket lookup")
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Randy Dunlap <rdunlap@infradead.org> # build-tested
Link: https://lore.kernel.org/bpf/20200721100716.720477-1-jakub@cloudflare.com
---
 kernel/bpf/net_namespace.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c
index 4e1bcaa2c3cb..71405edd667c 100644
--- a/kernel/bpf/net_namespace.c
+++ b/kernel/bpf/net_namespace.c
@@ -28,9 +28,11 @@ DEFINE_MUTEX(netns_bpf_mutex);
 static void netns_bpf_attach_type_unneed(enum netns_bpf_attach_type type)
 {
 	switch (type) {
+#ifdef CONFIG_INET
 	case NETNS_BPF_SK_LOOKUP:
 		static_branch_dec(&bpf_sk_lookup_enabled);
 		break;
+#endif
 	default:
 		break;
 	}
@@ -39,9 +41,11 @@ static void netns_bpf_attach_type_unneed(enum netns_bpf_attach_type type)
 static void netns_bpf_attach_type_need(enum netns_bpf_attach_type type)
 {
 	switch (type) {
+#ifdef CONFIG_INET
 	case NETNS_BPF_SK_LOOKUP:
 		static_branch_inc(&bpf_sk_lookup_enabled);
 		break;
+#endif
 	default:
 		break;
 	}
-- 
cgit v1.2.3


From c576b9c77bea9a8ad11ddc277af3edbf15733ddd Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Sun, 19 Jul 2020 17:52:41 +0200
Subject: bpf: cpumap: Fix possible rcpu kthread hung

Fix the following cpumap kthread hung. The issue is currently occurring
when __cpu_map_load_bpf_program fails (e.g if the bpf prog has not
BPF_XDP_CPUMAP as expected_attach_type)

$./test_progs -n 101
101/1 cpumap_with_progs:OK
101 xdp_cpumap_attach:OK
Summary: 1/1 PASSED, 0 SKIPPED, 0 FAILED
[  369.996478] INFO: task cpumap/0/map:7:205 blocked for more than 122 seconds.
[  369.998463]       Not tainted 5.8.0-rc4-01472-ge57892f50a07 #212
[  370.000102] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  370.001918] cpumap/0/map:7  D    0   205      2 0x00004000
[  370.003228] Call Trace:
[  370.003930]  __schedule+0x5c7/0xf50
[  370.004901]  ? io_schedule_timeout+0xb0/0xb0
[  370.005934]  ? static_obj+0x31/0x80
[  370.006788]  ? mark_held_locks+0x24/0x90
[  370.007752]  ? cpu_map_bpf_prog_run_xdp+0x6c0/0x6c0
[  370.008930]  schedule+0x6f/0x160
[  370.009728]  schedule_preempt_disabled+0x14/0x20
[  370.010829]  kthread+0x17b/0x240
[  370.011433]  ? kthread_create_worker_on_cpu+0xd0/0xd0
[  370.011944]  ret_from_fork+0x1f/0x30
[  370.012348]
               Showing all locks held in the system:
[  370.013025] 1 lock held by khungtaskd/33:
[  370.013432]  #0: ffffffff82b24720 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x28/0x1c3

[  370.014461] =============================================

Fixes: 9216477449f3 ("bpf: cpumap: Add the possibility to attach an eBPF program to cpumap")
Reported-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Jakub Sitnicki <jakub@cloudflare.com>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/e54f2aabf959f298939e5507b09c48f8c2e380be.1595170625.git.lorenzo@kernel.org
---
 kernel/bpf/cpumap.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 4c95d0615ca2..f1c46529929b 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -453,24 +453,27 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id)
 	rcpu->map_id = map_id;
 	rcpu->value.qsize  = value->qsize;
 
+	if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd))
+		goto free_ptr_ring;
+
 	/* Setup kthread */
 	rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
 					       "cpumap/%d/map:%d", cpu, map_id);
 	if (IS_ERR(rcpu->kthread))
-		goto free_ptr_ring;
+		goto free_prog;
 
 	get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
 	get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
 
-	if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd))
-		goto free_ptr_ring;
-
 	/* Make sure kthread runs on a single CPU */
 	kthread_bind(rcpu->kthread, cpu);
 	wake_up_process(rcpu->kthread);
 
 	return rcpu;
 
+free_prog:
+	if (rcpu->prog)
+		bpf_prog_put(rcpu->prog);
 free_ptr_ring:
 	ptr_ring_cleanup(rcpu->queue, NULL);
 free_queue:
-- 
cgit v1.2.3


From bc4f0548f683a3d53359cef15f088d2d5bb4bc39 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 20 Jul 2020 09:33:58 -0700
Subject: bpf: Compute bpf_skc_to_*() helper socket btf ids at build time

Currently, socket types (struct tcp_sock, udp_sock, etc.)
used by bpf_skc_to_*() helpers are computed when vmlinux_btf
is first built in the kernel.

Commit 5a2798ab32ba
("bpf: Add BTF_ID_LIST/BTF_ID/BTF_ID_UNUSED macros")
implemented a mechanism to compute btf_ids at kernel build
time which can simplify kernel implementation and reduce
runtime overhead by removing in-kernel btf_id calculation.
This patch did exactly this, removing in-kernel btf_id
computation and utilizing build-time btf_id computation.

If CONFIG_DEBUG_INFO_BTF is not defined, BTF_ID_LIST will
define an array with size of 5, which is not enough for
btf_sock_ids. So define its own static array if
CONFIG_DEBUG_INFO_BTF is not defined.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200720163358.1393023-1-yhs@fb.com
---
 include/linux/bpf.h |  4 ----
 kernel/bpf/btf.c    |  1 -
 net/core/filter.c   | 49 ++++++++++++++++++-------------------------------
 3 files changed, 18 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index adb16bdc5f0a..1df1c0fd3f28 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1541,7 +1541,6 @@ static inline bool bpf_map_is_dev_bound(struct bpf_map *map)
 
 struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr);
 void bpf_map_offload_map_free(struct bpf_map *map);
-void init_btf_sock_ids(struct btf *btf);
 #else
 static inline int bpf_prog_offload_init(struct bpf_prog *prog,
 					union bpf_attr *attr)
@@ -1567,9 +1566,6 @@ static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
 static inline void bpf_map_offload_map_free(struct bpf_map *map)
 {
 }
-static inline void init_btf_sock_ids(struct btf *btf)
-{
-}
 #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
 
 #if defined(CONFIG_BPF_STREAM_PARSER)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 03d6d43bb1d6..315cde73421b 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3672,7 +3672,6 @@ struct btf *btf_parse_vmlinux(void)
 		goto errout;
 
 	bpf_struct_ops_init(btf, log);
-	init_btf_sock_ids(btf);
 
 	btf_verifier_env_free(env);
 	refcount_set(&btf->refcnt, 1);
diff --git a/net/core/filter.c b/net/core/filter.c
index 2bd129b5ae74..5a65fb4b95ff 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -9426,19 +9426,19 @@ void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog)
  * sock_common as the first argument in its memory layout.
  */
 #define BTF_SOCK_TYPE_xxx \
-	BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET, "inet_sock")			\
-	BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_CONN, "inet_connection_sock")	\
-	BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_REQ, "inet_request_sock")	\
-	BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_TW, "inet_timewait_sock")	\
-	BTF_SOCK_TYPE(BTF_SOCK_TYPE_REQ, "request_sock")		\
-	BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK, "sock")			\
-	BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK_COMMON, "sock_common")		\
-	BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP, "tcp_sock")			\
-	BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_REQ, "tcp_request_sock")	\
-	BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_TW, "tcp_timewait_sock")	\
-	BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, "tcp6_sock")			\
-	BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, "udp_sock")			\
-	BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, "udp6_sock")
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET, inet_sock)			\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_CONN, inet_connection_sock)	\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_REQ, inet_request_sock)	\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_TW, inet_timewait_sock)	\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_REQ, request_sock)			\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK, sock)				\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK_COMMON, sock_common)		\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP, tcp_sock)			\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_REQ, tcp_request_sock)		\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_TW, tcp_timewait_sock)		\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, tcp6_sock)			\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, udp_sock)			\
+	BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock)
 
 enum {
 #define BTF_SOCK_TYPE(name, str) name,
@@ -9447,26 +9447,13 @@ BTF_SOCK_TYPE_xxx
 MAX_BTF_SOCK_TYPE,
 };
 
-static int btf_sock_ids[MAX_BTF_SOCK_TYPE];
-
-#ifdef CONFIG_BPF_SYSCALL
-static const char *bpf_sock_types[] = {
-#define BTF_SOCK_TYPE(name, str) str,
+#ifdef CONFIG_DEBUG_INFO_BTF
+BTF_ID_LIST(btf_sock_ids)
+#define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type)
 BTF_SOCK_TYPE_xxx
 #undef BTF_SOCK_TYPE
-};
-
-void init_btf_sock_ids(struct btf *btf)
-{
-	int i, btf_id;
-
-	for (i = 0; i < MAX_BTF_SOCK_TYPE; i++) {
-		btf_id = btf_find_by_name_kind(btf, bpf_sock_types[i],
-					       BTF_KIND_STRUCT);
-		if (btf_id > 0)
-			btf_sock_ids[i] = btf_id;
-	}
-}
+#else
+static u32 btf_sock_ids[MAX_BTF_SOCK_TYPE];
 #endif
 
 static bool check_arg_btf_id(u32 btf_id, u32 arg)
-- 
cgit v1.2.3


From 951cf368bcb11d6f817709660cf5cd914072c36f Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 20 Jul 2020 09:34:03 -0700
Subject: bpf: net: Use precomputed btf_id for bpf iterators

One additional field btf_id is added to struct
bpf_ctx_arg_aux to store the precomputed btf_ids.
The btf_id is computed at build time with
BTF_ID_LIST or BTF_ID_LIST_GLOBAL macro definitions.
All existing bpf iterators are changed to used
pre-compute btf_ids.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200720163403.1393551-1-yhs@fb.com
---
 include/linux/bpf.h      |  1 +
 kernel/bpf/btf.c         |  5 +++--
 kernel/bpf/map_iter.c    |  7 ++++++-
 kernel/bpf/task_iter.c   | 12 ++++++++++--
 net/ipv4/tcp_ipv4.c      |  4 +++-
 net/ipv4/udp.c           |  4 +++-
 net/ipv6/route.c         |  7 ++++++-
 net/netlink/af_netlink.c |  7 ++++++-
 8 files changed, 38 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1df1c0fd3f28..bae557ff2da8 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -668,6 +668,7 @@ struct bpf_jit_poke_descriptor {
 struct bpf_ctx_arg_aux {
 	u32 offset;
 	enum bpf_reg_type reg_type;
+	u32 btf_id;
 };
 
 struct bpf_prog_aux {
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 315cde73421b..ee36b7f60936 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3817,16 +3817,17 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		return true;
 
 	/* this is a pointer to another type */
-	info->reg_type = PTR_TO_BTF_ID;
 	for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
 		const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
 
 		if (ctx_arg_info->offset == off) {
 			info->reg_type = ctx_arg_info->reg_type;
-			break;
+			info->btf_id = ctx_arg_info->btf_id;
+			return true;
 		}
 	}
 
+	info->reg_type = PTR_TO_BTF_ID;
 	if (tgt_prog) {
 		ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg);
 		if (ret > 0) {
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index c69071e334bf..8a7af11b411f 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -4,6 +4,7 @@
 #include <linux/fs.h>
 #include <linux/filter.h>
 #include <linux/kernel.h>
+#include <linux/btf_ids.h>
 
 struct bpf_iter_seq_map_info {
 	u32 mid;
@@ -81,7 +82,10 @@ static const struct seq_operations bpf_map_seq_ops = {
 	.show	= bpf_map_seq_show,
 };
 
-static const struct bpf_iter_reg bpf_map_reg_info = {
+BTF_ID_LIST(btf_bpf_map_id)
+BTF_ID(struct, bpf_map)
+
+static struct bpf_iter_reg bpf_map_reg_info = {
 	.target			= "bpf_map",
 	.seq_ops		= &bpf_map_seq_ops,
 	.init_seq_private	= NULL,
@@ -96,6 +100,7 @@ static const struct bpf_iter_reg bpf_map_reg_info = {
 
 static int __init bpf_map_iter_init(void)
 {
+	bpf_map_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_map_id;
 	return bpf_iter_reg_target(&bpf_map_reg_info);
 }
 
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 4dbf2b6035f8..2feecf095609 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -7,6 +7,7 @@
 #include <linux/fs.h>
 #include <linux/fdtable.h>
 #include <linux/filter.h>
+#include <linux/btf_ids.h>
 
 struct bpf_iter_seq_task_common {
 	struct pid_namespace *ns;
@@ -312,7 +313,11 @@ static const struct seq_operations task_file_seq_ops = {
 	.show	= task_file_seq_show,
 };
 
-static const struct bpf_iter_reg task_reg_info = {
+BTF_ID_LIST(btf_task_file_ids)
+BTF_ID(struct, task_struct)
+BTF_ID(struct, file)
+
+static struct bpf_iter_reg task_reg_info = {
 	.target			= "task",
 	.seq_ops		= &task_seq_ops,
 	.init_seq_private	= init_seq_pidns,
@@ -325,7 +330,7 @@ static const struct bpf_iter_reg task_reg_info = {
 	},
 };
 
-static const struct bpf_iter_reg task_file_reg_info = {
+static struct bpf_iter_reg task_file_reg_info = {
 	.target			= "task_file",
 	.seq_ops		= &task_file_seq_ops,
 	.init_seq_private	= init_seq_pidns,
@@ -344,10 +349,13 @@ static int __init task_iter_init(void)
 {
 	int ret;
 
+	task_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
 	ret = bpf_iter_reg_target(&task_reg_info);
 	if (ret)
 		return ret;
 
+	task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
+	task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1];
 	return bpf_iter_reg_target(&task_file_reg_info);
 }
 late_initcall(task_iter_init);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 116c11a0aaed..a7f1b41482f8 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -76,6 +76,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/inetdevice.h>
+#include <linux/btf_ids.h>
 
 #include <crypto/hash.h>
 #include <linux/scatterlist.h>
@@ -2954,7 +2955,7 @@ static void bpf_iter_fini_tcp(void *priv_data)
 	bpf_iter_fini_seq_net(priv_data);
 }
 
-static const struct bpf_iter_reg tcp_reg_info = {
+static struct bpf_iter_reg tcp_reg_info = {
 	.target			= "tcp",
 	.seq_ops		= &bpf_iter_tcp_seq_ops,
 	.init_seq_private	= bpf_iter_init_tcp,
@@ -2969,6 +2970,7 @@ static const struct bpf_iter_reg tcp_reg_info = {
 
 static void __init bpf_iter_register(void)
 {
+	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
 	if (bpf_iter_reg_target(&tcp_reg_info))
 		pr_warn("Warning: could not register bpf iterator tcp\n");
 }
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index b738c63d7a77..b5231ab350e0 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -106,6 +106,7 @@
 #include <net/xfrm.h>
 #include <trace/events/udp.h>
 #include <linux/static_key.h>
+#include <linux/btf_ids.h>
 #include <trace/events/skb.h>
 #include <net/busy_poll.h>
 #include "udp_impl.h"
@@ -3232,7 +3233,7 @@ static void bpf_iter_fini_udp(void *priv_data)
 	bpf_iter_fini_seq_net(priv_data);
 }
 
-static const struct bpf_iter_reg udp_reg_info = {
+static struct bpf_iter_reg udp_reg_info = {
 	.target			= "udp",
 	.seq_ops		= &bpf_iter_udp_seq_ops,
 	.init_seq_private	= bpf_iter_init_udp,
@@ -3247,6 +3248,7 @@ static const struct bpf_iter_reg udp_reg_info = {
 
 static void __init bpf_iter_register(void)
 {
+	udp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UDP];
 	if (bpf_iter_reg_target(&udp_reg_info))
 		pr_warn("Warning: could not register bpf iterator udp\n");
 }
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 427b81cbc164..33f5efbad0a9 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -61,6 +61,7 @@
 #include <net/l3mdev.h>
 #include <net/ip.h>
 #include <linux/uaccess.h>
+#include <linux/btf_ids.h>
 
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
@@ -6423,7 +6424,10 @@ void __init ip6_route_init_special_entries(void)
 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
 DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt)
 
-static const struct bpf_iter_reg ipv6_route_reg_info = {
+BTF_ID_LIST(btf_fib6_info_id)
+BTF_ID(struct, fib6_info)
+
+static struct bpf_iter_reg ipv6_route_reg_info = {
 	.target			= "ipv6_route",
 	.seq_ops		= &ipv6_route_seq_ops,
 	.init_seq_private	= bpf_iter_init_seq_net,
@@ -6438,6 +6442,7 @@ static const struct bpf_iter_reg ipv6_route_reg_info = {
 
 static int __init bpf_iter_register(void)
 {
+	ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id;
 	return bpf_iter_reg_target(&ipv6_route_reg_info);
 }
 
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 4f2c3b14ddbf..3cd58f0c2de4 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -60,6 +60,7 @@
 #include <linux/genetlink.h>
 #include <linux/net_namespace.h>
 #include <linux/nospec.h>
+#include <linux/btf_ids.h>
 
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
@@ -2803,7 +2804,10 @@ static const struct rhashtable_params netlink_rhashtable_params = {
 };
 
 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
-static const struct bpf_iter_reg netlink_reg_info = {
+BTF_ID_LIST(btf_netlink_sock_id)
+BTF_ID(struct, netlink_sock)
+
+static struct bpf_iter_reg netlink_reg_info = {
 	.target			= "netlink",
 	.seq_ops		= &netlink_seq_ops,
 	.init_seq_private	= bpf_iter_init_seq_net,
@@ -2818,6 +2822,7 @@ static const struct bpf_iter_reg netlink_reg_info = {
 
 static int __init bpf_iter_register(void)
 {
+	netlink_reg_info.ctx_arg_info[0].btf_id = *btf_netlink_sock_id;
 	return bpf_iter_reg_target(&netlink_reg_info);
 }
 #endif
-- 
cgit v1.2.3


From 58877d347b58c9e971112df5eb311c13bb0acb28 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 2 Jul 2020 14:52:11 +0200
Subject: sched: Better document ttwu()

Dave hit the problem fixed by commit:

  b6e13e85829f ("sched/core: Fix ttwu() race")

and failed to understand much of the code involved. Per his request a
few comments to (hopefully) clarify things.

Requested-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200702125211.GQ4800@hirez.programming.kicks-ass.net
---
 include/linux/sched.h |  12 ++--
 kernel/sched/core.c   | 188 +++++++++++++++++++++++++++++++++++++++++++-------
 kernel/sched/sched.h  |  10 +++
 3 files changed, 179 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 12b10ce51a08..5033813fecd5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -154,24 +154,24 @@ struct task_group;
  *
  *   for (;;) {
  *	set_current_state(TASK_UNINTERRUPTIBLE);
- *	if (!need_sleep)
- *		break;
+ *	if (CONDITION)
+ *	   break;
  *
  *	schedule();
  *   }
  *   __set_current_state(TASK_RUNNING);
  *
  * If the caller does not need such serialisation (because, for instance, the
- * condition test and condition change and wakeup are under the same lock) then
+ * CONDITION test and condition change and wakeup are under the same lock) then
  * use __set_current_state().
  *
  * The above is typically ordered against the wakeup, which does:
  *
- *   need_sleep = false;
+ *   CONDITION = 1;
  *   wake_up_state(p, TASK_UNINTERRUPTIBLE);
  *
- * where wake_up_state() executes a full memory barrier before accessing the
- * task state.
+ * where wake_up_state()/try_to_wake_up() executes a full memory barrier before
+ * accessing p->state.
  *
  * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is,
  * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 08d02ce26b71..12db8fbd9c97 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -79,6 +79,100 @@ __read_mostly int scheduler_running;
  */
 int sysctl_sched_rt_runtime = 950000;
 
+
+/*
+ * Serialization rules:
+ *
+ * Lock order:
+ *
+ *   p->pi_lock
+ *     rq->lock
+ *       hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
+ *
+ *  rq1->lock
+ *    rq2->lock  where: rq1 < rq2
+ *
+ * Regular state:
+ *
+ * Normal scheduling state is serialized by rq->lock. __schedule() takes the
+ * local CPU's rq->lock, it optionally removes the task from the runqueue and
+ * always looks at the local rq data structures to find the most elegible task
+ * to run next.
+ *
+ * Task enqueue is also under rq->lock, possibly taken from another CPU.
+ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
+ * the local CPU to avoid bouncing the runqueue state around [ see
+ * ttwu_queue_wakelist() ]
+ *
+ * Task wakeup, specifically wakeups that involve migration, are horribly
+ * complicated to avoid having to take two rq->locks.
+ *
+ * Special state:
+ *
+ * System-calls and anything external will use task_rq_lock() which acquires
+ * both p->pi_lock and rq->lock. As a consequence the state they change is
+ * stable while holding either lock:
+ *
+ *  - sched_setaffinity()/
+ *    set_cpus_allowed_ptr():	p->cpus_ptr, p->nr_cpus_allowed
+ *  - set_user_nice():		p->se.load, p->*prio
+ *  - __sched_setscheduler():	p->sched_class, p->policy, p->*prio,
+ *				p->se.load, p->rt_priority,
+ *				p->dl.dl_{runtime, deadline, period, flags, bw, density}
+ *  - sched_setnuma():		p->numa_preferred_nid
+ *  - sched_move_task()/
+ *    cpu_cgroup_fork():	p->sched_task_group
+ *  - uclamp_update_active()	p->uclamp*
+ *
+ * p->state <- TASK_*:
+ *
+ *   is changed locklessly using set_current_state(), __set_current_state() or
+ *   set_special_state(), see their respective comments, or by
+ *   try_to_wake_up(). This latter uses p->pi_lock to serialize against
+ *   concurrent self.
+ *
+ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
+ *
+ *   is set by activate_task() and cleared by deactivate_task(), under
+ *   rq->lock. Non-zero indicates the task is runnable, the special
+ *   ON_RQ_MIGRATING state is used for migration without holding both
+ *   rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
+ *
+ * p->on_cpu <- { 0, 1 }:
+ *
+ *   is set by prepare_task() and cleared by finish_task() such that it will be
+ *   set before p is scheduled-in and cleared after p is scheduled-out, both
+ *   under rq->lock. Non-zero indicates the task is running on its CPU.
+ *
+ *   [ The astute reader will observe that it is possible for two tasks on one
+ *     CPU to have ->on_cpu = 1 at the same time. ]
+ *
+ * task_cpu(p): is changed by set_task_cpu(), the rules are:
+ *
+ *  - Don't call set_task_cpu() on a blocked task:
+ *
+ *    We don't care what CPU we're not running on, this simplifies hotplug,
+ *    the CPU assignment of blocked tasks isn't required to be valid.
+ *
+ *  - for try_to_wake_up(), called under p->pi_lock:
+ *
+ *    This allows try_to_wake_up() to only take one rq->lock, see its comment.
+ *
+ *  - for migration called under rq->lock:
+ *    [ see task_on_rq_migrating() in task_rq_lock() ]
+ *
+ *    o move_queued_task()
+ *    o detach_task()
+ *
+ *  - for migration called under double_rq_lock():
+ *
+ *    o __migrate_swap_task()
+ *    o push_rt_task() / pull_rt_task()
+ *    o push_dl_task() / pull_dl_task()
+ *    o dl_task_offline_migration()
+ *
+ */
+
 /*
  * __task_rq_lock - lock the rq @p resides on.
  */
@@ -1543,8 +1637,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
 {
 	lockdep_assert_held(&rq->lock);
 
-	WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
-	dequeue_task(rq, p, DEQUEUE_NOCLOCK);
+	deactivate_task(rq, p, DEQUEUE_NOCLOCK);
 	set_task_cpu(p, new_cpu);
 	rq_unlock(rq, rf);
 
@@ -1552,8 +1645,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
 
 	rq_lock(rq, rf);
 	BUG_ON(task_cpu(p) != new_cpu);
-	enqueue_task(rq, p, 0);
-	p->on_rq = TASK_ON_RQ_QUEUED;
+	activate_task(rq, p, 0);
 	check_preempt_curr(rq, p, 0);
 
 	return rq;
@@ -2318,12 +2410,31 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 }
 
 /*
- * Called in case the task @p isn't fully descheduled from its runqueue,
- * in this case we must do a remote wakeup. Its a 'light' wakeup though,
- * since all we need to do is flip p->state to TASK_RUNNING, since
- * the task is still ->on_rq.
+ * Consider @p being inside a wait loop:
+ *
+ *   for (;;) {
+ *      set_current_state(TASK_UNINTERRUPTIBLE);
+ *
+ *      if (CONDITION)
+ *         break;
+ *
+ *      schedule();
+ *   }
+ *   __set_current_state(TASK_RUNNING);
+ *
+ * between set_current_state() and schedule(). In this case @p is still
+ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in
+ * an atomic manner.
+ *
+ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
+ * then schedule() must still happen and p->state can be changed to
+ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
+ * need to do a full wakeup with enqueue.
+ *
+ * Returns: %true when the wakeup is done,
+ *          %false otherwise.
  */
-static int ttwu_remote(struct task_struct *p, int wake_flags)
+static int ttwu_runnable(struct task_struct *p, int wake_flags)
 {
 	struct rq_flags rf;
 	struct rq *rq;
@@ -2464,6 +2575,14 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
 
 	return false;
 }
+
+#else /* !CONFIG_SMP */
+
+static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
+{
+	return false;
+}
+
 #endif /* CONFIG_SMP */
 
 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
@@ -2471,10 +2590,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
 	struct rq *rq = cpu_rq(cpu);
 	struct rq_flags rf;
 
-#if defined(CONFIG_SMP)
 	if (ttwu_queue_wakelist(p, cpu, wake_flags))
 		return;
-#endif
 
 	rq_lock(rq, &rf);
 	update_rq_clock(rq);
@@ -2530,8 +2647,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  * migration. However the means are completely different as there is no lock
  * chain to provide order. Instead we do:
  *
- *   1) smp_store_release(X->on_cpu, 0)
- *   2) smp_cond_load_acquire(!X->on_cpu)
+ *   1) smp_store_release(X->on_cpu, 0)   -- finish_task()
+ *   2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
  *
  * Example:
  *
@@ -2571,15 +2688,33 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  * @state: the mask of task states that can be woken
  * @wake_flags: wake modifier flags (WF_*)
  *
- * If (@state & @p->state) @p->state = TASK_RUNNING.
+ * Conceptually does:
+ *
+ *   If (@state & @p->state) @p->state = TASK_RUNNING.
  *
  * If the task was not queued/runnable, also place it back on a runqueue.
  *
- * Atomic against schedule() which would dequeue a task, also see
- * set_current_state().
+ * This function is atomic against schedule() which would dequeue the task.
+ *
+ * It issues a full memory barrier before accessing @p->state, see the comment
+ * with set_current_state().
+ *
+ * Uses p->pi_lock to serialize against concurrent wake-ups.
  *
- * This function executes a full memory barrier before accessing the task
- * state; see set_current_state().
+ * Relies on p->pi_lock stabilizing:
+ *  - p->sched_class
+ *  - p->cpus_ptr
+ *  - p->sched_task_group
+ * in order to do migration, see its use of select_task_rq()/set_task_cpu().
+ *
+ * Tries really hard to only take one task_rq(p)->lock for performance.
+ * Takes rq->lock in:
+ *  - ttwu_runnable()    -- old rq, unavoidable, see comment there;
+ *  - ttwu_queue()       -- new rq, for enqueue of the task;
+ *  - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
+ *
+ * As a consequence we race really badly with just about everything. See the
+ * many memory barriers and their comments for details.
  *
  * Return: %true if @p->state changes (an actual wakeup was done),
  *	   %false otherwise.
@@ -2595,7 +2730,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 		/*
 		 * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
 		 * == smp_processor_id()'. Together this means we can special
-		 * case the whole 'p->on_rq && ttwu_remote()' case below
+		 * case the whole 'p->on_rq && ttwu_runnable()' case below
 		 * without taking any locks.
 		 *
 		 * In particular:
@@ -2616,8 +2751,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	/*
 	 * If we are going to wake up a thread waiting for CONDITION we
 	 * need to ensure that CONDITION=1 done by the caller can not be
-	 * reordered with p->state check below. This pairs with mb() in
-	 * set_current_state() the waiting thread does.
+	 * reordered with p->state check below. This pairs with smp_store_mb()
+	 * in set_current_state() that the waiting thread does.
 	 */
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	smp_mb__after_spinlock();
@@ -2652,7 +2787,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
 	 */
 	smp_rmb();
-	if (READ_ONCE(p->on_rq) && ttwu_remote(p, wake_flags))
+	if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
 		goto unlock;
 
 	if (p->in_iowait) {
@@ -3222,8 +3357,10 @@ static inline void prepare_task(struct task_struct *next)
 	/*
 	 * Claim the task as running, we do this before switching to it
 	 * such that any running task will have this set.
+	 *
+	 * See the ttwu() WF_ON_CPU case and its ordering comment.
 	 */
-	next->on_cpu = 1;
+	WRITE_ONCE(next->on_cpu, 1);
 #endif
 }
 
@@ -3231,8 +3368,9 @@ static inline void finish_task(struct task_struct *prev)
 {
 #ifdef CONFIG_SMP
 	/*
-	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
-	 * We must ensure this doesn't happen until the switch is completely
+	 * This must be the very last reference to @prev from this CPU. After
+	 * p->on_cpu is cleared, the task can be moved to a different CPU. We
+	 * must ensure this doesn't happen until the switch is completely
 	 * finished.
 	 *
 	 * In particular, the load of prev->state in finish_task_switch() must
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 65b72e0487bf..9f33c77258ea 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1203,6 +1203,16 @@ struct rq_flags {
 #endif
 };
 
+/*
+ * Lockdep annotation that avoids accidental unlocks; it's like a
+ * sticky/continuous lockdep_assert_held().
+ *
+ * This avoids code that has access to 'struct rq *rq' (basically everything in
+ * the scheduler) from accidentally unlocking the rq if they do not also have a
+ * copy of the (on-stack) 'struct rq_flags rf'.
+ *
+ * Also see Documentation/locking/lockdep-design.rst.
+ */
 static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
 {
 	rf->cookie = lockdep_pin_lock(&rq->lock);
-- 
cgit v1.2.3


From 46132e3ac58cb2ee48051ed80bffc0070ad59b2e Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Wed, 1 Jul 2020 14:34:18 -0400
Subject: sched: nohz: stop passing around unused "ticks" parameter.

The "ticks" parameter was added in commit 0f004f5a696a ("sched: Cure more
NO_HZ load average woes") since calc_global_nohz() was called and needed
the "ticks" argument.

But in commit c308b56b5398 ("sched: Fix nohz load accounting -- again!")
it became unused as the function calc_global_nohz() dropped using "ticks".

Fixes: c308b56b5398 ("sched: Fix nohz load accounting -- again!")
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1593628458-32290-1-git-send-email-paul.gortmaker@windriver.com
---
 include/linux/sched/loadavg.h | 2 +-
 kernel/sched/loadavg.c        | 2 +-
 kernel/time/timekeeping.c     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h
index 4859bea47a7b..83ec54b65e79 100644
--- a/include/linux/sched/loadavg.h
+++ b/include/linux/sched/loadavg.h
@@ -43,6 +43,6 @@ extern unsigned long calc_load_n(unsigned long load, unsigned long exp,
 #define LOAD_INT(x) ((x) >> FSHIFT)
 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
 
-extern void calc_global_load(unsigned long ticks);
+extern void calc_global_load(void);
 
 #endif /* _LINUX_SCHED_LOADAVG_H */
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index de22da666ac7..d2a655643a02 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -347,7 +347,7 @@ static inline void calc_global_nohz(void) { }
  *
  * Called from the global timer code.
  */
-void calc_global_load(unsigned long ticks)
+void calc_global_load(void)
 {
 	unsigned long sample_window;
 	long active, delta;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d20d489841c8..63a632f9896c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2193,7 +2193,7 @@ EXPORT_SYMBOL(ktime_get_coarse_ts64);
 void do_timer(unsigned long ticks)
 {
 	jiffies_64 += ticks;
-	calc_global_load(ticks);
+	calc_global_load();
 }
 
 /**
-- 
cgit v1.2.3


From 3edecfef028536cb19a120ec8788bd8a11f93b9e Mon Sep 17 00:00:00 2001
From: Peter Puhov <peter.puhov@linaro.org>
Date: Tue, 14 Jul 2020 08:59:41 -0400
Subject: sched/fair: update_pick_idlest() Select group with lowest group_util
 when idle_cpus are equal

In slow path, when selecting idlest group, if both groups have type
group_has_spare, only idle_cpus count gets compared.
As a result, if multiple tasks are created in a tight loop,
and go back to sleep immediately
(while waiting for all tasks to be created),
they may be scheduled on the same core, because CPU is back to idle
when the new fork happen.

For example:
sudo perf record -e sched:sched_wakeup_new -- \
                                  sysbench threads --threads=4 run
...
    total number of events:              61582
...
sudo perf script
sysbench 129378 [006] 74586.633466: sched:sched_wakeup_new:
                            sysbench:129380 [120] success=1 CPU:007
sysbench 129378 [006] 74586.634718: sched:sched_wakeup_new:
                            sysbench:129381 [120] success=1 CPU:007
sysbench 129378 [006] 74586.635957: sched:sched_wakeup_new:
                            sysbench:129382 [120] success=1 CPU:007
sysbench 129378 [006] 74586.637183: sched:sched_wakeup_new:
                            sysbench:129383 [120] success=1 CPU:007

This may have negative impact on performance for workloads with frequent
creation of multiple threads.

In this patch we are using group_util to select idlest group if both groups
have equal number of idle_cpus. Comparing the number of idle cpu is
not enough in this case, because the newly forked thread sleeps
immediately and before we select the cpu for the next one.
This is shown in the trace where the same CPU7 is selected for
all wakeup_new events.
That's why, looking at utilization when there is the same number of
CPU is a good way to see where the previous task was placed. Using
nr_running doesn't solve the problem because the newly forked task is not
running and the cpu would not have been idle in this case and an idle
CPU would have been selected instead.

With this patch newly created tasks would be better distributed.

With this patch:
sudo perf record -e sched:sched_wakeup_new -- \
                                    sysbench threads --threads=4 run
...
    total number of events:              74401
...
sudo perf script
sysbench 129455 [006] 75232.853257: sched:sched_wakeup_new:
                            sysbench:129457 [120] success=1 CPU:008
sysbench 129455 [006] 75232.854489: sched:sched_wakeup_new:
                            sysbench:129458 [120] success=1 CPU:009
sysbench 129455 [006] 75232.855732: sched:sched_wakeup_new:
                            sysbench:129459 [120] success=1 CPU:010
sysbench 129455 [006] 75232.856980: sched:sched_wakeup_new:
                            sysbench:129460 [120] success=1 CPU:011

We tested this patch with following benchmarks:
master: 'commit b3a9e3b9622a ("Linux 5.8-rc1")'

100 iterations of: perf bench -f simple futex wake -s -t 128 -w 1
Lower result is better
|         |   BASELINE |   +PATCH |   DELTA (%) |
|---------|------------|----------|-------------|
| mean    |      0.33  |    0.313 |      +5.152 |
| std (%) |     10.433 |    7.563 |             |

100 iterations of: sysbench threads --threads=8 run
Higher result is better
|         |   BASELINE |   +PATCH |   DELTA (%) |
|---------|------------|----------|-------------|
| mean    |   5235.02  | 5863.73  |      +12.01 |
| std (%) |      8.166 |   10.265 |             |

100 iterations of: sysbench mutex --mutex-num=1 --threads=8 run
Lower result is better
|         |   BASELINE |   +PATCH |   DELTA (%) |
|---------|------------|----------|-------------|
| mean    |      0.413 |    0.404 |      +2.179 |
| std (%) |      3.791 |    1.816 |             |

Signed-off-by: Peter Puhov <peter.puhov@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200714125941.4174-1-peter.puhov@linaro.org
---
 kernel/sched/fair.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 98a53a2fe354..2ba8f230feb9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8711,8 +8711,14 @@ static bool update_pick_idlest(struct sched_group *idlest,
 
 	case group_has_spare:
 		/* Select group with most idle CPUs */
-		if (idlest_sgs->idle_cpus >= sgs->idle_cpus)
+		if (idlest_sgs->idle_cpus > sgs->idle_cpus)
 			return false;
+
+		/* Select group with lowest group_util */
+		if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
+			idlest_sgs->group_util <= sgs->group_util)
+			return false;
+
 		break;
 	}
 
-- 
cgit v1.2.3


From 589343569d7b58a64ec2446e6686c8e79aea7fcf Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Thu, 16 Jul 2020 15:04:57 +0800
Subject: smp: Fix a potential usage of stale nr_cpus

The get_option() maybe return 0, it means that the nr_cpus is
not initialized. Then we will use the stale nr_cpus to initialize
the nr_cpu_ids. So fix it.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200716070457.53255-1-songmuchun@bytedance.com
---
 kernel/smp.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index aa17eedff5be..d0ae8eb6bf8b 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -634,8 +634,7 @@ static int __init nrcpus(char *str)
 {
 	int nr_cpus;
 
-	get_option(&str, &nr_cpus);
-	if (nr_cpus > 0 && nr_cpus < nr_cpu_ids)
+	if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids)
 		nr_cpu_ids = nr_cpus;
 
 	return 0;
-- 
cgit v1.2.3


From 25980c7a79af42f2daa73e2f475ebf4cbac8253e Mon Sep 17 00:00:00 2001
From: Valentin Schneider <valentin.schneider@arm.com>
Date: Sun, 12 Jul 2020 17:59:15 +0100
Subject: arch_topology, sched/core: Cleanup thermal pressure definition

The following commit:

  14533a16c46d ("thermal/cpu-cooling, sched/core: Move the arch_set_thermal_pressure() API to generic scheduler code")

moved the definition of arch_set_thermal_pressure() to sched/core.c, but
kept its declaration in linux/arch_topology.h. When building e.g. an x86
kernel with CONFIG_SCHED_THERMAL_PRESSURE=y, cpufreq_cooling.c ends up
getting the declaration of arch_set_thermal_pressure() from
include/linux/arch_topology.h, which is somewhat awkward.

On top of this, sched/core.c unconditionally defines
o The thermal_pressure percpu variable
o arch_set_thermal_pressure()

while arch_scale_thermal_pressure() does nothing unless redefined by the
architecture.

arch_*() functions are meant to be defined by architectures, so revert the
aforementioned commit and re-implement it in a way that keeps
arch_set_thermal_pressure() architecture-definable, and doesn't define the
thermal pressure percpu variable for kernels that don't need
it (CONFIG_SCHED_THERMAL_PRESSURE=n).

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200712165917.9168-2-valentin.schneider@arm.com
---
 arch/arm/include/asm/topology.h   |  3 ++-
 arch/arm64/include/asm/topology.h |  3 ++-
 drivers/base/arch_topology.c      | 11 +++++++++++
 include/linux/arch_topology.h     |  4 ++--
 include/linux/sched/topology.h    |  7 +++++++
 kernel/sched/core.c               | 11 -----------
 6 files changed, 24 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 435aba289fc5..e0593cf095d0 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -16,8 +16,9 @@
 /* Enable topology flag updates */
 #define arch_update_cpu_topology topology_update_cpu_topology
 
-/* Replace task scheduler's default thermal pressure retrieve API */
+/* Replace task scheduler's default thermal pressure API */
 #define arch_scale_thermal_pressure topology_get_thermal_pressure
+#define arch_set_thermal_pressure   topology_set_thermal_pressure
 
 #else
 
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index 0cc835ddfcd1..e042f6527981 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -34,8 +34,9 @@ void topology_scale_freq_tick(void);
 /* Enable topology flag updates */
 #define arch_update_cpu_topology topology_update_cpu_topology
 
-/* Replace task scheduler's default thermal pressure retrieve API */
+/* Replace task scheduler's default thermal pressure API */
 #define arch_scale_thermal_pressure topology_get_thermal_pressure
+#define arch_set_thermal_pressure   topology_set_thermal_pressure
 
 #include <asm-generic/topology.h>
 
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 4d0a0038b476..75f72d684294 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -54,6 +54,17 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity)
 	per_cpu(cpu_scale, cpu) = capacity;
 }
 
+DEFINE_PER_CPU(unsigned long, thermal_pressure);
+
+void topology_set_thermal_pressure(const struct cpumask *cpus,
+			       unsigned long th_pressure)
+{
+	int cpu;
+
+	for_each_cpu(cpu, cpus)
+		WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure);
+}
+
 static ssize_t cpu_capacity_show(struct device *dev,
 				 struct device_attribute *attr,
 				 char *buf)
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index 0566cb3314ef..69b1dabe39dc 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -39,8 +39,8 @@ static inline unsigned long topology_get_thermal_pressure(int cpu)
 	return per_cpu(thermal_pressure, cpu);
 }
 
-void arch_set_thermal_pressure(struct cpumask *cpus,
-			       unsigned long th_pressure);
+void topology_set_thermal_pressure(const struct cpumask *cpus,
+				   unsigned long th_pressure);
 
 struct cpu_topology {
 	int thread_id;
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index fb11091129b3..764222d637b7 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -232,6 +232,13 @@ unsigned long arch_scale_thermal_pressure(int cpu)
 }
 #endif
 
+#ifndef arch_set_thermal_pressure
+static __always_inline
+void arch_set_thermal_pressure(const struct cpumask *cpus,
+			       unsigned long th_pressure)
+{ }
+#endif
+
 static inline int task_node(const struct task_struct *p)
 {
 	return cpu_to_node(task_cpu(p));
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 12db8fbd9c97..bd8e5211d31f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3869,17 +3869,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	return ns;
 }
 
-DEFINE_PER_CPU(unsigned long, thermal_pressure);
-
-void arch_set_thermal_pressure(struct cpumask *cpus,
-			       unsigned long th_pressure)
-{
-	int cpu;
-
-	for_each_cpu(cpu, cpus)
-		WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure);
-}
-
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
-- 
cgit v1.2.3


From bd25b4886ddcebec92591f298ce2ce345d7f2ea9 Mon Sep 17 00:00:00 2001
From: Daniel Jordan <daniel.m.jordan@oracle.com>
Date: Tue, 14 Jul 2020 16:13:51 -0400
Subject: padata: remove start function

padata_start() is only used right after pcrypt allocates an instance
with all possible CPUs, when PADATA_INVALID can't happen, so there's no
need for a separate "start" step.  It can be done during allocation to
save text, make using padata easier, and avoid unneeded calls in the
future.

Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: linux-crypto@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/pcrypt.c        |  3 ---
 include/linux/padata.h |  1 -
 kernel/padata.c        | 26 +-------------------------
 3 files changed, 1 insertion(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c
index cbc383a1a3fe..fb9127ec5357 100644
--- a/crypto/pcrypt.c
+++ b/crypto/pcrypt.c
@@ -355,9 +355,6 @@ static int __init pcrypt_init(void)
 	if (err)
 		goto err_deinit_pencrypt;
 
-	padata_start(pencrypt);
-	padata_start(pdecrypt);
-
 	return crypto_register_template(&pcrypt_tmpl);
 
 err_deinit_pencrypt:
diff --git a/include/linux/padata.h b/include/linux/padata.h
index 7302efff5e65..20294cddc739 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -204,6 +204,5 @@ extern void padata_do_serial(struct padata_priv *padata);
 extern void __init padata_do_multithreaded(struct padata_mt_job *job);
 extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
 			      cpumask_var_t cpumask);
-extern int padata_start(struct padata_instance *pinst);
 extern void padata_stop(struct padata_instance *pinst);
 #endif
diff --git a/kernel/padata.c b/kernel/padata.c
index 4373f7adaa40..931762316612 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -789,30 +789,6 @@ out:
 }
 EXPORT_SYMBOL(padata_set_cpumask);
 
-/**
- * padata_start - start the parallel processing
- *
- * @pinst: padata instance to start
- *
- * Return: 0 on success or negative error code
- */
-int padata_start(struct padata_instance *pinst)
-{
-	int err = 0;
-
-	mutex_lock(&pinst->lock);
-
-	if (pinst->flags & PADATA_INVALID)
-		err = -EINVAL;
-
-	__padata_start(pinst);
-
-	mutex_unlock(&pinst->lock);
-
-	return err;
-}
-EXPORT_SYMBOL(padata_start);
-
 /**
  * padata_stop - stop the parallel processing
  *
@@ -1100,7 +1076,7 @@ static struct padata_instance *padata_alloc(const char *name,
 	if (padata_setup_cpumasks(pinst))
 		goto err_free_rcpumask_cbcpu;
 
-	pinst->flags = 0;
+	__padata_start(pinst);
 
 	kobject_init(&pinst->kobj, &padata_attr_type);
 	mutex_init(&pinst->lock);
-- 
cgit v1.2.3


From 350ef051d4edd884e8dea0be9f3685b4b32142fb Mon Sep 17 00:00:00 2001
From: Daniel Jordan <daniel.m.jordan@oracle.com>
Date: Tue, 14 Jul 2020 16:13:52 -0400
Subject: padata: remove stop function

padata_stop() has two callers and is unnecessary in both cases.  When
pcrypt calls it before padata_free(), it's being unloaded so there are
no outstanding padata jobs[0].  When __padata_free() calls it, it's
either along the same path or else pcrypt initialization failed, which
of course means there are also no outstanding jobs.

Removing it simplifies padata and saves text.

[0] https://lore.kernel.org/linux-crypto/20191119225017.mjrak2fwa5vccazl@gondor.apana.org.au/

Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: linux-crypto@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 Documentation/core-api/padata.rst | 16 ++--------------
 crypto/pcrypt.c                   | 12 +++---------
 include/linux/padata.h            |  1 -
 kernel/padata.c                   | 14 --------------
 4 files changed, 5 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/core-api/padata.rst b/Documentation/core-api/padata.rst
index 0830e5b0e821..771d50330e5b 100644
--- a/Documentation/core-api/padata.rst
+++ b/Documentation/core-api/padata.rst
@@ -31,18 +31,7 @@ padata_instance structure for overall control of how jobs are to be run::
 
 'name' simply identifies the instance.
 
-There are functions for enabling and disabling the instance::
-
-    int padata_start(struct padata_instance *pinst);
-    void padata_stop(struct padata_instance *pinst);
-
-These functions are setting or clearing the "PADATA_INIT" flag; if that flag is
-not set, other functions will refuse to work.  padata_start() returns zero on
-success (flag set) or -EINVAL if the padata cpumask contains no active CPU
-(flag not set).  padata_stop() clears the flag and blocks until the padata
-instance is unused.
-
-Finally, complete padata initialization by allocating a padata_shell::
+Then, complete padata initialization by allocating a padata_shell::
 
    struct padata_shell *padata_alloc_shell(struct padata_instance *pinst);
 
@@ -155,11 +144,10 @@ submitted.
 Destroying
 ----------
 
-Cleaning up a padata instance predictably involves calling the three free
+Cleaning up a padata instance predictably involves calling the two free
 functions that correspond to the allocation in reverse::
 
     void padata_free_shell(struct padata_shell *ps);
-    void padata_stop(struct padata_instance *pinst);
     void padata_free(struct padata_instance *pinst);
 
 It is the user's responsibility to ensure all outstanding jobs are complete
diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c
index fb9127ec5357..2d4ac9d44902 100644
--- a/crypto/pcrypt.c
+++ b/crypto/pcrypt.c
@@ -327,12 +327,6 @@ static int pcrypt_init_padata(struct padata_instance **pinst, const char *name)
 	return ret;
 }
 
-static void pcrypt_fini_padata(struct padata_instance *pinst)
-{
-	padata_stop(pinst);
-	padata_free(pinst);
-}
-
 static struct crypto_template pcrypt_tmpl = {
 	.name = "pcrypt",
 	.create = pcrypt_create,
@@ -358,7 +352,7 @@ static int __init pcrypt_init(void)
 	return crypto_register_template(&pcrypt_tmpl);
 
 err_deinit_pencrypt:
-	pcrypt_fini_padata(pencrypt);
+	padata_free(pencrypt);
 err_unreg_kset:
 	kset_unregister(pcrypt_kset);
 err:
@@ -369,8 +363,8 @@ static void __exit pcrypt_exit(void)
 {
 	crypto_unregister_template(&pcrypt_tmpl);
 
-	pcrypt_fini_padata(pencrypt);
-	pcrypt_fini_padata(pdecrypt);
+	padata_free(pencrypt);
+	padata_free(pdecrypt);
 
 	kset_unregister(pcrypt_kset);
 }
diff --git a/include/linux/padata.h b/include/linux/padata.h
index 20294cddc739..7d53208b43da 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -204,5 +204,4 @@ extern void padata_do_serial(struct padata_priv *padata);
 extern void __init padata_do_multithreaded(struct padata_mt_job *job);
 extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
 			      cpumask_var_t cpumask);
-extern void padata_stop(struct padata_instance *pinst);
 #endif
diff --git a/kernel/padata.c b/kernel/padata.c
index 931762316612..8f55e717ba50 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -789,19 +789,6 @@ out:
 }
 EXPORT_SYMBOL(padata_set_cpumask);
 
-/**
- * padata_stop - stop the parallel processing
- *
- * @pinst: padata instance to stop
- */
-void padata_stop(struct padata_instance *pinst)
-{
-	mutex_lock(&pinst->lock);
-	__padata_stop(pinst);
-	mutex_unlock(&pinst->lock);
-}
-EXPORT_SYMBOL(padata_stop);
-
 #ifdef CONFIG_HOTPLUG_CPU
 
 static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
@@ -883,7 +870,6 @@ static void __padata_free(struct padata_instance *pinst)
 
 	WARN_ON(!list_empty(&pinst->pslist));
 
-	padata_stop(pinst);
 	free_cpumask_var(pinst->rcpumask.cbcpu);
 	free_cpumask_var(pinst->rcpumask.pcpu);
 	free_cpumask_var(pinst->cpumask.pcpu);
-- 
cgit v1.2.3


From cec00e6e120f4f0743d2bf0638418684c74b16e0 Mon Sep 17 00:00:00 2001
From: Daniel Jordan <daniel.m.jordan@oracle.com>
Date: Tue, 14 Jul 2020 16:13:53 -0400
Subject: padata: inline single call of pd_setup_cpumasks()

pd_setup_cpumasks() has only one caller.  Move its contents inline to
prepare for the next cleanup.

Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: linux-crypto@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 kernel/padata.c | 32 +++++++++-----------------------
 1 file changed, 9 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/padata.c b/kernel/padata.c
index 8f55e717ba50..27f90a3c4dc6 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -441,28 +441,6 @@ static int padata_setup_cpumasks(struct padata_instance *pinst)
 	return err;
 }
 
-static int pd_setup_cpumasks(struct parallel_data *pd,
-			     const struct cpumask *pcpumask,
-			     const struct cpumask *cbcpumask)
-{
-	int err = -ENOMEM;
-
-	if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
-		goto out;
-	if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL))
-		goto free_pcpu_mask;
-
-	cpumask_copy(pd->cpumask.pcpu, pcpumask);
-	cpumask_copy(pd->cpumask.cbcpu, cbcpumask);
-
-	return 0;
-
-free_pcpu_mask:
-	free_cpumask_var(pd->cpumask.pcpu);
-out:
-	return err;
-}
-
 static void __init padata_mt_helper(struct work_struct *w)
 {
 	struct padata_work *pw = container_of(w, struct padata_work, pw_work);
@@ -613,8 +591,14 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
 		goto err_free_pqueue;
 
 	pd->ps = ps;
-	if (pd_setup_cpumasks(pd, pcpumask, cbcpumask))
+
+	if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
 		goto err_free_squeue;
+	if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL))
+		goto err_free_pcpu;
+
+	cpumask_copy(pd->cpumask.pcpu, pcpumask);
+	cpumask_copy(pd->cpumask.cbcpu, cbcpumask);
 
 	padata_init_pqueues(pd);
 	padata_init_squeues(pd);
@@ -626,6 +610,8 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
 
 	return pd;
 
+err_free_pcpu:
+	free_cpumask_var(pd->cpumask.pcpu);
 err_free_squeue:
 	free_percpu(pd->squeue);
 err_free_pqueue:
-- 
cgit v1.2.3


From d69e037bcc4a7e31fdd40ae416aa1bd768dd7d99 Mon Sep 17 00:00:00 2001
From: Daniel Jordan <daniel.m.jordan@oracle.com>
Date: Tue, 14 Jul 2020 16:13:54 -0400
Subject: padata: remove effective cpumasks from the instance

A padata instance has effective cpumasks that store the user-supplied
masks ANDed with the online mask, but this middleman is unnecessary.
parallel_data keeps the same information around.  Removing this saves
text and code churn in future changes.

Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: linux-crypto@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/padata.h |  2 --
 kernel/padata.c        | 30 +++---------------------------
 2 files changed, 3 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/padata.h b/include/linux/padata.h
index 7d53208b43da..a941b96b7119 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -167,7 +167,6 @@ struct padata_mt_job {
  * @serial_wq: The workqueue used for serial work.
  * @pslist: List of padata_shell objects attached to this instance.
  * @cpumask: User supplied cpumasks for parallel and serial works.
- * @rcpumask: Actual cpumasks based on user cpumask and cpu_online_mask.
  * @kobj: padata instance kernel object.
  * @lock: padata instance lock.
  * @flags: padata flags.
@@ -179,7 +178,6 @@ struct padata_instance {
 	struct workqueue_struct		*serial_wq;
 	struct list_head		pslist;
 	struct padata_cpumask		cpumask;
-	struct padata_cpumask		rcpumask;
 	struct kobject                   kobj;
 	struct mutex			 lock;
 	u8				 flags;
diff --git a/kernel/padata.c b/kernel/padata.c
index 27f90a3c4dc6..4f0a57e5738c 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -571,13 +571,8 @@ static void padata_init_pqueues(struct parallel_data *pd)
 static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
 {
 	struct padata_instance *pinst = ps->pinst;
-	const struct cpumask *cbcpumask;
-	const struct cpumask *pcpumask;
 	struct parallel_data *pd;
 
-	cbcpumask = pinst->rcpumask.cbcpu;
-	pcpumask = pinst->rcpumask.pcpu;
-
 	pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
 	if (!pd)
 		goto err;
@@ -597,8 +592,8 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
 	if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL))
 		goto err_free_pcpu;
 
-	cpumask_copy(pd->cpumask.pcpu, pcpumask);
-	cpumask_copy(pd->cpumask.cbcpu, cbcpumask);
+	cpumask_and(pd->cpumask.pcpu, pinst->cpumask.pcpu, cpu_online_mask);
+	cpumask_and(pd->cpumask.cbcpu, pinst->cpumask.cbcpu, cpu_online_mask);
 
 	padata_init_pqueues(pd);
 	padata_init_squeues(pd);
@@ -668,12 +663,6 @@ static int padata_replace(struct padata_instance *pinst)
 
 	pinst->flags |= PADATA_RESET;
 
-	cpumask_and(pinst->rcpumask.pcpu, pinst->cpumask.pcpu,
-		    cpu_online_mask);
-
-	cpumask_and(pinst->rcpumask.cbcpu, pinst->cpumask.cbcpu,
-		    cpu_online_mask);
-
 	list_for_each_entry(ps, &pinst->pslist, list) {
 		err = padata_replace_one(ps);
 		if (err)
@@ -856,8 +845,6 @@ static void __padata_free(struct padata_instance *pinst)
 
 	WARN_ON(!list_empty(&pinst->pslist));
 
-	free_cpumask_var(pinst->rcpumask.cbcpu);
-	free_cpumask_var(pinst->rcpumask.pcpu);
 	free_cpumask_var(pinst->cpumask.pcpu);
 	free_cpumask_var(pinst->cpumask.cbcpu);
 	destroy_workqueue(pinst->serial_wq);
@@ -1033,20 +1020,13 @@ static struct padata_instance *padata_alloc(const char *name,
 	    !padata_validate_cpumask(pinst, cbcpumask))
 		goto err_free_masks;
 
-	if (!alloc_cpumask_var(&pinst->rcpumask.pcpu, GFP_KERNEL))
-		goto err_free_masks;
-	if (!alloc_cpumask_var(&pinst->rcpumask.cbcpu, GFP_KERNEL))
-		goto err_free_rcpumask_pcpu;
-
 	INIT_LIST_HEAD(&pinst->pslist);
 
 	cpumask_copy(pinst->cpumask.pcpu, pcpumask);
 	cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
-	cpumask_and(pinst->rcpumask.pcpu, pcpumask, cpu_online_mask);
-	cpumask_and(pinst->rcpumask.cbcpu, cbcpumask, cpu_online_mask);
 
 	if (padata_setup_cpumasks(pinst))
-		goto err_free_rcpumask_cbcpu;
+		goto err_free_masks;
 
 	__padata_start(pinst);
 
@@ -1064,10 +1044,6 @@ static struct padata_instance *padata_alloc(const char *name,
 
 	return pinst;
 
-err_free_rcpumask_cbcpu:
-	free_cpumask_var(pinst->rcpumask.cbcpu);
-err_free_rcpumask_pcpu:
-	free_cpumask_var(pinst->rcpumask.pcpu);
 err_free_masks:
 	free_cpumask_var(pinst->cpumask.pcpu);
 	free_cpumask_var(pinst->cpumask.cbcpu);
-- 
cgit v1.2.3


From 3f257191d31d5eaf154ebdb696efc238837ddd51 Mon Sep 17 00:00:00 2001
From: Daniel Jordan <daniel.m.jordan@oracle.com>
Date: Tue, 14 Jul 2020 16:13:55 -0400
Subject: padata: fold padata_alloc_possible() into padata_alloc()

There's no reason to have two interfaces when there's only one caller.
Removing _possible saves text and simplifies future changes.

Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: linux-crypto@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 Documentation/core-api/padata.rst |  2 +-
 crypto/pcrypt.c                   |  2 +-
 include/linux/padata.h            |  2 +-
 kernel/padata.c                   | 33 +++++----------------------------
 4 files changed, 8 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/core-api/padata.rst b/Documentation/core-api/padata.rst
index 771d50330e5b..35175710b43c 100644
--- a/Documentation/core-api/padata.rst
+++ b/Documentation/core-api/padata.rst
@@ -27,7 +27,7 @@ padata_instance structure for overall control of how jobs are to be run::
 
     #include <linux/padata.h>
 
-    struct padata_instance *padata_alloc_possible(const char *name);
+    struct padata_instance *padata_alloc(const char *name);
 
 'name' simply identifies the instance.
 
diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c
index 2d4ac9d44902..d569c7ed6c80 100644
--- a/crypto/pcrypt.c
+++ b/crypto/pcrypt.c
@@ -316,7 +316,7 @@ static int pcrypt_init_padata(struct padata_instance **pinst, const char *name)
 {
 	int ret = -ENOMEM;
 
-	*pinst = padata_alloc_possible(name);
+	*pinst = padata_alloc(name);
 	if (!*pinst)
 		return ret;
 
diff --git a/include/linux/padata.h b/include/linux/padata.h
index a941b96b7119..070a7d43e8af 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -192,7 +192,7 @@ extern void __init padata_init(void);
 static inline void __init padata_init(void) {}
 #endif
 
-extern struct padata_instance *padata_alloc_possible(const char *name);
+extern struct padata_instance *padata_alloc(const char *name);
 extern void padata_free(struct padata_instance *pinst);
 extern struct padata_shell *padata_alloc_shell(struct padata_instance *pinst);
 extern void padata_free_shell(struct padata_shell *ps);
diff --git a/kernel/padata.c b/kernel/padata.c
index 4f0a57e5738c..1c0b97891edb 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -979,18 +979,12 @@ static struct kobj_type padata_attr_type = {
 };
 
 /**
- * padata_alloc - allocate and initialize a padata instance and specify
- *                cpumasks for serial and parallel workers.
- *
+ * padata_alloc - allocate and initialize a padata instance
  * @name: used to identify the instance
- * @pcpumask: cpumask that will be used for padata parallelization
- * @cbcpumask: cpumask that will be used for padata serialization
  *
  * Return: new instance on success, NULL on error
  */
-static struct padata_instance *padata_alloc(const char *name,
-					    const struct cpumask *pcpumask,
-					    const struct cpumask *cbcpumask)
+struct padata_instance *padata_alloc(const char *name)
 {
 	struct padata_instance *pinst;
 
@@ -1016,14 +1010,11 @@ static struct padata_instance *padata_alloc(const char *name,
 		free_cpumask_var(pinst->cpumask.pcpu);
 		goto err_free_serial_wq;
 	}
-	if (!padata_validate_cpumask(pinst, pcpumask) ||
-	    !padata_validate_cpumask(pinst, cbcpumask))
-		goto err_free_masks;
 
 	INIT_LIST_HEAD(&pinst->pslist);
 
-	cpumask_copy(pinst->cpumask.pcpu, pcpumask);
-	cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
+	cpumask_copy(pinst->cpumask.pcpu, cpu_possible_mask);
+	cpumask_copy(pinst->cpumask.cbcpu, cpu_possible_mask);
 
 	if (padata_setup_cpumasks(pinst))
 		goto err_free_masks;
@@ -1057,21 +1048,7 @@ err_free_inst:
 err:
 	return NULL;
 }
-
-/**
- * padata_alloc_possible - Allocate and initialize padata instance.
- *                         Use the cpu_possible_mask for serial and
- *                         parallel workers.
- *
- * @name: used to identify the instance
- *
- * Return: new instance on success, NULL on error
- */
-struct padata_instance *padata_alloc_possible(const char *name)
-{
-	return padata_alloc(name, cpu_possible_mask, cpu_possible_mask);
-}
-EXPORT_SYMBOL(padata_alloc_possible);
+EXPORT_SYMBOL(padata_alloc);
 
 /**
  * padata_free - free a padata instance
-- 
cgit v1.2.3


From f601c725a6ac072608f47083e7c8115a3f504f95 Mon Sep 17 00:00:00 2001
From: Daniel Jordan <daniel.m.jordan@oracle.com>
Date: Tue, 14 Jul 2020 16:13:56 -0400
Subject: padata: remove padata_parallel_queue

Only its reorder field is actually used now, so remove the struct and
embed @reorder directly in parallel_data.

No functional change, just a cleanup.

Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: linux-crypto@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/padata.h | 15 ++-------------
 kernel/padata.c        | 46 ++++++++++++++++++++--------------------------
 2 files changed, 22 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/padata.h b/include/linux/padata.h
index 070a7d43e8af..a433f13fc4bf 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -66,17 +66,6 @@ struct padata_serial_queue {
        struct parallel_data *pd;
 };
 
-/**
- * struct padata_parallel_queue - The percpu padata parallel queue
- *
- * @reorder: List to wait for reordering after parallel processing.
- * @num_obj: Number of objects that are processed by this cpu.
- */
-struct padata_parallel_queue {
-       struct padata_list    reorder;
-       atomic_t              num_obj;
-};
-
 /**
  * struct padata_cpumask - The cpumasks for the parallel/serial workers
  *
@@ -93,7 +82,7 @@ struct padata_cpumask {
  * that depends on the cpumask in use.
  *
  * @ps: padata_shell object.
- * @pqueue: percpu padata queues used for parallelization.
+ * @reorder_list: percpu reorder lists
  * @squeue: percpu padata queues used for serialuzation.
  * @refcnt: Number of objects holding a reference on this parallel_data.
  * @seq_nr: Sequence number of the parallelized data object.
@@ -105,7 +94,7 @@ struct padata_cpumask {
  */
 struct parallel_data {
 	struct padata_shell		*ps;
-	struct padata_parallel_queue	__percpu *pqueue;
+	struct padata_list		__percpu *reorder_list;
 	struct padata_serial_queue	__percpu *squeue;
 	atomic_t			refcnt;
 	unsigned int			seq_nr;
diff --git a/kernel/padata.c b/kernel/padata.c
index 1c0b97891edb..16cb894dc272 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -250,13 +250,11 @@ EXPORT_SYMBOL(padata_do_parallel);
 static struct padata_priv *padata_find_next(struct parallel_data *pd,
 					    bool remove_object)
 {
-	struct padata_parallel_queue *next_queue;
 	struct padata_priv *padata;
 	struct padata_list *reorder;
 	int cpu = pd->cpu;
 
-	next_queue = per_cpu_ptr(pd->pqueue, cpu);
-	reorder = &next_queue->reorder;
+	reorder = per_cpu_ptr(pd->reorder_list, cpu);
 
 	spin_lock(&reorder->lock);
 	if (list_empty(&reorder->list)) {
@@ -291,7 +289,7 @@ static void padata_reorder(struct parallel_data *pd)
 	int cb_cpu;
 	struct padata_priv *padata;
 	struct padata_serial_queue *squeue;
-	struct padata_parallel_queue *next_queue;
+	struct padata_list *reorder;
 
 	/*
 	 * We need to ensure that only one cpu can work on dequeueing of
@@ -339,9 +337,8 @@ static void padata_reorder(struct parallel_data *pd)
 	 */
 	smp_mb();
 
-	next_queue = per_cpu_ptr(pd->pqueue, pd->cpu);
-	if (!list_empty(&next_queue->reorder.list) &&
-	    padata_find_next(pd, false))
+	reorder = per_cpu_ptr(pd->reorder_list, pd->cpu);
+	if (!list_empty(&reorder->list) && padata_find_next(pd, false))
 		queue_work(pinst->serial_wq, &pd->reorder_work);
 }
 
@@ -401,17 +398,16 @@ void padata_do_serial(struct padata_priv *padata)
 {
 	struct parallel_data *pd = padata->pd;
 	int hashed_cpu = padata_cpu_hash(pd, padata->seq_nr);
-	struct padata_parallel_queue *pqueue = per_cpu_ptr(pd->pqueue,
-							   hashed_cpu);
+	struct padata_list *reorder = per_cpu_ptr(pd->reorder_list, hashed_cpu);
 	struct padata_priv *cur;
 
-	spin_lock(&pqueue->reorder.lock);
+	spin_lock(&reorder->lock);
 	/* Sort in ascending order of sequence number. */
-	list_for_each_entry_reverse(cur, &pqueue->reorder.list, list)
+	list_for_each_entry_reverse(cur, &reorder->list, list)
 		if (cur->seq_nr < padata->seq_nr)
 			break;
 	list_add(&padata->list, &cur->list);
-	spin_unlock(&pqueue->reorder.lock);
+	spin_unlock(&reorder->lock);
 
 	/*
 	 * Ensure the addition to the reorder list is ordered correctly
@@ -553,17 +549,15 @@ static void padata_init_squeues(struct parallel_data *pd)
 	}
 }
 
-/* Initialize all percpu queues used by parallel workers */
-static void padata_init_pqueues(struct parallel_data *pd)
+/* Initialize per-CPU reorder lists */
+static void padata_init_reorder_list(struct parallel_data *pd)
 {
 	int cpu;
-	struct padata_parallel_queue *pqueue;
+	struct padata_list *list;
 
 	for_each_cpu(cpu, pd->cpumask.pcpu) {
-		pqueue = per_cpu_ptr(pd->pqueue, cpu);
-
-		__padata_list_init(&pqueue->reorder);
-		atomic_set(&pqueue->num_obj, 0);
+		list = per_cpu_ptr(pd->reorder_list, cpu);
+		__padata_list_init(list);
 	}
 }
 
@@ -577,13 +571,13 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
 	if (!pd)
 		goto err;
 
-	pd->pqueue = alloc_percpu(struct padata_parallel_queue);
-	if (!pd->pqueue)
+	pd->reorder_list = alloc_percpu(struct padata_list);
+	if (!pd->reorder_list)
 		goto err_free_pd;
 
 	pd->squeue = alloc_percpu(struct padata_serial_queue);
 	if (!pd->squeue)
-		goto err_free_pqueue;
+		goto err_free_reorder_list;
 
 	pd->ps = ps;
 
@@ -595,7 +589,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
 	cpumask_and(pd->cpumask.pcpu, pinst->cpumask.pcpu, cpu_online_mask);
 	cpumask_and(pd->cpumask.cbcpu, pinst->cpumask.cbcpu, cpu_online_mask);
 
-	padata_init_pqueues(pd);
+	padata_init_reorder_list(pd);
 	padata_init_squeues(pd);
 	pd->seq_nr = -1;
 	atomic_set(&pd->refcnt, 1);
@@ -609,8 +603,8 @@ err_free_pcpu:
 	free_cpumask_var(pd->cpumask.pcpu);
 err_free_squeue:
 	free_percpu(pd->squeue);
-err_free_pqueue:
-	free_percpu(pd->pqueue);
+err_free_reorder_list:
+	free_percpu(pd->reorder_list);
 err_free_pd:
 	kfree(pd);
 err:
@@ -621,7 +615,7 @@ static void padata_free_pd(struct parallel_data *pd)
 {
 	free_cpumask_var(pd->cpumask.pcpu);
 	free_cpumask_var(pd->cpumask.cbcpu);
-	free_percpu(pd->pqueue);
+	free_percpu(pd->reorder_list);
 	free_percpu(pd->squeue);
 	kfree(pd);
 }
-- 
cgit v1.2.3


From 072e133d554bb74929f902582c8cf66d9fd12771 Mon Sep 17 00:00:00 2001
From: Peter Enderborg <peter.enderborg@sony.com>
Date: Thu, 16 Jul 2020 09:15:10 +0200
Subject: tracefs: Remove unnecessary debug_fs checks.

This is a preparation for debugfs restricted mode.
We don't need debugfs to trace, the removed check stop tracefs to work
if debugfs is not initialised. We instead tries to automount within
debugfs and relay on it's handling. The code path is to create a
backward compatibility from when tracefs was part of debugfs, it is now
standalone and does not need debugfs. When debugfs is in restricted
it is compiled in but not active and return EPERM to clients and
tracefs wont work if it assumes it is active it is compiled in
kernel.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Peter Enderborg <peter.enderborg@sony.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Link: https://lore.kernel.org/r/20200716071511.26864-2-peter.enderborg@sony.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/trace/trace.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bb62269724d5..848f67a5f16d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8945,9 +8945,7 @@ struct dentry *tracing_init_dentry(void)
 	if (tr->dir)
 		return NULL;
 
-	if (WARN_ON(!tracefs_initialized()) ||
-		(IS_ENABLED(CONFIG_DEBUG_FS) &&
-		 WARN_ON(!debugfs_initialized())))
+	if (WARN_ON(!tracefs_initialized()))
 		return ERR_PTR(-ENODEV);
 
 	/*
-- 
cgit v1.2.3


From 31cd0e119d50cf27ebe214d1a8f7ca36692f13a5 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Thu, 23 Jul 2020 17:16:41 +0200
Subject: timers: Recalculate next timer interrupt only when necessary

The nohz tick code recalculates the timer wheel's next expiry on each idle
loop iteration.

On the other hand, the base next expiry is now always cached and updated
upon timer enqueue and execution. Only timer dequeue may leave
base->next_expiry out of date (but then its stale value won't ever go past
the actual next expiry to be recalculated).

Since recalculating the next_expiry isn't a free operation, especially when
the last wheel level is reached to find out that no timer has been enqueued
at all, reuse the next expiry cache when it is known to be reliable, which
it is most of the time.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20200723151641.12236-1-frederic@kernel.org
---
 kernel/time/timer.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 77e21e98ec32..96d802e9769e 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -204,6 +204,7 @@ struct timer_base {
 	unsigned long		clk;
 	unsigned long		next_expiry;
 	unsigned int		cpu;
+	bool			next_expiry_recalc;
 	bool			is_idle;
 	DECLARE_BITMAP(pending_map, WHEEL_SIZE);
 	struct hlist_head	vectors[WHEEL_SIZE];
@@ -593,6 +594,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
 		 * can reevaluate the wheel:
 		 */
 		base->next_expiry = bucket_expiry;
+		base->next_expiry_recalc = false;
 		trigger_dyntick_cpu(base, timer);
 	}
 }
@@ -836,8 +838,10 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
 	if (!timer_pending(timer))
 		return 0;
 
-	if (hlist_is_singular_node(&timer->entry, base->vectors + idx))
+	if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) {
 		__clear_bit(idx, base->pending_map);
+		base->next_expiry_recalc = true;
+	}
 
 	detach_timer(timer, clear_pending);
 	return 1;
@@ -1571,6 +1575,9 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
 		clk >>= LVL_CLK_SHIFT;
 		clk += adj;
 	}
+
+	base->next_expiry_recalc = false;
+
 	return next;
 }
 
@@ -1631,9 +1638,11 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 		return expires;
 
 	raw_spin_lock(&base->lock);
-	nextevt = __next_timer_interrupt(base);
+	if (base->next_expiry_recalc)
+		base->next_expiry = __next_timer_interrupt(base);
+	nextevt = base->next_expiry;
 	is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
-	base->next_expiry = nextevt;
+
 	/*
 	 * We have a fresh next event. Check whether we can forward the
 	 * base. We can only do that when @basej is past base->clk
@@ -1725,6 +1734,12 @@ static inline void __run_timers(struct timer_base *base)
 	while (time_after_eq(jiffies, base->clk) &&
 	       time_after_eq(jiffies, base->next_expiry)) {
 		levels = collect_expired_timers(base, heads);
+		/*
+		 * The only possible reason for not finding any expired
+		 * timer at this clk is that all matching timers have been
+		 * dequeued.
+		 */
+		WARN_ON_ONCE(!levels && !base->next_expiry_recalc);
 		base->clk++;
 		base->next_expiry = __next_timer_interrupt(base);
 
-- 
cgit v1.2.3


From 142781e108b13b2b0e8f035cfb5bfbbc8f14d887 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 22 Jul 2020 23:59:56 +0200
Subject: entry: Provide generic syscall entry functionality

On syscall entry certain work needs to be done:

   - Establish state (lockdep, context tracking, tracing)
   - Conditional work (ptrace, seccomp, audit...)

This code is needlessly duplicated and  different in all
architectures.

Provide a generic version based on the x86 implementation which has all the
RCU and instrumentation bits right.

As interrupt/exception entry from user space needs parts of the same
functionality, provide a function for this as well.

syscall_enter_from_user_mode() and irqentry_enter_from_user_mode() must be
called right after the low level ASM entry. The calling code must be
non-instrumentable. After the functions returns state is correct and the
subsequent functions can be instrumented.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20200722220519.513463269@linutronix.de
---
 arch/Kconfig                 |   3 ++
 include/linux/entry-common.h | 121 +++++++++++++++++++++++++++++++++++++++++++
 kernel/Makefile              |   1 +
 kernel/entry/Makefile        |  12 +++++
 kernel/entry/common.c        |  88 +++++++++++++++++++++++++++++++
 5 files changed, 225 insertions(+)
 create mode 100644 include/linux/entry-common.h
 create mode 100644 kernel/entry/Makefile
 create mode 100644 kernel/entry/common.c

(limited to 'kernel')

diff --git a/arch/Kconfig b/arch/Kconfig
index 8cc35dc556c7..852a527f418f 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -27,6 +27,9 @@ config HAVE_IMA_KEXEC
 config HOTPLUG_SMT
 	bool
 
+config GENERIC_ENTRY
+       bool
+
 config OPROFILE
 	tristate "OProfile system profiling"
 	depends on PROFILING
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
new file mode 100644
index 000000000000..42fc8e4632bb
--- /dev/null
+++ b/include/linux/entry-common.h
@@ -0,0 +1,121 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_ENTRYCOMMON_H
+#define __LINUX_ENTRYCOMMON_H
+
+#include <linux/tracehook.h>
+#include <linux/syscalls.h>
+#include <linux/seccomp.h>
+#include <linux/sched.h>
+
+#include <asm/entry-common.h>
+
+/*
+ * Define dummy _TIF work flags if not defined by the architecture or for
+ * disabled functionality.
+ */
+#ifndef _TIF_SYSCALL_EMU
+# define _TIF_SYSCALL_EMU		(0)
+#endif
+
+#ifndef _TIF_SYSCALL_TRACEPOINT
+# define _TIF_SYSCALL_TRACEPOINT	(0)
+#endif
+
+#ifndef _TIF_SECCOMP
+# define _TIF_SECCOMP			(0)
+#endif
+
+#ifndef _TIF_SYSCALL_AUDIT
+# define _TIF_SYSCALL_AUDIT		(0)
+#endif
+
+/*
+ * TIF flags handled in syscall_enter_from_usermode()
+ */
+#ifndef ARCH_SYSCALL_ENTER_WORK
+# define ARCH_SYSCALL_ENTER_WORK	(0)
+#endif
+
+#define SYSCALL_ENTER_WORK						\
+	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP |	\
+	 _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_EMU |			\
+	 ARCH_SYSCALL_ENTER_WORK)
+
+/**
+ * arch_check_user_regs - Architecture specific sanity check for user mode regs
+ * @regs:	Pointer to currents pt_regs
+ *
+ * Defaults to an empty implementation. Can be replaced by architecture
+ * specific code.
+ *
+ * Invoked from syscall_enter_from_user_mode() in the non-instrumentable
+ * section. Use __always_inline so the compiler cannot push it out of line
+ * and make it instrumentable.
+ */
+static __always_inline void arch_check_user_regs(struct pt_regs *regs);
+
+#ifndef arch_check_user_regs
+static __always_inline void arch_check_user_regs(struct pt_regs *regs) {}
+#endif
+
+/**
+ * arch_syscall_enter_tracehook - Wrapper around tracehook_report_syscall_entry()
+ * @regs:	Pointer to currents pt_regs
+ *
+ * Returns: 0 on success or an error code to skip the syscall.
+ *
+ * Defaults to tracehook_report_syscall_entry(). Can be replaced by
+ * architecture specific code.
+ *
+ * Invoked from syscall_enter_from_user_mode()
+ */
+static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs);
+
+#ifndef arch_syscall_enter_tracehook
+static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs)
+{
+	return tracehook_report_syscall_entry(regs);
+}
+#endif
+
+/**
+ * syscall_enter_from_user_mode - Check and handle work before invoking
+ *				 a syscall
+ * @regs:	Pointer to currents pt_regs
+ * @syscall:	The syscall number
+ *
+ * Invoked from architecture specific syscall entry code with interrupts
+ * disabled. The calling code has to be non-instrumentable. When the
+ * function returns all state is correct and the subsequent functions can be
+ * instrumented.
+ *
+ * Returns: The original or a modified syscall number
+ *
+ * If the returned syscall number is -1 then the syscall should be
+ * skipped. In this case the caller may invoke syscall_set_error() or
+ * syscall_set_return_value() first.  If neither of those are called and -1
+ * is returned, then the syscall will fail with ENOSYS.
+ *
+ * The following functionality is handled here:
+ *
+ *  1) Establish state (lockdep, RCU (context tracking), tracing)
+ *  2) TIF flag dependent invocations of arch_syscall_enter_tracehook(),
+ *     __secure_computing(), trace_sys_enter()
+ *  3) Invocation of audit_syscall_entry()
+ */
+long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall);
+
+/**
+ * irqentry_enter_from_user_mode - Establish state before invoking the irq handler
+ * @regs:	Pointer to currents pt_regs
+ *
+ * Invoked from architecture specific entry code with interrupts disabled.
+ * Can only be called when the interrupt entry came from user mode. The
+ * calling code must be non-instrumentable.  When the function returns all
+ * state is correct and the subsequent functions can be instrumented.
+ *
+ * The function establishes state (lockdep, RCU (context tracking), tracing)
+ */
+void irqentry_enter_from_user_mode(struct pt_regs *regs);
+
+#endif
diff --git a/kernel/Makefile b/kernel/Makefile
index f3218bc5ec69..fde2000d0d0d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -48,6 +48,7 @@ obj-y += irq/
 obj-y += rcu/
 obj-y += livepatch/
 obj-y += dma/
+obj-y += entry/
 
 obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
 obj-$(CONFIG_FREEZER) += freezer.o
diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile
new file mode 100644
index 000000000000..c207d202bf3a
--- /dev/null
+++ b/kernel/entry/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# Prevent the noinstr section from being pestered by sanitizer and other goodies
+# as long as these things cannot be disabled per function.
+KASAN_SANITIZE := n
+UBSAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+
+CFLAGS_REMOVE_common.o	 = -fstack-protector -fstack-protector-strong
+CFLAGS_common.o		+= -fno-stack-protector
+
+obj-$(CONFIG_GENERIC_ENTRY) += common.o
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
new file mode 100644
index 000000000000..1d636dee2fec
--- /dev/null
+++ b/kernel/entry/common.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/context_tracking.h>
+#include <linux/entry-common.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
+/**
+ * enter_from_user_mode - Establish state when coming from user mode
+ *
+ * Syscall/interrupt entry disables interrupts, but user mode is traced as
+ * interrupts enabled. Also with NO_HZ_FULL RCU might be idle.
+ *
+ * 1) Tell lockdep that interrupts are disabled
+ * 2) Invoke context tracking if enabled to reactivate RCU
+ * 3) Trace interrupts off state
+ */
+static __always_inline void enter_from_user_mode(struct pt_regs *regs)
+{
+	arch_check_user_regs(regs);
+	lockdep_hardirqs_off(CALLER_ADDR0);
+
+	CT_WARN_ON(ct_state() != CONTEXT_USER);
+	user_exit_irqoff();
+
+	instrumentation_begin();
+	trace_hardirqs_off_finish();
+	instrumentation_end();
+}
+
+static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
+{
+	if (unlikely(audit_context())) {
+		unsigned long args[6];
+
+		syscall_get_arguments(current, regs, args);
+		audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
+	}
+}
+
+static long syscall_trace_enter(struct pt_regs *regs, long syscall,
+				unsigned long ti_work)
+{
+	long ret = 0;
+
+	/* Handle ptrace */
+	if (ti_work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) {
+		ret = arch_syscall_enter_tracehook(regs);
+		if (ret || (ti_work & _TIF_SYSCALL_EMU))
+			return -1L;
+	}
+
+	/* Do seccomp after ptrace, to catch any tracer changes. */
+	if (ti_work & _TIF_SECCOMP) {
+		ret = __secure_computing(NULL);
+		if (ret == -1L)
+			return ret;
+	}
+
+	if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT))
+		trace_sys_enter(regs, syscall);
+
+	syscall_enter_audit(regs, syscall);
+
+	return ret ? : syscall;
+}
+
+noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
+{
+	unsigned long ti_work;
+
+	enter_from_user_mode(regs);
+	instrumentation_begin();
+
+	local_irq_enable();
+	ti_work = READ_ONCE(current_thread_info()->flags);
+	if (ti_work & SYSCALL_ENTER_WORK)
+		syscall = syscall_trace_enter(regs, syscall, ti_work);
+	instrumentation_end();
+
+	return syscall;
+}
+
+noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
+{
+	enter_from_user_mode(regs);
+}
-- 
cgit v1.2.3


From a9f3a74a29af095f3e1b89e9176f8127912ae0f0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 22 Jul 2020 23:59:57 +0200
Subject: entry: Provide generic syscall exit function

Like syscall entry all architectures have similar and pointlessly different
code to handle pending work before returning from a syscall to user space.

  1) One-time syscall exit work:
      - rseq syscall exit
      - audit
      - syscall tracing
      - tracehook (single stepping)

  2) Preparatory work
      - Exit to user mode loop (common TIF handling).
      - Architecture specific one time work arch_exit_to_user_mode_prepare()
      - Address limit and lockdep checks

  3) Final transition (lockdep, tracing, context tracking, RCU). Invokes
     arch_exit_to_user_mode() to handle e.g. speculation mitigations

Provide a generic version based on the x86 code which has all the RCU and
instrumentation protections right.

Provide a variant for interrupt return to user mode as well which shares
the above #2 and #3 work items.

After syscall_exit_to_user_mode() and irqentry_exit_to_user_mode() the
architecture code just has to return to user space. The code after
returning from these functions must not be instrumented.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20200722220519.613977173@linutronix.de
---
 include/linux/entry-common.h | 189 +++++++++++++++++++++++++++++++++++++++++++
 kernel/entry/common.c        | 169 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 358 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 42fc8e4632bb..c4a57be9cde4 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -29,6 +29,14 @@
 # define _TIF_SYSCALL_AUDIT		(0)
 #endif
 
+#ifndef _TIF_PATCH_PENDING
+# define _TIF_PATCH_PENDING		(0)
+#endif
+
+#ifndef _TIF_UPROBE
+# define _TIF_UPROBE			(0)
+#endif
+
 /*
  * TIF flags handled in syscall_enter_from_usermode()
  */
@@ -41,6 +49,29 @@
 	 _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_EMU |			\
 	 ARCH_SYSCALL_ENTER_WORK)
 
+/*
+ * TIF flags handled in syscall_exit_to_user_mode()
+ */
+#ifndef ARCH_SYSCALL_EXIT_WORK
+# define ARCH_SYSCALL_EXIT_WORK		(0)
+#endif
+
+#define SYSCALL_EXIT_WORK						\
+	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |			\
+	 _TIF_SYSCALL_TRACEPOINT | ARCH_SYSCALL_EXIT_WORK)
+
+/*
+ * TIF flags handled in exit_to_user_mode_loop()
+ */
+#ifndef ARCH_EXIT_TO_USER_MODE_WORK
+# define ARCH_EXIT_TO_USER_MODE_WORK		(0)
+#endif
+
+#define EXIT_TO_USER_MODE_WORK						\
+	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |		\
+	 _TIF_NEED_RESCHED | _TIF_PATCH_PENDING |			\
+	 ARCH_EXIT_TO_USER_MODE_WORK)
+
 /**
  * arch_check_user_regs - Architecture specific sanity check for user mode regs
  * @regs:	Pointer to currents pt_regs
@@ -105,6 +136,149 @@ static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs
  */
 long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall);
 
+/**
+ * local_irq_enable_exit_to_user - Exit to user variant of local_irq_enable()
+ * @ti_work:	Cached TIF flags gathered with interrupts disabled
+ *
+ * Defaults to local_irq_enable(). Can be supplied by architecture specific
+ * code.
+ */
+static inline void local_irq_enable_exit_to_user(unsigned long ti_work);
+
+#ifndef local_irq_enable_exit_to_user
+static inline void local_irq_enable_exit_to_user(unsigned long ti_work)
+{
+	local_irq_enable();
+}
+#endif
+
+/**
+ * local_irq_disable_exit_to_user - Exit to user variant of local_irq_disable()
+ *
+ * Defaults to local_irq_disable(). Can be supplied by architecture specific
+ * code.
+ */
+static inline void local_irq_disable_exit_to_user(void);
+
+#ifndef local_irq_disable_exit_to_user
+static inline void local_irq_disable_exit_to_user(void)
+{
+	local_irq_disable();
+}
+#endif
+
+/**
+ * arch_exit_to_user_mode_work - Architecture specific TIF work for exit
+ *				 to user mode.
+ * @regs:	Pointer to currents pt_regs
+ * @ti_work:	Cached TIF flags gathered with interrupts disabled
+ *
+ * Invoked from exit_to_user_mode_loop() with interrupt enabled
+ *
+ * Defaults to NOOP. Can be supplied by architecture specific code.
+ */
+static inline void arch_exit_to_user_mode_work(struct pt_regs *regs,
+					       unsigned long ti_work);
+
+#ifndef arch_exit_to_user_mode_work
+static inline void arch_exit_to_user_mode_work(struct pt_regs *regs,
+					       unsigned long ti_work)
+{
+}
+#endif
+
+/**
+ * arch_exit_to_user_mode_prepare - Architecture specific preparation for
+ *				    exit to user mode.
+ * @regs:	Pointer to currents pt_regs
+ * @ti_work:	Cached TIF flags gathered with interrupts disabled
+ *
+ * Invoked from exit_to_user_mode_prepare() with interrupt disabled as the last
+ * function before return. Defaults to NOOP.
+ */
+static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
+						  unsigned long ti_work);
+
+#ifndef arch_exit_to_user_mode_prepare
+static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
+						  unsigned long ti_work)
+{
+}
+#endif
+
+/**
+ * arch_exit_to_user_mode - Architecture specific final work before
+ *			    exit to user mode.
+ *
+ * Invoked from exit_to_user_mode() with interrupt disabled as the last
+ * function before return. Defaults to NOOP.
+ *
+ * This needs to be __always_inline because it is non-instrumentable code
+ * invoked after context tracking switched to user mode.
+ *
+ * An architecture implementation must not do anything complex, no locking
+ * etc. The main purpose is for speculation mitigations.
+ */
+static __always_inline void arch_exit_to_user_mode(void);
+
+#ifndef arch_exit_to_user_mode
+static __always_inline void arch_exit_to_user_mode(void) { }
+#endif
+
+/**
+ * arch_do_signal -  Architecture specific signal delivery function
+ * @regs:	Pointer to currents pt_regs
+ *
+ * Invoked from exit_to_user_mode_loop().
+ */
+void arch_do_signal(struct pt_regs *regs);
+
+/**
+ * arch_syscall_exit_tracehook - Wrapper around tracehook_report_syscall_exit()
+ * @regs:	Pointer to currents pt_regs
+ * @step:	Indicator for single step
+ *
+ * Defaults to tracehook_report_syscall_exit(). Can be replaced by
+ * architecture specific code.
+ *
+ * Invoked from syscall_exit_to_user_mode()
+ */
+static inline void arch_syscall_exit_tracehook(struct pt_regs *regs, bool step);
+
+#ifndef arch_syscall_exit_tracehook
+static inline void arch_syscall_exit_tracehook(struct pt_regs *regs, bool step)
+{
+	tracehook_report_syscall_exit(regs, step);
+}
+#endif
+
+/**
+ * syscall_exit_to_user_mode - Handle work before returning to user mode
+ * @regs:	Pointer to currents pt_regs
+ *
+ * Invoked with interrupts enabled and fully valid regs. Returns with all
+ * work handled, interrupts disabled such that the caller can immediately
+ * switch to user mode. Called from architecture specific syscall and ret
+ * from fork code.
+ *
+ * The call order is:
+ *  1) One-time syscall exit work:
+ *	- rseq syscall exit
+ *      - audit
+ *	- syscall tracing
+ *	- tracehook (single stepping)
+ *
+ *  2) Preparatory work
+ *	- Exit to user mode loop (common TIF handling). Invokes
+ *	  arch_exit_to_user_mode_work() for architecture specific TIF work
+ *	- Architecture specific one time work arch_exit_to_user_mode_prepare()
+ *	- Address limit and lockdep checks
+ *
+ *  3) Final transition (lockdep, tracing, context tracking, RCU). Invokes
+ *     arch_exit_to_user_mode() to handle e.g. speculation mitigations
+ */
+void syscall_exit_to_user_mode(struct pt_regs *regs);
+
 /**
  * irqentry_enter_from_user_mode - Establish state before invoking the irq handler
  * @regs:	Pointer to currents pt_regs
@@ -118,4 +292,19 @@ long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall);
  */
 void irqentry_enter_from_user_mode(struct pt_regs *regs);
 
+/**
+ * irqentry_exit_to_user_mode - Interrupt exit work
+ * @regs:	Pointer to current's pt_regs
+ *
+ * Invoked with interrupts disbled and fully valid regs. Returns with all
+ * work handled, interrupts disabled such that the caller can immediately
+ * switch to user mode. Called from architecture specific interrupt
+ * handling code.
+ *
+ * The call order is #2 and #3 as described in syscall_exit_to_user_mode().
+ * Interrupt exit is not invoking #1 which is the syscall specific one time
+ * work.
+ */
+void irqentry_exit_to_user_mode(struct pt_regs *regs);
+
 #endif
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 1d636dee2fec..0a051bb91a56 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -2,6 +2,8 @@
 
 #include <linux/context_tracking.h>
 #include <linux/entry-common.h>
+#include <linux/livepatch.h>
+#include <linux/audit.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/syscalls.h>
@@ -82,7 +84,174 @@ noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
 	return syscall;
 }
 
+/**
+ * exit_to_user_mode - Fixup state when exiting to user mode
+ *
+ * Syscall/interupt exit enables interrupts, but the kernel state is
+ * interrupts disabled when this is invoked. Also tell RCU about it.
+ *
+ * 1) Trace interrupts on state
+ * 2) Invoke context tracking if enabled to adjust RCU state
+ * 3) Invoke architecture specific last minute exit code, e.g. speculation
+ *    mitigations, etc.
+ * 4) Tell lockdep that interrupts are enabled
+ */
+static __always_inline void exit_to_user_mode(void)
+{
+	instrumentation_begin();
+	trace_hardirqs_on_prepare();
+	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+	instrumentation_end();
+
+	user_enter_irqoff();
+	arch_exit_to_user_mode();
+	lockdep_hardirqs_on(CALLER_ADDR0);
+}
+
+/* Workaround to allow gradual conversion of architecture code */
+void __weak arch_do_signal(struct pt_regs *regs) { }
+
+static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
+					    unsigned long ti_work)
+{
+	/*
+	 * Before returning to user space ensure that all pending work
+	 * items have been completed.
+	 */
+	while (ti_work & EXIT_TO_USER_MODE_WORK) {
+
+		local_irq_enable_exit_to_user(ti_work);
+
+		if (ti_work & _TIF_NEED_RESCHED)
+			schedule();
+
+		if (ti_work & _TIF_UPROBE)
+			uprobe_notify_resume(regs);
+
+		if (ti_work & _TIF_PATCH_PENDING)
+			klp_update_patch_state(current);
+
+		if (ti_work & _TIF_SIGPENDING)
+			arch_do_signal(regs);
+
+		if (ti_work & _TIF_NOTIFY_RESUME) {
+			clear_thread_flag(TIF_NOTIFY_RESUME);
+			tracehook_notify_resume(regs);
+			rseq_handle_notify_resume(NULL, regs);
+		}
+
+		/* Architecture specific TIF work */
+		arch_exit_to_user_mode_work(regs, ti_work);
+
+		/*
+		 * Disable interrupts and reevaluate the work flags as they
+		 * might have changed while interrupts and preemption was
+		 * enabled above.
+		 */
+		local_irq_disable_exit_to_user();
+		ti_work = READ_ONCE(current_thread_info()->flags);
+	}
+
+	/* Return the latest work state for arch_exit_to_user_mode() */
+	return ti_work;
+}
+
+static void exit_to_user_mode_prepare(struct pt_regs *regs)
+{
+	unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
+
+	lockdep_assert_irqs_disabled();
+
+	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
+		ti_work = exit_to_user_mode_loop(regs, ti_work);
+
+	arch_exit_to_user_mode_prepare(regs, ti_work);
+
+	/* Ensure that the address limit is intact and no locks are held */
+	addr_limit_user_check();
+	lockdep_assert_irqs_disabled();
+	lockdep_sys_exit();
+}
+
+#ifndef _TIF_SINGLESTEP
+static inline bool report_single_step(unsigned long ti_work)
+{
+	return false;
+}
+#else
+/*
+ * If TIF_SYSCALL_EMU is set, then the only reason to report is when
+ * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP).  This syscall
+ * instruction has been already reported in syscall_enter_from_usermode().
+ */
+#define SYSEMU_STEP	(_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)
+
+static inline bool report_single_step(unsigned long ti_work)
+{
+	return (ti_work & SYSEMU_STEP) == _TIF_SINGLESTEP;
+}
+#endif
+
+static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work)
+{
+	bool step;
+
+	audit_syscall_exit(regs);
+
+	if (ti_work & _TIF_SYSCALL_TRACEPOINT)
+		trace_sys_exit(regs, syscall_get_return_value(current, regs));
+
+	step = report_single_step(ti_work);
+	if (step || ti_work & _TIF_SYSCALL_TRACE)
+		arch_syscall_exit_tracehook(regs, step);
+}
+
+/*
+ * Syscall specific exit to user mode preparation. Runs with interrupts
+ * enabled.
+ */
+static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
+{
+	u32 cached_flags = READ_ONCE(current_thread_info()->flags);
+	unsigned long nr = syscall_get_nr(current, regs);
+
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+
+	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
+		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
+			local_irq_enable();
+	}
+
+	rseq_syscall(regs);
+
+	/*
+	 * Do one-time syscall specific work. If these work items are
+	 * enabled, we want to run them exactly once per syscall exit with
+	 * interrupts enabled.
+	 */
+	if (unlikely(cached_flags & SYSCALL_EXIT_WORK))
+		syscall_exit_work(regs, cached_flags);
+}
+
+__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
+{
+	instrumentation_begin();
+	syscall_exit_to_user_mode_prepare(regs);
+	local_irq_disable_exit_to_user();
+	exit_to_user_mode_prepare(regs);
+	instrumentation_end();
+	exit_to_user_mode();
+}
+
 noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
 {
 	enter_from_user_mode(regs);
 }
+
+noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
+{
+	instrumentation_begin();
+	exit_to_user_mode_prepare(regs);
+	instrumentation_end();
+	exit_to_user_mode();
+}
-- 
cgit v1.2.3


From a5497bab5f72dce38a259a53fd3ac1239a7ecf40 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 22 Jul 2020 23:59:58 +0200
Subject: entry: Provide generic interrupt entry/exit code

Like the syscall entry/exit code interrupt/exception entry after the real
low level ASM bits should not be different accross architectures.

Provide a generic version based on the x86 code.

irqentry_enter() is called after the low level entry code and
irqentry_exit() must be invoked right before returning to the low level
code which just contains the actual return logic. The code before
irqentry_enter() and irqentry_exit() must not be instrumented. Code after
irqentry_enter() and before irqentry_exit() can be instrumented.

irqentry_enter() invokes irqentry_enter_from_user_mode() if the
interrupt/exception came from user mode. If if entered from kernel mode it
handles the kernel mode variant of establishing state for lockdep, RCU and
tracing depending on the kernel context it interrupted (idle, non-idle).

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20200722220519.723703209@linutronix.de
---
 include/linux/entry-common.h |  62 +++++++++++++++++++++++
 kernel/entry/common.c        | 117 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 179 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index c4a57be9cde4..efebbffcd5cc 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -307,4 +307,66 @@ void irqentry_enter_from_user_mode(struct pt_regs *regs);
  */
 void irqentry_exit_to_user_mode(struct pt_regs *regs);
 
+#ifndef irqentry_state
+typedef struct irqentry_state {
+	bool	exit_rcu;
+} irqentry_state_t;
+#endif
+
+/**
+ * irqentry_enter - Handle state tracking on ordinary interrupt entries
+ * @regs:	Pointer to pt_regs of interrupted context
+ *
+ * Invokes:
+ *  - lockdep irqflag state tracking as low level ASM entry disabled
+ *    interrupts.
+ *
+ *  - Context tracking if the exception hit user mode.
+ *
+ *  - The hardirq tracer to keep the state consistent as low level ASM
+ *    entry disabled interrupts.
+ *
+ * As a precondition, this requires that the entry came from user mode,
+ * idle, or a kernel context in which RCU is watching.
+ *
+ * For kernel mode entries RCU handling is done conditional. If RCU is
+ * watching then the only RCU requirement is to check whether the tick has
+ * to be restarted. If RCU is not watching then rcu_irq_enter() has to be
+ * invoked on entry and rcu_irq_exit() on exit.
+ *
+ * Avoiding the rcu_irq_enter/exit() calls is an optimization but also
+ * solves the problem of kernel mode pagefaults which can schedule, which
+ * is not possible after invoking rcu_irq_enter() without undoing it.
+ *
+ * For user mode entries irqentry_enter_from_user_mode() is invoked to
+ * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit
+ * would not be possible.
+ *
+ * Returns: An opaque object that must be passed to idtentry_exit()
+ */
+irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs);
+
+/**
+ * irqentry_exit_cond_resched - Conditionally reschedule on return from interrupt
+ *
+ * Conditional reschedule with additional sanity checks.
+ */
+void irqentry_exit_cond_resched(void);
+
+/**
+ * irqentry_exit - Handle return from exception that used irqentry_enter()
+ * @regs:	Pointer to pt_regs (exception entry regs)
+ * @state:	Return value from matching call to irqentry_enter()
+ *
+ * Depending on the return target (kernel/user) this runs the necessary
+ * preemption and work checks if possible and reguired and returns to
+ * the caller with interrupts disabled and no further work pending.
+ *
+ * This is the last action before returning to the low level ASM code which
+ * just needs to return to the appropriate context.
+ *
+ * Counterpart to irqentry_enter().
+ */
+void noinstr irqentry_exit(struct pt_regs *regs, irqentry_state_t state);
+
 #endif
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 0a051bb91a56..495f5c051b03 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -255,3 +255,120 @@ noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
 	instrumentation_end();
 	exit_to_user_mode();
 }
+
+irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs)
+{
+	irqentry_state_t ret = {
+		.exit_rcu = false,
+	};
+
+	if (user_mode(regs)) {
+		irqentry_enter_from_user_mode(regs);
+		return ret;
+	}
+
+	/*
+	 * If this entry hit the idle task invoke rcu_irq_enter() whether
+	 * RCU is watching or not.
+	 *
+	 * Interupts can nest when the first interrupt invokes softirq
+	 * processing on return which enables interrupts.
+	 *
+	 * Scheduler ticks in the idle task can mark quiescent state and
+	 * terminate a grace period, if and only if the timer interrupt is
+	 * not nested into another interrupt.
+	 *
+	 * Checking for __rcu_is_watching() here would prevent the nesting
+	 * interrupt to invoke rcu_irq_enter(). If that nested interrupt is
+	 * the tick then rcu_flavor_sched_clock_irq() would wrongfully
+	 * assume that it is the first interupt and eventually claim
+	 * quiescient state and end grace periods prematurely.
+	 *
+	 * Unconditionally invoke rcu_irq_enter() so RCU state stays
+	 * consistent.
+	 *
+	 * TINY_RCU does not support EQS, so let the compiler eliminate
+	 * this part when enabled.
+	 */
+	if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
+		/*
+		 * If RCU is not watching then the same careful
+		 * sequence vs. lockdep and tracing is required
+		 * as in irq_enter_from_user_mode().
+		 */
+		lockdep_hardirqs_off(CALLER_ADDR0);
+		rcu_irq_enter();
+		instrumentation_begin();
+		trace_hardirqs_off_finish();
+		instrumentation_end();
+
+		ret.exit_rcu = true;
+		return ret;
+	}
+
+	/*
+	 * If RCU is watching then RCU only wants to check whether it needs
+	 * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
+	 * already contains a warning when RCU is not watching, so no point
+	 * in having another one here.
+	 */
+	instrumentation_begin();
+	rcu_irq_enter_check_tick();
+	/* Use the combo lockdep/tracing function */
+	trace_hardirqs_off();
+	instrumentation_end();
+
+	return ret;
+}
+
+void irqentry_exit_cond_resched(void)
+{
+	if (!preempt_count()) {
+		/* Sanity check RCU and thread stack */
+		rcu_irq_exit_check_preempt();
+		if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
+			WARN_ON_ONCE(!on_thread_stack());
+		if (need_resched())
+			preempt_schedule_irq();
+	}
+}
+
+void noinstr irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
+{
+	lockdep_assert_irqs_disabled();
+
+	/* Check whether this returns to user mode */
+	if (user_mode(regs)) {
+		irqentry_exit_to_user_mode(regs);
+	} else if (!regs_irqs_disabled(regs)) {
+		/*
+		 * If RCU was not watching on entry this needs to be done
+		 * carefully and needs the same ordering of lockdep/tracing
+		 * and RCU as the return to user mode path.
+		 */
+		if (state.exit_rcu) {
+			instrumentation_begin();
+			/* Tell the tracer that IRET will enable interrupts */
+			trace_hardirqs_on_prepare();
+			lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+			instrumentation_end();
+			rcu_irq_exit();
+			lockdep_hardirqs_on(CALLER_ADDR0);
+			return;
+		}
+
+		instrumentation_begin();
+		if (IS_ENABLED(CONFIG_PREEMPTION))
+			irqentry_exit_cond_resched();
+		/* Covers both tracing and lockdep */
+		trace_hardirqs_on();
+		instrumentation_end();
+	} else {
+		/*
+		 * IRQ flags state is correct already. Just tell RCU if it
+		 * was not watching on entry.
+		 */
+		if (state.exit_rcu)
+			rcu_irq_exit();
+	}
+}
-- 
cgit v1.2.3


From 935ace2fb5cc49ae88bd1f1735ddc51cdc2ebfb3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 22 Jul 2020 23:59:59 +0200
Subject: entry: Provide infrastructure for work before transitioning to guest
 mode

Entering a guest is similar to exiting to user space. Pending work like
handling signals, rescheduling, task work etc. needs to be handled before
that.

Provide generic infrastructure to avoid duplication of the same handling
code all over the place.

The transfer to guest mode handling is different from the exit to usermode
handling, e.g. vs. rseq and live patching, so a separate function is used.

The initial list of work items handled is:

    TIF_SIGPENDING, TIF_NEED_RESCHED, TIF_NOTIFY_RESUME

Architecture specific TIF flags can be added via defines in the
architecture specific include files.

The calling convention is also different from the syscall/interrupt entry
functions as KVM invokes this from the outer vcpu_run() loop with
interrupts and preemption enabled. To prevent missing a pending work item
it invokes a check for pending TIF work from interrupt disabled code right
before transitioning to guest mode. The lockdep, RCU and tracing state
handling is also done directly around the switch to and from guest mode.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20200722220519.833296398@linutronix.de
---
 include/linux/entry-kvm.h | 80 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/kvm_host.h  |  8 +++++
 kernel/entry/Makefile     |  3 +-
 kernel/entry/kvm.c        | 51 ++++++++++++++++++++++++++++++
 virt/kvm/Kconfig          |  3 ++
 5 files changed, 144 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/entry-kvm.h
 create mode 100644 kernel/entry/kvm.c

(limited to 'kernel')

diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h
new file mode 100644
index 000000000000..0cef17afb41a
--- /dev/null
+++ b/include/linux/entry-kvm.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_ENTRYKVM_H
+#define __LINUX_ENTRYKVM_H
+
+#include <linux/entry-common.h>
+
+/* Transfer to guest mode work */
+#ifdef CONFIG_KVM_XFER_TO_GUEST_WORK
+
+#ifndef ARCH_XFER_TO_GUEST_MODE_WORK
+# define ARCH_XFER_TO_GUEST_MODE_WORK	(0)
+#endif
+
+#define XFER_TO_GUEST_MODE_WORK					\
+	(_TIF_NEED_RESCHED | _TIF_SIGPENDING |			\
+	 _TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK)
+
+struct kvm_vcpu;
+
+/**
+ * arch_xfer_to_guest_mode_handle_work - Architecture specific xfer to guest
+ *					 mode work handling function.
+ * @vcpu:	Pointer to current's VCPU data
+ * @ti_work:	Cached TIF flags gathered in xfer_to_guest_mode_handle_work()
+ *
+ * Invoked from xfer_to_guest_mode_handle_work(). Defaults to NOOP. Can be
+ * replaced by architecture specific code.
+ */
+static inline int arch_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu,
+						      unsigned long ti_work);
+
+#ifndef arch_xfer_to_guest_mode_work
+static inline int arch_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu,
+						      unsigned long ti_work)
+{
+	return 0;
+}
+#endif
+
+/**
+ * xfer_to_guest_mode_handle_work - Check and handle pending work which needs
+ *				    to be handled before going to guest mode
+ * @vcpu:	Pointer to current's VCPU data
+ *
+ * Returns: 0 or an error code
+ */
+int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu);
+
+/**
+ * __xfer_to_guest_mode_work_pending - Check if work is pending
+ *
+ * Returns: True if work pending, False otherwise.
+ *
+ * Bare variant of xfer_to_guest_mode_work_pending(). Can be called from
+ * interrupt enabled code for racy quick checks with care.
+ */
+static inline bool __xfer_to_guest_mode_work_pending(void)
+{
+	unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
+
+	return !!(ti_work & XFER_TO_GUEST_MODE_WORK);
+}
+
+/**
+ * xfer_to_guest_mode_work_pending - Check if work is pending which needs to be
+ *				     handled before returning to guest mode
+ *
+ * Returns: True if work pending, False otherwise.
+ *
+ * Has to be invoked with interrupts disabled before the transition to
+ * guest mode.
+ */
+static inline bool xfer_to_guest_mode_work_pending(void)
+{
+	lockdep_assert_irqs_disabled();
+	return __xfer_to_guest_mode_work_pending();
+}
+#endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */
+
+#endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d564855243d8..ac83e9c1d82c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1439,4 +1439,12 @@ int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
 				uintptr_t data, const char *name,
 				struct task_struct **thread_ptr);
 
+#ifdef CONFIG_KVM_XFER_TO_GUEST_WORK
+static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
+{
+	vcpu->run->exit_reason = KVM_EXIT_INTR;
+	vcpu->stat.signal_exits++;
+}
+#endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */
+
 #endif
diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile
index c207d202bf3a..34c8a3f1c735 100644
--- a/kernel/entry/Makefile
+++ b/kernel/entry/Makefile
@@ -9,4 +9,5 @@ KCOV_INSTRUMENT := n
 CFLAGS_REMOVE_common.o	 = -fstack-protector -fstack-protector-strong
 CFLAGS_common.o		+= -fno-stack-protector
 
-obj-$(CONFIG_GENERIC_ENTRY) += common.o
+obj-$(CONFIG_GENERIC_ENTRY) 		+= common.o
+obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK)	+= kvm.o
diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
new file mode 100644
index 000000000000..eb1a8a4c867c
--- /dev/null
+++ b/kernel/entry/kvm.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/entry-kvm.h>
+#include <linux/kvm_host.h>
+
+static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
+{
+	do {
+		int ret;
+
+		if (ti_work & _TIF_SIGPENDING) {
+			kvm_handle_signal_exit(vcpu);
+			return -EINTR;
+		}
+
+		if (ti_work & _TIF_NEED_RESCHED)
+			schedule();
+
+		if (ti_work & _TIF_NOTIFY_RESUME) {
+			clear_thread_flag(TIF_NOTIFY_RESUME);
+			tracehook_notify_resume(NULL);
+		}
+
+		ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work);
+		if (ret)
+			return ret;
+
+		ti_work = READ_ONCE(current_thread_info()->flags);
+	} while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched());
+	return 0;
+}
+
+int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu)
+{
+	unsigned long ti_work;
+
+	/*
+	 * This is invoked from the outer guest loop with interrupts and
+	 * preemption enabled.
+	 *
+	 * KVM invokes xfer_to_guest_mode_work_pending() with interrupts
+	 * disabled in the inner loop before going into guest mode. No need
+	 * to disable interrupts here.
+	 */
+	ti_work = READ_ONCE(current_thread_info()->flags);
+	if (!(ti_work & XFER_TO_GUEST_MODE_WORK))
+		return 0;
+
+	return xfer_to_guest_mode_work(vcpu, ti_work);
+}
+EXPORT_SYMBOL_GPL(xfer_to_guest_mode_handle_work);
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index aad9284c043a..1c37ccd5d402 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -60,3 +60,6 @@ config HAVE_KVM_VCPU_RUN_PID_CHANGE
 
 config HAVE_KVM_NO_POLL
        bool
+
+config KVM_XFER_TO_GUEST_WORK
+       bool
-- 
cgit v1.2.3


From e5ebffe18e5add85acb23c7a0f74509749967204 Mon Sep 17 00:00:00 2001
From: Jim Cromie <jim.cromie@gmail.com>
Date: Sun, 19 Jul 2020 17:10:45 -0600
Subject: dyndbg: rename __verbose section to __dyndbg

dyndbg populates its callsite info into __verbose section, change that
to a more specific and descriptive name, __dyndbg.

Also, per checkpatch:
  simplify __attribute(..) to __section(__dyndbg) declaration.

and 1 spelling fix, decriptor

Acked-by: <jbaron@akamai.com>
Signed-off-by: Jim Cromie <jim.cromie@gmail.com>
Link: https://lore.kernel.org/r/20200719231058.1586423-6-jim.cromie@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/asm-generic/vmlinux.lds.h |  6 +++---
 include/linux/dynamic_debug.h     |  4 ++--
 kernel/module.c                   |  2 +-
 lib/dynamic_debug.c               | 12 ++++++------
 4 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index db600ef218d7..05af5cef1ad6 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -320,9 +320,9 @@
 	*(__tracepoints)						\
 	/* implement dynamic printk debug */				\
 	. = ALIGN(8);							\
-	__start___verbose = .;						\
-	KEEP(*(__verbose))                                              \
-	__stop___verbose = .;						\
+	__start___dyndbg = .;						\
+	KEEP(*(__dyndbg))						\
+	__stop___dyndbg = .;						\
 	LIKELY_PROFILE()		       				\
 	BRANCH_PROFILE()						\
 	TRACE_PRINTKS()							\
diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h
index abcd5fde30eb..aa9ff9e1c0b3 100644
--- a/include/linux/dynamic_debug.h
+++ b/include/linux/dynamic_debug.h
@@ -80,7 +80,7 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor,
 
 #define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt)		\
 	static struct _ddebug  __aligned(8)			\
-	__attribute__((section("__verbose"))) name = {		\
+	__section(__dyndbg) name = {				\
 		.modname = KBUILD_MODNAME,			\
 		.function = __func__,				\
 		.filename = __FILE__,				\
@@ -133,7 +133,7 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor,
 
 /*
  * "Factory macro" for generating a call to func, guarded by a
- * DYNAMIC_DEBUG_BRANCH. The dynamic debug decriptor will be
+ * DYNAMIC_DEBUG_BRANCH. The dynamic debug descriptor will be
  * initialized using the fmt argument. The function will be called with
  * the address of the descriptor as first argument, followed by all
  * the varargs. Note that fmt is repeated in invocations of this
diff --git a/kernel/module.c b/kernel/module.c
index aa183c9ac0a2..e7b4ff7e4fd0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3237,7 +3237,7 @@ static int find_module_sections(struct module *mod, struct load_info *info)
 	if (section_addr(info, "__obsparm"))
 		pr_warn("%s: Ignoring obsolete parameters\n", mod->name);
 
-	info->debug = section_objs(info, "__verbose",
+	info->debug = section_objs(info, "__dyndbg",
 				   sizeof(*info->debug), &info->num_debug);
 
 	return 0;
diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c
index c97872cffc8e..66c0bdf06ce7 100644
--- a/lib/dynamic_debug.c
+++ b/lib/dynamic_debug.c
@@ -39,8 +39,8 @@
 
 #include <rdma/ib_verbs.h>
 
-extern struct _ddebug __start___verbose[];
-extern struct _ddebug __stop___verbose[];
+extern struct _ddebug __start___dyndbg[];
+extern struct _ddebug __stop___dyndbg[];
 
 struct ddebug_table {
 	struct list_head link;
@@ -1019,7 +1019,7 @@ static int __init dynamic_debug_init(void)
 	int n = 0, entries = 0, modct = 0;
 	int verbose_bytes = 0;
 
-	if (&__start___verbose == &__stop___verbose) {
+	if (&__start___dyndbg == &__stop___dyndbg) {
 		if (IS_ENABLED(CONFIG_DYNAMIC_DEBUG)) {
 			pr_warn("_ddebug table is empty in a CONFIG_DYNAMIC_DEBUG build\n");
 			return 1;
@@ -1028,10 +1028,10 @@ static int __init dynamic_debug_init(void)
 		ddebug_init_success = 1;
 		return 0;
 	}
-	iter = __start___verbose;
+	iter = __start___dyndbg;
 	modname = iter->modname;
 	iter_start = iter;
-	for (; iter < __stop___verbose; iter++) {
+	for (; iter < __stop___dyndbg; iter++) {
 		entries++;
 		verbose_bytes += strlen(iter->modname) + strlen(iter->function)
 			+ strlen(iter->filename) + strlen(iter->format);
@@ -1054,7 +1054,7 @@ static int __init dynamic_debug_init(void)
 	ddebug_init_success = 1;
 	vpr_info("%d modules, %d entries and %d bytes in ddebug tables, %d bytes in (readonly) verbose section\n",
 		 modct, entries, (int)(modct * sizeof(struct ddebug_table)),
-		 verbose_bytes + (int)(__stop___verbose - __start___verbose));
+		 verbose_bytes + (int)(__stop___dyndbg - __start___dyndbg));
 
 	/* apply ddebug_query boot param, dont unload tables on err */
 	if (ddebug_setup_string[0] != '\0') {
-- 
cgit v1.2.3


From 13efa616124f7eec7d6a58adeeef31864aa03879 Mon Sep 17 00:00:00 2001
From: Qinglang Miao <miaoqinglang@huawei.com>
Date: Sat, 25 Jul 2020 16:56:29 +0800
Subject: sched/uclamp: Remove unnecessary mutex_init()

The uclamp_mutex lock is initialized statically via DEFINE_MUTEX(),
it is unnecessary to initialize it runtime via mutex_init().

Signed-off-by: Qinglang Miao <miaoqinglang@huawei.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Patrick Bellasi <patrick.bellasi@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://lore.kernel.org/r/20200725085629.98292-1-miaoqinglang@huawei.com
---
 kernel/sched/core.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bd8e5211d31f..678253450ebb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1425,8 +1425,6 @@ static void __init init_uclamp(void)
 	enum uclamp_id clamp_id;
 	int cpu;
 
-	mutex_init(&uclamp_mutex);
-
 	for_each_possible_cpu(cpu)
 		init_uclamp_rq(cpu_rq(cpu));
 
-- 
cgit v1.2.3


From a7ef9b28aa8d72a1656fa6f0a01bbd1493886317 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Sat, 25 Jul 2020 19:51:10 +0100
Subject: locking/lockdep: Fix overflow in presentation of average lock-time

Though the number of lock-acquisitions is tracked as unsigned long, this
is passed as the divisor to div_s64() which interprets it as a s32,
giving nonsense values with more than 2 billion acquisitons. E.g.

  acquisitions   holdtime-min   holdtime-max holdtime-total   holdtime-avg
  -------------------------------------------------------------------------
    2350439395           0.07         353.38   649647067.36          0.-32

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20200725185110.11588-1-chris@chris-wilson.co.uk
---
 kernel/locking/lockdep_proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 5525cd3ba0c8..02ef87f50df2 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -423,7 +423,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
 	seq_time(m, lt->min);
 	seq_time(m, lt->max);
 	seq_time(m, lt->total);
-	seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0);
+	seq_time(m, lt->nr ? div64_u64(lt->total, lt->nr) : 0);
 }
 
 static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
-- 
cgit v1.2.3


From 3f9969f2c040ba2ba635b6b5a7051f404bcc634d Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Wed, 22 Jul 2020 12:51:56 -0700
Subject: bpf: Fix pos computation for bpf_iter seq_ops->start()

Currently, the pos pointer in bpf iterator map/task/task_file
seq_ops->start() is always incremented.
This is incorrect. It should be increased only if
*pos is 0 (for SEQ_START_TOKEN) since these start()
function actually returns the first real object.
If *pos is not 0, it merely found the object
based on the state in seq->private, and not really
advancing the *pos. This patch fixed this issue
by only incrementing *pos if it is 0.

Note that the old *pos calculation, although not
correct, does not affect correctness of bpf_iter
as bpf_iter seq_file->read() does not support llseek.

This patch also renamed "mid" in bpf_map iterator
seq_file private data to "map_id" for better clarity.

Fixes: 6086d29def80 ("bpf: Add bpf_map iterator")
Fixes: eaaacd23910f ("bpf: Add task and task/file iterator targets")
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200722195156.4029817-1-yhs@fb.com
---
 kernel/bpf/map_iter.c  | 16 ++++++----------
 kernel/bpf/task_iter.c |  6 ++++--
 2 files changed, 10 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index 8a7af11b411f..5926c76d854e 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -7,7 +7,7 @@
 #include <linux/btf_ids.h>
 
 struct bpf_iter_seq_map_info {
-	u32 mid;
+	u32 map_id;
 };
 
 static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos)
@@ -15,27 +15,23 @@ static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos)
 	struct bpf_iter_seq_map_info *info = seq->private;
 	struct bpf_map *map;
 
-	map = bpf_map_get_curr_or_next(&info->mid);
+	map = bpf_map_get_curr_or_next(&info->map_id);
 	if (!map)
 		return NULL;
 
-	++*pos;
+	if (*pos == 0)
+		++*pos;
 	return map;
 }
 
 static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct bpf_iter_seq_map_info *info = seq->private;
-	struct bpf_map *map;
 
 	++*pos;
-	++info->mid;
+	++info->map_id;
 	bpf_map_put((struct bpf_map *)v);
-	map = bpf_map_get_curr_or_next(&info->mid);
-	if (!map)
-		return NULL;
-
-	return map;
+	return bpf_map_get_curr_or_next(&info->map_id);
 }
 
 struct bpf_iter__bpf_map {
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 2feecf095609..1039e52ebd8b 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -51,7 +51,8 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos)
 	if (!task)
 		return NULL;
 
-	++*pos;
+	if (*pos == 0)
+		++*pos;
 	return task;
 }
 
@@ -210,7 +211,8 @@ static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
 		return NULL;
 	}
 
-	++*pos;
+	if (*pos == 0)
+		++*pos;
 	info->task = task;
 	info->files = files;
 
-- 
cgit v1.2.3


From a228a64fc1e4428e2b96dc68e9ad3c447095c9e7 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 1 Jul 2020 18:10:18 -0700
Subject: bpf: Add bpf_prog iterator

It's mostly a copy paste of commit 6086d29def80 ("bpf: Add bpf_map iterator")
that is use to implement bpf_seq_file opreations to traverse all bpf programs.

v1->v2: Tweak to use build time btf_id

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h    |   1 +
 kernel/bpf/Makefile    |   2 +-
 kernel/bpf/prog_iter.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c   |  19 +++++++++
 4 files changed, 124 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/prog_iter.c

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bae557ff2da8..72221aea1c60 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1117,6 +1117,7 @@ int  generic_map_delete_batch(struct bpf_map *map,
 			      const union bpf_attr *attr,
 			      union bpf_attr __user *uattr);
 struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
+struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);
 
 extern int sysctl_unprivileged_bpf_disabled;
 
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 1131a921e1a6..e6eb9c0402da 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,7 +2,7 @@
 obj-y := core.o
 CFLAGS_core.o += $(call cc-disable-warning, override-init)
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
diff --git a/kernel/bpf/prog_iter.c b/kernel/bpf/prog_iter.c
new file mode 100644
index 000000000000..6541b577d69f
--- /dev/null
+++ b/kernel/bpf/prog_iter.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+#include <linux/bpf.h>
+#include <linux/fs.h>
+#include <linux/filter.h>
+#include <linux/kernel.h>
+#include <linux/btf_ids.h>
+
+struct bpf_iter_seq_prog_info {
+	u32 prog_id;
+};
+
+static void *bpf_prog_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct bpf_iter_seq_prog_info *info = seq->private;
+	struct bpf_prog *prog;
+
+	prog = bpf_prog_get_curr_or_next(&info->prog_id);
+	if (!prog)
+		return NULL;
+
+	if (*pos == 0)
+		++*pos;
+	return prog;
+}
+
+static void *bpf_prog_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_iter_seq_prog_info *info = seq->private;
+
+	++*pos;
+	++info->prog_id;
+	bpf_prog_put((struct bpf_prog *)v);
+	return bpf_prog_get_curr_or_next(&info->prog_id);
+}
+
+struct bpf_iter__bpf_prog {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct bpf_prog *, prog);
+};
+
+DEFINE_BPF_ITER_FUNC(bpf_prog, struct bpf_iter_meta *meta, struct bpf_prog *prog)
+
+static int __bpf_prog_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+	struct bpf_iter__bpf_prog ctx;
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+	int ret = 0;
+
+	ctx.meta = &meta;
+	ctx.prog = v;
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, in_stop);
+	if (prog)
+		ret = bpf_iter_run_prog(prog, &ctx);
+
+	return ret;
+}
+
+static int bpf_prog_seq_show(struct seq_file *seq, void *v)
+{
+	return __bpf_prog_seq_show(seq, v, false);
+}
+
+static void bpf_prog_seq_stop(struct seq_file *seq, void *v)
+{
+	if (!v)
+		(void)__bpf_prog_seq_show(seq, v, true);
+	else
+		bpf_prog_put((struct bpf_prog *)v);
+}
+
+static const struct seq_operations bpf_prog_seq_ops = {
+	.start	= bpf_prog_seq_start,
+	.next	= bpf_prog_seq_next,
+	.stop	= bpf_prog_seq_stop,
+	.show	= bpf_prog_seq_show,
+};
+
+BTF_ID_LIST(btf_bpf_prog_id)
+BTF_ID(struct, bpf_prog)
+
+static struct bpf_iter_reg bpf_prog_reg_info = {
+	.target			= "bpf_prog",
+	.seq_ops		= &bpf_prog_seq_ops,
+	.init_seq_private	= NULL,
+	.fini_seq_private	= NULL,
+	.seq_priv_size		= sizeof(struct bpf_iter_seq_prog_info),
+	.ctx_arg_info_size	= 1,
+	.ctx_arg_info		= {
+		{ offsetof(struct bpf_iter__bpf_prog, prog),
+		  PTR_TO_BTF_ID_OR_NULL },
+	},
+};
+
+static int __init bpf_prog_iter_init(void)
+{
+	bpf_prog_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_prog_id;
+	return bpf_iter_reg_target(&bpf_prog_reg_info);
+}
+
+late_initcall(bpf_prog_iter_init);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d07417d17712..ee290b1f2d9e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3044,6 +3044,25 @@ again:
 	return map;
 }
 
+struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id)
+{
+	struct bpf_prog *prog;
+
+	spin_lock_bh(&prog_idr_lock);
+again:
+	prog = idr_get_next(&prog_idr, id);
+	if (prog) {
+		prog = bpf_prog_inc_not_zero(prog);
+		if (IS_ERR(prog)) {
+			(*id)++;
+			goto again;
+		}
+	}
+	spin_unlock_bh(&prog_idr_lock);
+
+	return prog;
+}
+
 #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
 
 struct bpf_prog *bpf_prog_by_id(u32 id)
-- 
cgit v1.2.3


From 14fc6bd6b79c430f615500d0fe6cea4722110db8 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 23 Jul 2020 11:41:09 -0700
Subject: bpf: Refactor bpf_iter_reg to have separate seq_info member

There is no functionality change for this patch.
Struct bpf_iter_reg is used to register a bpf_iter target,
which includes information for both prog_load, link_create
and seq_file creation.

This patch puts fields related seq_file creation into
a different structure. This will be useful for map
elements iterator where one iterator covers different
map types and different map types may have different
seq_ops, init/fini private_data function and
private_data size.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723184109.590030-1-yhs@fb.com
---
 include/linux/bpf.h      | 17 ++++++++++-------
 kernel/bpf/bpf_iter.c    | 12 ++++++------
 kernel/bpf/map_iter.c    |  8 ++++++--
 kernel/bpf/prog_iter.c   |  8 ++++++--
 kernel/bpf/task_iter.c   | 16 ++++++++++++----
 net/ipv4/tcp_ipv4.c      |  8 ++++++--
 net/ipv4/udp.c           |  8 ++++++--
 net/ipv6/route.c         |  8 ++++++--
 net/netlink/af_netlink.c |  8 ++++++--
 9 files changed, 64 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 72221aea1c60..127067f71fd4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -37,6 +37,15 @@ struct seq_operations;
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
 
+typedef int (*bpf_iter_init_seq_priv_t)(void *private_data);
+typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data);
+struct bpf_iter_seq_info {
+	const struct seq_operations *seq_ops;
+	bpf_iter_init_seq_priv_t init_seq_private;
+	bpf_iter_fini_seq_priv_t fini_seq_private;
+	u32 seq_priv_size;
+};
+
 /* map is generic key/value storage optionally accesible by eBPF programs */
 struct bpf_map_ops {
 	/* funcs callable from userspace (via syscall) */
@@ -1189,18 +1198,12 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
 	extern int bpf_iter_ ## target(args);			\
 	int __init bpf_iter_ ## target(args) { return 0; }
 
-typedef int (*bpf_iter_init_seq_priv_t)(void *private_data);
-typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data);
-
 #define BPF_ITER_CTX_ARG_MAX 2
 struct bpf_iter_reg {
 	const char *target;
-	const struct seq_operations *seq_ops;
-	bpf_iter_init_seq_priv_t init_seq_private;
-	bpf_iter_fini_seq_priv_t fini_seq_private;
-	u32 seq_priv_size;
 	u32 ctx_arg_info_size;
 	struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX];
+	const struct bpf_iter_seq_info *seq_info;
 };
 
 struct bpf_iter_meta {
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index dd612b80b9fe..5b2387d6aa1f 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -218,8 +218,8 @@ static int iter_release(struct inode *inode, struct file *file)
 	iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
 				 target_private);
 
-	if (iter_priv->tinfo->reg_info->fini_seq_private)
-		iter_priv->tinfo->reg_info->fini_seq_private(seq->private);
+	if (iter_priv->tinfo->reg_info->seq_info->fini_seq_private)
+		iter_priv->tinfo->reg_info->seq_info->fini_seq_private(seq->private);
 
 	bpf_prog_put(iter_priv->prog);
 	seq->private = iter_priv;
@@ -433,16 +433,16 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link)
 
 	tinfo = link->tinfo;
 	total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) +
-			   tinfo->reg_info->seq_priv_size;
-	priv_data = __seq_open_private(file, tinfo->reg_info->seq_ops,
+			   tinfo->reg_info->seq_info->seq_priv_size;
+	priv_data = __seq_open_private(file, tinfo->reg_info->seq_info->seq_ops,
 				       total_priv_dsize);
 	if (!priv_data) {
 		err = -ENOMEM;
 		goto release_prog;
 	}
 
-	if (tinfo->reg_info->init_seq_private) {
-		err = tinfo->reg_info->init_seq_private(priv_data->target_private);
+	if (tinfo->reg_info->seq_info->init_seq_private) {
+		err = tinfo->reg_info->seq_info->init_seq_private(priv_data->target_private);
 		if (err)
 			goto release_seq_file;
 	}
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index 5926c76d854e..1a69241fb1e2 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -81,17 +81,21 @@ static const struct seq_operations bpf_map_seq_ops = {
 BTF_ID_LIST(btf_bpf_map_id)
 BTF_ID(struct, bpf_map)
 
-static struct bpf_iter_reg bpf_map_reg_info = {
-	.target			= "bpf_map",
+static const struct bpf_iter_seq_info bpf_map_seq_info = {
 	.seq_ops		= &bpf_map_seq_ops,
 	.init_seq_private	= NULL,
 	.fini_seq_private	= NULL,
 	.seq_priv_size		= sizeof(struct bpf_iter_seq_map_info),
+};
+
+static struct bpf_iter_reg bpf_map_reg_info = {
+	.target			= "bpf_map",
 	.ctx_arg_info_size	= 1,
 	.ctx_arg_info		= {
 		{ offsetof(struct bpf_iter__bpf_map, map),
 		  PTR_TO_BTF_ID_OR_NULL },
 	},
+	.seq_info		= &bpf_map_seq_info,
 };
 
 static int __init bpf_map_iter_init(void)
diff --git a/kernel/bpf/prog_iter.c b/kernel/bpf/prog_iter.c
index 6541b577d69f..53a73c841c13 100644
--- a/kernel/bpf/prog_iter.c
+++ b/kernel/bpf/prog_iter.c
@@ -81,17 +81,21 @@ static const struct seq_operations bpf_prog_seq_ops = {
 BTF_ID_LIST(btf_bpf_prog_id)
 BTF_ID(struct, bpf_prog)
 
-static struct bpf_iter_reg bpf_prog_reg_info = {
-	.target			= "bpf_prog",
+static const struct bpf_iter_seq_info bpf_prog_seq_info = {
 	.seq_ops		= &bpf_prog_seq_ops,
 	.init_seq_private	= NULL,
 	.fini_seq_private	= NULL,
 	.seq_priv_size		= sizeof(struct bpf_iter_seq_prog_info),
+};
+
+static struct bpf_iter_reg bpf_prog_reg_info = {
+	.target			= "bpf_prog",
 	.ctx_arg_info_size	= 1,
 	.ctx_arg_info		= {
 		{ offsetof(struct bpf_iter__bpf_prog, prog),
 		  PTR_TO_BTF_ID_OR_NULL },
 	},
+	.seq_info		= &bpf_prog_seq_info,
 };
 
 static int __init bpf_prog_iter_init(void)
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 1039e52ebd8b..6d9cd23869bf 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -319,25 +319,32 @@ BTF_ID_LIST(btf_task_file_ids)
 BTF_ID(struct, task_struct)
 BTF_ID(struct, file)
 
-static struct bpf_iter_reg task_reg_info = {
-	.target			= "task",
+static const struct bpf_iter_seq_info task_seq_info = {
 	.seq_ops		= &task_seq_ops,
 	.init_seq_private	= init_seq_pidns,
 	.fini_seq_private	= fini_seq_pidns,
 	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_info),
+};
+
+static struct bpf_iter_reg task_reg_info = {
+	.target			= "task",
 	.ctx_arg_info_size	= 1,
 	.ctx_arg_info		= {
 		{ offsetof(struct bpf_iter__task, task),
 		  PTR_TO_BTF_ID_OR_NULL },
 	},
+	.seq_info		= &task_seq_info,
 };
 
-static struct bpf_iter_reg task_file_reg_info = {
-	.target			= "task_file",
+static const struct bpf_iter_seq_info task_file_seq_info = {
 	.seq_ops		= &task_file_seq_ops,
 	.init_seq_private	= init_seq_pidns,
 	.fini_seq_private	= fini_seq_pidns,
 	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_file_info),
+};
+
+static struct bpf_iter_reg task_file_reg_info = {
+	.target			= "task_file",
 	.ctx_arg_info_size	= 2,
 	.ctx_arg_info		= {
 		{ offsetof(struct bpf_iter__task_file, task),
@@ -345,6 +352,7 @@ static struct bpf_iter_reg task_file_reg_info = {
 		{ offsetof(struct bpf_iter__task_file, file),
 		  PTR_TO_BTF_ID_OR_NULL },
 	},
+	.seq_info		= &task_file_seq_info,
 };
 
 static int __init task_iter_init(void)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f8913923a6c0..cb288fdcf2ca 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2947,17 +2947,21 @@ static void bpf_iter_fini_tcp(void *priv_data)
 	bpf_iter_fini_seq_net(priv_data);
 }
 
-static struct bpf_iter_reg tcp_reg_info = {
-	.target			= "tcp",
+static const struct bpf_iter_seq_info tcp_seq_info = {
 	.seq_ops		= &bpf_iter_tcp_seq_ops,
 	.init_seq_private	= bpf_iter_init_tcp,
 	.fini_seq_private	= bpf_iter_fini_tcp,
 	.seq_priv_size		= sizeof(struct tcp_iter_state),
+};
+
+static struct bpf_iter_reg tcp_reg_info = {
+	.target			= "tcp",
 	.ctx_arg_info_size	= 1,
 	.ctx_arg_info		= {
 		{ offsetof(struct bpf_iter__tcp, sk_common),
 		  PTR_TO_BTF_ID_OR_NULL },
 	},
+	.seq_info		= &tcp_seq_info,
 };
 
 static void __init bpf_iter_register(void)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 0fb5e4ea133f..1bc50ec2caef 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -3208,17 +3208,21 @@ static void bpf_iter_fini_udp(void *priv_data)
 	bpf_iter_fini_seq_net(priv_data);
 }
 
-static struct bpf_iter_reg udp_reg_info = {
-	.target			= "udp",
+static const struct bpf_iter_seq_info udp_seq_info = {
 	.seq_ops		= &bpf_iter_udp_seq_ops,
 	.init_seq_private	= bpf_iter_init_udp,
 	.fini_seq_private	= bpf_iter_fini_udp,
 	.seq_priv_size		= sizeof(struct udp_iter_state),
+};
+
+static struct bpf_iter_reg udp_reg_info = {
+	.target			= "udp",
 	.ctx_arg_info_size	= 1,
 	.ctx_arg_info		= {
 		{ offsetof(struct bpf_iter__udp, udp_sk),
 		  PTR_TO_BTF_ID_OR_NULL },
 	},
+	.seq_info		= &udp_seq_info,
 };
 
 static void __init bpf_iter_register(void)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 33f5efbad0a9..8bfc57b0802a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -6427,17 +6427,21 @@ DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *r
 BTF_ID_LIST(btf_fib6_info_id)
 BTF_ID(struct, fib6_info)
 
-static struct bpf_iter_reg ipv6_route_reg_info = {
-	.target			= "ipv6_route",
+static const struct bpf_iter_seq_info ipv6_route_seq_info = {
 	.seq_ops		= &ipv6_route_seq_ops,
 	.init_seq_private	= bpf_iter_init_seq_net,
 	.fini_seq_private	= bpf_iter_fini_seq_net,
 	.seq_priv_size		= sizeof(struct ipv6_route_iter),
+};
+
+static struct bpf_iter_reg ipv6_route_reg_info = {
+	.target			= "ipv6_route",
 	.ctx_arg_info_size	= 1,
 	.ctx_arg_info		= {
 		{ offsetof(struct bpf_iter__ipv6_route, rt),
 		  PTR_TO_BTF_ID_OR_NULL },
 	},
+	.seq_info		= &ipv6_route_seq_info,
 };
 
 static int __init bpf_iter_register(void)
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index d8921b833744..b5f30d7d30d0 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2807,17 +2807,21 @@ static const struct rhashtable_params netlink_rhashtable_params = {
 BTF_ID_LIST(btf_netlink_sock_id)
 BTF_ID(struct, netlink_sock)
 
-static struct bpf_iter_reg netlink_reg_info = {
-	.target			= "netlink",
+static const struct bpf_iter_seq_info netlink_seq_info = {
 	.seq_ops		= &netlink_seq_ops,
 	.init_seq_private	= bpf_iter_init_seq_net,
 	.fini_seq_private	= bpf_iter_fini_seq_net,
 	.seq_priv_size		= sizeof(struct nl_seq_iter),
+};
+
+static struct bpf_iter_reg netlink_reg_info = {
+	.target			= "netlink",
 	.ctx_arg_info_size	= 1,
 	.ctx_arg_info		= {
 		{ offsetof(struct bpf_iter__netlink, sk),
 		  PTR_TO_BTF_ID_OR_NULL },
 	},
+	.seq_info		= &netlink_seq_info,
 };
 
 static int __init bpf_iter_register(void)
-- 
cgit v1.2.3


From f9c792729581bd8b8473af163e8ab426c2c61d89 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 23 Jul 2020 11:41:10 -0700
Subject: bpf: Refactor to provide aux info to bpf_iter_init_seq_priv_t

This patch refactored target bpf_iter_init_seq_priv_t callback
function to accept additional information. This will be needed
in later patches for map element targets since a particular
map should be passed to traverse elements for that particular
map. In the future, other information may be passed to target
as well, e.g., pid, cgroup id, etc. to customize the iterator.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723184110.590156-1-yhs@fb.com
---
 fs/proc/proc_net.c      | 2 +-
 include/linux/bpf.h     | 7 ++++++-
 include/linux/proc_fs.h | 3 ++-
 kernel/bpf/bpf_iter.c   | 2 +-
 kernel/bpf/task_iter.c  | 2 +-
 net/ipv4/tcp_ipv4.c     | 4 ++--
 net/ipv4/udp.c          | 4 ++--
 7 files changed, 15 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index dba63b2429f0..ed8a6306990c 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -98,7 +98,7 @@ static const struct proc_ops proc_net_seq_ops = {
 	.proc_release	= seq_release_net,
 };
 
-int bpf_iter_init_seq_net(void *priv_data)
+int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux)
 {
 #ifdef CONFIG_NET_NS
 	struct seq_net_private *p = priv_data;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 127067f71fd4..ef52717336cf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -33,11 +33,13 @@ struct btf;
 struct btf_type;
 struct exception_table_entry;
 struct seq_operations;
+struct bpf_iter_aux_info;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
 
-typedef int (*bpf_iter_init_seq_priv_t)(void *private_data);
+typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
+					struct bpf_iter_aux_info *aux);
 typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data);
 struct bpf_iter_seq_info {
 	const struct seq_operations *seq_ops;
@@ -1198,6 +1200,9 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
 	extern int bpf_iter_ ## target(args);			\
 	int __init bpf_iter_ ## target(args) { return 0; }
 
+struct bpf_iter_aux_info {
+};
+
 #define BPF_ITER_CTX_ARG_MAX 2
 struct bpf_iter_reg {
 	const char *target;
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index d1eed1b43651..2df965cd0974 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -133,7 +133,8 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo
 						    void *data);
 extern struct pid *tgid_pidfd_to_pid(const struct file *file);
 
-extern int bpf_iter_init_seq_net(void *priv_data);
+struct bpf_iter_aux_info;
+extern int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux);
 extern void bpf_iter_fini_seq_net(void *priv_data);
 
 #ifdef CONFIG_PROC_PID_ARCH_STATUS
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 5b2387d6aa1f..8fa94cb1b5a0 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -442,7 +442,7 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link)
 	}
 
 	if (tinfo->reg_info->seq_info->init_seq_private) {
-		err = tinfo->reg_info->seq_info->init_seq_private(priv_data->target_private);
+		err = tinfo->reg_info->seq_info->init_seq_private(priv_data->target_private, NULL);
 		if (err)
 			goto release_seq_file;
 	}
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 6d9cd23869bf..232df29793e9 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -293,7 +293,7 @@ static void task_file_seq_stop(struct seq_file *seq, void *v)
 	}
 }
 
-static int init_seq_pidns(void *priv_data)
+static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
 {
 	struct bpf_iter_seq_task_common *common = priv_data;
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index cb288fdcf2ca..5084333b5ab6 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2921,7 +2921,7 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
 		     struct sock_common *sk_common, uid_t uid)
 
-static int bpf_iter_init_tcp(void *priv_data)
+static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
 {
 	struct tcp_iter_state *st = priv_data;
 	struct tcp_seq_afinfo *afinfo;
@@ -2933,7 +2933,7 @@ static int bpf_iter_init_tcp(void *priv_data)
 
 	afinfo->family = AF_UNSPEC;
 	st->bpf_seq_afinfo = afinfo;
-	ret = bpf_iter_init_seq_net(priv_data);
+	ret = bpf_iter_init_seq_net(priv_data, aux);
 	if (ret)
 		kfree(afinfo);
 	return ret;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1bc50ec2caef..7ce31beccfc2 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -3181,7 +3181,7 @@ static struct pernet_operations __net_initdata udp_sysctl_ops = {
 DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta,
 		     struct udp_sock *udp_sk, uid_t uid, int bucket)
 
-static int bpf_iter_init_udp(void *priv_data)
+static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
 {
 	struct udp_iter_state *st = priv_data;
 	struct udp_seq_afinfo *afinfo;
@@ -3194,7 +3194,7 @@ static int bpf_iter_init_udp(void *priv_data)
 	afinfo->family = AF_UNSPEC;
 	afinfo->udp_table = &udp_table;
 	st->bpf_seq_afinfo = afinfo;
-	ret = bpf_iter_init_seq_net(priv_data);
+	ret = bpf_iter_init_seq_net(priv_data, aux);
 	if (ret)
 		kfree(afinfo);
 	return ret;
-- 
cgit v1.2.3


From afbf21dce668ef59482037596eaffbe5041e094c Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 23 Jul 2020 11:41:11 -0700
Subject: bpf: Support readonly/readwrite buffers in verifier

Readonly and readwrite buffer register states
are introduced. Totally four states,
PTR_TO_RDONLY_BUF[_OR_NULL] and PTR_TO_RDWR_BUF[_OR_NULL]
are supported. As suggested by their respective
names, PTR_TO_RDONLY_BUF[_OR_NULL] are for
readonly buffers and PTR_TO_RDWR_BUF[_OR_NULL]
for read/write buffers.

These new register states will be used
by later bpf map element iterator.

New register states share some similarity to
PTR_TO_TP_BUFFER as it will calculate accessed buffer
size during verification time. The accessed buffer
size will be later compared to other metrics during
later attach/link_create time.

Similar to reg_state PTR_TO_BTF_ID_OR_NULL in bpf
iterator programs, PTR_TO_RDONLY_BUF_OR_NULL or
PTR_TO_RDWR_BUF_OR_NULL reg_types can be set at
prog->aux->bpf_ctx_arg_aux, and bpf verifier will
retrieve the values during btf_ctx_access().
Later bpf map element iterator implementation
will show how such information will be assigned
during target registeration time.

The verifier is also enhanced such that PTR_TO_RDONLY_BUF
can be passed to ARG_PTR_TO_MEM[_OR_NULL] helper argument, and
PTR_TO_RDWR_BUF can be passed to ARG_PTR_TO_MEM[_OR_NULL] or
ARG_PTR_TO_UNINIT_MEM.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723184111.590274-1-yhs@fb.com
---
 include/linux/bpf.h   |  6 ++++
 kernel/bpf/btf.c      | 13 ++++++++
 kernel/bpf/verifier.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 104 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ef52717336cf..f9c4bb08f616 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -353,6 +353,10 @@ enum bpf_reg_type {
 	PTR_TO_BTF_ID_OR_NULL,	 /* reg points to kernel struct or NULL */
 	PTR_TO_MEM,		 /* reg points to valid memory region */
 	PTR_TO_MEM_OR_NULL,	 /* reg points to valid memory region or NULL */
+	PTR_TO_RDONLY_BUF,	 /* reg points to a readonly buffer */
+	PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */
+	PTR_TO_RDWR_BUF,	 /* reg points to a read/write buffer */
+	PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -694,6 +698,8 @@ struct bpf_prog_aux {
 	u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */
 	u32 attach_btf_id; /* in-kernel BTF type id to attach to */
 	u32 ctx_arg_info_size;
+	u32 max_rdonly_access;
+	u32 max_rdwr_access;
 	const struct bpf_ctx_arg_aux *ctx_arg_info;
 	struct bpf_prog *linked_prog;
 	bool verifier_zext; /* Zero extensions has been inserted by verifier. */
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index ee36b7f60936..0fd6bb62be3a 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3806,6 +3806,19 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 			btf_kind_str[BTF_INFO_KIND(t->info)]);
 		return false;
 	}
+
+	/* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */
+	for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
+		const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
+
+		if (ctx_arg_info->offset == off &&
+		    (ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL ||
+		     ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) {
+			info->reg_type = ctx_arg_info->reg_type;
+			return true;
+		}
+	}
+
 	if (t->type == 0)
 		/* This is a pointer to void.
 		 * It is the same as scalar from the verifier safety pov.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9a6703bc3f36..8d6979db48d8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -409,7 +409,9 @@ static bool reg_type_may_be_null(enum bpf_reg_type type)
 	       type == PTR_TO_SOCK_COMMON_OR_NULL ||
 	       type == PTR_TO_TCP_SOCK_OR_NULL ||
 	       type == PTR_TO_BTF_ID_OR_NULL ||
-	       type == PTR_TO_MEM_OR_NULL;
+	       type == PTR_TO_MEM_OR_NULL ||
+	       type == PTR_TO_RDONLY_BUF_OR_NULL ||
+	       type == PTR_TO_RDWR_BUF_OR_NULL;
 }
 
 static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
@@ -503,6 +505,10 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_BTF_ID_OR_NULL]	= "ptr_or_null_",
 	[PTR_TO_MEM]		= "mem",
 	[PTR_TO_MEM_OR_NULL]	= "mem_or_null",
+	[PTR_TO_RDONLY_BUF]	= "rdonly_buf",
+	[PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null",
+	[PTR_TO_RDWR_BUF]	= "rdwr_buf",
+	[PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null",
 };
 
 static char slot_type_char[] = {
@@ -2173,6 +2179,10 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_XDP_SOCK:
 	case PTR_TO_BTF_ID:
 	case PTR_TO_BTF_ID_OR_NULL:
+	case PTR_TO_RDONLY_BUF:
+	case PTR_TO_RDONLY_BUF_OR_NULL:
+	case PTR_TO_RDWR_BUF:
+	case PTR_TO_RDWR_BUF_OR_NULL:
 		return true;
 	default:
 		return false;
@@ -3052,14 +3062,15 @@ int check_ctx_reg(struct bpf_verifier_env *env,
 	return 0;
 }
 
-static int check_tp_buffer_access(struct bpf_verifier_env *env,
-				  const struct bpf_reg_state *reg,
-				  int regno, int off, int size)
+static int __check_buffer_access(struct bpf_verifier_env *env,
+				 const char *buf_info,
+				 const struct bpf_reg_state *reg,
+				 int regno, int off, int size)
 {
 	if (off < 0) {
 		verbose(env,
-			"R%d invalid tracepoint buffer access: off=%d, size=%d",
-			regno, off, size);
+			"R%d invalid %s buffer access: off=%d, size=%d",
+			regno, buf_info, off, size);
 		return -EACCES;
 	}
 	if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
@@ -3071,12 +3082,45 @@ static int check_tp_buffer_access(struct bpf_verifier_env *env,
 			regno, off, tn_buf);
 		return -EACCES;
 	}
+
+	return 0;
+}
+
+static int check_tp_buffer_access(struct bpf_verifier_env *env,
+				  const struct bpf_reg_state *reg,
+				  int regno, int off, int size)
+{
+	int err;
+
+	err = __check_buffer_access(env, "tracepoint", reg, regno, off, size);
+	if (err)
+		return err;
+
 	if (off + size > env->prog->aux->max_tp_access)
 		env->prog->aux->max_tp_access = off + size;
 
 	return 0;
 }
 
+static int check_buffer_access(struct bpf_verifier_env *env,
+			       const struct bpf_reg_state *reg,
+			       int regno, int off, int size,
+			       bool zero_size_allowed,
+			       const char *buf_info,
+			       u32 *max_access)
+{
+	int err;
+
+	err = __check_buffer_access(env, buf_info, reg, regno, off, size);
+	if (err)
+		return err;
+
+	if (off + size > *max_access)
+		*max_access = off + size;
+
+	return 0;
+}
+
 /* BPF architecture zero extends alu32 ops into 64-bit registesr */
 static void zext_32_to_64(struct bpf_reg_state *reg)
 {
@@ -3427,6 +3471,23 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 	} else if (reg->type == CONST_PTR_TO_MAP) {
 		err = check_ptr_to_map_access(env, regs, regno, off, size, t,
 					      value_regno);
+	} else if (reg->type == PTR_TO_RDONLY_BUF) {
+		if (t == BPF_WRITE) {
+			verbose(env, "R%d cannot write into %s\n",
+				regno, reg_type_str[reg->type]);
+			return -EACCES;
+		}
+		err = check_buffer_access(env, reg, regno, off, size, "rdonly",
+					  false,
+					  &env->prog->aux->max_rdonly_access);
+		if (!err && value_regno >= 0)
+			mark_reg_unknown(env, regs, value_regno);
+	} else if (reg->type == PTR_TO_RDWR_BUF) {
+		err = check_buffer_access(env, reg, regno, off, size, "rdwr",
+					  false,
+					  &env->prog->aux->max_rdwr_access);
+		if (!err && t == BPF_READ && value_regno >= 0)
+			mark_reg_unknown(env, regs, value_regno);
 	} else {
 		verbose(env, "R%d invalid mem access '%s'\n", regno,
 			reg_type_str[reg->type]);
@@ -3668,6 +3729,18 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 		return check_mem_region_access(env, regno, reg->off,
 					       access_size, reg->mem_size,
 					       zero_size_allowed);
+	case PTR_TO_RDONLY_BUF:
+		if (meta && meta->raw_mode)
+			return -EACCES;
+		return check_buffer_access(env, reg, regno, reg->off,
+					   access_size, zero_size_allowed,
+					   "rdonly",
+					   &env->prog->aux->max_rdonly_access);
+	case PTR_TO_RDWR_BUF:
+		return check_buffer_access(env, reg, regno, reg->off,
+					   access_size, zero_size_allowed,
+					   "rdwr",
+					   &env->prog->aux->max_rdwr_access);
 	default: /* scalar_value|ptr_to_stack or invalid ptr */
 		return check_stack_boundary(env, regno, access_size,
 					    zero_size_allowed, meta);
@@ -3933,6 +4006,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 		else if (!type_is_pkt_pointer(type) &&
 			 type != PTR_TO_MAP_VALUE &&
 			 type != PTR_TO_MEM &&
+			 type != PTR_TO_RDONLY_BUF &&
+			 type != PTR_TO_RDWR_BUF &&
 			 type != expected_type)
 			goto err_type;
 		meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
@@ -6806,6 +6881,10 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
 			reg->type = PTR_TO_BTF_ID;
 		} else if (reg->type == PTR_TO_MEM_OR_NULL) {
 			reg->type = PTR_TO_MEM;
+		} else if (reg->type == PTR_TO_RDONLY_BUF_OR_NULL) {
+			reg->type = PTR_TO_RDONLY_BUF;
+		} else if (reg->type == PTR_TO_RDWR_BUF_OR_NULL) {
+			reg->type = PTR_TO_RDWR_BUF;
 		}
 		if (is_null) {
 			/* We don't need id and ref_obj_id from this point
-- 
cgit v1.2.3


From a5cbe05a6673b85bed2a63ffcfea6a96c6410cff Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 23 Jul 2020 11:41:12 -0700
Subject: bpf: Implement bpf iterator for map elements

The bpf iterator for map elements are implemented.
The bpf program will receive four parameters:
  bpf_iter_meta *meta: the meta data
  bpf_map *map:        the bpf_map whose elements are traversed
  void *key:           the key of one element
  void *value:         the value of the same element

Here, meta and map pointers are always valid, and
key has register type PTR_TO_RDONLY_BUF_OR_NULL and
value has register type PTR_TO_RDWR_BUF_OR_NULL.
The kernel will track the access range of key and value
during verification time. Later, these values will be compared
against the values in the actual map to ensure all accesses
are within range.

A new field iter_seq_info is added to bpf_map_ops which
is used to add map type specific information, i.e., seq_ops,
init/fini seq_file func and seq_file private data size.
Subsequent patches will have actual implementation
for bpf_map_ops->iter_seq_info.

In user space, BPF_ITER_LINK_MAP_FD needs to be
specified in prog attr->link_create.flags, which indicates
that attr->link_create.target_fd is a map_fd.
The reason for such an explicit flag is for possible
future cases where one bpf iterator may allow more than
one possible customization, e.g., pid and cgroup id for
task_file.

Current kernel internal implementation only allows
the target to register at most one required bpf_iter_link_info.
To support the above case, optional bpf_iter_link_info's
are needed, the target can be extended to register such link
infos, and user provided link_info needs to match one of
target supported ones.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723184112.590360-1-yhs@fb.com
---
 include/linux/bpf.h            | 16 ++++++++
 include/uapi/linux/bpf.h       |  7 ++++
 kernel/bpf/bpf_iter.c          | 85 ++++++++++++++++++++++++++++++++++--------
 kernel/bpf/map_iter.c          | 30 ++++++++++++++-
 tools/include/uapi/linux/bpf.h |  7 ++++
 5 files changed, 128 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f9c4bb08f616..4175cf1f4665 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -107,6 +107,9 @@ struct bpf_map_ops {
 	/* BTF name and id of struct allocated by map_alloc */
 	const char * const map_btf_name;
 	int *map_btf_id;
+
+	/* bpf_iter info used to open a seq_file */
+	const struct bpf_iter_seq_info *iter_seq_info;
 };
 
 struct bpf_map_memory {
@@ -1207,12 +1210,18 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
 	int __init bpf_iter_ ## target(args) { return 0; }
 
 struct bpf_iter_aux_info {
+	struct bpf_map *map;
 };
 
+typedef int (*bpf_iter_check_target_t)(struct bpf_prog *prog,
+				       struct bpf_iter_aux_info *aux);
+
 #define BPF_ITER_CTX_ARG_MAX 2
 struct bpf_iter_reg {
 	const char *target;
+	bpf_iter_check_target_t check_target;
 	u32 ctx_arg_info_size;
+	enum bpf_iter_link_info req_linfo;
 	struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX];
 	const struct bpf_iter_seq_info *seq_info;
 };
@@ -1223,6 +1232,13 @@ struct bpf_iter_meta {
 	u64 seq_num;
 };
 
+struct bpf_iter__bpf_map_elem {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct bpf_map *, map);
+	__bpf_md_ptr(void *, key);
+	__bpf_md_ptr(void *, value);
+};
+
 int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info);
 void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info);
 bool bpf_iter_prog_supported(struct bpf_prog *prog);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 54d0c886e3ba..828c2f6438f2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -246,6 +246,13 @@ enum bpf_link_type {
 	MAX_BPF_LINK_TYPE,
 };
 
+enum bpf_iter_link_info {
+	BPF_ITER_LINK_UNSPEC = 0,
+	BPF_ITER_LINK_MAP_FD = 1,
+
+	MAX_BPF_ITER_LINK_INFO,
+};
+
 /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command
  *
  * NONE(default): No further bpf programs allowed in the subtree.
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 8fa94cb1b5a0..363b9cafc2d8 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -14,11 +14,13 @@ struct bpf_iter_target_info {
 
 struct bpf_iter_link {
 	struct bpf_link link;
+	struct bpf_iter_aux_info aux;
 	struct bpf_iter_target_info *tinfo;
 };
 
 struct bpf_iter_priv_data {
 	struct bpf_iter_target_info *tinfo;
+	const struct bpf_iter_seq_info *seq_info;
 	struct bpf_prog *prog;
 	u64 session_id;
 	u64 seq_num;
@@ -35,7 +37,8 @@ static DEFINE_MUTEX(link_mutex);
 /* incremented on every opened seq_file */
 static atomic64_t session_id;
 
-static int prepare_seq_file(struct file *file, struct bpf_iter_link *link);
+static int prepare_seq_file(struct file *file, struct bpf_iter_link *link,
+			    const struct bpf_iter_seq_info *seq_info);
 
 static void bpf_iter_inc_seq_num(struct seq_file *seq)
 {
@@ -199,11 +202,25 @@ done:
 	return copied;
 }
 
+static const struct bpf_iter_seq_info *
+__get_seq_info(struct bpf_iter_link *link)
+{
+	const struct bpf_iter_seq_info *seq_info;
+
+	if (link->aux.map) {
+		seq_info = link->aux.map->ops->iter_seq_info;
+		if (seq_info)
+			return seq_info;
+	}
+
+	return link->tinfo->reg_info->seq_info;
+}
+
 static int iter_open(struct inode *inode, struct file *file)
 {
 	struct bpf_iter_link *link = inode->i_private;
 
-	return prepare_seq_file(file, link);
+	return prepare_seq_file(file, link, __get_seq_info(link));
 }
 
 static int iter_release(struct inode *inode, struct file *file)
@@ -218,8 +235,8 @@ static int iter_release(struct inode *inode, struct file *file)
 	iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
 				 target_private);
 
-	if (iter_priv->tinfo->reg_info->seq_info->fini_seq_private)
-		iter_priv->tinfo->reg_info->seq_info->fini_seq_private(seq->private);
+	if (iter_priv->seq_info->fini_seq_private)
+		iter_priv->seq_info->fini_seq_private(seq->private);
 
 	bpf_prog_put(iter_priv->prog);
 	seq->private = iter_priv;
@@ -318,6 +335,11 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog)
 
 static void bpf_iter_link_release(struct bpf_link *link)
 {
+	struct bpf_iter_link *iter_link =
+		container_of(link, struct bpf_iter_link, link);
+
+	if (iter_link->aux.map)
+		bpf_map_put_with_uref(iter_link->aux.map);
 }
 
 static void bpf_iter_link_dealloc(struct bpf_link *link)
@@ -370,14 +392,13 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 {
 	struct bpf_link_primer link_primer;
 	struct bpf_iter_target_info *tinfo;
+	struct bpf_iter_aux_info aux = {};
 	struct bpf_iter_link *link;
+	u32 prog_btf_id, target_fd;
 	bool existed = false;
-	u32 prog_btf_id;
+	struct bpf_map *map;
 	int err;
 
-	if (attr->link_create.target_fd || attr->link_create.flags)
-		return -EINVAL;
-
 	prog_btf_id = prog->aux->attach_btf_id;
 	mutex_lock(&targets_mutex);
 	list_for_each_entry(tinfo, &targets, list) {
@@ -390,6 +411,13 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 	if (!existed)
 		return -ENOENT;
 
+	/* Make sure user supplied flags are target expected. */
+	target_fd = attr->link_create.target_fd;
+	if (attr->link_create.flags != tinfo->reg_info->req_linfo)
+		return -EINVAL;
+	if (!attr->link_create.flags && target_fd)
+		return -EINVAL;
+
 	link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
 	if (!link)
 		return -ENOMEM;
@@ -403,21 +431,45 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 		return err;
 	}
 
+	if (tinfo->reg_info->req_linfo == BPF_ITER_LINK_MAP_FD) {
+		map = bpf_map_get_with_uref(target_fd);
+		if (IS_ERR(map)) {
+			err = PTR_ERR(map);
+			goto cleanup_link;
+		}
+
+		aux.map = map;
+		err = tinfo->reg_info->check_target(prog, &aux);
+		if (err) {
+			bpf_map_put_with_uref(map);
+			goto cleanup_link;
+		}
+
+		link->aux.map = map;
+	}
+
 	return bpf_link_settle(&link_primer);
+
+cleanup_link:
+	bpf_link_cleanup(&link_primer);
+	return err;
 }
 
 static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
 			  struct bpf_iter_target_info *tinfo,
+			  const struct bpf_iter_seq_info *seq_info,
 			  struct bpf_prog *prog)
 {
 	priv_data->tinfo = tinfo;
+	priv_data->seq_info = seq_info;
 	priv_data->prog = prog;
 	priv_data->session_id = atomic64_inc_return(&session_id);
 	priv_data->seq_num = 0;
 	priv_data->done_stop = false;
 }
 
-static int prepare_seq_file(struct file *file, struct bpf_iter_link *link)
+static int prepare_seq_file(struct file *file, struct bpf_iter_link *link,
+			    const struct bpf_iter_seq_info *seq_info)
 {
 	struct bpf_iter_priv_data *priv_data;
 	struct bpf_iter_target_info *tinfo;
@@ -433,21 +485,21 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link)
 
 	tinfo = link->tinfo;
 	total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) +
-			   tinfo->reg_info->seq_info->seq_priv_size;
-	priv_data = __seq_open_private(file, tinfo->reg_info->seq_info->seq_ops,
+			   seq_info->seq_priv_size;
+	priv_data = __seq_open_private(file, seq_info->seq_ops,
 				       total_priv_dsize);
 	if (!priv_data) {
 		err = -ENOMEM;
 		goto release_prog;
 	}
 
-	if (tinfo->reg_info->seq_info->init_seq_private) {
-		err = tinfo->reg_info->seq_info->init_seq_private(priv_data->target_private, NULL);
+	if (seq_info->init_seq_private) {
+		err = seq_info->init_seq_private(priv_data->target_private, &link->aux);
 		if (err)
 			goto release_seq_file;
 	}
 
-	init_seq_meta(priv_data, tinfo, prog);
+	init_seq_meta(priv_data, tinfo, seq_info, prog);
 	seq = file->private_data;
 	seq->private = priv_data->target_private;
 
@@ -463,6 +515,7 @@ release_prog:
 
 int bpf_iter_new_fd(struct bpf_link *link)
 {
+	struct bpf_iter_link *iter_link;
 	struct file *file;
 	unsigned int flags;
 	int err, fd;
@@ -481,8 +534,8 @@ int bpf_iter_new_fd(struct bpf_link *link)
 		goto free_fd;
 	}
 
-	err = prepare_seq_file(file,
-			       container_of(link, struct bpf_iter_link, link));
+	iter_link = container_of(link, struct bpf_iter_link, link);
+	err = prepare_seq_file(file, iter_link, __get_seq_info(iter_link));
 	if (err)
 		goto free_file;
 
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index 1a69241fb1e2..8a1f9b3355d0 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -98,10 +98,38 @@ static struct bpf_iter_reg bpf_map_reg_info = {
 	.seq_info		= &bpf_map_seq_info,
 };
 
+static int bpf_iter_check_map(struct bpf_prog *prog,
+			      struct bpf_iter_aux_info *aux)
+{
+	return -EINVAL;
+}
+
+DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta,
+		     struct bpf_map *map, void *key, void *value)
+
+static const struct bpf_iter_reg bpf_map_elem_reg_info = {
+	.target			= "bpf_map_elem",
+	.check_target		= bpf_iter_check_map,
+	.req_linfo		= BPF_ITER_LINK_MAP_FD,
+	.ctx_arg_info_size	= 2,
+	.ctx_arg_info		= {
+		{ offsetof(struct bpf_iter__bpf_map_elem, key),
+		  PTR_TO_RDONLY_BUF_OR_NULL },
+		{ offsetof(struct bpf_iter__bpf_map_elem, value),
+		  PTR_TO_RDWR_BUF_OR_NULL },
+	},
+};
+
 static int __init bpf_map_iter_init(void)
 {
+	int ret;
+
 	bpf_map_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_map_id;
-	return bpf_iter_reg_target(&bpf_map_reg_info);
+	ret = bpf_iter_reg_target(&bpf_map_reg_info);
+	if (ret)
+		return ret;
+
+	return bpf_iter_reg_target(&bpf_map_elem_reg_info);
 }
 
 late_initcall(bpf_map_iter_init);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 54d0c886e3ba..828c2f6438f2 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -246,6 +246,13 @@ enum bpf_link_type {
 	MAX_BPF_LINK_TYPE,
 };
 
+enum bpf_iter_link_info {
+	BPF_ITER_LINK_UNSPEC = 0,
+	BPF_ITER_LINK_MAP_FD = 1,
+
+	MAX_BPF_ITER_LINK_INFO,
+};
+
 /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command
  *
  * NONE(default): No further bpf programs allowed in the subtree.
-- 
cgit v1.2.3


From d6c4503cc29638f328e1a6e6fefbdbda401c28fc Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 23 Jul 2020 11:41:14 -0700
Subject: bpf: Implement bpf iterator for hash maps

The bpf iterators for hash, percpu hash, lru hash
and lru percpu hash are implemented. During link time,
bpf_iter_reg->check_target() will check map type
and ensure the program access key/value region is
within the map defined key/value size limit.

For percpu hash and lru hash maps, the bpf program
will receive values for all cpus. The map element
bpf iterator infrastructure will prepare value
properly before passing the value pointer to the
bpf program.

This patch set supports readonly map keys and
read/write map values. It does not support deleting
map elements, e.g., from hash tables. If there is
a user case for this, the following mechanism can
be used to support map deletion for hashtab, etc.
  - permit a new bpf program return value, e.g., 2,
    to let bpf iterator know the map element should
    be removed.
  - since bucket lock is taken, the map element will be
    queued.
  - once bucket lock is released after all elements under
    this bucket are traversed, all to-be-deleted map
    elements can be deleted.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723184114.590470-1-yhs@fb.com
---
 kernel/bpf/hashtab.c  | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/map_iter.c |  24 ++++++-
 2 files changed, 217 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d4378d7d442b..024276787055 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1612,6 +1612,196 @@ htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
 						  true, false);
 }
 
+struct bpf_iter_seq_hash_map_info {
+	struct bpf_map *map;
+	struct bpf_htab *htab;
+	void *percpu_value_buf; // non-zero means percpu hash
+	unsigned long flags;
+	u32 bucket_id;
+	u32 skip_elems;
+};
+
+static struct htab_elem *
+bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
+			   struct htab_elem *prev_elem)
+{
+	const struct bpf_htab *htab = info->htab;
+	unsigned long flags = info->flags;
+	u32 skip_elems = info->skip_elems;
+	u32 bucket_id = info->bucket_id;
+	struct hlist_nulls_head *head;
+	struct hlist_nulls_node *n;
+	struct htab_elem *elem;
+	struct bucket *b;
+	u32 i, count;
+
+	if (bucket_id >= htab->n_buckets)
+		return NULL;
+
+	/* try to find next elem in the same bucket */
+	if (prev_elem) {
+		/* no update/deletion on this bucket, prev_elem should be still valid
+		 * and we won't skip elements.
+		 */
+		n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node));
+		elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node);
+		if (elem)
+			return elem;
+
+		/* not found, unlock and go to the next bucket */
+		b = &htab->buckets[bucket_id++];
+		htab_unlock_bucket(htab, b, flags);
+		skip_elems = 0;
+	}
+
+	for (i = bucket_id; i < htab->n_buckets; i++) {
+		b = &htab->buckets[i];
+		flags = htab_lock_bucket(htab, b);
+
+		count = 0;
+		head = &b->head;
+		hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
+			if (count >= skip_elems) {
+				info->flags = flags;
+				info->bucket_id = i;
+				info->skip_elems = count;
+				return elem;
+			}
+			count++;
+		}
+
+		htab_unlock_bucket(htab, b, flags);
+		skip_elems = 0;
+	}
+
+	info->bucket_id = i;
+	info->skip_elems = 0;
+	return NULL;
+}
+
+static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct bpf_iter_seq_hash_map_info *info = seq->private;
+	struct htab_elem *elem;
+
+	elem = bpf_hash_map_seq_find_next(info, NULL);
+	if (!elem)
+		return NULL;
+
+	if (*pos == 0)
+		++*pos;
+	return elem;
+}
+
+static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_iter_seq_hash_map_info *info = seq->private;
+
+	++*pos;
+	++info->skip_elems;
+	return bpf_hash_map_seq_find_next(info, v);
+}
+
+static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
+{
+	struct bpf_iter_seq_hash_map_info *info = seq->private;
+	u32 roundup_key_size, roundup_value_size;
+	struct bpf_iter__bpf_map_elem ctx = {};
+	struct bpf_map *map = info->map;
+	struct bpf_iter_meta meta;
+	int ret = 0, off = 0, cpu;
+	struct bpf_prog *prog;
+	void __percpu *pptr;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, elem == NULL);
+	if (prog) {
+		ctx.meta = &meta;
+		ctx.map = info->map;
+		if (elem) {
+			roundup_key_size = round_up(map->key_size, 8);
+			ctx.key = elem->key;
+			if (!info->percpu_value_buf) {
+				ctx.value = elem->key + roundup_key_size;
+			} else {
+				roundup_value_size = round_up(map->value_size, 8);
+				pptr = htab_elem_get_ptr(elem, map->key_size);
+				for_each_possible_cpu(cpu) {
+					bpf_long_memcpy(info->percpu_value_buf + off,
+							per_cpu_ptr(pptr, cpu),
+							roundup_value_size);
+					off += roundup_value_size;
+				}
+				ctx.value = info->percpu_value_buf;
+			}
+		}
+		ret = bpf_iter_run_prog(prog, &ctx);
+	}
+
+	return ret;
+}
+
+static int bpf_hash_map_seq_show(struct seq_file *seq, void *v)
+{
+	return __bpf_hash_map_seq_show(seq, v);
+}
+
+static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_seq_hash_map_info *info = seq->private;
+
+	if (!v)
+		(void)__bpf_hash_map_seq_show(seq, NULL);
+	else
+		htab_unlock_bucket(info->htab,
+				   &info->htab->buckets[info->bucket_id],
+				   info->flags);
+}
+
+static int bpf_iter_init_hash_map(void *priv_data,
+				  struct bpf_iter_aux_info *aux)
+{
+	struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
+	struct bpf_map *map = aux->map;
+	void *value_buf;
+	u32 buf_size;
+
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+		buf_size = round_up(map->value_size, 8) * num_possible_cpus();
+		value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
+		if (!value_buf)
+			return -ENOMEM;
+
+		seq_info->percpu_value_buf = value_buf;
+	}
+
+	seq_info->map = map;
+	seq_info->htab = container_of(map, struct bpf_htab, map);
+	return 0;
+}
+
+static void bpf_iter_fini_hash_map(void *priv_data)
+{
+	struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
+
+	kfree(seq_info->percpu_value_buf);
+}
+
+static const struct seq_operations bpf_hash_map_seq_ops = {
+	.start	= bpf_hash_map_seq_start,
+	.next	= bpf_hash_map_seq_next,
+	.stop	= bpf_hash_map_seq_stop,
+	.show	= bpf_hash_map_seq_show,
+};
+
+static const struct bpf_iter_seq_info iter_seq_info = {
+	.seq_ops		= &bpf_hash_map_seq_ops,
+	.init_seq_private	= bpf_iter_init_hash_map,
+	.fini_seq_private	= bpf_iter_fini_hash_map,
+	.seq_priv_size		= sizeof(struct bpf_iter_seq_hash_map_info),
+};
+
 static int htab_map_btf_id;
 const struct bpf_map_ops htab_map_ops = {
 	.map_alloc_check = htab_map_alloc_check,
@@ -1626,6 +1816,7 @@ const struct bpf_map_ops htab_map_ops = {
 	BATCH_OPS(htab),
 	.map_btf_name = "bpf_htab",
 	.map_btf_id = &htab_map_btf_id,
+	.iter_seq_info = &iter_seq_info,
 };
 
 static int htab_lru_map_btf_id;
@@ -1643,6 +1834,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
 	BATCH_OPS(htab_lru),
 	.map_btf_name = "bpf_htab",
 	.map_btf_id = &htab_lru_map_btf_id,
+	.iter_seq_info = &iter_seq_info,
 };
 
 /* Called from eBPF program */
@@ -1760,6 +1952,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
 	BATCH_OPS(htab_percpu),
 	.map_btf_name = "bpf_htab",
 	.map_btf_id = &htab_percpu_map_btf_id,
+	.iter_seq_info = &iter_seq_info,
 };
 
 static int htab_lru_percpu_map_btf_id;
@@ -1775,6 +1968,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
 	BATCH_OPS(htab_lru_percpu),
 	.map_btf_name = "bpf_htab",
 	.map_btf_id = &htab_lru_percpu_map_btf_id,
+	.iter_seq_info = &iter_seq_info,
 };
 
 static int fd_htab_map_alloc_check(union bpf_attr *attr)
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index 8a1f9b3355d0..bcb68b55bf65 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -101,7 +101,29 @@ static struct bpf_iter_reg bpf_map_reg_info = {
 static int bpf_iter_check_map(struct bpf_prog *prog,
 			      struct bpf_iter_aux_info *aux)
 {
-	return -EINVAL;
+	u32 key_acc_size, value_acc_size, key_size, value_size;
+	struct bpf_map *map = aux->map;
+	bool is_percpu = false;
+
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH)
+		is_percpu = true;
+	else if (map->map_type != BPF_MAP_TYPE_HASH &&
+		 map->map_type != BPF_MAP_TYPE_LRU_HASH)
+		return -EINVAL;
+
+	key_acc_size = prog->aux->max_rdonly_access;
+	value_acc_size = prog->aux->max_rdwr_access;
+	key_size = map->key_size;
+	if (!is_percpu)
+		value_size = map->value_size;
+	else
+		value_size = round_up(map->value_size, 8) * num_possible_cpus();
+
+	if (key_acc_size > key_size || value_acc_size > value_size)
+		return -EACCES;
+
+	return 0;
 }
 
 DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta,
-- 
cgit v1.2.3


From d3cc2ab546adc6e52b65f36f7c34820d2830d0c9 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 23 Jul 2020 11:41:15 -0700
Subject: bpf: Implement bpf iterator for array maps

The bpf iterators for array and percpu array
are implemented. Similar to hash maps, for percpu
array map, bpf program will receive values
from all cpus.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723184115.590532-1-yhs@fb.com
---
 kernel/bpf/arraymap.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/map_iter.c |   6 ++-
 2 files changed, 142 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index c66e8273fccd..8ff419b632a6 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -487,6 +487,142 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
 				   vma->vm_pgoff + pgoff);
 }
 
+struct bpf_iter_seq_array_map_info {
+	struct bpf_map *map;
+	void *percpu_value_buf;
+	u32 index;
+};
+
+static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct bpf_iter_seq_array_map_info *info = seq->private;
+	struct bpf_map *map = info->map;
+	struct bpf_array *array;
+	u32 index;
+
+	if (info->index >= map->max_entries)
+		return NULL;
+
+	if (*pos == 0)
+		++*pos;
+	array = container_of(map, struct bpf_array, map);
+	index = info->index & array->index_mask;
+	if (info->percpu_value_buf)
+	       return array->pptrs[index];
+	return array->value + array->elem_size * index;
+}
+
+static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_iter_seq_array_map_info *info = seq->private;
+	struct bpf_map *map = info->map;
+	struct bpf_array *array;
+	u32 index;
+
+	++*pos;
+	++info->index;
+	if (info->index >= map->max_entries)
+		return NULL;
+
+	array = container_of(map, struct bpf_array, map);
+	index = info->index & array->index_mask;
+	if (info->percpu_value_buf)
+	       return array->pptrs[index];
+	return array->value + array->elem_size * index;
+}
+
+static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_seq_array_map_info *info = seq->private;
+	struct bpf_iter__bpf_map_elem ctx = {};
+	struct bpf_map *map = info->map;
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+	int off = 0, cpu = 0;
+	void __percpu **pptr;
+	u32 size;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, v == NULL);
+	if (!prog)
+		return 0;
+
+	ctx.meta = &meta;
+	ctx.map = info->map;
+	if (v) {
+		ctx.key = &info->index;
+
+		if (!info->percpu_value_buf) {
+			ctx.value = v;
+		} else {
+			pptr = v;
+			size = round_up(map->value_size, 8);
+			for_each_possible_cpu(cpu) {
+				bpf_long_memcpy(info->percpu_value_buf + off,
+						per_cpu_ptr(pptr, cpu),
+						size);
+				off += size;
+			}
+			ctx.value = info->percpu_value_buf;
+		}
+	}
+
+	return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int bpf_array_map_seq_show(struct seq_file *seq, void *v)
+{
+	return __bpf_array_map_seq_show(seq, v);
+}
+
+static void bpf_array_map_seq_stop(struct seq_file *seq, void *v)
+{
+	if (!v)
+		(void)__bpf_array_map_seq_show(seq, NULL);
+}
+
+static int bpf_iter_init_array_map(void *priv_data,
+				   struct bpf_iter_aux_info *aux)
+{
+	struct bpf_iter_seq_array_map_info *seq_info = priv_data;
+	struct bpf_map *map = aux->map;
+	void *value_buf;
+	u32 buf_size;
+
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+		buf_size = round_up(map->value_size, 8) * num_possible_cpus();
+		value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
+		if (!value_buf)
+			return -ENOMEM;
+
+		seq_info->percpu_value_buf = value_buf;
+	}
+
+	seq_info->map = map;
+	return 0;
+}
+
+static void bpf_iter_fini_array_map(void *priv_data)
+{
+	struct bpf_iter_seq_array_map_info *seq_info = priv_data;
+
+	kfree(seq_info->percpu_value_buf);
+}
+
+static const struct seq_operations bpf_array_map_seq_ops = {
+	.start	= bpf_array_map_seq_start,
+	.next	= bpf_array_map_seq_next,
+	.stop	= bpf_array_map_seq_stop,
+	.show	= bpf_array_map_seq_show,
+};
+
+static const struct bpf_iter_seq_info iter_seq_info = {
+	.seq_ops		= &bpf_array_map_seq_ops,
+	.init_seq_private	= bpf_iter_init_array_map,
+	.fini_seq_private	= bpf_iter_fini_array_map,
+	.seq_priv_size		= sizeof(struct bpf_iter_seq_array_map_info),
+};
+
 static int array_map_btf_id;
 const struct bpf_map_ops array_map_ops = {
 	.map_alloc_check = array_map_alloc_check,
@@ -506,6 +642,7 @@ const struct bpf_map_ops array_map_ops = {
 	.map_update_batch = generic_map_update_batch,
 	.map_btf_name = "bpf_array",
 	.map_btf_id = &array_map_btf_id,
+	.iter_seq_info = &iter_seq_info,
 };
 
 static int percpu_array_map_btf_id;
@@ -521,6 +658,7 @@ const struct bpf_map_ops percpu_array_map_ops = {
 	.map_check_btf = array_map_check_btf,
 	.map_btf_name = "bpf_array",
 	.map_btf_id = &percpu_array_map_btf_id,
+	.iter_seq_info = &iter_seq_info,
 };
 
 static int fd_array_map_alloc_check(union bpf_attr *attr)
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index bcb68b55bf65..fbe1f557cb88 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -106,10 +106,12 @@ static int bpf_iter_check_map(struct bpf_prog *prog,
 	bool is_percpu = false;
 
 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH)
+	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
 		is_percpu = true;
 	else if (map->map_type != BPF_MAP_TYPE_HASH &&
-		 map->map_type != BPF_MAP_TYPE_LRU_HASH)
+		 map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+		 map->map_type != BPF_MAP_TYPE_ARRAY)
 		return -EINVAL;
 
 	key_acc_size = prog->aux->max_rdonly_access;
-- 
cgit v1.2.3


From 7b04d6d60fcfb5b2200ffebb9cfb90927bdfeec7 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Thu, 23 Jul 2020 11:06:44 -0700
Subject: bpf: Separate bpf_get_[stack|stackid] for perf events BPF

Calling get_perf_callchain() on perf_events from PEBS entries may cause
unwinder errors. To fix this issue, the callchain is fetched early. Such
perf_events are marked with __PERF_SAMPLE_CALLCHAIN_EARLY.

Similarly, calling bpf_get_[stack|stackid] on perf_events from PEBS may
also cause unwinder errors. To fix this, add separate version of these
two helpers, bpf_get_[stack|stackid]_pe. These two hepers use callchain in
bpf_perf_event_data_kern->data->callchain.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723180648.1429892-2-songliubraving@fb.com
---
 include/linux/bpf.h      |   2 +
 kernel/bpf/stackmap.c    | 184 ++++++++++++++++++++++++++++++++++++++++++-----
 kernel/trace/bpf_trace.c |   4 +-
 3 files changed, 170 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4175cf1f4665..8357be349133 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1675,6 +1675,8 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
 extern const struct bpf_func_proto bpf_get_stack_proto;
 extern const struct bpf_func_proto bpf_get_task_stack_proto;
+extern const struct bpf_func_proto bpf_get_stackid_proto_pe;
+extern const struct bpf_func_proto bpf_get_stack_proto_pe;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
 extern const struct bpf_func_proto bpf_sock_hash_update_proto;
 extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 48d8e739975f..5beb2f8c23da 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -4,6 +4,7 @@
 #include <linux/bpf.h>
 #include <linux/jhash.h>
 #include <linux/filter.h>
+#include <linux/kernel.h>
 #include <linux/stacktrace.h>
 #include <linux/perf_event.h>
 #include <linux/elf.h>
@@ -387,11 +388,10 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
 #endif
 }
 
-BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
-	   u64, flags)
+static long __bpf_get_stackid(struct bpf_map *map,
+			      struct perf_callchain_entry *trace, u64 flags)
 {
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
-	struct perf_callchain_entry *trace;
 	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
 	u32 max_depth = map->value_size / stack_map_data_size(map);
 	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
@@ -399,21 +399,9 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
 	u32 hash, id, trace_nr, trace_len;
 	bool user = flags & BPF_F_USER_STACK;
-	bool kernel = !user;
 	u64 *ips;
 	bool hash_matches;
 
-	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
-			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
-		return -EINVAL;
-
-	trace = get_perf_callchain(regs, init_nr, kernel, user,
-				   sysctl_perf_event_max_stack, false, false);
-
-	if (unlikely(!trace))
-		/* couldn't fetch the stack trace */
-		return -EFAULT;
-
 	/* get_perf_callchain() guarantees that trace->nr >= init_nr
 	 * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
 	 */
@@ -478,6 +466,30 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	return id;
 }
 
+BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
+	   u64, flags)
+{
+	u32 max_depth = map->value_size / stack_map_data_size(map);
+	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
+	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
+	bool user = flags & BPF_F_USER_STACK;
+	struct perf_callchain_entry *trace;
+	bool kernel = !user;
+
+	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
+		return -EINVAL;
+
+	trace = get_perf_callchain(regs, init_nr, kernel, user,
+				   sysctl_perf_event_max_stack, false, false);
+
+	if (unlikely(!trace))
+		/* couldn't fetch the stack trace */
+		return -EFAULT;
+
+	return __bpf_get_stackid(map, trace, flags);
+}
+
 const struct bpf_func_proto bpf_get_stackid_proto = {
 	.func		= bpf_get_stackid,
 	.gpl_only	= true,
@@ -487,7 +499,77 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+static __u64 count_kernel_ip(struct perf_callchain_entry *trace)
+{
+	__u64 nr_kernel = 0;
+
+	while (nr_kernel < trace->nr) {
+		if (trace->ip[nr_kernel] == PERF_CONTEXT_USER)
+			break;
+		nr_kernel++;
+	}
+	return nr_kernel;
+}
+
+BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
+	   struct bpf_map *, map, u64, flags)
+{
+	struct perf_event *event = ctx->event;
+	struct perf_callchain_entry *trace;
+	bool kernel, user;
+	__u64 nr_kernel;
+	int ret;
+
+	/* perf_sample_data doesn't have callchain, use bpf_get_stackid */
+	if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
+		return bpf_get_stackid((unsigned long)(ctx->regs),
+				       (unsigned long) map, flags, 0, 0);
+
+	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
+		return -EINVAL;
+
+	user = flags & BPF_F_USER_STACK;
+	kernel = !user;
+
+	trace = ctx->data->callchain;
+	if (unlikely(!trace))
+		return -EFAULT;
+
+	nr_kernel = count_kernel_ip(trace);
+
+	if (kernel) {
+		__u64 nr = trace->nr;
+
+		trace->nr = nr_kernel;
+		ret = __bpf_get_stackid(map, trace, flags);
+
+		/* restore nr */
+		trace->nr = nr;
+	} else { /* user */
+		u64 skip = flags & BPF_F_SKIP_FIELD_MASK;
+
+		skip += nr_kernel;
+		if (skip > BPF_F_SKIP_FIELD_MASK)
+			return -EFAULT;
+
+		flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
+		ret = __bpf_get_stackid(map, trace, flags);
+	}
+	return ret;
+}
+
+const struct bpf_func_proto bpf_get_stackid_proto_pe = {
+	.func		= bpf_get_stackid_pe,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+};
+
 static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
+			    struct perf_callchain_entry *trace_in,
 			    void *buf, u32 size, u64 flags)
 {
 	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
@@ -520,7 +602,9 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 	else
 		init_nr = sysctl_perf_event_max_stack - num_elem;
 
-	if (kernel && task)
+	if (trace_in)
+		trace = trace_in;
+	else if (kernel && task)
 		trace = get_callchain_entry_for_task(task, init_nr);
 	else
 		trace = get_perf_callchain(regs, init_nr, kernel, user,
@@ -556,7 +640,7 @@ clear:
 BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
 	   u64, flags)
 {
-	return __bpf_get_stack(regs, NULL, buf, size, flags);
+	return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
 }
 
 const struct bpf_func_proto bpf_get_stack_proto = {
@@ -574,7 +658,7 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
 {
 	struct pt_regs *regs = task_pt_regs(task);
 
-	return __bpf_get_stack(regs, task, buf, size, flags);
+	return __bpf_get_stack(regs, task, NULL, buf, size, flags);
 }
 
 BTF_ID_LIST(bpf_get_task_stack_btf_ids)
@@ -591,6 +675,70 @@ const struct bpf_func_proto bpf_get_task_stack_proto = {
 	.btf_id		= bpf_get_task_stack_btf_ids,
 };
 
+BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
+	   void *, buf, u32, size, u64, flags)
+{
+	struct perf_event *event = ctx->event;
+	struct perf_callchain_entry *trace;
+	bool kernel, user;
+	int err = -EINVAL;
+	__u64 nr_kernel;
+
+	if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
+		return __bpf_get_stack(ctx->regs, NULL, NULL, buf, size, flags);
+
+	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+			       BPF_F_USER_BUILD_ID)))
+		goto clear;
+
+	user = flags & BPF_F_USER_STACK;
+	kernel = !user;
+
+	err = -EFAULT;
+	trace = ctx->data->callchain;
+	if (unlikely(!trace))
+		goto clear;
+
+	nr_kernel = count_kernel_ip(trace);
+
+	if (kernel) {
+		__u64 nr = trace->nr;
+
+		trace->nr = nr_kernel;
+		err = __bpf_get_stack(ctx->regs, NULL, trace, buf,
+				      size, flags);
+
+		/* restore nr */
+		trace->nr = nr;
+	} else { /* user */
+		u64 skip = flags & BPF_F_SKIP_FIELD_MASK;
+
+		skip += nr_kernel;
+		if (skip > BPF_F_SKIP_FIELD_MASK)
+			goto clear;
+
+		flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
+		err = __bpf_get_stack(ctx->regs, NULL, trace, buf,
+				      size, flags);
+	}
+	return err;
+
+clear:
+	memset(buf, 0, size);
+	return err;
+
+}
+
+const struct bpf_func_proto bpf_get_stack_proto_pe = {
+	.func		= bpf_get_stack_pe,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 /* Called from eBPF program */
 static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
 {
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3cc0dcb60ca2..cb91ef902cc4 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1411,9 +1411,9 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_perf_event_output:
 		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
-		return &bpf_get_stackid_proto_tp;
+		return &bpf_get_stackid_proto_pe;
 	case BPF_FUNC_get_stack:
-		return &bpf_get_stack_proto_tp;
+		return &bpf_get_stack_proto_pe;
 	case BPF_FUNC_perf_prog_read_value:
 		return &bpf_perf_prog_read_value_proto;
 	case BPF_FUNC_read_branch_records:
-- 
cgit v1.2.3


From 5d99cb2c86775b4780c02a339a9578bf9471ead9 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Thu, 23 Jul 2020 11:06:45 -0700
Subject: bpf: Fail PERF_EVENT_IOC_SET_BPF when bpf_get_[stack|stackid] cannot
 work

bpf_get_[stack|stackid] on perf_events with precise_ip uses callchain
attached to perf_sample_data. If this callchain is not presented, do not
allow attaching BPF program that calls bpf_get_[stack|stackid] to this
event.

In the error case, -EPROTO is returned so that libbpf can identify this
error and print proper hint message.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723180648.1429892-3-songliubraving@fb.com
---
 include/linux/filter.h |  3 ++-
 kernel/bpf/verifier.c  |  3 +++
 kernel/events/core.c   | 18 ++++++++++++++++++
 3 files changed, 23 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index d07a6e973a7d..0a355b005bf4 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -533,7 +533,8 @@ struct bpf_prog {
 				is_func:1,	/* program is a bpf function */
 				kprobe_override:1, /* Do we override a kprobe? */
 				has_callchain_buf:1, /* callchain buffer allocated? */
-				enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */
+				enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
+				call_get_stack:1; /* Do we call bpf_get_stack() or bpf_get_stackid() */
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	enum bpf_attach_type	expected_attach_type; /* For some prog types */
 	u32			len;		/* Number of filter blocks */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8d6979db48d8..cd14e70f2d07 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4962,6 +4962,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		env->prog->has_callchain_buf = true;
 	}
 
+	if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack)
+		env->prog->call_get_stack = true;
+
 	if (changes_data)
 		clear_all_pkt_pointers(env);
 	return 0;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 856d98c36f56..ddcfd2fb5cc5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9544,6 +9544,24 @@ static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
 
+	if (event->attr.precise_ip &&
+	    prog->call_get_stack &&
+	    (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) ||
+	     event->attr.exclude_callchain_kernel ||
+	     event->attr.exclude_callchain_user)) {
+		/*
+		 * On perf_event with precise_ip, calling bpf_get_stack()
+		 * may trigger unwinder warnings and occasional crashes.
+		 * bpf_get_[stack|stackid] works around this issue by using
+		 * callchain attached to perf_sample_data. If the
+		 * perf_event does not full (kernel and user) callchain
+		 * attached to perf_sample_data, do not allow attaching BPF
+		 * program that calls bpf_get_[stack|stackid].
+		 */
+		bpf_prog_put(prog);
+		return -EPROTO;
+	}
+
 	event->prog = prog;
 	event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
 	WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
-- 
cgit v1.2.3


From 7d9c3427894fe70d1347b4820476bf37736d2ff0 Mon Sep 17 00:00:00 2001
From: YiFei Zhu <zhuyifei@google.com>
Date: Thu, 23 Jul 2020 23:47:43 -0500
Subject: bpf: Make cgroup storages shared between programs on the same cgroup

This change comes in several parts:

One, the restriction that the CGROUP_STORAGE map can only be used
by one program is removed. This results in the removal of the field
'aux' in struct bpf_cgroup_storage_map, and removal of relevant
code associated with the field, and removal of now-noop functions
bpf_free_cgroup_storage and bpf_cgroup_storage_release.

Second, we permit a key of type u64 as the key to the map.
Providing such a key type indicates that the map should ignore
attach type when comparing map keys. However, for simplicity newly
linked storage will still have the attach type at link time in
its key struct. cgroup_storage_check_btf is adapted to accept
u64 as the type of the key.

Third, because the storages are now shared, the storages cannot
be unconditionally freed on program detach. There could be two
ways to solve this issue:
* A. Reference count the usage of the storages, and free when the
     last program is detached.
* B. Free only when the storage is impossible to be referred to
     again, i.e. when either the cgroup_bpf it is attached to, or
     the map itself, is freed.
Option A has the side effect that, when the user detach and
reattach a program, whether the program gets a fresh storage
depends on whether there is another program attached using that
storage. This could trigger races if the user is multi-threaded,
and since nondeterminism in data races is evil, go with option B.

The both the map and the cgroup_bpf now tracks their associated
storages, and the storage unlink and free are removed from
cgroup_bpf_detach and added to cgroup_bpf_release and
cgroup_storage_map_free. The latter also new holds the cgroup_mutex
to prevent any races with the former.

Fourth, on attach, we reuse the old storage if the key already
exists in the map, via cgroup_storage_lookup. If the storage
does not exist yet, we create a new one, and publish it at the
last step in the attach process. This does not create a race
condition because for the whole attach the cgroup_mutex is held.
We keep track of an array of new storages that was allocated
and if the process fails only the new storages would get freed.

Signed-off-by: YiFei Zhu <zhuyifei@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/d5401c6106728a00890401190db40020a1f84ff1.1595565795.git.zhuyifei@google.com
---
 include/linux/bpf-cgroup.h |  12 ++-
 kernel/bpf/cgroup.c        |  67 ++++++++------
 kernel/bpf/core.c          |  12 ---
 kernel/bpf/local_storage.c | 216 ++++++++++++++++++++++++---------------------
 4 files changed, 164 insertions(+), 143 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 2c6f26670acc..64f367044e25 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -46,7 +46,8 @@ struct bpf_cgroup_storage {
 	};
 	struct bpf_cgroup_storage_map *map;
 	struct bpf_cgroup_storage_key key;
-	struct list_head list;
+	struct list_head list_map;
+	struct list_head list_cg;
 	struct rb_node node;
 	struct rcu_head rcu;
 };
@@ -78,6 +79,9 @@ struct cgroup_bpf {
 	struct list_head progs[MAX_BPF_ATTACH_TYPE];
 	u32 flags[MAX_BPF_ATTACH_TYPE];
 
+	/* list of cgroup shared storages */
+	struct list_head storages;
+
 	/* temp storage for effective prog array used by prog_attach/detach */
 	struct bpf_prog_array *inactive;
 
@@ -161,6 +165,9 @@ static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage
 		this_cpu_write(bpf_cgroup_storage[stype], storage[stype]);
 }
 
+struct bpf_cgroup_storage *
+cgroup_storage_lookup(struct bpf_cgroup_storage_map *map,
+		      void *key, bool locked);
 struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
 					enum bpf_cgroup_storage_type stype);
 void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage);
@@ -169,7 +176,6 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
 			     enum bpf_attach_type type);
 void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage);
 int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map);
-void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *map);
 
 int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
@@ -383,8 +389,6 @@ static inline void bpf_cgroup_storage_set(
 	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) {}
 static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux,
 					    struct bpf_map *map) { return 0; }
-static inline void bpf_cgroup_storage_release(struct bpf_prog_aux *aux,
-					      struct bpf_map *map) {}
 static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
 	struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return NULL; }
 static inline void bpf_cgroup_storage_free(
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ac53102e244a..957cce1d5168 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -37,17 +37,34 @@ static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[])
 }
 
 static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[],
-				     struct bpf_prog *prog)
+				     struct bpf_cgroup_storage *new_storages[],
+				     enum bpf_attach_type type,
+				     struct bpf_prog *prog,
+				     struct cgroup *cgrp)
 {
 	enum bpf_cgroup_storage_type stype;
+	struct bpf_cgroup_storage_key key;
+	struct bpf_map *map;
+
+	key.cgroup_inode_id = cgroup_id(cgrp);
+	key.attach_type = type;
 
 	for_each_cgroup_storage_type(stype) {
+		map = prog->aux->cgroup_storage[stype];
+		if (!map)
+			continue;
+
+		storages[stype] = cgroup_storage_lookup((void *)map, &key, false);
+		if (storages[stype])
+			continue;
+
 		storages[stype] = bpf_cgroup_storage_alloc(prog, stype);
 		if (IS_ERR(storages[stype])) {
-			storages[stype] = NULL;
-			bpf_cgroup_storages_free(storages);
+			bpf_cgroup_storages_free(new_storages);
 			return -ENOMEM;
 		}
+
+		new_storages[stype] = storages[stype];
 	}
 
 	return 0;
@@ -63,7 +80,7 @@ static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[],
 }
 
 static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
-				     struct cgroup* cgrp,
+				     struct cgroup *cgrp,
 				     enum bpf_attach_type attach_type)
 {
 	enum bpf_cgroup_storage_type stype;
@@ -72,14 +89,6 @@ static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
 		bpf_cgroup_storage_link(storages[stype], cgrp, attach_type);
 }
 
-static void bpf_cgroup_storages_unlink(struct bpf_cgroup_storage *storages[])
-{
-	enum bpf_cgroup_storage_type stype;
-
-	for_each_cgroup_storage_type(stype)
-		bpf_cgroup_storage_unlink(storages[stype]);
-}
-
 /* Called when bpf_cgroup_link is auto-detached from dying cgroup.
  * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It
  * doesn't free link memory, which will eventually be done by bpf_link's
@@ -101,22 +110,23 @@ static void cgroup_bpf_release(struct work_struct *work)
 	struct cgroup *p, *cgrp = container_of(work, struct cgroup,
 					       bpf.release_work);
 	struct bpf_prog_array *old_array;
+	struct list_head *storages = &cgrp->bpf.storages;
+	struct bpf_cgroup_storage *storage, *stmp;
+
 	unsigned int type;
 
 	mutex_lock(&cgroup_mutex);
 
 	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
 		struct list_head *progs = &cgrp->bpf.progs[type];
-		struct bpf_prog_list *pl, *tmp;
+		struct bpf_prog_list *pl, *pltmp;
 
-		list_for_each_entry_safe(pl, tmp, progs, node) {
+		list_for_each_entry_safe(pl, pltmp, progs, node) {
 			list_del(&pl->node);
 			if (pl->prog)
 				bpf_prog_put(pl->prog);
 			if (pl->link)
 				bpf_cgroup_link_auto_detach(pl->link);
-			bpf_cgroup_storages_unlink(pl->storage);
-			bpf_cgroup_storages_free(pl->storage);
 			kfree(pl);
 			static_branch_dec(&cgroup_bpf_enabled_key);
 		}
@@ -126,6 +136,11 @@ static void cgroup_bpf_release(struct work_struct *work)
 		bpf_prog_array_free(old_array);
 	}
 
+	list_for_each_entry_safe(storage, stmp, storages, list_cg) {
+		bpf_cgroup_storage_unlink(storage);
+		bpf_cgroup_storage_free(storage);
+	}
+
 	mutex_unlock(&cgroup_mutex);
 
 	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
@@ -290,6 +305,8 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
 	for (i = 0; i < NR; i++)
 		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
 
+	INIT_LIST_HEAD(&cgrp->bpf.storages);
+
 	for (i = 0; i < NR; i++)
 		if (compute_effective_progs(cgrp, i, &arrays[i]))
 			goto cleanup;
@@ -422,7 +439,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
 	struct list_head *progs = &cgrp->bpf.progs[type];
 	struct bpf_prog *old_prog = NULL;
 	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
-	struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
+	struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
 	struct bpf_prog_list *pl;
 	int err;
 
@@ -455,17 +472,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
 	if (IS_ERR(pl))
 		return PTR_ERR(pl);
 
-	if (bpf_cgroup_storages_alloc(storage, prog ? : link->link.prog))
+	if (bpf_cgroup_storages_alloc(storage, new_storage, type,
+				      prog ? : link->link.prog, cgrp))
 		return -ENOMEM;
 
 	if (pl) {
 		old_prog = pl->prog;
-		bpf_cgroup_storages_unlink(pl->storage);
-		bpf_cgroup_storages_assign(old_storage, pl->storage);
 	} else {
 		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
 		if (!pl) {
-			bpf_cgroup_storages_free(storage);
+			bpf_cgroup_storages_free(new_storage);
 			return -ENOMEM;
 		}
 		list_add_tail(&pl->node, progs);
@@ -480,12 +496,11 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
 	if (err)
 		goto cleanup;
 
-	bpf_cgroup_storages_free(old_storage);
 	if (old_prog)
 		bpf_prog_put(old_prog);
 	else
 		static_branch_inc(&cgroup_bpf_enabled_key);
-	bpf_cgroup_storages_link(pl->storage, cgrp, type);
+	bpf_cgroup_storages_link(new_storage, cgrp, type);
 	return 0;
 
 cleanup:
@@ -493,9 +508,7 @@ cleanup:
 		pl->prog = old_prog;
 		pl->link = NULL;
 	}
-	bpf_cgroup_storages_free(pl->storage);
-	bpf_cgroup_storages_assign(pl->storage, old_storage);
-	bpf_cgroup_storages_link(pl->storage, cgrp, type);
+	bpf_cgroup_storages_free(new_storage);
 	if (!old_prog) {
 		list_del(&pl->node);
 		kfree(pl);
@@ -679,8 +692,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 
 	/* now can actually delete it from this cgroup list */
 	list_del(&pl->node);
-	bpf_cgroup_storages_unlink(pl->storage);
-	bpf_cgroup_storages_free(pl->storage);
 	kfree(pl);
 	if (list_empty(progs))
 		/* last program was detached, reset flags to zero */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7be02e555ab9..bde93344164d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2097,24 +2097,12 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array,
 								     : 0;
 }
 
-static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux)
-{
-	enum bpf_cgroup_storage_type stype;
-
-	for_each_cgroup_storage_type(stype) {
-		if (!aux->cgroup_storage[stype])
-			continue;
-		bpf_cgroup_storage_release(aux, aux->cgroup_storage[stype]);
-	}
-}
-
 void __bpf_free_used_maps(struct bpf_prog_aux *aux,
 			  struct bpf_map **used_maps, u32 len)
 {
 	struct bpf_map *map;
 	u32 i;
 
-	bpf_free_cgroup_storage(aux);
 	for (i = 0; i < len; i++) {
 		map = used_maps[i];
 		if (map->ops->map_poke_untrack)
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 51bd5a8cb01b..3b2c70197d78 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -9,6 +9,8 @@
 #include <linux/slab.h>
 #include <uapi/linux/btf.h>
 
+#include "../cgroup/cgroup-internal.h"
+
 DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
 
 #ifdef CONFIG_CGROUP_BPF
@@ -20,7 +22,6 @@ struct bpf_cgroup_storage_map {
 	struct bpf_map map;
 
 	spinlock_t lock;
-	struct bpf_prog_aux *aux;
 	struct rb_root root;
 	struct list_head list;
 };
@@ -30,24 +31,41 @@ static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map)
 	return container_of(map, struct bpf_cgroup_storage_map, map);
 }
 
-static int bpf_cgroup_storage_key_cmp(
-	const struct bpf_cgroup_storage_key *key1,
-	const struct bpf_cgroup_storage_key *key2)
+static bool attach_type_isolated(const struct bpf_map *map)
 {
-	if (key1->cgroup_inode_id < key2->cgroup_inode_id)
-		return -1;
-	else if (key1->cgroup_inode_id > key2->cgroup_inode_id)
-		return 1;
-	else if (key1->attach_type < key2->attach_type)
-		return -1;
-	else if (key1->attach_type > key2->attach_type)
-		return 1;
+	return map->key_size == sizeof(struct bpf_cgroup_storage_key);
+}
+
+static int bpf_cgroup_storage_key_cmp(const struct bpf_cgroup_storage_map *map,
+				      const void *_key1, const void *_key2)
+{
+	if (attach_type_isolated(&map->map)) {
+		const struct bpf_cgroup_storage_key *key1 = _key1;
+		const struct bpf_cgroup_storage_key *key2 = _key2;
+
+		if (key1->cgroup_inode_id < key2->cgroup_inode_id)
+			return -1;
+		else if (key1->cgroup_inode_id > key2->cgroup_inode_id)
+			return 1;
+		else if (key1->attach_type < key2->attach_type)
+			return -1;
+		else if (key1->attach_type > key2->attach_type)
+			return 1;
+	} else {
+		const __u64 *cgroup_inode_id1 = _key1;
+		const __u64 *cgroup_inode_id2 = _key2;
+
+		if (*cgroup_inode_id1 < *cgroup_inode_id2)
+			return -1;
+		else if (*cgroup_inode_id1 > *cgroup_inode_id2)
+			return 1;
+	}
 	return 0;
 }
 
-static struct bpf_cgroup_storage *cgroup_storage_lookup(
-	struct bpf_cgroup_storage_map *map, struct bpf_cgroup_storage_key *key,
-	bool locked)
+struct bpf_cgroup_storage *
+cgroup_storage_lookup(struct bpf_cgroup_storage_map *map,
+		      void *key, bool locked)
 {
 	struct rb_root *root = &map->root;
 	struct rb_node *node;
@@ -61,7 +79,7 @@ static struct bpf_cgroup_storage *cgroup_storage_lookup(
 
 		storage = container_of(node, struct bpf_cgroup_storage, node);
 
-		switch (bpf_cgroup_storage_key_cmp(key, &storage->key)) {
+		switch (bpf_cgroup_storage_key_cmp(map, key, &storage->key)) {
 		case -1:
 			node = node->rb_left;
 			break;
@@ -93,7 +111,7 @@ static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map,
 		this = container_of(*new, struct bpf_cgroup_storage, node);
 
 		parent = *new;
-		switch (bpf_cgroup_storage_key_cmp(&storage->key, &this->key)) {
+		switch (bpf_cgroup_storage_key_cmp(map, &storage->key, &this->key)) {
 		case -1:
 			new = &((*new)->rb_left);
 			break;
@@ -111,10 +129,9 @@ static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map,
 	return 0;
 }
 
-static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key)
+static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key)
 {
 	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
-	struct bpf_cgroup_storage_key *key = _key;
 	struct bpf_cgroup_storage *storage;
 
 	storage = cgroup_storage_lookup(map, key, false);
@@ -124,17 +141,13 @@ static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key)
 	return &READ_ONCE(storage->buf)->data[0];
 }
 
-static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
+static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
 				      void *value, u64 flags)
 {
-	struct bpf_cgroup_storage_key *key = _key;
 	struct bpf_cgroup_storage *storage;
 	struct bpf_storage_buffer *new;
 
-	if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST)))
-		return -EINVAL;
-
-	if (unlikely(flags & BPF_NOEXIST))
+	if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST)))
 		return -EINVAL;
 
 	if (unlikely((flags & BPF_F_LOCK) &&
@@ -167,11 +180,10 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
 	return 0;
 }
 
-int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key,
+int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key,
 				   void *value)
 {
 	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
-	struct bpf_cgroup_storage_key *key = _key;
 	struct bpf_cgroup_storage *storage;
 	int cpu, off = 0;
 	u32 size;
@@ -197,11 +209,10 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key,
 	return 0;
 }
 
-int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key,
+int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key,
 				     void *value, u64 map_flags)
 {
 	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
-	struct bpf_cgroup_storage_key *key = _key;
 	struct bpf_cgroup_storage *storage;
 	int cpu, off = 0;
 	u32 size;
@@ -232,12 +243,10 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key,
 	return 0;
 }
 
-static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key,
+static int cgroup_storage_get_next_key(struct bpf_map *_map, void *key,
 				       void *_next_key)
 {
 	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
-	struct bpf_cgroup_storage_key *key = _key;
-	struct bpf_cgroup_storage_key *next = _next_key;
 	struct bpf_cgroup_storage *storage;
 
 	spin_lock_bh(&map->lock);
@@ -250,17 +259,23 @@ static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key,
 		if (!storage)
 			goto enoent;
 
-		storage = list_next_entry(storage, list);
+		storage = list_next_entry(storage, list_map);
 		if (!storage)
 			goto enoent;
 	} else {
 		storage = list_first_entry(&map->list,
-					 struct bpf_cgroup_storage, list);
+					 struct bpf_cgroup_storage, list_map);
 	}
 
 	spin_unlock_bh(&map->lock);
-	next->attach_type = storage->key.attach_type;
-	next->cgroup_inode_id = storage->key.cgroup_inode_id;
+
+	if (attach_type_isolated(&map->map)) {
+		struct bpf_cgroup_storage_key *next = _next_key;
+		*next = storage->key;
+	} else {
+		__u64 *next = _next_key;
+		*next = storage->key.cgroup_inode_id;
+	}
 	return 0;
 
 enoent:
@@ -275,7 +290,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
 	struct bpf_map_memory mem;
 	int ret;
 
-	if (attr->key_size != sizeof(struct bpf_cgroup_storage_key))
+	if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) &&
+	    attr->key_size != sizeof(__u64))
 		return ERR_PTR(-EINVAL);
 
 	if (attr->value_size == 0)
@@ -318,6 +334,17 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
 static void cgroup_storage_map_free(struct bpf_map *_map)
 {
 	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+	struct list_head *storages = &map->list;
+	struct bpf_cgroup_storage *storage, *stmp;
+
+	mutex_lock(&cgroup_mutex);
+
+	list_for_each_entry_safe(storage, stmp, storages, list_map) {
+		bpf_cgroup_storage_unlink(storage);
+		bpf_cgroup_storage_free(storage);
+	}
+
+	mutex_unlock(&cgroup_mutex);
 
 	WARN_ON(!RB_EMPTY_ROOT(&map->root));
 	WARN_ON(!list_empty(&map->list));
@@ -335,49 +362,63 @@ static int cgroup_storage_check_btf(const struct bpf_map *map,
 				    const struct btf_type *key_type,
 				    const struct btf_type *value_type)
 {
-	struct btf_member *m;
-	u32 offset, size;
-
-	/* Key is expected to be of struct bpf_cgroup_storage_key type,
-	 * which is:
-	 * struct bpf_cgroup_storage_key {
-	 *	__u64	cgroup_inode_id;
-	 *	__u32	attach_type;
-	 * };
-	 */
+	if (attach_type_isolated(map)) {
+		struct btf_member *m;
+		u32 offset, size;
+
+		/* Key is expected to be of struct bpf_cgroup_storage_key type,
+		 * which is:
+		 * struct bpf_cgroup_storage_key {
+		 *	__u64	cgroup_inode_id;
+		 *	__u32	attach_type;
+		 * };
+		 */
+
+		/*
+		 * Key_type must be a structure with two fields.
+		 */
+		if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ||
+		    BTF_INFO_VLEN(key_type->info) != 2)
+			return -EINVAL;
+
+		/*
+		 * The first field must be a 64 bit integer at 0 offset.
+		 */
+		m = (struct btf_member *)(key_type + 1);
+		size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id);
+		if (!btf_member_is_reg_int(btf, key_type, m, 0, size))
+			return -EINVAL;
+
+		/*
+		 * The second field must be a 32 bit integer at 64 bit offset.
+		 */
+		m++;
+		offset = offsetof(struct bpf_cgroup_storage_key, attach_type);
+		size = sizeof_field(struct bpf_cgroup_storage_key, attach_type);
+		if (!btf_member_is_reg_int(btf, key_type, m, offset, size))
+			return -EINVAL;
+	} else {
+		u32 int_data;
 
-	/*
-	 * Key_type must be a structure with two fields.
-	 */
-	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ||
-	    BTF_INFO_VLEN(key_type->info) != 2)
-		return -EINVAL;
+		/*
+		 * Key is expected to be u64, which stores the cgroup_inode_id
+		 */
 
-	/*
-	 * The first field must be a 64 bit integer at 0 offset.
-	 */
-	m = (struct btf_member *)(key_type + 1);
-	size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id);
-	if (!btf_member_is_reg_int(btf, key_type, m, 0, size))
-		return -EINVAL;
+		if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
+			return -EINVAL;
 
-	/*
-	 * The second field must be a 32 bit integer at 64 bit offset.
-	 */
-	m++;
-	offset = offsetof(struct bpf_cgroup_storage_key, attach_type);
-	size = sizeof_field(struct bpf_cgroup_storage_key, attach_type);
-	if (!btf_member_is_reg_int(btf, key_type, m, offset, size))
-		return -EINVAL;
+		int_data = *(u32 *)(key_type + 1);
+		if (BTF_INT_BITS(int_data) != 64 || BTF_INT_OFFSET(int_data))
+			return -EINVAL;
+	}
 
 	return 0;
 }
 
-static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *_key,
+static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key,
 					 struct seq_file *m)
 {
 	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
-	struct bpf_cgroup_storage_key *key = _key;
 	struct bpf_cgroup_storage *storage;
 	int cpu;
 
@@ -426,38 +467,13 @@ const struct bpf_map_ops cgroup_storage_map_ops = {
 int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map)
 {
 	enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map);
-	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
-	int ret = -EBUSY;
-
-	spin_lock_bh(&map->lock);
 
-	if (map->aux && map->aux != aux)
-		goto unlock;
 	if (aux->cgroup_storage[stype] &&
 	    aux->cgroup_storage[stype] != _map)
-		goto unlock;
+		return -EBUSY;
 
-	map->aux = aux;
 	aux->cgroup_storage[stype] = _map;
-	ret = 0;
-unlock:
-	spin_unlock_bh(&map->lock);
-
-	return ret;
-}
-
-void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *_map)
-{
-	enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map);
-	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
-
-	spin_lock_bh(&map->lock);
-	if (map->aux == aux) {
-		WARN_ON(aux->cgroup_storage[stype] != _map);
-		map->aux = NULL;
-		aux->cgroup_storage[stype] = NULL;
-	}
-	spin_unlock_bh(&map->lock);
+	return 0;
 }
 
 static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages)
@@ -578,7 +594,8 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
 
 	spin_lock_bh(&map->lock);
 	WARN_ON(cgroup_storage_insert(map, storage));
-	list_add(&storage->list, &map->list);
+	list_add(&storage->list_map, &map->list);
+	list_add(&storage->list_cg, &cgroup->bpf.storages);
 	spin_unlock_bh(&map->lock);
 }
 
@@ -596,7 +613,8 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage)
 	root = &map->root;
 	rb_erase(&storage->node, root);
 
-	list_del(&storage->list);
+	list_del(&storage->list_map);
+	list_del(&storage->list_cg);
 	spin_unlock_bh(&map->lock);
 }
 
-- 
cgit v1.2.3


From dfcdf0e9ad2e006196986f363c99b2097aec5ef0 Mon Sep 17 00:00:00 2001
From: YiFei Zhu <zhuyifei@google.com>
Date: Fri, 24 Jul 2020 16:17:53 -0500
Subject: bpf/local_storage: Fix build without CONFIG_CGROUP

local_storage.o has its compile guard as CONFIG_BPF_SYSCALL, which
does not imply that CONFIG_CGROUP is on. Including cgroup-internal.h
when CONFIG_CGROUP is off cause a compilation failure.

Fixes: f67cfc233706 ("bpf: Make cgroup storages shared between programs on the same cgroup")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: YiFei Zhu <zhuyifei@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200724211753.902969-1-zhuyifei1999@gmail.com
---
 kernel/bpf/local_storage.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 3b2c70197d78..571bb351ed3b 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -9,12 +9,12 @@
 #include <linux/slab.h>
 #include <uapi/linux/btf.h>
 
-#include "../cgroup/cgroup-internal.h"
-
 DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
 
 #ifdef CONFIG_CGROUP_BPF
 
+#include "../cgroup/cgroup-internal.h"
+
 #define LOCAL_STORAGE_CREATE_FLAG_MASK					\
 	(BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
 
-- 
cgit v1.2.3


From 2b9b305fcdda1810bdffeb599361174eb2cd0b7c Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Fri, 24 Jul 2020 13:05:02 -0700
Subject: bpf: Fix build on architectures with special bpf_user_pt_regs_t

Architectures like s390, powerpc, arm64, riscv have speical definition of
bpf_user_pt_regs_t. So we need to cast the pointer before passing it to
bpf_get_stack(). This is similar to bpf_get_stack_tp().

Fixes: 03d42fd2d83f ("bpf: Separate bpf_get_[stack|stackid] for perf events BPF")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200724200503.3629591-1-songliubraving@fb.com
---
 kernel/bpf/stackmap.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 5beb2f8c23da..4fd830a62be2 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -678,6 +678,7 @@ const struct bpf_func_proto bpf_get_task_stack_proto = {
 BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
 	   void *, buf, u32, size, u64, flags)
 {
+	struct pt_regs *regs = (struct pt_regs *)(ctx->regs);
 	struct perf_event *event = ctx->event;
 	struct perf_callchain_entry *trace;
 	bool kernel, user;
@@ -685,7 +686,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
 	__u64 nr_kernel;
 
 	if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
-		return __bpf_get_stack(ctx->regs, NULL, NULL, buf, size, flags);
+		return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
 
 	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
 			       BPF_F_USER_BUILD_ID)))
@@ -705,8 +706,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
 		__u64 nr = trace->nr;
 
 		trace->nr = nr_kernel;
-		err = __bpf_get_stack(ctx->regs, NULL, trace, buf,
-				      size, flags);
+		err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
 
 		/* restore nr */
 		trace->nr = nr;
@@ -718,8 +718,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
 			goto clear;
 
 		flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
-		err = __bpf_get_stack(ctx->regs, NULL, trace, buf,
-				      size, flags);
+		err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
 	}
 	return err;
 
-- 
cgit v1.2.3


From aa8d3a716b59db6c1ad6c68fb8aa05e31980da60 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andriin@fb.com>
Date: Tue, 21 Jul 2020 23:45:57 -0700
Subject: bpf, xdp: Add bpf_link-based XDP attachment API

Add bpf_link-based API (bpf_xdp_link) to attach BPF XDP program through
BPF_LINK_CREATE command.

bpf_xdp_link is mutually exclusive with direct BPF program attachment,
previous BPF program should be detached prior to attempting to create a new
bpf_xdp_link attachment (for a given XDP mode). Once BPF link is attached, it
can't be replaced by other BPF program attachment or link attachment. It will
be detached only when the last BPF link FD is closed.

bpf_xdp_link will be auto-detached when net_device is shutdown, similarly to
how other BPF links behave (cgroup, flow_dissector). At that point bpf_link
will become defunct, but won't be destroyed until last FD is closed.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200722064603.3350758-5-andriin@fb.com
---
 include/linux/netdevice.h |   4 ++
 include/uapi/linux/bpf.h  |   7 +-
 kernel/bpf/syscall.c      |   5 ++
 net/core/dev.c            | 169 ++++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 178 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cad44b40c776..7d3c412fcfe5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -888,6 +888,7 @@ struct bpf_prog_offload_ops;
 struct netlink_ext_ack;
 struct xdp_umem;
 struct xdp_dev_bulk_queue;
+struct bpf_xdp_link;
 
 enum bpf_xdp_mode {
 	XDP_MODE_SKB = 0,
@@ -898,6 +899,7 @@ enum bpf_xdp_mode {
 
 struct bpf_xdp_entity {
 	struct bpf_prog *prog;
+	struct bpf_xdp_link *link;
 };
 
 struct netdev_bpf {
@@ -3831,7 +3833,9 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
 int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		      int fd, int expected_fd, u32 flags);
+int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode);
+
 int xdp_umem_query(struct net_device *dev, u16 queue_id);
 
 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 828c2f6438f2..87823fb9c123 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -230,6 +230,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_INET_SOCK_RELEASE,
 	BPF_XDP_CPUMAP,
 	BPF_SK_LOOKUP,
+	BPF_XDP,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -242,6 +243,7 @@ enum bpf_link_type {
 	BPF_LINK_TYPE_CGROUP = 3,
 	BPF_LINK_TYPE_ITER = 4,
 	BPF_LINK_TYPE_NETNS = 5,
+	BPF_LINK_TYPE_XDP = 6,
 
 	MAX_BPF_LINK_TYPE,
 };
@@ -614,7 +616,10 @@ union bpf_attr {
 
 	struct { /* struct used by BPF_LINK_CREATE command */
 		__u32		prog_fd;	/* eBPF program to attach */
-		__u32		target_fd;	/* object to attach to */
+		union {
+			__u32		target_fd;	/* object to attach to */
+			__u32		target_ifindex; /* target ifindex */
+		};
 		__u32		attach_type;	/* attach type */
 		__u32		flags;		/* extra flags */
 	} link_create;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ee290b1f2d9e..0e8c88db7e7a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2824,6 +2824,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
 		return BPF_PROG_TYPE_TRACING;
 	case BPF_SK_LOOKUP:
 		return BPF_PROG_TYPE_SK_LOOKUP;
+	case BPF_XDP:
+		return BPF_PROG_TYPE_XDP;
 	default:
 		return BPF_PROG_TYPE_UNSPEC;
 	}
@@ -3921,6 +3923,9 @@ static int link_create(union bpf_attr *attr)
 	case BPF_PROG_TYPE_SK_LOOKUP:
 		ret = netns_bpf_link_create(attr, prog);
 		break;
+	case BPF_PROG_TYPE_XDP:
+		ret = bpf_xdp_link_attach(attr, prog);
+		break;
 	default:
 		ret = -EINVAL;
 	}
diff --git a/net/core/dev.c b/net/core/dev.c
index 521ce031ee35..e24248f3d675 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8716,6 +8716,12 @@ int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
 }
 EXPORT_SYMBOL(dev_change_proto_down_generic);
 
+struct bpf_xdp_link {
+	struct bpf_link link;
+	struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
+	int flags;
+};
+
 static enum bpf_xdp_mode dev_xdp_mode(u32 flags)
 {
 	if (flags & XDP_FLAGS_HW_MODE)
@@ -8738,9 +8744,19 @@ static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
 	};
 }
 
+static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
+					 enum bpf_xdp_mode mode)
+{
+	return dev->xdp_state[mode].link;
+}
+
 static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
 				     enum bpf_xdp_mode mode)
 {
+	struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
+
+	if (link)
+		return link->link.prog;
 	return dev->xdp_state[mode].prog;
 }
 
@@ -8751,9 +8767,17 @@ u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
 	return prog ? prog->aux->id : 0;
 }
 
+static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
+			     struct bpf_xdp_link *link)
+{
+	dev->xdp_state[mode].link = link;
+	dev->xdp_state[mode].prog = NULL;
+}
+
 static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
 			     struct bpf_prog *prog)
 {
+	dev->xdp_state[mode].link = NULL;
 	dev->xdp_state[mode].prog = prog;
 }
 
@@ -8793,6 +8817,7 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
 
 static void dev_xdp_uninstall(struct net_device *dev)
 {
+	struct bpf_xdp_link *link;
 	struct bpf_prog *prog;
 	enum bpf_xdp_mode mode;
 	bpf_op_t bpf_op;
@@ -8810,14 +8835,20 @@ static void dev_xdp_uninstall(struct net_device *dev)
 
 		WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
 
-		bpf_prog_put(prog);
-		dev_xdp_set_prog(dev, mode, NULL);
+		/* auto-detach link from net device */
+		link = dev_xdp_link(dev, mode);
+		if (link)
+			link->dev = NULL;
+		else
+			bpf_prog_put(prog);
+
+		dev_xdp_set_link(dev, mode, NULL);
 	}
 }
 
 static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
-			  struct bpf_prog *new_prog, struct bpf_prog *old_prog,
-			  u32 flags)
+			  struct bpf_xdp_link *link, struct bpf_prog *new_prog,
+			  struct bpf_prog *old_prog, u32 flags)
 {
 	struct bpf_prog *cur_prog;
 	enum bpf_xdp_mode mode;
@@ -8826,6 +8857,14 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
 
 	ASSERT_RTNL();
 
+	/* either link or prog attachment, never both */
+	if (link && (new_prog || old_prog))
+		return -EINVAL;
+	/* link supports only XDP mode flags */
+	if (link && (flags & ~XDP_FLAGS_MODES)) {
+		NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
+		return -EINVAL;
+	}
 	/* just one XDP mode bit should be set, zero defaults to SKB mode */
 	if (hweight32(flags & XDP_FLAGS_MODES) > 1) {
 		NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
@@ -8838,7 +8877,18 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
 	}
 
 	mode = dev_xdp_mode(flags);
+	/* can't replace attached link */
+	if (dev_xdp_link(dev, mode)) {
+		NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
+		return -EBUSY;
+	}
+
 	cur_prog = dev_xdp_prog(dev, mode);
+	/* can't replace attached prog with link */
+	if (link && cur_prog) {
+		NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
+		return -EBUSY;
+	}
 	if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
 		NL_SET_ERR_MSG(extack, "Active program does not match expected");
 		return -EEXIST;
@@ -8848,6 +8898,10 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
 		return -EBUSY;
 	}
 
+	/* put effective new program into new_prog */
+	if (link)
+		new_prog = link->link.prog;
+
 	if (new_prog) {
 		bool offload = mode == XDP_MODE_HW;
 		enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
@@ -8884,13 +8938,116 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
 			return err;
 	}
 
-	dev_xdp_set_prog(dev, mode, new_prog);
+	if (link)
+		dev_xdp_set_link(dev, mode, link);
+	else
+		dev_xdp_set_prog(dev, mode, new_prog);
 	if (cur_prog)
 		bpf_prog_put(cur_prog);
 
 	return 0;
 }
 
+static int dev_xdp_attach_link(struct net_device *dev,
+			       struct netlink_ext_ack *extack,
+			       struct bpf_xdp_link *link)
+{
+	return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
+}
+
+static int dev_xdp_detach_link(struct net_device *dev,
+			       struct netlink_ext_ack *extack,
+			       struct bpf_xdp_link *link)
+{
+	enum bpf_xdp_mode mode;
+	bpf_op_t bpf_op;
+
+	ASSERT_RTNL();
+
+	mode = dev_xdp_mode(link->flags);
+	if (dev_xdp_link(dev, mode) != link)
+		return -EINVAL;
+
+	bpf_op = dev_xdp_bpf_op(dev, mode);
+	WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
+	dev_xdp_set_link(dev, mode, NULL);
+	return 0;
+}
+
+static void bpf_xdp_link_release(struct bpf_link *link)
+{
+	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
+
+	rtnl_lock();
+
+	/* if racing with net_device's tear down, xdp_link->dev might be
+	 * already NULL, in which case link was already auto-detached
+	 */
+	if (xdp_link->dev)
+		WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
+
+	rtnl_unlock();
+}
+
+static void bpf_xdp_link_dealloc(struct bpf_link *link)
+{
+	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
+
+	kfree(xdp_link);
+}
+
+static const struct bpf_link_ops bpf_xdp_link_lops = {
+	.release = bpf_xdp_link_release,
+	.dealloc = bpf_xdp_link_dealloc,
+};
+
+int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	struct net *net = current->nsproxy->net_ns;
+	struct bpf_link_primer link_primer;
+	struct bpf_xdp_link *link;
+	struct net_device *dev;
+	int err, fd;
+
+	dev = dev_get_by_index(net, attr->link_create.target_ifindex);
+	if (!dev)
+		return -EINVAL;
+
+	link = kzalloc(sizeof(*link), GFP_USER);
+	if (!link) {
+		err = -ENOMEM;
+		goto out_put_dev;
+	}
+
+	bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
+	link->dev = dev;
+	link->flags = attr->link_create.flags;
+
+	err = bpf_link_prime(&link->link, &link_primer);
+	if (err) {
+		kfree(link);
+		goto out_put_dev;
+	}
+
+	rtnl_lock();
+	err = dev_xdp_attach_link(dev, NULL, link);
+	rtnl_unlock();
+
+	if (err) {
+		bpf_link_cleanup(&link_primer);
+		goto out_put_dev;
+	}
+
+	fd = bpf_link_settle(&link_primer);
+	/* link itself doesn't hold dev's refcnt to not complicate shutdown */
+	dev_put(dev);
+	return fd;
+
+out_put_dev:
+	dev_put(dev);
+	return err;
+}
+
 /**
  *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
  *	@dev: device
@@ -8927,7 +9084,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		}
 	}
 
-	err = dev_xdp_attach(dev, extack, new_prog, old_prog, flags);
+	err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
 
 err_out:
 	if (err && new_prog)
-- 
cgit v1.2.3


From aadfc2f957cb470a5a7e52cc41a2fa86e784bcd2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 25 Jul 2020 11:19:51 +0200
Subject: entry: Correct 'noinstr' attributes

The noinstr attribute is to be specified before the return type in the
same way 'inline' is used.

Similar cases were recently fixed for x86 in commit 7f6fa101dfac ("x86:
Correct noinstr qualifiers"), but the generic entry code was based on the
the original version and did not carry the fix over.

Fixes: a5497bab5f72 ("entry: Provide generic interrupt entry/exit code")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20200725091951.744848-3-mingo@kernel.org
---
 kernel/entry/common.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 495f5c051b03..9852e0d62d95 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -256,7 +256,7 @@ noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
 	exit_to_user_mode();
 }
 
-irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs)
+noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
 {
 	irqentry_state_t ret = {
 		.exit_rcu = false,
@@ -333,7 +333,7 @@ void irqentry_exit_cond_resched(void)
 	}
 }
 
-void noinstr irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
+noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
 {
 	lockdep_assert_irqs_disabled();
 
-- 
cgit v1.2.3


From 45e9504f109b84986ba9d25a273a5d5bd3f3e9d9 Mon Sep 17 00:00:00 2001
From: Zenghui Yu <yuzenghui@huawei.com>
Date: Thu, 16 Jul 2020 16:39:05 +0800
Subject: genirq/irqdomain: Remove redundant NULL pointer check on fwnode

The is_fwnode_irqchip() helper will check if the fwnode_handle is empty.
There is no need to perform a redundant check outside of it.

Signed-off-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200716083905.287-1-yuzenghui@huawei.com
---
 kernel/irq/irqdomain.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index a4c2c915511d..155460fc0147 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -142,7 +142,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
 	if (!domain)
 		return NULL;
 
-	if (fwnode && is_fwnode_irqchip(fwnode)) {
+	if (is_fwnode_irqchip(fwnode)) {
 		fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
 
 		switch (fwid->type) {
-- 
cgit v1.2.3


From 8a667928cfae337b85bee4fb0dbc09f3c2715856 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Fri, 10 Jul 2020 23:18:22 +0000
Subject: irqdomain: Export irq_domain_update_bus_token

Add export for irq_domain_update_bus_token() so that
we can allow drivers like the qcom-pdc driver to be
loadable as a module.

Signed-off-by: John Stultz <john.stultz@linaro.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Cc: Andy Gross <agross@kernel.org>
Cc: Bjorn Andersson <bjorn.andersson@linaro.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Maulik Shah <mkshah@codeaurora.org>
Cc: Lina Iyer <ilina@codeaurora.org>
Cc: Saravana Kannan <saravanak@google.com>
Cc: Todd Kjos <tkjos@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: linux-arm-msm@vger.kernel.org
Cc: iommu@lists.linux-foundation.org
Cc: linux-gpio@vger.kernel.org
Link: https://lore.kernel.org/r/20200710231824.60699-2-john.stultz@linaro.org
---
 kernel/irq/irqdomain.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 155460fc0147..76cd7ebd1178 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -281,6 +281,7 @@ void irq_domain_update_bus_token(struct irq_domain *domain,
 
 	mutex_unlock(&irq_domain_mutex);
 }
+EXPORT_SYMBOL_GPL(irq_domain_update_bus_token);
 
 /**
  * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs
-- 
cgit v1.2.3


From 8d16f5b979660c0fdcfa21a418cc03f1fde60cf7 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Fri, 10 Jul 2020 23:18:23 +0000
Subject: genirq: Export irq_chip_retrigger_hierarchy and
 irq_chip_set_vcpu_affinity_parent

Add EXPORT_SYMBOL_GPL entries for irq_chip_retrigger_hierarchy()
and irq_chip_set_vcpu_affinity_parent() so that we can allow
drivers like the qcom-pdc driver to be loadable as a module.

Signed-off-by: John Stultz <john.stultz@linaro.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Cc: Andy Gross <agross@kernel.org>
Cc: Bjorn Andersson <bjorn.andersson@linaro.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Maulik Shah <mkshah@codeaurora.org>
Cc: Lina Iyer <ilina@codeaurora.org>
Cc: Saravana Kannan <saravanak@google.com>
Cc: Todd Kjos <tkjos@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: linux-arm-msm@vger.kernel.org
Cc: iommu@lists.linux-foundation.org
Cc: linux-gpio@vger.kernel.org
Link: https://lore.kernel.org/r/20200710231824.60699-3-john.stultz@linaro.org
---
 kernel/irq/chip.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 41e7e37a0928..ba6ce66d7ed6 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1478,6 +1478,7 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(irq_chip_retrigger_hierarchy);
 
 /**
  * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt
@@ -1492,7 +1493,7 @@ int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
 
 	return -ENOSYS;
 }
-
+EXPORT_SYMBOL_GPL(irq_chip_set_vcpu_affinity_parent);
 /**
  * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt
  * @data:	Pointer to interrupt specific data
-- 
cgit v1.2.3


From ed00495333ccc80fc8fb86fb43773c3c2a499466 Mon Sep 17 00:00:00 2001
From: "peterz@infradead.org" <peterz@infradead.org>
Date: Mon, 27 Jul 2020 14:48:52 +0200
Subject: locking/lockdep: Fix TRACE_IRQFLAGS vs. NMIs

Prior to commit:

  859d069ee1dd ("lockdep: Prepare for NMI IRQ state tracking")

IRQ state tracking was disabled in NMIs due to nmi_enter()
doing lockdep_off() -- with the obvious requirement that NMI entry
call nmi_enter() before trace_hardirqs_off().

[ AFAICT, PowerPC and SH violate this order on their NMI entry ]

However, that commit explicitly changed lockdep_hardirqs_*() to ignore
lockdep_off() and breaks every architecture that has irq-tracing in
it's NMI entry that hasn't been fixed up (x86 being the only fixed one
at this point).

The reason for this change is that by ignoring lockdep_off() we can:

  - get rid of 'current->lockdep_recursion' in lockdep_assert_irqs*()
    which was going to to give header-recursion issues with the
    seqlock rework.

  - allow these lockdep_assert_*() macros to function in NMI context.

Restore the previous state of things and allow an architecture to
opt-in to the NMI IRQ tracking support, however instead of relying on
lockdep_off(), rely on in_nmi(), both are part of nmi_enter() and so
over-all entry ordering doesn't need to change.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20200727124852.GK119549@hirez.programming.kicks-ass.net
---
 arch/x86/Kconfig.debug   | 3 +++
 kernel/locking/lockdep.c | 8 +++++++-
 lib/Kconfig.debug        | 6 ++++++
 3 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 0dd319e6e5b4..ee1d3c5834c6 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -3,6 +3,9 @@
 config TRACE_IRQFLAGS_SUPPORT
 	def_bool y
 
+config TRACE_IRQFLAGS_NMI_SUPPORT
+	def_bool y
+
 config EARLY_PRINTK_USB
 	bool
 
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index d595623c4b34..8b0b28b4546b 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3712,6 +3712,9 @@ void noinstr lockdep_hardirqs_on(unsigned long ip)
 	 * and not rely on hardware state like normal interrupts.
 	 */
 	if (unlikely(in_nmi())) {
+		if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI))
+			return;
+
 		/*
 		 * Skip:
 		 *  - recursion check, because NMI can hit lockdep;
@@ -3773,7 +3776,10 @@ void noinstr lockdep_hardirqs_off(unsigned long ip)
 	 * they will restore the software state. This ensures the software
 	 * state is consistent inside NMIs as well.
 	 */
-	if (unlikely(!in_nmi() && (current->lockdep_recursion & LOCKDEP_RECURSION_MASK)))
+	if (in_nmi()) {
+		if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI))
+			return;
+	} else if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK)
 		return;
 
 	/*
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 9ad9210d70a1..fa964b51f066 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1325,11 +1325,17 @@ config WW_MUTEX_SELFTEST
 endmenu # lock debugging
 
 config TRACE_IRQFLAGS
+	depends on TRACE_IRQFLAGS_SUPPORT
 	bool
 	help
 	  Enables hooks to interrupt enabling and disabling for
 	  either tracing or lock debugging.
 
+config TRACE_IRQFLAGS_NMI
+	def_bool y
+	depends on TRACE_IRQFLAGS
+	depends on TRACE_IRQFLAGS_NMI_SUPPORT
+
 config STACKTRACE
 	bool "Stack backtrace support"
 	depends on STACKTRACE_SUPPORT
-- 
cgit v1.2.3


From f0c7baca180046824e07fc5f1326e83a8fd150c7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 24 Jul 2020 22:44:41 +0200
Subject: genirq/affinity: Make affinity setting if activated opt-in

John reported that on a RK3288 system the perf per CPU interrupts are all
affine to CPU0 and provided the analysis:

 "It looks like what happens is that because the interrupts are not per-CPU
  in the hardware, armpmu_request_irq() calls irq_force_affinity() while
  the interrupt is deactivated and then request_irq() with IRQF_PERCPU |
  IRQF_NOBALANCING.

  Now when irq_startup() runs with IRQ_STARTUP_NORMAL, it calls
  irq_setup_affinity() which returns early because IRQF_PERCPU and
  IRQF_NOBALANCING are set, leaving the interrupt on its original CPU."

This was broken by the recent commit which blocked interrupt affinity
setting in hardware before activation of the interrupt. While this works in
general, it does not work for this particular case. As contrary to the
initial analysis not all interrupt chip drivers implement an activate
callback, the safe cure is to make the deferred interrupt affinity setting
at activation time opt-in.

Implement the necessary core logic and make the two irqchip implementations
for which this is required opt-in. In hindsight this would have been the
right thing to do, but ...

Fixes: baedb87d1b53 ("genirq/affinity: Handle affinity setting on inactive interrupts correctly")
Reported-by: John Keeping <john@metanate.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Marc Zyngier <maz@kernel.org>
Acked-by: Marc Zyngier <maz@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/87blk4tzgm.fsf@nanos.tec.linutronix.de
---
 arch/x86/kernel/apic/vector.c    |  4 ++++
 drivers/irqchip/irq-gic-v3-its.c |  5 ++++-
 include/linux/irq.h              | 13 +++++++++++++
 kernel/irq/manage.c              |  6 +++++-
 4 files changed, 26 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 7649da2478d8..dae32d948bf2 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -560,6 +560,10 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
 		 * as that can corrupt the affinity move state.
 		 */
 		irqd_set_handle_enforce_irqctx(irqd);
+
+		/* Don't invoke affinity setter on deactivated interrupts */
+		irqd_set_affinity_on_activate(irqd);
+
 		/*
 		 * Legacy vectors are already assigned when the IOAPIC
 		 * takes them over. They stay on the same vector. This is
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index beac4caefad9..103d850b5595 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -3523,6 +3523,7 @@ static int its_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
 	msi_alloc_info_t *info = args;
 	struct its_device *its_dev = info->scratchpad[0].ptr;
 	struct its_node *its = its_dev->its;
+	struct irq_data *irqd;
 	irq_hw_number_t hwirq;
 	int err;
 	int i;
@@ -3542,7 +3543,9 @@ static int its_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
 
 		irq_domain_set_hwirq_and_chip(domain, virq + i,
 					      hwirq + i, &its_irq_chip, its_dev);
-		irqd_set_single_target(irq_desc_get_irq_data(irq_to_desc(virq + i)));
+		irqd = irq_get_irq_data(virq + i);
+		irqd_set_single_target(irqd);
+		irqd_set_affinity_on_activate(irqd);
 		pr_debug("ID:%d pID:%d vID:%d\n",
 			 (int)(hwirq + i - its_dev->event_map.lpi_base),
 			 (int)(hwirq + i), virq + i);
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 8d5bc2c237d7..1b7f4dfee35b 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -213,6 +213,8 @@ struct irq_data {
  *				  required
  * IRQD_HANDLE_ENFORCE_IRQCTX	- Enforce that handle_irq_*() is only invoked
  *				  from actual interrupt context.
+ * IRQD_AFFINITY_ON_ACTIVATE	- Affinity is set on activation. Don't call
+ *				  irq_chip::irq_set_affinity() when deactivated.
  */
 enum {
 	IRQD_TRIGGER_MASK		= 0xf,
@@ -237,6 +239,7 @@ enum {
 	IRQD_CAN_RESERVE		= (1 << 26),
 	IRQD_MSI_NOMASK_QUIRK		= (1 << 27),
 	IRQD_HANDLE_ENFORCE_IRQCTX	= (1 << 28),
+	IRQD_AFFINITY_ON_ACTIVATE	= (1 << 29),
 };
 
 #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
@@ -421,6 +424,16 @@ static inline bool irqd_msi_nomask_quirk(struct irq_data *d)
 	return __irqd_to_state(d) & IRQD_MSI_NOMASK_QUIRK;
 }
 
+static inline void irqd_set_affinity_on_activate(struct irq_data *d)
+{
+	__irqd_to_state(d) |= IRQD_AFFINITY_ON_ACTIVATE;
+}
+
+static inline bool irqd_affinity_on_activate(struct irq_data *d)
+{
+	return __irqd_to_state(d) & IRQD_AFFINITY_ON_ACTIVATE;
+}
+
 #undef __irqd_to_state
 
 static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 2a9fec53e159..48c38e09c673 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -320,12 +320,16 @@ static bool irq_set_affinity_deactivated(struct irq_data *data,
 	struct irq_desc *desc = irq_data_to_desc(data);
 
 	/*
+	 * Handle irq chips which can handle affinity only in activated
+	 * state correctly
+	 *
 	 * If the interrupt is not yet activated, just store the affinity
 	 * mask and do not call the chip driver at all. On activation the
 	 * driver has to make sure anyway that the interrupt is in a
 	 * useable state so startup works.
 	 */
-	if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || irqd_is_activated(data))
+	if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) ||
+	    irqd_is_activated(data) || !irqd_affinity_on_activate(data))
 		return false;
 
 	cpumask_copy(desc->irq_common_data.affinity, mask);
-- 
cgit v1.2.3


From aa251fc5b936d3ddb4b4c4b36427eb9aa3347c82 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Sat, 25 Jul 2020 13:30:55 +0100
Subject: genirq/debugfs: Add missing irqchip flags

Recently introduced irqchip flags lack the corresponding printouts in
debugfs. Add them.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/874kpvydxc.wl-maz@kernel.org
---
 kernel/irq/debugfs.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 4f9f844074db..b95ff5d5f4bd 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -112,6 +112,7 @@ static const struct irq_bit_descr irqdata_states[] = {
 	BIT_MASK_DESCR(IRQD_AFFINITY_SET),
 	BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING),
 	BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
+	BIT_MASK_DESCR(IRQD_AFFINITY_ON_ACTIVATE),
 	BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
 	BIT_MASK_DESCR(IRQD_CAN_RESERVE),
 	BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK),
@@ -120,6 +121,10 @@ static const struct irq_bit_descr irqdata_states[] = {
 
 	BIT_MASK_DESCR(IRQD_WAKEUP_STATE),
 	BIT_MASK_DESCR(IRQD_WAKEUP_ARMED),
+
+	BIT_MASK_DESCR(IRQD_DEFAULT_TRIGGER_SET),
+
+	BIT_MASK_DESCR(IRQD_HANDLE_ENFORCE_IRQCTX),
 };
 
 static const struct irq_bit_descr irqdesc_states[] = {
-- 
cgit v1.2.3


From b54cecf5e2293d15620f7b3f8d1bf486243d5643 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Sun, 7 Jun 2020 12:10:40 +0300
Subject: fsnotify: pass dir argument to handle_event() callback

The 'inode' argument to handle_event(), sometimes referred to as
'to_tell' is somewhat obsolete.
It is a remnant from the times when a group could only have an inode mark
associated with an event.

We now pass an iter_info array to the callback, with all marks associated
with an event.

Most backends ignore this argument, with two exceptions:
1. dnotify uses it for sanity check that event is on directory
2. fanotify uses it to report fid of directory on directory entry
   modification events

Remove the 'inode' argument and add a 'dir' argument.
The callback function signature is deliberately changed, because
the meaning of the argument has changed and the arguments have
been documented.

The 'dir' argument is set to when 'file_name' is specified and it is
referring to the directory that the 'file_name' entry belongs to.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/nfsd/filecache.c                  |  6 +++---
 fs/notify/dnotify/dnotify.c          |  8 ++++----
 fs/notify/fanotify/fanotify.c        | 23 +++++++++++------------
 fs/notify/fsnotify.c                 | 26 ++++++++++++--------------
 fs/notify/inotify/inotify.h          |  6 +++---
 fs/notify/inotify/inotify_fsnotify.c |  7 +++----
 fs/notify/inotify/inotify_user.c     |  4 ++--
 include/linux/fsnotify_backend.h     | 16 +++++++++++++---
 kernel/audit_fsnotify.c              | 10 +++++-----
 kernel/audit_tree.c                  |  6 +++---
 kernel/audit_watch.c                 |  6 +++---
 11 files changed, 62 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index ace8e5c30952..bbc7892d2928 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -598,9 +598,9 @@ static struct notifier_block nfsd_file_lease_notifier = {
 };
 
 static int
-nfsd_file_fsnotify_handle_event(struct fsnotify_group *group,
-				struct inode *to_tell,
-				u32 mask, const void *data, int data_type,
+nfsd_file_fsnotify_handle_event(struct fsnotify_group *group, u32 mask,
+				const void *data, int data_type,
+				struct inode *dir,
 				const struct qstr *file_name, u32 cookie,
 				struct fsnotify_iter_info *iter_info)
 {
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 7a42c2ebe28d..2d2eadfb5186 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -70,9 +70,9 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
  * destroy the dnotify struct if it was not registered to receive multiple
  * events.
  */
-static int dnotify_handle_event(struct fsnotify_group *group,
-				struct inode *inode,
-				u32 mask, const void *data, int data_type,
+static int dnotify_handle_event(struct fsnotify_group *group, u32 mask,
+				const void *data, int data_type,
+				struct inode *dir,
 				const struct qstr *file_name, u32 cookie,
 				struct fsnotify_iter_info *iter_info)
 {
@@ -84,7 +84,7 @@ static int dnotify_handle_event(struct fsnotify_group *group,
 	__u32 test_mask = mask & ~FS_EVENT_ON_CHILD;
 
 	/* not a dir, dnotify doesn't care */
-	if (!S_ISDIR(inode->i_mode))
+	if (!dir && !(mask & FS_ISDIR))
 		return 0;
 
 	if (WARN_ON(fsnotify_iter_vfsmount_mark(iter_info)))
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 41f5fc9a8f19..e417c64c365b 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -335,11 +335,11 @@ out:
  * FS_ATTRIB reports the child inode even if reported on a watched parent.
  * FS_CREATE reports the modified dir inode and not the created inode.
  */
-static struct inode *fanotify_fid_inode(struct inode *to_tell, u32 event_mask,
-					const void *data, int data_type)
+static struct inode *fanotify_fid_inode(u32 event_mask, const void *data,
+					int data_type, struct inode *dir)
 {
 	if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS)
-		return to_tell;
+		return dir;
 
 	return fsnotify_data_inode(data, data_type);
 }
@@ -416,14 +416,14 @@ static struct fanotify_event *fanotify_alloc_name_event(struct inode *id,
 }
 
 static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
-						   struct inode *inode, u32 mask,
-						   const void *data, int data_type,
+						   u32 mask, const void *data,
+						   int data_type, struct inode *dir,
 						   const struct qstr *file_name,
 						   __kernel_fsid_t *fsid)
 {
 	struct fanotify_event *event = NULL;
 	gfp_t gfp = GFP_KERNEL_ACCOUNT;
-	struct inode *id = fanotify_fid_inode(inode, mask, data, data_type);
+	struct inode *id = fanotify_fid_inode(mask, data, data_type, dir);
 	const struct path *path = fsnotify_data_path(data, data_type);
 
 	/*
@@ -507,9 +507,9 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
 	return fsid;
 }
 
-static int fanotify_handle_event(struct fsnotify_group *group,
-				 struct inode *inode,
-				 u32 mask, const void *data, int data_type,
+static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
+				 const void *data, int data_type,
+				 struct inode *dir,
 				 const struct qstr *file_name, u32 cookie,
 				 struct fsnotify_iter_info *iter_info)
 {
@@ -546,8 +546,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
 	if (!mask)
 		return 0;
 
-	pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
-		 mask);
+	pr_debug("%s: group=%p mask=%x\n", __func__, group, mask);
 
 	if (fanotify_is_perm_event(mask)) {
 		/*
@@ -565,7 +564,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
 			return 0;
 	}
 
-	event = fanotify_alloc_event(group, inode, mask, data, data_type,
+	event = fanotify_alloc_event(group, mask, data, data_type, dir,
 				     file_name, &fsid);
 	ret = -ENOMEM;
 	if (unlikely(!event)) {
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 30628a72ca01..c4ac4d13e10f 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -185,11 +185,9 @@ notify_child:
 }
 EXPORT_SYMBOL_GPL(__fsnotify_parent);
 
-static int send_to_group(struct inode *to_tell,
-			 __u32 mask, const void *data,
-			 int data_is, u32 cookie,
-			 const struct qstr *file_name,
-			 struct fsnotify_iter_info *iter_info)
+static int send_to_group(__u32 mask, const void *data, int data_type,
+			 struct inode *dir, const struct qstr *file_name,
+			 u32 cookie, struct fsnotify_iter_info *iter_info)
 {
 	struct fsnotify_group *group = NULL;
 	__u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS);
@@ -225,15 +223,14 @@ static int send_to_group(struct inode *to_tell,
 		}
 	}
 
-	pr_debug("%s: group=%p to_tell=%p mask=%x marks_mask=%x marks_ignored_mask=%x"
-		 " data=%p data_is=%d cookie=%d\n",
-		 __func__, group, to_tell, mask, marks_mask, marks_ignored_mask,
-		 data, data_is, cookie);
+	pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignored_mask=%x data=%p data_type=%d dir=%p cookie=%d\n",
+		 __func__, group, mask, marks_mask, marks_ignored_mask,
+		 data, data_type, dir, cookie);
 
 	if (!(test_mask & marks_mask & ~marks_ignored_mask))
 		return 0;
 
-	return group->ops->handle_event(group, to_tell, mask, data, data_is,
+	return group->ops->handle_event(group, mask, data, data_type, dir,
 					file_name, cookie, iter_info);
 }
 
@@ -317,12 +314,13 @@ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info)
  * out to all of the registered fsnotify_group.  Those groups can then use the
  * notification event in whatever means they feel necessary.
  */
-int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
+int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_type,
 	     const struct qstr *file_name, u32 cookie)
 {
-	const struct path *path = fsnotify_data_path(data, data_is);
+	const struct path *path = fsnotify_data_path(data, data_type);
 	struct fsnotify_iter_info iter_info = {};
 	struct super_block *sb = to_tell->i_sb;
+	struct inode *dir = file_name ? to_tell : NULL;
 	struct mount *mnt = NULL;
 	int ret = 0;
 	__u32 test_mask, marks_mask;
@@ -375,8 +373,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
 	 * That's why this traversal is so complicated...
 	 */
 	while (fsnotify_iter_select_report_types(&iter_info)) {
-		ret = send_to_group(to_tell, mask, data, data_is, cookie,
-				    file_name, &iter_info);
+		ret = send_to_group(mask, data, data_type, dir, file_name,
+				    cookie, &iter_info);
 
 		if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
 			goto out;
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index 3f246f7b8a92..4327d0e9c364 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -24,9 +24,9 @@ static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse)
 
 extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
 					   struct fsnotify_group *group);
-extern int inotify_handle_event(struct fsnotify_group *group,
-				struct inode *inode,
-				u32 mask, const void *data, int data_type,
+extern int inotify_handle_event(struct fsnotify_group *group, u32 mask,
+				const void *data, int data_type,
+				struct inode *dir,
 				const struct qstr *file_name, u32 cookie,
 				struct fsnotify_iter_info *iter_info);
 
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 9b481460a2dc..dfd455798a1b 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -55,9 +55,8 @@ static int inotify_merge(struct list_head *list,
 	return event_compare(last_event, event);
 }
 
-int inotify_handle_event(struct fsnotify_group *group,
-			 struct inode *inode,
-			 u32 mask, const void *data, int data_type,
+int inotify_handle_event(struct fsnotify_group *group, u32 mask,
+			 const void *data, int data_type, struct inode *dir,
 			 const struct qstr *file_name, u32 cookie,
 			 struct fsnotify_iter_info *iter_info)
 {
@@ -82,7 +81,7 @@ int inotify_handle_event(struct fsnotify_group *group,
 		alloc_len += len + 1;
 	}
 
-	pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
+	pr_debug("%s: group=%p mark=%p mask=%x\n", __func__, group, inode_mark,
 		 mask);
 
 	i_mark = container_of(inode_mark, struct inotify_inode_mark,
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index f88bbcc9efeb..5385d5817dd9 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -490,8 +490,8 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
 					   fsn_mark);
 
 	/* Queue ignore event for the watch */
-	inotify_handle_event(group, NULL, FS_IN_IGNORED, NULL,
-			     FSNOTIFY_EVENT_NONE, NULL, 0, &iter_info);
+	inotify_handle_event(group, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE,
+			     NULL, NULL, 0, &iter_info);
 
 	i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
 	/* remove this mark from the idr */
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 97300f3b8ff0..0de130cbf72d 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -108,6 +108,17 @@ struct mem_cgroup;
  * these operations for each relevant group.
  *
  * handle_event - main call for a group to handle an fs event
+ * @group:	group to notify
+ * @mask:	event type and flags
+ * @data:	object that event happened on
+ * @data_type:	type of object for fanotify_data_XXX() accessors
+ * @dir:	optional directory associated with event -
+ *		if @file_name is not NULL, this is the directory that
+ *		@file_name is relative to
+ * @file_name:	optional file name associated with event
+ * @cookie:	inotify rename cookie
+ * @iter_info:	array of marks from this group that are interested in the event
+ *
  * free_group_priv - called when a group refcnt hits 0 to clean up the private union
  * freeing_mark - called when a mark is being destroyed for some reason.  The group
  * 		MUST be holding a reference on each mark and that reference must be
@@ -115,9 +126,8 @@ struct mem_cgroup;
  * 		userspace messages that marks have been removed.
  */
 struct fsnotify_ops {
-	int (*handle_event)(struct fsnotify_group *group,
-			    struct inode *inode,
-			    u32 mask, const void *data, int data_type,
+	int (*handle_event)(struct fsnotify_group *group, u32 mask,
+			    const void *data, int data_type, struct inode *dir,
 			    const struct qstr *file_name, u32 cookie,
 			    struct fsnotify_iter_info *iter_info);
 	void (*free_group_priv)(struct fsnotify_group *group);
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 3596448bfdab..30ca239285a3 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -152,11 +152,11 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark)
 }
 
 /* Update mark data in audit rules based on fsnotify events. */
-static int audit_mark_handle_event(struct fsnotify_group *group,
-				    struct inode *to_tell,
-				    u32 mask, const void *data, int data_type,
-				    const struct qstr *dname, u32 cookie,
-				    struct fsnotify_iter_info *iter_info)
+static int audit_mark_handle_event(struct fsnotify_group *group, u32 mask,
+				   const void *data, int data_type,
+				   struct inode *dir,
+				   const struct qstr *dname, u32 cookie,
+				   struct fsnotify_iter_info *iter_info)
 {
 	struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
 	struct audit_fsnotify_mark *audit_mark;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e49c912f862d..2ce2ac1ce100 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1037,9 +1037,9 @@ static void evict_chunk(struct audit_chunk *chunk)
 		audit_schedule_prune();
 }
 
-static int audit_tree_handle_event(struct fsnotify_group *group,
-				   struct inode *to_tell,
-				   u32 mask, const void *data, int data_type,
+static int audit_tree_handle_event(struct fsnotify_group *group, u32 mask,
+				   const void *data, int data_type,
+				   struct inode *dir,
 				   const struct qstr *file_name, u32 cookie,
 				   struct fsnotify_iter_info *iter_info)
 {
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index e09c551ae52d..61fd601f1edf 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -464,9 +464,9 @@ void audit_remove_watch_rule(struct audit_krule *krule)
 }
 
 /* Update watch data in audit rules based on fsnotify events. */
-static int audit_watch_handle_event(struct fsnotify_group *group,
-				    struct inode *to_tell,
-				    u32 mask, const void *data, int data_type,
+static int audit_watch_handle_event(struct fsnotify_group *group, u32 mask,
+				    const void *data, int data_type,
+				    struct inode *dir,
 				    const struct qstr *dname, u32 cookie,
 				    struct fsnotify_iter_info *iter_info)
 {
-- 
cgit v1.2.3


From b4e9c9549f62329d2412f899635fddc5212b9cd4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 1 Jun 2020 19:42:40 -0400
Subject: introduction of regset ->get() wrappers, switching ELF coredumps to
 those

Two new helpers: given a process and regset, dump into a buffer.
regset_get() takes a buffer and size, regset_get_alloc() takes size
and allocates a buffer.

Return value in both cases is the amount of data actually dumped in
case of success or -E...  on error.

In both cases the size is capped by regset->n * regset->size, so
->get() is called with offset 0 and size no more than what regset
expects.

binfmt_elf.c callers of ->get() are switched to using those; the other
caller (copy_regset_to_user()) will need some preparations to switch.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/binfmt_elf.c        | 54 ++++++++++++++++++++++++--------------------------
 include/linux/regset.h |  9 +++++++++
 kernel/Makefile        |  2 +-
 kernel/regset.c        | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 90 insertions(+), 29 deletions(-)
 create mode 100644 kernel/regset.c

(limited to 'kernel')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 9fe3b51c116a..e922a6abdca8 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1821,7 +1821,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
 				 long signr, size_t *total)
 {
 	unsigned int i;
-	unsigned int regset0_size = regset_size(t->task, &view->regsets[0]);
+	int regset0_size;
 
 	/*
 	 * NT_PRSTATUS is the one special case, because the regset data
@@ -1830,8 +1830,10 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
 	 * We assume that regset 0 is NT_PRSTATUS.
 	 */
 	fill_prstatus(&t->prstatus, t->task, signr);
-	(void) view->regsets[0].get(t->task, &view->regsets[0], 0, regset0_size,
-				    &t->prstatus.pr_reg, NULL);
+	regset0_size = regset_get(t->task, &view->regsets[0],
+		   sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg);
+	if (regset0_size < 0)
+		return 0;
 
 	fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
 		  PRSTATUS_SIZE(t->prstatus, regset0_size), &t->prstatus);
@@ -1846,32 +1848,28 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
 	 */
 	for (i = 1; i < view->n; ++i) {
 		const struct user_regset *regset = &view->regsets[i];
+		int note_type = regset->core_note_type;
+		bool is_fpreg = note_type == NT_PRFPREG;
+		void *data;
+		int ret;
+
 		do_thread_regset_writeback(t->task, regset);
-		if (regset->core_note_type && regset->get &&
-		    (!regset->active || regset->active(t->task, regset) > 0)) {
-			int ret;
-			size_t size = regset_size(t->task, regset);
-			void *data = kzalloc(size, GFP_KERNEL);
-			if (unlikely(!data))
-				return 0;
-			ret = regset->get(t->task, regset,
-					  0, size, data, NULL);
-			if (unlikely(ret))
-				kfree(data);
-			else {
-				if (regset->core_note_type != NT_PRFPREG)
-					fill_note(&t->notes[i], "LINUX",
-						  regset->core_note_type,
-						  size, data);
-				else {
-					SET_PR_FPVALID(&t->prstatus,
-							1, regset0_size);
-					fill_note(&t->notes[i], "CORE",
-						  NT_PRFPREG, size, data);
-				}
-				*total += notesize(&t->notes[i]);
-			}
-		}
+		if (!note_type) // not for coredumps
+			continue;
+		if (regset->active && regset->active(t->task, regset) <= 0)
+			continue;
+
+		ret = regset_get_alloc(t->task, regset, ~0U, &data);
+		if (ret < 0)
+			continue;
+
+		if (is_fpreg)
+			SET_PR_FPVALID(&t->prstatus, 1, regset0_size);
+
+		fill_note(&t->notes[i], is_fpreg ? "CORE" : "LINUX",
+			  note_type, ret, data);
+
+		*total += notesize(&t->notes[i]);
 	}
 
 	return 1;
diff --git a/include/linux/regset.h b/include/linux/regset.h
index 46d6ae68c455..968a032922d5 100644
--- a/include/linux/regset.h
+++ b/include/linux/regset.h
@@ -353,6 +353,15 @@ static inline int user_regset_copyin_ignore(unsigned int *pos,
 	return 0;
 }
 
+extern int regset_get(struct task_struct *target,
+		      const struct user_regset *regset,
+		      unsigned int size, void *data);
+
+extern int regset_get_alloc(struct task_struct *target,
+			    const struct user_regset *regset,
+			    unsigned int size,
+			    void **data);
+
 /**
  * copy_regset_to_user - fetch a thread's user_regset data into user memory
  * @target:	thread to be examined
diff --git a/kernel/Makefile b/kernel/Makefile
index f3218bc5ec69..e6e03380a0f1 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y     = fork.o exec_domain.o panic.o \
 	    extable.o params.o \
 	    kthread.o sys_ni.o nsproxy.o \
 	    notifier.o ksysfs.o cred.o reboot.o \
-	    async.o range.o smpboot.o ucount.o
+	    async.o range.o smpboot.o ucount.o regset.o
 
 obj-$(CONFIG_MODULES) += kmod.o
 obj-$(CONFIG_MULTIUSER) += groups.o
diff --git a/kernel/regset.c b/kernel/regset.c
new file mode 100644
index 000000000000..6b39fa0993ec
--- /dev/null
+++ b/kernel/regset.c
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/regset.h>
+
+static int __regset_get(struct task_struct *target,
+			const struct user_regset *regset,
+			unsigned int size,
+			void **data)
+{
+	void *p = *data, *to_free = NULL;
+	int res;
+
+	if (!regset->get)
+		return -EOPNOTSUPP;
+	if (size > regset->n * regset->size)
+		size = regset->n * regset->size;
+	if (!p) {
+		to_free = p = kzalloc(size, GFP_KERNEL);
+		if (!p)
+			return -ENOMEM;
+	}
+	res = regset->get(target, regset, 0, size, p, NULL);
+	if (unlikely(res < 0)) {
+		kfree(to_free);
+		return res;
+	}
+	*data = p;
+	if (regset->get_size) { // arm64-only kludge, will go away
+		unsigned max_size = regset->get_size(target, regset);
+		if (size > max_size)
+			size = max_size;
+	}
+	return size;
+}
+
+int regset_get(struct task_struct *target,
+	       const struct user_regset *regset,
+	       unsigned int size,
+	       void *data)
+{
+	return __regset_get(target, regset, size, &data);
+}
+EXPORT_SYMBOL(regset_get);
+
+int regset_get_alloc(struct task_struct *target,
+		     const struct user_regset *regset,
+		     unsigned int size,
+		     void **data)
+{
+	*data = NULL;
+	return __regset_get(target, regset, size, data);
+}
+EXPORT_SYMBOL(regset_get_alloc);
-- 
cgit v1.2.3


From dc12d7968f9c9540494deb1285854b18ca4465ec Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 17 Feb 2020 12:25:14 -0500
Subject: copy_regset_to_user(): do all copyout at once.

Turn copy_regset_to_user() into regset_get_alloc() + copy_to_user().
Now all ->get() calls have a kernel buffer as destination.

Note that we'd already eliminated the callers of copy_regset_to_user()
with non-zero offset; now that argument is simply unused.

Uninlined, while we are at it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/regset.h | 29 ++++-------------------------
 kernel/regset.c        | 26 ++++++++++++++++++++++++++
 2 files changed, 30 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/regset.h b/include/linux/regset.h
index 968a032922d5..af57c1db1924 100644
--- a/include/linux/regset.h
+++ b/include/linux/regset.h
@@ -362,31 +362,10 @@ extern int regset_get_alloc(struct task_struct *target,
 			    unsigned int size,
 			    void **data);
 
-/**
- * copy_regset_to_user - fetch a thread's user_regset data into user memory
- * @target:	thread to be examined
- * @view:	&struct user_regset_view describing user thread machine state
- * @setno:	index in @view->regsets
- * @offset:	offset into the regset data, in bytes
- * @size:	amount of data to copy, in bytes
- * @data:	user-mode pointer to copy into
- */
-static inline int copy_regset_to_user(struct task_struct *target,
-				      const struct user_regset_view *view,
-				      unsigned int setno,
-				      unsigned int offset, unsigned int size,
-				      void __user *data)
-{
-	const struct user_regset *regset = &view->regsets[setno];
-
-	if (!regset->get)
-		return -EOPNOTSUPP;
-
-	if (!access_ok(data, size))
-		return -EFAULT;
-
-	return regset->get(target, regset, offset, size, NULL, data);
-}
+extern int copy_regset_to_user(struct task_struct *target,
+			       const struct user_regset_view *view,
+			       unsigned int setno, unsigned int offset,
+			       unsigned int size, void __user *data);
 
 /**
  * copy_regset_from_user - store into thread's user_regset data from user memory
diff --git a/kernel/regset.c b/kernel/regset.c
index 6b39fa0993ec..0a610983ce43 100644
--- a/kernel/regset.c
+++ b/kernel/regset.c
@@ -52,3 +52,29 @@ int regset_get_alloc(struct task_struct *target,
 	return __regset_get(target, regset, size, data);
 }
 EXPORT_SYMBOL(regset_get_alloc);
+
+/**
+ * copy_regset_to_user - fetch a thread's user_regset data into user memory
+ * @target:	thread to be examined
+ * @view:	&struct user_regset_view describing user thread machine state
+ * @setno:	index in @view->regsets
+ * @offset:	offset into the regset data, in bytes
+ * @size:	amount of data to copy, in bytes
+ * @data:	user-mode pointer to copy into
+ */
+int copy_regset_to_user(struct task_struct *target,
+			const struct user_regset_view *view,
+			unsigned int setno,
+			unsigned int offset, unsigned int size,
+			void __user *data)
+{
+	const struct user_regset *regset = &view->regsets[setno];
+	void *buf;
+	int ret;
+
+	ret = regset_get_alloc(target, regset, size, &buf);
+	if (ret > 0)
+		ret = copy_to_user(data, buf, ret) ? -EFAULT : 0;
+	kfree(buf);
+	return ret;
+}
-- 
cgit v1.2.3


From 7717cb9bdd0421faa432a4e0d499fdba6e2394c8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 20 Feb 2020 20:48:16 -0500
Subject: regset: new method and helpers for it

->regset_get() takes task+regset+buffer, returns the amount of free space
left in the buffer on success and -E... on error.

buffer is represented as struct membuf - a pair of (kernel) pointer
and amount of space left

Primitives for writing to such:
	* membuf_write(buf, data, size)
	* membuf_zero(buf, size)
	* membuf_store(buf, value)

These are implemented as inlines (in case of membuf_store - a macro).
All writes are sequential; they become no-ops when there's no space
left.  Return value of all primitives is the amount of space left
after the operation, so they can be used as return values of ->regset_get().

Example of use:

// stores pt_regs of task + 64 bytes worth of zeroes + 32bit PID of task
int foo_get(struct task_struct *task, const struct regset *regset,
	    struct membuf to)
{
	membuf_write(&to, task_pt_regs(task), sizeof(struct pt_regs));
	membuf_zero(&to, 64);
	return membuf_store(&to, (u32)task_tgid_vnr(task));
}

regset_get()/regset_get_alloc() taught to use that thing if present.
By the end of the series all users of ->get() will be converted;
then ->get() and ->get_size() can go.

Note that unlike ->get() this thing always starts at offset 0 and,
since it only writes to kernel buffer, can't fail on copyout.
It can, of course, fail for other reasons, but those tend to
be less numerous.

The caller guarantees that the buffer size won't be bigger than
regset->n * regset->size.  That simplifies life for quite a few
instances.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/regset.h | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/regset.c        | 12 +++++++++++-
 2 files changed, 62 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/regset.h b/include/linux/regset.h
index af57c1db1924..f6125a7d949d 100644
--- a/include/linux/regset.h
+++ b/include/linux/regset.h
@@ -17,6 +17,52 @@
 struct task_struct;
 struct user_regset;
 
+struct membuf {
+	void *p;
+	size_t left;
+};
+
+static inline int membuf_zero(struct membuf *s, size_t size)
+{
+	if (s->left) {
+		if (size > s->left)
+			size = s->left;
+		memset(s->p, 0, size);
+		s->p += size;
+		s->left -= size;
+	}
+	return s->left;
+}
+
+static inline int membuf_write(struct membuf *s, const void *v, size_t size)
+{
+	if (s->left) {
+		if (size > s->left)
+			size = s->left;
+		memcpy(s->p, v, size);
+		s->p += size;
+		s->left -= size;
+	}
+	return s->left;
+}
+
+/* current s->p must be aligned for v; v must be a scalar */
+#define membuf_store(s, v)				\
+({							\
+	struct membuf *__s = (s);			\
+        if (__s->left) {				\
+		typeof(v) __v = (v);			\
+		size_t __size = sizeof(__v);		\
+		if (unlikely(__size > __s->left)) {	\
+			__size = __s->left;		\
+			memcpy(__s->p, &__v, __size);	\
+		} else {				\
+			*(typeof(__v + 0) *)__s->p = __v;	\
+		}					\
+		__s->p += __size;			\
+		__s->left -= __size;			\
+	}						\
+	__s->left;})
 
 /**
  * user_regset_active_fn - type of @active function in &struct user_regset
@@ -57,6 +103,10 @@ typedef int user_regset_get_fn(struct task_struct *target,
 			       unsigned int pos, unsigned int count,
 			       void *kbuf, void __user *ubuf);
 
+typedef int user_regset_get2_fn(struct task_struct *target,
+			       const struct user_regset *regset,
+			       struct membuf to);
+
 /**
  * user_regset_set_fn - type of @set function in &struct user_regset
  * @target:	thread being examined
@@ -186,6 +236,7 @@ typedef unsigned int user_regset_get_size_fn(struct task_struct *target,
  */
 struct user_regset {
 	user_regset_get_fn		*get;
+	user_regset_get2_fn		*regset_get;
 	user_regset_set_fn		*set;
 	user_regset_active_fn		*active;
 	user_regset_writeback_fn	*writeback;
diff --git a/kernel/regset.c b/kernel/regset.c
index 0a610983ce43..eaeaefbbd39e 100644
--- a/kernel/regset.c
+++ b/kernel/regset.c
@@ -11,7 +11,7 @@ static int __regset_get(struct task_struct *target,
 	void *p = *data, *to_free = NULL;
 	int res;
 
-	if (!regset->get)
+	if (!regset->get && !regset->regset_get)
 		return -EOPNOTSUPP;
 	if (size > regset->n * regset->size)
 		size = regset->n * regset->size;
@@ -20,6 +20,16 @@ static int __regset_get(struct task_struct *target,
 		if (!p)
 			return -ENOMEM;
 	}
+	if (regset->regset_get) {
+		res = regset->regset_get(target, regset,
+				   (struct membuf){.p = p, .left = size});
+		if (res < 0) {
+			kfree(to_free);
+			return res;
+		}
+		*data = p;
+		return size - res;
+	}
 	res = regset->get(target, regset, 0, size, p, NULL);
 	if (unlikely(res < 0)) {
 		kfree(to_free);
-- 
cgit v1.2.3


From 1e6986c9db21265bac1435a344b4446c51a3f4d8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 16 Jun 2020 15:34:20 -0400
Subject: regset: kill ->get()

no instances left

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/regset.h | 22 ----------------------
 kernel/regset.c        | 24 +++++-------------------
 2 files changed, 5 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/regset.h b/include/linux/regset.h
index f6125a7d949d..2a4a555b1617 100644
--- a/include/linux/regset.h
+++ b/include/linux/regset.h
@@ -82,27 +82,6 @@ static inline int membuf_write(struct membuf *s, const void *v, size_t size)
 typedef int user_regset_active_fn(struct task_struct *target,
 				  const struct user_regset *regset);
 
-/**
- * user_regset_get_fn - type of @get function in &struct user_regset
- * @target:	thread being examined
- * @regset:	regset being examined
- * @pos:	offset into the regset data to access, in bytes
- * @count:	amount of data to copy, in bytes
- * @kbuf:	if not %NULL, a kernel-space pointer to copy into
- * @ubuf:	if @kbuf is %NULL, a user-space pointer to copy into
- *
- * Fetch register values.  Return %0 on success; -%EIO or -%ENODEV
- * are usual failure returns.  The @pos and @count values are in
- * bytes, but must be properly aligned.  If @kbuf is non-null, that
- * buffer is used and @ubuf is ignored.  If @kbuf is %NULL, then
- * ubuf gives a userland pointer to access directly, and an -%EFAULT
- * return value is possible.
- */
-typedef int user_regset_get_fn(struct task_struct *target,
-			       const struct user_regset *regset,
-			       unsigned int pos, unsigned int count,
-			       void *kbuf, void __user *ubuf);
-
 typedef int user_regset_get2_fn(struct task_struct *target,
 			       const struct user_regset *regset,
 			       struct membuf to);
@@ -235,7 +214,6 @@ typedef unsigned int user_regset_get_size_fn(struct task_struct *target,
  * omitted when there is an @active function and it returns zero.
  */
 struct user_regset {
-	user_regset_get_fn		*get;
 	user_regset_get2_fn		*regset_get;
 	user_regset_set_fn		*set;
 	user_regset_active_fn		*active;
diff --git a/kernel/regset.c b/kernel/regset.c
index eaeaefbbd39e..586823786f39 100644
--- a/kernel/regset.c
+++ b/kernel/regset.c
@@ -11,7 +11,7 @@ static int __regset_get(struct task_struct *target,
 	void *p = *data, *to_free = NULL;
 	int res;
 
-	if (!regset->get && !regset->regset_get)
+	if (!regset->regset_get)
 		return -EOPNOTSUPP;
 	if (size > regset->n * regset->size)
 		size = regset->n * regset->size;
@@ -20,28 +20,14 @@ static int __regset_get(struct task_struct *target,
 		if (!p)
 			return -ENOMEM;
 	}
-	if (regset->regset_get) {
-		res = regset->regset_get(target, regset,
-				   (struct membuf){.p = p, .left = size});
-		if (res < 0) {
-			kfree(to_free);
-			return res;
-		}
-		*data = p;
-		return size - res;
-	}
-	res = regset->get(target, regset, 0, size, p, NULL);
-	if (unlikely(res < 0)) {
+	res = regset->regset_get(target, regset,
+			   (struct membuf){.p = p, .left = size});
+	if (res < 0) {
 		kfree(to_free);
 		return res;
 	}
 	*data = p;
-	if (regset->get_size) { // arm64-only kludge, will go away
-		unsigned max_size = regset->get_size(target, regset);
-		if (size > max_size)
-			size = max_size;
-	}
-	return size;
+	return size - res;
 }
 
 int regset_get(struct task_struct *target,
-- 
cgit v1.2.3


From 82ace1efb3cb1d49a1681cc6e31156047d5ae1f2 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Wed, 22 Jul 2020 15:58:44 +0300
Subject: fsnotify: create helper fsnotify_inode()

Simple helper to consolidate biolerplate code.

Link: https://lore.kernel.org/r/20200722125849.17418-5-amir73il@gmail.com
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/kernfs/file.c         |  6 ++----
 fs/notify/fsnotify.c     |  2 +-
 include/linux/fsnotify.h | 26 +++++++++++---------------
 kernel/trace/trace.c     |  3 +--
 4 files changed, 15 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 5b1468bc509e..1d185bffc52f 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -910,10 +910,8 @@ repeat:
 			kernfs_put(parent);
 		}
 
-		if (!p_inode) {
-			fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE,
-				 NULL, 0);
-		}
+		if (!p_inode)
+			fsnotify_inode(inode, FS_MODIFY);
 
 		iput(inode);
 	}
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 0b0e01e04349..ba172b742de5 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -74,7 +74,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
 			iput(iput_inode);
 
 		/* for each watch, send FS_UNMOUNT and then remove it */
-		fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+		fsnotify_inode(inode, FS_UNMOUNT);
 
 		fsnotify_inode_delete(inode);
 
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 9b2566d273a9..f9db7c9b3ef1 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -38,6 +38,14 @@ static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry,
 	fsnotify_name(dir, mask, d_inode(dentry), &dentry->d_name, 0);
 }
 
+static inline void fsnotify_inode(struct inode *inode, __u32 mask)
+{
+	if (S_ISDIR(inode->i_mode))
+		mask |= FS_ISDIR;
+
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+}
+
 /* Notify this dentry's parent about a child's events. */
 static inline int fsnotify_parent(struct dentry *dentry, __u32 mask,
 				  const void *data, int data_type)
@@ -105,12 +113,7 @@ static inline int fsnotify_perm(struct file *file, int mask)
  */
 static inline void fsnotify_link_count(struct inode *inode)
 {
-	__u32 mask = FS_ATTRIB;
-
-	if (S_ISDIR(inode->i_mode))
-		mask |= FS_ISDIR;
-
-	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+	fsnotify_inode(inode, FS_ATTRIB);
 }
 
 /*
@@ -125,7 +128,6 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 	u32 fs_cookie = fsnotify_get_cookie();
 	__u32 old_dir_mask = FS_MOVED_FROM;
 	__u32 new_dir_mask = FS_MOVED_TO;
-	__u32 mask = FS_MOVE_SELF;
 	const struct qstr *new_name = &moved->d_name;
 
 	if (old_dir == new_dir)
@@ -134,7 +136,6 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 	if (isdir) {
 		old_dir_mask |= FS_ISDIR;
 		new_dir_mask |= FS_ISDIR;
-		mask |= FS_ISDIR;
 	}
 
 	fsnotify_name(old_dir, old_dir_mask, source, old_name, fs_cookie);
@@ -144,7 +145,7 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 		fsnotify_link_count(target);
 
 	if (source)
-		fsnotify(source, mask, source, FSNOTIFY_EVENT_INODE, NULL, 0);
+		fsnotify_inode(source, FS_MOVE_SELF);
 	audit_inode_child(new_dir, moved, AUDIT_TYPE_CHILD_CREATE);
 }
 
@@ -169,12 +170,7 @@ static inline void fsnotify_vfsmount_delete(struct vfsmount *mnt)
  */
 static inline void fsnotify_inoderemove(struct inode *inode)
 {
-	__u32 mask = FS_DELETE_SELF;
-
-	if (S_ISDIR(inode->i_mode))
-		mask |= FS_ISDIR;
-
-	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+	fsnotify_inode(inode, FS_DELETE_SELF);
 	__fsnotify_inode_delete(inode);
 }
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bb62269724d5..0c655c039506 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1543,8 +1543,7 @@ static void latency_fsnotify_workfn(struct work_struct *work)
 {
 	struct trace_array *tr = container_of(work, struct trace_array,
 					      fsnotify_work);
-	fsnotify(tr->d_max_latency->d_inode, FS_MODIFY,
-		 tr->d_max_latency->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+	fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY);
 }
 
 static void latency_fsnotify_workfn_irq(struct irq_work *iwork)
-- 
cgit v1.2.3


From 7dbe6080167860df0bf8627e55fd5154f366cc7a Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Wed, 22 Jul 2020 15:58:43 +0300
Subject: audit: do not set FS_EVENT_ON_CHILD in audit marks mask

The audit group marks mask does not contain any events possible on
a child so setting the flag FS_EVENT_ON_CHILD in the mask is counter
productive.

It may lead to the undesired outcome of setting the dentry flag
DCACHE_FSNOTIFY_PARENT_WATCHED on a directory inode even though it is
not watching children, because the audit mark contribute the flag
FS_EVENT_ON_CHILD to the inode's fsnotify_mask and another mark could
be contributing an event that is possible on child to the inode's mask.

Furthermore in the following patches we want to use FS_EVENT_ON_CHILD
for non-dir inodes for other purposes so stop using the flag.

Link: https://lore.kernel.org/r/20200722125849.17418-4-amir73il@gmail.com
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 kernel/audit_fsnotify.c | 2 +-
 kernel/audit_watch.c    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 30ca239285a3..bd3a6b79316a 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -36,7 +36,7 @@ static struct fsnotify_group *audit_fsnotify_group;
 
 /* fsnotify events we care about. */
 #define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
-			 FS_MOVE_SELF | FS_EVENT_ON_CHILD)
+			 FS_MOVE_SELF)
 
 static void audit_fsnotify_mark_free(struct audit_fsnotify_mark *audit_mark)
 {
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 61fd601f1edf..e23d54bcc587 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -53,7 +53,7 @@ static struct fsnotify_group *audit_watch_group;
 
 /* fsnotify events we care about. */
 #define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
-			FS_MOVE_SELF | FS_EVENT_ON_CHILD | FS_UNMOUNT)
+			FS_MOVE_SELF | FS_UNMOUNT)
 
 static void audit_free_parent(struct audit_parent *parent)
 {
-- 
cgit v1.2.3


From b9a1b9772509cbc6f6aa8bcd0b019f6347a2b631 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Wed, 22 Jul 2020 15:58:48 +0300
Subject: fsnotify: create method handle_inode_event() in fsnotify_operations

The method handle_event() grew a lot of complexity due to the design of
fanotify and merging of ignore masks.

Most backends do not care about this complex functionality, so we can hide
this complexity from them.

Introduce a method handle_inode_event() that serves those backends and
passes a single inode mark and less arguments.

This change converts all backends except fanotify and inotify to use the
simplified handle_inode_event() method.  In pricipal, inotify could have
also used the new method, but that would require passing more arguments
on the simple helper (data, data_type, cookie), so we leave it with the
handle_event() method.

Link: https://lore.kernel.org/r/20200722125849.17418-9-amir73il@gmail.com
Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/nfsd/filecache.c              | 12 ++++------
 fs/notify/dnotify/dnotify.c      | 38 +++++++----------------------
 fs/notify/fsnotify.c             | 52 ++++++++++++++++++++++++++++++++++++++--
 include/linux/fsnotify_backend.h | 19 ++++++++++++---
 kernel/audit_fsnotify.c          | 20 +++++++---------
 kernel/audit_tree.c              | 10 ++++----
 kernel/audit_watch.c             | 17 ++++++-------
 7 files changed, 97 insertions(+), 71 deletions(-)

(limited to 'kernel')

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index bbc7892d2928..c8b9d2667ee6 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -598,14 +598,10 @@ static struct notifier_block nfsd_file_lease_notifier = {
 };
 
 static int
-nfsd_file_fsnotify_handle_event(struct fsnotify_group *group, u32 mask,
-				const void *data, int data_type,
-				struct inode *dir,
-				const struct qstr *file_name, u32 cookie,
-				struct fsnotify_iter_info *iter_info)
+nfsd_file_fsnotify_handle_event(struct fsnotify_mark *mark, u32 mask,
+				struct inode *inode, struct inode *dir,
+				const struct qstr *name)
 {
-	struct inode *inode = fsnotify_data_inode(data, data_type);
-
 	trace_nfsd_file_fsnotify_handle_event(inode, mask);
 
 	/* Should be no marks on non-regular files */
@@ -626,7 +622,7 @@ nfsd_file_fsnotify_handle_event(struct fsnotify_group *group, u32 mask,
 
 
 static const struct fsnotify_ops nfsd_file_fsnotify_ops = {
-	.handle_event = nfsd_file_fsnotify_handle_event,
+	.handle_inode_event = nfsd_file_fsnotify_handle_event,
 	.free_mark = nfsd_file_mark_free,
 };
 
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index ca78d3f78da8..5dcda8f20c04 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -70,8 +70,9 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
  * destroy the dnotify struct if it was not registered to receive multiple
  * events.
  */
-static void dnotify_one_event(struct fsnotify_group *group, u32 mask,
-			      struct fsnotify_mark *inode_mark)
+static int dnotify_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
+				struct inode *inode, struct inode *dir,
+				const struct qstr *name)
 {
 	struct dnotify_mark *dn_mark;
 	struct dnotify_struct *dn;
@@ -79,6 +80,10 @@ static void dnotify_one_event(struct fsnotify_group *group, u32 mask,
 	struct fown_struct *fown;
 	__u32 test_mask = mask & ~FS_EVENT_ON_CHILD;
 
+	/* not a dir, dnotify doesn't care */
+	if (!dir && !(mask & FS_ISDIR))
+		return 0;
+
 	dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark);
 
 	spin_lock(&inode_mark->lock);
@@ -100,33 +105,6 @@ static void dnotify_one_event(struct fsnotify_group *group, u32 mask,
 	}
 
 	spin_unlock(&inode_mark->lock);
-}
-
-static int dnotify_handle_event(struct fsnotify_group *group, u32 mask,
-				const void *data, int data_type,
-				struct inode *dir,
-				const struct qstr *file_name, u32 cookie,
-				struct fsnotify_iter_info *iter_info)
-{
-	struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
-	struct fsnotify_mark *child_mark = fsnotify_iter_child_mark(iter_info);
-
-	/* not a dir, dnotify doesn't care */
-	if (!dir && !(mask & FS_ISDIR))
-		return 0;
-
-	if (WARN_ON(fsnotify_iter_vfsmount_mark(iter_info)))
-		return 0;
-
-	/*
-	 * Some events can be sent on both parent dir and subdir marks
-	 * (e.g. DN_ATTRIB).  If both parent dir and subdir are watching,
-	 * report the event once to parent dir and once to subdir.
-	 */
-	if (inode_mark)
-		dnotify_one_event(group, mask, inode_mark);
-	if (child_mark)
-		dnotify_one_event(group, mask, child_mark);
 
 	return 0;
 }
@@ -143,7 +121,7 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
 }
 
 static const struct fsnotify_ops dnotify_fsnotify_ops = {
-	.handle_event = dnotify_handle_event,
+	.handle_inode_event = dnotify_handle_event,
 	.free_mark = dnotify_free_mark,
 };
 
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 494d5d70323f..a960ec3a569a 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -230,6 +230,49 @@ notify:
 }
 EXPORT_SYMBOL_GPL(__fsnotify_parent);
 
+static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask,
+				 const void *data, int data_type,
+				 struct inode *dir, const struct qstr *name,
+				 u32 cookie, struct fsnotify_iter_info *iter_info)
+{
+	struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
+	struct fsnotify_mark *child_mark = fsnotify_iter_child_mark(iter_info);
+	struct inode *inode = fsnotify_data_inode(data, data_type);
+	const struct fsnotify_ops *ops = group->ops;
+	int ret;
+
+	if (WARN_ON_ONCE(!ops->handle_inode_event))
+		return 0;
+
+	if (WARN_ON_ONCE(fsnotify_iter_sb_mark(iter_info)) ||
+	    WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info)))
+		return 0;
+
+	/*
+	 * An event can be sent on child mark iterator instead of inode mark
+	 * iterator because of other groups that have interest of this inode
+	 * and have marks on both parent and child.  We can simplify this case.
+	 */
+	if (!inode_mark) {
+		inode_mark = child_mark;
+		child_mark = NULL;
+		dir = NULL;
+		name = NULL;
+	}
+
+	ret = ops->handle_inode_event(inode_mark, mask, inode, dir, name);
+	if (ret || !child_mark)
+		return ret;
+
+	/*
+	 * Some events can be sent on both parent dir and child marks
+	 * (e.g. FS_ATTRIB).  If both parent dir and child are watching,
+	 * report the event once to parent dir with name and once to child
+	 * without name.
+	 */
+	return ops->handle_inode_event(child_mark, mask, inode, NULL, NULL);
+}
+
 static int send_to_group(__u32 mask, const void *data, int data_type,
 			 struct inode *dir, const struct qstr *file_name,
 			 u32 cookie, struct fsnotify_iter_info *iter_info)
@@ -275,8 +318,13 @@ static int send_to_group(__u32 mask, const void *data, int data_type,
 	if (!(test_mask & marks_mask & ~marks_ignored_mask))
 		return 0;
 
-	return group->ops->handle_event(group, mask, data, data_type, dir,
-					file_name, cookie, iter_info);
+	if (group->ops->handle_event) {
+		return group->ops->handle_event(group, mask, data, data_type, dir,
+						file_name, cookie, iter_info);
+	}
+
+	return fsnotify_handle_event(group, mask, data, data_type, dir,
+				     file_name, cookie, iter_info);
 }
 
 static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector **connp)
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 32104cfc27a5..f8529a3a2923 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -128,17 +128,30 @@ struct mem_cgroup;
  * @cookie:	inotify rename cookie
  * @iter_info:	array of marks from this group that are interested in the event
  *
+ * handle_inode_event - simple variant of handle_event() for groups that only
+ *		have inode marks and don't have ignore mask
+ * @mark:	mark to notify
+ * @mask:	event type and flags
+ * @inode:	inode that event happened on
+ * @dir:	optional directory associated with event -
+ *		if @file_name is not NULL, this is the directory that
+ *		@file_name is relative to.
+ * @file_name:	optional file name associated with event
+ *
  * free_group_priv - called when a group refcnt hits 0 to clean up the private union
  * freeing_mark - called when a mark is being destroyed for some reason.  The group
- * 		MUST be holding a reference on each mark and that reference must be
- * 		dropped in this function.  inotify uses this function to send
- * 		userspace messages that marks have been removed.
+ *		MUST be holding a reference on each mark and that reference must be
+ *		dropped in this function.  inotify uses this function to send
+ *		userspace messages that marks have been removed.
  */
 struct fsnotify_ops {
 	int (*handle_event)(struct fsnotify_group *group, u32 mask,
 			    const void *data, int data_type, struct inode *dir,
 			    const struct qstr *file_name, u32 cookie,
 			    struct fsnotify_iter_info *iter_info);
+	int (*handle_inode_event)(struct fsnotify_mark *mark, u32 mask,
+			    struct inode *inode, struct inode *dir,
+			    const struct qstr *file_name);
 	void (*free_group_priv)(struct fsnotify_group *group);
 	void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group);
 	void (*free_event)(struct fsnotify_event *event);
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index bd3a6b79316a..bfcfcd61adb6 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -152,35 +152,31 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark)
 }
 
 /* Update mark data in audit rules based on fsnotify events. */
-static int audit_mark_handle_event(struct fsnotify_group *group, u32 mask,
-				   const void *data, int data_type,
-				   struct inode *dir,
-				   const struct qstr *dname, u32 cookie,
-				   struct fsnotify_iter_info *iter_info)
+static int audit_mark_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
+				   struct inode *inode, struct inode *dir,
+				   const struct qstr *dname)
 {
-	struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
 	struct audit_fsnotify_mark *audit_mark;
-	const struct inode *inode = fsnotify_data_inode(data, data_type);
 
 	audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark);
 
-	BUG_ON(group != audit_fsnotify_group);
-
-	if (WARN_ON(!inode))
+	if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group) ||
+	    WARN_ON_ONCE(!inode))
 		return 0;
 
 	if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {
 		if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL))
 			return 0;
 		audit_update_mark(audit_mark, inode);
-	} else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
+	} else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) {
 		audit_autoremove_mark_rule(audit_mark);
+	}
 
 	return 0;
 }
 
 static const struct fsnotify_ops audit_mark_fsnotify_ops = {
-	.handle_event =	audit_mark_handle_event,
+	.handle_inode_event = audit_mark_handle_event,
 	.free_mark = audit_fsnotify_free_mark,
 };
 
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 2ce2ac1ce100..025d24abf15d 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1037,11 +1037,9 @@ static void evict_chunk(struct audit_chunk *chunk)
 		audit_schedule_prune();
 }
 
-static int audit_tree_handle_event(struct fsnotify_group *group, u32 mask,
-				   const void *data, int data_type,
-				   struct inode *dir,
-				   const struct qstr *file_name, u32 cookie,
-				   struct fsnotify_iter_info *iter_info)
+static int audit_tree_handle_event(struct fsnotify_mark *mark, u32 mask,
+				   struct inode *inode, struct inode *dir,
+				   const struct qstr *file_name)
 {
 	return 0;
 }
@@ -1070,7 +1068,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *mark,
 }
 
 static const struct fsnotify_ops audit_tree_ops = {
-	.handle_event = audit_tree_handle_event,
+	.handle_inode_event = audit_tree_handle_event,
 	.freeing_mark = audit_tree_freeing_mark,
 	.free_mark = audit_tree_destroy_watch,
 };
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index e23d54bcc587..246e5ba704c0 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -464,20 +464,17 @@ void audit_remove_watch_rule(struct audit_krule *krule)
 }
 
 /* Update watch data in audit rules based on fsnotify events. */
-static int audit_watch_handle_event(struct fsnotify_group *group, u32 mask,
-				    const void *data, int data_type,
-				    struct inode *dir,
-				    const struct qstr *dname, u32 cookie,
-				    struct fsnotify_iter_info *iter_info)
+static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
+				    struct inode *inode, struct inode *dir,
+				    const struct qstr *dname)
 {
-	struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
-	const struct inode *inode = fsnotify_data_inode(data, data_type);
 	struct audit_parent *parent;
 
 	parent = container_of(inode_mark, struct audit_parent, mark);
 
-	BUG_ON(group != audit_watch_group);
-	WARN_ON(!inode);
+	if (WARN_ON_ONCE(inode_mark->group != audit_watch_group) ||
+	    WARN_ON_ONCE(!inode))
+		return 0;
 
 	if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
 		audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
@@ -490,7 +487,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, u32 mask,
 }
 
 static const struct fsnotify_ops audit_watch_fsnotify_ops = {
-	.handle_event = 	audit_watch_handle_event,
+	.handle_inode_event =	audit_watch_handle_event,
 	.free_mark =		audit_watch_free_mark,
 };
 
-- 
cgit v1.2.3


From f6dfbe31e8fa5cbd5bc89df9d7f0fa0af7e69981 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Mon, 27 Jul 2020 18:54:11 +0100
Subject: bpf: Fix swapped arguments in calls to check_buffer_access

There are a couple of arguments of the boolean flag zero_size_allowed and
the char pointer buf_info when calling to function check_buffer_access that
are swapped by mistake. Fix these by swapping them to correct the argument
ordering.

Fixes: afbf21dce668 ("bpf: Support readonly/readwrite buffers in verifier")
Addresses-Coverity: ("Array compared to 0")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20200727175411.155179-1-colin.king@canonical.com
---
 kernel/bpf/verifier.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cd14e70f2d07..88bb25d08bf8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3477,14 +3477,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 				regno, reg_type_str[reg->type]);
 			return -EACCES;
 		}
-		err = check_buffer_access(env, reg, regno, off, size, "rdonly",
-					  false,
+		err = check_buffer_access(env, reg, regno, off, size, false,
+					  "rdonly",
 					  &env->prog->aux->max_rdonly_access);
 		if (!err && value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
 	} else if (reg->type == PTR_TO_RDWR_BUF) {
-		err = check_buffer_access(env, reg, regno, off, size, "rdwr",
-					  false,
+		err = check_buffer_access(env, reg, regno, off, size, false,
+					  "rdwr",
 					  &env->prog->aux->max_rdwr_access);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
-- 
cgit v1.2.3


From 112a0e4171e111e963aada3fe790c71accf4d705 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 28 Jul 2020 16:34:00 +0900
Subject: kprobes: Remove unnecessary module_mutex locking from
 kprobe_optimizer()

Since we already lock both kprobe_mutex and text_mutex in the optimizer,
text will not be changed and the module unloading will be stopped
inside kprobes_module_callback().

The mutex_lock() has originally been introduced to avoid conflict with text modification,
at that point we didn't hold text_mutex.

But after:

  f1c6ece23729 ("kprobes: Fix potential deadlock in kprobe_optimizer()")

We started holding the text_mutex and don't need the modules mutex anyway.

So remove the module_mutex locking.

[ mingo: Amended the changelog. ]

Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Link: https://lore.kernel.org/r/20200728163400.e00b09c594763349f99ce6cb@kernel.org
---
 kernel/kprobes.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 146c648eb943..e87679a48ba2 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -598,8 +598,6 @@ static void kprobe_optimizer(struct work_struct *work)
 	mutex_lock(&kprobe_mutex);
 	cpus_read_lock();
 	mutex_lock(&text_mutex);
-	/* Lock modules while optimizing kprobes */
-	mutex_lock(&module_mutex);
 
 	/*
 	 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
@@ -624,7 +622,6 @@ static void kprobe_optimizer(struct work_struct *work)
 	/* Step 4: Free cleaned kprobes after quiesence period */
 	do_free_cleaned_kprobes();
 
-	mutex_unlock(&module_mutex);
 	mutex_unlock(&text_mutex);
 	cpus_read_unlock();
 
-- 
cgit v1.2.3


From 21a6ee14a8f277766618ef07154432b46528113e Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Tue, 28 Jul 2020 19:17:55 +0800
Subject: sched: Remove duplicated tick_nohz_full_enabled() check

In sched_update_tick_dependency() there's two calls that check
whether nohz_full is enabled: tick_nohz_full_cpu() does it
implicitly, while there's also an explicit call to tick_nohz_full_enabled().

Remove the duplicated, open coded check.

[ mingo: Amended the changelog. ]

Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/1595935075-14223-1-git-send-email-linmiaohe@huawei.com
---
 kernel/sched/sched.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9f33c77258ea..296efd30d8c9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1961,12 +1961,7 @@ extern int __init sched_tick_offload_init(void);
  */
 static inline void sched_update_tick_dependency(struct rq *rq)
 {
-	int cpu;
-
-	if (!tick_nohz_full_enabled())
-		return;
-
-	cpu = cpu_of(rq);
+	int cpu = cpu_of(rq);
 
 	if (!tick_nohz_full_cpu(cpu))
 		return;
-- 
cgit v1.2.3


From 274b3f7bf34415eed106e479e4815e897ce5d763 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 22 Jul 2020 16:33:43 +0200
Subject: dma-contiguous: cleanup dma_alloc_contiguous

Split out a cma_alloc_aligned helper to deal with the "interesting"
calling conventions for cma_alloc, which then allows to the main
function to be written straight forward.  This also takes advantage
of the fact that NULL dev arguments have been gone from the DMA API
for a while.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Nicolin Chen <nicoleotsuka@gmail.com>
Reviewed-by: Barry Song <song.bao.hua@hisilicon.com>
---
 kernel/dma/contiguous.c | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 15bc5026c485..cff7e60968b9 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -215,6 +215,13 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages,
 	return cma_release(dev_get_cma_area(dev), pages, count);
 }
 
+static struct page *cma_alloc_aligned(struct cma *cma, size_t size, gfp_t gfp)
+{
+	unsigned int align = min(get_order(size), CONFIG_CMA_ALIGNMENT);
+
+	return cma_alloc(cma, size >> PAGE_SHIFT, align, gfp & __GFP_NOWARN);
+}
+
 /**
  * dma_alloc_contiguous() - allocate contiguous pages
  * @dev:   Pointer to device for which the allocation is performed.
@@ -231,24 +238,14 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages,
  */
 struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
 {
-	size_t count = size >> PAGE_SHIFT;
-	struct page *page = NULL;
-	struct cma *cma = NULL;
-
-	if (dev && dev->cma_area)
-		cma = dev->cma_area;
-	else if (count > 1)
-		cma = dma_contiguous_default_area;
-
 	/* CMA can be used only in the context which permits sleeping */
-	if (cma && gfpflags_allow_blocking(gfp)) {
-		size_t align = get_order(size);
-		size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT);
-
-		page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN);
-	}
-
-	return page;
+	if (!gfpflags_allow_blocking(gfp))
+		return NULL;
+	if (dev->cma_area)
+		return cma_alloc_aligned(dev->cma_area, size, gfp);
+	if (size <= PAGE_SIZE || !dma_contiguous_default_area)
+		return NULL;
+	return cma_alloc_aligned(dma_contiguous_default_area, size, gfp);
 }
 
 /**
-- 
cgit v1.2.3


From 310ad7970a0dec847563dc6dba9e7e587d545622 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andriin@fb.com>
Date: Tue, 28 Jul 2020 12:05:27 -0700
Subject: bpf: Fix build without CONFIG_NET when using BPF XDP link

Entire net/core subsystem is not built without CONFIG_NET. linux/netdevice.h
just assumes that it's always there, so the easiest way to fix this is to
conditionally compile out bpf_xdp_link_attach() use in bpf/syscall.c.

Fixes: aa8d3a716b59 ("bpf, xdp: Add bpf_link-based XDP attachment API")
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Randy Dunlap <rdunlap@infradead.org> # build-tested
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20200728190527.110830-1-andriin@fb.com
---
 kernel/bpf/syscall.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0e8c88db7e7a..cd3d599e9e90 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3923,9 +3923,11 @@ static int link_create(union bpf_attr *attr)
 	case BPF_PROG_TYPE_SK_LOOKUP:
 		ret = netns_bpf_link_create(attr, prog);
 		break;
+#ifdef CONFIG_NET
 	case BPF_PROG_TYPE_XDP:
 		ret = bpf_xdp_link_attach(attr, prog);
 		break;
+#endif
 	default:
 		ret = -EINVAL;
 	}
-- 
cgit v1.2.3


From 48001ea50d17f3eb06a552e9ecf21f7fc01b25da Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 20 Jul 2020 15:08:18 -0700
Subject: PM, libnvdimm: Add runtime firmware activation support

Abstract platform specific mechanics for nvdimm firmware activation
behind a handful of generic ops. At the bus level ->activate_state()
indicates the unified state (idle, busy, armed) of all DIMMs on the bus,
and ->capability() indicates the system state expectations for activate.
At the DIMM level ->activate_state() indicates the per-DIMM state,
->activate_result() indicates the outcome of the last activation
attempt, and ->arm() attempts to transition the DIMM from 'idle' to
'armed'.

A new hibernate_quiet_exec() facility is added to support firmware
activation in an OS defined system quiesce state. It leverages the fact
that the hibernate-freeze state wants to assert that a memory
hibernation snapshot can be taken. This is in contrast to a platform
firmware defined quiesce state that may forcefully quiet the memory
controller independent of whether an individual device-driver properly
supports hibernate-freeze.

The libnvdimm sysfs interface is extended to support detection of a
firmware activate capability. The mechanism supports enumeration and
triggering of firmware activate, optionally in the
hibernate_quiet_exec() context.

[rafael: hibernate_quiet_exec() proposal]
[vishal: fix up sparse warning, grammar in Documentation/]

Cc: Pavel Machek <pavel@ucw.cz>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Reported-by: kernel test robot <lkp@intel.com>
Co-developed-by: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Signed-off-by: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
---
 Documentation/ABI/testing/sysfs-bus-nvdimm         |   2 +
 .../driver-api/nvdimm/firmware-activate.rst        |  86 ++++++++++++
 drivers/nvdimm/core.c                              | 149 +++++++++++++++++++++
 drivers/nvdimm/dimm_devs.c                         | 115 ++++++++++++++++
 drivers/nvdimm/nd-core.h                           |   1 +
 include/linux/libnvdimm.h                          |  44 ++++++
 include/linux/suspend.h                            |   6 +
 kernel/power/hibernate.c                           |  97 ++++++++++++++
 8 files changed, 500 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-bus-nvdimm
 create mode 100644 Documentation/driver-api/nvdimm/firmware-activate.rst

(limited to 'kernel')

diff --git a/Documentation/ABI/testing/sysfs-bus-nvdimm b/Documentation/ABI/testing/sysfs-bus-nvdimm
new file mode 100644
index 000000000000..d64380262be8
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-nvdimm
@@ -0,0 +1,2 @@
+The libnvdimm sub-system implements a common sysfs interface for
+platform nvdimm resources. See Documentation/driver-api/nvdimm/.
diff --git a/Documentation/driver-api/nvdimm/firmware-activate.rst b/Documentation/driver-api/nvdimm/firmware-activate.rst
new file mode 100644
index 000000000000..7ee7decbbdc3
--- /dev/null
+++ b/Documentation/driver-api/nvdimm/firmware-activate.rst
@@ -0,0 +1,86 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==================================
+NVDIMM Runtime Firmware Activation
+==================================
+
+Some persistent memory devices run a firmware locally on the device /
+"DIMM" to perform tasks like media management, capacity provisioning,
+and health monitoring. The process of updating that firmware typically
+involves a reboot because it has implications for in-flight memory
+transactions. However, reboots are disruptive and at least the Intel
+persistent memory platform implementation, described by the Intel ACPI
+DSM specification [1], has added support for activating firmware at
+runtime.
+
+A native sysfs interface is implemented in libnvdimm to allow platform
+to advertise and control their local runtime firmware activation
+capability.
+
+The libnvdimm bus object, ndbusX, implements an ndbusX/firmware/activate
+attribute that shows the state of the firmware activation as one of 'idle',
+'armed', 'overflow', and 'busy'.
+
+- idle:
+  No devices are set / armed to activate firmware
+
+- armed:
+  At least one device is armed
+
+- busy:
+  In the busy state armed devices are in the process of transitioning
+  back to idle and completing an activation cycle.
+
+- overflow:
+  If the platform has a concept of incremental work needed to perform
+  the activation it could be the case that too many DIMMs are armed for
+  activation. In that scenario the potential for firmware activation to
+  timeout is indicated by the 'overflow' state.
+
+The 'ndbusX/firmware/activate' property can be written with a value of
+either 'live', or 'quiesce'. A value of 'quiesce' triggers the kernel to
+run firmware activation from within the equivalent of the hibernation
+'freeze' state where drivers and applications are notified to stop their
+modifications of system memory. A value of 'live' attempts
+firmware activation without this hibernation cycle. The
+'ndbusX/firmware/activate' property will be elided completely if no
+firmware activation capability is detected.
+
+Another property 'ndbusX/firmware/capability' indicates a value of
+'live' or 'quiesce', where 'live' indicates that the firmware
+does not require or inflict any quiesce period on the system to update
+firmware. A capability value of 'quiesce' indicates that firmware does
+expect and injects a quiet period for the memory controller, but 'live'
+may still be written to 'ndbusX/firmware/activate' as an override to
+assume the risk of racing firmware update with in-flight device and
+application activity. The 'ndbusX/firmware/capability' property will be
+elided completely if no firmware activation capability is detected.
+
+The libnvdimm memory-device / DIMM object, nmemX, implements
+'nmemX/firmware/activate' and 'nmemX/firmware/result' attributes to
+communicate the per-device firmware activation state. Similar to the
+'ndbusX/firmware/activate' attribute, the 'nmemX/firmware/activate'
+attribute indicates 'idle', 'armed', or 'busy'. The state transitions
+from 'armed' to 'idle' when the system is prepared to activate firmware,
+firmware staged + state set to armed, and 'ndbusX/firmware/activate' is
+triggered. After that activation event the nmemX/firmware/result
+attribute reflects the state of the last activation as one of:
+
+- none:
+  No runtime activation triggered since the last time the device was reset
+
+- success:
+  The last runtime activation completed successfully.
+
+- fail:
+  The last runtime activation failed for device-specific reasons.
+
+- not_staged:
+  The last runtime activation failed due to a sequencing error of the
+  firmware image not being staged.
+
+- need_reset:
+  Runtime firmware activation failed, but the firmware can still be
+  activated via the legacy method of power-cycling the system.
+
+[1]: https://docs.pmem.io/persistent-memory/
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index fe9bd6febdd2..c21ba0602029 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -4,6 +4,7 @@
  */
 #include <linux/libnvdimm.h>
 #include <linux/badblocks.h>
+#include <linux/suspend.h>
 #include <linux/export.h>
 #include <linux/module.h>
 #include <linux/blkdev.h>
@@ -389,8 +390,156 @@ static const struct attribute_group nvdimm_bus_attribute_group = {
 	.attrs = nvdimm_bus_attributes,
 };
 
+static ssize_t capability_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+	struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
+	enum nvdimm_fwa_capability cap;
+
+	if (!nd_desc->fw_ops)
+		return -EOPNOTSUPP;
+
+	nvdimm_bus_lock(dev);
+	cap = nd_desc->fw_ops->capability(nd_desc);
+	nvdimm_bus_unlock(dev);
+
+	switch (cap) {
+	case NVDIMM_FWA_CAP_QUIESCE:
+		return sprintf(buf, "quiesce\n");
+	case NVDIMM_FWA_CAP_LIVE:
+		return sprintf(buf, "live\n");
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static DEVICE_ATTR_RO(capability);
+
+static ssize_t activate_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+	struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
+	enum nvdimm_fwa_capability cap;
+	enum nvdimm_fwa_state state;
+
+	if (!nd_desc->fw_ops)
+		return -EOPNOTSUPP;
+
+	nvdimm_bus_lock(dev);
+	cap = nd_desc->fw_ops->capability(nd_desc);
+	state = nd_desc->fw_ops->activate_state(nd_desc);
+	nvdimm_bus_unlock(dev);
+
+	if (cap < NVDIMM_FWA_CAP_QUIESCE)
+		return -EOPNOTSUPP;
+
+	switch (state) {
+	case NVDIMM_FWA_IDLE:
+		return sprintf(buf, "idle\n");
+	case NVDIMM_FWA_BUSY:
+		return sprintf(buf, "busy\n");
+	case NVDIMM_FWA_ARMED:
+		return sprintf(buf, "armed\n");
+	case NVDIMM_FWA_ARM_OVERFLOW:
+		return sprintf(buf, "overflow\n");
+	default:
+		return -ENXIO;
+	}
+}
+
+static int exec_firmware_activate(void *data)
+{
+	struct nvdimm_bus_descriptor *nd_desc = data;
+
+	return nd_desc->fw_ops->activate(nd_desc);
+}
+
+static ssize_t activate_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+	struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
+	enum nvdimm_fwa_state state;
+	bool quiesce;
+	ssize_t rc;
+
+	if (!nd_desc->fw_ops)
+		return -EOPNOTSUPP;
+
+	if (sysfs_streq(buf, "live"))
+		quiesce = false;
+	else if (sysfs_streq(buf, "quiesce"))
+		quiesce = true;
+	else
+		return -EINVAL;
+
+	nvdimm_bus_lock(dev);
+	state = nd_desc->fw_ops->activate_state(nd_desc);
+
+	switch (state) {
+	case NVDIMM_FWA_BUSY:
+		rc = -EBUSY;
+		break;
+	case NVDIMM_FWA_ARMED:
+	case NVDIMM_FWA_ARM_OVERFLOW:
+		if (quiesce)
+			rc = hibernate_quiet_exec(exec_firmware_activate, nd_desc);
+		else
+			rc = nd_desc->fw_ops->activate(nd_desc);
+		break;
+	case NVDIMM_FWA_IDLE:
+	default:
+		rc = -ENXIO;
+	}
+	nvdimm_bus_unlock(dev);
+
+	if (rc == 0)
+		rc = len;
+	return rc;
+}
+
+static DEVICE_ATTR_ADMIN_RW(activate);
+
+static umode_t nvdimm_bus_firmware_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+	struct device *dev = container_of(kobj, typeof(*dev), kobj);
+	struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+	struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
+	enum nvdimm_fwa_capability cap;
+
+	/*
+	 * Both 'activate' and 'capability' disappear when no ops
+	 * detected, or a negative capability is indicated.
+	 */
+	if (!nd_desc->fw_ops)
+		return 0;
+
+	nvdimm_bus_lock(dev);
+	cap = nd_desc->fw_ops->capability(nd_desc);
+	nvdimm_bus_unlock(dev);
+
+	if (cap < NVDIMM_FWA_CAP_QUIESCE)
+		return 0;
+
+	return a->mode;
+}
+static struct attribute *nvdimm_bus_firmware_attributes[] = {
+	&dev_attr_activate.attr,
+	&dev_attr_capability.attr,
+	NULL,
+};
+
+static const struct attribute_group nvdimm_bus_firmware_attribute_group = {
+	.name = "firmware",
+	.attrs = nvdimm_bus_firmware_attributes,
+	.is_visible = nvdimm_bus_firmware_visible,
+};
+
 const struct attribute_group *nvdimm_bus_attribute_groups[] = {
 	&nvdimm_bus_attribute_group,
+	&nvdimm_bus_firmware_attribute_group,
 	NULL,
 };
 
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index b7b77e8d9027..85b53a7f44f2 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -446,9 +446,124 @@ static const struct attribute_group nvdimm_attribute_group = {
 	.is_visible = nvdimm_visible,
 };
 
+static ssize_t result_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct nvdimm *nvdimm = to_nvdimm(dev);
+	enum nvdimm_fwa_result result;
+
+	if (!nvdimm->fw_ops)
+		return -EOPNOTSUPP;
+
+	nvdimm_bus_lock(dev);
+	result = nvdimm->fw_ops->activate_result(nvdimm);
+	nvdimm_bus_unlock(dev);
+
+	switch (result) {
+	case NVDIMM_FWA_RESULT_NONE:
+		return sprintf(buf, "none\n");
+	case NVDIMM_FWA_RESULT_SUCCESS:
+		return sprintf(buf, "success\n");
+	case NVDIMM_FWA_RESULT_FAIL:
+		return sprintf(buf, "fail\n");
+	case NVDIMM_FWA_RESULT_NOTSTAGED:
+		return sprintf(buf, "not_staged\n");
+	case NVDIMM_FWA_RESULT_NEEDRESET:
+		return sprintf(buf, "need_reset\n");
+	default:
+		return -ENXIO;
+	}
+}
+static DEVICE_ATTR_ADMIN_RO(result);
+
+static ssize_t activate_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct nvdimm *nvdimm = to_nvdimm(dev);
+	enum nvdimm_fwa_state state;
+
+	if (!nvdimm->fw_ops)
+		return -EOPNOTSUPP;
+
+	nvdimm_bus_lock(dev);
+	state = nvdimm->fw_ops->activate_state(nvdimm);
+	nvdimm_bus_unlock(dev);
+
+	switch (state) {
+	case NVDIMM_FWA_IDLE:
+		return sprintf(buf, "idle\n");
+	case NVDIMM_FWA_BUSY:
+		return sprintf(buf, "busy\n");
+	case NVDIMM_FWA_ARMED:
+		return sprintf(buf, "armed\n");
+	default:
+		return -ENXIO;
+	}
+}
+
+static ssize_t activate_store(struct device *dev, struct device_attribute *attr,
+		const char *buf, size_t len)
+{
+	struct nvdimm *nvdimm = to_nvdimm(dev);
+	enum nvdimm_fwa_trigger arg;
+	int rc;
+
+	if (!nvdimm->fw_ops)
+		return -EOPNOTSUPP;
+
+	if (sysfs_streq(buf, "arm"))
+		arg = NVDIMM_FWA_ARM;
+	else if (sysfs_streq(buf, "disarm"))
+		arg = NVDIMM_FWA_DISARM;
+	else
+		return -EINVAL;
+
+	nvdimm_bus_lock(dev);
+	rc = nvdimm->fw_ops->arm(nvdimm, arg);
+	nvdimm_bus_unlock(dev);
+
+	if (rc < 0)
+		return rc;
+	return len;
+}
+static DEVICE_ATTR_ADMIN_RW(activate);
+
+static struct attribute *nvdimm_firmware_attributes[] = {
+	&dev_attr_activate.attr,
+	&dev_attr_result.attr,
+};
+
+static umode_t nvdimm_firmware_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+	struct device *dev = container_of(kobj, typeof(*dev), kobj);
+	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+	struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
+	struct nvdimm *nvdimm = to_nvdimm(dev);
+	enum nvdimm_fwa_capability cap;
+
+	if (!nd_desc->fw_ops)
+		return 0;
+	if (!nvdimm->fw_ops)
+		return 0;
+
+	nvdimm_bus_lock(dev);
+	cap = nd_desc->fw_ops->capability(nd_desc);
+	nvdimm_bus_unlock(dev);
+
+	if (cap < NVDIMM_FWA_CAP_QUIESCE)
+		return 0;
+
+	return a->mode;
+}
+
+static const struct attribute_group nvdimm_firmware_attribute_group = {
+	.name = "firmware",
+	.attrs = nvdimm_firmware_attributes,
+	.is_visible = nvdimm_firmware_visible,
+};
+
 static const struct attribute_group *nvdimm_attribute_groups[] = {
 	&nd_device_attribute_group,
 	&nvdimm_attribute_group,
+	&nvdimm_firmware_attribute_group,
 	NULL,
 };
 
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index ddb9d97d9129..564faa36a3ca 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -45,6 +45,7 @@ struct nvdimm {
 		struct kernfs_node *overwrite_state;
 	} sec;
 	struct delayed_work dwork;
+	const struct nvdimm_fw_ops *fw_ops;
 };
 
 static inline unsigned long nvdimm_security_flags(
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index ad9898ece7d3..15dbcb718316 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -86,6 +86,7 @@ struct nvdimm_bus_descriptor {
 	int (*flush_probe)(struct nvdimm_bus_descriptor *nd_desc);
 	int (*clear_to_send)(struct nvdimm_bus_descriptor *nd_desc,
 			struct nvdimm *nvdimm, unsigned int cmd, void *data);
+	const struct nvdimm_bus_fw_ops *fw_ops;
 };
 
 struct nd_cmd_desc {
@@ -200,6 +201,49 @@ struct nvdimm_security_ops {
 	int (*query_overwrite)(struct nvdimm *nvdimm);
 };
 
+enum nvdimm_fwa_state {
+	NVDIMM_FWA_INVALID,
+	NVDIMM_FWA_IDLE,
+	NVDIMM_FWA_ARMED,
+	NVDIMM_FWA_BUSY,
+	NVDIMM_FWA_ARM_OVERFLOW,
+};
+
+enum nvdimm_fwa_trigger {
+	NVDIMM_FWA_ARM,
+	NVDIMM_FWA_DISARM,
+};
+
+enum nvdimm_fwa_capability {
+	NVDIMM_FWA_CAP_INVALID,
+	NVDIMM_FWA_CAP_NONE,
+	NVDIMM_FWA_CAP_QUIESCE,
+	NVDIMM_FWA_CAP_LIVE,
+};
+
+enum nvdimm_fwa_result {
+	NVDIMM_FWA_RESULT_INVALID,
+	NVDIMM_FWA_RESULT_NONE,
+	NVDIMM_FWA_RESULT_SUCCESS,
+	NVDIMM_FWA_RESULT_NOTSTAGED,
+	NVDIMM_FWA_RESULT_NEEDRESET,
+	NVDIMM_FWA_RESULT_FAIL,
+};
+
+struct nvdimm_bus_fw_ops {
+	enum nvdimm_fwa_state (*activate_state)
+		(struct nvdimm_bus_descriptor *nd_desc);
+	enum nvdimm_fwa_capability (*capability)
+		(struct nvdimm_bus_descriptor *nd_desc);
+	int (*activate)(struct nvdimm_bus_descriptor *nd_desc);
+};
+
+struct nvdimm_fw_ops {
+	enum nvdimm_fwa_state (*activate_state)(struct nvdimm *nvdimm);
+	enum nvdimm_fwa_result (*activate_result)(struct nvdimm *nvdimm);
+	int (*arm)(struct nvdimm *nvdimm, enum nvdimm_fwa_trigger arg);
+};
+
 void badrange_init(struct badrange *badrange);
 int badrange_add(struct badrange *badrange, u64 addr, u64 length);
 void badrange_forget(struct badrange *badrange, phys_addr_t start,
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index b960098acfb0..cb9afad82a90 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -453,6 +453,8 @@ extern bool hibernation_available(void);
 asmlinkage int swsusp_save(void);
 extern struct pbe *restore_pblist;
 int pfn_is_nosave(unsigned long pfn);
+
+int hibernate_quiet_exec(int (*func)(void *data), void *data);
 #else /* CONFIG_HIBERNATION */
 static inline void register_nosave_region(unsigned long b, unsigned long e) {}
 static inline void register_nosave_region_late(unsigned long b, unsigned long e) {}
@@ -464,6 +466,10 @@ static inline void hibernation_set_ops(const struct platform_hibernation_ops *op
 static inline int hibernate(void) { return -ENOSYS; }
 static inline bool system_entering_hibernation(void) { return false; }
 static inline bool hibernation_available(void) { return false; }
+
+static inline int hibernate_quiet_exec(int (*func)(void *data), void *data) {
+	return -ENOTSUPP;
+}
 #endif /* CONFIG_HIBERNATION */
 
 #ifdef CONFIG_HIBERNATION_SNAPSHOT_DEV
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 02ec716a4927..e6fab3f09c98 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -795,6 +795,103 @@ int hibernate(void)
 	return error;
 }
 
+/**
+ * hibernate_quiet_exec - Execute a function with all devices frozen.
+ * @func: Function to execute.
+ * @data: Data pointer to pass to @func.
+ *
+ * Return the @func return value or an error code if it cannot be executed.
+ */
+int hibernate_quiet_exec(int (*func)(void *data), void *data)
+{
+	int error, nr_calls = 0;
+
+	lock_system_sleep();
+
+	if (!hibernate_acquire()) {
+		error = -EBUSY;
+		goto unlock;
+	}
+
+	pm_prepare_console();
+
+	error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
+	if (error) {
+		nr_calls--;
+		goto exit;
+	}
+
+	error = freeze_processes();
+	if (error)
+		goto exit;
+
+	lock_device_hotplug();
+
+	pm_suspend_clear_flags();
+
+	error = platform_begin(true);
+	if (error)
+		goto thaw;
+
+	error = freeze_kernel_threads();
+	if (error)
+		goto thaw;
+
+	error = dpm_prepare(PMSG_FREEZE);
+	if (error)
+		goto dpm_complete;
+
+	suspend_console();
+
+	error = dpm_suspend(PMSG_FREEZE);
+	if (error)
+		goto dpm_resume;
+
+	error = dpm_suspend_end(PMSG_FREEZE);
+	if (error)
+		goto dpm_resume;
+
+	error = platform_pre_snapshot(true);
+	if (error)
+		goto skip;
+
+	error = func(data);
+
+skip:
+	platform_finish(true);
+
+	dpm_resume_start(PMSG_THAW);
+
+dpm_resume:
+	dpm_resume(PMSG_THAW);
+
+	resume_console();
+
+dpm_complete:
+	dpm_complete(PMSG_THAW);
+
+	thaw_kernel_threads();
+
+thaw:
+	platform_end(true);
+
+	unlock_device_hotplug();
+
+	thaw_processes();
+
+exit:
+	__pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL);
+
+	pm_restore_console();
+
+	hibernate_release();
+
+unlock:
+	unlock_system_sleep();
+
+	return error;
+}
+EXPORT_SYMBOL_GPL(hibernate_quiet_exec);
 
 /**
  * software_resume - Resume from a saved hibernation image.
-- 
cgit v1.2.3


From 4fd5750af02ab7bba7c58a073060cc1da8a69173 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 20 Jul 2020 23:49:18 +0200
Subject: sched,tracing: Convert to sched_set_fifo()

One module user of sched_setscheduler() was overlooked and is
obviously causing build failures.

Convert ring_buffer_benchmark to use sched_set_fifo_low() when fifo==1
and sched_set_fifo() when fifo==2. This is a bit of an abuse, but it
makes the thing 'work' again.

Specifically, it enables all combinations that were previously
possible:

  producer higher than consumer
  consumer higher than producer

Fixes: 616d91b68cd5 ("sched: Remove sched_setscheduler*() EXPORTs")
Reported-by: kernel test robot <lkp@intel.com>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Link: https://lkml.kernel.org/r/20200720214918.GM5523@worktop.programming.kicks-ass.net
---
 kernel/trace/ring_buffer_benchmark.c | 48 +++++++++++++++++-------------------
 1 file changed, 23 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 8df0aa810950..78e576575b79 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -45,8 +45,8 @@ MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
 static int producer_nice = MAX_NICE;
 static int consumer_nice = MAX_NICE;
 
-static int producer_fifo = -1;
-static int consumer_fifo = -1;
+static int producer_fifo;
+static int consumer_fifo;
 
 module_param(producer_nice, int, 0644);
 MODULE_PARM_DESC(producer_nice, "nice prio for producer");
@@ -55,10 +55,10 @@ module_param(consumer_nice, int, 0644);
 MODULE_PARM_DESC(consumer_nice, "nice prio for consumer");
 
 module_param(producer_fifo, int, 0644);
-MODULE_PARM_DESC(producer_fifo, "fifo prio for producer");
+MODULE_PARM_DESC(producer_fifo, "use fifo for producer: 0 - disabled, 1 - low prio, 2 - fifo");
 
 module_param(consumer_fifo, int, 0644);
-MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer");
+MODULE_PARM_DESC(consumer_fifo, "use fifo for consumer: 0 - disabled, 1 - low prio, 2 - fifo");
 
 static int read_events;
 
@@ -303,22 +303,22 @@ static void ring_buffer_producer(void)
 		trace_printk("ERROR!\n");
 
 	if (!disable_reader) {
-		if (consumer_fifo < 0)
+		if (consumer_fifo)
+			trace_printk("Running Consumer at SCHED_FIFO %s\n",
+				     consumer_fifo == 1 ? "low" : "high");
+		else
 			trace_printk("Running Consumer at nice: %d\n",
 				     consumer_nice);
-		else
-			trace_printk("Running Consumer at SCHED_FIFO %d\n",
-				     consumer_fifo);
 	}
-	if (producer_fifo < 0)
+	if (producer_fifo)
+		trace_printk("Running Producer at SCHED_FIFO %s\n",
+			     producer_fifo == 1 ? "low" : "high");
+	else
 		trace_printk("Running Producer at nice: %d\n",
 			     producer_nice);
-	else
-		trace_printk("Running Producer at SCHED_FIFO %d\n",
-			     producer_fifo);
 
 	/* Let the user know that the test is running at low priority */
-	if (producer_fifo < 0 && consumer_fifo < 0 &&
+	if (!producer_fifo && !consumer_fifo &&
 	    producer_nice == MAX_NICE && consumer_nice == MAX_NICE)
 		trace_printk("WARNING!!! This test is running at lowest priority.\n");
 
@@ -455,21 +455,19 @@ static int __init ring_buffer_benchmark_init(void)
 	 * Run them as low-prio background tasks by default:
 	 */
 	if (!disable_reader) {
-		if (consumer_fifo >= 0) {
-			struct sched_param param = {
-				.sched_priority = consumer_fifo
-			};
-			sched_setscheduler(consumer, SCHED_FIFO, &param);
-		} else
+		if (consumer_fifo >= 2)
+			sched_set_fifo(consumer);
+		else if (consumer_fifo == 1)
+			sched_set_fifo_low(consumer);
+		else
 			set_user_nice(consumer, consumer_nice);
 	}
 
-	if (producer_fifo >= 0) {
-		struct sched_param param = {
-			.sched_priority = producer_fifo
-		};
-		sched_setscheduler(producer, SCHED_FIFO, &param);
-	} else
+	if (producer_fifo >= 2)
+		sched_set_fifo(producer);
+	else if (producer_fifo == 1)
+		sched_set_fifo_low(producer);
+	else
 		set_user_nice(producer, producer_nice);
 
 	return 0;
-- 
cgit v1.2.3


From e65855a52b479f98674998cb23b21ef5a8144b04 Mon Sep 17 00:00:00 2001
From: Qais Yousef <qais.yousef@arm.com>
Date: Thu, 16 Jul 2020 12:03:47 +0100
Subject: sched/uclamp: Fix a deadlock when enabling uclamp static key

The following splat was caught when setting uclamp value of a task:

  BUG: sleeping function called from invalid context at ./include/linux/percpu-rwsem.h:49

   cpus_read_lock+0x68/0x130
   static_key_enable+0x1c/0x38
   __sched_setscheduler+0x900/0xad8

Fix by ensuring we enable the key outside of the critical section in
__sched_setscheduler()

Fixes: 46609ce22703 ("sched/uclamp: Protect uclamp fast path code with static key")
Signed-off-by: Qais Yousef <qais.yousef@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200716110347.19553-4-qais.yousef@arm.com
---
 kernel/sched/core.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 678253450ebb..e44d83f3e0e6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1346,6 +1346,15 @@ static int uclamp_validate(struct task_struct *p,
 	if (upper_bound > SCHED_CAPACITY_SCALE)
 		return -EINVAL;
 
+	/*
+	 * We have valid uclamp attributes; make sure uclamp is enabled.
+	 *
+	 * We need to do that here, because enabling static branches is a
+	 * blocking operation which obviously cannot be done while holding
+	 * scheduler locks.
+	 */
+	static_branch_enable(&sched_uclamp_used);
+
 	return 0;
 }
 
@@ -1376,8 +1385,6 @@ static void __setscheduler_uclamp(struct task_struct *p,
 	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
 		return;
 
-	static_branch_enable(&sched_uclamp_used);
-
 	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
 		uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
 			      attr->sched_util_min, true);
-- 
cgit v1.2.3


From 13685c4a08fca9dd76bf53bfcbadc044ab2a08cb Mon Sep 17 00:00:00 2001
From: Qais Yousef <qais.yousef@arm.com>
Date: Thu, 16 Jul 2020 12:03:45 +0100
Subject: sched/uclamp: Add a new sysctl to control RT default boost value

RT tasks by default run at the highest capacity/performance level. When
uclamp is selected this default behavior is retained by enforcing the
requested uclamp.min (p->uclamp_req[UCLAMP_MIN]) of the RT tasks to be
uclamp_none(UCLAMP_MAX), which is SCHED_CAPACITY_SCALE; the maximum
value.

This is also referred to as 'the default boost value of RT tasks'.

See commit 1a00d999971c ("sched/uclamp: Set default clamps for RT tasks").

On battery powered devices, it is desired to control this default
(currently hardcoded) behavior at runtime to reduce energy consumed by
RT tasks.

For example, a mobile device manufacturer where big.LITTLE architecture
is dominant, the performance of the little cores varies across SoCs, and
on high end ones the big cores could be too power hungry.

Given the diversity of SoCs, the new knob allows manufactures to tune
the best performance/power for RT tasks for the particular hardware they
run on.

They could opt to further tune the value when the user selects
a different power saving mode or when the device is actively charging.

The runtime aspect of it further helps in creating a single kernel image
that can be run on multiple devices that require different tuning.

Keep in mind that a lot of RT tasks in the system are created by the
kernel. On Android for instance I can see over 50 RT tasks, only
a handful of which created by the Android framework.

To control the default behavior globally by system admins and device
integrator, introduce the new sysctl_sched_uclamp_util_min_rt_default
to change the default boost value of the RT tasks.

I anticipate this to be mostly in the form of modifying the init script
of a particular device.

To avoid polluting the fast path with unnecessary code, the approach
taken is to synchronously do the update by traversing all the existing
tasks in the system. This could race with a concurrent fork(), which is
dealt with by introducing sched_post_fork() function which will ensure
the racy fork will get the right update applied.

Tested on Juno-r2 in combination with the RT capacity awareness [1].
By default an RT task will go to the highest capacity CPU and run at the
maximum frequency, which is particularly energy inefficient on high end
mobile devices because the biggest core[s] are 'huge' and power hungry.

With this patch the RT task can be controlled to run anywhere by
default, and doesn't cause the frequency to be maximum all the time.
Yet any task that really needs to be boosted can easily escape this
default behavior by modifying its requested uclamp.min value
(p->uclamp_req[UCLAMP_MIN]) via sched_setattr() syscall.

[1] 804d402fb6f6: ("sched/rt: Make RT capacity-aware")

Signed-off-by: Qais Yousef <qais.yousef@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200716110347.19553-2-qais.yousef@arm.com
---
 include/linux/sched.h        |  10 +++-
 include/linux/sched/sysctl.h |   1 +
 include/linux/sched/task.h   |   1 +
 kernel/fork.c                |   1 +
 kernel/sched/core.c          | 119 ++++++++++++++++++++++++++++++++++++++++---
 kernel/sysctl.c              |   7 +++
 6 files changed, 131 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index adf0125190d4..a6bf77c34687 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -686,9 +686,15 @@ struct task_struct {
 	struct sched_dl_entity		dl;
 
 #ifdef CONFIG_UCLAMP_TASK
-	/* Clamp values requested for a scheduling entity */
+	/*
+	 * Clamp values requested for a scheduling entity.
+	 * Must be updated with task_rq_lock() held.
+	 */
 	struct uclamp_se		uclamp_req[UCLAMP_CNT];
-	/* Effective clamp values used for a scheduling entity */
+	/*
+	 * Effective clamp values used for a scheduling entity.
+	 * Must be updated with task_rq_lock() held.
+	 */
 	struct uclamp_se		uclamp[UCLAMP_CNT];
 #endif
 
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 24be30a40814..3c31ba88aca5 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -67,6 +67,7 @@ extern unsigned int sysctl_sched_dl_period_min;
 #ifdef CONFIG_UCLAMP_TASK
 extern unsigned int sysctl_sched_uclamp_util_min;
 extern unsigned int sysctl_sched_uclamp_util_max;
+extern unsigned int sysctl_sched_uclamp_util_min_rt_default;
 #endif
 
 #ifdef CONFIG_CFS_BANDWIDTH
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 38359071236a..e7ddab095baf 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -55,6 +55,7 @@ extern asmlinkage void schedule_tail(struct task_struct *prev);
 extern void init_idle(struct task_struct *idle, int cpu);
 
 extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
+extern void sched_post_fork(struct task_struct *p);
 extern void sched_dead(struct task_struct *p);
 
 void __noreturn do_task_dead(void);
diff --git a/kernel/fork.c b/kernel/fork.c
index efc5493203ae..e75c2e41f3d1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2304,6 +2304,7 @@ static __latent_entropy struct task_struct *copy_process(
 	write_unlock_irq(&tasklist_lock);
 
 	proc_fork_connector(p);
+	sched_post_fork(p);
 	cgroup_post_fork(p, args);
 	perf_event_fork(p);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e44d83f3e0e6..12e1f3a2cabc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -889,6 +889,23 @@ unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
 /* Max allowed maximum utilization */
 unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
 
+/*
+ * By default RT tasks run at the maximum performance point/capacity of the
+ * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
+ * SCHED_CAPACITY_SCALE.
+ *
+ * This knob allows admins to change the default behavior when uclamp is being
+ * used. In battery powered devices, particularly, running at the maximum
+ * capacity and frequency will increase energy consumption and shorten the
+ * battery life.
+ *
+ * This knob only affects RT tasks that their uclamp_se->user_defined == false.
+ *
+ * This knob will not override the system default sched_util_clamp_min defined
+ * above.
+ */
+unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
+
 /* All clamps are required to be less or equal than these values */
 static struct uclamp_se uclamp_default[UCLAMP_CNT];
 
@@ -991,6 +1008,64 @@ unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
 	return uclamp_idle_value(rq, clamp_id, clamp_value);
 }
 
+static void __uclamp_update_util_min_rt_default(struct task_struct *p)
+{
+	unsigned int default_util_min;
+	struct uclamp_se *uc_se;
+
+	lockdep_assert_held(&p->pi_lock);
+
+	uc_se = &p->uclamp_req[UCLAMP_MIN];
+
+	/* Only sync if user didn't override the default */
+	if (uc_se->user_defined)
+		return;
+
+	default_util_min = sysctl_sched_uclamp_util_min_rt_default;
+	uclamp_se_set(uc_se, default_util_min, false);
+}
+
+static void uclamp_update_util_min_rt_default(struct task_struct *p)
+{
+	struct rq_flags rf;
+	struct rq *rq;
+
+	if (!rt_task(p))
+		return;
+
+	/* Protect updates to p->uclamp_* */
+	rq = task_rq_lock(p, &rf);
+	__uclamp_update_util_min_rt_default(p);
+	task_rq_unlock(rq, p, &rf);
+}
+
+static void uclamp_sync_util_min_rt_default(void)
+{
+	struct task_struct *g, *p;
+
+	/*
+	 * copy_process()			sysctl_uclamp
+	 *					  uclamp_min_rt = X;
+	 *   write_lock(&tasklist_lock)		  read_lock(&tasklist_lock)
+	 *   // link thread			  smp_mb__after_spinlock()
+	 *   write_unlock(&tasklist_lock)	  read_unlock(&tasklist_lock);
+	 *   sched_post_fork()			  for_each_process_thread()
+	 *     __uclamp_sync_rt()		    __uclamp_sync_rt()
+	 *
+	 * Ensures that either sched_post_fork() will observe the new
+	 * uclamp_min_rt or for_each_process_thread() will observe the new
+	 * task.
+	 */
+	read_lock(&tasklist_lock);
+	smp_mb__after_spinlock();
+	read_unlock(&tasklist_lock);
+
+	rcu_read_lock();
+	for_each_process_thread(g, p)
+		uclamp_update_util_min_rt_default(p);
+	rcu_read_unlock();
+}
+
 static inline struct uclamp_se
 uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
 {
@@ -1278,12 +1353,13 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
 				void *buffer, size_t *lenp, loff_t *ppos)
 {
 	bool update_root_tg = false;
-	int old_min, old_max;
+	int old_min, old_max, old_min_rt;
 	int result;
 
 	mutex_lock(&uclamp_mutex);
 	old_min = sysctl_sched_uclamp_util_min;
 	old_max = sysctl_sched_uclamp_util_max;
+	old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
 
 	result = proc_dointvec(table, write, buffer, lenp, ppos);
 	if (result)
@@ -1292,7 +1368,9 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
 		goto done;
 
 	if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
-	    sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
+	    sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE	||
+	    sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
+
 		result = -EINVAL;
 		goto undo;
 	}
@@ -1313,6 +1391,11 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
 		uclamp_update_root_tg();
 	}
 
+	if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
+		static_branch_enable(&sched_uclamp_used);
+		uclamp_sync_util_min_rt_default();
+	}
+
 	/*
 	 * We update all RUNNABLE tasks only when task groups are in use.
 	 * Otherwise, keep it simple and do just a lazy update at each next
@@ -1324,6 +1407,7 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
 undo:
 	sysctl_sched_uclamp_util_min = old_min;
 	sysctl_sched_uclamp_util_max = old_max;
+	sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
 done:
 	mutex_unlock(&uclamp_mutex);
 
@@ -1369,17 +1453,20 @@ static void __setscheduler_uclamp(struct task_struct *p,
 	 */
 	for_each_clamp_id(clamp_id) {
 		struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
-		unsigned int clamp_value = uclamp_none(clamp_id);
 
 		/* Keep using defined clamps across class changes */
 		if (uc_se->user_defined)
 			continue;
 
-		/* By default, RT tasks always get 100% boost */
+		/*
+		 * RT by default have a 100% boost value that could be modified
+		 * at runtime.
+		 */
 		if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
-			clamp_value = uclamp_none(UCLAMP_MAX);
+			__uclamp_update_util_min_rt_default(p);
+		else
+			uclamp_se_set(uc_se, uclamp_none(clamp_id), false);
 
-		uclamp_se_set(uc_se, clamp_value, false);
 	}
 
 	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
@@ -1400,6 +1487,10 @@ static void uclamp_fork(struct task_struct *p)
 {
 	enum uclamp_id clamp_id;
 
+	/*
+	 * We don't need to hold task_rq_lock() when updating p->uclamp_* here
+	 * as the task is still at its early fork stages.
+	 */
 	for_each_clamp_id(clamp_id)
 		p->uclamp[clamp_id].active = false;
 
@@ -1412,6 +1503,11 @@ static void uclamp_fork(struct task_struct *p)
 	}
 }
 
+static void uclamp_post_fork(struct task_struct *p)
+{
+	uclamp_update_util_min_rt_default(p);
+}
+
 static void __init init_uclamp_rq(struct rq *rq)
 {
 	enum uclamp_id clamp_id;
@@ -1462,6 +1558,7 @@ static inline int uclamp_validate(struct task_struct *p,
 static void __setscheduler_uclamp(struct task_struct *p,
 				  const struct sched_attr *attr) { }
 static inline void uclamp_fork(struct task_struct *p) { }
+static inline void uclamp_post_fork(struct task_struct *p) { }
 static inline void init_uclamp(void) { }
 #endif /* CONFIG_UCLAMP_TASK */
 
@@ -3205,6 +3302,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 	return 0;
 }
 
+void sched_post_fork(struct task_struct *p)
+{
+	uclamp_post_fork(p);
+}
+
 unsigned long to_ratio(u64 period, u64 runtime)
 {
 	if (runtime == RUNTIME_INF)
@@ -5724,6 +5826,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 		kattr.sched_nice = task_nice(p);
 
 #ifdef CONFIG_UCLAMP_TASK
+	/*
+	 * This could race with another potential updater, but this is fine
+	 * because it'll correctly read the old or the new value. We don't need
+	 * to guarantee who wins the race as long as it doesn't return garbage.
+	 */
 	kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
 	kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
 #endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4aea67d3d552..1b4d2dc270a5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1815,6 +1815,13 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= sysctl_sched_uclamp_handler,
 	},
+	{
+		.procname	= "sched_util_clamp_min_rt_default",
+		.data		= &sysctl_sched_uclamp_util_min_rt_default,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sysctl_sched_uclamp_handler,
+	},
 #endif
 #ifdef CONFIG_SCHED_AUTOGROUP
 	{
-- 
cgit v1.2.3


From f891f19736bdf404845f97d8038054be37160ea8 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Wed, 29 Jul 2020 17:09:19 +0530
Subject: kexec_file: Allow archs to handle special regions while locating
 memory hole

Some architectures may have special memory regions, within the given
memory range, which can't be used for the buffer in a kexec segment.
Implement weak arch_kexec_locate_mem_hole() definition which arch code
may override, to take care of special regions, while trying to locate
a memory hole.

Also, add the missing declarations for arch overridable functions and
and drop the __weak descriptors in the declarations to avoid non-weak
definitions from becoming weak.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Tested-by: Pingfan Liu <piliu@redhat.com>
Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Acked-by: Dave Young <dyoung@redhat.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/159602273603.575379.17665852963340380839.stgit@hbathini
---
 include/linux/kexec.h | 29 ++++++++++++++++++-----------
 kernel/kexec_file.c   | 16 ++++++++++++++--
 2 files changed, 32 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index ea67910ae6b7..9e93bef52968 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -183,17 +183,24 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
 				   bool get_value);
 void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name);
 
-int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
-					 unsigned long buf_len);
-void * __weak arch_kexec_kernel_image_load(struct kimage *image);
-int __weak arch_kexec_apply_relocations_add(struct purgatory_info *pi,
-					    Elf_Shdr *section,
-					    const Elf_Shdr *relsec,
-					    const Elf_Shdr *symtab);
-int __weak arch_kexec_apply_relocations(struct purgatory_info *pi,
-					Elf_Shdr *section,
-					const Elf_Shdr *relsec,
-					const Elf_Shdr *symtab);
+/* Architectures may override the below functions */
+int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+				  unsigned long buf_len);
+void *arch_kexec_kernel_image_load(struct kimage *image);
+int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
+				     Elf_Shdr *section,
+				     const Elf_Shdr *relsec,
+				     const Elf_Shdr *symtab);
+int arch_kexec_apply_relocations(struct purgatory_info *pi,
+				 Elf_Shdr *section,
+				 const Elf_Shdr *relsec,
+				 const Elf_Shdr *symtab);
+int arch_kimage_file_post_load_cleanup(struct kimage *image);
+#ifdef CONFIG_KEXEC_SIG
+int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
+				 unsigned long buf_len);
+#endif
+int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf);
 
 extern int kexec_add_buffer(struct kexec_buf *kbuf);
 int kexec_locate_mem_hole(struct kexec_buf *kbuf);
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index bb05fd52de85..eb42d2efa16a 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -657,6 +657,19 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
 	return ret == 1 ? 0 : -EADDRNOTAVAIL;
 }
 
+/**
+ * arch_kexec_locate_mem_hole - Find free memory to place the segments.
+ * @kbuf:                       Parameters for the memory search.
+ *
+ * On success, kbuf->mem will have the start address of the memory region found.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int __weak arch_kexec_locate_mem_hole(struct kexec_buf *kbuf)
+{
+	return kexec_locate_mem_hole(kbuf);
+}
+
 /**
  * kexec_add_buffer - place a buffer in a kexec segment
  * @kbuf:	Buffer contents and memory parameters.
@@ -669,7 +682,6 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
  */
 int kexec_add_buffer(struct kexec_buf *kbuf)
 {
-
 	struct kexec_segment *ksegment;
 	int ret;
 
@@ -697,7 +709,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
 	kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE);
 
 	/* Walk the RAM ranges and allocate a suitable range for the buffer */
-	ret = kexec_locate_mem_hole(kbuf);
+	ret = arch_kexec_locate_mem_hole(kbuf);
 	if (ret)
 		return ret;
 
-- 
cgit v1.2.3


From b75058614fdd3140074a640b514f6a0b4d485a2d Mon Sep 17 00:00:00 2001
From: "Ahmed S. Darwish" <a.darwish@linutronix.de>
Date: Mon, 20 Jul 2020 17:55:19 +0200
Subject: sched: tasks: Use sequence counter with associated spinlock

A sequence counter write side critical section must be protected by some
form of locking to serialize writers. A plain seqcount_t does not
contain the information of which lock must be held when entering a write
side critical section.

Use the new seqcount_spinlock_t data type, which allows to associate a
spinlock with the sequence counter. This enables lockdep to verify that
the spinlock used for writer serialization is held when the write side
critical section is entered.

If lockdep is disabled this lock association is compiled out and has
neither storage size nor runtime overhead.

Signed-off-by: Ahmed S. Darwish <a.darwish@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200720155530.1173732-14-a.darwish@linutronix.de
---
 include/linux/sched.h | 2 +-
 init/init_task.c      | 3 ++-
 kernel/fork.c         | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8d1de021b315..9a9d8263962d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1050,7 +1050,7 @@ struct task_struct {
 	/* Protected by ->alloc_lock: */
 	nodemask_t			mems_allowed;
 	/* Seqence number to catch updates: */
-	seqcount_t			mems_allowed_seq;
+	seqcount_spinlock_t		mems_allowed_seq;
 	int				cpuset_mem_spread_rotor;
 	int				cpuset_slab_spread_rotor;
 #endif
diff --git a/init/init_task.c b/init/init_task.c
index 15089d15010a..94fe3ba1bb60 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -154,7 +154,8 @@ struct task_struct init_task
 	.trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list),
 #endif
 #ifdef CONFIG_CPUSETS
-	.mems_allowed_seq = SEQCNT_ZERO(init_task.mems_allowed_seq),
+	.mems_allowed_seq = SEQCNT_SPINLOCK_ZERO(init_task.mems_allowed_seq,
+						 &init_task.alloc_lock),
 #endif
 #ifdef CONFIG_RT_MUTEXES
 	.pi_waiters	= RB_ROOT_CACHED,
diff --git a/kernel/fork.c b/kernel/fork.c
index 70d9d0a4de2a..fc72f09a61b2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2032,7 +2032,7 @@ static __latent_entropy struct task_struct *copy_process(
 #ifdef CONFIG_CPUSETS
 	p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
 	p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
-	seqcount_init(&p->mems_allowed_seq);
+	seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
 #endif
 #ifdef CONFIG_TRACE_IRQFLAGS
 	p->irq_events = 0;
-- 
cgit v1.2.3


From 025e82bcbc34cd071390e72fd0b31593282f9425 Mon Sep 17 00:00:00 2001
From: "Ahmed S. Darwish" <a.darwish@linutronix.de>
Date: Mon, 20 Jul 2020 17:55:23 +0200
Subject: timekeeping: Use sequence counter with associated raw spinlock

A sequence counter write side critical section must be protected by some
form of locking to serialize writers. A plain seqcount_t does not
contain the information of which lock must be held when entering a write
side critical section.

Use the new seqcount_raw_spinlock_t data type, which allows to associate
a raw spinlock with the sequence counter. This enables lockdep to verify
that the raw spinlock used for writer serialization is held when the
write side critical section is entered.

If lockdep is disabled this lock association is compiled out and has
neither storage size nor runtime overhead.

Signed-off-by: Ahmed S. Darwish <a.darwish@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200720155530.1173732-18-a.darwish@linutronix.de
---
 kernel/time/timekeeping.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d20d489841c8..05ecfd8a3314 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -39,18 +39,19 @@ enum timekeeping_adv_mode {
 	TK_ADV_FREQ
 };
 
+static DEFINE_RAW_SPINLOCK(timekeeper_lock);
+
 /*
  * The most important data for readout fits into a single 64 byte
  * cache line.
  */
 static struct {
-	seqcount_t		seq;
+	seqcount_raw_spinlock_t	seq;
 	struct timekeeper	timekeeper;
 } tk_core ____cacheline_aligned = {
-	.seq = SEQCNT_ZERO(tk_core.seq),
+	.seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock),
 };
 
-static DEFINE_RAW_SPINLOCK(timekeeper_lock);
 static struct timekeeper shadow_timekeeper;
 
 /**
@@ -63,7 +64,7 @@ static struct timekeeper shadow_timekeeper;
  * See @update_fast_timekeeper() below.
  */
 struct tk_fast {
-	seqcount_t		seq;
+	seqcount_raw_spinlock_t	seq;
 	struct tk_read_base	base[2];
 };
 
@@ -80,11 +81,13 @@ static struct clocksource dummy_clock = {
 };
 
 static struct tk_fast tk_fast_mono ____cacheline_aligned = {
+	.seq     = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_mono.seq, &timekeeper_lock),
 	.base[0] = { .clock = &dummy_clock, },
 	.base[1] = { .clock = &dummy_clock, },
 };
 
 static struct tk_fast tk_fast_raw  ____cacheline_aligned = {
+	.seq     = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_raw.seq, &timekeeper_lock),
 	.base[0] = { .clock = &dummy_clock, },
 	.base[1] = { .clock = &dummy_clock, },
 };
@@ -157,7 +160,7 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
  * tk_clock_read - atomic clocksource read() helper
  *
  * This helper is necessary to use in the read paths because, while the
- * seqlock ensures we don't return a bad value while structures are updated,
+ * seqcount ensures we don't return a bad value while structures are updated,
  * it doesn't protect from potential crashes. There is the possibility that
  * the tkr's clocksource may change between the read reference, and the
  * clock reference passed to the read function.  This can cause crashes if
@@ -222,10 +225,10 @@ static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr)
 	unsigned int seq;
 
 	/*
-	 * Since we're called holding a seqlock, the data may shift
+	 * Since we're called holding a seqcount, the data may shift
 	 * under us while we're doing the calculation. This can cause
 	 * false positives, since we'd note a problem but throw the
-	 * results away. So nest another seqlock here to atomically
+	 * results away. So nest another seqcount here to atomically
 	 * grab the points we are checking with.
 	 */
 	do {
@@ -486,7 +489,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
  *
  * To keep it NMI safe since we're accessing from tracing, we're not using a
  * separate timekeeper with updates to monotonic clock and boot offset
- * protected with seqlocks. This has the following minor side effects:
+ * protected with seqcounts. This has the following minor side effects:
  *
  * (1) Its possible that a timestamp be taken after the boot offset is updated
  * but before the timekeeper is updated. If this happens, the new boot offset
-- 
cgit v1.2.3


From af5a06b582ec3d7b0160b4faaa65f73d8dcf989f Mon Sep 17 00:00:00 2001
From: "Ahmed S. Darwish" <a.darwish@linutronix.de>
Date: Mon, 20 Jul 2020 17:55:30 +0200
Subject: hrtimer: Use sequence counter with associated raw spinlock

A sequence counter write side critical section must be protected by some
form of locking to serialize writers. A plain seqcount_t does not
contain the information of which lock must be held when entering a write
side critical section.

Use the new seqcount_raw_spinlock_t data type, which allows to associate
a raw spinlock with the sequence counter. This enables lockdep to verify
that the raw spinlock used for writer serialization is held when the
write side critical section is entered.

If lockdep is disabled this lock association is compiled out and has
neither storage size nor runtime overhead.

Signed-off-by: Ahmed S. Darwish <a.darwish@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200720155530.1173732-25-a.darwish@linutronix.de
---
 include/linux/hrtimer.h |  2 +-
 kernel/time/hrtimer.c   | 13 ++++++++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 15c8ac313678..25993b86ac5c 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -159,7 +159,7 @@ struct hrtimer_clock_base {
 	struct hrtimer_cpu_base	*cpu_base;
 	unsigned int		index;
 	clockid_t		clockid;
-	seqcount_t		seq;
+	seqcount_raw_spinlock_t	seq;
 	struct hrtimer		*running;
 	struct timerqueue_head	active;
 	ktime_t			(*get_time)(void);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index d89da1c7e005..c4038511d5c9 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -135,7 +135,11 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
  * timer->base->cpu_base
  */
 static struct hrtimer_cpu_base migration_cpu_base = {
-	.clock_base = { { .cpu_base = &migration_cpu_base, }, },
+	.clock_base = { {
+		.cpu_base = &migration_cpu_base,
+		.seq      = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq,
+						     &migration_cpu_base.lock),
+	}, },
 };
 
 #define migration_base	migration_cpu_base.clock_base[0]
@@ -1998,8 +2002,11 @@ int hrtimers_prepare_cpu(unsigned int cpu)
 	int i;
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
-		cpu_base->clock_base[i].cpu_base = cpu_base;
-		timerqueue_init_head(&cpu_base->clock_base[i].active);
+		struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i];
+
+		clock_b->cpu_base = cpu_base;
+		seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock);
+		timerqueue_init_head(&clock_b->active);
 	}
 
 	cpu_base->cpu = cpu;
-- 
cgit v1.2.3


From b13fecb1c3a603c4b8e99b306fecf4f668c11b32 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 13 Jul 2020 15:01:26 -0700
Subject: treewide: Replace DECLARE_TASKLET() with DECLARE_TASKLET_OLD()

This converts all the existing DECLARE_TASKLET() (and ...DISABLED)
macros with DECLARE_TASKLET_OLD() in preparation for refactoring the
tasklet callback type. All existing DECLARE_TASKLET() users had a "0"
data argument, it has been removed here as well.

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 drivers/input/keyboard/omap-keypad.c   |  2 +-
 drivers/input/serio/hil_mlc.c          |  2 +-
 drivers/net/wan/farsync.c              |  4 ++--
 drivers/s390/crypto/ap_bus.c           |  2 +-
 drivers/staging/most/dim2/dim2.c       |  2 +-
 drivers/staging/octeon/ethernet-tx.c   |  2 +-
 drivers/tty/vt/keyboard.c              |  2 +-
 drivers/usb/gadget/udc/snps_udc_core.c |  2 +-
 drivers/usb/host/fhci-sched.c          |  2 +-
 include/linux/interrupt.h              | 15 ++++++++++-----
 kernel/backtracetest.c                 |  2 +-
 kernel/debug/debug_core.c              |  2 +-
 kernel/irq/resend.c                    |  2 +-
 net/atm/pppoatm.c                      |  2 +-
 net/iucv/iucv.c                        |  2 +-
 sound/drivers/pcsp/pcsp_lib.c          |  2 +-
 16 files changed, 26 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/drivers/input/keyboard/omap-keypad.c b/drivers/input/keyboard/omap-keypad.c
index 5fe7a5633e33..dbe836c7ff47 100644
--- a/drivers/input/keyboard/omap-keypad.c
+++ b/drivers/input/keyboard/omap-keypad.c
@@ -46,7 +46,7 @@ struct omap_kp {
 	unsigned short keymap[];
 };
 
-static DECLARE_TASKLET_DISABLED(kp_tasklet, omap_kp_tasklet, 0);
+static DECLARE_TASKLET_DISABLED_OLD(kp_tasklet, omap_kp_tasklet);
 
 static unsigned int *row_gpios;
 static unsigned int *col_gpios;
diff --git a/drivers/input/serio/hil_mlc.c b/drivers/input/serio/hil_mlc.c
index e1423f7648d6..65f4e9d62a67 100644
--- a/drivers/input/serio/hil_mlc.c
+++ b/drivers/input/serio/hil_mlc.c
@@ -77,7 +77,7 @@ static struct timer_list	hil_mlcs_kicker;
 static int			hil_mlcs_probe;
 
 static void hil_mlcs_process(unsigned long unused);
-static DECLARE_TASKLET_DISABLED(hil_mlcs_tasklet, hil_mlcs_process, 0);
+static DECLARE_TASKLET_DISABLED_OLD(hil_mlcs_tasklet, hil_mlcs_process);
 
 
 /* #define HIL_MLC_DEBUG */
diff --git a/drivers/net/wan/farsync.c b/drivers/net/wan/farsync.c
index 7916efce7188..f5198a391417 100644
--- a/drivers/net/wan/farsync.c
+++ b/drivers/net/wan/farsync.c
@@ -569,8 +569,8 @@ static void do_bottom_half_rx(struct fst_card_info *card);
 static void fst_process_tx_work_q(unsigned long work_q);
 static void fst_process_int_work_q(unsigned long work_q);
 
-static DECLARE_TASKLET(fst_tx_task, fst_process_tx_work_q, 0);
-static DECLARE_TASKLET(fst_int_task, fst_process_int_work_q, 0);
+static DECLARE_TASKLET_OLD(fst_tx_task, fst_process_tx_work_q);
+static DECLARE_TASKLET_OLD(fst_int_task, fst_process_int_work_q);
 
 static struct fst_card_info *fst_card_array[FST_MAX_CARDS];
 static spinlock_t fst_work_q_lock;
diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c
index e71ca4a719a5..2589ccd257e3 100644
--- a/drivers/s390/crypto/ap_bus.c
+++ b/drivers/s390/crypto/ap_bus.c
@@ -93,7 +93,7 @@ static DECLARE_WORK(ap_scan_work, ap_scan_bus);
  * Tasklet & timer for AP request polling and interrupts
  */
 static void ap_tasklet_fn(unsigned long);
-static DECLARE_TASKLET(ap_tasklet, ap_tasklet_fn, 0);
+static DECLARE_TASKLET_OLD(ap_tasklet, ap_tasklet_fn);
 static DECLARE_WAIT_QUEUE_HEAD(ap_poll_wait);
 static struct task_struct *ap_poll_kthread;
 static DEFINE_MUTEX(ap_poll_thread_mutex);
diff --git a/drivers/staging/most/dim2/dim2.c b/drivers/staging/most/dim2/dim2.c
index 8e0f27e61652..509c8012d20b 100644
--- a/drivers/staging/most/dim2/dim2.c
+++ b/drivers/staging/most/dim2/dim2.c
@@ -46,7 +46,7 @@ MODULE_PARM_DESC(fcnt, "Num of frames per sub-buffer for sync channels as a powe
 static DEFINE_SPINLOCK(dim_lock);
 
 static void dim2_tasklet_fn(unsigned long data);
-static DECLARE_TASKLET(dim2_tasklet, dim2_tasklet_fn, 0);
+static DECLARE_TASKLET_OLD(dim2_tasklet, dim2_tasklet_fn);
 
 /**
  * struct hdm_channel - private structure to keep channel specific data
diff --git a/drivers/staging/octeon/ethernet-tx.c b/drivers/staging/octeon/ethernet-tx.c
index ab7dd8216006..9c71ad5af7b9 100644
--- a/drivers/staging/octeon/ethernet-tx.c
+++ b/drivers/staging/octeon/ethernet-tx.c
@@ -41,7 +41,7 @@
 #endif
 
 static void cvm_oct_tx_do_cleanup(unsigned long arg);
-static DECLARE_TASKLET(cvm_oct_tx_cleanup_tasklet, cvm_oct_tx_do_cleanup, 0);
+static DECLARE_TASKLET_OLD(cvm_oct_tx_cleanup_tasklet, cvm_oct_tx_do_cleanup);
 
 /* Maximum number of SKBs to try to free per xmit packet. */
 #define MAX_SKB_TO_FREE (MAX_OUT_QUEUE_DEPTH * 2)
diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c
index 568b2171f335..f80199984ee0 100644
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -1236,7 +1236,7 @@ static void kbd_bh(unsigned long dummy)
 	}
 }
 
-DECLARE_TASKLET_DISABLED(keyboard_tasklet, kbd_bh, 0);
+DECLARE_TASKLET_DISABLED_OLD(keyboard_tasklet, kbd_bh);
 
 #if defined(CONFIG_X86) || defined(CONFIG_IA64) || defined(CONFIG_ALPHA) ||\
     defined(CONFIG_MIPS) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) ||\
diff --git a/drivers/usb/gadget/udc/snps_udc_core.c b/drivers/usb/gadget/udc/snps_udc_core.c
index afdd28f332ce..e76f1a50b0fc 100644
--- a/drivers/usb/gadget/udc/snps_udc_core.c
+++ b/drivers/usb/gadget/udc/snps_udc_core.c
@@ -96,7 +96,7 @@ static int stop_pollstall_timer;
 static DECLARE_COMPLETION(on_pollstall_exit);
 
 /* tasklet for usb disconnect */
-static DECLARE_TASKLET(disconnect_tasklet, udc_tasklet_disconnect, 0);
+static DECLARE_TASKLET_OLD(disconnect_tasklet, udc_tasklet_disconnect);
 
 /* endpoint names used for print */
 static const char ep0_string[] = "ep0in";
diff --git a/drivers/usb/host/fhci-sched.c b/drivers/usb/host/fhci-sched.c
index 3235d5307403..5c423f240a1f 100644
--- a/drivers/usb/host/fhci-sched.c
+++ b/drivers/usb/host/fhci-sched.c
@@ -677,7 +677,7 @@ static void process_done_list(unsigned long data)
 	enable_irq(fhci_to_hcd(fhci)->irq);
 }
 
-DECLARE_TASKLET(fhci_tasklet, process_done_list, 0);
+DECLARE_TASKLET_OLD(fhci_tasklet, process_done_list);
 
 /* transfer complted callback */
 u32 fhci_transfer_confirm_callback(struct fhci_hcd *fhci)
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 5db970b6615a..b911196f03eb 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -612,12 +612,17 @@ struct tasklet_struct
 	unsigned long data;
 };
 
-#define DECLARE_TASKLET(name, func, data) \
-struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(0), func, data }
-
-#define DECLARE_TASKLET_DISABLED(name, func, data) \
-struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data }
+#define DECLARE_TASKLET_OLD(name, _func)		\
+struct tasklet_struct name = {				\
+	.count = ATOMIC_INIT(0),			\
+	.func = _func,					\
+}
 
+#define DECLARE_TASKLET_DISABLED_OLD(name, _func)	\
+struct tasklet_struct name = {				\
+	.count = ATOMIC_INIT(1),			\
+	.func = _func,					\
+}
 
 enum
 {
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index a2a97fa3071b..370217dd7e39 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -29,7 +29,7 @@ static void backtrace_test_irq_callback(unsigned long data)
 	complete(&backtrace_work);
 }
 
-static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
+static DECLARE_TASKLET_OLD(backtrace_tasklet, &backtrace_test_irq_callback);
 
 static void backtrace_test_irq(void)
 {
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 9e5934780f41..b16dbc1bf056 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -1068,7 +1068,7 @@ static void kgdb_tasklet_bpt(unsigned long ing)
 	atomic_set(&kgdb_break_tasklet_var, 0);
 }
 
-static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0);
+static DECLARE_TASKLET_OLD(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt);
 
 void kgdb_schedule_breakpoint(void)
 {
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 27634f4022d0..c48ce19a257f 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -45,7 +45,7 @@ static void resend_irqs(unsigned long arg)
 }
 
 /* Tasklet to handle resend: */
-static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
+static DECLARE_TASKLET_OLD(resend_tasklet, resend_irqs);
 
 static int irq_sw_resend(struct irq_desc *desc)
 {
diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c
index 45d8e1d5d033..579b66da1d95 100644
--- a/net/atm/pppoatm.c
+++ b/net/atm/pppoatm.c
@@ -393,7 +393,7 @@ static int pppoatm_assign_vcc(struct atm_vcc *atmvcc, void __user *arg)
 	 * Each PPPoATM instance has its own tasklet - this is just a
 	 * prototypical one used to initialize them
 	 */
-	static const DECLARE_TASKLET(tasklet_proto, pppoatm_wakeup_sender, 0);
+	static const DECLARE_TASKLET_OLD(tasklet_proto, pppoatm_wakeup_sender);
 	if (copy_from_user(&be, arg, sizeof be))
 		return -EFAULT;
 	if (be.encaps != PPPOATM_ENCAPS_AUTODETECT &&
diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
index 19250a0c85d3..cd2e468852e7 100644
--- a/net/iucv/iucv.c
+++ b/net/iucv/iucv.c
@@ -105,7 +105,7 @@ static LIST_HEAD(iucv_task_queue);
  * The tasklet for fast delivery of iucv interrupts.
  */
 static void iucv_tasklet_fn(unsigned long);
-static DECLARE_TASKLET(iucv_tasklet, iucv_tasklet_fn,0);
+static DECLARE_TASKLET_OLD(iucv_tasklet, iucv_tasklet_fn);
 
 /*
  * Queue of interrupt buffers for delivery via a work queue
diff --git a/sound/drivers/pcsp/pcsp_lib.c b/sound/drivers/pcsp/pcsp_lib.c
index 05244b11ed5e..4e79293d7f11 100644
--- a/sound/drivers/pcsp/pcsp_lib.c
+++ b/sound/drivers/pcsp/pcsp_lib.c
@@ -36,7 +36,7 @@ static void pcsp_call_pcm_elapsed(unsigned long priv)
 	}
 }
 
-static DECLARE_TASKLET(pcsp_pcm_tasklet, pcsp_call_pcm_elapsed, 0);
+static DECLARE_TASKLET_OLD(pcsp_pcm_tasklet, pcsp_call_pcm_elapsed);
 
 /* write the port and returns the next expire time in ns;
  * called at the trigger-start and in hrtimer callback
-- 
cgit v1.2.3


From 12cc923f1ccc1df467e046b02a72c2b3b321b6a2 Mon Sep 17 00:00:00 2001
From: Romain Perier <romain.perier@gmail.com>
Date: Sun, 29 Sep 2019 18:30:13 +0200
Subject: tasklet: Introduce new initialization API

Nowadays, modern kernel subsystems that use callbacks pass the data
structure associated with a given callback as argument to the callback.
The tasklet subsystem remains one which passes an arbitrary unsigned
long to the callback function. This has several problems:

- This keeps an extra field for storing the argument in each tasklet
  data structure, it bloats the tasklet_struct structure with a redundant
  .data field

- No type checking can be performed on this argument. Instead of
  using container_of() like other callback subsystems, it forces callbacks
  to do explicit type cast of the unsigned long argument into the required
  object type.

- Buffer overflows can overwrite the .func and the .data field, so
  an attacker can easily overwrite the function and its first argument
  to whatever it wants.

Add a new tasklet initialization API, via DECLARE_TASKLET() and
tasklet_setup(), which will replace the existing ones.

This work is greatly inspired by the timer_struct conversion series,
see commit e99e88a9d2b0 ("treewide: setup_timer() -> timer_setup()")

To avoid problems with both -Wcast-function-type (which is enabled in
the kernel via -Wextra is several subsystems), and with mismatched
function prototypes when build with Control Flow Integrity enabled,
this adds the "use_callback" member to let the tasklet caller choose
which union member to call through. Once all old API uses are removed,
this and the .data member will be removed as well. (On 64-bit this does
not grow the struct size as the new member fills the hole after atomic_t,
which is also "int" sized.)

Signed-off-by: Romain Perier <romain.perier@gmail.com>
Co-developed-by: Allen Pais <allen.lkml@gmail.com>
Signed-off-by: Allen Pais <allen.lkml@gmail.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Co-developed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/interrupt.h | 28 +++++++++++++++++++++++++++-
 kernel/softirq.c          | 18 +++++++++++++++++-
 2 files changed, 44 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index b911196f03eb..f9aee3538461 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -585,6 +585,9 @@ static inline struct task_struct *this_cpu_ksoftirqd(void)
 
 /* Tasklets --- multithreaded analogue of BHs.
 
+   This API is deprecated. Please consider using threaded IRQs instead:
+   https://lore.kernel.org/lkml/20200716081538.2sivhkj4hcyrusem@linutronix.de
+
    Main feature differing them of generic softirqs: tasklet
    is running only on one CPU simultaneously.
 
@@ -608,10 +611,31 @@ struct tasklet_struct
 	struct tasklet_struct *next;
 	unsigned long state;
 	atomic_t count;
-	void (*func)(unsigned long);
+	bool use_callback;
+	union {
+		void (*func)(unsigned long data);
+		void (*callback)(struct tasklet_struct *t);
+	};
 	unsigned long data;
 };
 
+#define DECLARE_TASKLET(name, _callback)		\
+struct tasklet_struct name = {				\
+	.count = ATOMIC_INIT(0),			\
+	.callback = _callback,				\
+	.use_callback = true,				\
+}
+
+#define DECLARE_TASKLET_DISABLED(name, _callback)	\
+struct tasklet_struct name = {				\
+	.count = ATOMIC_INIT(1),			\
+	.callback = _callback,				\
+	.use_callback = true,				\
+}
+
+#define from_tasklet(var, callback_tasklet, tasklet_fieldname)	\
+	container_of(callback_tasklet, typeof(*var), tasklet_fieldname)
+
 #define DECLARE_TASKLET_OLD(name, _func)		\
 struct tasklet_struct name = {				\
 	.count = ATOMIC_INIT(0),			\
@@ -691,6 +715,8 @@ extern void tasklet_kill(struct tasklet_struct *t);
 extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
 extern void tasklet_init(struct tasklet_struct *t,
 			 void (*func)(unsigned long), unsigned long data);
+extern void tasklet_setup(struct tasklet_struct *t,
+			  void (*callback)(struct tasklet_struct *));
 
 /*
  * Autoprobing for irqs:
diff --git a/kernel/softirq.c b/kernel/softirq.c
index c4201b7f42b1..292e7c2d2333 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -547,7 +547,10 @@ static void tasklet_action_common(struct softirq_action *a,
 				if (!test_and_clear_bit(TASKLET_STATE_SCHED,
 							&t->state))
 					BUG();
-				t->func(t->data);
+				if (t->use_callback)
+					t->callback(t);
+				else
+					t->func(t->data);
 				tasklet_unlock(t);
 				continue;
 			}
@@ -573,6 +576,18 @@ static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
 	tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
 }
 
+void tasklet_setup(struct tasklet_struct *t,
+		   void (*callback)(struct tasklet_struct *))
+{
+	t->next = NULL;
+	t->state = 0;
+	atomic_set(&t->count, 0);
+	t->callback = callback;
+	t->use_callback = true;
+	t->data = 0;
+}
+EXPORT_SYMBOL(tasklet_setup);
+
 void tasklet_init(struct tasklet_struct *t,
 		  void (*func)(unsigned long), unsigned long data)
 {
@@ -580,6 +595,7 @@ void tasklet_init(struct tasklet_struct *t,
 	t->state = 0;
 	atomic_set(&t->count, 0);
 	t->func = func;
+	t->use_callback = false;
 	t->data = data;
 }
 EXPORT_SYMBOL(tasklet_init);
-- 
cgit v1.2.3


From 4fc00b79b85d4c34bef06ad49f109ad7cd9e5d83 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 28 Jul 2020 15:18:01 -0700
Subject: bpf: Add missing newline characters in verifier error messages

Newline characters are added in two verifier error messages,
refactored in Commit afbf21dce668 ("bpf: Support readonly/readwrite
buffers in verifier"). This way, they do not mix with
messages afterwards.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20200728221801.1090349-1-yhs@fb.com
---
 kernel/bpf/verifier.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 88bb25d08bf8..b6ccfce3bf4c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3069,7 +3069,7 @@ static int __check_buffer_access(struct bpf_verifier_env *env,
 {
 	if (off < 0) {
 		verbose(env,
-			"R%d invalid %s buffer access: off=%d, size=%d",
+			"R%d invalid %s buffer access: off=%d, size=%d\n",
 			regno, buf_info, off, size);
 		return -EACCES;
 	}
@@ -3078,7 +3078,7 @@ static int __check_buffer_access(struct bpf_verifier_env *env,
 
 		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
 		verbose(env,
-			"R%d invalid variable buffer offset: off=%d, var_off=%s",
+			"R%d invalid variable buffer offset: off=%d, var_off=%s\n",
 			regno, off, tn_buf);
 		return -EACCES;
 	}
-- 
cgit v1.2.3


From a9d0ba6772a0056457838022b624c3d2a17cee21 Mon Sep 17 00:00:00 2001
From: Kevin Hao <haokexin@gmail.com>
Date: Thu, 30 Jul 2020 16:23:17 +0800
Subject: tracing/hwlat: Drop the duplicate assignment in start_kthread()

We have set 'current_mask' to '&save_cpumask' in its declaration,
so there is no need to assign again.

Link: https://lkml.kernel.org/r/20200730082318.42584-1-haokexin@gmail.com

Signed-off-by: Kevin Hao <haokexin@gmail.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_hwlat.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index e2be7bb7ef7e..ddb528a6cd51 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -371,7 +371,6 @@ static int start_kthread(struct trace_array *tr)
 		return 0;
 
 	/* Just pick the first CPU on first iteration */
-	current_mask = &save_cpumask;
 	get_online_cpus();
 	cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
 	put_online_cpus();
-- 
cgit v1.2.3


From 96b4833b6827a62c295b149213c68b559514c929 Mon Sep 17 00:00:00 2001
From: Kevin Hao <haokexin@gmail.com>
Date: Thu, 30 Jul 2020 16:23:18 +0800
Subject: tracing/hwlat: Honor the tracing_cpumask

In calculation of the cpu mask for the hwlat kernel thread, the wrong
cpu mask is used instead of the tracing_cpumask, this causes the
tracing/tracing_cpumask useless for hwlat tracer. Fixes it.

Link: https://lkml.kernel.org/r/20200730082318.42584-2-haokexin@gmail.com

Cc: Ingo Molnar <mingo@redhat.com>
Cc: stable@vger.kernel.org
Fixes: 0330f7aa8ee6 ("tracing: Have hwlat trace migrate across tracing_cpumask CPUs")
Signed-off-by: Kevin Hao <haokexin@gmail.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_hwlat.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index ddb528a6cd51..17873e5d0353 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -283,6 +283,7 @@ static bool disable_migrate;
 static void move_to_next_cpu(void)
 {
 	struct cpumask *current_mask = &save_cpumask;
+	struct trace_array *tr = hwlat_trace;
 	int next_cpu;
 
 	if (disable_migrate)
@@ -296,7 +297,7 @@ static void move_to_next_cpu(void)
 		goto disable;
 
 	get_online_cpus();
-	cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
+	cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
 	next_cpu = cpumask_next(smp_processor_id(), current_mask);
 	put_online_cpus();
 
@@ -372,7 +373,7 @@ static int start_kthread(struct trace_array *tr)
 
 	/* Just pick the first CPU on first iteration */
 	get_online_cpus();
-	cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
+	cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
 	put_online_cpus();
 	next_cpu = cpumask_first(current_mask);
 
-- 
cgit v1.2.3


From 8a224ffb3f52b0027f6b7279854c71a31c48fc97 Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Wed, 29 Jul 2020 02:05:53 +0800
Subject: ftrace: Setup correct FTRACE_FL_REGS flags for module

When module loaded and enabled, we will use __ftrace_replace_code
for module if any ftrace_ops referenced it found. But we will get
wrong ftrace_addr for module rec in ftrace_get_addr_new, because
rec->flags has not been setup correctly. It can cause the callback
function of a ftrace_ops has FTRACE_OPS_FL_SAVE_REGS to be called
with pt_regs set to NULL.
So setup correct FTRACE_FL_REGS flags for rec when we call
referenced_filters to find ftrace_ops references it.

Link: https://lkml.kernel.org/r/20200728180554.65203-1-zhouchengming@bytedance.com

Cc: stable@vger.kernel.org
Fixes: 8c4f3c3fa9681 ("ftrace: Check module functions being traced on reload")
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index c141d347f71a..d052f856f1cf 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6198,8 +6198,11 @@ static int referenced_filters(struct dyn_ftrace *rec)
 	int cnt = 0;
 
 	for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
-		if (ops_references_rec(ops, rec))
-		    cnt++;
+		if (ops_references_rec(ops, rec)) {
+			cnt++;
+			if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
+				rec->flags |= FTRACE_FL_REGS;
+		}
 	}
 
 	return cnt;
@@ -6378,8 +6381,8 @@ void ftrace_module_enable(struct module *mod)
 		if (ftrace_start_up)
 			cnt += referenced_filters(rec);
 
-		/* This clears FTRACE_FL_DISABLED */
-		rec->flags = cnt;
+		rec->flags &= ~FTRACE_FL_DISABLED;
+		rec->flags += cnt;
 
 		if (ftrace_start_up && cnt) {
 			int failed = __ftrace_replace_code(rec, 1);
-- 
cgit v1.2.3


From c5f51572a7fdff5be9e67c231de593155f394ab3 Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Wed, 29 Jul 2020 02:05:54 +0800
Subject: ftrace: Do not let direct or IPMODIFY ftrace_ops be added to module
 and set trampolines

When inserting a module, we find all ftrace_ops referencing it on the
ftrace_ops_list. But FTRACE_OPS_FL_DIRECT and FTRACE_OPS_FL_IPMODIFY
flags are special, and should not be set automatically. So warn and
skip ftrace_ops that have these two flags set and adding new code.
Also check if only one ftrace_ops references the module, in which case
we can use a trampoline as an optimization.

Link: https://lkml.kernel.org/r/20200728180554.65203-2-zhouchengming@bytedance.com

Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d052f856f1cf..f433cb44300a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6199,9 +6199,17 @@ static int referenced_filters(struct dyn_ftrace *rec)
 
 	for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
 		if (ops_references_rec(ops, rec)) {
+			if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_DIRECT))
+				continue;
+			if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+				continue;
 			cnt++;
 			if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
 				rec->flags |= FTRACE_FL_REGS;
+			if (cnt == 1 && ops->trampoline)
+				rec->flags |= FTRACE_FL_TRAMP;
+			else
+				rec->flags &= ~FTRACE_FL_TRAMP;
 		}
 	}
 
-- 
cgit v1.2.3


From ee896ee8051a618ea4fe3d91c58d3e6ef9faf705 Mon Sep 17 00:00:00 2001
From: Vincent Whitchurch <vincent.whitchurch@axis.com>
Date: Mon, 27 Jul 2020 11:28:40 +0200
Subject: tracing: Remove outdated comment in stack handling

This comment describes the behaviour before commit 2a820bf74918
("tracing: Use percpu stack trace buffer more intelligently").  Since
that commit, interrupts and NMIs do use the per-cpu stacks so the
comment is no longer correct.  Remove it.

(Note that the FTRACE_STACK_SIZE mentioned in the comment has never
existed, it probably should have said FTRACE_STACK_ENTRIES.)

Link: https://lkml.kernel.org/r/20200727092840.18659-1-vincent.whitchurch@axis.com

Signed-off-by: Vincent Whitchurch <vincent.whitchurch@axis.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4aab712f9567..dbcacdd56b02 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2930,12 +2930,6 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
 		skip++;
 #endif
 
-	/*
-	 * Since events can happen in NMIs there's no safe way to
-	 * use the per cpu ftrace_stacks. We reserve it and if an interrupt
-	 * or NMI comes in, it will just have to use the default
-	 * FTRACE_STACK_SIZE.
-	 */
 	preempt_disable_notrace();
 
 	stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1;
-- 
cgit v1.2.3


From 0584df9c12f449124d0bfef9899e5365604ee7a9 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Wed, 29 Jul 2020 13:09:15 +0200
Subject: lockdep: Refactor IRQ trace events fields into struct

Refactor the IRQ trace events fields, used for printing information
about the IRQ trace events, into a separate struct 'irqtrace_events'.

This improves readability by separating the information only used in
reporting, as well as enables (simplified) storing/restoring of
irqtrace_events snapshots.

No functional change intended.

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20200729110916.3920464-1-elver@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/irqflags.h | 13 +++++++++++
 include/linux/sched.h    | 11 ++-------
 kernel/fork.c            | 16 +++++--------
 kernel/locking/lockdep.c | 58 +++++++++++++++++++++++++-----------------------
 4 files changed, 50 insertions(+), 48 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 5811ee8a5cd8..bd5c55755447 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -33,6 +33,19 @@
 
 #ifdef CONFIG_TRACE_IRQFLAGS
 
+/* Per-task IRQ trace events information. */
+struct irqtrace_events {
+	unsigned int	irq_events;
+	unsigned long	hardirq_enable_ip;
+	unsigned long	hardirq_disable_ip;
+	unsigned int	hardirq_enable_event;
+	unsigned int	hardirq_disable_event;
+	unsigned long	softirq_disable_ip;
+	unsigned long	softirq_enable_ip;
+	unsigned int	softirq_disable_event;
+	unsigned int	softirq_enable_event;
+};
+
 DECLARE_PER_CPU(int, hardirqs_enabled);
 DECLARE_PER_CPU(int, hardirq_context);
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8d1de021b315..52e0fdd6a555 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -18,6 +18,7 @@
 #include <linux/mutex.h>
 #include <linux/plist.h>
 #include <linux/hrtimer.h>
+#include <linux/irqflags.h>
 #include <linux/seccomp.h>
 #include <linux/nodemask.h>
 #include <linux/rcupdate.h>
@@ -980,17 +981,9 @@ struct task_struct {
 #endif
 
 #ifdef CONFIG_TRACE_IRQFLAGS
-	unsigned int			irq_events;
+	struct irqtrace_events		irqtrace;
 	unsigned int			hardirq_threaded;
-	unsigned long			hardirq_enable_ip;
-	unsigned long			hardirq_disable_ip;
-	unsigned int			hardirq_enable_event;
-	unsigned int			hardirq_disable_event;
 	u64				hardirq_chain_key;
-	unsigned long			softirq_disable_ip;
-	unsigned long			softirq_enable_ip;
-	unsigned int			softirq_disable_event;
-	unsigned int			softirq_enable_event;
 	int				softirqs_enabled;
 	int				softirq_context;
 	int				irq_config;
diff --git a/kernel/fork.c b/kernel/fork.c
index 70d9d0a4de2a..56a640799680 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2035,17 +2035,11 @@ static __latent_entropy struct task_struct *copy_process(
 	seqcount_init(&p->mems_allowed_seq);
 #endif
 #ifdef CONFIG_TRACE_IRQFLAGS
-	p->irq_events = 0;
-	p->hardirq_enable_ip = 0;
-	p->hardirq_enable_event = 0;
-	p->hardirq_disable_ip = _THIS_IP_;
-	p->hardirq_disable_event = 0;
-	p->softirqs_enabled = 1;
-	p->softirq_enable_ip = _THIS_IP_;
-	p->softirq_enable_event = 0;
-	p->softirq_disable_ip = 0;
-	p->softirq_disable_event = 0;
-	p->softirq_context = 0;
+	memset(&p->irqtrace, 0, sizeof(p->irqtrace));
+	p->irqtrace.hardirq_disable_ip	= _THIS_IP_;
+	p->irqtrace.softirq_enable_ip	= _THIS_IP_;
+	p->softirqs_enabled		= 1;
+	p->softirq_context		= 0;
 #endif
 
 	p->pagefault_disabled = 0;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index c9ea05edce25..7b5800374c40 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3484,19 +3484,21 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
 
 void print_irqtrace_events(struct task_struct *curr)
 {
-	printk("irq event stamp: %u\n", curr->irq_events);
+	const struct irqtrace_events *trace = &curr->irqtrace;
+
+	printk("irq event stamp: %u\n", trace->irq_events);
 	printk("hardirqs last  enabled at (%u): [<%px>] %pS\n",
-		curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip,
-		(void *)curr->hardirq_enable_ip);
+		trace->hardirq_enable_event, (void *)trace->hardirq_enable_ip,
+		(void *)trace->hardirq_enable_ip);
 	printk("hardirqs last disabled at (%u): [<%px>] %pS\n",
-		curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip,
-		(void *)curr->hardirq_disable_ip);
+		trace->hardirq_disable_event, (void *)trace->hardirq_disable_ip,
+		(void *)trace->hardirq_disable_ip);
 	printk("softirqs last  enabled at (%u): [<%px>] %pS\n",
-		curr->softirq_enable_event, (void *)curr->softirq_enable_ip,
-		(void *)curr->softirq_enable_ip);
+		trace->softirq_enable_event, (void *)trace->softirq_enable_ip,
+		(void *)trace->softirq_enable_ip);
 	printk("softirqs last disabled at (%u): [<%px>] %pS\n",
-		curr->softirq_disable_event, (void *)curr->softirq_disable_ip,
-		(void *)curr->softirq_disable_ip);
+		trace->softirq_disable_event, (void *)trace->softirq_disable_ip,
+		(void *)trace->softirq_disable_ip);
 }
 
 static int HARDIRQ_verbose(struct lock_class *class)
@@ -3699,7 +3701,7 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_on_prepare);
 
 void noinstr lockdep_hardirqs_on(unsigned long ip)
 {
-	struct task_struct *curr = current;
+	struct irqtrace_events *trace = &current->irqtrace;
 
 	if (unlikely(!debug_locks))
 		return;
@@ -3752,8 +3754,8 @@ void noinstr lockdep_hardirqs_on(unsigned long ip)
 skip_checks:
 	/* we'll do an OFF -> ON transition: */
 	this_cpu_write(hardirqs_enabled, 1);
-	curr->hardirq_enable_ip = ip;
-	curr->hardirq_enable_event = ++curr->irq_events;
+	trace->hardirq_enable_ip = ip;
+	trace->hardirq_enable_event = ++trace->irq_events;
 	debug_atomic_inc(hardirqs_on_events);
 }
 EXPORT_SYMBOL_GPL(lockdep_hardirqs_on);
@@ -3763,8 +3765,6 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_on);
  */
 void noinstr lockdep_hardirqs_off(unsigned long ip)
 {
-	struct task_struct *curr = current;
-
 	if (unlikely(!debug_locks))
 		return;
 
@@ -3784,12 +3784,14 @@ void noinstr lockdep_hardirqs_off(unsigned long ip)
 		return;
 
 	if (lockdep_hardirqs_enabled()) {
+		struct irqtrace_events *trace = &current->irqtrace;
+
 		/*
 		 * We have done an ON -> OFF transition:
 		 */
 		this_cpu_write(hardirqs_enabled, 0);
-		curr->hardirq_disable_ip = ip;
-		curr->hardirq_disable_event = ++curr->irq_events;
+		trace->hardirq_disable_ip = ip;
+		trace->hardirq_disable_event = ++trace->irq_events;
 		debug_atomic_inc(hardirqs_off_events);
 	} else {
 		debug_atomic_inc(redundant_hardirqs_off);
@@ -3802,7 +3804,7 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_off);
  */
 void lockdep_softirqs_on(unsigned long ip)
 {
-	struct task_struct *curr = current;
+	struct irqtrace_events *trace = &current->irqtrace;
 
 	if (unlikely(!debug_locks || current->lockdep_recursion))
 		return;
@@ -3814,7 +3816,7 @@ void lockdep_softirqs_on(unsigned long ip)
 	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
 		return;
 
-	if (curr->softirqs_enabled) {
+	if (current->softirqs_enabled) {
 		debug_atomic_inc(redundant_softirqs_on);
 		return;
 	}
@@ -3823,9 +3825,9 @@ void lockdep_softirqs_on(unsigned long ip)
 	/*
 	 * We'll do an OFF -> ON transition:
 	 */
-	curr->softirqs_enabled = 1;
-	curr->softirq_enable_ip = ip;
-	curr->softirq_enable_event = ++curr->irq_events;
+	current->softirqs_enabled = 1;
+	trace->softirq_enable_ip = ip;
+	trace->softirq_enable_event = ++trace->irq_events;
 	debug_atomic_inc(softirqs_on_events);
 	/*
 	 * We are going to turn softirqs on, so set the
@@ -3833,7 +3835,7 @@ void lockdep_softirqs_on(unsigned long ip)
 	 * enabled too:
 	 */
 	if (lockdep_hardirqs_enabled())
-		mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ);
+		mark_held_locks(current, LOCK_ENABLED_SOFTIRQ);
 	lockdep_recursion_finish();
 }
 
@@ -3842,8 +3844,6 @@ void lockdep_softirqs_on(unsigned long ip)
  */
 void lockdep_softirqs_off(unsigned long ip)
 {
-	struct task_struct *curr = current;
-
 	if (unlikely(!debug_locks || current->lockdep_recursion))
 		return;
 
@@ -3853,13 +3853,15 @@ void lockdep_softirqs_off(unsigned long ip)
 	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
 		return;
 
-	if (curr->softirqs_enabled) {
+	if (current->softirqs_enabled) {
+		struct irqtrace_events *trace = &current->irqtrace;
+
 		/*
 		 * We have done an ON -> OFF transition:
 		 */
-		curr->softirqs_enabled = 0;
-		curr->softirq_disable_ip = ip;
-		curr->softirq_disable_event = ++curr->irq_events;
+		current->softirqs_enabled = 0;
+		trace->softirq_disable_ip = ip;
+		trace->softirq_disable_event = ++trace->irq_events;
 		debug_atomic_inc(softirqs_off_events);
 		/*
 		 * Whoops, we wanted softirqs off, so why aren't they?
-- 
cgit v1.2.3


From 92c209ac6d3d35783c16c8a717547183e6e11162 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Wed, 29 Jul 2020 13:09:16 +0200
Subject: kcsan: Improve IRQ state trace reporting

To improve the general usefulness of the IRQ state trace events with
KCSAN enabled, save and restore the trace information when entering and
exiting the KCSAN runtime as well as when generating a KCSAN report.

Without this, reporting the IRQ trace events (whether via a KCSAN report
or outside of KCSAN via a lockdep report) is rather useless due to
continuously being touched by KCSAN. This is because if KCSAN is
enabled, every instrumented memory access causes changes to IRQ trace
events (either by KCSAN disabling/enabling interrupts or taking
report_lock when generating a report).

Before "lockdep: Prepare for NMI IRQ state tracking", KCSAN avoided
touching the IRQ trace events via raw_local_irq_save/restore() and
lockdep_off/on().

Fixes: 248591f5d257 ("kcsan: Make KCSAN compatible with new IRQ state tracking")
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20200729110916.3920464-2-elver@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |  4 ++++
 kernel/kcsan/core.c   | 23 +++++++++++++++++++++++
 kernel/kcsan/kcsan.h  |  7 +++++++
 kernel/kcsan/report.c |  3 +++
 4 files changed, 37 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 52e0fdd6a555..060e9214c8b5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1184,8 +1184,12 @@ struct task_struct {
 #ifdef CONFIG_KASAN
 	unsigned int			kasan_depth;
 #endif
+
 #ifdef CONFIG_KCSAN
 	struct kcsan_ctx		kcsan_ctx;
+#ifdef CONFIG_TRACE_IRQFLAGS
+	struct irqtrace_events		kcsan_save_irqtrace;
+#endif
 #endif
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 732623c30359..0fe068192781 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -291,6 +291,20 @@ static inline unsigned int get_delay(void)
 				0);
 }
 
+void kcsan_save_irqtrace(struct task_struct *task)
+{
+#ifdef CONFIG_TRACE_IRQFLAGS
+	task->kcsan_save_irqtrace = task->irqtrace;
+#endif
+}
+
+void kcsan_restore_irqtrace(struct task_struct *task)
+{
+#ifdef CONFIG_TRACE_IRQFLAGS
+	task->irqtrace = task->kcsan_save_irqtrace;
+#endif
+}
+
 /*
  * Pull everything together: check_access() below contains the performance
  * critical operations; the fast-path (including check_access) functions should
@@ -336,9 +350,11 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
 	flags = user_access_save();
 
 	if (consumed) {
+		kcsan_save_irqtrace(current);
 		kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_MAYBE,
 			     KCSAN_REPORT_CONSUMED_WATCHPOINT,
 			     watchpoint - watchpoints);
+		kcsan_restore_irqtrace(current);
 	} else {
 		/*
 		 * The other thread may not print any diagnostics, as it has
@@ -396,6 +412,12 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
 		goto out;
 	}
 
+	/*
+	 * Save and restore the IRQ state trace touched by KCSAN, since KCSAN's
+	 * runtime is entered for every memory access, and potentially useful
+	 * information is lost if dirtied by KCSAN.
+	 */
+	kcsan_save_irqtrace(current);
 	if (!kcsan_interrupt_watcher)
 		local_irq_save(irq_flags);
 
@@ -539,6 +561,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
 out_unlock:
 	if (!kcsan_interrupt_watcher)
 		local_irq_restore(irq_flags);
+	kcsan_restore_irqtrace(current);
 out:
 	user_access_restore(ua_flags);
 }
diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h
index 763d6d08d94b..29480010dc30 100644
--- a/kernel/kcsan/kcsan.h
+++ b/kernel/kcsan/kcsan.h
@@ -9,6 +9,7 @@
 #define _KERNEL_KCSAN_KCSAN_H
 
 #include <linux/kcsan.h>
+#include <linux/sched.h>
 
 /* The number of adjacent watchpoints to check. */
 #define KCSAN_CHECK_ADJACENT 1
@@ -22,6 +23,12 @@ extern unsigned int kcsan_udelay_interrupt;
  */
 extern bool kcsan_enabled;
 
+/*
+ * Save/restore IRQ flags state trace dirtied by KCSAN.
+ */
+void kcsan_save_irqtrace(struct task_struct *task);
+void kcsan_restore_irqtrace(struct task_struct *task);
+
 /*
  * Initialize debugfs file.
  */
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index 6b2fb1a6d8cd..9d07e175de0f 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -308,6 +308,9 @@ static void print_verbose_info(struct task_struct *task)
 	if (!task)
 		return;
 
+	/* Restore IRQ state trace for printing. */
+	kcsan_restore_irqtrace(task);
+
 	pr_err("\n");
 	debug_show_held_locks(task);
 	print_irqtrace_events(task);
-- 
cgit v1.2.3


From f4470cdf108f00533e8079b19434e6cb48c17fa3 Mon Sep 17 00:00:00 2001
From: Valentin Schneider <valentin.schneider@arm.com>
Date: Fri, 31 Jul 2020 20:20:14 +0100
Subject: sched: Document arch_scale_*_capacity()

Rather that hide their purpose in some dark, damp corner of Documentation/,
add some documentation to the default implementations.

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20200731192016.7484-2-valentin.schneider@arm.com
---
 include/linux/sched/topology.h | 10 ++++++++++
 kernel/sched/sched.h           | 10 ++++++++++
 2 files changed, 20 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 764222d637b7..820511289857 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -217,6 +217,16 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
 #endif	/* !CONFIG_SMP */
 
 #ifndef arch_scale_cpu_capacity
+/**
+ * arch_scale_cpu_capacity - get the capacity scale factor of a given CPU.
+ * @cpu: the CPU in question.
+ *
+ * Return: the CPU scale factor normalized against SCHED_CAPACITY_SCALE, i.e.
+ *
+ *             max_perf(cpu)
+ *      ----------------------------- * SCHED_CAPACITY_SCALE
+ *      max(max_perf(c) : c \in CPUs)
+ */
 static __always_inline
 unsigned long arch_scale_cpu_capacity(int cpu)
 {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 296efd30d8c9..3fd283892761 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2049,6 +2049,16 @@ void arch_scale_freq_tick(void)
 #endif
 
 #ifndef arch_scale_freq_capacity
+/**
+ * arch_scale_freq_capacity - get the frequency scale factor of a given CPU.
+ * @cpu: the CPU in question.
+ *
+ * Return: the frequency scale factor normalized against SCHED_CAPACITY_SCALE, i.e.
+ *
+ *     f_curr
+ *     ------ * SCHED_CAPACITY_SCALE
+ *     f_max
+ */
 static __always_inline
 unsigned long arch_scale_freq_capacity(int cpu)
 {
-- 
cgit v1.2.3


From 7ef5264de773279b9f23b6cc8afb5addb30e970b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 30 Jul 2020 08:10:20 +0200
Subject: modules: mark ref_module static

ref_module isn't used anywhere outside of module.c.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 include/linux/module.h | 1 -
 kernel/module.c        | 6 ++----
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/module.h b/include/linux/module.h
index 2e6670860d27..f1fdbeef2153 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -657,7 +657,6 @@ static inline void __module_get(struct module *module)
 #define symbol_put_addr(p) do { } while (0)
 
 #endif /* CONFIG_MODULE_UNLOAD */
-int ref_module(struct module *a, struct module *b);
 
 /* This is a #define so the string doesn't get put in every .o file */
 #define module_name(mod)			\
diff --git a/kernel/module.c b/kernel/module.c
index e8a198588f26..baae0e83d630 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -869,7 +869,7 @@ static int add_module_usage(struct module *a, struct module *b)
 }
 
 /* Module a uses b: caller needs module_mutex() */
-int ref_module(struct module *a, struct module *b)
+static int ref_module(struct module *a, struct module *b)
 {
 	int err;
 
@@ -888,7 +888,6 @@ int ref_module(struct module *a, struct module *b)
 	}
 	return 0;
 }
-EXPORT_SYMBOL_GPL(ref_module);
 
 /* Clear the unload stuff of the module. */
 static void module_unload_free(struct module *mod)
@@ -1169,11 +1168,10 @@ static inline void module_unload_free(struct module *mod)
 {
 }
 
-int ref_module(struct module *a, struct module *b)
+static int ref_module(struct module *a, struct module *b)
 {
 	return strong_try_module_get(b);
 }
-EXPORT_SYMBOL_GPL(ref_module);
 
 static inline int module_unload_init(struct module *mod)
 {
-- 
cgit v1.2.3


From 773110470e2fa3839523384ae014f8a723c4d178 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 30 Jul 2020 08:10:21 +0200
Subject: modules: mark find_symbol static

find_symbol is only used in module.c.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 include/linux/module.h | 11 -----------
 kernel/module.c        |  3 +--
 2 files changed, 1 insertion(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/module.h b/include/linux/module.h
index f1fdbeef2153..90bdc362be36 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -590,17 +590,6 @@ struct symsearch {
 	bool unused;
 };
 
-/*
- * Search for an exported symbol by name.
- *
- * Must be called with module_mutex held or preemption disabled.
- */
-const struct kernel_symbol *find_symbol(const char *name,
-					struct module **owner,
-					const s32 **crc,
-					bool gplok,
-					bool warn);
-
 /*
  * Walk the exported symbol table
  *
diff --git a/kernel/module.c b/kernel/module.c
index baae0e83d630..0f95fb4b3e37 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -585,7 +585,7 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms,
 
 /* Find an exported symbol and return it, along with, (optional) crc and
  * (optional) module which owns it.  Needs preempt disabled or module_mutex. */
-const struct kernel_symbol *find_symbol(const char *name,
+static const struct kernel_symbol *find_symbol(const char *name,
 					struct module **owner,
 					const s32 **crc,
 					bool gplok,
@@ -608,7 +608,6 @@ const struct kernel_symbol *find_symbol(const char *name,
 	pr_debug("Failed to find symbol %s\n", name);
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(find_symbol);
 
 /*
  * Search for module by name: must hold module_mutex (or preempt disabled
-- 
cgit v1.2.3


From a54e04914c211b5678602a46b3ede5d82ec1327d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 30 Jul 2020 08:10:22 +0200
Subject: modules: mark each_symbol_section static

each_symbol_section is only used inside of module.c.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 include/linux/module.h | 9 ---------
 kernel/module.c        | 3 +--
 2 files changed, 1 insertion(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/module.h b/include/linux/module.h
index 90bdc362be36..b79219eed83c 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -590,15 +590,6 @@ struct symsearch {
 	bool unused;
 };
 
-/*
- * Walk the exported symbol table
- *
- * Must be called with module_mutex held or preemption disabled.
- */
-bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
-				    struct module *owner,
-				    void *data), void *data);
-
 /* Returns 0 and fills in value, defined and namebuf, or -ERANGE if
    symnum out of range. */
 int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
diff --git a/kernel/module.c b/kernel/module.c
index 0f95fb4b3e37..c2a099a27b68 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -422,7 +422,7 @@ static bool each_symbol_in_section(const struct symsearch *arr,
 }
 
 /* Returns true as soon as fn returns true, otherwise false. */
-bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
+static bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
 				    struct module *owner,
 				    void *data),
 			 void *data)
@@ -484,7 +484,6 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
 	}
 	return false;
 }
-EXPORT_SYMBOL_GPL(each_symbol_section);
 
 struct find_symbol_arg {
 	/* Input */
-- 
cgit v1.2.3


From 3fe1e56d0e68b623dd62d8d38265d2a052e7e185 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 30 Jul 2020 08:10:23 +0200
Subject: modules: unexport __module_text_address

__module_text_address is only used by built-in code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 kernel/module.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index c2a099a27b68..6ee1739e3150 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -4503,7 +4503,6 @@ struct module *__module_text_address(unsigned long addr)
 	}
 	return mod;
 }
-EXPORT_SYMBOL_GPL(__module_text_address);
 
 /* Don't grab lock, we're oopsing. */
 void print_modules(void)
-- 
cgit v1.2.3


From 34e64705ad415ed7a816e60ef62b42fe6d1729d9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 30 Jul 2020 08:10:24 +0200
Subject: modules: unexport __module_address

__module_address is only used by built-in code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 kernel/module.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 6ee1739e3150..e85d06158fbc 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -4464,7 +4464,6 @@ struct module *__module_address(unsigned long addr)
 	}
 	return mod;
 }
-EXPORT_SYMBOL_GPL(__module_address);
 
 /*
  * is_module_text_address - is this address inside module code?
-- 
cgit v1.2.3


From cd8732cdcc37d7077c4fa2c966b748c0662b607e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 30 Jul 2020 08:10:25 +0200
Subject: modules: rename the licence field in struct symsearch to license

Use the same spelling variant as the rest of the file.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 include/linux/module.h | 2 +-
 kernel/module.c        | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/module.h b/include/linux/module.h
index b79219eed83c..be04ba2f881d 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -586,7 +586,7 @@ struct symsearch {
 		NOT_GPL_ONLY,
 		GPL_ONLY,
 		WILL_BE_GPL_ONLY,
-	} licence;
+	} license;
 	bool unused;
 };
 
diff --git a/kernel/module.c b/kernel/module.c
index e85d06158fbc..62d817a0dca8 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -504,9 +504,9 @@ static bool check_exported_symbol(const struct symsearch *syms,
 	struct find_symbol_arg *fsa = data;
 
 	if (!fsa->gplok) {
-		if (syms->licence == GPL_ONLY)
+		if (syms->license == GPL_ONLY)
 			return false;
-		if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) {
+		if (syms->license == WILL_BE_GPL_ONLY && fsa->warn) {
 			pr_warn("Symbol %s is being used by a non-GPL module, "
 				"which will not be allowed in the future\n",
 				fsa->name);
-- 
cgit v1.2.3


From ef1dac6021cc8ec5de02ce31722bf26ac4ed5523 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 30 Jul 2020 08:10:26 +0200
Subject: modules: return licensing information from find_symbol

Report the GPLONLY status through a new argument.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 include/linux/module.h |  2 +-
 kernel/module.c        | 16 +++++++++++-----
 2 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/module.h b/include/linux/module.h
index be04ba2f881d..30b0f5fcdb3c 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -582,7 +582,7 @@ struct module *find_module(const char *name);
 struct symsearch {
 	const struct kernel_symbol *start, *stop;
 	const s32 *crcs;
-	enum {
+	enum mod_license {
 		NOT_GPL_ONLY,
 		GPL_ONLY,
 		WILL_BE_GPL_ONLY,
diff --git a/kernel/module.c b/kernel/module.c
index 62d817a0dca8..656f5ff27088 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -495,6 +495,7 @@ struct find_symbol_arg {
 	struct module *owner;
 	const s32 *crc;
 	const struct kernel_symbol *sym;
+	enum mod_license license;
 };
 
 static bool check_exported_symbol(const struct symsearch *syms,
@@ -528,6 +529,7 @@ static bool check_exported_symbol(const struct symsearch *syms,
 	fsa->owner = owner;
 	fsa->crc = symversion(syms->crcs, symnum);
 	fsa->sym = &syms->start[symnum];
+	fsa->license = syms->license;
 	return true;
 }
 
@@ -587,6 +589,7 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms,
 static const struct kernel_symbol *find_symbol(const char *name,
 					struct module **owner,
 					const s32 **crc,
+					enum mod_license *license,
 					bool gplok,
 					bool warn)
 {
@@ -601,6 +604,8 @@ static const struct kernel_symbol *find_symbol(const char *name,
 			*owner = fsa.owner;
 		if (crc)
 			*crc = fsa.crc;
+		if (license)
+			*license = fsa.license;
 		return fsa.sym;
 	}
 
@@ -1074,7 +1079,7 @@ void __symbol_put(const char *symbol)
 	struct module *owner;
 
 	preempt_disable();
-	if (!find_symbol(symbol, &owner, NULL, true, false))
+	if (!find_symbol(symbol, &owner, NULL, NULL, true, false))
 		BUG();
 	module_put(owner);
 	preempt_enable();
@@ -1352,7 +1357,7 @@ static inline int check_modstruct_version(const struct load_info *info,
 	 * locking is necessary -- use preempt_disable() to placate lockdep.
 	 */
 	preempt_disable();
-	if (!find_symbol("module_layout", NULL, &crc, true, false)) {
+	if (!find_symbol("module_layout", NULL, &crc, NULL, true, false)) {
 		preempt_enable();
 		BUG();
 	}
@@ -1436,6 +1441,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
 	struct module *owner;
 	const struct kernel_symbol *sym;
 	const s32 *crc;
+	enum mod_license license;
 	int err;
 
 	/*
@@ -1445,7 +1451,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
 	 */
 	sched_annotate_sleep();
 	mutex_lock(&module_mutex);
-	sym = find_symbol(name, &owner, &crc,
+	sym = find_symbol(name, &owner, &crc, &license,
 			  !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
 	if (!sym)
 		goto unlock;
@@ -2213,7 +2219,7 @@ void *__symbol_get(const char *symbol)
 	const struct kernel_symbol *sym;
 
 	preempt_disable();
-	sym = find_symbol(symbol, &owner, NULL, true, true);
+	sym = find_symbol(symbol, &owner, NULL, NULL, true, true);
 	if (sym && strong_try_module_get(owner))
 		sym = NULL;
 	preempt_enable();
@@ -2249,7 +2255,7 @@ static int verify_exported_symbols(struct module *mod)
 	for (i = 0; i < ARRAY_SIZE(arr); i++) {
 		for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
 			if (find_symbol(kernel_symbol_name(s), &owner, NULL,
-					true, false)) {
+					NULL, true, false)) {
 				pr_err("%s: exports duplicate symbol %s"
 				       " (owned by %s)\n",
 				       mod->name, kernel_symbol_name(s),
-- 
cgit v1.2.3


From 73b11c2ab072d5b0599d1e12cc126f55ee306daf Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andriin@fb.com>
Date: Fri, 31 Jul 2020 11:28:26 -0700
Subject: bpf: Add support for forced LINK_DETACH command

Add LINK_DETACH command to force-detach bpf_link without destroying it. It has
the same behavior as auto-detaching of bpf_link due to cgroup dying for
bpf_cgroup_link or net_device being destroyed for bpf_xdp_link. In such case,
bpf_link is still a valid kernel object, but is defuncts and doesn't hold BPF
program attached to corresponding BPF hook. This functionality allows users
with enough access rights to manually force-detach attached bpf_link without
killing respective owner process.

This patch implements LINK_DETACH for cgroup, xdp, and netns links, mostly
re-using existing link release handling code.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20200731182830.286260-2-andriin@fb.com
---
 include/linux/bpf.h        |  1 +
 include/uapi/linux/bpf.h   |  5 +++++
 kernel/bpf/cgroup.c        | 15 ++++++++++++++-
 kernel/bpf/net_namespace.c |  8 ++++++++
 kernel/bpf/syscall.c       | 26 ++++++++++++++++++++++++++
 net/core/dev.c             | 11 ++++++++++-
 6 files changed, 64 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 40c5e206ecf2..cef4ef0d2b4e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -793,6 +793,7 @@ struct bpf_link {
 struct bpf_link_ops {
 	void (*release)(struct bpf_link *link);
 	void (*dealloc)(struct bpf_link *link);
+	int (*detach)(struct bpf_link *link);
 	int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog,
 			   struct bpf_prog *old_prog);
 	void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index eb5e0c38eb2c..b134e679e9db 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -117,6 +117,7 @@ enum bpf_cmd {
 	BPF_LINK_GET_NEXT_ID,
 	BPF_ENABLE_STATS,
 	BPF_ITER_CREATE,
+	BPF_LINK_DETACH,
 };
 
 enum bpf_map_type {
@@ -634,6 +635,10 @@ union bpf_attr {
 		__u32		old_prog_fd;
 	} link_update;
 
+	struct {
+		__u32		link_fd;
+	} link_detach;
+
 	struct { /* struct used by BPF_ENABLE_STATS command */
 		__u32		type;
 	} enable_stats;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 957cce1d5168..83ff127ef7ae 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -814,6 +814,7 @@ static void bpf_cgroup_link_release(struct bpf_link *link)
 {
 	struct bpf_cgroup_link *cg_link =
 		container_of(link, struct bpf_cgroup_link, link);
+	struct cgroup *cg;
 
 	/* link might have been auto-detached by dying cgroup already,
 	 * in that case our work is done here
@@ -832,8 +833,12 @@ static void bpf_cgroup_link_release(struct bpf_link *link)
 	WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
 				    cg_link->type));
 
+	cg = cg_link->cgroup;
+	cg_link->cgroup = NULL;
+
 	mutex_unlock(&cgroup_mutex);
-	cgroup_put(cg_link->cgroup);
+
+	cgroup_put(cg);
 }
 
 static void bpf_cgroup_link_dealloc(struct bpf_link *link)
@@ -844,6 +849,13 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link)
 	kfree(cg_link);
 }
 
+static int bpf_cgroup_link_detach(struct bpf_link *link)
+{
+	bpf_cgroup_link_release(link);
+
+	return 0;
+}
+
 static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
 					struct seq_file *seq)
 {
@@ -883,6 +895,7 @@ static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
 static const struct bpf_link_ops bpf_cgroup_link_lops = {
 	.release = bpf_cgroup_link_release,
 	.dealloc = bpf_cgroup_link_dealloc,
+	.detach = bpf_cgroup_link_detach,
 	.update_prog = cgroup_bpf_replace,
 	.show_fdinfo = bpf_cgroup_link_show_fdinfo,
 	.fill_link_info = bpf_cgroup_link_fill_link_info,
diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c
index 71405edd667c..542f275bf252 100644
--- a/kernel/bpf/net_namespace.c
+++ b/kernel/bpf/net_namespace.c
@@ -142,9 +142,16 @@ static void bpf_netns_link_release(struct bpf_link *link)
 	bpf_prog_array_free(old_array);
 
 out_unlock:
+	net_link->net = NULL;
 	mutex_unlock(&netns_bpf_mutex);
 }
 
+static int bpf_netns_link_detach(struct bpf_link *link)
+{
+	bpf_netns_link_release(link);
+	return 0;
+}
+
 static void bpf_netns_link_dealloc(struct bpf_link *link)
 {
 	struct bpf_netns_link *net_link =
@@ -228,6 +235,7 @@ static void bpf_netns_link_show_fdinfo(const struct bpf_link *link,
 static const struct bpf_link_ops bpf_netns_link_ops = {
 	.release = bpf_netns_link_release,
 	.dealloc = bpf_netns_link_dealloc,
+	.detach = bpf_netns_link_detach,
 	.update_prog = bpf_netns_link_update_prog,
 	.fill_link_info = bpf_netns_link_fill_info,
 	.show_fdinfo = bpf_netns_link_show_fdinfo,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cd3d599e9e90..2f343ce15747 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3991,6 +3991,29 @@ out_put_link:
 	return ret;
 }
 
+#define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd
+
+static int link_detach(union bpf_attr *attr)
+{
+	struct bpf_link *link;
+	int ret;
+
+	if (CHECK_ATTR(BPF_LINK_DETACH))
+		return -EINVAL;
+
+	link = bpf_link_get_from_fd(attr->link_detach.link_fd);
+	if (IS_ERR(link))
+		return PTR_ERR(link);
+
+	if (link->ops->detach)
+		ret = link->ops->detach(link);
+	else
+		ret = -EOPNOTSUPP;
+
+	bpf_link_put(link);
+	return ret;
+}
+
 static int bpf_link_inc_not_zero(struct bpf_link *link)
 {
 	return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT;
@@ -4240,6 +4263,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_ITER_CREATE:
 		err = bpf_iter_create(&attr);
 		break;
+	case BPF_LINK_DETACH:
+		err = link_detach(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/net/core/dev.c b/net/core/dev.c
index a2a57988880a..c8b911b10187 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8979,12 +8979,20 @@ static void bpf_xdp_link_release(struct bpf_link *link)
 	/* if racing with net_device's tear down, xdp_link->dev might be
 	 * already NULL, in which case link was already auto-detached
 	 */
-	if (xdp_link->dev)
+	if (xdp_link->dev) {
 		WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
+		xdp_link->dev = NULL;
+	}
 
 	rtnl_unlock();
 }
 
+static int bpf_xdp_link_detach(struct bpf_link *link)
+{
+	bpf_xdp_link_release(link);
+	return 0;
+}
+
 static void bpf_xdp_link_dealloc(struct bpf_link *link)
 {
 	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
@@ -9066,6 +9074,7 @@ out_unlock:
 static const struct bpf_link_ops bpf_xdp_link_lops = {
 	.release = bpf_xdp_link_release,
 	.dealloc = bpf_xdp_link_dealloc,
+	.detach = bpf_xdp_link_detach,
 	.show_fdinfo = bpf_xdp_link_show_fdinfo,
 	.fill_link_info = bpf_xdp_link_fill_link_info,
 	.update_prog = bpf_xdp_link_update,
-- 
cgit v1.2.3


From c6fe44d96fc1536af5b11cd859686453d1b7bfd1 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 23 Jul 2020 12:33:41 -0700
Subject: list: add "list_del_init_careful()" to go with "list_empty_careful()"

That gives us ordering guarantees around the pair.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/list.h | 20 +++++++++++++++++++-
 kernel/sched/wait.c  |  2 +-
 mm/filemap.c         |  7 +------
 3 files changed, 21 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/list.h b/include/linux/list.h
index aff44d34f4e4..0d0d17a10d25 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -282,6 +282,24 @@ static inline int list_empty(const struct list_head *head)
 	return READ_ONCE(head->next) == head;
 }
 
+/**
+ * list_del_init_careful - deletes entry from list and reinitialize it.
+ * @entry: the element to delete from the list.
+ *
+ * This is the same as list_del_init(), except designed to be used
+ * together with list_empty_careful() in a way to guarantee ordering
+ * of other memory operations.
+ *
+ * Any memory operations done before a list_del_init_careful() are
+ * guaranteed to be visible after a list_empty_careful() test.
+ */
+static inline void list_del_init_careful(struct list_head *entry)
+{
+	__list_del_entry(entry);
+	entry->prev = entry;
+	smp_store_release(&entry->next, entry);
+}
+
 /**
  * list_empty_careful - tests whether a list is empty and not being modified
  * @head: the list to test
@@ -297,7 +315,7 @@ static inline int list_empty(const struct list_head *head)
  */
 static inline int list_empty_careful(const struct list_head *head)
 {
-	struct list_head *next = head->next;
+	struct list_head *next = smp_load_acquire(&head->next);
 	return (next == head) && (next == head->prev);
 }
 
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index ba059fbfc53a..01f5d3020589 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -389,7 +389,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
 	int ret = default_wake_function(wq_entry, mode, sync, key);
 
 	if (ret)
-		list_del_init(&wq_entry->entry);
+		list_del_init_careful(&wq_entry->entry);
 
 	return ret;
 }
diff --git a/mm/filemap.c b/mm/filemap.c
index 8c3d3e233d37..991503bbf922 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1041,13 +1041,8 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
 	 * since after list_del_init(&wait->entry) the wait entry
 	 * might be de-allocated and the process might even have
 	 * exited.
-	 *
-	 * We _really_ should have a "list_del_init_careful()" to
-	 * properly pair with the unlocked "list_empty_careful()"
-	 * in finish_wait().
 	 */
-	smp_mb();
-	list_del_init(&wait->entry);
+	list_del_init_careful(&wait->entry);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 0f69dae4d1069963fdcc16e63655927d83281006 Mon Sep 17 00:00:00 2001
From: Zhaoyang Huang <huangzhaoyang@gmail.com>
Date: Fri, 31 Jul 2020 08:27:45 +0800
Subject: trace : Have tracing buffer info use kvzalloc instead of kzalloc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

High order memory stuff within trace could introduce OOM, use kvzalloc instead.

Please find the bellowing for the call stack we run across in an android system.
The scenario happens when traced_probes is woken up to get a large quantity of
trace even if free memory is even higher than watermark_low. 

traced_probes invoked oom-killer: gfp_mask=0x140c0c0(GFP_KERNEL|__GFP_COMP|__GFP_ZERO), nodemask=(null),  order=2, oom_score_adj=-1

traced_probes cpuset=system-background mems_allowed=0
CPU: 3 PID: 588 Comm: traced_probes Tainted: G        W  O    4.14.181 #1
Hardware name: Generic DT based system
(unwind_backtrace) from [<c010d824>] (show_stack+0x20/0x24)
(show_stack) from [<c0b2e174>] (dump_stack+0xa8/0xec)
(dump_stack) from [<c027d584>] (dump_header+0x9c/0x220)
(dump_header) from [<c027cfe4>] (oom_kill_process+0xc0/0x5c4)
(oom_kill_process) from [<c027cb94>] (out_of_memory+0x220/0x310)
(out_of_memory) from [<c02816bc>] (__alloc_pages_nodemask+0xff8/0x13a4)
(__alloc_pages_nodemask) from [<c02a6a1c>] (kmalloc_order+0x30/0x48)
(kmalloc_order) from [<c02a6a64>] (kmalloc_order_trace+0x30/0x118)
(kmalloc_order_trace) from [<c0223d7c>] (tracing_buffers_open+0x50/0xfc)
(tracing_buffers_open) from [<c02e6f58>] (do_dentry_open+0x278/0x34c)
(do_dentry_open) from [<c02e70d0>] (vfs_open+0x50/0x70)
(vfs_open) from [<c02f7c24>] (path_openat+0x5fc/0x169c)
(path_openat) from [<c02f75c4>] (do_filp_open+0x94/0xf8)
(do_filp_open) from [<c02e7650>] (do_sys_open+0x168/0x26c)
(do_sys_open) from [<c02e77bc>] (SyS_openat+0x34/0x38)
(SyS_openat) from [<c0108bc0>] (ret_fast_syscall+0x0/0x28)

Link: https://lkml.kernel.org/r/1596155265-32365-1-git-send-email-zhaoyang.huang@unisoc.com

Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dbcacdd56b02..06c0feae5ff9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7402,7 +7402,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
 	if (ret)
 		return ret;
 
-	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	info = kvzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info) {
 		trace_array_put(tr);
 		return -ENOMEM;
@@ -7528,7 +7528,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
 	if (info->spare)
 		ring_buffer_free_read_page(iter->array_buffer->buffer,
 					   info->spare_cpu, info->spare);
-	kfree(info);
+	kvfree(info);
 
 	mutex_unlock(&trace_types_lock);
 
-- 
cgit v1.2.3


From c58b6b0372de0d4cd0536d6585addd1b36b151ae Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 24 Jul 2020 20:50:48 -0400
Subject: ftrace: Fix ftrace_trace_task return value

I was attempting to use pid filtering with function_graph, but it wasn't
allowing anything to make it through.  Turns out ftrace_trace_task
returns false if ftrace_ignore_pid is not-empty, which isn't correct
anymore.  We're now setting it to FTRACE_PID_IGNORE if we need to ignore
that pid, otherwise it's set to the pid (which is weird considering the
name) or to FTRACE_PID_TRACE.  Fix the check to check for !=
FTRACE_PID_IGNORE.  With this we can now use function_graph with pid
filtering.

Link: https://lkml.kernel.org/r/20200725005048.1790-1-josef@toxicpanda.com

Fixes: 717e3f5ebc82 ("ftrace: Make function trace pid filtering a bit more exact")
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 3 ---
 kernel/trace/trace.h  | 7 ++++++-
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f433cb44300a..4e3a5d79c078 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -139,9 +139,6 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops)
 #endif
 }
 
-#define FTRACE_PID_IGNORE	-1
-#define FTRACE_PID_TRACE	-2
-
 static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
 			    struct ftrace_ops *op, struct pt_regs *regs)
 {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f21607f87189..610d21355526 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1103,6 +1103,10 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
 extern struct list_head ftrace_pids;
 
 #ifdef CONFIG_FUNCTION_TRACER
+
+#define FTRACE_PID_IGNORE	-1
+#define FTRACE_PID_TRACE	-2
+
 struct ftrace_func_command {
 	struct list_head	list;
 	char			*name;
@@ -1114,7 +1118,8 @@ struct ftrace_func_command {
 extern bool ftrace_filter_param __initdata;
 static inline int ftrace_trace_task(struct trace_array *tr)
 {
-	return !this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid);
+	return this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid) !=
+		FTRACE_PID_IGNORE;
 }
 extern int ftrace_is_dead(void);
 int ftrace_create_function_files(struct trace_array *tr,
-- 
cgit v1.2.3


From 0cb2f1372baa60af8456388a574af6133edd7d80 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Tue, 28 Jul 2020 14:45:36 +0800
Subject: kprobes: Fix NULL pointer dereference at kprobe_ftrace_handler

We found a case of kernel panic on our server. The stack trace is as
follows(omit some irrelevant information):

  BUG: kernel NULL pointer dereference, address: 0000000000000080
  RIP: 0010:kprobe_ftrace_handler+0x5e/0xe0
  RSP: 0018:ffffb512c6550998 EFLAGS: 00010282
  RAX: 0000000000000000 RBX: ffff8e9d16eea018 RCX: 0000000000000000
  RDX: ffffffffbe1179c0 RSI: ffffffffc0535564 RDI: ffffffffc0534ec0
  RBP: ffffffffc0534ec1 R08: ffff8e9d1bbb0f00 R09: 0000000000000004
  R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
  R13: ffff8e9d1f797060 R14: 000000000000bacc R15: ffff8e9ce13eca00
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 0000000000000080 CR3: 00000008453d0005 CR4: 00000000003606e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
   <IRQ>
   ftrace_ops_assist_func+0x56/0xe0
   ftrace_call+0x5/0x34
   tcpa_statistic_send+0x5/0x130 [ttcp_engine]

The tcpa_statistic_send is the function being kprobed. After analysis,
the root cause is that the fourth parameter regs of kprobe_ftrace_handler
is NULL. Why regs is NULL? We use the crash tool to analyze the kdump.

  crash> dis tcpa_statistic_send -r
         <tcpa_statistic_send>: callq 0xffffffffbd8018c0 <ftrace_caller>

The tcpa_statistic_send calls ftrace_caller instead of ftrace_regs_caller.
So it is reasonable that the fourth parameter regs of kprobe_ftrace_handler
is NULL. In theory, we should call the ftrace_regs_caller instead of the
ftrace_caller. After in-depth analysis, we found a reproducible path.

  Writing a simple kernel module which starts a periodic timer. The
  timer's handler is named 'kprobe_test_timer_handler'. The module
  name is kprobe_test.ko.

  1) insmod kprobe_test.ko
  2) bpftrace -e 'kretprobe:kprobe_test_timer_handler {}'
  3) echo 0 > /proc/sys/kernel/ftrace_enabled
  4) rmmod kprobe_test
  5) stop step 2) kprobe
  6) insmod kprobe_test.ko
  7) bpftrace -e 'kretprobe:kprobe_test_timer_handler {}'

We mark the kprobe as GONE but not disarm the kprobe in the step 4).
The step 5) also do not disarm the kprobe when unregister kprobe. So
we do not remove the ip from the filter. In this case, when the module
loads again in the step 6), we will replace the code to ftrace_caller
via the ftrace_module_enable(). When we register kprobe again, we will
not replace ftrace_caller to ftrace_regs_caller because the ftrace is
disabled in the step 3). So the step 7) will trigger kernel panic. Fix
this problem by disarming the kprobe when the module is going away.

Link: https://lkml.kernel.org/r/20200728064536.24405-1-songmuchun@bytedance.com

Cc: stable@vger.kernel.org
Fixes: ae6aa16fdc16 ("kprobes: introduce ftrace based optimization")
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Co-developed-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/kprobes.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 4a904cc56d68..07bf03fcf574 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2113,6 +2113,13 @@ static void kill_kprobe(struct kprobe *p)
 	 * the original probed function (which will be freed soon) any more.
 	 */
 	arch_remove_kprobe(p);
+
+	/*
+	 * The module is going away. We should disarm the kprobe which
+	 * is using ftrace.
+	 */
+	if (kprobe_ftrace(p))
+		disarm_kprobe_ftrace(p);
 }
 
 /* Disable one kprobe */
-- 
cgit v1.2.3


From 231621d0c570765d5cebd95c582f1c8df5c46028 Mon Sep 17 00:00:00 2001
From: Peng Fan <fanpeng@loongson.cn>
Date: Fri, 24 Jul 2020 11:24:24 +0800
Subject: tracing/uprobe: Remove dead code in trace_uprobe_register()

In the function trace_uprobe_register(), the statement "return 0;"
out of switch case is dead code, remove it.

Link: https://lkml.kernel.org/r/1595561064-29186-1-git-send-email-fanpeng@loongson.cn

Signed-off-by: Peng Fan <fanpeng@loongson.cn>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_uprobe.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index fdd47f99b18f..f4286c9bdeb4 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1456,7 +1456,6 @@ trace_uprobe_register(struct trace_event_call *event, enum trace_reg type,
 	default:
 		return 0;
 	}
-	return 0;
 }
 
 static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
-- 
cgit v1.2.3


From afcab636657421f7ebfa0783a91f90256bba0091 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 4 Aug 2020 20:00:02 -0400
Subject: tracing: Use trace_sched_process_free() instead of exit() for pid
 tracing

On exit, if a process is preempted after the trace_sched_process_exit()
tracepoint but before the process is done exiting, then when it gets
scheduled in, the function tracers will not filter it properly against the
function tracing pid filters.

That is because the function tracing pid filters hooks to the
sched_process_exit() tracepoint to remove the exiting task's pid from the
filter list. Because the filtering happens at the sched_switch tracepoint,
when the exiting task schedules back in to finish up the exit, it will no
longer be in the function pid filtering tables.

This was noticeable in the notrace self tests on a preemptable kernel, as
the tests would fail as it exits and preempted after being taken off the
notrace filter table and on scheduling back in it would not be in the
notrace list, and then the ending of the exit function would trace. The test
detected this and would fail.

Cc: stable@vger.kernel.org
Cc: Namhyung Kim <namhyung@kernel.org>
Fixes: 1e10486ffee0a ("ftrace: Add 'function-fork' trace option")
Fixes: c37775d57830a ("tracing: Add infrastructure to allow set_event_pid to follow children"
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c       | 4 ++--
 kernel/trace/trace_events.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4e3a5d79c078..76f2dd6fd414 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6985,12 +6985,12 @@ void ftrace_pid_follow_fork(struct trace_array *tr, bool enable)
 	if (enable) {
 		register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
 						  tr);
-		register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+		register_trace_sched_process_free(ftrace_pid_follow_sched_process_exit,
 						  tr);
 	} else {
 		unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
 						    tr);
-		unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+		unregister_trace_sched_process_free(ftrace_pid_follow_sched_process_exit,
 						    tr);
 	}
 }
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f6f55682d3e2..a85effb2373b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -538,12 +538,12 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)
 	if (enable) {
 		register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork,
 						       tr, INT_MIN);
-		register_trace_prio_sched_process_exit(event_filter_pid_sched_process_exit,
+		register_trace_prio_sched_process_free(event_filter_pid_sched_process_exit,
 						       tr, INT_MAX);
 	} else {
 		unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork,
 						    tr);
-		unregister_trace_sched_process_exit(event_filter_pid_sched_process_exit,
+		unregister_trace_sched_process_free(event_filter_pid_sched_process_exit,
 						    tr);
 	}
 }
-- 
cgit v1.2.3


From 262e6ae7081df304fc625cf368d5c2cbba2bb991 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 28 Jul 2020 23:33:33 +0200
Subject: modules: inherit TAINT_PROPRIETARY_MODULE

If a TAINT_PROPRIETARY_MODULE exports symbol, inherit the taint flag
for all modules importing these symbols, and don't allow loading
symbols from TAINT_PROPRIETARY_MODULE modules if the module previously
imported gplonly symbols.  Add a anti-circumvention devices so people
don't accidentally get themselves into trouble this way.

Comment from Greg:
  "Ah, the proven-to-be-illegal "GPL Condom" defense :)"

[jeyu: pr_info -> pr_err and pr_warn as per discussion]
Link: http://lore.kernel.org/r/20200730162957.GA22469@lst.de
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 include/linux/module.h |  1 +
 kernel/module.c        | 26 ++++++++++++++++++++++++++
 2 files changed, 27 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/module.h b/include/linux/module.h
index 30b0f5fcdb3c..e30ed5fa33a7 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -389,6 +389,7 @@ struct module {
 	unsigned int num_gpl_syms;
 	const struct kernel_symbol *gpl_syms;
 	const s32 *gpl_crcs;
+	bool using_gplonly_symbols;
 
 #ifdef CONFIG_UNUSED_SYMBOLS
 	/* unused exported symbols. */
diff --git a/kernel/module.c b/kernel/module.c
index 656f5ff27088..09bf5a652a47 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1431,6 +1431,24 @@ static int verify_namespace_is_imported(const struct load_info *info,
 	return 0;
 }
 
+static bool inherit_taint(struct module *mod, struct module *owner)
+{
+	if (!owner || !test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints))
+		return true;
+
+	if (mod->using_gplonly_symbols) {
+		pr_err("%s: module using GPL-only symbols uses symbols from proprietary module %s.\n",
+			mod->name, owner->name);
+		return false;
+	}
+
+	if (!test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints)) {
+		pr_warn("%s: module uses symbols from proprietary module %s, inheriting taint.\n",
+			mod->name, owner->name);
+		set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints);
+	}
+	return true;
+}
 
 /* Resolve a symbol for this module.  I.e. if we find one, record usage. */
 static const struct kernel_symbol *resolve_symbol(struct module *mod,
@@ -1456,6 +1474,14 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
 	if (!sym)
 		goto unlock;
 
+	if (license == GPL_ONLY)
+		mod->using_gplonly_symbols = true;
+
+	if (!inherit_taint(mod, owner)) {
+		sym = NULL;
+		goto getname;
+	}
+
 	if (!check_version(info, name, mod, crc)) {
 		sym = ERR_PTR(-EINVAL);
 		goto getname;
-- 
cgit v1.2.3


From a1bd06853ee478d37fae9435c5521e301de94c67 Mon Sep 17 00:00:00 2001
From: Phil Auld <pauld@redhat.com>
Date: Wed, 5 Aug 2020 16:31:38 -0400
Subject: sched: Fix use of count for nr_running tracepoint

The count field is meant to tell if an update to nr_running
is an add or a subtract. Make it do so by adding the missing
minus sign.

Fixes: 9d246053a691 ("sched: Add a tracepoint to track rq->nr_running")
Signed-off-by: Phil Auld <pauld@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20200805203138.1411-1-pauld@redhat.com
---
 kernel/sched/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3fd283892761..28709f6b0975 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1999,7 +1999,7 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
 {
 	rq->nr_running -= count;
 	if (trace_sched_update_nr_running_tp_enabled()) {
-		call_trace_sched_update_nr_running(rq, count);
+		call_trace_sched_update_nr_running(rq, -count);
 	}
 
 	/* Check if we still need preemption */
-- 
cgit v1.2.3


From 19d0070a2792181f79df01277fe00b83b9f7eda7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 4 Aug 2020 17:01:23 +0200
Subject: timekeeping/vsyscall: Provide vdso_update_begin/end()

Architectures can have the requirement to add additional architecture
specific data to the VDSO data page which needs to be updated independent
of the timekeeper updates.

To protect these updates vs. concurrent readers and a conflicting update
through timekeeping, provide helper functions to make such updates safe.

vdso_update_begin() takes the timekeeper_lock to protect against a
potential update from timekeeper code and increments the VDSO sequence
count to signal data inconsistency to concurrent readers. vdso_update_end()
makes the sequence count even again to signal data consistency and drops
the timekeeper lock.

[ Sven: Add interrupt disable handling to the functions ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20200804150124.41692-3-svens@linux.ibm.com
---
 include/vdso/vsyscall.h            |  3 +++
 kernel/time/timekeeping.c          |  2 +-
 kernel/time/timekeeping_internal.h | 11 +++++++---
 kernel/time/vsyscall.c             | 41 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 53 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/vdso/vsyscall.h b/include/vdso/vsyscall.h
index 2c6134e0c23d..b0fdc9c6bf43 100644
--- a/include/vdso/vsyscall.h
+++ b/include/vdso/vsyscall.h
@@ -6,6 +6,9 @@
 
 #include <asm/vdso/vsyscall.h>
 
+unsigned long vdso_update_begin(void);
+void vdso_update_end(unsigned long flags);
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* __VDSO_VSYSCALL_H */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 63a632f9896c..4c7212f3c603 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -50,7 +50,7 @@ static struct {
 	.seq = SEQCNT_ZERO(tk_core.seq),
 };
 
-static DEFINE_RAW_SPINLOCK(timekeeper_lock);
+DEFINE_RAW_SPINLOCK(timekeeper_lock);
 static struct timekeeper shadow_timekeeper;
 
 /**
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index bcbb52db2256..4ca2787d1642 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -1,12 +1,14 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _TIMEKEEPING_INTERNAL_H
 #define _TIMEKEEPING_INTERNAL_H
-/*
- * timekeeping debug functions
- */
+
 #include <linux/clocksource.h>
+#include <linux/spinlock.h>
 #include <linux/time.h>
 
+/*
+ * timekeeping debug functions
+ */
 #ifdef CONFIG_DEBUG_FS
 extern void tk_debug_account_sleep_time(const struct timespec64 *t);
 #else
@@ -31,4 +33,7 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
 }
 #endif
 
+/* Semi public for serialization of non timekeeper VDSO updates. */
+extern raw_spinlock_t timekeeper_lock;
+
 #endif /* _TIMEKEEPING_INTERNAL_H */
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index 54ce6eb2ca36..88e6b8ed6ca5 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -13,6 +13,8 @@
 #include <vdso/helpers.h>
 #include <vdso/vsyscall.h>
 
+#include "timekeeping_internal.h"
+
 static inline void update_vdso_data(struct vdso_data *vdata,
 				    struct timekeeper *tk)
 {
@@ -127,3 +129,42 @@ void update_vsyscall_tz(void)
 
 	__arch_sync_vdso_data(vdata);
 }
+
+/**
+ * vdso_update_begin - Start of a VDSO update section
+ *
+ * Allows architecture code to safely update the architecture specific VDSO
+ * data. Disables interrupts, acquires timekeeper lock to serialize against
+ * concurrent updates from timekeeping and invalidates the VDSO data
+ * sequence counter to prevent concurrent readers from accessing
+ * inconsistent data.
+ *
+ * Returns: Saved interrupt flags which need to be handed in to
+ * vdso_update_end().
+ */
+unsigned long vdso_update_begin(void)
+{
+	struct vdso_data *vdata = __arch_get_k_vdso_data();
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&timekeeper_lock, flags);
+	vdso_write_begin(vdata);
+	return flags;
+}
+
+/**
+ * vdso_update_end - End of a VDSO update section
+ * @flags:	Interrupt flags as returned from vdso_update_begin()
+ *
+ * Pairs with vdso_update_begin(). Marks vdso data consistent, invokes data
+ * synchronization if the architecture requires it, drops timekeeper lock
+ * and restores interrupt flags.
+ */
+void vdso_update_end(unsigned long flags)
+{
+	struct vdso_data *vdata = __arch_get_k_vdso_data();
+
+	vdso_write_end(vdata);
+	__arch_sync_vdso_data(vdata);
+	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+}
-- 
cgit v1.2.3


From 45fd22da97c6125d8d0d35bd1791e7c0c4175279 Mon Sep 17 00:00:00 2001
From: Alexey Budankov <alexey.budankov@linux.intel.com>
Date: Wed, 5 Aug 2020 10:56:56 +0300
Subject: perf/core: Take over CAP_SYS_PTRACE creds to CAP_PERFMON capability

Open access to per-process monitoring for CAP_PERFMON only
privileged processes [1]. Extend ptrace_may_access() check
in perf_events subsystem with perfmon_capable() to simplify
user experience and make monitoring more secure by reducing
attack surface.

[1] https://lore.kernel.org/lkml/7776fa40-6c65-2aa6-1322-eb3a01201000@linux.intel.com/

Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/6e8392ff-4732-0012-2949-e1587709f0f6@linux.intel.com
---
 kernel/events/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 78e69e10482a..41e0cefb429b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11689,7 +11689,7 @@ SYSCALL_DEFINE5(perf_event_open,
 			goto err_task;
 
 		/*
-		 * Reuse ptrace permission checks for now.
+		 * Preserve ptrace permission check for backwards compatibility.
 		 *
 		 * We must hold exec_update_mutex across this and any potential
 		 * perf_install_in_context() call for this new event to
@@ -11697,7 +11697,7 @@ SYSCALL_DEFINE5(perf_event_open,
 		 * perf_event_exit_task() that could imply).
 		 */
 		err = -EACCES;
-		if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
+		if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
 			goto err_cred;
 	}
 
-- 
cgit v1.2.3


From 10de795a5addd1962406796a6e13ba6cc0fc6bee Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Thu, 6 Aug 2020 01:20:46 +0800
Subject: kprobes: Fix compiler warning for !CONFIG_KPROBES_ON_FTRACE

Fix compiler warning(as show below) for !CONFIG_KPROBES_ON_FTRACE.

kernel/kprobes.c: In function 'kill_kprobe':
kernel/kprobes.c:1116:33: warning: statement with no effect
[-Wunused-value]
 1116 | #define disarm_kprobe_ftrace(p) (-ENODEV)
      |                                 ^
kernel/kprobes.c:2154:3: note: in expansion of macro
'disarm_kprobe_ftrace'
 2154 |   disarm_kprobe_ftrace(p);

Link: https://lore.kernel.org/r/20200805142136.0331f7ea@canb.auug.org.au
Link: https://lkml.kernel.org/r/20200805172046.19066-1-songmuchun@bytedance.com

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Fixes: 0cb2f1372baa ("kprobes: Fix NULL pointer dereference at kprobe_ftrace_handler")
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/kprobes.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 07bf03fcf574..66d14107968d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1079,9 +1079,20 @@ static int disarm_kprobe_ftrace(struct kprobe *p)
 		ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
 }
 #else	/* !CONFIG_KPROBES_ON_FTRACE */
-#define prepare_kprobe(p)	arch_prepare_kprobe(p)
-#define arm_kprobe_ftrace(p)	(-ENODEV)
-#define disarm_kprobe_ftrace(p)	(-ENODEV)
+static inline int prepare_kprobe(struct kprobe *p)
+{
+	return arch_prepare_kprobe(p);
+}
+
+static inline int arm_kprobe_ftrace(struct kprobe *p)
+{
+	return -ENODEV;
+}
+
+static inline int disarm_kprobe_ftrace(struct kprobe *p)
+{
+	return -ENODEV;
+}
 #endif
 
 /* Arm a kprobe with text_mutex */
-- 
cgit v1.2.3


From 820903c784a01bf6e143253418508da4f5790cff Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 30 Jul 2020 12:14:05 +0200
Subject: posix-cpu-timers: Split run_posix_cpu_timers()

Split it up as a preparatory step to move the heavy lifting out of
interrupt context.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20200730102337.677439437@linutronix.de
---
 kernel/time/posix-cpu-timers.c | 43 +++++++++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 165117996ea0..e5ad87320468 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1080,32 +1080,15 @@ static inline bool fastpath_timer_check(struct task_struct *tsk)
 	return false;
 }
 
-/*
- * This is called from the timer interrupt handler.  The irq handler has
- * already updated our counts.  We need to check if any timers fire now.
- * Interrupts are disabled.
- */
-void run_posix_cpu_timers(void)
+static void __run_posix_cpu_timers(struct task_struct *tsk)
 {
-	struct task_struct *tsk = current;
 	struct k_itimer *timer, *next;
 	unsigned long flags;
 	LIST_HEAD(firing);
 
-	lockdep_assert_irqs_disabled();
-
-	/*
-	 * The fast path checks that there are no expired thread or thread
-	 * group timers.  If that's so, just return.
-	 */
-	if (!fastpath_timer_check(tsk))
+	if (!lock_task_sighand(tsk, &flags))
 		return;
 
-	lockdep_posixtimer_enter();
-	if (!lock_task_sighand(tsk, &flags)) {
-		lockdep_posixtimer_exit();
-		return;
-	}
 	/*
 	 * Here we take off tsk->signal->cpu_timers[N] and
 	 * tsk->cpu_timers[N] all the timers that are firing, and
@@ -1147,6 +1130,28 @@ void run_posix_cpu_timers(void)
 			cpu_timer_fire(timer);
 		spin_unlock(&timer->it_lock);
 	}
+}
+
+/*
+ * This is called from the timer interrupt handler.  The irq handler has
+ * already updated our counts.  We need to check if any timers fire now.
+ * Interrupts are disabled.
+ */
+void run_posix_cpu_timers(void)
+{
+	struct task_struct *tsk = current;
+
+	lockdep_assert_irqs_disabled();
+
+	/*
+	 * The fast path checks that there are no expired thread or thread
+	 * group timers.  If that's so, just return.
+	 */
+	if (!fastpath_timer_check(tsk))
+		return;
+
+	lockdep_posixtimer_enter();
+	__run_posix_cpu_timers(tsk);
 	lockdep_posixtimer_exit();
 }
 
-- 
cgit v1.2.3


From 1fb497dd003009be95ce67689ac800c446b7acc5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 30 Jul 2020 12:14:06 +0200
Subject: posix-cpu-timers: Provide mechanisms to defer timer handling to
 task_work

Running posix CPU timers in hard interrupt context has a few downsides:

 - For PREEMPT_RT it cannot work as the expiry code needs to take
   sighand lock, which is a 'sleeping spinlock' in RT. The original RT
   approach of offloading the posix CPU timer handling into a high
   priority thread was clumsy and provided no real benefit in general.

 - For fine grained accounting it's just wrong to run this in context of
   the timer interrupt because that way a process specific CPU time is
   accounted to the timer interrupt.

 - Long running timer interrupts caused by a large amount of expiring
   timers which can be created and armed by unpriviledged user space.

There is no hard requirement to expire them in interrupt context.

If the signal is targeted at the task itself then it won't be delivered
before the task returns to user space anyway. If the signal is targeted at
a supervisor process then it might be slightly delayed, but posix CPU
timers are inaccurate anyway due to the fact that they are tied to the
tick.

Provide infrastructure to schedule task work which allows splitting the
posix CPU timer code into a quick check in interrupt context and a thread
context expiry and signal delivery function. This has to be enabled by
architectures as it requires that the architecture specific KVM
implementation handles pending task work before exiting to guest mode.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20200730102337.783470146@linutronix.de
---
 include/linux/posix-timers.h   |  17 ++++
 include/linux/sched.h          |   4 +
 kernel/time/Kconfig            |   9 ++
 kernel/time/posix-cpu-timers.c | 185 ++++++++++++++++++++++++++++++++++++++---
 kernel/time/timer.c            |   1 +
 5 files changed, 204 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index e3f0f8585da4..896c16d2c5fb 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -6,6 +6,7 @@
 #include <linux/list.h>
 #include <linux/alarmtimer.h>
 #include <linux/timerqueue.h>
+#include <linux/task_work.h>
 
 struct kernel_siginfo;
 struct task_struct;
@@ -125,6 +126,16 @@ struct posix_cputimers {
 	unsigned int			expiry_active;
 };
 
+/**
+ * posix_cputimers_work - Container for task work based posix CPU timer expiry
+ * @work:	The task work to be scheduled
+ * @scheduled:  @work has been scheduled already, no further processing
+ */
+struct posix_cputimers_work {
+	struct callback_head	work;
+	unsigned int		scheduled;
+};
+
 static inline void posix_cputimers_init(struct posix_cputimers *pct)
 {
 	memset(pct, 0, sizeof(*pct));
@@ -165,6 +176,12 @@ static inline void posix_cputimers_group_init(struct posix_cputimers *pct,
 					      u64 cpu_limit) { }
 #endif
 
+#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
+void posix_cputimers_init_work(void);
+#else
+static inline void posix_cputimers_init_work(void) { }
+#endif
+
 #define REQUEUE_PENDING 1
 
 /**
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 06ec60462af0..e9942ce07ef1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -889,6 +889,10 @@ struct task_struct {
 	/* Empty if CONFIG_POSIX_CPUTIMERS=n */
 	struct posix_cputimers		posix_cputimers;
 
+#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
+	struct posix_cputimers_work	posix_cputimers_work;
+#endif
+
 	/* Process credentials: */
 
 	/* Tracer's credentials at attach: */
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index fcc42353f125..a09b1d61df6a 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -52,6 +52,15 @@ config GENERIC_CLOCKEVENTS_MIN_ADJUST
 config GENERIC_CMOS_UPDATE
 	bool
 
+# Select to handle posix CPU timers from task_work
+# and not from the timer interrupt context
+config HAVE_POSIX_CPU_TIMERS_TASK_WORK
+	bool
+
+config POSIX_CPU_TIMERS_TASK_WORK
+	bool
+	default y if POSIX_TIMERS && HAVE_POSIX_CPU_TIMERS_TASK_WORK
+
 if GENERIC_CLOCKEVENTS
 menu "Timers subsystem"
 
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index e5ad87320468..a71758e34e45 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -377,6 +377,7 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp)
  */
 static int posix_cpu_timer_create(struct k_itimer *new_timer)
 {
+	static struct lock_class_key posix_cpu_timers_key;
 	struct pid *pid;
 
 	rcu_read_lock();
@@ -386,6 +387,17 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
 		return -EINVAL;
 	}
 
+	/*
+	 * If posix timer expiry is handled in task work context then
+	 * timer::it_lock can be taken without disabling interrupts as all
+	 * other locking happens in task context. This requires a seperate
+	 * lock class key otherwise regular posix timer expiry would record
+	 * the lock class being taken in interrupt context and generate a
+	 * false positive warning.
+	 */
+	if (IS_ENABLED(CONFIG_POSIX_CPU_TIMERS_TASK_WORK))
+		lockdep_set_class(&new_timer->it_lock, &posix_cpu_timers_key);
+
 	new_timer->kclock = &clock_posix_cpu;
 	timerqueue_init(&new_timer->it.cpu.node);
 	new_timer->it.cpu.pid = get_pid(pid);
@@ -1080,26 +1092,163 @@ static inline bool fastpath_timer_check(struct task_struct *tsk)
 	return false;
 }
 
-static void __run_posix_cpu_timers(struct task_struct *tsk)
+static void handle_posix_cpu_timers(struct task_struct *tsk);
+
+#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
+static void posix_cpu_timers_work(struct callback_head *work)
+{
+	handle_posix_cpu_timers(current);
+}
+
+/*
+ * Initialize posix CPU timers task work in init task. Out of line to
+ * keep the callback static and to avoid header recursion hell.
+ */
+void __init posix_cputimers_init_work(void)
+{
+	init_task_work(&current->posix_cputimers_work.work,
+		       posix_cpu_timers_work);
+}
+
+/*
+ * Note: All operations on tsk->posix_cputimer_work.scheduled happen either
+ * in hard interrupt context or in task context with interrupts
+ * disabled. Aside of that the writer/reader interaction is always in the
+ * context of the current task, which means they are strict per CPU.
+ */
+static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
+{
+	return tsk->posix_cputimers_work.scheduled;
+}
+
+static inline void __run_posix_cpu_timers(struct task_struct *tsk)
+{
+	if (WARN_ON_ONCE(tsk->posix_cputimers_work.scheduled))
+		return;
+
+	/* Schedule task work to actually expire the timers */
+	tsk->posix_cputimers_work.scheduled = true;
+	task_work_add(tsk, &tsk->posix_cputimers_work.work, TWA_RESUME);
+}
+
+static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
+						unsigned long start)
+{
+	bool ret = true;
+
+	/*
+	 * On !RT kernels interrupts are disabled while collecting expired
+	 * timers, so no tick can happen and the fast path check can be
+	 * reenabled without further checks.
+	 */
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+		tsk->posix_cputimers_work.scheduled = false;
+		return true;
+	}
+
+	/*
+	 * On RT enabled kernels ticks can happen while the expired timers
+	 * are collected under sighand lock. But any tick which observes
+	 * the CPUTIMERS_WORK_SCHEDULED bit set, does not run the fastpath
+	 * checks. So reenabling the tick work has do be done carefully:
+	 *
+	 * Disable interrupts and run the fast path check if jiffies have
+	 * advanced since the collecting of expired timers started. If
+	 * jiffies have not advanced or the fast path check did not find
+	 * newly expired timers, reenable the fast path check in the timer
+	 * interrupt. If there are newly expired timers, return false and
+	 * let the collection loop repeat.
+	 */
+	local_irq_disable();
+	if (start != jiffies && fastpath_timer_check(tsk))
+		ret = false;
+	else
+		tsk->posix_cputimers_work.scheduled = false;
+	local_irq_enable();
+
+	return ret;
+}
+#else /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
+static inline void __run_posix_cpu_timers(struct task_struct *tsk)
+{
+	lockdep_posixtimer_enter();
+	handle_posix_cpu_timers(tsk);
+	lockdep_posixtimer_exit();
+}
+
+static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
+{
+	return false;
+}
+
+static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
+						unsigned long start)
+{
+	return true;
+}
+#endif /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
+
+static void handle_posix_cpu_timers(struct task_struct *tsk)
 {
 	struct k_itimer *timer, *next;
-	unsigned long flags;
+	unsigned long flags, start;
 	LIST_HEAD(firing);
 
 	if (!lock_task_sighand(tsk, &flags))
 		return;
 
-	/*
-	 * Here we take off tsk->signal->cpu_timers[N] and
-	 * tsk->cpu_timers[N] all the timers that are firing, and
-	 * put them on the firing list.
-	 */
-	check_thread_timers(tsk, &firing);
+	do {
+		/*
+		 * On RT locking sighand lock does not disable interrupts,
+		 * so this needs to be careful vs. ticks. Store the current
+		 * jiffies value.
+		 */
+		start = READ_ONCE(jiffies);
+		barrier();
 
-	check_process_timers(tsk, &firing);
+		/*
+		 * Here we take off tsk->signal->cpu_timers[N] and
+		 * tsk->cpu_timers[N] all the timers that are firing, and
+		 * put them on the firing list.
+		 */
+		check_thread_timers(tsk, &firing);
+
+		check_process_timers(tsk, &firing);
+
+		/*
+		 * The above timer checks have updated the exipry cache and
+		 * because nothing can have queued or modified timers after
+		 * sighand lock was taken above it is guaranteed to be
+		 * consistent. So the next timer interrupt fastpath check
+		 * will find valid data.
+		 *
+		 * If timer expiry runs in the timer interrupt context then
+		 * the loop is not relevant as timers will be directly
+		 * expired in interrupt context. The stub function below
+		 * returns always true which allows the compiler to
+		 * optimize the loop out.
+		 *
+		 * If timer expiry is deferred to task work context then
+		 * the following rules apply:
+		 *
+		 * - On !RT kernels no tick can have happened on this CPU
+		 *   after sighand lock was acquired because interrupts are
+		 *   disabled. So reenabling task work before dropping
+		 *   sighand lock and reenabling interrupts is race free.
+		 *
+		 * - On RT kernels ticks might have happened but the tick
+		 *   work ignored posix CPU timer handling because the
+		 *   CPUTIMERS_WORK_SCHEDULED bit is set. Reenabling work
+		 *   must be done very carefully including a check whether
+		 *   ticks have happened since the start of the timer
+		 *   expiry checks. posix_cpu_timers_enable_work() takes
+		 *   care of that and eventually lets the expiry checks
+		 *   run again.
+		 */
+	} while (!posix_cpu_timers_enable_work(tsk, start));
 
 	/*
-	 * We must release these locks before taking any timer's lock.
+	 * We must release sighand lock before taking any timer's lock.
 	 * There is a potential race with timer deletion here, as the
 	 * siglock now protects our private firing list.  We have set
 	 * the firing flag in each timer, so that a deletion attempt
@@ -1117,6 +1266,13 @@ static void __run_posix_cpu_timers(struct task_struct *tsk)
 	list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) {
 		int cpu_firing;
 
+		/*
+		 * spin_lock() is sufficient here even independent of the
+		 * expiry context. If expiry happens in hard interrupt
+		 * context it's obvious. For task work context it's safe
+		 * because all other operations on timer::it_lock happen in
+		 * task context (syscall or exit).
+		 */
 		spin_lock(&timer->it_lock);
 		list_del_init(&timer->it.cpu.elist);
 		cpu_firing = timer->it.cpu.firing;
@@ -1143,6 +1299,13 @@ void run_posix_cpu_timers(void)
 
 	lockdep_assert_irqs_disabled();
 
+	/*
+	 * If the actual expiry is deferred to task work context and the
+	 * work is already scheduled there is no point to do anything here.
+	 */
+	if (posix_cpu_timers_work_scheduled(tsk))
+		return;
+
 	/*
 	 * The fast path checks that there are no expired thread or thread
 	 * group timers.  If that's so, just return.
@@ -1150,9 +1313,7 @@ void run_posix_cpu_timers(void)
 	if (!fastpath_timer_check(tsk))
 		return;
 
-	lockdep_posixtimer_enter();
 	__run_posix_cpu_timers(tsk);
-	lockdep_posixtimer_exit();
 }
 
 /*
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index ae5029f984a8..a16764b0116e 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -2017,6 +2017,7 @@ static void __init init_timer_cpus(void)
 void __init init_timers(void)
 {
 	init_timer_cpus();
+	posix_cputimers_init_work();
 	open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
 }
 
-- 
cgit v1.2.3


From a2e9a95d2190ef55bf0724ecdf8a466d393a86b6 Mon Sep 17 00:00:00 2001
From: Lianbo Jiang <lijiang@redhat.com>
Date: Tue, 4 Aug 2020 12:49:32 +0800
Subject: kexec: Improve & fix crash_exclude_mem_range() to handle overlapping
 ranges

The crash_exclude_mem_range() function can only handle one memory region a time.

It will fail in the case in which the passed in area covers several memory
regions. In this case, it will only exclude the first region, then return,
but leave the later regions unsolved.

E.g in a NEC system with two usable RAM regions inside the low 1M:

  ...
  BIOS-e820: [mem 0x0000000000000000-0x000000000003efff] usable
  BIOS-e820: [mem 0x000000000003f000-0x000000000003ffff] reserved
  BIOS-e820: [mem 0x0000000000040000-0x000000000009ffff] usable

It will only exclude the memory region [0, 0x3efff], the memory region
[0x40000, 0x9ffff] will still be added into /proc/vmcore, which may cause
the following failure when dumping vmcore:

 ioremap on RAM at 0x0000000000040000 - 0x0000000000040fff
 WARNING: CPU: 0 PID: 665 at arch/x86/mm/ioremap.c:186 __ioremap_caller+0x2c7/0x2e0
 ...
 RIP: 0010:__ioremap_caller+0x2c7/0x2e0
 ...
 cp: error reading '/proc/vmcore': Cannot allocate memory
 kdump: saving vmcore failed

In order to fix this bug, let's extend the crash_exclude_mem_range()
to handle the overlapping ranges.

[ mingo: Amended the changelog. ]

Signed-off-by: Lianbo Jiang <lijiang@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Dave Young <dyoung@redhat.com>
Link: https://lore.kernel.org/r/20200804044933.1973-3-lijiang@redhat.com
---
 kernel/kexec_file.c | 35 +++++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 94661d2d13ad..97fa68267197 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -1157,24 +1157,26 @@ int crash_exclude_mem_range(struct crash_mem *mem,
 			    unsigned long long mstart, unsigned long long mend)
 {
 	int i, j;
-	unsigned long long start, end;
+	unsigned long long start, end, p_start, p_end;
 	struct crash_mem_range temp_range = {0, 0};
 
 	for (i = 0; i < mem->nr_ranges; i++) {
 		start = mem->ranges[i].start;
 		end = mem->ranges[i].end;
+		p_start = mstart;
+		p_end = mend;
 
 		if (mstart > end || mend < start)
 			continue;
 
 		/* Truncate any area outside of range */
 		if (mstart < start)
-			mstart = start;
+			p_start = start;
 		if (mend > end)
-			mend = end;
+			p_end = end;
 
 		/* Found completely overlapping range */
-		if (mstart == start && mend == end) {
+		if (p_start == start && p_end == end) {
 			mem->ranges[i].start = 0;
 			mem->ranges[i].end = 0;
 			if (i < mem->nr_ranges - 1) {
@@ -1185,20 +1187,29 @@ int crash_exclude_mem_range(struct crash_mem *mem,
 					mem->ranges[j].end =
 							mem->ranges[j+1].end;
 				}
+
+				/*
+				 * Continue to check if there are another overlapping ranges
+				 * from the current position because of shifting the above
+				 * mem ranges.
+				 */
+				i--;
+				mem->nr_ranges--;
+				continue;
 			}
 			mem->nr_ranges--;
 			return 0;
 		}
 
-		if (mstart > start && mend < end) {
+		if (p_start > start && p_end < end) {
 			/* Split original range */
-			mem->ranges[i].end = mstart - 1;
-			temp_range.start = mend + 1;
+			mem->ranges[i].end = p_start - 1;
+			temp_range.start = p_end + 1;
 			temp_range.end = end;
-		} else if (mstart != start)
-			mem->ranges[i].end = mstart - 1;
+		} else if (p_start != start)
+			mem->ranges[i].end = p_start - 1;
 		else
-			mem->ranges[i].start = mend + 1;
+			mem->ranges[i].start = p_end + 1;
 		break;
 	}
 
@@ -1243,7 +1254,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
 	 * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
 	 * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64).
 	 * I think this is required by tools like gdb. So same physical
-	 * memory will be mapped in two elf headers. One will contain kernel
+	 * memory will be mapped in two elf  headers. One will contain kernel
 	 * text virtual addresses and other will have __va(physical) addresses.
 	 */
 
@@ -1270,7 +1281,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
 	ehdr->e_ehsize = sizeof(Elf64_Ehdr);
 	ehdr->e_phentsize = sizeof(Elf64_Phdr);
 
-	/* Prepare one phdr of type PT_NOTE for each present cpu */
+	/* Prepare one phdr of type PT_NOTE for each present CPU */
 	for_each_present_cpu(cpu) {
 		phdr->p_type = PT_NOTE;
 		notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
-- 
cgit v1.2.3


From 475f63ae63b5102ae6423d1712333929d04d6ecc Mon Sep 17 00:00:00 2001
From: Lianbo Jiang <lijiang@redhat.com>
Date: Tue, 4 Aug 2020 12:49:33 +0800
Subject: kexec_file: Correctly output debugging information for the PT_LOAD
 ELF header

Currently, when we enable the debugging switch to debug kexec_file,
we always get the following incorrect results:

  kexec_file: Crash PT_LOAD elf header. phdr=00000000c988639b vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=51 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=000000003cca69a0 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=52 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=00000000c584cb9f vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=53 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=00000000cf85d57f vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=54 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=00000000a4a8f847 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=55 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=00000000272ec49f vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=56 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=00000000ea0b65de vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=57 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=000000001f5e490c vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=58 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=00000000dfe4109e vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=59 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=00000000480ed2b6 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=60 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=0000000080b65151 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=61 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=0000000024e31c5e vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=62 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=00000000332e0385 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=63 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=000000002754d5da vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=64 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=00000000783320dd vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=65 p_offset=0x0
  kexec_file: Crash PT_LOAD elf header. phdr=0000000076fe5b64 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=66 p_offset=0x0

The reason is that kernel always prints the values of the next PT_LOAD
instead of the current PT_LOAD. Change it to ensure that we can get the
correct debugging information.

[ mingo: Amended changelog, capitalized "ELF". ]

Signed-off-by: Lianbo Jiang <lijiang@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Dave Young <dyoung@redhat.com>
Link: https://lore.kernel.org/r/20200804044933.1973-4-lijiang@redhat.com
---
 kernel/kexec_file.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 97fa68267197..3f7867c1820f 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -1246,7 +1246,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
 	unsigned long long notes_addr;
 	unsigned long mstart, mend;
 
-	/* extra phdr for vmcoreinfo elf note */
+	/* extra phdr for vmcoreinfo ELF note */
 	nr_phdr = nr_cpus + 1;
 	nr_phdr += mem->nr_ranges;
 
@@ -1254,7 +1254,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
 	 * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
 	 * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64).
 	 * I think this is required by tools like gdb. So same physical
-	 * memory will be mapped in two elf  headers. One will contain kernel
+	 * memory will be mapped in two ELF headers. One will contain kernel
 	 * text virtual addresses and other will have __va(physical) addresses.
 	 */
 
@@ -1323,10 +1323,10 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
 		phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
 		phdr->p_align = 0;
 		ehdr->e_phnum++;
-		phdr++;
-		pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+		pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
 			phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
 			ehdr->e_phnum, phdr->p_offset);
+		phdr++;
 	}
 
 	*addr = buf;
-- 
cgit v1.2.3


From 5e7b30205cef80f6bb922e61834437ca7bff5837 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 4 Aug 2020 22:50:56 -0700
Subject: bpf: Change uapi for bpf iterator map elements

Commit a5cbe05a6673 ("bpf: Implement bpf iterator for
map elements") added bpf iterator support for
map elements. The map element bpf iterator requires
info to identify a particular map. In the above
commit, the attr->link_create.target_fd is used
to carry map_fd and an enum bpf_iter_link_info
is added to uapi to specify the target_fd actually
representing a map_fd:
    enum bpf_iter_link_info {
	BPF_ITER_LINK_UNSPEC = 0,
	BPF_ITER_LINK_MAP_FD = 1,

	MAX_BPF_ITER_LINK_INFO,
    };

This is an extensible approach as we can grow
enumerator for pid, cgroup_id, etc. and we can
unionize target_fd for pid, cgroup_id, etc.
But in the future, there are chances that
more complex customization may happen, e.g.,
for tasks, it could be filtered based on
both cgroup_id and user_id.

This patch changed the uapi to have fields
	__aligned_u64	iter_info;
	__u32		iter_info_len;
for additional iter_info for link_create.
The iter_info is defined as
	union bpf_iter_link_info {
		struct {
			__u32   map_fd;
		} map;
	};

So future extension for additional customization
will be easier. The bpf_iter_link_info will be
passed to target callback to validate and generic
bpf_iter framework does not need to deal it any
more.

Note that map_fd = 0 will be considered invalid
and -EBADF will be returned to user space.

Fixes: a5cbe05a6673 ("bpf: Implement bpf iterator for map elements")
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20200805055056.1457463-1-yhs@fb.com
---
 include/linux/bpf.h       | 10 ++++----
 include/uapi/linux/bpf.h  | 15 ++++++------
 kernel/bpf/bpf_iter.c     | 58 +++++++++++++++++++++++------------------------
 kernel/bpf/map_iter.c     | 37 +++++++++++++++++++++++-------
 kernel/bpf/syscall.c      |  2 +-
 net/core/bpf_sk_storage.c | 37 +++++++++++++++++++++++-------
 6 files changed, 102 insertions(+), 57 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cef4ef0d2b4e..55f694b63164 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1214,15 +1214,17 @@ struct bpf_iter_aux_info {
 	struct bpf_map *map;
 };
 
-typedef int (*bpf_iter_check_target_t)(struct bpf_prog *prog,
-				       struct bpf_iter_aux_info *aux);
+typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
+					union bpf_iter_link_info *linfo,
+					struct bpf_iter_aux_info *aux);
+typedef void (*bpf_iter_detach_target_t)(struct bpf_iter_aux_info *aux);
 
 #define BPF_ITER_CTX_ARG_MAX 2
 struct bpf_iter_reg {
 	const char *target;
-	bpf_iter_check_target_t check_target;
+	bpf_iter_attach_target_t attach_target;
+	bpf_iter_detach_target_t detach_target;
 	u32 ctx_arg_info_size;
-	enum bpf_iter_link_info req_linfo;
 	struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX];
 	const struct bpf_iter_seq_info *seq_info;
 };
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b134e679e9db..0480f893facd 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -81,6 +81,12 @@ struct bpf_cgroup_storage_key {
 	__u32	attach_type;		/* program attach type */
 };
 
+union bpf_iter_link_info {
+	struct {
+		__u32	map_fd;
+	} map;
+};
+
 /* BPF syscall commands, see bpf(2) man-page for details. */
 enum bpf_cmd {
 	BPF_MAP_CREATE,
@@ -249,13 +255,6 @@ enum bpf_link_type {
 	MAX_BPF_LINK_TYPE,
 };
 
-enum bpf_iter_link_info {
-	BPF_ITER_LINK_UNSPEC = 0,
-	BPF_ITER_LINK_MAP_FD = 1,
-
-	MAX_BPF_ITER_LINK_INFO,
-};
-
 /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command
  *
  * NONE(default): No further bpf programs allowed in the subtree.
@@ -623,6 +622,8 @@ union bpf_attr {
 		};
 		__u32		attach_type;	/* attach type */
 		__u32		flags;		/* extra flags */
+		__aligned_u64	iter_info;	/* extra bpf_iter_link_info */
+		__u32		iter_info_len;	/* iter_info length */
 	} link_create;
 
 	struct { /* struct used by BPF_LINK_UPDATE command */
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 363b9cafc2d8..b6715964b685 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -338,8 +338,8 @@ static void bpf_iter_link_release(struct bpf_link *link)
 	struct bpf_iter_link *iter_link =
 		container_of(link, struct bpf_iter_link, link);
 
-	if (iter_link->aux.map)
-		bpf_map_put_with_uref(iter_link->aux.map);
+	if (iter_link->tinfo->reg_info->detach_target)
+		iter_link->tinfo->reg_info->detach_target(&iter_link->aux);
 }
 
 static void bpf_iter_link_dealloc(struct bpf_link *link)
@@ -390,15 +390,35 @@ bool bpf_link_is_iter(struct bpf_link *link)
 
 int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 {
+	union bpf_iter_link_info __user *ulinfo;
 	struct bpf_link_primer link_primer;
 	struct bpf_iter_target_info *tinfo;
-	struct bpf_iter_aux_info aux = {};
+	union bpf_iter_link_info linfo;
 	struct bpf_iter_link *link;
-	u32 prog_btf_id, target_fd;
+	u32 prog_btf_id, linfo_len;
 	bool existed = false;
-	struct bpf_map *map;
 	int err;
 
+	if (attr->link_create.target_fd || attr->link_create.flags)
+		return -EINVAL;
+
+	memset(&linfo, 0, sizeof(union bpf_iter_link_info));
+
+	ulinfo = u64_to_user_ptr(attr->link_create.iter_info);
+	linfo_len = attr->link_create.iter_info_len;
+	if (!ulinfo ^ !linfo_len)
+		return -EINVAL;
+
+	if (ulinfo) {
+		err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo),
+					       linfo_len);
+		if (err)
+			return err;
+		linfo_len = min_t(u32, linfo_len, sizeof(linfo));
+		if (copy_from_user(&linfo, ulinfo, linfo_len))
+			return -EFAULT;
+	}
+
 	prog_btf_id = prog->aux->attach_btf_id;
 	mutex_lock(&targets_mutex);
 	list_for_each_entry(tinfo, &targets, list) {
@@ -411,13 +431,6 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 	if (!existed)
 		return -ENOENT;
 
-	/* Make sure user supplied flags are target expected. */
-	target_fd = attr->link_create.target_fd;
-	if (attr->link_create.flags != tinfo->reg_info->req_linfo)
-		return -EINVAL;
-	if (!attr->link_create.flags && target_fd)
-		return -EINVAL;
-
 	link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
 	if (!link)
 		return -ENOMEM;
@@ -431,28 +444,15 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 		return err;
 	}
 
-	if (tinfo->reg_info->req_linfo == BPF_ITER_LINK_MAP_FD) {
-		map = bpf_map_get_with_uref(target_fd);
-		if (IS_ERR(map)) {
-			err = PTR_ERR(map);
-			goto cleanup_link;
-		}
-
-		aux.map = map;
-		err = tinfo->reg_info->check_target(prog, &aux);
+	if (tinfo->reg_info->attach_target) {
+		err = tinfo->reg_info->attach_target(prog, &linfo, &link->aux);
 		if (err) {
-			bpf_map_put_with_uref(map);
-			goto cleanup_link;
+			bpf_link_cleanup(&link_primer);
+			return err;
 		}
-
-		link->aux.map = map;
 	}
 
 	return bpf_link_settle(&link_primer);
-
-cleanup_link:
-	bpf_link_cleanup(&link_primer);
-	return err;
 }
 
 static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index fbe1f557cb88..af86048e5afd 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -98,12 +98,21 @@ static struct bpf_iter_reg bpf_map_reg_info = {
 	.seq_info		= &bpf_map_seq_info,
 };
 
-static int bpf_iter_check_map(struct bpf_prog *prog,
-			      struct bpf_iter_aux_info *aux)
+static int bpf_iter_attach_map(struct bpf_prog *prog,
+			       union bpf_iter_link_info *linfo,
+			       struct bpf_iter_aux_info *aux)
 {
 	u32 key_acc_size, value_acc_size, key_size, value_size;
-	struct bpf_map *map = aux->map;
+	struct bpf_map *map;
 	bool is_percpu = false;
+	int err = -EINVAL;
+
+	if (!linfo->map.map_fd)
+		return -EBADF;
+
+	map = bpf_map_get_with_uref(linfo->map.map_fd);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
 
 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
@@ -112,7 +121,7 @@ static int bpf_iter_check_map(struct bpf_prog *prog,
 	else if (map->map_type != BPF_MAP_TYPE_HASH &&
 		 map->map_type != BPF_MAP_TYPE_LRU_HASH &&
 		 map->map_type != BPF_MAP_TYPE_ARRAY)
-		return -EINVAL;
+		goto put_map;
 
 	key_acc_size = prog->aux->max_rdonly_access;
 	value_acc_size = prog->aux->max_rdwr_access;
@@ -122,10 +131,22 @@ static int bpf_iter_check_map(struct bpf_prog *prog,
 	else
 		value_size = round_up(map->value_size, 8) * num_possible_cpus();
 
-	if (key_acc_size > key_size || value_acc_size > value_size)
-		return -EACCES;
+	if (key_acc_size > key_size || value_acc_size > value_size) {
+		err = -EACCES;
+		goto put_map;
+	}
 
+	aux->map = map;
 	return 0;
+
+put_map:
+	bpf_map_put_with_uref(map);
+	return err;
+}
+
+static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux)
+{
+	bpf_map_put_with_uref(aux->map);
 }
 
 DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta,
@@ -133,8 +154,8 @@ DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta,
 
 static const struct bpf_iter_reg bpf_map_elem_reg_info = {
 	.target			= "bpf_map_elem",
-	.check_target		= bpf_iter_check_map,
-	.req_linfo		= BPF_ITER_LINK_MAP_FD,
+	.attach_target		= bpf_iter_attach_map,
+	.detach_target		= bpf_iter_detach_map,
 	.ctx_arg_info_size	= 2,
 	.ctx_arg_info		= {
 		{ offsetof(struct bpf_iter__bpf_map_elem, key),
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2f343ce15747..86299a292214 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3883,7 +3883,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *
 	return -EINVAL;
 }
 
-#define BPF_LINK_CREATE_LAST_FIELD link_create.flags
+#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len
 static int link_create(union bpf_attr *attr)
 {
 	enum bpf_prog_type ptype;
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index d3377c90a291..b988f48153a4 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -1384,18 +1384,39 @@ static int bpf_iter_init_sk_storage_map(void *priv_data,
 	return 0;
 }
 
-static int bpf_iter_check_map(struct bpf_prog *prog,
-			      struct bpf_iter_aux_info *aux)
+static int bpf_iter_attach_map(struct bpf_prog *prog,
+			       union bpf_iter_link_info *linfo,
+			       struct bpf_iter_aux_info *aux)
 {
-	struct bpf_map *map = aux->map;
+	struct bpf_map *map;
+	int err = -EINVAL;
+
+	if (!linfo->map.map_fd)
+		return -EBADF;
+
+	map = bpf_map_get_with_uref(linfo->map.map_fd);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
 
 	if (map->map_type != BPF_MAP_TYPE_SK_STORAGE)
-		return -EINVAL;
+		goto put_map;
 
-	if (prog->aux->max_rdonly_access > map->value_size)
-		return -EACCES;
+	if (prog->aux->max_rdonly_access > map->value_size) {
+		err = -EACCES;
+		goto put_map;
+	}
 
+	aux->map = map;
 	return 0;
+
+put_map:
+	bpf_map_put_with_uref(map);
+	return err;
+}
+
+static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux)
+{
+	bpf_map_put_with_uref(aux->map);
 }
 
 static const struct seq_operations bpf_sk_storage_map_seq_ops = {
@@ -1414,8 +1435,8 @@ static const struct bpf_iter_seq_info iter_seq_info = {
 
 static struct bpf_iter_reg bpf_sk_storage_map_reg_info = {
 	.target			= "bpf_sk_storage_map",
-	.check_target		= bpf_iter_check_map,
-	.req_linfo		= BPF_ITER_LINK_MAP_FD,
+	.attach_target		= bpf_iter_attach_map,
+	.detach_target		= bpf_iter_detach_map,
 	.ctx_arg_info_size	= 2,
 	.ctx_arg_info		= {
 		{ offsetof(struct bpf_iter__bpf_sk_storage_map, sk),
-- 
cgit v1.2.3


From 0d360d64b01231cdb36e1936de889f308fd9317f Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Thu, 6 Aug 2020 11:26:12 -0700
Subject: bpf: Remove inline from bpf_do_trace_printk

I get the following error during compilation on my side:
kernel/trace/bpf_trace.c: In function 'bpf_do_trace_printk':
kernel/trace/bpf_trace.c:386:34: error: function 'bpf_do_trace_printk' can never be inlined because it uses variable argument lists
 static inline __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...)
                                  ^

Fixes: ac5a72ea5c89 ("bpf: Use dedicated bpf_trace_printk event instead of trace_printk()")
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200806182612.1390883-1-sdf@google.com
---
 kernel/trace/bpf_trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index cb91ef902cc4..a8d4f253ed77 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -383,7 +383,7 @@ static DEFINE_RAW_SPINLOCK(trace_printk_lock);
 
 #define BPF_TRACE_PRINTK_SIZE   1024
 
-static inline __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...)
+static __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...)
 {
 	static char buf[BPF_TRACE_PRINTK_SIZE];
 	unsigned long flags;
-- 
cgit v1.2.3


From b8c1a3090741f349322ad855d2b66d6e9752a60d Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Thu, 6 Aug 2020 20:31:41 -0700
Subject: bpf: Delete repeated words in comments

Drop repeated words in kernel/bpf/: {has, the}

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20200807033141.10437-1-rdunlap@infradead.org
---
 kernel/bpf/core.c     | 2 +-
 kernel/bpf/verifier.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index bde93344164d..ed0b3578867c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1966,7 +1966,7 @@ void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
  * @index: the index of the program to replace
  *
  * Skips over dummy programs, by not counting them, when calculating
- * the the position of the program to replace.
+ * the position of the program to replace.
  *
  * Return:
  * * 0		- Success
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b6ccfce3bf4c..ef938f17b944 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -8294,7 +8294,7 @@ static bool stacksafe(struct bpf_func_state *old,
 		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
 		    cur->stack[spi].slot_type[i % BPF_REG_SIZE])
 			/* Ex: old explored (safe) state has STACK_SPILL in
-			 * this stack slot, but current has has STACK_MISC ->
+			 * this stack slot, but current has STACK_MISC ->
 			 * this verifier states are not equivalent,
 			 * return false to continue verification of this path
 			 */
-- 
cgit v1.2.3


From 11990a5bd7e558e9203c1070fc52fb6f0488e75b Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 6 Aug 2020 14:15:23 -0700
Subject: module: Correctly truncate sysfs sections output

The only-root-readable /sys/module/$module/sections/$section files
did not truncate their output to the available buffer size. While most
paths into the kernfs read handlers end up using PAGE_SIZE buffers,
it's possible to get there through other paths (e.g. splice, sendfile).
Actually limit the output to the "count" passed into the read function,
and report it back correctly. *sigh*

Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/lkml/20200805002015.GE23458@shao2-debian
Fixes: ed66f991bb19 ("module: Refactor section attr into bin attribute")
Cc: stable@vger.kernel.org
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Jessica Yu <jeyu@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 kernel/module.c | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index aa183c9ac0a2..08c46084d8cc 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1520,18 +1520,34 @@ struct module_sect_attrs {
 	struct module_sect_attr attrs[];
 };
 
+#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4))
 static ssize_t module_sect_read(struct file *file, struct kobject *kobj,
 				struct bin_attribute *battr,
 				char *buf, loff_t pos, size_t count)
 {
 	struct module_sect_attr *sattr =
 		container_of(battr, struct module_sect_attr, battr);
+	char bounce[MODULE_SECT_READ_SIZE + 1];
+	size_t wrote;
 
 	if (pos != 0)
 		return -EINVAL;
 
-	return sprintf(buf, "0x%px\n",
-		       kallsyms_show_value(file->f_cred) ? (void *)sattr->address : NULL);
+	/*
+	 * Since we're a binary read handler, we must account for the
+	 * trailing NUL byte that sprintf will write: if "buf" is
+	 * too small to hold the NUL, or the NUL is exactly the last
+	 * byte, the read will look like it got truncated by one byte.
+	 * Since there is no way to ask sprintf nicely to not write
+	 * the NUL, we have to use a bounce buffer.
+	 */
+	wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n",
+			 kallsyms_show_value(file->f_cred)
+				? (void *)sattr->address : NULL);
+	count = min(count, wrote);
+	memcpy(buf, bounce, count);
+
+	return count;
 }
 
 static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
@@ -1580,7 +1596,7 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info)
 			goto out;
 		sect_attrs->nsections++;
 		sattr->battr.read = module_sect_read;
-		sattr->battr.size = 3 /* "0x", "\n" */ + (BITS_PER_LONG / 4);
+		sattr->battr.size = MODULE_SECT_READ_SIZE;
 		sattr->battr.attr.mode = 0400;
 		*(gattr++) = &(sattr++)->battr;
 	}
-- 
cgit v1.2.3


From 38cf307c1f2011d413750c5acb725456f47d9172 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 6 Aug 2020 23:17:16 -0700
Subject: mm: fix kthread_use_mm() vs TLB invalidate

For SMP systems using IPI based TLB invalidation, looking at
current->active_mm is entirely reasonable.  This then presents the
following race condition:

  CPU0			CPU1

  flush_tlb_mm(mm)	use_mm(mm)
    <send-IPI>
			  tsk->active_mm = mm;
			  <IPI>
			    if (tsk->active_mm == mm)
			      // flush TLBs
			  </IPI>
			  switch_mm(old_mm,mm,tsk);

Where it is possible the IPI flushed the TLBs for @old_mm, not @mm,
because the IPI lands before we actually switched.

Avoid this by disabling IRQs across changing ->active_mm and
switch_mm().

Of the (SMP) architectures that have IPI based TLB invalidate:

  Alpha    - checks active_mm
  ARC      - ASID specific
  IA64     - checks active_mm
  MIPS     - ASID specific flush
  OpenRISC - shoots down world
  PARISC   - shoots down world
  SH       - ASID specific
  SPARC    - ASID specific
  x86      - N/A
  xtensa   - checks active_mm

So at the very least Alpha, IA64 and Xtensa are suspect.

On top of this, for scheduler consistency we need at least preemption
disabled across changing tsk->mm and doing switch_mm(), which is
currently provided by task_lock(), but that's not sufficient for
PREEMPT_RT.

[akpm@linux-foundation.org: add comment]

Reported-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kees Cook <keescook@chromium.org>
Cc: Jann Horn <jannh@google.com>
Cc: Will Deacon <will@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: <stable@vger.kernel.org>
Link: http://lkml.kernel.org/r/20200721154106.GE10769@hirez.programming.kicks-ass.net
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kthread.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1d9e2fdfd67a..1c8964feeb01 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1241,13 +1241,16 @@ void kthread_use_mm(struct mm_struct *mm)
 	WARN_ON_ONCE(tsk->mm);
 
 	task_lock(tsk);
+	/* Hold off tlb flush IPIs while switching mm's */
+	local_irq_disable();
 	active_mm = tsk->active_mm;
 	if (active_mm != mm) {
 		mmgrab(mm);
 		tsk->active_mm = mm;
 	}
 	tsk->mm = mm;
-	switch_mm(active_mm, mm, tsk);
+	switch_mm_irqs_off(active_mm, mm, tsk);
+	local_irq_enable();
 	task_unlock(tsk);
 #ifdef finish_arch_post_lock_switch
 	finish_arch_post_lock_switch();
@@ -1276,9 +1279,11 @@ void kthread_unuse_mm(struct mm_struct *mm)
 
 	task_lock(tsk);
 	sync_mm_rss(mm);
+	local_irq_disable();
 	tsk->mm = NULL;
 	/* active_mm is still 'mm' */
 	enter_lazy_tlb(mm, tsk);
+	local_irq_enable();
 	task_unlock(tsk);
 }
 EXPORT_SYMBOL_GPL(kthread_unuse_mm);
-- 
cgit v1.2.3


From 4ca1085c9573ea08767521dabce62456e3fc2fd0 Mon Sep 17 00:00:00 2001
From: Ilias Stamatis <stamatis.iliass@gmail.com>
Date: Thu, 6 Aug 2020 23:17:19 -0700
Subject: kthread: remove incorrect comment in kthread_create_on_cpu()

Originally kthread_create_on_cpu() parked and woke up the new thread.
However, since commit a65d40961dc7 ("kthread/smpboot: do not park in
kthread_create_on_cpu()") this is no longer the case.  This patch removes
the comment that has been left behind and is now incorrect / stale.

Fixes: a65d40961dc7 ("kthread/smpboot: do not park in kthread_create_on_cpu()")
Signed-off-by: Ilias Stamatis <stamatis.iliass@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Link: http://lkml.kernel.org/r/20200611135920.240551-1-stamatis.iliass@gmail.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kthread.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1c8964feeb01..b2807e7be772 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -480,7 +480,6 @@ EXPORT_SYMBOL(kthread_bind);
  *	     to "name.*%u". Code fills in cpu number.
  *
  * Description: This helper function creates and names a kernel thread
- * The thread will be woken and put into park mode.
  */
 struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
 					  void *data, unsigned int cpu,
-- 
cgit v1.2.3


From d42f3245c7e299e017213fa028c319316bcdb7f4 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Thu, 6 Aug 2020 23:20:39 -0700
Subject: mm: memcg: convert vmstat slab counters to bytes

In order to prepare for per-object slab memory accounting, convert
NR_SLAB_RECLAIMABLE and NR_SLAB_UNRECLAIMABLE vmstat items to bytes.

To make it obvious, rename them to NR_SLAB_RECLAIMABLE_B and
NR_SLAB_UNRECLAIMABLE_B (similar to NR_KERNEL_STACK_KB).

Internally global and per-node counters are stored in pages, however memcg
and lruvec counters are stored in bytes.  This scheme may look weird, but
only for now.  As soon as slab pages will be shared between multiple
cgroups, global and node counters will reflect the total number of slab
pages.  However memcg and lruvec counters will be used for per-memcg slab
memory tracking, which will take separate kernel objects in the account.
Keeping global and node counters in pages helps to avoid additional
overhead.

The size of slab memory shouldn't exceed 4Gb on 32-bit machines, so it
will fit into atomic_long_t we use for vmstats.

Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/20200623174037.3951353-4-guro@fb.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/node.c     |  4 ++--
 fs/proc/meminfo.c       |  4 ++--
 include/linux/mmzone.h  | 16 +++++++++++++---
 kernel/power/snapshot.c |  2 +-
 mm/memcontrol.c         | 11 ++++-------
 mm/oom_kill.c           |  2 +-
 mm/page_alloc.c         |  8 ++++----
 mm/slab.h               | 15 ++++++++-------
 mm/slab_common.c        |  4 ++--
 mm/slob.c               | 12 ++++++------
 mm/slub.c               |  8 ++++----
 mm/vmscan.c             |  3 ++-
 mm/workingset.c         |  6 ++++--
 13 files changed, 53 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index e21e31359297..0cf13e31603c 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -368,8 +368,8 @@ static ssize_t node_read_meminfo(struct device *dev,
 	unsigned long sreclaimable, sunreclaimable;
 
 	si_meminfo_node(&i, nid);
-	sreclaimable = node_page_state(pgdat, NR_SLAB_RECLAIMABLE);
-	sunreclaimable = node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE);
+	sreclaimable = node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B);
+	sunreclaimable = node_page_state_pages(pgdat, NR_SLAB_UNRECLAIMABLE_B);
 	n = sprintf(buf,
 		       "Node %d MemTotal:       %8lu kB\n"
 		       "Node %d MemFree:        %8lu kB\n"
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index e9a6841fc25b..38ea95fd919a 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -52,8 +52,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
 
 	available = si_mem_available();
-	sreclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE);
-	sunreclaim = global_node_page_state(NR_SLAB_UNRECLAIMABLE);
+	sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B);
+	sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B);
 
 	show_val_kb(m, "MemTotal:       ", i.totalram);
 	show_val_kb(m, "MemFree:        ", i.freeram);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index f16306e15b98..b79100edd228 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -174,8 +174,8 @@ enum node_stat_item {
 	NR_INACTIVE_FILE,	/*  "     "     "   "       "         */
 	NR_ACTIVE_FILE,		/*  "     "     "   "       "         */
 	NR_UNEVICTABLE,		/*  "     "     "   "       "         */
-	NR_SLAB_RECLAIMABLE,
-	NR_SLAB_UNRECLAIMABLE,
+	NR_SLAB_RECLAIMABLE_B,
+	NR_SLAB_UNRECLAIMABLE_B,
 	NR_ISOLATED_ANON,	/* Temporary isolated pages from anon lru */
 	NR_ISOLATED_FILE,	/* Temporary isolated pages from file lru */
 	WORKINGSET_NODES,
@@ -213,7 +213,17 @@ enum node_stat_item {
  */
 static __always_inline bool vmstat_item_in_bytes(int idx)
 {
-	return false;
+	/*
+	 * Global and per-node slab counters track slab pages.
+	 * It's expected that changes are multiples of PAGE_SIZE.
+	 * Internally values are stored in pages.
+	 *
+	 * Per-memcg and per-lruvec counters track memory, consumed
+	 * by individual slab objects. These counters are actually
+	 * byte-precise.
+	 */
+	return (idx == NR_SLAB_RECLAIMABLE_B ||
+		idx == NR_SLAB_UNRECLAIMABLE_B);
 }
 
 /*
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index cef154261fe2..d25749bce7cf 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1663,7 +1663,7 @@ static unsigned long minimum_image_size(unsigned long saveable)
 {
 	unsigned long size;
 
-	size = global_node_page_state(NR_SLAB_RECLAIMABLE)
+	size = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B)
 		+ global_node_page_state(NR_ACTIVE_ANON)
 		+ global_node_page_state(NR_INACTIVE_ANON)
 		+ global_node_page_state(NR_ACTIVE_FILE)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 61ae6658d59f..328b7e7bf9ab 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1391,9 +1391,8 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
 		       (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
 		       1024);
 	seq_buf_printf(&s, "slab %llu\n",
-		       (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
-			     memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
-		       PAGE_SIZE);
+		       (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
+			     memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B)));
 	seq_buf_printf(&s, "sock %llu\n",
 		       (u64)memcg_page_state(memcg, MEMCG_SOCK) *
 		       PAGE_SIZE);
@@ -1423,11 +1422,9 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
 			       PAGE_SIZE);
 
 	seq_buf_printf(&s, "slab_reclaimable %llu\n",
-		       (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
-		       PAGE_SIZE);
+		       (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B));
 	seq_buf_printf(&s, "slab_unreclaimable %llu\n",
-		       (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
-		       PAGE_SIZE);
+		       (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B));
 
 	/* Accumulated memory events */
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 6e94962893ee..d30ce75f23fb 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -184,7 +184,7 @@ static bool is_dump_unreclaim_slabs(void)
 		 global_node_page_state(NR_ISOLATED_FILE) +
 		 global_node_page_state(NR_UNEVICTABLE);
 
-	return (global_node_page_state(NR_SLAB_UNRECLAIMABLE) > nr_lru);
+	return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
 }
 
 /**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 901a21f61d68..f9ad093814d2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5220,8 +5220,8 @@ long si_mem_available(void)
 	 * items that are in use, and cannot be freed. Cap this estimate at the
 	 * low watermark.
 	 */
-	reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) +
-			global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
+	reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
+		global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
 	available += reclaimable - min(reclaimable / 2, wmark_low);
 
 	if (available < 0)
@@ -5364,8 +5364,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 		global_node_page_state(NR_UNEVICTABLE),
 		global_node_page_state(NR_FILE_DIRTY),
 		global_node_page_state(NR_WRITEBACK),
-		global_node_page_state(NR_SLAB_RECLAIMABLE),
-		global_node_page_state(NR_SLAB_UNRECLAIMABLE),
+		global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
+		global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
 		global_node_page_state(NR_FILE_MAPPED),
 		global_node_page_state(NR_SHMEM),
 		global_zone_page_state(NR_PAGETABLE),
diff --git a/mm/slab.h b/mm/slab.h
index fceb4341ba91..09be3ca6fe87 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -273,7 +273,7 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
 static inline int cache_vmstat_idx(struct kmem_cache *s)
 {
 	return (s->flags & SLAB_RECLAIM_ACCOUNT) ?
-		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE;
+		NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B;
 }
 
 #ifdef CONFIG_SLUB_DEBUG
@@ -390,7 +390,7 @@ static __always_inline int memcg_charge_slab(struct page *page,
 
 	if (unlikely(!memcg || mem_cgroup_is_root(memcg))) {
 		mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
-				    nr_pages);
+				    nr_pages << PAGE_SHIFT);
 		percpu_ref_get_many(&s->memcg_params.refcnt, nr_pages);
 		return 0;
 	}
@@ -400,7 +400,7 @@ static __always_inline int memcg_charge_slab(struct page *page,
 		goto out;
 
 	lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page));
-	mod_lruvec_state(lruvec, cache_vmstat_idx(s), nr_pages);
+	mod_lruvec_state(lruvec, cache_vmstat_idx(s), nr_pages << PAGE_SHIFT);
 
 	/* transer try_charge() page references to kmem_cache */
 	percpu_ref_get_many(&s->memcg_params.refcnt, nr_pages);
@@ -425,11 +425,12 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order,
 	memcg = READ_ONCE(s->memcg_params.memcg);
 	if (likely(!mem_cgroup_is_root(memcg))) {
 		lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page));
-		mod_lruvec_state(lruvec, cache_vmstat_idx(s), -nr_pages);
+		mod_lruvec_state(lruvec, cache_vmstat_idx(s),
+				 -(nr_pages << PAGE_SHIFT));
 		memcg_kmem_uncharge(memcg, nr_pages);
 	} else {
 		mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
-				    -nr_pages);
+				    -(nr_pages << PAGE_SHIFT));
 	}
 	rcu_read_unlock();
 
@@ -513,7 +514,7 @@ static __always_inline int charge_slab_page(struct page *page,
 {
 	if (is_root_cache(s)) {
 		mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
-				    1 << order);
+				    PAGE_SIZE << order);
 		return 0;
 	}
 
@@ -525,7 +526,7 @@ static __always_inline void uncharge_slab_page(struct page *page, int order,
 {
 	if (is_root_cache(s)) {
 		mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
-				    -(1 << order));
+				    -(PAGE_SIZE << order));
 		return;
 	}
 
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 616ec8a0d91a..a73f168b1035 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1363,8 +1363,8 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
 	page = alloc_pages(flags, order);
 	if (likely(page)) {
 		ret = page_address(page);
-		mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
-				    1 << order);
+		mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
+				    PAGE_SIZE << order);
 	}
 	ret = kasan_kmalloc_large(ret, size, flags);
 	/* As ret might get tagged, call kmemleak hook after KASAN. */
diff --git a/mm/slob.c b/mm/slob.c
index ac2aecfbc7a8..7cc9805c8091 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -202,8 +202,8 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
 	if (!page)
 		return NULL;
 
-	mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
-			    1 << order);
+	mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
+			    PAGE_SIZE << order);
 	return page_address(page);
 }
 
@@ -214,8 +214,8 @@ static void slob_free_pages(void *b, int order)
 	if (current->reclaim_state)
 		current->reclaim_state->reclaimed_slab += 1 << order;
 
-	mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
-			    -(1 << order));
+	mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
+			    -(PAGE_SIZE << order));
 	__free_pages(sp, order);
 }
 
@@ -552,8 +552,8 @@ void kfree(const void *block)
 		slob_free(m, *m + align);
 	} else {
 		unsigned int order = compound_order(sp);
-		mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
-				    -(1 << order));
+		mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
+				    -(PAGE_SIZE << order));
 		__free_pages(sp, order);
 
 	}
diff --git a/mm/slub.c b/mm/slub.c
index ae39eb392396..2d73d677f7ac 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3991,8 +3991,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
 	page = alloc_pages_node(node, flags, order);
 	if (page) {
 		ptr = page_address(page);
-		mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
-				    1 << order);
+		mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
+				    PAGE_SIZE << order);
 	}
 
 	return kmalloc_large_node_hook(ptr, size, flags);
@@ -4123,8 +4123,8 @@ void kfree(const void *x)
 
 		BUG_ON(!PageCompound(page));
 		kfree_hook(object);
-		mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
-				    -(1 << order));
+		mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
+				    -(PAGE_SIZE << order));
 		__free_pages(page, order);
 		return;
 	}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 749d239c62b2..2ac43664aba4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4222,7 +4222,8 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
 	 * unmapped file backed pages.
 	 */
 	if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
-	    node_page_state(pgdat, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
+	    node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <=
+	    pgdat->min_slab_pages)
 		return NODE_RECLAIM_FULL;
 
 	/*
diff --git a/mm/workingset.c b/mm/workingset.c
index 50b7937bab32..b199726924dd 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -486,8 +486,10 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
 		for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
 			pages += lruvec_page_state_local(lruvec,
 							 NR_LRU_BASE + i);
-		pages += lruvec_page_state_local(lruvec, NR_SLAB_RECLAIMABLE);
-		pages += lruvec_page_state_local(lruvec, NR_SLAB_UNRECLAIMABLE);
+		pages += lruvec_page_state_local(
+			lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
+		pages += lruvec_page_state_local(
+			lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT;
 	} else
 #endif
 		pages = node_present_pages(sc->nid);
-- 
cgit v1.2.3


From 991e7673859ed41e7ba83c8c4e57afe8cfebe314 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeelb@google.com>
Date: Thu, 6 Aug 2020 23:21:37 -0700
Subject: mm: memcontrol: account kernel stack per node

Currently the kernel stack is being accounted per-zone.  There is no need
to do that.  In addition due to being per-zone, memcg has to keep a
separate MEMCG_KERNEL_STACK_KB.  Make the stat per-node and deprecate
MEMCG_KERNEL_STACK_KB as memcg_stat_item is an extension of
node_stat_item.  In addition localize the kernel stack stats updates to
account_kernel_stack().

Signed-off-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Link: http://lkml.kernel.org/r/20200630161539.1759185-1-shakeelb@google.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/node.c        |  4 ++--
 fs/proc/meminfo.c          |  4 ++--
 include/linux/memcontrol.h | 21 +++++++++++++++++--
 include/linux/mmzone.h     |  8 ++++----
 kernel/fork.c              | 51 +++++++++++++---------------------------------
 kernel/scs.c               |  2 +-
 mm/memcontrol.c            |  2 +-
 mm/page_alloc.c            | 16 +++++++--------
 mm/vmstat.c                |  8 ++++----
 9 files changed, 55 insertions(+), 61 deletions(-)

(limited to 'kernel')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 0cf13e31603c..508b80f6329b 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -440,9 +440,9 @@ static ssize_t node_read_meminfo(struct device *dev,
 		       nid, K(node_page_state(pgdat, NR_FILE_MAPPED)),
 		       nid, K(node_page_state(pgdat, NR_ANON_MAPPED)),
 		       nid, K(i.sharedram),
-		       nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK_KB),
+		       nid, node_page_state(pgdat, NR_KERNEL_STACK_KB),
 #ifdef CONFIG_SHADOW_CALL_STACK
-		       nid, sum_zone_node_page_state(nid, NR_KERNEL_SCS_KB),
+		       nid, node_page_state(pgdat, NR_KERNEL_SCS_KB),
 #endif
 		       nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)),
 		       nid, 0UL,
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 38ea95fd919a..2a4c58f70fb9 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -101,10 +101,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	show_val_kb(m, "SReclaimable:   ", sreclaimable);
 	show_val_kb(m, "SUnreclaim:     ", sunreclaim);
 	seq_printf(m, "KernelStack:    %8lu kB\n",
-		   global_zone_page_state(NR_KERNEL_STACK_KB));
+		   global_node_page_state(NR_KERNEL_STACK_KB));
 #ifdef CONFIG_SHADOW_CALL_STACK
 	seq_printf(m, "ShadowCallStack:%8lu kB\n",
-		   global_zone_page_state(NR_KERNEL_SCS_KB));
+		   global_node_page_state(NR_KERNEL_SCS_KB));
 #endif
 	show_val_kb(m, "PageTables:     ",
 		    global_zone_page_state(NR_PAGETABLE));
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5a8b62d075e6..624400c27eba 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -32,8 +32,6 @@ struct kmem_cache;
 enum memcg_stat_item {
 	MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS,
 	MEMCG_SOCK,
-	/* XXX: why are these zone and not node counters? */
-	MEMCG_KERNEL_STACK_KB,
 	MEMCG_NR_STAT,
 };
 
@@ -729,8 +727,19 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 			int val);
 void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val);
+
 void mod_memcg_obj_state(void *p, int idx, int val);
 
+static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx,
+					 int val)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__mod_lruvec_slab_state(p, idx, val);
+	local_irq_restore(flags);
+}
+
 static inline void mod_memcg_lruvec_state(struct lruvec *lruvec,
 					  enum node_stat_item idx, int val)
 {
@@ -1151,6 +1160,14 @@ static inline void __mod_lruvec_slab_state(void *p, enum node_stat_item idx,
 	__mod_node_page_state(page_pgdat(page), idx, val);
 }
 
+static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx,
+					 int val)
+{
+	struct page *page = virt_to_head_page(p);
+
+	mod_node_page_state(page_pgdat(page), idx, val);
+}
+
 static inline void mod_memcg_obj_state(void *p, int idx, int val)
 {
 }
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b79100edd228..a3bd54139a30 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -155,10 +155,6 @@ enum zone_stat_item {
 	NR_ZONE_WRITE_PENDING,	/* Count of dirty, writeback and unstable pages */
 	NR_MLOCK,		/* mlock()ed pages found and moved off LRU */
 	NR_PAGETABLE,		/* used for pagetables */
-	NR_KERNEL_STACK_KB,	/* measured in KiB */
-#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
-	NR_KERNEL_SCS_KB,	/* measured in KiB */
-#endif
 	/* Second 128 byte cacheline */
 	NR_BOUNCE,
 #if IS_ENABLED(CONFIG_ZSMALLOC)
@@ -203,6 +199,10 @@ enum node_stat_item {
 	NR_KERNEL_MISC_RECLAIMABLE,	/* reclaimable non-slab kernel pages */
 	NR_FOLL_PIN_ACQUIRED,	/* via: pin_user_page(), gup flag: FOLL_PIN */
 	NR_FOLL_PIN_RELEASED,	/* pages returned via unpin_user_page() */
+	NR_KERNEL_STACK_KB,	/* measured in KiB */
+#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
+	NR_KERNEL_SCS_KB,	/* measured in KiB */
+#endif
 	NR_VM_NODE_STAT_ITEMS
 };
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 76d3f3387554..c7b4ce9d2647 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -276,13 +276,8 @@ static inline void free_thread_stack(struct task_struct *tsk)
 	if (vm) {
 		int i;
 
-		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
-			mod_memcg_page_state(vm->pages[i],
-					     MEMCG_KERNEL_STACK_KB,
-					     -(int)(PAGE_SIZE / 1024));
-
+		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
 			memcg_kmem_uncharge_page(vm->pages[i], 0);
-		}
 
 		for (i = 0; i < NR_CACHED_STACKS; i++) {
 			if (this_cpu_cmpxchg(cached_stacks[i],
@@ -382,31 +377,14 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
 	void *stack = task_stack_page(tsk);
 	struct vm_struct *vm = task_stack_vm_area(tsk);
 
-	BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
-
-	if (vm) {
-		int i;
-
-		BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
 
-		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
-			mod_zone_page_state(page_zone(vm->pages[i]),
-					    NR_KERNEL_STACK_KB,
-					    PAGE_SIZE / 1024 * account);
-		}
-	} else {
-		/*
-		 * All stack pages are in the same zone and belong to the
-		 * same memcg.
-		 */
-		struct page *first_page = virt_to_page(stack);
-
-		mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
-				    THREAD_SIZE / 1024 * account);
-
-		mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB,
-				    account * (THREAD_SIZE / 1024));
-	}
+	/* All stack pages are in the same node. */
+	if (vm)
+		mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB,
+				      account * (THREAD_SIZE / 1024));
+	else
+		mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB,
+				      account * (THREAD_SIZE / 1024));
 }
 
 static int memcg_charge_kernel_stack(struct task_struct *tsk)
@@ -415,24 +393,23 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk)
 	struct vm_struct *vm = task_stack_vm_area(tsk);
 	int ret;
 
+	BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+
 	if (vm) {
 		int i;
 
+		BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
 		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
 			/*
 			 * If memcg_kmem_charge_page() fails, page->mem_cgroup
-			 * pointer is NULL, and both memcg_kmem_uncharge_page()
-			 * and mod_memcg_page_state() in free_thread_stack()
-			 * will ignore this page. So it's safe.
+			 * pointer is NULL, and memcg_kmem_uncharge_page() in
+			 * free_thread_stack() will ignore this page.
 			 */
 			ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
 						     0);
 			if (ret)
 				return ret;
-
-			mod_memcg_page_state(vm->pages[i],
-					     MEMCG_KERNEL_STACK_KB,
-					     PAGE_SIZE / 1024);
 		}
 	}
 #endif
diff --git a/kernel/scs.c b/kernel/scs.c
index 5d4d9bbdec36..4ff4a7ba0094 100644
--- a/kernel/scs.c
+++ b/kernel/scs.c
@@ -17,7 +17,7 @@ static void __scs_account(void *s, int account)
 {
 	struct page *scs_page = virt_to_page(s);
 
-	mod_zone_page_state(page_zone(scs_page), NR_KERNEL_SCS_KB,
+	mod_node_page_state(page_pgdat(scs_page), NR_KERNEL_SCS_KB,
 			    account * (SCS_SIZE / SZ_1K));
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 473f9b91d51f..a3e963366769 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1485,7 +1485,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
 		       (u64)memcg_page_state(memcg, NR_FILE_PAGES) *
 		       PAGE_SIZE);
 	seq_buf_printf(&s, "kernel_stack %llu\n",
-		       (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
+		       (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) *
 		       1024);
 	seq_buf_printf(&s, "slab %llu\n",
 		       (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f9ad093814d2..8d5d8526c2f3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5396,6 +5396,10 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 			" anon_thp: %lukB"
 #endif
 			" writeback_tmp:%lukB"
+			" kernel_stack:%lukB"
+#ifdef CONFIG_SHADOW_CALL_STACK
+			" shadow_call_stack:%lukB"
+#endif
 			" all_unreclaimable? %s"
 			"\n",
 			pgdat->node_id,
@@ -5417,6 +5421,10 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 			K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
 #endif
 			K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
+			node_page_state(pgdat, NR_KERNEL_STACK_KB),
+#ifdef CONFIG_SHADOW_CALL_STACK
+			node_page_state(pgdat, NR_KERNEL_SCS_KB),
+#endif
 			pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
 				"yes" : "no");
 	}
@@ -5448,10 +5456,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 			" present:%lukB"
 			" managed:%lukB"
 			" mlocked:%lukB"
-			" kernel_stack:%lukB"
-#ifdef CONFIG_SHADOW_CALL_STACK
-			" shadow_call_stack:%lukB"
-#endif
 			" pagetables:%lukB"
 			" bounce:%lukB"
 			" free_pcp:%lukB"
@@ -5473,10 +5477,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 			K(zone->present_pages),
 			K(zone_managed_pages(zone)),
 			K(zone_page_state(zone, NR_MLOCK)),
-			zone_page_state(zone, NR_KERNEL_STACK_KB),
-#ifdef CONFIG_SHADOW_CALL_STACK
-			zone_page_state(zone, NR_KERNEL_SCS_KB),
-#endif
 			K(zone_page_state(zone, NR_PAGETABLE)),
 			K(zone_page_state(zone, NR_BOUNCE)),
 			K(free_pcp),
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b171a76bfe83..2b866cbab11d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1140,10 +1140,6 @@ const char * const vmstat_text[] = {
 	"nr_zone_write_pending",
 	"nr_mlock",
 	"nr_page_table_pages",
-	"nr_kernel_stack",
-#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
-	"nr_shadow_call_stack",
-#endif
 	"nr_bounce",
 #if IS_ENABLED(CONFIG_ZSMALLOC)
 	"nr_zspages",
@@ -1194,6 +1190,10 @@ const char * const vmstat_text[] = {
 	"nr_kernel_misc_reclaimable",
 	"nr_foll_pin_acquired",
 	"nr_foll_pin_released",
+	"nr_kernel_stack",
+#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
+	"nr_shadow_call_stack",
+#endif
 
 	/* enum writeback_stat_item counters */
 	"nr_dirty_threshold",
-- 
cgit v1.2.3


From 56f3547bfa4d361148aa748ccb86073bc57f5e6c Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Thu, 6 Aug 2020 23:23:15 -0700
Subject: mm: adjust vm_committed_as_batch according to vm overcommit policy

When checking a performance change for will-it-scale scalability mmap test
[1], we found very high lock contention for spinlock of percpu counter
'vm_committed_as':

    94.14%     0.35%  [kernel.kallsyms]         [k] _raw_spin_lock_irqsave
    48.21% _raw_spin_lock_irqsave;percpu_counter_add_batch;__vm_enough_memory;mmap_region;do_mmap;
    45.91% _raw_spin_lock_irqsave;percpu_counter_add_batch;__do_munmap;

Actually this heavy lock contention is not always necessary.  The
'vm_committed_as' needs to be very precise when the strict
OVERCOMMIT_NEVER policy is set, which requires a rather small batch number
for the percpu counter.

So keep 'batch' number unchanged for strict OVERCOMMIT_NEVER policy, and
lift it to 64X for OVERCOMMIT_ALWAYS and OVERCOMMIT_GUESS policies.  Also
add a sysctl handler to adjust it when the policy is reconfigured.

Benchmark with the same testcase in [1] shows 53% improvement on a 8C/16T
desktop, and 2097%(20X) on a 4S/72C/144T server.  We tested with test
platforms in 0day (server, desktop and laptop), and 80%+ platforms shows
improvements with that test.  And whether it shows improvements depends on
if the test mmap size is bigger than the batch number computed.

And if the lift is 16X, 1/3 of the platforms will show improvements,
though it should help the mmap/unmap usage generally, as Michal Hocko
mentioned:

: I believe that there are non-synthetic worklaods which would benefit from
: a larger batch.  E.g.  large in memory databases which do large mmaps
: during startups from multiple threads.

[1] https://lore.kernel.org/lkml/20200305062138.GI5972@shao2-debian/

Signed-off-by: Feng Tang <feng.tang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Qian Cai <cai@lca.pw>
Cc: Kees Cook <keescook@chromium.org>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: kernel test robot <rong.a.chen@intel.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/1589611660-89854-4-git-send-email-feng.tang@intel.com
Link: http://lkml.kernel.org/r/1592725000-73486-4-git-send-email-feng.tang@intel.com
Link: http://lkml.kernel.org/r/1594389708-60781-5-git-send-email-feng.tang@intel.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h   |  2 ++
 include/linux/mman.h |  4 ++++
 kernel/sysctl.c      |  2 +-
 mm/mm_init.c         | 22 ++++++++++++++++------
 mm/util.c            | 41 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 64 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2830f1c0fdc3..1c3470550395 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -206,6 +206,8 @@ int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *,
 		loff_t *);
 int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *,
 		loff_t *);
+int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *,
+		loff_t *);
 
 #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
 
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 4b08e9c9c538..6f34c33075f9 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -57,8 +57,12 @@ extern struct percpu_counter vm_committed_as;
 
 #ifdef CONFIG_SMP
 extern s32 vm_committed_as_batch;
+extern void mm_compute_batch(int overcommit_policy);
 #else
 #define vm_committed_as_batch 0
+static inline void mm_compute_batch(int overcommit_policy)
+{
+}
 #endif
 
 unsigned long vm_memory_committed(void);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1b4d2dc270a5..f785de3caac0 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2671,7 +2671,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &sysctl_overcommit_memory,
 		.maxlen		= sizeof(sysctl_overcommit_memory),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= overcommit_policy_handler,
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 435e5f794b3b..b06a30fbedff 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -13,6 +13,7 @@
 #include <linux/memory.h>
 #include <linux/notifier.h>
 #include <linux/sched.h>
+#include <linux/mman.h>
 #include "internal.h"
 
 #ifdef CONFIG_DEBUG_MEMORY_INIT
@@ -144,14 +145,23 @@ EXPORT_SYMBOL_GPL(mm_kobj);
 #ifdef CONFIG_SMP
 s32 vm_committed_as_batch = 32;
 
-static void __meminit mm_compute_batch(void)
+void mm_compute_batch(int overcommit_policy)
 {
 	u64 memsized_batch;
 	s32 nr = num_present_cpus();
 	s32 batch = max_t(s32, nr*2, 32);
-
-	/* batch size set to 0.4% of (total memory/#cpus), or max int32 */
-	memsized_batch = min_t(u64, (totalram_pages()/nr)/256, 0x7fffffff);
+	unsigned long ram_pages = totalram_pages();
+
+	/*
+	 * For policy OVERCOMMIT_NEVER, set batch size to 0.4% of
+	 * (total memory/#cpus), and lift it to 25% for other policies
+	 * to easy the possible lock contention for percpu_counter
+	 * vm_committed_as, while the max limit is INT_MAX
+	 */
+	if (overcommit_policy == OVERCOMMIT_NEVER)
+		memsized_batch = min_t(u64, ram_pages/nr/256, INT_MAX);
+	else
+		memsized_batch = min_t(u64, ram_pages/nr/4, INT_MAX);
 
 	vm_committed_as_batch = max_t(s32, memsized_batch, batch);
 }
@@ -162,7 +172,7 @@ static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
 	switch (action) {
 	case MEM_ONLINE:
 	case MEM_OFFLINE:
-		mm_compute_batch();
+		mm_compute_batch(sysctl_overcommit_memory);
 	default:
 		break;
 	}
@@ -176,7 +186,7 @@ static struct notifier_block compute_batch_nb __meminitdata = {
 
 static int __init mm_compute_batch_init(void)
 {
-	mm_compute_batch();
+	mm_compute_batch(sysctl_overcommit_memory);
 	register_hotmemory_notifier(&compute_batch_nb);
 
 	return 0;
diff --git a/mm/util.c b/mm/util.c
index 1c9d097d7e48..8d6280c05238 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -746,6 +746,47 @@ int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
 	return ret;
 }
 
+static void sync_overcommit_as(struct work_struct *dummy)
+{
+	percpu_counter_sync(&vm_committed_as);
+}
+
+int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
+		size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table t;
+	int new_policy;
+	int ret;
+
+	/*
+	 * The deviation of sync_overcommit_as could be big with loose policy
+	 * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
+	 * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
+	 * with the strict "NEVER", and to avoid possible race condtion (even
+	 * though user usually won't too frequently do the switching to policy
+	 * OVERCOMMIT_NEVER), the switch is done in the following order:
+	 *	1. changing the batch
+	 *	2. sync percpu count on each CPU
+	 *	3. switch the policy
+	 */
+	if (write) {
+		t = *table;
+		t.data = &new_policy;
+		ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+		if (ret)
+			return ret;
+
+		mm_compute_batch(new_policy);
+		if (new_policy == OVERCOMMIT_NEVER)
+			schedule_on_each_cpu(sync_overcommit_as);
+		sysctl_overcommit_memory = new_policy;
+	} else {
+		ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	}
+
+	return ret;
+}
+
 int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
 		size_t *lenp, loff_t *ppos)
 {
-- 
cgit v1.2.3


From 26e760c9a7c8ec31fa1a6bfbbce3f63f189ccef0 Mon Sep 17 00:00:00 2001
From: Walter Wu <walter-zh.wu@mediatek.com>
Date: Thu, 6 Aug 2020 23:24:35 -0700
Subject: rcu: kasan: record and print call_rcu() call stack

Patch series "kasan: memorize and print call_rcu stack", v8.

This patchset improves KASAN reports by making them to have call_rcu()
call stack information.  It is useful for programmers to solve
use-after-free or double-free memory issue.

The KASAN report was as follows(cleaned up slightly):

BUG: KASAN: use-after-free in kasan_rcu_reclaim+0x58/0x60

Freed by task 0:
 kasan_save_stack+0x24/0x50
 kasan_set_track+0x24/0x38
 kasan_set_free_info+0x18/0x20
 __kasan_slab_free+0x10c/0x170
 kasan_slab_free+0x10/0x18
 kfree+0x98/0x270
 kasan_rcu_reclaim+0x1c/0x60

Last call_rcu():
 kasan_save_stack+0x24/0x50
 kasan_record_aux_stack+0xbc/0xd0
 call_rcu+0x8c/0x580
 kasan_rcu_uaf+0xf4/0xf8

Generic KASAN will record the last two call_rcu() call stacks and print up
to 2 call_rcu() call stacks in KASAN report.  it is only suitable for
generic KASAN.

This feature considers the size of struct kasan_alloc_meta and
kasan_free_meta, we try to optimize the structure layout and size, lets it
get better memory consumption.

[1]https://bugzilla.kernel.org/show_bug.cgi?id=198437
[2]https://groups.google.com/forum/#!searchin/kasan-dev/better$20stack$20traces$20for$20rcu%7Csort:date/kasan-dev/KQsjT_88hDE/7rNUZprRBgAJ

This patch (of 4):

This feature will record the last two call_rcu() call stacks and prints up
to 2 call_rcu() call stacks in KASAN report.

When call_rcu() is called, we store the call_rcu() call stack into slub
alloc meta-data, so that the KASAN report can print rcu stack.

[1]https://bugzilla.kernel.org/show_bug.cgi?id=198437
[2]https://groups.google.com/forum/#!searchin/kasan-dev/better$20stack$20traces$20for$20rcu%7Csort:date/kasan-dev/KQsjT_88hDE/7rNUZprRBgAJ

[walter-zh.wu@mediatek.com: build fix]
  Link: http://lkml.kernel.org/r/20200710162401.23816-1-walter-zh.wu@mediatek.com

Suggested-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Walter Wu <walter-zh.wu@mediatek.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Tested-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Andrey Konovalov <andreyknvl@google.com>
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthias Brugger <matthias.bgg@gmail.com>
Link: http://lkml.kernel.org/r/20200710162123.23713-1-walter-zh.wu@mediatek.com
Link: http://lkml.kernel.org/r/20200601050847.1096-1-walter-zh.wu@mediatek.com
Link: http://lkml.kernel.org/r/20200601050927.1153-1-walter-zh.wu@mediatek.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h |  2 ++
 kernel/rcu/tree.c     |  2 ++
 mm/kasan/common.c     |  4 ++--
 mm/kasan/generic.c    | 21 +++++++++++++++++++++
 mm/kasan/kasan.h      |  9 +++++++++
 mm/kasan/report.c     | 28 +++++++++++++++++++++++-----
 6 files changed, 59 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 82522e996c76..18452e35e7b2 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -174,11 +174,13 @@ static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; }
 
 void kasan_cache_shrink(struct kmem_cache *cache);
 void kasan_cache_shutdown(struct kmem_cache *cache);
+void kasan_record_aux_stack(void *ptr);
 
 #else /* CONFIG_KASAN_GENERIC */
 
 static inline void kasan_cache_shrink(struct kmem_cache *cache) {}
 static inline void kasan_cache_shutdown(struct kmem_cache *cache) {}
+static inline void kasan_record_aux_stack(void *ptr) {}
 
 #endif /* CONFIG_KASAN_GENERIC */
 
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index ac7198ed3197..8ce77d9ac716 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -59,6 +59,7 @@
 #include <linux/sched/clock.h>
 #include <linux/vmalloc.h>
 #include <linux/mm.h>
+#include <linux/kasan.h>
 #include "../time/tick-internal.h"
 
 #include "tree.h"
@@ -2890,6 +2891,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
 	head->func = func;
 	head->next = NULL;
 	local_irq_save(flags);
+	kasan_record_aux_stack(head);
 	rdp = this_cpu_ptr(&rcu_data);
 
 	/* Add the callback to our list. */
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 757d4074fe28..ad24666f50e4 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -40,7 +40,7 @@
 #include "kasan.h"
 #include "../slab.h"
 
-static inline depot_stack_handle_t save_stack(gfp_t flags)
+depot_stack_handle_t kasan_save_stack(gfp_t flags)
 {
 	unsigned long entries[KASAN_STACK_DEPTH];
 	unsigned int nr_entries;
@@ -53,7 +53,7 @@ static inline depot_stack_handle_t save_stack(gfp_t flags)
 static inline void set_track(struct kasan_track *track, gfp_t flags)
 {
 	track->pid = current->pid;
-	track->stack = save_stack(flags);
+	track->stack = kasan_save_stack(flags);
 }
 
 void kasan_enable_current(void)
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 098a7dbaced6..d70586393b04 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -324,3 +324,24 @@ DEFINE_ASAN_SET_SHADOW(f2);
 DEFINE_ASAN_SET_SHADOW(f3);
 DEFINE_ASAN_SET_SHADOW(f5);
 DEFINE_ASAN_SET_SHADOW(f8);
+
+void kasan_record_aux_stack(void *addr)
+{
+	struct page *page = kasan_addr_to_page(addr);
+	struct kmem_cache *cache;
+	struct kasan_alloc_meta *alloc_info;
+	void *object;
+
+	if (!(page && PageSlab(page)))
+		return;
+
+	cache = page->slab_cache;
+	object = nearest_obj(cache, page, addr);
+	alloc_info = get_alloc_info(cache, object);
+
+	/*
+	 * record the last two call_rcu() call stacks.
+	 */
+	alloc_info->aux_stack[1] = alloc_info->aux_stack[0];
+	alloc_info->aux_stack[0] = kasan_save_stack(GFP_NOWAIT);
+}
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index cfade6413528..f89a195e336a 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -104,6 +104,13 @@ struct kasan_track {
 
 struct kasan_alloc_meta {
 	struct kasan_track alloc_track;
+#ifdef CONFIG_KASAN_GENERIC
+	/*
+	 * call_rcu() call stack is stored into struct kasan_alloc_meta.
+	 * The free stack is stored into struct kasan_free_meta.
+	 */
+	depot_stack_handle_t aux_stack[2];
+#endif
 	struct kasan_track free_track[KASAN_NR_FREE_STACKS];
 #ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
 	u8 free_pointer_tag[KASAN_NR_FREE_STACKS];
@@ -159,6 +166,8 @@ void kasan_report_invalid_free(void *object, unsigned long ip);
 
 struct page *kasan_addr_to_page(const void *addr);
 
+depot_stack_handle_t kasan_save_stack(gfp_t flags);
+
 #if defined(CONFIG_KASAN_GENERIC) && \
 	(defined(CONFIG_SLAB) || defined(CONFIG_SLUB))
 void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 51ec45407a0b..445a9d56eb13 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -106,15 +106,20 @@ static void end_report(unsigned long *flags)
 	kasan_enable_current();
 }
 
+static void print_stack(depot_stack_handle_t stack)
+{
+	unsigned long *entries;
+	unsigned int nr_entries;
+
+	nr_entries = stack_depot_fetch(stack, &entries);
+	stack_trace_print(entries, nr_entries, 0);
+}
+
 static void print_track(struct kasan_track *track, const char *prefix)
 {
 	pr_err("%s by task %u:\n", prefix, track->pid);
 	if (track->stack) {
-		unsigned long *entries;
-		unsigned int nr_entries;
-
-		nr_entries = stack_depot_fetch(track->stack, &entries);
-		stack_trace_print(entries, nr_entries, 0);
+		print_stack(track->stack);
 	} else {
 		pr_err("(stack is not available)\n");
 	}
@@ -193,6 +198,19 @@ static void describe_object(struct kmem_cache *cache, void *object,
 		free_track = kasan_get_free_track(cache, object, tag);
 		print_track(free_track, "Freed");
 		pr_err("\n");
+
+#ifdef CONFIG_KASAN_GENERIC
+		if (alloc_info->aux_stack[0]) {
+			pr_err("Last call_rcu():\n");
+			print_stack(alloc_info->aux_stack[0]);
+			pr_err("\n");
+		}
+		if (alloc_info->aux_stack[1]) {
+			pr_err("Second to last call_rcu():\n");
+			print_stack(alloc_info->aux_stack[1]);
+			pr_err("\n");
+		}
+#endif
 	}
 
 	describe_object_addr(cache, object, addr);
-- 
cgit v1.2.3


From 8dcc1d34661d58a7889fb06517c8738d1412d1bc Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Thu, 6 Aug 2020 23:24:57 -0700
Subject: kasan: don't tag stacks allocated with pagealloc

Patch series "kasan: support stack instrumentation for tag-based mode", v2.

This patch (of 5):

Prepare Software Tag-Based KASAN for stack tagging support.

With Tag-Based KASAN when kernel stacks are allocated via pagealloc (which
happens when CONFIG_VMAP_STACK is not enabled), they get tagged.  KASAN
instrumentation doesn't expect the sp register to be tagged, and this
leads to false-positive reports.

Fix by resetting the tag of kernel stack pointers after allocation.

Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Marco Elver <elver@google.com>
Cc: Walter Wu <walter-zh.wu@mediatek.com>
Cc: Elena Petrova <lenaptr@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Link: http://lkml.kernel.org/r/cover.1596199677.git.andreyknvl@google.com
Link: http://lkml.kernel.org/r/cover.1596544734.git.andreyknvl@google.com
Link: http://lkml.kernel.org/r/12d8c678869268dd0884b01271ab592f30792abf.1596544734.git.andreyknvl@google.com
Link: http://lkml.kernel.org/r/01c678b877755bcf29009176592402cdf6f2cb15.1596199677.git.andreyknvl@google.com
Link: https://bugzilla.kernel.org/show_bug.cgi?id=203497
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index c7b4ce9d2647..35e9894d394c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -261,7 +261,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 					     THREAD_SIZE_ORDER);
 
 	if (likely(page)) {
-		tsk->stack = page_address(page);
+		tsk->stack = kasan_reset_tag(page_address(page));
 		return tsk->stack;
 	}
 	return NULL;
@@ -302,6 +302,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
 {
 	unsigned long *stack;
 	stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
+	stack = kasan_reset_tag(stack);
 	tsk->stack = stack;
 	return stack;
 }
-- 
cgit v1.2.3


From 38ce2a9e33db61a3041840310077072d6210ead4 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 6 Aug 2020 12:46:49 -0400
Subject: tracing: Add trace_array_init_printk() to initialize instance
 trace_printk() buffers

As trace_array_printk() used with not global instances will not add noise to
the main buffer, they are OK to have in the kernel (unlike trace_printk()).
This require the subsystem to create their own tracing instance, and the
trace_array_printk() only writes into those instances.

Add trace_array_init_printk() to initialize the trace_printk() buffers
without printing out the WARNING message.

Reported-by: Sean Paul <sean@poorly.run>
Reviewed-by: Sean Paul <sean@poorly.run>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace.h |  1 +
 kernel/trace/trace.c  | 44 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/trace.h b/include/linux/trace.h
index 7fd86d3c691f..36d255d66f88 100644
--- a/include/linux/trace.h
+++ b/include/linux/trace.h
@@ -29,6 +29,7 @@ struct trace_array;
 void trace_printk_init_buffers(void);
 int trace_array_printk(struct trace_array *tr, unsigned long ip,
 		const char *fmt, ...);
+int trace_array_init_printk(struct trace_array *tr);
 void trace_array_put(struct trace_array *tr);
 struct trace_array *trace_array_get_by_name(const char *name);
 int trace_array_destroy(struct trace_array *tr);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 06c0feae5ff9..c5f822736261 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3129,6 +3129,9 @@ static int alloc_percpu_trace_buffer(void)
 {
 	struct trace_buffer_struct *buffers;
 
+	if (trace_percpu_buffer)
+		return 0;
+
 	buffers = alloc_percpu(struct trace_buffer_struct);
 	if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer"))
 		return -ENOMEM;
@@ -3331,6 +3334,26 @@ int trace_array_vprintk(struct trace_array *tr,
 	return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args);
 }
 
+/**
+ * trace_array_printk - Print a message to a specific instance
+ * @tr: The instance trace_array descriptor
+ * @ip: The instruction pointer that this is called from.
+ * @fmt: The format to print (printf format)
+ *
+ * If a subsystem sets up its own instance, they have the right to
+ * printk strings into their tracing instance buffer using this
+ * function. Note, this function will not write into the top level
+ * buffer (use trace_printk() for that), as writing into the top level
+ * buffer should only have events that can be individually disabled.
+ * trace_printk() is only used for debugging a kernel, and should not
+ * be ever encorporated in normal use.
+ *
+ * trace_array_printk() can be used, as it will not add noise to the
+ * top level tracing buffer.
+ *
+ * Note, trace_array_init_printk() must be called on @tr before this
+ * can be used.
+ */
 __printf(3, 0)
 int trace_array_printk(struct trace_array *tr,
 		       unsigned long ip, const char *fmt, ...)
@@ -3355,6 +3378,27 @@ int trace_array_printk(struct trace_array *tr,
 }
 EXPORT_SYMBOL_GPL(trace_array_printk);
 
+/**
+ * trace_array_init_printk - Initialize buffers for trace_array_printk()
+ * @tr: The trace array to initialize the buffers for
+ *
+ * As trace_array_printk() only writes into instances, they are OK to
+ * have in the kernel (unlike trace_printk()). This needs to be called
+ * before trace_array_printk() can be used on a trace_array.
+ */
+int trace_array_init_printk(struct trace_array *tr)
+{
+	if (!tr)
+		return -ENOENT;
+
+	/* This is only allowed for created instances */
+	if (tr == &global_trace)
+		return -EINVAL;
+
+	return alloc_percpu_trace_buffer();
+}
+EXPORT_SYMBOL_GPL(trace_array_init_printk);
+
 __printf(3, 4)
 int trace_array_printk_buf(struct trace_buffer *buffer,
 			   unsigned long ip, const char *fmt, ...)
-- 
cgit v1.2.3


From 15d5761ad31dfb194ebe76554e6af0437eb20424 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Tue, 7 Jul 2020 18:21:16 +0900
Subject: kbuild: introduce ccflags-remove-y and asflags-remove-y

CFLAGS_REMOVE_<file>.o filters out flags when compiling a particular
object, but there is no convenient way to do that for every object in
a directory.

Add ccflags-remove-y and asflags-remove-y to make it easily.

Use ccflags-remove-y to clean up some Makefiles.

The add/remove order works as follows:

 [1] KBUILD_CFLAGS specifies compiler flags used globally

 [2] ccflags-y adds compiler flags for all objects in the
     current Makefile

 [3] ccflags-remove-y removes compiler flags for all objects in the
     current Makefile (New feature)

 [4] CFLAGS_<file> adds compiler flags per file.

 [5] CFLAGS_REMOVE_<file> removes compiler flags per file.

Having [3] before [4] allows us to remove flags from most (but not all)
objects in the current Makefile.

For example, kernel/trace/Makefile removes $(CC_FLAGS_FTRACE)
from all objects in the directory, then adds it back to
trace_selftest_dynamic.o and CFLAGS_trace_kprobe_selftest.o

The same applies to lib/livepatch/Makefile.

Please note ccflags-remove-y has no effect to the sub-directories.
In contrast, the previous notation got rid of compiler flags also from
all the sub-directories.

The following are not affected because they have no sub-directories:

  arch/arm/boot/compressed/
  arch/powerpc/xmon/
  arch/sh/
  kernel/trace/

However, lib/ has several sub-directories.

To keep the behavior, I added ccflags-remove-y to all Makefiles
in subdirectories of lib/, except the following:

  lib/vdso/Makefile        - Kbuild does not descend into this Makefile
  lib/raid/test/Makefile   - This is not used for the kernel build

I think commit 2464a609ded0 ("ftrace: do not trace library functions")
excluded too much. In the next commit, I will remove ccflags-remove-y
from the sub-directories of lib/.

Suggested-by: Sami Tolvanen <samitolvanen@google.com>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
Acked-by: Brendan Higgins <brendanhiggins@google.com> (KUnit)
Tested-by: Anders Roxell <anders.roxell@linaro.org>
---
 Documentation/kbuild/makefiles.rst | 14 ++++++++++++++
 arch/arm/boot/compressed/Makefile  |  6 +-----
 arch/powerpc/xmon/Makefile         |  3 +--
 arch/sh/boot/compressed/Makefile   |  5 +----
 kernel/trace/Makefile              |  4 ++--
 lib/842/Makefile                   |  3 +++
 lib/Makefile                       |  5 +----
 lib/crypto/Makefile                |  2 ++
 lib/dim/Makefile                   |  2 ++
 lib/fonts/Makefile                 |  2 ++
 lib/kunit/Makefile                 |  3 +++
 lib/livepatch/Makefile             |  2 ++
 lib/lz4/Makefile                   |  1 +
 lib/lzo/Makefile                   |  2 ++
 lib/math/Makefile                  |  2 ++
 lib/mpi/Makefile                   |  2 ++
 lib/raid6/Makefile                 |  3 +++
 lib/reed_solomon/Makefile          |  2 ++
 lib/xz/Makefile                    |  3 +++
 lib/zlib_deflate/Makefile          |  2 ++
 lib/zlib_dfltcc/Makefile           |  2 ++
 lib/zlib_inflate/Makefile          |  2 ++
 lib/zstd/Makefile                  |  1 +
 scripts/Makefile.lib               | 14 ++++++++------
 24 files changed, 64 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/kbuild/makefiles.rst b/Documentation/kbuild/makefiles.rst
index 6515ebc12b6f..14d8e7d23c04 100644
--- a/Documentation/kbuild/makefiles.rst
+++ b/Documentation/kbuild/makefiles.rst
@@ -368,6 +368,14 @@ more details, with real examples.
 
 		subdir-ccflags-y := -Werror
 
+    ccflags-remove-y, asflags-remove-y
+	These flags are used to remove particular flags for the compiler,
+	assembler invocations.
+
+	Example::
+
+		ccflags-remove-$(CONFIG_MCOUNT) += -pg
+
     CFLAGS_$@, AFLAGS_$@
 	CFLAGS_$@ and AFLAGS_$@ only apply to commands in current
 	kbuild makefile.
@@ -375,6 +383,9 @@ more details, with real examples.
 	$(CFLAGS_$@) specifies per-file options for $(CC).  The $@
 	part has a literal value which specifies the file that it is for.
 
+	CFLAGS_$@ has the higher priority than ccflags-remove-y; CFLAGS_$@
+	can re-add compiler flags that were removed by ccflags-remove-y.
+
 	Example::
 
 		# drivers/scsi/Makefile
@@ -387,6 +398,9 @@ more details, with real examples.
 	$(AFLAGS_$@) is a similar feature for source files in assembly
 	languages.
 
+	AFLAGS_$@ has the higher priority than asflags-remove-y; AFLAGS_$@
+	can re-add assembler flags that were removed by asflags-remove-y.
+
 	Example::
 
 		# arch/arm/kernel/Makefile
diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile
index cb7a56c6723c..b1147b7f2c8d 100644
--- a/arch/arm/boot/compressed/Makefile
+++ b/arch/arm/boot/compressed/Makefile
@@ -102,13 +102,9 @@ clean-files += piggy_data lib1funcs.S ashldi3.S bswapsdi2.S hyp-stub.S
 
 KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
 
-ifeq ($(CONFIG_FUNCTION_TRACER),y)
-ORIG_CFLAGS := $(KBUILD_CFLAGS)
-KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS))
-endif
-
 ccflags-y := -fpic $(call cc-option,-mno-single-pic-base,) -fno-builtin \
 	     -I$(obj) $(DISABLE_ARM_SSP_PER_TASK_PLUGIN)
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += -pg
 asflags-y := -DZIMAGE
 
 # Supply kernel BSS size to the decompressor via a linker symbol.
diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile
index 89c76ca35640..eb25d7554ffd 100644
--- a/arch/powerpc/xmon/Makefile
+++ b/arch/powerpc/xmon/Makefile
@@ -7,8 +7,7 @@ UBSAN_SANITIZE := n
 KASAN_SANITIZE := n
 
 # Disable ftrace for the entire directory
-ORIG_CFLAGS := $(KBUILD_CFLAGS)
-KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
 
 ifdef CONFIG_CC_IS_CLANG
 # clang stores addresses on the stack causing the frame size to blow
diff --git a/arch/sh/boot/compressed/Makefile b/arch/sh/boot/compressed/Makefile
index ad0e2403e56f..589d2d8a573d 100644
--- a/arch/sh/boot/compressed/Makefile
+++ b/arch/sh/boot/compressed/Makefile
@@ -28,10 +28,7 @@ IMAGE_OFFSET	:= $(shell /bin/bash -c 'printf "0x%08x" \
 			$(CONFIG_BOOT_LINK_OFFSET)]')
 endif
 
-ifeq ($(CONFIG_MCOUNT),y)
-ORIG_CFLAGS := $(KBUILD_CFLAGS)
-KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS))
-endif
+ccflags-remove-$(CONFIG_MCOUNT) += -pg
 
 LDFLAGS_vmlinux := --oformat $(ld-bfd) -Ttext $(IMAGE_OFFSET) -e startup \
 		   -T $(obj)/../../kernel/vmlinux.lds
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 6575bb0a0434..7492844a8b1b 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -2,9 +2,9 @@
 
 # Do not instrument the tracer itself:
 
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
+
 ifdef CONFIG_FUNCTION_TRACER
-ORIG_CFLAGS := $(KBUILD_CFLAGS)
-KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))
 
 # Avoid recursion due to instrumentation.
 KCSAN_SANITIZE := n
diff --git a/lib/842/Makefile b/lib/842/Makefile
index 6f7aad269288..b815e824ae37 100644
--- a/lib/842/Makefile
+++ b/lib/842/Makefile
@@ -1,3 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
+
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
+
 obj-$(CONFIG_842_COMPRESS) += 842_compress.o
 obj-$(CONFIG_842_DECOMPRESS) += 842_decompress.o
diff --git a/lib/Makefile b/lib/Makefile
index 0cda70649f1c..233f6e644eeb 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -3,10 +3,7 @@
 # Makefile for some libs needed in the kernel.
 #
 
-ifdef CONFIG_FUNCTION_TRACER
-ORIG_CFLAGS := $(KBUILD_CFLAGS)
-KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))
-endif
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
 
 # These files are disabled because they produce lots of non-interesting and/or
 # flaky coverage that is not a function of syscall inputs. For example,
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 3a435629d9ce..b557ef0b07c2 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
+
 # chacha is used by the /dev/random driver which is always builtin
 obj-y						+= chacha.o
 obj-$(CONFIG_CRYPTO_LIB_CHACHA_GENERIC)		+= libchacha.o
diff --git a/lib/dim/Makefile b/lib/dim/Makefile
index 1d6858a108cb..97fc3e89d34e 100644
--- a/lib/dim/Makefile
+++ b/lib/dim/Makefile
@@ -2,6 +2,8 @@
 # DIM Dynamic Interrupt Moderation library
 #
 
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
+
 obj-$(CONFIG_DIMLIB) += dim.o
 
 dim-y := dim.o net_dim.o rdma_dim.o
diff --git a/lib/fonts/Makefile b/lib/fonts/Makefile
index ed95070860de..f951750c179e 100644
--- a/lib/fonts/Makefile
+++ b/lib/fonts/Makefile
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0
 # Font handling
 
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
+
 font-objs := fonts.o
 
 font-objs-$(CONFIG_FONT_SUN8x16)   += font_sun8x16.o
diff --git a/lib/kunit/Makefile b/lib/kunit/Makefile
index 724b94311ca3..8c847557ab24 100644
--- a/lib/kunit/Makefile
+++ b/lib/kunit/Makefile
@@ -1,3 +1,6 @@
+
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
+
 obj-$(CONFIG_KUNIT) +=			kunit.o
 
 kunit-objs +=				test.o \
diff --git a/lib/livepatch/Makefile b/lib/livepatch/Makefile
index 295b94bff370..9abdf615b088 100644
--- a/lib/livepatch/Makefile
+++ b/lib/livepatch/Makefile
@@ -2,6 +2,8 @@
 #
 # Makefile for livepatch test code.
 
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
+
 obj-$(CONFIG_TEST_LIVEPATCH) += test_klp_atomic_replace.o \
 				test_klp_callbacks_demo.o \
 				test_klp_callbacks_demo2.o \
diff --git a/lib/lz4/Makefile b/lib/lz4/Makefile
index 5b42242afaa2..53da4cab7015 100644
--- a/lib/lz4/Makefile
+++ b/lib/lz4/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 ccflags-y += -O3
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
 
 obj-$(CONFIG_LZ4_COMPRESS) += lz4_compress.o
 obj-$(CONFIG_LZ4HC_COMPRESS) += lz4hc_compress.o
diff --git a/lib/lzo/Makefile b/lib/lzo/Makefile
index 2f58fafbbddd..9565a555275b 100644
--- a/lib/lzo/Makefile
+++ b/lib/lzo/Makefile
@@ -1,4 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
+
 lzo_compress-objs := lzo1x_compress.o
 lzo_decompress-objs := lzo1x_decompress_safe.o
 
diff --git a/lib/math/Makefile b/lib/math/Makefile
index be6909e943bd..49aa50e28185 100644
--- a/lib/math/Makefile
+++ b/lib/math/Makefile
@@ -1,4 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
+
 obj-y += div64.o gcd.o lcm.o int_pow.o int_sqrt.o reciprocal_div.o
 
 obj-$(CONFIG_CORDIC)		+= cordic.o
diff --git a/lib/mpi/Makefile b/lib/mpi/Makefile
index d5874a7f5ff9..df7883521619 100644
--- a/lib/mpi/Makefile
+++ b/lib/mpi/Makefile
@@ -3,6 +3,8 @@
 # MPI multiprecision maths library (from gpg)
 #
 
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
+
 obj-$(CONFIG_MPILIB) = mpi.o
 
 mpi-y = \
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index b4c0df6d706d..3482d6ae3f3b 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -1,4 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
+
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
+
 obj-$(CONFIG_RAID6_PQ)	+= raid6_pq.o
 
 raid6_pq-y	+= algos.o recov.o tables.o int1.o int2.o int4.o \
diff --git a/lib/reed_solomon/Makefile b/lib/reed_solomon/Makefile
index 5d4fa68f26cb..a5c9defdac7f 100644
--- a/lib/reed_solomon/Makefile
+++ b/lib/reed_solomon/Makefile
@@ -3,5 +3,7 @@
 # This is a modified version of reed solomon lib,
 #
 
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
+
 obj-$(CONFIG_REED_SOLOMON) += reed_solomon.o
 obj-$(CONFIG_REED_SOLOMON_TEST) += test_rslib.o
diff --git a/lib/xz/Makefile b/lib/xz/Makefile
index fa6af814a8d1..fae9b6c7c389 100644
--- a/lib/xz/Makefile
+++ b/lib/xz/Makefile
@@ -1,4 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
+
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
+
 obj-$(CONFIG_XZ_DEC) += xz_dec.o
 xz_dec-y := xz_dec_syms.o xz_dec_stream.o xz_dec_lzma2.o
 xz_dec-$(CONFIG_XZ_DEC_BCJ) += xz_dec_bcj.o
diff --git a/lib/zlib_deflate/Makefile b/lib/zlib_deflate/Makefile
index 2622e03c0b94..1fcefe73536f 100644
--- a/lib/zlib_deflate/Makefile
+++ b/lib/zlib_deflate/Makefile
@@ -7,6 +7,8 @@
 # decompression code.
 #
 
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
+
 obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate.o
 
 zlib_deflate-objs := deflate.o deftree.o deflate_syms.o
diff --git a/lib/zlib_dfltcc/Makefile b/lib/zlib_dfltcc/Makefile
index 8e4d5afbbb10..7a8067f6e772 100644
--- a/lib/zlib_dfltcc/Makefile
+++ b/lib/zlib_dfltcc/Makefile
@@ -6,6 +6,8 @@
 # This is the code for s390 zlib hardware support.
 #
 
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
+
 obj-$(CONFIG_ZLIB_DFLTCC) += zlib_dfltcc.o
 
 zlib_dfltcc-objs := dfltcc.o dfltcc_deflate.o dfltcc_inflate.o dfltcc_syms.o
diff --git a/lib/zlib_inflate/Makefile b/lib/zlib_inflate/Makefile
index 27327d3e9f54..a451e96f9845 100644
--- a/lib/zlib_inflate/Makefile
+++ b/lib/zlib_inflate/Makefile
@@ -14,6 +14,8 @@
 # uncompression can be done without blocking on allocation).
 #
 
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
+
 obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate.o
 
 zlib_inflate-objs := inffast.o inflate.o infutil.o \
diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile
index f5d778e7e5c7..01be908a2d94 100644
--- a/lib/zstd/Makefile
+++ b/lib/zstd/Makefile
@@ -3,6 +3,7 @@ obj-$(CONFIG_ZSTD_COMPRESS) += zstd_compress.o
 obj-$(CONFIG_ZSTD_DECOMPRESS) += zstd_decompress.o
 
 ccflags-y += -O3
+ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
 
 zstd_compress-y := fse_compress.o huf_compress.o compress.o \
 		   entropy_common.o fse_decompress.o zstd_common.o
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 8fa9aa2c9fca..5cfd377778b4 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -111,12 +111,14 @@ basename_flags = -DKBUILD_BASENAME=$(call name-fix,$(basetarget))
 modname_flags  = -DKBUILD_MODNAME=$(call name-fix,$(modname))
 modfile_flags  = -DKBUILD_MODFILE=$(call stringify,$(modfile))
 
-orig_c_flags   = $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) \
-                 $(ccflags-y) $(CFLAGS_$(target-stem).o)
-_c_flags       = $(filter-out $(CFLAGS_REMOVE_$(target-stem).o), $(orig_c_flags))
-orig_a_flags   = $(KBUILD_CPPFLAGS) $(KBUILD_AFLAGS) \
-                 $(asflags-y) $(AFLAGS_$(target-stem).o)
-_a_flags       = $(filter-out $(AFLAGS_REMOVE_$(target-stem).o), $(orig_a_flags))
+_c_flags       = $(filter-out $(CFLAGS_REMOVE_$(target-stem).o), \
+                     $(filter-out $(ccflags-remove-y), \
+                         $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(ccflags-y)) \
+                     $(CFLAGS_$(target-stem).o))
+_a_flags       = $(filter-out $(AFLAGS_REMOVE_$(target-stem).o), \
+                     $(filter-out $(asflags-remove-y), \
+                         $(KBUILD_CPPFLAGS) $(KBUILD_AFLAGS) $(asflags-y)) \
+                     $(AFLAGS_$(target-stem).o))
 _cpp_flags     = $(KBUILD_CPPFLAGS) $(cppflags-y) $(CPPFLAGS_$(target-stem).lds)
 
 #
-- 
cgit v1.2.3


From b0294f30256bb6023b2044fd607855123863d98f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Thu, 6 Aug 2020 20:32:48 -0700
Subject: time: Delete repeated words in comments

Drop repeated words in kernel/time/.  {when, one, into}

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <john.stultz@linaro.org>
Link: https://lore.kernel.org/r/20200807033248.8452-1-rdunlap@infradead.org
---
 kernel/time/alarmtimer.c  | 2 +-
 kernel/time/sched_clock.c | 2 +-
 kernel/time/timekeeping.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 2ffb466af77e..ca223a89530a 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -192,7 +192,7 @@ static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm)
  * When a alarm timer fires, this runs through the timerqueue to
  * see which alarms expired, and runs those. If there are more alarm
  * timers queued for the future, we set the hrtimer to fire when
- * when the next future alarm timer expires.
+ * the next future alarm timer expires.
  */
 static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
 {
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 0deaf4b79fb4..1c03eec6ca9b 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -229,7 +229,7 @@ void __init generic_sched_clock_init(void)
 {
 	/*
 	 * If no sched_clock() function has been provided at that point,
-	 * make it the final one one.
+	 * make it the final one.
 	 */
 	if (cd.actual_read_sched_clock == jiffy_sched_clock_read)
 		sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 4c7212f3c603..35cd10f1143b 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2001,7 +2001,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
  * logarithmic_accumulation - shifted accumulation of cycles
  *
  * This functions accumulates a shifted interval of cycles into
- * into a shifted interval nanoseconds. Allows for O(log) accumulation
+ * a shifted interval nanoseconds. Allows for O(log) accumulation
  * loop.
  *
  * Returns the unconsumed cycles.
-- 
cgit v1.2.3


From e27b1636e9337d1a1d174b191e53d0f86421a822 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Tue, 11 Aug 2020 11:00:01 -0700
Subject: genirq/PM: Always unlock IRQ descriptor in rearm_wake_irq()

rearm_wake_irq() does not unlock the irq descriptor if the interrupt
is not suspended or if wakeup is not enabled on it.

Restucture the exit conditions so the unlock is always ensured.

Fixes: 3a79bc63d9075 ("PCI: irq: Introduce rearm_wake_irq()")
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20200811180001.80203-1-linux@roeck-us.net
---
 kernel/irq/pm.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 8f557fa1f4fe..c6c7e187ae74 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -185,14 +185,18 @@ void rearm_wake_irq(unsigned int irq)
 	unsigned long flags;
 	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
 
-	if (!desc || !(desc->istate & IRQS_SUSPENDED) ||
-	    !irqd_is_wakeup_set(&desc->irq_data))
+	if (!desc)
 		return;
 
+	if (!(desc->istate & IRQS_SUSPENDED) ||
+	    !irqd_is_wakeup_set(&desc->irq_data))
+		goto unlock;
+
 	desc->istate &= ~IRQS_SUSPENDED;
 	irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
 	__enable_irq(desc);
 
+unlock:
 	irq_put_desc_busunlock(desc, flags);
 }
 
-- 
cgit v1.2.3


From b518154e59aab3ad0780a169c5cc84bd4ee4357e Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Tue, 11 Aug 2020 18:30:40 -0700
Subject: mm/vmscan: protect the workingset on anonymous LRU

In current implementation, newly created or swap-in anonymous page is
started on active list.  Growing active list results in rebalancing
active/inactive list so old pages on active list are demoted to inactive
list.  Hence, the page on active list isn't protected at all.

Following is an example of this situation.

Assume that 50 hot pages on active list.  Numbers denote the number of
pages on active/inactive list (active | inactive).

1. 50 hot pages on active list
50(h) | 0

2. workload: 50 newly created (used-once) pages
50(uo) | 50(h)

3. workload: another 50 newly created (used-once) pages
50(uo) | 50(uo), swap-out 50(h)

This patch tries to fix this issue.  Like as file LRU, newly created or
swap-in anonymous pages will be inserted to the inactive list.  They are
promoted to active list if enough reference happens.  This simple
modification changes the above example as following.

1. 50 hot pages on active list
50(h) | 0

2. workload: 50 newly created (used-once) pages
50(h) | 50(uo)

3. workload: another 50 newly created (used-once) pages
50(h) | 50(uo), swap-out 50(uo)

As you can see, hot pages on active list would be protected.

Note that, this implementation has a drawback that the page cannot be
promoted and will be swapped-out if re-access interval is greater than the
size of inactive list but less than the size of total(active+inactive).
To solve this potential issue, following patch will apply workingset
detection similar to the one that's already applied to file LRU.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Link: http://lkml.kernel.org/r/1595490560-15117-3-git-send-email-iamjoonsoo.kim@lge.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h    |  2 +-
 kernel/events/uprobes.c |  2 +-
 mm/huge_memory.c        |  2 +-
 mm/khugepaged.c         |  2 +-
 mm/memory.c             |  9 ++++-----
 mm/migrate.c            |  2 +-
 mm/swap.c               | 13 +++++++------
 mm/swapfile.c           |  2 +-
 mm/userfaultfd.c        |  2 +-
 mm/vmscan.c             |  4 +---
 10 files changed, 19 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7eb59bc552a5..51ec9cdb92c0 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -352,7 +352,7 @@ extern void deactivate_page(struct page *page);
 extern void mark_page_lazyfree(struct page *page);
 extern void swap_setup(void);
 
-extern void lru_cache_add_active_or_unevictable(struct page *page,
+extern void lru_cache_add_inactive_or_unevictable(struct page *page,
 						struct vm_area_struct *vma);
 
 /* linux/mm/vmscan.c */
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 25de10c904e6..49047d479c57 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -184,7 +184,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	if (new_page) {
 		get_page(new_page);
 		page_add_new_anon_rmap(new_page, vma, addr, false);
-		lru_cache_add_active_or_unevictable(new_page, vma);
+		lru_cache_add_inactive_or_unevictable(new_page, vma);
 	} else
 		/* no new page, just dec_mm_counter for old_page */
 		dec_mm_counter(mm, MM_ANONPAGES);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 206f52b36ffb..863c495776d7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -640,7 +640,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 		entry = mk_huge_pmd(page, vma->vm_page_prot);
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 		page_add_new_anon_rmap(page, vma, haddr, true);
-		lru_cache_add_active_or_unevictable(page, vma);
+		lru_cache_add_inactive_or_unevictable(page, vma);
 		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
 		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
 		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b52bd46ad146..15a9af791014 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1173,7 +1173,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	spin_lock(pmd_ptl);
 	BUG_ON(!pmd_none(*pmd));
 	page_add_new_anon_rmap(new_page, vma, address, true);
-	lru_cache_add_active_or_unevictable(new_page, vma);
+	lru_cache_add_inactive_or_unevictable(new_page, vma);
 	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, address, pmd, _pmd);
 	update_mmu_cache_pmd(vma, address, pmd);
diff --git a/mm/memory.c b/mm/memory.c
index c39a13b09602..6fe8b5b22c57 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2715,7 +2715,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		 */
 		ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
 		page_add_new_anon_rmap(new_page, vma, vmf->address, false);
-		lru_cache_add_active_or_unevictable(new_page, vma);
+		lru_cache_add_inactive_or_unevictable(new_page, vma);
 		/*
 		 * We call the notify macro here because, when using secondary
 		 * mmu page tables (such as kvm shadow page tables), we want the
@@ -3266,10 +3266,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	/* ksm created a completely new copy */
 	if (unlikely(page != swapcache && swapcache)) {
 		page_add_new_anon_rmap(page, vma, vmf->address, false);
-		lru_cache_add_active_or_unevictable(page, vma);
+		lru_cache_add_inactive_or_unevictable(page, vma);
 	} else {
 		do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
-		activate_page(page);
 	}
 
 	swap_free(entry);
@@ -3414,7 +3413,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 
 	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
 	page_add_new_anon_rmap(page, vma, vmf->address, false);
-	lru_cache_add_active_or_unevictable(page, vma);
+	lru_cache_add_inactive_or_unevictable(page, vma);
 setpte:
 	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
 
@@ -3672,7 +3671,7 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
 	if (write && !(vma->vm_flags & VM_SHARED)) {
 		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
 		page_add_new_anon_rmap(page, vma, vmf->address, false);
-		lru_cache_add_active_or_unevictable(page, vma);
+		lru_cache_add_inactive_or_unevictable(page, vma);
 	} else {
 		inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
 		page_add_file_rmap(page, false);
diff --git a/mm/migrate.c b/mm/migrate.c
index d179657f8685..819e55130134 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2830,7 +2830,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 	inc_mm_counter(mm, MM_ANONPAGES);
 	page_add_new_anon_rmap(page, vma, addr, false);
 	if (!is_zone_device_page(page))
-		lru_cache_add_active_or_unevictable(page, vma);
+		lru_cache_add_inactive_or_unevictable(page, vma);
 	get_page(page);
 
 	if (flush) {
diff --git a/mm/swap.c b/mm/swap.c
index de257c0a89b1..9285e60c7d6e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -476,23 +476,24 @@ void lru_cache_add(struct page *page)
 EXPORT_SYMBOL(lru_cache_add);
 
 /**
- * lru_cache_add_active_or_unevictable
+ * lru_cache_add_inactive_or_unevictable
  * @page:  the page to be added to LRU
  * @vma:   vma in which page is mapped for determining reclaimability
  *
- * Place @page on the active or unevictable LRU list, depending on its
+ * Place @page on the inactive or unevictable LRU list, depending on its
  * evictability.  Note that if the page is not evictable, it goes
  * directly back onto it's zone's unevictable list, it does NOT use a
  * per cpu pagevec.
  */
-void lru_cache_add_active_or_unevictable(struct page *page,
+void lru_cache_add_inactive_or_unevictable(struct page *page,
 					 struct vm_area_struct *vma)
 {
+	bool unevictable;
+
 	VM_BUG_ON_PAGE(PageLRU(page), page);
 
-	if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
-		SetPageActive(page);
-	else if (!TestSetPageMlocked(page)) {
+	unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
+	if (unlikely(unevictable) && !TestSetPageMlocked(page)) {
 		/*
 		 * We use the irq-unsafe __mod_zone_page_stat because this
 		 * counter is not modified from interrupt context, and the pte
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6c26916e95fd..82183432fdd0 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1915,7 +1915,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 		page_add_anon_rmap(page, vma, addr, false);
 	} else { /* ksm created a completely new copy */
 		page_add_new_anon_rmap(page, vma, addr, false);
-		lru_cache_add_active_or_unevictable(page, vma);
+		lru_cache_add_inactive_or_unevictable(page, vma);
 	}
 	swap_free(entry);
 	/*
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index b80419320c7d..9a3d451402d7 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -123,7 +123,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 
 	inc_mm_counter(dst_mm, MM_ANONPAGES);
 	page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
-	lru_cache_add_active_or_unevictable(page, dst_vma);
+	lru_cache_add_inactive_or_unevictable(page, dst_vma);
 
 	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e34fc04b7045..783cd7fdc61a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -998,8 +998,6 @@ static enum page_references page_check_references(struct page *page,
 		return PAGEREF_RECLAIM;
 
 	if (referenced_ptes) {
-		if (PageSwapBacked(page))
-			return PAGEREF_ACTIVATE;
 		/*
 		 * All mapped pages start out with page table
 		 * references from the instantiating fault, so we need
@@ -1022,7 +1020,7 @@ static enum page_references page_check_references(struct page *page,
 		/*
 		 * Activate file-backed executable pages after first usage.
 		 */
-		if (vm_flags & VM_EXEC)
+		if ((vm_flags & VM_EXEC) && !PageSwapBacked(page))
 			return PAGEREF_ACTIVATE;
 
 		return PAGEREF_KEEP;
-- 
cgit v1.2.3


From facdaa917c4d5a376d09d25865f5a863f906234a Mon Sep 17 00:00:00 2001
From: Nitin Gupta <nigupta@nvidia.com>
Date: Tue, 11 Aug 2020 18:31:00 -0700
Subject: mm: proactive compaction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For some applications, we need to allocate almost all memory as hugepages.
However, on a running system, higher-order allocations can fail if the
memory is fragmented.  Linux kernel currently does on-demand compaction as
we request more hugepages, but this style of compaction incurs very high
latency.  Experiments with one-time full memory compaction (followed by
hugepage allocations) show that kernel is able to restore a highly
fragmented memory state to a fairly compacted memory state within <1 sec
for a 32G system.  Such data suggests that a more proactive compaction can
help us allocate a large fraction of memory as hugepages keeping
allocation latencies low.

For a more proactive compaction, the approach taken here is to define a
new sysctl called 'vm.compaction_proactiveness' which dictates bounds for
external fragmentation which kcompactd tries to maintain.

The tunable takes a value in range [0, 100], with a default of 20.

Note that a previous version of this patch [1] was found to introduce too
many tunables (per-order extfrag{low, high}), but this one reduces them to
just one sysctl.  Also, the new tunable is an opaque value instead of
asking for specific bounds of "external fragmentation", which would have
been difficult to estimate.  The internal interpretation of this opaque
value allows for future fine-tuning.

Currently, we use a simple translation from this tunable to [low, high]
"fragmentation score" thresholds (low=100-proactiveness, high=low+10%).
The score for a node is defined as weighted mean of per-zone external
fragmentation.  A zone's present_pages determines its weight.

To periodically check per-node score, we reuse per-node kcompactd threads,
which are woken up every 500 milliseconds to check the same.  If a node's
score exceeds its high threshold (as derived from user-provided
proactiveness value), proactive compaction is started until its score
reaches its low threshold value.  By default, proactiveness is set to 20,
which implies threshold values of low=80 and high=90.

This patch is largely based on ideas from Michal Hocko [2].  See also the
LWN article [3].

Performance data
================

System: x64_64, 1T RAM, 80 CPU threads.
Kernel: 5.6.0-rc3 + this patch

echo madvise | sudo tee /sys/kernel/mm/transparent_hugepage/enabled
echo madvise | sudo tee /sys/kernel/mm/transparent_hugepage/defrag

Before starting the driver, the system was fragmented from a userspace
program that allocates all memory and then for each 2M aligned section,
frees 3/4 of base pages using munmap.  The workload is mainly anonymous
userspace pages, which are easy to move around.  I intentionally avoided
unmovable pages in this test to see how much latency we incur when
hugepage allocations hit direct compaction.

1. Kernel hugepage allocation latencies

With the system in such a fragmented state, a kernel driver then allocates
as many hugepages as possible and measures allocation latency:

(all latency values are in microseconds)

- With vanilla 5.6.0-rc3

  percentile latency
  –––––––––– –––––––
	   5    7894
	  10    9496
	  25   12561
	  30   15295
	  40   18244
	  50   21229
	  60   27556
	  75   30147
	  80   31047
	  90   32859
	  95   33799

Total 2M hugepages allocated = 383859 (749G worth of hugepages out of 762G
total free => 98% of free memory could be allocated as hugepages)

- With 5.6.0-rc3 + this patch, with proactiveness=20

sysctl -w vm.compaction_proactiveness=20

  percentile latency
  –––––––––– –––––––
	   5       2
	  10       2
	  25       3
	  30       3
	  40       3
	  50       4
	  60       4
	  75       4
	  80       4
	  90       5
	  95     429

Total 2M hugepages allocated = 384105 (750G worth of hugepages out of 762G
total free => 98% of free memory could be allocated as hugepages)

2. JAVA heap allocation

In this test, we first fragment memory using the same method as for (1).

Then, we start a Java process with a heap size set to 700G and request the
heap to be allocated with THP hugepages.  We also set THP to madvise to
allow hugepage backing of this heap.

/usr/bin/time
 java -Xms700G -Xmx700G -XX:+UseTransparentHugePages -XX:+AlwaysPreTouch

The above command allocates 700G of Java heap using hugepages.

- With vanilla 5.6.0-rc3

17.39user 1666.48system 27:37.89elapsed

- With 5.6.0-rc3 + this patch, with proactiveness=20

8.35user 194.58system 3:19.62elapsed

Elapsed time remains around 3:15, as proactiveness is further increased.

Note that proactive compaction happens throughout the runtime of these
workloads.  The situation of one-time compaction, sufficient to supply
hugepages for following allocation stream, can probably happen for more
extreme proactiveness values, like 80 or 90.

In the above Java workload, proactiveness is set to 20.  The test starts
with a node's score of 80 or higher, depending on the delay between the
fragmentation step and starting the benchmark, which gives more-or-less
time for the initial round of compaction.  As t he benchmark consumes
hugepages, node's score quickly rises above the high threshold (90) and
proactive compaction starts again, which brings down the score to the low
threshold level (80).  Repeat.

bpftrace also confirms proactive compaction running 20+ times during the
runtime of this Java benchmark.  kcompactd threads consume 100% of one of
the CPUs while it tries to bring a node's score within thresholds.

Backoff behavior
================

Above workloads produce a memory state which is easy to compact.  However,
if memory is filled with unmovable pages, proactive compaction should
essentially back off.  To test this aspect:

- Created a kernel driver that allocates almost all memory as hugepages
  followed by freeing first 3/4 of each hugepage.
- Set proactiveness=40
- Note that proactive_compact_node() is deferred maximum number of times
  with HPAGE_FRAG_CHECK_INTERVAL_MSEC of wait between each check
  (=> ~30 seconds between retries).

[1] https://patchwork.kernel.org/patch/11098289/
[2] https://lore.kernel.org/linux-mm/20161230131412.GI13301@dhcp22.suse.cz/
[3] https://lwn.net/Articles/817905/

Signed-off-by: Nitin Gupta <nigupta@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Tested-by: Oleksandr Natalenko <oleksandr@redhat.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Khalid Aziz <khalid.aziz@oracle.com>
Reviewed-by: Oleksandr Natalenko <oleksandr@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Khalid Aziz <khalid.aziz@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Nitin Gupta <ngupta@nitingupta.dev>
Cc: Oleksandr Natalenko <oleksandr@redhat.com>
Link: http://lkml.kernel.org/r/20200616204527.19185-1-nigupta@nvidia.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/sysctl/vm.rst |  15 +++
 include/linux/compaction.h              |   2 +
 kernel/sysctl.c                         |   9 ++
 mm/compaction.c                         | 183 +++++++++++++++++++++++++++++++-
 mm/internal.h                           |   1 +
 mm/vmstat.c                             |  18 ++++
 6 files changed, 223 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index d997cc3c26d0..4b9d2e8e9142 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -119,6 +119,21 @@ all zones are compacted such that free memory is available in contiguous
 blocks where possible. This can be important for example in the allocation of
 huge pages although processes will also directly compact memory as required.
 
+compaction_proactiveness
+========================
+
+This tunable takes a value in the range [0, 100] with a default value of
+20. This tunable determines how aggressively compaction is done in the
+background. Setting it to 0 disables proactive compaction.
+
+Note that compaction has a non-trivial system-wide impact as pages
+belonging to different processes are moved around, which could also lead
+to latency spikes in unsuspecting applications. The kernel employs
+various heuristics to avoid wasting CPU cycles if it detects that
+proactive compaction is not being effective.
+
+Be careful when setting it to extreme values like 100, as that may
+cause excessive background compaction activity.
 
 compact_unevictable_allowed
 ===========================
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 6fa0eea3f530..7a242d46454e 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -85,11 +85,13 @@ static inline unsigned long compact_gap(unsigned int order)
 
 #ifdef CONFIG_COMPACTION
 extern int sysctl_compact_memory;
+extern int sysctl_compaction_proactiveness;
 extern int sysctl_compaction_handler(struct ctl_table *table, int write,
 			void *buffer, size_t *length, loff_t *ppos);
 extern int sysctl_extfrag_threshold;
 extern int sysctl_compact_unevictable_allowed;
 
+extern int extfrag_for_order(struct zone *zone, unsigned int order);
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
 		unsigned int order, unsigned int alloc_flags,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f785de3caac0..ab72e52f8a7b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2851,6 +2851,15 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0200,
 		.proc_handler	= sysctl_compaction_handler,
 	},
+	{
+		.procname	= "compaction_proactiveness",
+		.data		= &sysctl_compaction_proactiveness,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &one_hundred,
+	},
 	{
 		.procname	= "extfrag_threshold",
 		.data		= &sysctl_extfrag_threshold,
diff --git a/mm/compaction.c b/mm/compaction.c
index 86375605faa9..544a98811c82 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -50,6 +50,24 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
 #define pageblock_start_pfn(pfn)	block_start_pfn(pfn, pageblock_order)
 #define pageblock_end_pfn(pfn)		block_end_pfn(pfn, pageblock_order)
 
+/*
+ * Fragmentation score check interval for proactive compaction purposes.
+ */
+static const int HPAGE_FRAG_CHECK_INTERVAL_MSEC = 500;
+
+/*
+ * Page order with-respect-to which proactive compaction
+ * calculates external fragmentation, which is used as
+ * the "fragmentation score" of a node/zone.
+ */
+#if defined CONFIG_TRANSPARENT_HUGEPAGE
+#define COMPACTION_HPAGE_ORDER	HPAGE_PMD_ORDER
+#elif defined HUGETLB_PAGE_ORDER
+#define COMPACTION_HPAGE_ORDER	HUGETLB_PAGE_ORDER
+#else
+#define COMPACTION_HPAGE_ORDER	(PMD_SHIFT - PAGE_SHIFT)
+#endif
+
 static unsigned long release_freepages(struct list_head *freelist)
 {
 	struct page *page, *next;
@@ -1857,6 +1875,76 @@ static inline bool is_via_compact_memory(int order)
 	return order == -1;
 }
 
+static bool kswapd_is_running(pg_data_t *pgdat)
+{
+	return pgdat->kswapd && (pgdat->kswapd->state == TASK_RUNNING);
+}
+
+/*
+ * A zone's fragmentation score is the external fragmentation wrt to the
+ * COMPACTION_HPAGE_ORDER scaled by the zone's size. It returns a value
+ * in the range [0, 100].
+ *
+ * The scaling factor ensures that proactive compaction focuses on larger
+ * zones like ZONE_NORMAL, rather than smaller, specialized zones like
+ * ZONE_DMA32. For smaller zones, the score value remains close to zero,
+ * and thus never exceeds the high threshold for proactive compaction.
+ */
+static int fragmentation_score_zone(struct zone *zone)
+{
+	unsigned long score;
+
+	score = zone->present_pages *
+			extfrag_for_order(zone, COMPACTION_HPAGE_ORDER);
+	return div64_ul(score, zone->zone_pgdat->node_present_pages + 1);
+}
+
+/*
+ * The per-node proactive (background) compaction process is started by its
+ * corresponding kcompactd thread when the node's fragmentation score
+ * exceeds the high threshold. The compaction process remains active till
+ * the node's score falls below the low threshold, or one of the back-off
+ * conditions is met.
+ */
+static int fragmentation_score_node(pg_data_t *pgdat)
+{
+	unsigned long score = 0;
+	int zoneid;
+
+	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+		struct zone *zone;
+
+		zone = &pgdat->node_zones[zoneid];
+		score += fragmentation_score_zone(zone);
+	}
+
+	return score;
+}
+
+static int fragmentation_score_wmark(pg_data_t *pgdat, bool low)
+{
+	int wmark_low;
+
+	/*
+	 * Cap the low watermak to avoid excessive compaction
+	 * activity in case a user sets the proactivess tunable
+	 * close to 100 (maximum).
+	 */
+	wmark_low = max(100 - sysctl_compaction_proactiveness, 5);
+	return low ? wmark_low : min(wmark_low + 10, 100);
+}
+
+static bool should_proactive_compact_node(pg_data_t *pgdat)
+{
+	int wmark_high;
+
+	if (!sysctl_compaction_proactiveness || kswapd_is_running(pgdat))
+		return false;
+
+	wmark_high = fragmentation_score_wmark(pgdat, false);
+	return fragmentation_score_node(pgdat) > wmark_high;
+}
+
 static enum compact_result __compact_finished(struct compact_control *cc)
 {
 	unsigned int order;
@@ -1883,6 +1971,25 @@ static enum compact_result __compact_finished(struct compact_control *cc)
 			return COMPACT_PARTIAL_SKIPPED;
 	}
 
+	if (cc->proactive_compaction) {
+		int score, wmark_low;
+		pg_data_t *pgdat;
+
+		pgdat = cc->zone->zone_pgdat;
+		if (kswapd_is_running(pgdat))
+			return COMPACT_PARTIAL_SKIPPED;
+
+		score = fragmentation_score_zone(cc->zone);
+		wmark_low = fragmentation_score_wmark(pgdat, true);
+
+		if (score > wmark_low)
+			ret = COMPACT_CONTINUE;
+		else
+			ret = COMPACT_SUCCESS;
+
+		goto out;
+	}
+
 	if (is_via_compact_memory(cc->order))
 		return COMPACT_CONTINUE;
 
@@ -1941,6 +2048,7 @@ static enum compact_result __compact_finished(struct compact_control *cc)
 		}
 	}
 
+out:
 	if (cc->contended || fatal_signal_pending(current))
 		ret = COMPACT_CONTENDED;
 
@@ -2421,6 +2529,41 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
 	return rc;
 }
 
+/*
+ * Compact all zones within a node till each zone's fragmentation score
+ * reaches within proactive compaction thresholds (as determined by the
+ * proactiveness tunable).
+ *
+ * It is possible that the function returns before reaching score targets
+ * due to various back-off conditions, such as, contention on per-node or
+ * per-zone locks.
+ */
+static void proactive_compact_node(pg_data_t *pgdat)
+{
+	int zoneid;
+	struct zone *zone;
+	struct compact_control cc = {
+		.order = -1,
+		.mode = MIGRATE_SYNC_LIGHT,
+		.ignore_skip_hint = true,
+		.whole_zone = true,
+		.gfp_mask = GFP_KERNEL,
+		.proactive_compaction = true,
+	};
+
+	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+		zone = &pgdat->node_zones[zoneid];
+		if (!populated_zone(zone))
+			continue;
+
+		cc.zone = zone;
+
+		compact_zone(&cc, NULL);
+
+		VM_BUG_ON(!list_empty(&cc.freepages));
+		VM_BUG_ON(!list_empty(&cc.migratepages));
+	}
+}
 
 /* Compact all zones within a node */
 static void compact_node(int nid)
@@ -2467,6 +2610,13 @@ static void compact_nodes(void)
 /* The written value is actually unused, all memory is compacted */
 int sysctl_compact_memory;
 
+/*
+ * Tunable for proactive compaction. It determines how
+ * aggressively the kernel should compact memory in the
+ * background. It takes values in the range [0, 100].
+ */
+int __read_mostly sysctl_compaction_proactiveness = 20;
+
 /*
  * This is the entry point for compacting all nodes via
  * /proc/sys/vm/compact_memory
@@ -2646,6 +2796,7 @@ static int kcompactd(void *p)
 {
 	pg_data_t *pgdat = (pg_data_t*)p;
 	struct task_struct *tsk = current;
+	unsigned int proactive_defer = 0;
 
 	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
 
@@ -2661,12 +2812,34 @@ static int kcompactd(void *p)
 		unsigned long pflags;
 
 		trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
-		wait_event_freezable(pgdat->kcompactd_wait,
-				kcompactd_work_requested(pgdat));
+		if (wait_event_freezable_timeout(pgdat->kcompactd_wait,
+			kcompactd_work_requested(pgdat),
+			msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC))) {
+
+			psi_memstall_enter(&pflags);
+			kcompactd_do_work(pgdat);
+			psi_memstall_leave(&pflags);
+			continue;
+		}
 
-		psi_memstall_enter(&pflags);
-		kcompactd_do_work(pgdat);
-		psi_memstall_leave(&pflags);
+		/* kcompactd wait timeout */
+		if (should_proactive_compact_node(pgdat)) {
+			unsigned int prev_score, score;
+
+			if (proactive_defer) {
+				proactive_defer--;
+				continue;
+			}
+			prev_score = fragmentation_score_node(pgdat);
+			proactive_compact_node(pgdat);
+			score = fragmentation_score_node(pgdat);
+			/*
+			 * Defer proactive compaction if the fragmentation
+			 * score did not go down i.e. no progress made.
+			 */
+			proactive_defer = score < prev_score ?
+					0 : 1 << COMPACT_MAX_DEFER_SHIFT;
+		}
 	}
 
 	return 0;
diff --git a/mm/internal.h b/mm/internal.h
index 9886db20d94f..42cf0b610847 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -239,6 +239,7 @@ struct compact_control {
 	bool no_set_skip_hint;		/* Don't mark blocks for skipping */
 	bool ignore_block_suitable;	/* Scan blocks considered unsuitable */
 	bool direct_compaction;		/* False from kcompactd or /proc/... */
+	bool proactive_compaction;	/* kcompactd proactive compaction */
 	bool whole_zone;		/* Whole zone should/has been scanned */
 	bool contended;			/* Signal lock or sched contention */
 	bool rescan;			/* Rescanning the same pageblock */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index fef03463a0cf..f183aa37994e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1096,6 +1096,24 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in
 	return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
 }
 
+/*
+ * Calculates external fragmentation within a zone wrt the given order.
+ * It is defined as the percentage of pages found in blocks of size
+ * less than 1 << order. It returns values in range [0, 100].
+ */
+int extfrag_for_order(struct zone *zone, unsigned int order)
+{
+	struct contig_page_info info;
+
+	fill_contig_page_info(zone, order, &info);
+	if (info.free_pages == 0)
+		return 0;
+
+	return div_u64((info.free_pages -
+			(info.free_blocks_suitable << order)) * 100,
+			info.free_pages);
+}
+
 /* Same as __fragmentation index but allocs contig_page_info on stack */
 int fragmentation_index(struct zone *zone, unsigned int order)
 {
-- 
cgit v1.2.3


From d34c0a7599ea8c301bc471dfa1eb2bf2db6752d1 Mon Sep 17 00:00:00 2001
From: Nitin Gupta <nigupta@nvidia.com>
Date: Tue, 11 Aug 2020 18:31:07 -0700
Subject: mm: use unsigned types for fragmentation score

Proactive compaction uses per-node/zone "fragmentation score" which is
always in range [0, 100], so use unsigned type of these scores as well as
for related constants.

Signed-off-by: Nitin Gupta <nigupta@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Baoquan He <bhe@redhat.com>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Link: http://lkml.kernel.org/r/20200618010319.13159-1-nigupta@nvidia.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compaction.h |  4 ++--
 kernel/sysctl.c            |  2 +-
 mm/compaction.c            | 18 +++++++++---------
 mm/vmstat.c                |  2 +-
 4 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 7a242d46454e..25a521d299c1 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -85,13 +85,13 @@ static inline unsigned long compact_gap(unsigned int order)
 
 #ifdef CONFIG_COMPACTION
 extern int sysctl_compact_memory;
-extern int sysctl_compaction_proactiveness;
+extern unsigned int sysctl_compaction_proactiveness;
 extern int sysctl_compaction_handler(struct ctl_table *table, int write,
 			void *buffer, size_t *length, loff_t *ppos);
 extern int sysctl_extfrag_threshold;
 extern int sysctl_compact_unevictable_allowed;
 
-extern int extfrag_for_order(struct zone *zone, unsigned int order);
+extern unsigned int extfrag_for_order(struct zone *zone, unsigned int order);
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
 		unsigned int order, unsigned int alloc_flags,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ab72e52f8a7b..287862f91717 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2854,7 +2854,7 @@ static struct ctl_table vm_table[] = {
 	{
 		.procname	= "compaction_proactiveness",
 		.data		= &sysctl_compaction_proactiveness,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(sysctl_compaction_proactiveness),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ZERO,
diff --git a/mm/compaction.c b/mm/compaction.c
index ed8ea1511634..b7d433f1706a 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -53,7 +53,7 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
 /*
  * Fragmentation score check interval for proactive compaction purposes.
  */
-static const int HPAGE_FRAG_CHECK_INTERVAL_MSEC = 500;
+static const unsigned int HPAGE_FRAG_CHECK_INTERVAL_MSEC = 500;
 
 /*
  * Page order with-respect-to which proactive compaction
@@ -1890,7 +1890,7 @@ static bool kswapd_is_running(pg_data_t *pgdat)
  * ZONE_DMA32. For smaller zones, the score value remains close to zero,
  * and thus never exceeds the high threshold for proactive compaction.
  */
-static int fragmentation_score_zone(struct zone *zone)
+static unsigned int fragmentation_score_zone(struct zone *zone)
 {
 	unsigned long score;
 
@@ -1906,9 +1906,9 @@ static int fragmentation_score_zone(struct zone *zone)
  * the node's score falls below the low threshold, or one of the back-off
  * conditions is met.
  */
-static int fragmentation_score_node(pg_data_t *pgdat)
+static unsigned int fragmentation_score_node(pg_data_t *pgdat)
 {
-	unsigned long score = 0;
+	unsigned int score = 0;
 	int zoneid;
 
 	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
@@ -1921,17 +1921,17 @@ static int fragmentation_score_node(pg_data_t *pgdat)
 	return score;
 }
 
-static int fragmentation_score_wmark(pg_data_t *pgdat, bool low)
+static unsigned int fragmentation_score_wmark(pg_data_t *pgdat, bool low)
 {
-	int wmark_low;
+	unsigned int wmark_low;
 
 	/*
 	 * Cap the low watermak to avoid excessive compaction
 	 * activity in case a user sets the proactivess tunable
 	 * close to 100 (maximum).
 	 */
-	wmark_low = max(100 - sysctl_compaction_proactiveness, 5);
-	return low ? wmark_low : min(wmark_low + 10, 100);
+	wmark_low = max(100U - sysctl_compaction_proactiveness, 5U);
+	return low ? wmark_low : min(wmark_low + 10, 100U);
 }
 
 static bool should_proactive_compact_node(pg_data_t *pgdat)
@@ -2615,7 +2615,7 @@ int sysctl_compact_memory;
  * aggressively the kernel should compact memory in the
  * background. It takes values in the range [0, 100].
  */
-int __read_mostly sysctl_compaction_proactiveness = 20;
+unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
 
 /*
  * This is the entry point for compacting all nodes via
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f183aa37994e..3bb034c99887 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1101,7 +1101,7 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in
  * It is defined as the percentage of pages found in blocks of size
  * less than 1 << order. It returns values in range [0, 100].
  */
-int extfrag_for_order(struct zone *zone, unsigned int order)
+unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
 {
 	struct contig_page_info info;
 
-- 
cgit v1.2.3


From 3d13f313ce4c34c524ccc37986fe77172f601ff3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 11 Aug 2020 18:33:47 -0700
Subject: uaccess: add force_uaccess_{begin,end} helpers

Add helpers to wrap the get_fs/set_fs magic for undoing any damange done
by set_fs(KERNEL_DS).  There is no real functional benefit, but this
documents the intent of these calls better, and will allow stubbing the
functions out easily for kernels builds that do not allow address space
overrides in the future.

[hch@lst.de: drop two incorrect hunks, fix a commit log typo]
  Link: http://lkml.kernel.org/r/20200714105505.935079-6-hch@lst.de

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Greentime Hu <green.hu@gmail.com>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Nick Hu <nickhu@andestech.com>
Cc: Vincent Chen <deanbo422@gmail.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Link: http://lkml.kernel.org/r/20200710135706.537715-6-hch@lst.de
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/kernel/sdei.c         |  2 +-
 arch/m68k/include/asm/tlbflush.h |  6 +++---
 arch/mips/kernel/unaligned.c     | 27 +++++++++++++--------------
 arch/nds32/mm/alignment.c        |  7 +++----
 arch/sh/kernel/traps_32.c        | 12 +++++-------
 drivers/firmware/arm_sdei.c      |  5 ++---
 include/linux/uaccess.h          | 18 ++++++++++++++++++
 kernel/events/callchain.c        |  5 ++---
 kernel/events/core.c             |  5 ++---
 kernel/kthread.c                 |  5 ++---
 kernel/stacktrace.c              |  5 ++---
 mm/maccess.c                     | 22 ++++++++++------------
 12 files changed, 63 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c
index dab88260b137..7689f2031c0c 100644
--- a/arch/arm64/kernel/sdei.c
+++ b/arch/arm64/kernel/sdei.c
@@ -180,7 +180,7 @@ static __kprobes unsigned long _sdei_handler(struct pt_regs *regs,
 
 	/*
 	 * We didn't take an exception to get here, set PAN. UAO will be cleared
-	 * by sdei_event_handler()s set_fs(USER_DS) call.
+	 * by sdei_event_handler()s force_uaccess_begin() call.
 	 */
 	__uaccess_enable_hw_pan();
 
diff --git a/arch/m68k/include/asm/tlbflush.h b/arch/m68k/include/asm/tlbflush.h
index 191e75a6bb24..5337bc2c262f 100644
--- a/arch/m68k/include/asm/tlbflush.h
+++ b/arch/m68k/include/asm/tlbflush.h
@@ -85,10 +85,10 @@ static inline void flush_tlb_mm(struct mm_struct *mm)
 static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)
 {
 	if (vma->vm_mm == current->active_mm) {
-		mm_segment_t old_fs = get_fs();
-		set_fs(USER_DS);
+		mm_segment_t old_fs = force_uaccess_begin();
+
 		__flush_tlb_one(addr);
-		set_fs(old_fs);
+		force_uaccess_end(old_fs);
 	}
 }
 
diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c
index 0adce604fa44..126a5f3f4e4c 100644
--- a/arch/mips/kernel/unaligned.c
+++ b/arch/mips/kernel/unaligned.c
@@ -191,17 +191,16 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 			 * memory, so we need to "switch" the address limit to
 			 * user space, so that address check can work properly.
 			 */
-			seg = get_fs();
-			set_fs(USER_DS);
+			seg = force_uaccess_begin();
 			switch (insn.spec3_format.func) {
 			case lhe_op:
 				if (!access_ok(addr, 2)) {
-					set_fs(seg);
+					force_uaccess_end(seg);
 					goto sigbus;
 				}
 				LoadHWE(addr, value, res);
 				if (res) {
-					set_fs(seg);
+					force_uaccess_end(seg);
 					goto fault;
 				}
 				compute_return_epc(regs);
@@ -209,12 +208,12 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 				break;
 			case lwe_op:
 				if (!access_ok(addr, 4)) {
-					set_fs(seg);
+					force_uaccess_end(seg);
 					goto sigbus;
 				}
 				LoadWE(addr, value, res);
 				if (res) {
-					set_fs(seg);
+					force_uaccess_end(seg);
 					goto fault;
 				}
 				compute_return_epc(regs);
@@ -222,12 +221,12 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 				break;
 			case lhue_op:
 				if (!access_ok(addr, 2)) {
-					set_fs(seg);
+					force_uaccess_end(seg);
 					goto sigbus;
 				}
 				LoadHWUE(addr, value, res);
 				if (res) {
-					set_fs(seg);
+					force_uaccess_end(seg);
 					goto fault;
 				}
 				compute_return_epc(regs);
@@ -235,35 +234,35 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 				break;
 			case she_op:
 				if (!access_ok(addr, 2)) {
-					set_fs(seg);
+					force_uaccess_end(seg);
 					goto sigbus;
 				}
 				compute_return_epc(regs);
 				value = regs->regs[insn.spec3_format.rt];
 				StoreHWE(addr, value, res);
 				if (res) {
-					set_fs(seg);
+					force_uaccess_end(seg);
 					goto fault;
 				}
 				break;
 			case swe_op:
 				if (!access_ok(addr, 4)) {
-					set_fs(seg);
+					force_uaccess_end(seg);
 					goto sigbus;
 				}
 				compute_return_epc(regs);
 				value = regs->regs[insn.spec3_format.rt];
 				StoreWE(addr, value, res);
 				if (res) {
-					set_fs(seg);
+					force_uaccess_end(seg);
 					goto fault;
 				}
 				break;
 			default:
-				set_fs(seg);
+				force_uaccess_end(seg);
 				goto sigill;
 			}
-			set_fs(seg);
+			force_uaccess_end(seg);
 		}
 #endif
 		break;
diff --git a/arch/nds32/mm/alignment.c b/arch/nds32/mm/alignment.c
index c8b9061a2ee3..1eb7ded6992b 100644
--- a/arch/nds32/mm/alignment.c
+++ b/arch/nds32/mm/alignment.c
@@ -512,7 +512,7 @@ int do_unaligned_access(unsigned long addr, struct pt_regs *regs)
 {
 	unsigned long inst;
 	int ret = -EFAULT;
-	mm_segment_t seg = get_fs();
+	mm_segment_t seg;
 
 	inst = get_inst(regs->ipc);
 
@@ -520,13 +520,12 @@ int do_unaligned_access(unsigned long addr, struct pt_regs *regs)
 	      "Faulting addr: 0x%08lx, pc: 0x%08lx [inst: 0x%08lx ]\n", addr,
 	      regs->ipc, inst);
 
-	set_fs(USER_DS);
-
+	seg = force_uaccess_begin();
 	if (inst & NDS32_16BIT_INSTRUCTION)
 		ret = do_16((inst >> 16) & 0xffff, regs);
 	else
 		ret = do_32(inst, regs);
-	set_fs(seg);
+	force_uaccess_end(seg);
 
 	return ret;
 }
diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c
index 058c6181bb30..b62ad0ba2395 100644
--- a/arch/sh/kernel/traps_32.c
+++ b/arch/sh/kernel/traps_32.c
@@ -482,8 +482,6 @@ asmlinkage void do_address_error(struct pt_regs *regs,
 	error_code = lookup_exception_vector();
 #endif
 
-	oldfs = get_fs();
-
 	if (user_mode(regs)) {
 		int si_code = BUS_ADRERR;
 		unsigned int user_action;
@@ -491,13 +489,13 @@ asmlinkage void do_address_error(struct pt_regs *regs,
 		local_irq_enable();
 		inc_unaligned_user_access();
 
-		set_fs(USER_DS);
+		oldfs = force_uaccess_begin();
 		if (copy_from_user(&instruction, (insn_size_t *)(regs->pc & ~1),
 				   sizeof(instruction))) {
-			set_fs(oldfs);
+			force_uaccess_end(oldfs);
 			goto uspace_segv;
 		}
-		set_fs(oldfs);
+		force_uaccess_end(oldfs);
 
 		/* shout about userspace fixups */
 		unaligned_fixups_notify(current, instruction, regs);
@@ -520,11 +518,11 @@ fixup:
 			goto uspace_segv;
 		}
 
-		set_fs(USER_DS);
+		oldfs = force_uaccess_begin();
 		tmp = handle_unaligned_access(instruction, regs,
 					      &user_mem_access, 0,
 					      address);
-		set_fs(oldfs);
+		force_uaccess_end(oldfs);
 
 		if (tmp == 0)
 			return; /* sorted */
diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c
index e7e36aab2386..b4b9ce97f415 100644
--- a/drivers/firmware/arm_sdei.c
+++ b/drivers/firmware/arm_sdei.c
@@ -1136,15 +1136,14 @@ int sdei_event_handler(struct pt_regs *regs,
 	 * access kernel memory.
 	 * Do the same here because this doesn't come via the same entry code.
 	*/
-	orig_addr_limit = get_fs();
-	set_fs(USER_DS);
+	orig_addr_limit = force_uaccess_begin();
 
 	err = arg->callback(event_num, regs, arg->callback_arg);
 	if (err)
 		pr_err_ratelimited("event %u on CPU %u failed with error: %d\n",
 				   event_num, smp_processor_id(), err);
 
-	set_fs(orig_addr_limit);
+	force_uaccess_end(orig_addr_limit);
 
 	return err;
 }
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 5c62d0c6f15b..94b285411659 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -8,6 +8,24 @@
 
 #include <asm/uaccess.h>
 
+/*
+ * Force the uaccess routines to be wired up for actual userspace access,
+ * overriding any possible set_fs(KERNEL_DS) still lingering around.  Undone
+ * using force_uaccess_end below.
+ */
+static inline mm_segment_t force_uaccess_begin(void)
+{
+	mm_segment_t fs = get_fs();
+
+	set_fs(USER_DS);
+	return fs;
+}
+
+static inline void force_uaccess_end(mm_segment_t oldfs)
+{
+	set_fs(oldfs);
+}
+
 /*
  * Architectures should provide two primitives (raw_copy_{to,from}_user())
  * and get rid of their private instances of copy_{to,from}_user() and
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index c6ce894e4ce9..58cbe357fb2b 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -217,10 +217,9 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 			if (add_mark)
 				perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
 
-			fs = get_fs();
-			set_fs(USER_DS);
+			fs = force_uaccess_begin();
 			perf_callchain_user(&ctx, regs);
-			set_fs(fs);
+			force_uaccess_end(fs);
 		}
 	}
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d1f0a7e5b182..6961333ebad5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6453,10 +6453,9 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
 
 		/* Data. */
 		sp = perf_user_stack_pointer(regs);
-		fs = get_fs();
-		set_fs(USER_DS);
+		fs = force_uaccess_begin();
 		rem = __output_copy_user(handle, (void *) sp, dump_size);
-		set_fs(fs);
+		force_uaccess_end(fs);
 		dyn_size = dump_size - rem;
 
 		perf_output_skip(handle, rem);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b2807e7be772..3edaa380dc7b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1258,8 +1258,7 @@ void kthread_use_mm(struct mm_struct *mm)
 	if (active_mm != mm)
 		mmdrop(active_mm);
 
-	to_kthread(tsk)->oldfs = get_fs();
-	set_fs(USER_DS);
+	to_kthread(tsk)->oldfs = force_uaccess_begin();
 }
 EXPORT_SYMBOL_GPL(kthread_use_mm);
 
@@ -1274,7 +1273,7 @@ void kthread_unuse_mm(struct mm_struct *mm)
 	WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
 	WARN_ON_ONCE(!tsk->mm);
 
-	set_fs(to_kthread(tsk)->oldfs);
+	force_uaccess_end(to_kthread(tsk)->oldfs);
 
 	task_lock(tsk);
 	sync_mm_rss(mm);
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 2af66e449aa6..946f44a9e86a 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -233,10 +233,9 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
 	if (current->flags & PF_KTHREAD)
 		return 0;
 
-	fs = get_fs();
-	set_fs(USER_DS);
+	fs = force_uaccess_begin();
 	arch_stack_walk_user(consume_entry, &c, task_pt_regs(current));
-	set_fs(fs);
+	force_uaccess_end(fs);
 
 	return c.len;
 }
diff --git a/mm/maccess.c b/mm/maccess.c
index f98ff91e32c6..3bd70405f2d8 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -205,15 +205,14 @@ long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
 long copy_from_user_nofault(void *dst, const void __user *src, size_t size)
 {
 	long ret = -EFAULT;
-	mm_segment_t old_fs = get_fs();
+	mm_segment_t old_fs = force_uaccess_begin();
 
-	set_fs(USER_DS);
 	if (access_ok(src, size)) {
 		pagefault_disable();
 		ret = __copy_from_user_inatomic(dst, src, size);
 		pagefault_enable();
 	}
-	set_fs(old_fs);
+	force_uaccess_end(old_fs);
 
 	if (ret)
 		return -EFAULT;
@@ -233,15 +232,14 @@ EXPORT_SYMBOL_GPL(copy_from_user_nofault);
 long copy_to_user_nofault(void __user *dst, const void *src, size_t size)
 {
 	long ret = -EFAULT;
-	mm_segment_t old_fs = get_fs();
+	mm_segment_t old_fs = force_uaccess_begin();
 
-	set_fs(USER_DS);
 	if (access_ok(dst, size)) {
 		pagefault_disable();
 		ret = __copy_to_user_inatomic(dst, src, size);
 		pagefault_enable();
 	}
-	set_fs(old_fs);
+	force_uaccess_end(old_fs);
 
 	if (ret)
 		return -EFAULT;
@@ -270,17 +268,17 @@ EXPORT_SYMBOL_GPL(copy_to_user_nofault);
 long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
 			      long count)
 {
-	mm_segment_t old_fs = get_fs();
+	mm_segment_t old_fs;
 	long ret;
 
 	if (unlikely(count <= 0))
 		return 0;
 
-	set_fs(USER_DS);
+	old_fs = force_uaccess_begin();
 	pagefault_disable();
 	ret = strncpy_from_user(dst, unsafe_addr, count);
 	pagefault_enable();
-	set_fs(old_fs);
+	force_uaccess_end(old_fs);
 
 	if (ret >= count) {
 		ret = count;
@@ -310,14 +308,14 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
  */
 long strnlen_user_nofault(const void __user *unsafe_addr, long count)
 {
-	mm_segment_t old_fs = get_fs();
+	mm_segment_t old_fs;
 	int ret;
 
-	set_fs(USER_DS);
+	old_fs = force_uaccess_begin();
 	pagefault_disable();
 	ret = strnlen_user(unsafe_addr, count);
 	pagefault_enable();
-	set_fs(old_fs);
+	force_uaccess_end(old_fs);
 
 	return ret;
 }
-- 
cgit v1.2.3


From fe81417596fa8b6577fedb7e206ff3e4c7015c13 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 11 Aug 2020 18:33:50 -0700
Subject: exec: use force_uaccess_begin during exec and exit

Both exec and exit want to ensure that the uaccess routines actually do
access user pointers.  Use the newly added force_uaccess_begin helper
instead of an open coded set_fs for that to prepare for kernel builds
where set_fs() does not exist.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nick Hu <nickhu@andestech.com>
Cc: Greentime Hu <green.hu@gmail.com>
Cc: Vincent Chen <deanbo422@gmail.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Link: http://lkml.kernel.org/r/20200710135706.537715-7-hch@lst.de
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c     | 7 ++++++-
 kernel/exit.c | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/fs/exec.c b/fs/exec.c
index 3698252719a3..29ef78ae9f50 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1402,7 +1402,12 @@ int begin_new_exec(struct linux_binprm * bprm)
 	if (retval)
 		goto out_unlock;
 
-	set_fs(USER_DS);
+	/*
+	 * Ensure that the uaccess routines can actually operate on userspace
+	 * pointers:
+	 */
+	force_uaccess_begin();
+
 	me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
 					PF_NOFREEZE | PF_NO_SETAFFINITY);
 	flush_thread();
diff --git a/kernel/exit.c b/kernel/exit.c
index e731c414e024..c2d2961576f2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -732,7 +732,7 @@ void __noreturn do_exit(long code)
 	 * mm_release()->clear_child_tid() from writing to a user-controlled
 	 * kernel address.
 	 */
-	set_fs(USER_DS);
+	force_uaccess_begin();
 
 	if (unlikely(in_atomic())) {
 		pr_info("note: %s[%d] exited with preempt_count %d\n",
-- 
cgit v1.2.3


From 8043fc147a97ec2eefc582487f344f2cbe86d12e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 11 Aug 2020 18:34:10 -0700
Subject: kernel: add a kernel_wait helper

Add a helper that waits for a pid and stores the status in the passed in
kernel pointer.  Use it to fix the usage of kernel_wait4 in
call_usermodehelper_exec_sync that only happens to work due to the
implicit set_fs(KERNEL_DS) for kernel threads.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Link: http://lkml.kernel.org/r/20200721130449.5008-1-hch@lst.de
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched/task.h |  1 +
 kernel/exit.c              | 16 ++++++++++++++++
 kernel/umh.c               | 29 ++++-------------------------
 3 files changed, 21 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index ae3060f0b0c9..a98965007eef 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -88,6 +88,7 @@ struct task_struct *fork_idle(int);
 struct mm_struct *copy_init_mm(void);
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
 extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
+int kernel_wait(pid_t pid, int *stat);
 
 extern void free_task(struct task_struct *tsk);
 
diff --git a/kernel/exit.c b/kernel/exit.c
index c2d2961576f2..733e80f334e7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1626,6 +1626,22 @@ long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
 	return ret;
 }
 
+int kernel_wait(pid_t pid, int *stat)
+{
+	struct wait_opts wo = {
+		.wo_type	= PIDTYPE_PID,
+		.wo_pid		= find_get_pid(pid),
+		.wo_flags	= WEXITED,
+	};
+	int ret;
+
+	ret = do_wait(&wo);
+	if (ret > 0 && wo.wo_stat)
+		*stat = wo.wo_stat;
+	put_pid(wo.wo_pid);
+	return ret;
+}
+
 SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
 		int, options, struct rusage __user *, ru)
 {
diff --git a/kernel/umh.c b/kernel/umh.c
index a25433f9cd9a..fcf3ee803630 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -119,37 +119,16 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
 {
 	pid_t pid;
 
-	/* If SIGCLD is ignored kernel_wait4 won't populate the status. */
+	/* If SIGCLD is ignored do_wait won't populate the status. */
 	kernel_sigaction(SIGCHLD, SIG_DFL);
 	pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
-	if (pid < 0) {
+	if (pid < 0)
 		sub_info->retval = pid;
-	} else {
-		int ret = -ECHILD;
-		/*
-		 * Normally it is bogus to call wait4() from in-kernel because
-		 * wait4() wants to write the exit code to a userspace address.
-		 * But call_usermodehelper_exec_sync() always runs as kernel
-		 * thread (workqueue) and put_user() to a kernel address works
-		 * OK for kernel threads, due to their having an mm_segment_t
-		 * which spans the entire address space.
-		 *
-		 * Thus the __user pointer cast is valid here.
-		 */
-		kernel_wait4(pid, (int __user *)&ret, 0, NULL);
-
-		/*
-		 * If ret is 0, either call_usermodehelper_exec_async failed and
-		 * the real error code is already in sub_info->retval or
-		 * sub_info->retval is 0 anyway, so don't mess with it then.
-		 */
-		if (ret)
-			sub_info->retval = ret;
-	}
+	else
+		kernel_wait(pid, &sub_info->retval);
 
 	/* Restore default kernel sig handler */
 	kernel_sigaction(SIGCHLD, SIG_IGN);
-
 	umh_complete(sub_info);
 }
 
-- 
cgit v1.2.3


From 6f9e148c218641ad876845abdea4b0597468c55e Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Tue, 11 Aug 2020 18:36:12 -0700
Subject: kmod: remove redundant "be an" in the comment

There exists redundant "be an" in the comment, remove it.

Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Luis Chamberlain <mcgrof@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: David Howells <dhowells@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: James Morris <jmorris@namei.org>
Cc: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Cc: J. Bruce Fields <bfields@fieldses.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Lars Ellenberg <lars.ellenberg@linbit.com>
Cc: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Cc: Philipp Reisner <philipp.reisner@linbit.com>
Cc: Roopa Prabhu <roopa@cumulusnetworks.com>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Cc: Sergei Trofimovich <slyfox@gentoo.org>
Cc: Sergey Kvachonok <ravenexp@gmail.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tony Vroon <chainsaw@gentoo.org>
Cc: Christoph Hellwig <hch@infradead.org>
Link: http://lkml.kernel.org/r/20200610154923.27510-3-mcgrof@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kmod.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 37c3c4b97b8e..3cd075ce2a1e 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -36,9 +36,8 @@
  *
  * If you need less than 50 threads would mean we're dealing with systems
  * smaller than 3200 pages. This assumes you are capable of having ~13M memory,
- * and this would only be an be an upper limit, after which the OOM killer
- * would take effect. Systems like these are very unlikely if modules are
- * enabled.
+ * and this would only be an upper limit, after which the OOM killer would take
+ * effect. Systems like these are very unlikely if modules are enabled.
  */
 #define MAX_KMOD_CONCURRENT 50
 static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
-- 
cgit v1.2.3


From 0935288c6e008c0682ab6171fb5605dcc049e7bd Mon Sep 17 00:00:00 2001
From: Vijay Balakrishna <vijayb@linux.microsoft.com>
Date: Tue, 11 Aug 2020 18:36:33 -0700
Subject: kdump: append kernel build-id string to VMCOREINFO

Make kernel GNU build-id available in VMCOREINFO.  Having build-id in
VMCOREINFO facilitates presenting appropriate kernel namelist image with
debug information file to kernel crash dump analysis tools.  Currently
VMCOREINFO lacks uniquely identifiable key for crash analysis automation.

Regarding if this patch is necessary or matching of linux_banner and
OSRELEASE in VMCOREINFO employed by crash(8) meets the need -- IMO,
build-id approach more foolproof, in most instances it is a cryptographic
hash generated using internal code/ELF bits unlike kernel version string
upon which linux_banner is based that is external to the code.  I feel
each is intended for a different purpose.  Also OSRELEASE is not suitable
when two different kernel builds from same version with different features
enabled.

Currently for most linux (and non-linux) systems build-id can be extracted
using standard methods for file types such as user mode crash dumps,
shared libraries, loadable kernel modules etc., This is an exception for
linux kernel dump.  Having build-id in VMCOREINFO brings some uniformity
for automation tools.

Tyler said:

: I think this is a nice improvement over today's linux_banner approach for
: correlating vmlinux to a kernel dump.
:
: The elf notes parsing in this patch lines up with what is described in in
: the "Notes (Nhdr)" section of the elf(5) man page.
:
: BUILD_ID_MAX is sufficient to hold a sha1 build-id, which is the default
: build-id type today in GNU ld(2).  It is also sufficient to hold the
: "fast" build-id, which is the default build-id type today in LLVM lld(2).

Signed-off-by: Vijay Balakrishna <vijayb@linux.microsoft.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Tyler Hicks <tyhicks@linux.microsoft.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Link: http://lkml.kernel.org/r/1591849672-34104-1-git-send-email-vijayb@linux.microsoft.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/crash_core.h |  6 ++++++
 kernel/crash_core.c        | 50 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 525510a9f965..6594dbc34a37 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -38,6 +38,8 @@ phys_addr_t paddr_vmcoreinfo_note(void);
 
 #define VMCOREINFO_OSRELEASE(value) \
 	vmcoreinfo_append_str("OSRELEASE=%s\n", value)
+#define VMCOREINFO_BUILD_ID(value) \
+	vmcoreinfo_append_str("BUILD-ID=%s\n", value)
 #define VMCOREINFO_PAGESIZE(value) \
 	vmcoreinfo_append_str("PAGESIZE=%ld\n", value)
 #define VMCOREINFO_SYMBOL(name) \
@@ -64,6 +66,10 @@ extern unsigned char *vmcoreinfo_data;
 extern size_t vmcoreinfo_size;
 extern u32 *vmcoreinfo_note;
 
+/* raw contents of kernel .notes section */
+extern const void __start_notes __weak;
+extern const void __stop_notes __weak;
+
 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 			  void *data, size_t data_len);
 void final_note(Elf_Word *buf);
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 18175687133a..106e4500fd53 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -11,6 +11,8 @@
 #include <asm/page.h>
 #include <asm/sections.h>
 
+#include <crypto/sha.h>
+
 /* vmcoreinfo stuff */
 unsigned char *vmcoreinfo_data;
 size_t vmcoreinfo_size;
@@ -376,6 +378,53 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void)
 }
 EXPORT_SYMBOL(paddr_vmcoreinfo_note);
 
+#define NOTES_SIZE (&__stop_notes - &__start_notes)
+#define BUILD_ID_MAX SHA1_DIGEST_SIZE
+#define NT_GNU_BUILD_ID 3
+
+struct elf_note_section {
+	struct elf_note	n_hdr;
+	u8 n_data[];
+};
+
+/*
+ * Add build ID from .notes section as generated by the GNU ld(1)
+ * or LLVM lld(1) --build-id option.
+ */
+static void add_build_id_vmcoreinfo(void)
+{
+	char build_id[BUILD_ID_MAX * 2 + 1];
+	int n_remain = NOTES_SIZE;
+
+	while (n_remain >= sizeof(struct elf_note)) {
+		const struct elf_note_section *note_sec =
+			&__start_notes + NOTES_SIZE - n_remain;
+		const u32 n_namesz = note_sec->n_hdr.n_namesz;
+
+		if (note_sec->n_hdr.n_type == NT_GNU_BUILD_ID &&
+		    n_namesz != 0 &&
+		    !strcmp((char *)&note_sec->n_data[0], "GNU")) {
+			if (note_sec->n_hdr.n_descsz <= BUILD_ID_MAX) {
+				const u32 n_descsz = note_sec->n_hdr.n_descsz;
+				const u8 *s = &note_sec->n_data[n_namesz];
+
+				s = PTR_ALIGN(s, 4);
+				bin2hex(build_id, s, n_descsz);
+				build_id[2 * n_descsz] = '\0';
+				VMCOREINFO_BUILD_ID(build_id);
+				return;
+			}
+			pr_warn("Build ID is too large to include in vmcoreinfo: %u > %u\n",
+				note_sec->n_hdr.n_descsz,
+				BUILD_ID_MAX);
+			return;
+		}
+		n_remain -= sizeof(struct elf_note) +
+			ALIGN(note_sec->n_hdr.n_namesz, 4) +
+			ALIGN(note_sec->n_hdr.n_descsz, 4);
+	}
+}
+
 static int __init crash_save_vmcoreinfo_init(void)
 {
 	vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
@@ -394,6 +443,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	}
 
 	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+	add_build_id_vmcoreinfo();
 	VMCOREINFO_PAGESIZE(PAGE_SIZE);
 
 	VMCOREINFO_SYMBOL(init_uts_ns);
-- 
cgit v1.2.3


From 79076e1241bb3bf02d0aac7d39120d8161fe07b1 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Tue, 11 Aug 2020 18:36:46 -0700
Subject: kernel/panic.c: make oops_may_print() return bool

The return value of oops_may_print() is true or false, so change its type
to reflect that.

Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Xuefeng Li <lixuefeng@loongson.cn>
Link: http://lkml.kernel.org/r/1591103358-32087-1-git-send-email-yangtiezhu@loongson.cn
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h | 2 +-
 kernel/panic.c         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 92517ae53e7c..211005163682 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -322,7 +322,7 @@ void nmi_panic(struct pt_regs *regs, const char *msg);
 extern void oops_enter(void);
 extern void oops_exit(void);
 void print_oops_end_marker(void);
-extern int oops_may_print(void);
+extern bool oops_may_print(void);
 void do_exit(long error_code) __noreturn;
 void complete_and_exit(struct completion *, long) __noreturn;
 
diff --git a/kernel/panic.c b/kernel/panic.c
index e2157ca387c8..93ba061d1c79 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -505,7 +505,7 @@ static void do_oops_enter_exit(void)
  * Return true if the calling CPU is allowed to print oops-related info.
  * This is a bit racy..
  */
-int oops_may_print(void)
+bool oops_may_print(void)
 {
 	return pause_on_oops_flag == 0;
 }
-- 
cgit v1.2.3


From 63037f74725ddd8a767ed2ad0369e60a3bf1f2ce Mon Sep 17 00:00:00 2001
From: Yue Hu <huyue2@yulong.com>
Date: Tue, 11 Aug 2020 18:36:53 -0700
Subject: panic: make print_oops_end_marker() static

Since print_oops_end_marker() is not used externally, also remove it in
kernel.h at the same time.

Signed-off-by: Yue Hu <huyue2@yulong.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Kees Cook <keescook@chromium.org>
Link: http://lkml.kernel.org/r/20200724011516.12756-1-zbestahu@gmail.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h | 1 -
 kernel/panic.c         | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 211005163682..500def620d8f 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -321,7 +321,6 @@ void panic(const char *fmt, ...) __noreturn __cold;
 void nmi_panic(struct pt_regs *regs, const char *msg);
 extern void oops_enter(void);
 extern void oops_exit(void);
-void print_oops_end_marker(void);
 extern bool oops_may_print(void);
 void do_exit(long error_code) __noreturn;
 void complete_and_exit(struct completion *, long) __noreturn;
diff --git a/kernel/panic.c b/kernel/panic.c
index 93ba061d1c79..aef8872ba843 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -551,7 +551,7 @@ static int init_oops_id(void)
 }
 late_initcall(init_oops_id);
 
-void print_oops_end_marker(void)
+static void print_oops_end_marker(void)
 {
 	init_oops_id();
 	pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id);
-- 
cgit v1.2.3


From 31a1b9878c0667945b078c92bce907552f57f2b6 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Tue, 11 Aug 2020 18:36:56 -0700
Subject: kcov: unconditionally add -fno-stack-protector to compiler options

Unconditionally add -fno-stack-protector to KCOV's compiler options, as
all supported compilers support the option.  This saves a compiler
invocation to determine if the option is supported.

Because Clang does not support -fno-conserve-stack, and
-fno-stack-protector was wrapped in the same cc-option, we were missing
-fno-stack-protector with Clang. Unconditionally adding this option
fixes this for Clang.

Suggested-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Alexander Potapenko <glider@google.com>
Link: http://lkml.kernel.org/r/20200615184302.7591-1-elver@google.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 5350fd292910..b3da548691c9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -36,7 +36,7 @@ KCOV_INSTRUMENT_stacktrace.o := n
 KCOV_INSTRUMENT_kcov.o := n
 KASAN_SANITIZE_kcov.o := n
 KCSAN_SANITIZE_kcov.o := n
-CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
+CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector
 
 # cond_syscall is currently not LTO compatible
 CFLAGS_sys_ni.o = $(DISABLE_LTO)
-- 
cgit v1.2.3


From fed79d057d0842d2e381632b783af30745dd7938 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Tue, 11 Aug 2020 18:36:59 -0700
Subject: kcov: make some symbols static

Fix sparse build warnings:

kernel/kcov.c:99:1: warning:
 symbol '__pcpu_scope_kcov_percpu_data' was not declared. Should it be static?
kernel/kcov.c:778:6: warning:
 symbol 'kcov_remote_softirq_start' was not declared. Should it be static?
kernel/kcov.c:795:6: warning:
 symbol 'kcov_remote_softirq_stop' was not declared. Should it be static?

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Andrey Konovalov <andreyknvl@google.com>
Link: http://lkml.kernel.org/r/20200702115501.73077-1-weiyongjun1@huawei.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kcov.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kcov.c b/kernel/kcov.c
index 6afae0bcbac4..6b8368be89c8 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -96,7 +96,7 @@ struct kcov_percpu_data {
 	int			saved_sequence;
 };
 
-DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data);
+static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data);
 
 /* Must be called with kcov_remote_lock locked. */
 static struct kcov_remote *kcov_remote_find(u64 handle)
@@ -775,7 +775,7 @@ static inline bool kcov_mode_enabled(unsigned int mode)
 	return (mode & ~KCOV_IN_CTXSW) != KCOV_MODE_DISABLED;
 }
 
-void kcov_remote_softirq_start(struct task_struct *t)
+static void kcov_remote_softirq_start(struct task_struct *t)
 {
 	struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data);
 	unsigned int mode;
@@ -792,7 +792,7 @@ void kcov_remote_softirq_start(struct task_struct *t)
 	}
 }
 
-void kcov_remote_softirq_stop(struct task_struct *t)
+static void kcov_remote_softirq_stop(struct task_struct *t)
 {
 	struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data);
 
-- 
cgit v1.2.3


From 64019a2e467a288a16b65ab55ddcbf58c1b00187 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Tue, 11 Aug 2020 18:39:01 -0700
Subject: mm/gup: remove task_struct pointer for all gup code

After the cleanup of page fault accounting, gup does not need to pass
task_struct around any more.  Remove that parameter in the whole gup
stack.

Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Link: http://lkml.kernel.org/r/20200707225021.200906-26-peterx@redhat.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arc/kernel/process.c                   |   2 +-
 arch/s390/kvm/interrupt.c                   |   2 +-
 arch/s390/kvm/kvm-s390.c                    |   2 +-
 arch/s390/kvm/priv.c                        |   8 +--
 arch/s390/mm/gmap.c                         |   4 +-
 drivers/gpu/drm/i915/gem/i915_gem_userptr.c |   2 +-
 drivers/infiniband/core/umem_odp.c          |   2 +-
 drivers/vfio/vfio_iommu_type1.c             |   4 +-
 fs/exec.c                                   |   2 +-
 include/linux/mm.h                          |   9 ++-
 kernel/events/uprobes.c                     |   6 +-
 kernel/futex.c                              |   2 +-
 mm/gup.c                                    | 101 ++++++++++++----------------
 mm/memory.c                                 |   2 +-
 mm/process_vm_access.c                      |   2 +-
 security/tomoyo/domain.c                    |   2 +-
 virt/kvm/async_pf.c                         |   2 +-
 virt/kvm/kvm_main.c                         |   2 +-
 18 files changed, 69 insertions(+), 87 deletions(-)

(limited to 'kernel')

diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c
index e12c80d71b78..efeba1fe7252 100644
--- a/arch/arc/kernel/process.c
+++ b/arch/arc/kernel/process.c
@@ -91,7 +91,7 @@ fault:
 		 goto fail;
 
 	mmap_read_lock(current->mm);
-	ret = fixup_user_fault(current, current->mm, (unsigned long) uaddr,
+	ret = fixup_user_fault(current->mm, (unsigned long) uaddr,
 			       FAULT_FLAG_WRITE, NULL);
 	mmap_read_unlock(current->mm);
 
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 1608fd99bbee..2f177298c663 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -2768,7 +2768,7 @@ static struct page *get_map_page(struct kvm *kvm, u64 uaddr)
 	struct page *page = NULL;
 
 	mmap_read_lock(kvm->mm);
-	get_user_pages_remote(NULL, kvm->mm, uaddr, 1, FOLL_WRITE,
+	get_user_pages_remote(kvm->mm, uaddr, 1, FOLL_WRITE,
 			      &page, NULL, NULL);
 	mmap_read_unlock(kvm->mm);
 	return page;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 66da278a67fb..6b74b92c1a58 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1892,7 +1892,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
 
 		r = set_guest_storage_key(current->mm, hva, keys[i], 0);
 		if (r) {
-			r = fixup_user_fault(current, current->mm, hva,
+			r = fixup_user_fault(current->mm, hva,
 					     FAULT_FLAG_WRITE, &unlocked);
 			if (r)
 				break;
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 2f721a923b54..cd74989ce0b0 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -273,7 +273,7 @@ retry:
 	rc = get_guest_storage_key(current->mm, vmaddr, &key);
 
 	if (rc) {
-		rc = fixup_user_fault(current, current->mm, vmaddr,
+		rc = fixup_user_fault(current->mm, vmaddr,
 				      FAULT_FLAG_WRITE, &unlocked);
 		if (!rc) {
 			mmap_read_unlock(current->mm);
@@ -319,7 +319,7 @@ retry:
 	mmap_read_lock(current->mm);
 	rc = reset_guest_reference_bit(current->mm, vmaddr);
 	if (rc < 0) {
-		rc = fixup_user_fault(current, current->mm, vmaddr,
+		rc = fixup_user_fault(current->mm, vmaddr,
 				      FAULT_FLAG_WRITE, &unlocked);
 		if (!rc) {
 			mmap_read_unlock(current->mm);
@@ -390,7 +390,7 @@ static int handle_sske(struct kvm_vcpu *vcpu)
 						m3 & SSKE_MC);
 
 		if (rc < 0) {
-			rc = fixup_user_fault(current, current->mm, vmaddr,
+			rc = fixup_user_fault(current->mm, vmaddr,
 					      FAULT_FLAG_WRITE, &unlocked);
 			rc = !rc ? -EAGAIN : rc;
 		}
@@ -1094,7 +1094,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
 			rc = cond_set_guest_storage_key(current->mm, vmaddr,
 							key, NULL, nq, mr, mc);
 			if (rc < 0) {
-				rc = fixup_user_fault(current, current->mm, vmaddr,
+				rc = fixup_user_fault(current->mm, vmaddr,
 						      FAULT_FLAG_WRITE, &unlocked);
 				rc = !rc ? -EAGAIN : rc;
 			}
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 190357ff86b3..8747487c50a8 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -649,7 +649,7 @@ retry:
 		rc = vmaddr;
 		goto out_up;
 	}
-	if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags,
+	if (fixup_user_fault(gmap->mm, vmaddr, fault_flags,
 			     &unlocked)) {
 		rc = -EFAULT;
 		goto out_up;
@@ -879,7 +879,7 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
 
 	BUG_ON(gmap_is_shadow(gmap));
 	fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
-	if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked))
+	if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked))
 		return -EFAULT;
 	if (unlocked)
 		/* lost mmap_lock, caller has to retry __gmap_translate */
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
index e946032b13e4..2c2bf24140c9 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
@@ -469,7 +469,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
 					locked = 1;
 				}
 				ret = pin_user_pages_remote
-					(work->task, mm,
+					(mm,
 					 obj->userptr.ptr + pinned * PAGE_SIZE,
 					 npages - pinned,
 					 flags,
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 5e32f61a2fe4..cc6b4befde7c 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -439,7 +439,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
 		 * complex (and doesn't gain us much performance in most use
 		 * cases).
 		 */
-		npages = get_user_pages_remote(owning_process, owning_mm,
+		npages = get_user_pages_remote(owning_mm,
 				user_virt, gup_num_pages,
 				flags, local_page_list, NULL, NULL);
 		mmap_read_unlock(owning_mm);
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 5e556ac9102a..9d41105bfd01 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -425,7 +425,7 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
 	if (ret) {
 		bool unlocked = false;
 
-		ret = fixup_user_fault(NULL, mm, vaddr,
+		ret = fixup_user_fault(mm, vaddr,
 				       FAULT_FLAG_REMOTE |
 				       (write_fault ?  FAULT_FLAG_WRITE : 0),
 				       &unlocked);
@@ -453,7 +453,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
 		flags |= FOLL_WRITE;
 
 	mmap_read_lock(mm);
-	ret = pin_user_pages_remote(NULL, mm, vaddr, 1, flags | FOLL_LONGTERM,
+	ret = pin_user_pages_remote(mm, vaddr, 1, flags | FOLL_LONGTERM,
 				    page, NULL, NULL);
 	if (ret == 1) {
 		*pfn = page_to_pfn(page[0]);
diff --git a/fs/exec.c b/fs/exec.c
index a57d9785832b..a91003e28eaa 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -217,7 +217,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 	 * We are doing an exec().  'current' is the process
 	 * doing the exec and bprm->mm is the new process's mm.
 	 */
-	ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags,
+	ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
 			&page, NULL, NULL);
 	if (ret <= 0)
 		return NULL;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ec0ffb423769..e7602a3bcef1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1661,7 +1661,7 @@ int invalidate_inode_page(struct page *page);
 extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
 				  unsigned long address, unsigned int flags,
 				  struct pt_regs *regs);
-extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+extern int fixup_user_fault(struct mm_struct *mm,
 			    unsigned long address, unsigned int fault_flags,
 			    bool *unlocked);
 void unmap_mapping_pages(struct address_space *mapping,
@@ -1677,8 +1677,7 @@ static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
 	BUG();
 	return VM_FAULT_SIGBUS;
 }
-static inline int fixup_user_fault(struct task_struct *tsk,
-		struct mm_struct *mm, unsigned long address,
+static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address,
 		unsigned int fault_flags, bool *unlocked)
 {
 	/* should never happen if there's no MMU */
@@ -1704,11 +1703,11 @@ extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
 extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long addr, void *buf, int len, unsigned int gup_flags);
 
-long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+long get_user_pages_remote(struct mm_struct *mm,
 			    unsigned long start, unsigned long nr_pages,
 			    unsigned int gup_flags, struct page **pages,
 			    struct vm_area_struct **vmas, int *locked);
-long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+long pin_user_pages_remote(struct mm_struct *mm,
 			   unsigned long start, unsigned long nr_pages,
 			   unsigned int gup_flags, struct page **pages,
 			   struct vm_area_struct **vmas, int *locked);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 49047d479c57..649fd53dc9ad 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -376,7 +376,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
 	if (!vaddr || !d)
 		return -EINVAL;
 
-	ret = get_user_pages_remote(NULL, mm, vaddr, 1,
+	ret = get_user_pages_remote(mm, vaddr, 1,
 			FOLL_WRITE, &page, &vma, NULL);
 	if (unlikely(ret <= 0)) {
 		/*
@@ -477,7 +477,7 @@ retry:
 	if (is_register)
 		gup_flags |= FOLL_SPLIT_PMD;
 	/* Read the page with vaddr into memory */
-	ret = get_user_pages_remote(NULL, mm, vaddr, 1, gup_flags,
+	ret = get_user_pages_remote(mm, vaddr, 1, gup_flags,
 				    &old_page, &vma, NULL);
 	if (ret <= 0)
 		return ret;
@@ -2029,7 +2029,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
 	 * but we treat this as a 'remote' access since it is
 	 * essentially a kernel access to the memory.
 	 */
-	result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page,
+	result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page,
 			NULL, NULL);
 	if (result < 0)
 		return result;
diff --git a/kernel/futex.c b/kernel/futex.c
index 83404124b77b..61e8153e6c76 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -678,7 +678,7 @@ static int fault_in_user_writeable(u32 __user *uaddr)
 	int ret;
 
 	mmap_read_lock(mm);
-	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
+	ret = fixup_user_fault(mm, (unsigned long)uaddr,
 			       FAULT_FLAG_WRITE, NULL);
 	mmap_read_unlock(mm);
 
diff --git a/mm/gup.c b/mm/gup.c
index d5d44c68fa19..39e58df6925d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -859,7 +859,7 @@ unmap:
  * does not include FOLL_NOWAIT, the mmap_lock may be released.  If it
  * is, *@locked will be set to 0 and -EBUSY returned.
  */
-static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
+static int faultin_page(struct vm_area_struct *vma,
 		unsigned long address, unsigned int *flags, int *locked)
 {
 	unsigned int fault_flags = 0;
@@ -962,7 +962,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
 
 /**
  * __get_user_pages() - pin user pages in memory
- * @tsk:	task_struct of target task
  * @mm:		mm_struct of target mm
  * @start:	starting user address
  * @nr_pages:	number of pages from start to pin
@@ -1021,7 +1020,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
  * instead of __get_user_pages. __get_user_pages should be used only if
  * you need some special @gup_flags.
  */
-static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+static long __get_user_pages(struct mm_struct *mm,
 		unsigned long start, unsigned long nr_pages,
 		unsigned int gup_flags, struct page **pages,
 		struct vm_area_struct **vmas, int *locked)
@@ -1103,8 +1102,7 @@ retry:
 
 		page = follow_page_mask(vma, start, foll_flags, &ctx);
 		if (!page) {
-			ret = faultin_page(tsk, vma, start, &foll_flags,
-					   locked);
+			ret = faultin_page(vma, start, &foll_flags, locked);
 			switch (ret) {
 			case 0:
 				goto retry;
@@ -1178,8 +1176,6 @@ static bool vma_permits_fault(struct vm_area_struct *vma,
 
 /**
  * fixup_user_fault() - manually resolve a user page fault
- * @tsk:	the task_struct to use for page fault accounting, or
- *		NULL if faults are not to be recorded.
  * @mm:		mm_struct of target mm
  * @address:	user address
  * @fault_flags:flags to pass down to handle_mm_fault()
@@ -1207,7 +1203,7 @@ static bool vma_permits_fault(struct vm_area_struct *vma,
  * This function will not return with an unlocked mmap_lock. So it has not the
  * same semantics wrt the @mm->mmap_lock as does filemap_fault().
  */
-int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+int fixup_user_fault(struct mm_struct *mm,
 		     unsigned long address, unsigned int fault_flags,
 		     bool *unlocked)
 {
@@ -1256,8 +1252,7 @@ EXPORT_SYMBOL_GPL(fixup_user_fault);
  * Please note that this function, unlike __get_user_pages will not
  * return 0 for nr_pages > 0 without FOLL_NOWAIT
  */
-static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
-						struct mm_struct *mm,
+static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
 						unsigned long start,
 						unsigned long nr_pages,
 						struct page **pages,
@@ -1290,7 +1285,7 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
 	pages_done = 0;
 	lock_dropped = false;
 	for (;;) {
-		ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
+		ret = __get_user_pages(mm, start, nr_pages, flags, pages,
 				       vmas, locked);
 		if (!locked)
 			/* VM_FAULT_RETRY couldn't trigger, bypass */
@@ -1350,7 +1345,7 @@ retry:
 		}
 
 		*locked = 1;
-		ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
+		ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
 				       pages, NULL, locked);
 		if (!*locked) {
 			/* Continue to retry until we succeeded */
@@ -1437,7 +1432,7 @@ long populate_vma_page_range(struct vm_area_struct *vma,
 	 * We made sure addr is within a VMA, so the following will
 	 * not result in a stack expansion that recurses back here.
 	 */
-	return __get_user_pages(current, mm, start, nr_pages, gup_flags,
+	return __get_user_pages(mm, start, nr_pages, gup_flags,
 				NULL, NULL, locked);
 }
 
@@ -1521,7 +1516,7 @@ struct page *get_dump_page(unsigned long addr)
 	struct vm_area_struct *vma;
 	struct page *page;
 
-	if (__get_user_pages(current, current->mm, addr, 1,
+	if (__get_user_pages(current->mm, addr, 1,
 			     FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
 			     NULL) < 1)
 		return NULL;
@@ -1530,8 +1525,7 @@ struct page *get_dump_page(unsigned long addr)
 }
 #endif /* CONFIG_ELF_CORE */
 #else /* CONFIG_MMU */
-static long __get_user_pages_locked(struct task_struct *tsk,
-		struct mm_struct *mm, unsigned long start,
+static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
 		unsigned long nr_pages, struct page **pages,
 		struct vm_area_struct **vmas, int *locked,
 		unsigned int foll_flags)
@@ -1596,8 +1590,7 @@ static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
 }
 
 #ifdef CONFIG_CMA
-static long check_and_migrate_cma_pages(struct task_struct *tsk,
-					struct mm_struct *mm,
+static long check_and_migrate_cma_pages(struct mm_struct *mm,
 					unsigned long start,
 					unsigned long nr_pages,
 					struct page **pages,
@@ -1675,7 +1668,7 @@ check_again:
 		 * again migrating any new CMA pages which we failed to isolate
 		 * earlier.
 		 */
-		ret = __get_user_pages_locked(tsk, mm, start, nr_pages,
+		ret = __get_user_pages_locked(mm, start, nr_pages,
 						   pages, vmas, NULL,
 						   gup_flags);
 
@@ -1689,8 +1682,7 @@ check_again:
 	return ret;
 }
 #else
-static long check_and_migrate_cma_pages(struct task_struct *tsk,
-					struct mm_struct *mm,
+static long check_and_migrate_cma_pages(struct mm_struct *mm,
 					unsigned long start,
 					unsigned long nr_pages,
 					struct page **pages,
@@ -1705,8 +1697,7 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk,
  * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
  * allows us to process the FOLL_LONGTERM flag.
  */
-static long __gup_longterm_locked(struct task_struct *tsk,
-				  struct mm_struct *mm,
+static long __gup_longterm_locked(struct mm_struct *mm,
 				  unsigned long start,
 				  unsigned long nr_pages,
 				  struct page **pages,
@@ -1731,7 +1722,7 @@ static long __gup_longterm_locked(struct task_struct *tsk,
 		flags = memalloc_nocma_save();
 	}
 
-	rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages,
+	rc = __get_user_pages_locked(mm, start, nr_pages, pages,
 				     vmas_tmp, NULL, gup_flags);
 
 	if (gup_flags & FOLL_LONGTERM) {
@@ -1745,7 +1736,7 @@ static long __gup_longterm_locked(struct task_struct *tsk,
 			goto out;
 		}
 
-		rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages,
+		rc = check_and_migrate_cma_pages(mm, start, rc, pages,
 						 vmas_tmp, gup_flags);
 out:
 		memalloc_nocma_restore(flags);
@@ -1756,22 +1747,20 @@ out:
 	return rc;
 }
 #else /* !CONFIG_FS_DAX && !CONFIG_CMA */
-static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
-						  struct mm_struct *mm,
+static __always_inline long __gup_longterm_locked(struct mm_struct *mm,
 						  unsigned long start,
 						  unsigned long nr_pages,
 						  struct page **pages,
 						  struct vm_area_struct **vmas,
 						  unsigned int flags)
 {
-	return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+	return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
 				       NULL, flags);
 }
 #endif /* CONFIG_FS_DAX || CONFIG_CMA */
 
 #ifdef CONFIG_MMU
-static long __get_user_pages_remote(struct task_struct *tsk,
-				    struct mm_struct *mm,
+static long __get_user_pages_remote(struct mm_struct *mm,
 				    unsigned long start, unsigned long nr_pages,
 				    unsigned int gup_flags, struct page **pages,
 				    struct vm_area_struct **vmas, int *locked)
@@ -1790,20 +1779,18 @@ static long __get_user_pages_remote(struct task_struct *tsk,
 		 * This will check the vmas (even if our vmas arg is NULL)
 		 * and return -ENOTSUPP if DAX isn't allowed in this case:
 		 */
-		return __gup_longterm_locked(tsk, mm, start, nr_pages, pages,
+		return __gup_longterm_locked(mm, start, nr_pages, pages,
 					     vmas, gup_flags | FOLL_TOUCH |
 					     FOLL_REMOTE);
 	}
 
-	return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+	return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
 				       locked,
 				       gup_flags | FOLL_TOUCH | FOLL_REMOTE);
 }
 
 /**
  * get_user_pages_remote() - pin user pages in memory
- * @tsk:	the task_struct to use for page fault accounting, or
- *		NULL if faults are not to be recorded.
  * @mm:		mm_struct of target mm
  * @start:	starting user address
  * @nr_pages:	number of pages from start to pin
@@ -1862,7 +1849,7 @@ static long __get_user_pages_remote(struct task_struct *tsk,
  * should use get_user_pages_remote because it cannot pass
  * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
  */
-long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+long get_user_pages_remote(struct mm_struct *mm,
 		unsigned long start, unsigned long nr_pages,
 		unsigned int gup_flags, struct page **pages,
 		struct vm_area_struct **vmas, int *locked)
@@ -1874,13 +1861,13 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
 	if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
 		return -EINVAL;
 
-	return __get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags,
+	return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
 				       pages, vmas, locked);
 }
 EXPORT_SYMBOL(get_user_pages_remote);
 
 #else /* CONFIG_MMU */
-long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+long get_user_pages_remote(struct mm_struct *mm,
 			   unsigned long start, unsigned long nr_pages,
 			   unsigned int gup_flags, struct page **pages,
 			   struct vm_area_struct **vmas, int *locked)
@@ -1888,8 +1875,7 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
 	return 0;
 }
 
-static long __get_user_pages_remote(struct task_struct *tsk,
-				    struct mm_struct *mm,
+static long __get_user_pages_remote(struct mm_struct *mm,
 				    unsigned long start, unsigned long nr_pages,
 				    unsigned int gup_flags, struct page **pages,
 				    struct vm_area_struct **vmas, int *locked)
@@ -1909,11 +1895,10 @@ static long __get_user_pages_remote(struct task_struct *tsk,
  * @vmas:       array of pointers to vmas corresponding to each page.
  *              Or NULL if the caller does not require them.
  *
- * This is the same as get_user_pages_remote(), just with a
- * less-flexible calling convention where we assume that the task
- * and mm being operated on are the current task's and don't allow
- * passing of a locked parameter.  We also obviously don't pass
- * FOLL_REMOTE in here.
+ * This is the same as get_user_pages_remote(), just with a less-flexible
+ * calling convention where we assume that the mm being operated on belongs to
+ * the current task, and doesn't allow passing of a locked parameter.  We also
+ * obviously don't pass FOLL_REMOTE in here.
  */
 long get_user_pages(unsigned long start, unsigned long nr_pages,
 		unsigned int gup_flags, struct page **pages,
@@ -1926,7 +1911,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
 	if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
 		return -EINVAL;
 
-	return __gup_longterm_locked(current, current->mm, start, nr_pages,
+	return __gup_longterm_locked(current->mm, start, nr_pages,
 				     pages, vmas, gup_flags | FOLL_TOUCH);
 }
 EXPORT_SYMBOL(get_user_pages);
@@ -1936,7 +1921,7 @@ EXPORT_SYMBOL(get_user_pages);
  *
  *      mmap_read_lock(mm);
  *      do_something()
- *      get_user_pages(tsk, mm, ..., pages, NULL);
+ *      get_user_pages(mm, ..., pages, NULL);
  *      mmap_read_unlock(mm);
  *
  *  to:
@@ -1944,7 +1929,7 @@ EXPORT_SYMBOL(get_user_pages);
  *      int locked = 1;
  *      mmap_read_lock(mm);
  *      do_something()
- *      get_user_pages_locked(tsk, mm, ..., pages, &locked);
+ *      get_user_pages_locked(mm, ..., pages, &locked);
  *      if (locked)
  *          mmap_read_unlock(mm);
  *
@@ -1982,7 +1967,7 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
 	if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
 		return -EINVAL;
 
-	return __get_user_pages_locked(current, current->mm, start, nr_pages,
+	return __get_user_pages_locked(current->mm, start, nr_pages,
 				       pages, NULL, locked,
 				       gup_flags | FOLL_TOUCH);
 }
@@ -1992,12 +1977,12 @@ EXPORT_SYMBOL(get_user_pages_locked);
  * get_user_pages_unlocked() is suitable to replace the form:
  *
  *      mmap_read_lock(mm);
- *      get_user_pages(tsk, mm, ..., pages, NULL);
+ *      get_user_pages(mm, ..., pages, NULL);
  *      mmap_read_unlock(mm);
  *
  *  with:
  *
- *      get_user_pages_unlocked(tsk, mm, ..., pages);
+ *      get_user_pages_unlocked(mm, ..., pages);
  *
  * It is functionally equivalent to get_user_pages_fast so
  * get_user_pages_fast should be used instead if specific gup_flags
@@ -2020,7 +2005,7 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
 		return -EINVAL;
 
 	mmap_read_lock(mm);
-	ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
+	ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
 				      &locked, gup_flags | FOLL_TOUCH);
 	if (locked)
 		mmap_read_unlock(mm);
@@ -2665,7 +2650,7 @@ static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
 	 */
 	if (gup_flags & FOLL_LONGTERM) {
 		mmap_read_lock(current->mm);
-		ret = __gup_longterm_locked(current, current->mm,
+		ret = __gup_longterm_locked(current->mm,
 					    start, nr_pages,
 					    pages, NULL, gup_flags);
 		mmap_read_unlock(current->mm);
@@ -2908,10 +2893,8 @@ int pin_user_pages_fast_only(unsigned long start, int nr_pages,
 EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
 
 /**
- * pin_user_pages_remote() - pin pages of a remote process (task != current)
+ * pin_user_pages_remote() - pin pages of a remote process
  *
- * @tsk:	the task_struct to use for page fault accounting, or
- *		NULL if faults are not to be recorded.
  * @mm:		mm_struct of target mm
  * @start:	starting user address
  * @nr_pages:	number of pages from start to pin
@@ -2932,7 +2915,7 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
  * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
  * see Documentation/core-api/pin_user_pages.rst for details.
  */
-long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+long pin_user_pages_remote(struct mm_struct *mm,
 			   unsigned long start, unsigned long nr_pages,
 			   unsigned int gup_flags, struct page **pages,
 			   struct vm_area_struct **vmas, int *locked)
@@ -2942,7 +2925,7 @@ long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
 		return -EINVAL;
 
 	gup_flags |= FOLL_PIN;
-	return __get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags,
+	return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
 				       pages, vmas, locked);
 }
 EXPORT_SYMBOL(pin_user_pages_remote);
@@ -2974,7 +2957,7 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages,
 		return -EINVAL;
 
 	gup_flags |= FOLL_PIN;
-	return __gup_longterm_locked(current, current->mm, start, nr_pages,
+	return __gup_longterm_locked(current->mm, start, nr_pages,
 				     pages, vmas, gup_flags);
 }
 EXPORT_SYMBOL(pin_user_pages);
@@ -3019,7 +3002,7 @@ long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
 		return -EINVAL;
 
 	gup_flags |= FOLL_PIN;
-	return __get_user_pages_locked(current, current->mm, start, nr_pages,
+	return __get_user_pages_locked(current->mm, start, nr_pages,
 				       pages, NULL, locked,
 				       gup_flags | FOLL_TOUCH);
 }
diff --git a/mm/memory.c b/mm/memory.c
index 2b7f0e00f312..228efaca75d3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4742,7 +4742,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
 		void *maddr;
 		struct page *page = NULL;
 
-		ret = get_user_pages_remote(tsk, mm, addr, 1,
+		ret = get_user_pages_remote(mm, addr, 1,
 				gup_flags, &page, &vma, NULL);
 		if (ret <= 0) {
 #ifndef CONFIG_HAVE_IOREMAP_PROT
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index cc85ce81914a..29c052099aff 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -105,7 +105,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
 		 * current/current->mm
 		 */
 		mmap_read_lock(mm);
-		pinned_pages = pin_user_pages_remote(task, mm, pa, pinned_pages,
+		pinned_pages = pin_user_pages_remote(mm, pa, pinned_pages,
 						     flags, process_pages,
 						     NULL, &locked);
 		if (locked)
diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c
index 53b3e1f5f227..dc4ecc0b2038 100644
--- a/security/tomoyo/domain.c
+++ b/security/tomoyo/domain.c
@@ -914,7 +914,7 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos,
 	 * (represented by bprm).  'current' is the process doing
 	 * the execve().
 	 */
-	if (get_user_pages_remote(current, bprm->mm, pos, 1,
+	if (get_user_pages_remote(bprm->mm, pos, 1,
 				FOLL_FORCE, &page, NULL, NULL) <= 0)
 		return false;
 #else
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 390f758d5a27..dd777688d14a 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -61,7 +61,7 @@ static void async_pf_execute(struct work_struct *work)
 	 * access remotely.
 	 */
 	mmap_read_lock(mm);
-	get_user_pages_remote(NULL, mm, addr, 1, FOLL_WRITE, NULL, NULL,
+	get_user_pages_remote(mm, addr, 1, FOLL_WRITE, NULL, NULL,
 			&locked);
 	if (locked)
 		mmap_read_unlock(mm);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2c2c0254c2d8..737666db02de 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1893,7 +1893,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
 		 * not call the fault handler, so do it here.
 		 */
 		bool unlocked = false;
-		r = fixup_user_fault(current, current->mm, addr,
+		r = fixup_user_fault(current->mm, addr,
 				     (write_fault ? FAULT_FLAG_WRITE : 0),
 				     &unlocked);
 		if (unlocked)
-- 
cgit v1.2.3


From f107cee94ba4d2c7357fde59a1d84346c73d4958 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Tue, 11 Aug 2020 11:00:12 -0700
Subject: genirq: Unlock irq descriptor after errors

In irq_set_irqchip_state(), the irq descriptor is not unlocked after an
error is encountered. While that should never happen in practice, a buggy
driver may trigger it. This would result in a lockup, so fix it.

Fixes: 1d0326f352bb ("genirq: Check irq_data_get_irq_chip() return value before use")
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20200811180012.80269-1-linux@roeck-us.net
---
 kernel/irq/manage.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d55ba625d426..52ac5391dcc6 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -2731,8 +2731,10 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
 
 	do {
 		chip = irq_data_get_irq_chip(data);
-		if (WARN_ON_ONCE(!chip))
-			return -ENODEV;
+		if (WARN_ON_ONCE(!chip)) {
+			err = -ENODEV;
+			goto out_unlock;
+		}
 		if (chip->irq_set_irqchip_state)
 			break;
 #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
@@ -2745,6 +2747,7 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
 	if (data)
 		err = chip->irq_set_irqchip_state(data, which, val);
 
+out_unlock:
 	irq_put_desc_busunlock(desc, flags);
 	return err;
 }
-- 
cgit v1.2.3


From ebf0d100df0731901c16632f78d78d35f4123bc4 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 13 Aug 2020 09:01:38 -0600
Subject: task_work: only grab task signal lock when needed

If JOBCTL_TASK_WORK is already set on the targeted task, then we need
not go through {lock,unlock}_task_sighand() to set it again and queue
a signal wakeup. This is safe as we're checking it _after_ adding the
new task_work with cmpxchg().

The ordering is as follows:

task_work_add()				get_signal()
--------------------------------------------------------------
STORE(task->task_works, new_work);	STORE(task->jobctl);
mb();					mb();
LOAD(task->jobctl);			LOAD(task->task_works);

This speeds up TWA_SIGNAL handling quite a bit, which is important now
that io_uring is relying on it for all task_work deliveries.

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jann Horn <jannh@google.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/signal.c    | 16 +++++++++++++++-
 kernel/task_work.c |  8 +++++++-
 2 files changed, 22 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 6f16f7c5d375..42b67d2cea37 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2541,7 +2541,21 @@ bool get_signal(struct ksignal *ksig)
 
 relock:
 	spin_lock_irq(&sighand->siglock);
-	current->jobctl &= ~JOBCTL_TASK_WORK;
+	/*
+	 * Make sure we can safely read ->jobctl() in task_work add. As Oleg
+	 * states:
+	 *
+	 * It pairs with mb (implied by cmpxchg) before READ_ONCE. So we
+	 * roughly have
+	 *
+	 *	task_work_add:				get_signal:
+	 *	STORE(task->task_works, new_work);	STORE(task->jobctl);
+	 *	mb();					mb();
+	 *	LOAD(task->jobctl);			LOAD(task->task_works);
+	 *
+	 * and we can rely on STORE-MB-LOAD [ in task_work_add].
+	 */
+	smp_store_mb(current->jobctl, current->jobctl & ~JOBCTL_TASK_WORK);
 	if (unlikely(current->task_works)) {
 		spin_unlock_irq(&sighand->siglock);
 		task_work_run();
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 5c0848ca1287..613b2d634af8 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -42,7 +42,13 @@ task_work_add(struct task_struct *task, struct callback_head *work, int notify)
 		set_notify_resume(task);
 		break;
 	case TWA_SIGNAL:
-		if (lock_task_sighand(task, &flags)) {
+		/*
+		 * Only grab the sighand lock if we don't already have some
+		 * task_work pending. This pairs with the smp_store_mb()
+		 * in get_signal(), see comment there.
+		 */
+		if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) &&
+		    lock_task_sighand(task, &flags)) {
 			task->jobctl |= JOBCTL_TASK_WORK;
 			signal_wake_up(task, 0);
 			unlock_task_sighand(task, &flags);
-- 
cgit v1.2.3


From 405fa8ac89e7aaa87282df659e525992f2639e76 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Thu, 13 Aug 2020 08:21:17 -0400
Subject: futex: Convert to use the preferred 'fallthrough' macro

Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20200813122117.51173-1-linmiaohe@huawei.com
---
 kernel/futex.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 61e8153e6c76..a5876694a60e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -3744,12 +3744,12 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 	switch (cmd) {
 	case FUTEX_WAIT:
 		val3 = FUTEX_BITSET_MATCH_ANY;
-		/* fall through */
+		fallthrough;
 	case FUTEX_WAIT_BITSET:
 		return futex_wait(uaddr, flags, val, timeout, val3);
 	case FUTEX_WAKE:
 		val3 = FUTEX_BITSET_MATCH_ANY;
-		/* fall through */
+		fallthrough;
 	case FUTEX_WAKE_BITSET:
 		return futex_wake(uaddr, flags, val, val3);
 	case FUTEX_REQUEUE:
-- 
cgit v1.2.3


From cc172ff301d8079e941a6eb31758951a6d764084 Mon Sep 17 00:00:00 2001
From: Libing Zhou <libing.zhou@nokia-sbell.com>
Date: Fri, 14 Aug 2020 11:02:36 +0800
Subject: sched/debug: Fix the alignment of the show-state debug output

Current sysrq(t) output task fields name are not aligned with
actual task fields value, e.g.:

	kernel: sysrq: Show State
	kernel:  task                        PC stack   pid father
	kernel: systemd         S12456     1      0 0x00000000
	kernel: Call Trace:
	kernel: ? __schedule+0x240/0x740

To make it more readable, print fields name together with task fields
value in the same line, with fixed width:

	kernel: sysrq: Show State
	kernel: task:systemd         state:S stack:12920 pid:    1 ppid:     0 flags:0x00000000
	kernel: Call Trace:
	kernel: __schedule+0x282/0x620

Signed-off-by: Libing Zhou <libing.zhou@nokia-sbell.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20200814030236.37835-1-libing.zhou@nokia-sbell.com
---
 kernel/sched/core.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4a0e7b449b88..09fd62568ba9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6387,10 +6387,10 @@ void sched_show_task(struct task_struct *p)
 	if (!try_get_task_stack(p))
 		return;
 
-	printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
+	pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
 
 	if (p->state == TASK_RUNNING)
-		printk(KERN_CONT "  running task    ");
+		pr_cont("  running task    ");
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	free = stack_not_used(p);
 #endif
@@ -6399,8 +6399,8 @@ void sched_show_task(struct task_struct *p)
 	if (pid_alive(p))
 		ppid = task_pid_nr(rcu_dereference(p->real_parent));
 	rcu_read_unlock();
-	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
-		task_pid_nr(p), ppid,
+	pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
+		free, task_pid_nr(p), ppid,
 		(unsigned long)task_thread_info(p)->flags);
 
 	print_worker_info(KERN_INFO, p);
@@ -6435,13 +6435,6 @@ void show_state_filter(unsigned long state_filter)
 {
 	struct task_struct *g, *p;
 
-#if BITS_PER_LONG == 32
-	printk(KERN_INFO
-		"  task                PC stack   pid father\n");
-#else
-	printk(KERN_INFO
-		"  task                        PC stack   pid father\n");
-#endif
 	rcu_read_lock();
 	for_each_process_thread(g, p) {
 		/*
-- 
cgit v1.2.3


From a85ffd59bd362d6c2e456e7f523b091830cd5454 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Wed, 12 Aug 2020 20:17:00 -0700
Subject: dma-debug: fix debug_dma_assert_idle(), use rcu_read_lock()

Since commit 2a9127fcf229 ("mm: rewrite wait_on_page_bit_common()
logic") improved unlock_page(), it has become more noticeable how
cow_user_page() in a kernel with CONFIG_DMA_API_DEBUG=y can create and
suffer from heavy contention on DMA debug's radix_lock in
debug_dma_assert_idle().

It is only doing a lookup: use rcu_read_lock() and rcu_read_unlock()
instead; though that does require the static ents[] to be moved
onstack...

...but, hold on, isn't that radix_tree_gang_lookup() and loop doing
quite the wrong thing: searching CACHELINES_PER_PAGE entries for an
exact match with the first cacheline of the page in question?
radix_tree_gang_lookup() is the right tool for the job, but we need
nothing more than to check the first entry it can find, reporting if
that falls anywhere within the page.

(Is RCU safe here? As safe as using the spinlock was. The entries are
never freed, so don't need to be freed by RCU. They may be reused, and
there is a faint chance of a race, with an offending entry reused while
printing its error info; but the spinlock did not prevent that either,
and I agree that it's not worth worrying about. ]

[ Side noe: this patch is a clear improvement to the status quo, but the
  next patch will be removing this debug function entirely.

  But just in case we decide we want to resurrect the debugging code
  some day, I'm first applying this improvement patch so that it doesn't
  get lost    - Linus ]

Fixes: 3b7a6418c749 ("dma debug: account for cachelines and read-only mappings in overlap tracking")
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/dma/debug.c | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index f7f807fb6ebb..6f4f4b9d2d03 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -565,11 +565,8 @@ static void active_cacheline_remove(struct dma_debug_entry *entry)
  */
 void debug_dma_assert_idle(struct page *page)
 {
-	static struct dma_debug_entry *ents[CACHELINES_PER_PAGE];
-	struct dma_debug_entry *entry = NULL;
-	void **results = (void **) &ents;
-	unsigned int nents, i;
-	unsigned long flags;
+	struct dma_debug_entry *entry;
+	unsigned long pfn;
 	phys_addr_t cln;
 
 	if (dma_debug_disabled())
@@ -578,20 +575,14 @@ void debug_dma_assert_idle(struct page *page)
 	if (!page)
 		return;
 
-	cln = (phys_addr_t) page_to_pfn(page) << CACHELINE_PER_PAGE_SHIFT;
-	spin_lock_irqsave(&radix_lock, flags);
-	nents = radix_tree_gang_lookup(&dma_active_cacheline, results, cln,
-				       CACHELINES_PER_PAGE);
-	for (i = 0; i < nents; i++) {
-		phys_addr_t ent_cln = to_cacheline_number(ents[i]);
+	pfn = page_to_pfn(page);
+	cln = (phys_addr_t) pfn << CACHELINE_PER_PAGE_SHIFT;
 
-		if (ent_cln == cln) {
-			entry = ents[i];
-			break;
-		} else if (ent_cln >= cln + CACHELINES_PER_PAGE)
-			break;
-	}
-	spin_unlock_irqrestore(&radix_lock, flags);
+	rcu_read_lock();
+	if (!radix_tree_gang_lookup(&dma_active_cacheline, (void **) &entry,
+				    cln, 1) || entry->pfn != pfn)
+		entry = NULL;
+	rcu_read_unlock();
 
 	if (!entry)
 		return;
-- 
cgit v1.2.3


From 5848dc5b1b76d83599dcec1b39f188a7cbdca7e2 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 14 Aug 2020 15:22:43 -0700
Subject: dma-debug: remove debug_dma_assert_idle() function

This remoes the code from the COW path to call debug_dma_assert_idle(),
which was added many years ago.

Google shows that it hasn't caught anything in the 6+ years we've had it
apart from a false positive, and Hugh just noticed how it had a very
unfortunate spinlock serialization in the COW path.

He fixed that issue the previous commit (a85ffd59bd36: "dma-debug: fix
debug_dma_assert_idle(), use rcu_read_lock()"), but let's see if anybody
even notices when we remove this function entirely.

NOTE! We keep the dma tracking infrastructure that was added by the
commit that introduced it.  Partly to make it easier to resurrect this
debug code if we ever deside to, and partly because that tracking by pfn
and offset looks quite reasonable.

The problem with this debug code was simply that it was expensive and
didn't seem worth it, not that it was wrong per se.

Acked-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/dma-debug.h |  6 ------
 kernel/dma/Kconfig        |  5 -----
 kernel/dma/debug.c        | 46 +---------------------------------------------
 mm/memory.c               |  2 --
 4 files changed, 1 insertion(+), 58 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h
index 4208f94d93f7..7b3b04ba60f3 100644
--- a/include/linux/dma-debug.h
+++ b/include/linux/dma-debug.h
@@ -67,8 +67,6 @@ extern void debug_dma_sync_sg_for_device(struct device *dev,
 
 extern void debug_dma_dump_mappings(struct device *dev);
 
-extern void debug_dma_assert_idle(struct page *page);
-
 #else /* CONFIG_DMA_API_DEBUG */
 
 static inline void dma_debug_add_bus(struct bus_type *bus)
@@ -157,10 +155,6 @@ static inline void debug_dma_dump_mappings(struct device *dev)
 {
 }
 
-static inline void debug_dma_assert_idle(struct page *page)
-{
-}
-
 #endif /* CONFIG_DMA_API_DEBUG */
 
 #endif /* __DMA_DEBUG_H */
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index f4770fcfa62b..5732b2b3ef17 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -186,11 +186,6 @@ config DMA_API_DEBUG
 	  drivers like double-freeing of DMA mappings or freeing mappings that
 	  were never allocated.
 
-	  This also attempts to catch cases where a page owned by DMA is
-	  accessed by the cpu in a way that could cause data corruption.  For
-	  example, this enables cow_user_page() to check that the source page is
-	  not undergoing DMA.
-
 	  This option causes a performance degradation.  Use only if you want to
 	  debug device drivers and dma interactions.
 
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 6f4f4b9d2d03..8e9f7b301c6d 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -448,9 +448,6 @@ void debug_dma_dump_mappings(struct device *dev)
  * dma_active_cacheline entry to track per event.  dma_map_sg(), on the
  * other hand, consumes a single dma_debug_entry, but inserts 'nents'
  * entries into the tree.
- *
- * At any time debug_dma_assert_idle() can be called to trigger a
- * warning if any cachelines in the given page are in the active set.
  */
 static RADIX_TREE(dma_active_cacheline, GFP_NOWAIT);
 static DEFINE_SPINLOCK(radix_lock);
@@ -497,10 +494,7 @@ static void active_cacheline_inc_overlap(phys_addr_t cln)
 	overlap = active_cacheline_set_overlap(cln, ++overlap);
 
 	/* If we overflowed the overlap counter then we're potentially
-	 * leaking dma-mappings.  Otherwise, if maps and unmaps are
-	 * balanced then this overflow may cause false negatives in
-	 * debug_dma_assert_idle() as the cacheline may be marked idle
-	 * prematurely.
+	 * leaking dma-mappings.
 	 */
 	WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP,
 		  pr_fmt("exceeded %d overlapping mappings of cacheline %pa\n"),
@@ -555,44 +549,6 @@ static void active_cacheline_remove(struct dma_debug_entry *entry)
 	spin_unlock_irqrestore(&radix_lock, flags);
 }
 
-/**
- * debug_dma_assert_idle() - assert that a page is not undergoing dma
- * @page: page to lookup in the dma_active_cacheline tree
- *
- * Place a call to this routine in cases where the cpu touching the page
- * before the dma completes (page is dma_unmapped) will lead to data
- * corruption.
- */
-void debug_dma_assert_idle(struct page *page)
-{
-	struct dma_debug_entry *entry;
-	unsigned long pfn;
-	phys_addr_t cln;
-
-	if (dma_debug_disabled())
-		return;
-
-	if (!page)
-		return;
-
-	pfn = page_to_pfn(page);
-	cln = (phys_addr_t) pfn << CACHELINE_PER_PAGE_SHIFT;
-
-	rcu_read_lock();
-	if (!radix_tree_gang_lookup(&dma_active_cacheline, (void **) &entry,
-				    cln, 1) || entry->pfn != pfn)
-		entry = NULL;
-	rcu_read_unlock();
-
-	if (!entry)
-		return;
-
-	cln = to_cacheline_number(entry);
-	err_printk(entry->dev, entry,
-		   "cpu touching an active dma mapped cacheline [cln=%pa]\n",
-		   &cln);
-}
-
 /*
  * Wrapper function for adding an entry to the hash.
  * This function takes care of locking itself.
diff --git a/mm/memory.c b/mm/memory.c
index 228efaca75d3..d3c3bbd65a7e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2411,8 +2411,6 @@ static inline bool cow_user_page(struct page *dst, struct page *src,
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long addr = vmf->address;
 
-	debug_dma_assert_idle(src);
-
 	if (likely(src)) {
 		copy_user_highpage(dst, src, addr, vma);
 		return true;
-- 
cgit v1.2.3


From 846f9e1fb9b9a2c4eecefa2fdbfb51e975a7d532 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 14 Jul 2020 14:18:54 +0200
Subject: dma-mapping: consolidate the NO_DMA definition in kernel/dma/Kconfig

Have a single definition that architetures can select.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Rich Felker <dalias@libc.org>
---
 arch/m68k/Kconfig         | 4 +---
 arch/m68k/Kconfig.machine | 1 +
 arch/um/Kconfig           | 4 +---
 kernel/dma/Kconfig        | 3 +++
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 6ad6cdac74b3..8e488369a7e5 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -17,6 +17,7 @@ config M68K
 	select HAVE_COPY_THREAD_TLS
 	select GENERIC_IRQ_SHOW
 	select GENERIC_ATOMIC64
+	select NO_DMA if !MMU && !COLDFIRE
 	select HAVE_UID16
 	select VIRT_TO_BUS
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG if RMW_INSNS
@@ -60,9 +61,6 @@ config TIME_LOW_RES
 config NO_IOPORT_MAP
 	def_bool y
 
-config NO_DMA
-	def_bool (MMU && SUN3) || (!MMU && !COLDFIRE)
-
 config ZONE_DMA
 	bool
 	default y
diff --git a/arch/m68k/Kconfig.machine b/arch/m68k/Kconfig.machine
index a82651d58af4..17e8c3a292d7 100644
--- a/arch/m68k/Kconfig.machine
+++ b/arch/m68k/Kconfig.machine
@@ -126,6 +126,7 @@ config SUN3
 	depends on MMU
 	depends on !MMU_MOTOROLA
 	select MMU_SUN3 if MMU
+	select NO_DMA
 	select M68020
 	help
 	  This option enables support for the Sun 3 series of workstations
diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index 9318dc6d1a0c..32c1d1945033 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -15,6 +15,7 @@ config UML
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DEBUG_BUGVERBOSE
 	select HAVE_COPY_THREAD_TLS
+	select NO_DMA
 	select GENERIC_IRQ_SHOW
 	select GENERIC_CPU_DEVICES
 	select GENERIC_CLOCKEVENTS
@@ -168,9 +169,6 @@ config MMAPPER
 	  This driver allows a host file to be used as emulated IO memory inside
 	  UML.
 
-config NO_DMA
-	def_bool y
-
 config PGTABLE_LEVELS
 	int
 	default 3 if 3_LEVEL_PGTABLES
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 1da3f44f2565..57533d07676f 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -1,5 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
+config NO_DMA
+	bool
+
 config HAS_DMA
 	bool
 	depends on !NO_DMA
-- 
cgit v1.2.3


From 88db0aa2421666d2f73486d15b239a4521983d55 Mon Sep 17 00:00:00 2001
From: Xiaoming Ni <nixiaoming@huawei.com>
Date: Fri, 14 Aug 2020 17:31:07 -0700
Subject: all arch: remove system call sys_sysctl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since commit 61a47c1ad3a4dc ("sysctl: Remove the sysctl system call"),
sys_sysctl is actually unavailable: any input can only return an error.

We have been warning about people using the sysctl system call for years
and believe there are no more users.  Even if there are users of this
interface if they have not complained or fixed their code by now they
probably are not going to, so there is no point in warning them any
longer.

So completely remove sys_sysctl on all architectures.

[nixiaoming@huawei.com: s390: fix build error for sys_call_table_emu]
 Link: http://lkml.kernel.org/r/20200618141426.16884-1-nixiaoming@huawei.com

Signed-off-by: Xiaoming Ni <nixiaoming@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Will Deacon <will@kernel.org>		[arm/arm64]
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Bin Meng <bin.meng@windriver.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: chenzefeng <chenzefeng2@huawei.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Christian Brauner <christian@brauner.io>
Cc: Chris Zankel <chris@zankel.net>
Cc: David Howells <dhowells@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Diego Elio Pettenò <flameeyes@flameeyes.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Helge Deller <deller@gmx.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kars de Jong <jongk@linux-m68k.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Krzysztof Kozlowski <krzk@kernel.org>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Marco Elver <elver@google.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Cc: Nick Piggin <npiggin@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Olof Johansson <olof@lixom.net>
Cc: Paul Burton <paulburton@kernel.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Rich Felker <dalias@libc.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Sargun Dhillon <sargun@sargun.me>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Sven Schnelle <svens@stackframe.org>
Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Zhou Yanjie <zhouyanjie@wanyeetech.com>
Link: http://lkml.kernel.org/r/20200616030734.87257-1-nixiaoming@huawei.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/kernel/syscalls/syscall.tbl             |   2 +-
 arch/arm/configs/am200epdkit_defconfig             |   1 -
 arch/arm/tools/syscall.tbl                         |   2 +-
 arch/arm64/include/asm/unistd32.h                  |   4 +-
 arch/ia64/kernel/syscalls/syscall.tbl              |   2 +-
 arch/m68k/kernel/syscalls/syscall.tbl              |   2 +-
 arch/microblaze/kernel/syscalls/syscall.tbl        |   2 +-
 arch/mips/configs/cu1000-neo_defconfig             |   1 -
 arch/mips/kernel/syscalls/syscall_n32.tbl          |   2 +-
 arch/mips/kernel/syscalls/syscall_n64.tbl          |   2 +-
 arch/mips/kernel/syscalls/syscall_o32.tbl          |   2 +-
 arch/parisc/kernel/syscalls/syscall.tbl            |   2 +-
 arch/powerpc/kernel/syscalls/syscall.tbl           |   2 +-
 arch/s390/kernel/syscalls/syscall.tbl              |   2 +-
 arch/sh/configs/dreamcast_defconfig                |   1 -
 arch/sh/configs/espt_defconfig                     |   1 -
 arch/sh/configs/hp6xx_defconfig                    |   1 -
 arch/sh/configs/landisk_defconfig                  |   1 -
 arch/sh/configs/lboxre2_defconfig                  |   1 -
 arch/sh/configs/microdev_defconfig                 |   1 -
 arch/sh/configs/migor_defconfig                    |   1 -
 arch/sh/configs/r7780mp_defconfig                  |   1 -
 arch/sh/configs/r7785rp_defconfig                  |   1 -
 arch/sh/configs/rts7751r2d1_defconfig              |   1 -
 arch/sh/configs/rts7751r2dplus_defconfig           |   1 -
 arch/sh/configs/se7206_defconfig                   |   1 -
 arch/sh/configs/se7343_defconfig                   |   1 -
 arch/sh/configs/se7619_defconfig                   |   1 -
 arch/sh/configs/se7705_defconfig                   |   1 -
 arch/sh/configs/se7750_defconfig                   |   1 -
 arch/sh/configs/se7751_defconfig                   |   1 -
 arch/sh/configs/secureedge5410_defconfig           |   1 -
 arch/sh/configs/sh03_defconfig                     |   1 -
 arch/sh/configs/sh7710voipgw_defconfig             |   1 -
 arch/sh/configs/sh7757lcr_defconfig                |   1 -
 arch/sh/configs/sh7763rdp_defconfig                |   1 -
 arch/sh/configs/shmin_defconfig                    |   1 -
 arch/sh/configs/titan_defconfig                    |   1 -
 arch/sh/kernel/syscalls/syscall.tbl                |   2 +-
 arch/sparc/kernel/syscalls/syscall.tbl             |   2 +-
 arch/x86/entry/syscalls/syscall_32.tbl             |   2 +-
 arch/x86/entry/syscalls/syscall_64.tbl             |   2 +-
 arch/xtensa/kernel/syscalls/syscall.tbl            |   2 +-
 include/linux/compat.h                             |   1 -
 include/linux/syscalls.h                           |   2 -
 include/linux/sysctl.h                             |   6 +-
 kernel/Makefile                                    |   2 +-
 kernel/sys_ni.c                                    |   1 -
 kernel/sysctl_binary.c                             | 171 ---------------------
 tools/perf/arch/powerpc/entry/syscalls/syscall.tbl |   2 +-
 tools/perf/arch/s390/entry/syscalls/syscall.tbl    |   2 +-
 tools/perf/arch/x86/entry/syscalls/syscall_64.tbl  |   2 +-
 52 files changed, 24 insertions(+), 227 deletions(-)
 delete mode 100644 kernel/sysctl_binary.c

(limited to 'kernel')

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index a28fb211881d..ec8bed9e7b75 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -249,7 +249,7 @@
 316	common	mlockall			sys_mlockall
 317	common	munlockall			sys_munlockall
 318	common	sysinfo				sys_sysinfo
-319	common	_sysctl				sys_sysctl
+319	common	_sysctl				sys_ni_syscall
 # 320 was sys_idle
 321	common	oldumount			sys_oldumount
 322	common	swapon				sys_swapon
diff --git a/arch/arm/configs/am200epdkit_defconfig b/arch/arm/configs/am200epdkit_defconfig
index f56ac394caf1..4e49d6cb2f62 100644
--- a/arch/arm/configs/am200epdkit_defconfig
+++ b/arch/arm/configs/am200epdkit_defconfig
@@ -3,7 +3,6 @@ CONFIG_LOCALVERSION="gum"
 CONFIG_SYSVIPC=y
 CONFIG_SYSFS_DEPRECATED_V2=y
 CONFIG_EXPERT=y
-# CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_EPOLL is not set
 # CONFIG_SHMEM is not set
 # CONFIG_VM_EVENT_COUNTERS is not set
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 7e8ee4adf269..171077cbf419 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -162,7 +162,7 @@
 146	common	writev			sys_writev
 147	common	getsid			sys_getsid
 148	common	fdatasync		sys_fdatasync
-149	common	_sysctl			sys_sysctl
+149	common	_sysctl			sys_ni_syscall
 150	common	mlock			sys_mlock
 151	common	munlock			sys_munlock
 152	common	mlockall		sys_mlockall
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 17e81bd9a2d3..734860ac7cf9 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -308,8 +308,8 @@ __SYSCALL(__NR_writev, compat_sys_writev)
 __SYSCALL(__NR_getsid, sys_getsid)
 #define __NR_fdatasync 148
 __SYSCALL(__NR_fdatasync, sys_fdatasync)
-#define __NR__sysctl 149
-__SYSCALL(__NR__sysctl, compat_sys_sysctl)
+			/* 149 was sys_sysctl */
+__SYSCALL(149, sys_ni_syscall)
 #define __NR_mlock 150
 __SYSCALL(__NR_mlock, sys_mlock)
 #define __NR_munlock 151
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index ced9c83e47c9..f52a41f4c340 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -135,7 +135,7 @@
 123	common	writev				sys_writev
 124	common	pread64				sys_pread64
 125	common	pwrite64			sys_pwrite64
-126	common	_sysctl				sys_sysctl
+126	common	_sysctl				sys_ni_syscall
 127	common	mmap				sys_mmap
 128	common	munmap				sys_munmap
 129	common	mlock				sys_mlock
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 1a4822de7292..81fc799d8392 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -156,7 +156,7 @@
 146	common	writev				sys_writev
 147	common	getsid				sys_getsid
 148	common	fdatasync			sys_fdatasync
-149	common	_sysctl				sys_sysctl
+149	common	_sysctl				sys_ni_syscall
 150	common	mlock				sys_mlock
 151	common	munlock				sys_munlock
 152	common	mlockall			sys_mlockall
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index a3f4be8e7238..b4e263916f41 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -156,7 +156,7 @@
 146	common	writev				sys_writev
 147	common	getsid				sys_getsid
 148	common	fdatasync			sys_fdatasync
-149	common	_sysctl				sys_sysctl
+149	common	_sysctl				sys_ni_syscall
 150	common	mlock				sys_mlock
 151	common	munlock				sys_munlock
 152	common	mlockall			sys_mlockall
diff --git a/arch/mips/configs/cu1000-neo_defconfig b/arch/mips/configs/cu1000-neo_defconfig
index 6b471cdb16cf..e924c817f73d 100644
--- a/arch/mips/configs/cu1000-neo_defconfig
+++ b/arch/mips/configs/cu1000-neo_defconfig
@@ -17,7 +17,6 @@ CONFIG_CGROUP_CPUACCT=y
 CONFIG_NAMESPACES=y
 CONFIG_USER_NS=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS_ALL=y
 CONFIG_EMBEDDED=y
 # CONFIG_VM_EVENT_COUNTERS is not set
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 6b4ee92e3aed..f9df9edb67a4 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -159,7 +159,7 @@
 149	n32	munlockall			sys_munlockall
 150	n32	vhangup				sys_vhangup
 151	n32	pivot_root			sys_pivot_root
-152	n32	_sysctl				compat_sys_sysctl
+152	n32	_sysctl				sys_ni_syscall
 153	n32	prctl				sys_prctl
 154	n32	adjtimex			sys_adjtimex_time32
 155	n32	setrlimit			compat_sys_setrlimit
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 391acbf425a0..557f9954a2b9 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -159,7 +159,7 @@
 149	n64	munlockall			sys_munlockall
 150	n64	vhangup				sys_vhangup
 151	n64	pivot_root			sys_pivot_root
-152	n64	_sysctl				sys_sysctl
+152	n64	_sysctl				sys_ni_syscall
 153	n64	prctl				sys_prctl
 154	n64	adjtimex			sys_adjtimex
 155	n64	setrlimit			sys_setrlimit
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 5727c5187508..195b43cf27c8 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -164,7 +164,7 @@
 150	o32	unused150			sys_ni_syscall
 151	o32	getsid				sys_getsid
 152	o32	fdatasync			sys_fdatasync
-153	o32	_sysctl				sys_sysctl			compat_sys_sysctl
+153	o32	_sysctl				sys_ni_syscall
 154	o32	mlock				sys_mlock
 155	o32	munlock				sys_munlock
 156	o32	mlockall			sys_mlockall
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 292baabefade..def64d221cd4 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -163,7 +163,7 @@
 146	common	writev			sys_writev			compat_sys_writev
 147	common	getsid			sys_getsid
 148	common	fdatasync		sys_fdatasync
-149	common	_sysctl			sys_sysctl			compat_sys_sysctl
+149	common	_sysctl			sys_ni_syscall
 150	common	mlock			sys_mlock
 151	common	munlock			sys_munlock
 152	common	mlockall		sys_mlockall
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index be9f74546068..c2d737ff2e7b 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -197,7 +197,7 @@
 146	common	writev				sys_writev			compat_sys_writev
 147	common	getsid				sys_getsid
 148	common	fdatasync			sys_fdatasync
-149	nospu	_sysctl				sys_sysctl			compat_sys_sysctl
+149	nospu	_sysctl				sys_ni_syscall
 150	common	mlock				sys_mlock
 151	common	munlock				sys_munlock
 152	common	mlockall			sys_mlockall
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index f1fda4375526..10456bc936fb 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -138,7 +138,7 @@
 146  common	writev			sys_writev			compat_sys_writev
 147  common	getsid			sys_getsid			sys_getsid
 148  common	fdatasync		sys_fdatasync			sys_fdatasync
-149  common	_sysctl			sys_sysctl			compat_sys_sysctl
+149  common	_sysctl			-				-
 150  common	mlock			sys_mlock			sys_mlock
 151  common	munlock			sys_munlock			sys_munlock
 152  common	mlockall		sys_mlockall			sys_mlockall
diff --git a/arch/sh/configs/dreamcast_defconfig b/arch/sh/configs/dreamcast_defconfig
index ae067e0b15e3..6a82c7b8ff32 100644
--- a/arch/sh/configs/dreamcast_defconfig
+++ b/arch/sh/configs/dreamcast_defconfig
@@ -1,7 +1,6 @@
 CONFIG_SYSVIPC=y
 CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_LOG_BUF_SHIFT=14
-# CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_SLAB=y
 CONFIG_PROFILING=y
 CONFIG_MODULES=y
diff --git a/arch/sh/configs/espt_defconfig b/arch/sh/configs/espt_defconfig
index a5b865a75d22..9a988c347e9d 100644
--- a/arch/sh/configs/espt_defconfig
+++ b/arch/sh/configs/espt_defconfig
@@ -5,7 +5,6 @@ CONFIG_LOG_BUF_SHIFT=14
 CONFIG_NAMESPACES=y
 CONFIG_UTS_NS=y
 CONFIG_IPC_NS=y
-# CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_SLAB=y
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=y
diff --git a/arch/sh/configs/hp6xx_defconfig b/arch/sh/configs/hp6xx_defconfig
index a92db6694ce2..70e6605d7f7e 100644
--- a/arch/sh/configs/hp6xx_defconfig
+++ b/arch/sh/configs/hp6xx_defconfig
@@ -3,7 +3,6 @@ CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=14
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
-# CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_SLAB=y
 # CONFIG_BLK_DEV_BSG is not set
 CONFIG_CPU_SUBTYPE_SH7709=y
diff --git a/arch/sh/configs/landisk_defconfig b/arch/sh/configs/landisk_defconfig
index 567af752b1bb..ba6ec042606f 100644
--- a/arch/sh/configs/landisk_defconfig
+++ b/arch/sh/configs/landisk_defconfig
@@ -1,6 +1,5 @@
 CONFIG_SYSVIPC=y
 CONFIG_LOG_BUF_SHIFT=14
-# CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS_EXTRA_PASS=y
 CONFIG_SLAB=y
 CONFIG_MODULES=y
diff --git a/arch/sh/configs/lboxre2_defconfig b/arch/sh/configs/lboxre2_defconfig
index 10f6d371ce2c..05e4ac6fed5f 100644
--- a/arch/sh/configs/lboxre2_defconfig
+++ b/arch/sh/configs/lboxre2_defconfig
@@ -1,6 +1,5 @@
 CONFIG_SYSVIPC=y
 CONFIG_LOG_BUF_SHIFT=14
-# CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS_EXTRA_PASS=y
 CONFIG_SLAB=y
 CONFIG_MODULES=y
diff --git a/arch/sh/configs/microdev_defconfig b/arch/sh/configs/microdev_defconfig
index ed84d1303acf..c65667d00313 100644
--- a/arch/sh/configs/microdev_defconfig
+++ b/arch/sh/configs/microdev_defconfig
@@ -2,7 +2,6 @@ CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_LOG_BUF_SHIFT=14
 CONFIG_BLK_DEV_INITRD=y
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
-# CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_SLAB=y
 # CONFIG_BLK_DEV_BSG is not set
 CONFIG_CPU_SUBTYPE_SH4_202=y
diff --git a/arch/sh/configs/migor_defconfig b/arch/sh/configs/migor_defconfig
index 37e9521a99e5..a24cf8cd2cea 100644
--- a/arch/sh/configs/migor_defconfig
+++ b/arch/sh/configs/migor_defconfig
@@ -4,7 +4,6 @@ CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=14
 CONFIG_BLK_DEV_INITRD=y
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
-# CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_SLAB=y
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=y
diff --git a/arch/sh/configs/r7780mp_defconfig b/arch/sh/configs/r7780mp_defconfig
index c97ec60cff27..e922659fdadb 100644
--- a/arch/sh/configs/r7780mp_defconfig
+++ b/arch/sh/configs/r7780mp_defconfig
@@ -3,7 +3,6 @@ CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=14
-# CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_FUTEX is not set
 # CONFIG_EPOLL is not set
 CONFIG_SLAB=y
diff --git a/arch/sh/configs/r7785rp_defconfig b/arch/sh/configs/r7785rp_defconfig
index 55fce65eb454..5978866358ec 100644
--- a/arch/sh/configs/r7785rp_defconfig
+++ b/arch/sh/configs/r7785rp_defconfig
@@ -7,7 +7,6 @@ CONFIG_RCU_TRACE=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=14
-# CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_SLAB=y
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=y
diff --git a/arch/sh/configs/rts7751r2d1_defconfig b/arch/sh/configs/rts7751r2d1_defconfig
index 6a3cfe08295f..fc9c22152b08 100644
--- a/arch/sh/configs/rts7751r2d1_defconfig
+++ b/arch/sh/configs/rts7751r2d1_defconfig
@@ -1,7 +1,6 @@
 CONFIG_SYSVIPC=y
 CONFIG_LOG_BUF_SHIFT=14
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
-# CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_SLAB=y
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=y
diff --git a/arch/sh/configs/rts7751r2dplus_defconfig b/arch/sh/configs/rts7751r2dplus_defconfig
index 2b3d7d280672..ff3fd6787fd6 100644
--- a/arch/sh/configs/rts7751r2dplus_defconfig
+++ b/arch/sh/configs/rts7751r2dplus_defconfig
@@ -1,7 +1,6 @@
 CONFIG_SYSVIPC=y
 CONFIG_LOG_BUF_SHIFT=14
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
-# CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_SLAB=y
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=y
diff --git a/arch/sh/configs/se7206_defconfig b/arch/sh/configs/se7206_defconfig
index 21a43f14ffac..ff5bb4489922 100644
--- a/arch/sh/configs/se7206_defconfig
+++ b/arch/sh/configs/se7206_defconfig
@@ -18,7 +18,6 @@ CONFIG_USER_NS=y
 CONFIG_PID_NS=y
 CONFIG_BLK_DEV_INITRD=y
 # CONFIG_UID16 is not set
-# CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_ELF_CORE is not set
 # CONFIG_COMPAT_BRK is not set
diff --git a/arch/sh/configs/se7343_defconfig b/arch/sh/configs/se7343_defconfig
index 4e794e719a28..5d6c19338ebf 100644
--- a/arch/sh/configs/se7343_defconfig
+++ b/arch/sh/configs/se7343_defconfig
@@ -2,7 +2,6 @@
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
 CONFIG_LOG_BUF_SHIFT=14
-# CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_FUTEX is not set
 # CONFIG_EPOLL is not set
 # CONFIG_SHMEM is not set
diff --git a/arch/sh/configs/se7619_defconfig b/arch/sh/configs/se7619_defconfig
index 3264415a5931..71a672c30716 100644
--- a/arch/sh/configs/se7619_defconfig
+++ b/arch/sh/configs/se7619_defconfig
@@ -1,7 +1,6 @@
 # CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_LOG_BUF_SHIFT=14
 # CONFIG_UID16 is not set
-# CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_KALLSYMS is not set
 # CONFIG_HOTPLUG is not set
 # CONFIG_ELF_CORE is not set
diff --git a/arch/sh/configs/se7705_defconfig b/arch/sh/configs/se7705_defconfig
index 4496b94b7d88..ed00a6eeadf5 100644
--- a/arch/sh/configs/se7705_defconfig
+++ b/arch/sh/configs/se7705_defconfig
@@ -2,7 +2,6 @@
 CONFIG_LOG_BUF_SHIFT=14
 CONFIG_BLK_DEV_INITRD=y
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
-# CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_KALLSYMS is not set
 # CONFIG_HOTPLUG is not set
 CONFIG_SLAB=y
diff --git a/arch/sh/configs/se7750_defconfig b/arch/sh/configs/se7750_defconfig
index b23f67542728..3f1c13799d79 100644
--- a/arch/sh/configs/se7750_defconfig
+++ b/arch/sh/configs/se7750_defconfig
@@ -5,7 +5,6 @@ CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=14
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
-# CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_HOTPLUG is not set
 CONFIG_SLAB=y
 CONFIG_MODULES=y
diff --git a/arch/sh/configs/se7751_defconfig b/arch/sh/configs/se7751_defconfig
index 162343683937..4a024065bb75 100644
--- a/arch/sh/configs/se7751_defconfig
+++ b/arch/sh/configs/se7751_defconfig
@@ -3,7 +3,6 @@ CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_LOG_BUF_SHIFT=14
 CONFIG_BLK_DEV_INITRD=y
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
-# CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_HOTPLUG is not set
 CONFIG_SLAB=y
 CONFIG_MODULES=y
diff --git a/arch/sh/configs/secureedge5410_defconfig b/arch/sh/configs/secureedge5410_defconfig
index 360592d63a2f..8422599cfb04 100644
--- a/arch/sh/configs/secureedge5410_defconfig
+++ b/arch/sh/configs/secureedge5410_defconfig
@@ -1,7 +1,6 @@
 # CONFIG_SWAP is not set
 CONFIG_LOG_BUF_SHIFT=14
 CONFIG_BLK_DEV_INITRD=y
-# CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_HOTPLUG is not set
 CONFIG_SLAB=y
 # CONFIG_BLK_DEV_BSG is not set
diff --git a/arch/sh/configs/sh03_defconfig b/arch/sh/configs/sh03_defconfig
index 87db9a84b5ec..f0073ed39947 100644
--- a/arch/sh/configs/sh03_defconfig
+++ b/arch/sh/configs/sh03_defconfig
@@ -3,7 +3,6 @@ CONFIG_POSIX_MQUEUE=y
 CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_LOG_BUF_SHIFT=14
 CONFIG_BLK_DEV_INITRD=y
-# CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_SLAB=y
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=m
diff --git a/arch/sh/configs/sh7710voipgw_defconfig b/arch/sh/configs/sh7710voipgw_defconfig
index 08426913c0e3..0d814770b07f 100644
--- a/arch/sh/configs/sh7710voipgw_defconfig
+++ b/arch/sh/configs/sh7710voipgw_defconfig
@@ -2,7 +2,6 @@
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
 CONFIG_LOG_BUF_SHIFT=14
-# CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_FUTEX is not set
 # CONFIG_EPOLL is not set
 # CONFIG_SHMEM is not set
diff --git a/arch/sh/configs/sh7757lcr_defconfig b/arch/sh/configs/sh7757lcr_defconfig
index d0933a9b9799..a2700ab165af 100644
--- a/arch/sh/configs/sh7757lcr_defconfig
+++ b/arch/sh/configs/sh7757lcr_defconfig
@@ -8,7 +8,6 @@ CONFIG_TASK_XACCT=y
 CONFIG_TASK_IO_ACCOUNTING=y
 CONFIG_LOG_BUF_SHIFT=14
 CONFIG_BLK_DEV_INITRD=y
-# CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS_ALL=y
 CONFIG_SLAB=y
 CONFIG_MODULES=y
diff --git a/arch/sh/configs/sh7763rdp_defconfig b/arch/sh/configs/sh7763rdp_defconfig
index d0a0aa74cecf..26c5fd02c87a 100644
--- a/arch/sh/configs/sh7763rdp_defconfig
+++ b/arch/sh/configs/sh7763rdp_defconfig
@@ -5,7 +5,6 @@ CONFIG_LOG_BUF_SHIFT=14
 CONFIG_NAMESPACES=y
 CONFIG_UTS_NS=y
 CONFIG_IPC_NS=y
-# CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_SLAB=y
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=y
diff --git a/arch/sh/configs/shmin_defconfig b/arch/sh/configs/shmin_defconfig
index a27b129b93c5..c0b6f40d01cc 100644
--- a/arch/sh/configs/shmin_defconfig
+++ b/arch/sh/configs/shmin_defconfig
@@ -1,7 +1,6 @@
 # CONFIG_SWAP is not set
 CONFIG_LOG_BUF_SHIFT=14
 # CONFIG_UID16 is not set
-# CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_KALLSYMS is not set
 # CONFIG_HOTPLUG is not set
 # CONFIG_BUG is not set
diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig
index 4ec961ace688..ba887f1351be 100644
--- a/arch/sh/configs/titan_defconfig
+++ b/arch/sh/configs/titan_defconfig
@@ -6,7 +6,6 @@ CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=16
 CONFIG_BLK_DEV_INITRD=y
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
-# CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_SLAB=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 96848db9659e..ae0a00beea5f 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -156,7 +156,7 @@
 146	common	writev				sys_writev
 147	common	getsid				sys_getsid
 148	common	fdatasync			sys_fdatasync
-149	common	_sysctl				sys_sysctl
+149	common	_sysctl				sys_ni_syscall
 150	common	mlock				sys_mlock
 151	common	munlock				sys_munlock
 152	common	mlockall			sys_mlockall
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 46024e80ee86..4af114e84f20 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -300,7 +300,7 @@
 249	64	nanosleep		sys_nanosleep
 250	32	mremap			sys_mremap
 250	64	mremap			sys_64_mremap
-251	common	_sysctl			sys_sysctl			compat_sys_sysctl
+251	common	_sysctl			sys_ni_syscall
 252	common	getsid			sys_getsid
 253	common	fdatasync		sys_fdatasync
 254	32	nfsservctl		sys_ni_syscall			sys_nis_syscall
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index e31a75262c9c..9d1102873666 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -160,7 +160,7 @@
 146	i386	writev			sys_writev			compat_sys_writev
 147	i386	getsid			sys_getsid
 148	i386	fdatasync		sys_fdatasync
-149	i386	_sysctl			sys_sysctl			compat_sys_sysctl
+149	i386	_sysctl			sys_ni_syscall
 150	i386	mlock			sys_mlock
 151	i386	munlock			sys_munlock
 152	i386	mlockall		sys_mlockall
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 9d82078c949a..f30d6ae9a688 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -164,7 +164,7 @@
 153	common	vhangup			sys_vhangup
 154	common	modify_ldt		sys_modify_ldt
 155	common	pivot_root		sys_pivot_root
-156	64	_sysctl			sys_sysctl
+156	64	_sysctl			sys_ni_syscall
 157	common	prctl			sys_prctl
 158	common	arch_prctl		sys_arch_prctl
 159	common	adjtimex		sys_adjtimex
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index d216ccba42f7..6276e3c2d3fc 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -222,7 +222,7 @@
 204	common	quotactl			sys_quotactl
 # 205 was old nfsservctl
 205	common	nfsservctl			sys_ni_syscall
-206	common	_sysctl				sys_sysctl
+206	common	_sysctl				sys_ni_syscall
 207	common	bdflush				sys_bdflush
 208	common	uname				sys_newuname
 209	common	sysinfo				sys_sysinfo
diff --git a/include/linux/compat.h b/include/linux/compat.h
index c4255d8a4a8a..d38c4d7e83bd 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -851,7 +851,6 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u32);
 asmlinkage long compat_sys_recv(int fd, void __user *buf, compat_size_t len,
 				unsigned flags);
-asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args);
 
 /* obsolete: fs/readdir.c */
 asmlinkage long compat_sys_old_readdir(unsigned int fd,
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index dc2b827c81e5..75ac7f8ae93c 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -47,7 +47,6 @@ struct stat64;
 struct statfs;
 struct statfs64;
 struct statx;
-struct __sysctl_args;
 struct sysinfo;
 struct timespec;
 struct __kernel_old_timeval;
@@ -1117,7 +1116,6 @@ asmlinkage long sys_send(int, void __user *, size_t, unsigned);
 asmlinkage long sys_bdflush(int func, long data);
 asmlinkage long sys_oldumount(char __user *name);
 asmlinkage long sys_uselib(const char __user *library);
-asmlinkage long sys_sysctl(struct __sysctl_args __user *args);
 asmlinkage long sys_sysfs(int option,
 				unsigned long arg1, unsigned long arg2);
 asmlinkage long sys_fork(void);
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 50bb7f383a1b..51298a4f4623 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -74,15 +74,13 @@ int proc_do_static_key(struct ctl_table *table, int write, void *buffer,
  * sysctl names can be mirrored automatically under /proc/sys.  The
  * procname supplied controls /proc naming.
  *
- * The table's mode will be honoured both for sys_sysctl(2) and
- * proc-fs access.
+ * The table's mode will be honoured for proc-fs access.
  *
  * Leaf nodes in the sysctl tree will be represented by a single file
  * under /proc; non-leaf nodes will be represented by directories.  A
  * null procname disables /proc mirroring at this node.
  *
- * sysctl(2) can automatically manage read and write requests through
- * the sysctl table.  The data and maxlen fields of the ctl_table
+ * The data and maxlen fields of the ctl_table
  * struct enable minimal validation of the values being written to be
  * performed, and the mode field allows minimal authentication.
  * 
diff --git a/kernel/Makefile b/kernel/Makefile
index b3da548691c9..9a20016d4900 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -5,7 +5,7 @@
 
 obj-y     = fork.o exec_domain.o panic.o \
 	    cpu.o exit.o softirq.o resource.o \
-	    sysctl.o sysctl_binary.o capability.o ptrace.o user.o \
+	    sysctl.o capability.o ptrace.o user.o \
 	    signal.o sys.o umh.o workqueue.o pid.o task_work.o \
 	    extable.o params.o \
 	    kthread.o sys_ni.o nsproxy.o \
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 3b69a560a7ac..4d59775ea79c 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -364,7 +364,6 @@ COND_SYSCALL(socketcall);
 COND_SYSCALL_COMPAT(socketcall);
 
 /* compat syscalls for arm64, x86, ... */
-COND_SYSCALL_COMPAT(sysctl);
 COND_SYSCALL_COMPAT(fanotify_mark);
 
 /* x86 */
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
deleted file mode 100644
index 7d550cc76a3b..000000000000
--- a/kernel/sysctl_binary.c
+++ /dev/null
@@ -1,171 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/stat.h>
-#include <linux/sysctl.h>
-#include "../fs/xfs/xfs_sysctl.h"
-#include <linux/sunrpc/debug.h>
-#include <linux/string.h>
-#include <linux/syscalls.h>
-#include <linux/namei.h>
-#include <linux/mount.h>
-#include <linux/fs.h>
-#include <linux/nsproxy.h>
-#include <linux/pid_namespace.h>
-#include <linux/file.h>
-#include <linux/ctype.h>
-#include <linux/netdevice.h>
-#include <linux/kernel.h>
-#include <linux/uuid.h>
-#include <linux/slab.h>
-#include <linux/compat.h>
-
-static ssize_t binary_sysctl(const int *name, int nlen,
-	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
-	return -ENOSYS;
-}
-
-static void deprecated_sysctl_warning(const int *name, int nlen)
-{
-	int i;
-
-	/*
-	 * CTL_KERN/KERN_VERSION is used by older glibc and cannot
-	 * ever go away.
-	 */
-	if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION)
-		return;
-
-	if (printk_ratelimit()) {
-		printk(KERN_INFO
-			"warning: process `%s' used the deprecated sysctl "
-			"system call with ", current->comm);
-		for (i = 0; i < nlen; i++)
-			printk(KERN_CONT "%d.", name[i]);
-		printk(KERN_CONT "\n");
-	}
-	return;
-}
-
-#define WARN_ONCE_HASH_BITS 8
-#define WARN_ONCE_HASH_SIZE (1<<WARN_ONCE_HASH_BITS)
-
-static DECLARE_BITMAP(warn_once_bitmap, WARN_ONCE_HASH_SIZE);
-
-#define FNV32_OFFSET 2166136261U
-#define FNV32_PRIME 0x01000193
-
-/*
- * Print each legacy sysctl (approximately) only once.
- * To avoid making the tables non-const use a external
- * hash-table instead.
- * Worst case hash collision: 6, but very rarely.
- * NOTE! We don't use the SMP-safe bit tests. We simply
- * don't care enough.
- */
-static void warn_on_bintable(const int *name, int nlen)
-{
-	int i;
-	u32 hash = FNV32_OFFSET;
-
-	for (i = 0; i < nlen; i++)
-		hash = (hash ^ name[i]) * FNV32_PRIME;
-	hash %= WARN_ONCE_HASH_SIZE;
-	if (__test_and_set_bit(hash, warn_once_bitmap))
-		return;
-	deprecated_sysctl_warning(name, nlen);
-}
-
-static ssize_t do_sysctl(int __user *args_name, int nlen,
-	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
-	int name[CTL_MAXNAME];
-	int i;
-
-	/* Check args->nlen. */
-	if (nlen < 0 || nlen > CTL_MAXNAME)
-		return -ENOTDIR;
-	/* Read in the sysctl name for simplicity */
-	for (i = 0; i < nlen; i++)
-		if (get_user(name[i], args_name + i))
-			return -EFAULT;
-
-	warn_on_bintable(name, nlen);
-
-	return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen);
-}
-
-SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
-{
-	struct __sysctl_args tmp;
-	size_t oldlen = 0;
-	ssize_t result;
-
-	if (copy_from_user(&tmp, args, sizeof(tmp)))
-		return -EFAULT;
-
-	if (tmp.oldval && !tmp.oldlenp)
-		return -EFAULT;
-
-	if (tmp.oldlenp && get_user(oldlen, tmp.oldlenp))
-		return -EFAULT;
-
-	result = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, oldlen,
-			   tmp.newval, tmp.newlen);
-
-	if (result >= 0) {
-		oldlen = result;
-		result = 0;
-	}
-
-	if (tmp.oldlenp && put_user(oldlen, tmp.oldlenp))
-		return -EFAULT;
-
-	return result;
-}
-
-
-#ifdef CONFIG_COMPAT
-
-struct compat_sysctl_args {
-	compat_uptr_t	name;
-	int		nlen;
-	compat_uptr_t	oldval;
-	compat_uptr_t	oldlenp;
-	compat_uptr_t	newval;
-	compat_size_t	newlen;
-	compat_ulong_t	__unused[4];
-};
-
-COMPAT_SYSCALL_DEFINE1(sysctl, struct compat_sysctl_args __user *, args)
-{
-	struct compat_sysctl_args tmp;
-	compat_size_t __user *compat_oldlenp;
-	size_t oldlen = 0;
-	ssize_t result;
-
-	if (copy_from_user(&tmp, args, sizeof(tmp)))
-		return -EFAULT;
-
-	if (tmp.oldval && !tmp.oldlenp)
-		return -EFAULT;
-
-	compat_oldlenp = compat_ptr(tmp.oldlenp);
-	if (compat_oldlenp && get_user(oldlen, compat_oldlenp))
-		return -EFAULT;
-
-	result = do_sysctl(compat_ptr(tmp.name), tmp.nlen,
-			   compat_ptr(tmp.oldval), oldlen,
-			   compat_ptr(tmp.newval), tmp.newlen);
-
-	if (result >= 0) {
-		oldlen = result;
-		result = 0;
-	}
-
-	if (compat_oldlenp && put_user(oldlen, compat_oldlenp))
-		return -EFAULT;
-
-	return result;
-}
-
-#endif /* CONFIG_COMPAT */
diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
index b190f2eb2611..3ca6fe057a0b 100644
--- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
@@ -193,7 +193,7 @@
 146	common	writev				sys_writev			compat_sys_writev
 147	common	getsid				sys_getsid
 148	common	fdatasync			sys_fdatasync
-149	nospu	_sysctl				sys_sysctl			compat_sys_sysctl
+149	nospu	_sysctl				sys_ni_syscall
 150	common	mlock				sys_mlock
 151	common	munlock				sys_munlock
 152	common	mlockall			sys_mlockall
diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
index 56ae24b6e4be..6a0bbea225db 100644
--- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
@@ -138,7 +138,7 @@
 146  common	writev			sys_writev			compat_sys_writev
 147  common	getsid			sys_getsid			sys_getsid
 148  common	fdatasync		sys_fdatasync			sys_fdatasync
-149  common	_sysctl			sys_sysctl			compat_sys_sysctl
+149  common	_sysctl			-				-
 150  common	mlock			sys_mlock			compat_sys_mlock
 151  common	munlock			sys_munlock			compat_sys_munlock
 152  common	mlockall		sys_mlockall			sys_mlockall
diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
index 9d82078c949a..f30d6ae9a688 100644
--- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
@@ -164,7 +164,7 @@
 153	common	vhangup			sys_vhangup
 154	common	modify_ldt		sys_modify_ldt
 155	common	pivot_root		sys_pivot_root
-156	64	_sysctl			sys_sysctl
+156	64	_sysctl			sys_ni_syscall
 157	common	prctl			sys_prctl
 158	common	arch_prctl		sys_arch_prctl
 159	common	adjtimex		sys_adjtimex
-- 
cgit v1.2.3