From e17d43b93e544f5016c0251d2074c15568d5d963 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 12 May 2020 15:19:08 +0300 Subject: perf: Add perf text poke event Record (single instruction) changes to the kernel text (i.e. self-modifying code) in order to support tracers like Intel PT and ARM CoreSight. A copy of the running kernel code is needed as a reference point (e.g. from /proc/kcore). The text poke event records the old bytes and the new bytes so that the event can be processed forwards or backwards. The basic problem is recording the modified instruction in an unambiguous manner given SMP instruction cache (in)coherence. That is, when modifying an instruction concurrently any solution with one or multiple timestamps is not sufficient: CPU0 CPU1 0 1 write insn A 2 execute insn A 3 sync-I$ 4 Due to I$, CPU1 might execute either the old or new A. No matter where we record tracepoints on CPU0, one simply cannot tell what CPU1 will have observed, except that at 0 it must be the old one and at 4 it must be the new one. To solve this, take inspiration from x86 text poking, which has to solve this exact problem due to variable length instruction encoding and I-fetch windows. 1) overwrite the instruction with a breakpoint and sync I$ This guarantees that that code flow will never hit the target instruction anymore, on any CPU (or rather, it will cause an exception). 2) issue the TEXT_POKE event 3) overwrite the breakpoint with the new instruction and sync I$ Now we know that any execution after the TEXT_POKE event will either observe the breakpoint (and hit the exception) or the new instruction. So by guarding the TEXT_POKE event with an exception on either side; we can now tell, without doubt, which instruction another CPU will have observed. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200512121922.8997-2-adrian.hunter@intel.com --- kernel/events/core.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 856d98c36f56..9b8f92500833 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -394,6 +394,7 @@ static atomic_t nr_switch_events __read_mostly; static atomic_t nr_ksymbol_events __read_mostly; static atomic_t nr_bpf_events __read_mostly; static atomic_t nr_cgroup_events __read_mostly; +static atomic_t nr_text_poke_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -4575,7 +4576,7 @@ static bool is_sb_event(struct perf_event *event) if (attr->mmap || attr->mmap_data || attr->mmap2 || attr->comm || attr->comm_exec || attr->task || attr->ksymbol || - attr->context_switch || + attr->context_switch || attr->text_poke || attr->bpf_event) return true; return false; @@ -4651,6 +4652,8 @@ static void unaccount_event(struct perf_event *event) atomic_dec(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_dec(&nr_bpf_events); + if (event->attr.text_poke) + atomic_dec(&nr_text_poke_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) @@ -8628,6 +8631,89 @@ void perf_event_bpf_event(struct bpf_prog *prog, perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); } +struct perf_text_poke_event { + const void *old_bytes; + const void *new_bytes; + size_t pad; + u16 old_len; + u16 new_len; + + struct { + struct perf_event_header header; + + u64 addr; + } event_id; +}; + +static int perf_event_text_poke_match(struct perf_event *event) +{ + return event->attr.text_poke; +} + +static void perf_event_text_poke_output(struct perf_event *event, void *data) +{ + struct perf_text_poke_event *text_poke_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + u64 padding = 0; + int ret; + + if (!perf_event_text_poke_match(event)) + return; + + perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); + + ret = perf_output_begin(&handle, event, text_poke_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, text_poke_event->event_id); + perf_output_put(&handle, text_poke_event->old_len); + perf_output_put(&handle, text_poke_event->new_len); + + __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len); + __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len); + + if (text_poke_event->pad) + __output_copy(&handle, &padding, text_poke_event->pad); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +void perf_event_text_poke(const void *addr, const void *old_bytes, + size_t old_len, const void *new_bytes, size_t new_len) +{ + struct perf_text_poke_event text_poke_event; + size_t tot, pad; + + if (!atomic_read(&nr_text_poke_events)) + return; + + tot = sizeof(text_poke_event.old_len) + old_len; + tot += sizeof(text_poke_event.new_len) + new_len; + pad = ALIGN(tot, sizeof(u64)) - tot; + + text_poke_event = (struct perf_text_poke_event){ + .old_bytes = old_bytes, + .new_bytes = new_bytes, + .pad = pad, + .old_len = old_len, + .new_len = new_len, + .event_id = { + .header = { + .type = PERF_RECORD_TEXT_POKE, + .misc = PERF_RECORD_MISC_KERNEL, + .size = sizeof(text_poke_event.event_id) + tot + pad, + }, + .addr = (unsigned long)addr, + }, + }; + + perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL); +} + void perf_event_itrace_started(struct perf_event *event) { event->attach_state |= PERF_ATTACH_ITRACE; @@ -10945,6 +11031,8 @@ static void account_event(struct perf_event *event) atomic_inc(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_inc(&nr_bpf_events); + if (event->attr.text_poke) + atomic_inc(&nr_text_poke_events); if (inc) { /* -- cgit v1.2.3 From d002b8bc6dbc20e9043e279196cff8795dba05fe Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 28 May 2020 11:00:58 +0300 Subject: kprobes: Add symbols for kprobe insn pages Symbols are needed for tools to describe instruction addresses. Pages allocated for kprobe's purposes need symbols to be created for them. Add such symbols to be visible via /proc/kallsyms. Note: kprobe insn pages are not used if ftrace is configured. To see the effect of this patch, the kernel must be configured with: # CONFIG_FUNCTION_TRACER is not set CONFIG_KPROBES=y and for optimised kprobes: CONFIG_OPTPROBES=y Example on x86: # perf probe __schedule Added new event: probe:__schedule (on __schedule) # cat /proc/kallsyms | grep '\[__builtin__kprobes\]' ffffffffc00d4000 t kprobe_insn_page [__builtin__kprobes] ffffffffc00d6000 t kprobe_optinsn_page [__builtin__kprobes] Note: This patch adds "__builtin__kprobes" as a module name in /proc/kallsyms for symbols for pages allocated for kprobes' purposes, even though "__builtin__kprobes" is not a module. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Masami Hiramatsu Link: https://lkml.kernel.org/r/20200528080058.20230-1-adrian.hunter@intel.com --- include/linux/kprobes.h | 15 +++++++++++++++ kernel/kallsyms.c | 37 +++++++++++++++++++++++++++++++++---- kernel/kprobes.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 594265bfd390..13fc58a74c04 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -242,6 +242,7 @@ struct kprobe_insn_cache { struct mutex mutex; void *(*alloc)(void); /* allocate insn page */ void (*free)(void *); /* free insn page */ + const char *sym; /* symbol for insn pages */ struct list_head pages; /* list of kprobe_insn_page */ size_t insn_size; /* size of instruction slot */ int nr_garbage; @@ -272,6 +273,10 @@ static inline bool is_kprobe_##__name##_slot(unsigned long addr) \ { \ return __is_insn_slot_addr(&kprobe_##__name##_slots, addr); \ } +#define KPROBE_INSN_PAGE_SYM "kprobe_insn_page" +#define KPROBE_OPTINSN_PAGE_SYM "kprobe_optinsn_page" +int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum, + unsigned long *value, char *type, char *sym); #else /* __ARCH_WANT_KPROBES_INSN_SLOT */ #define DEFINE_INSN_CACHE_OPS(__name) \ static inline bool is_kprobe_##__name##_slot(unsigned long addr) \ @@ -373,6 +378,11 @@ void dump_kprobe(struct kprobe *kp); void *alloc_insn_page(void); void free_insn_page(void *page); +int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *sym); + +int arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, + char *type, char *sym); #else /* !CONFIG_KPROBES: */ static inline int kprobes_built_in(void) @@ -435,6 +445,11 @@ static inline bool within_kprobe_blacklist(unsigned long addr) { return true; } +static inline int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *sym) +{ + return -ERANGE; +} #endif /* CONFIG_KPROBES */ static inline int disable_kretprobe(struct kretprobe *rp) { diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 16c8c605f4b0..c6cc293c0e67 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -24,6 +24,7 @@ #include #include #include +#include #include /* @@ -437,6 +438,7 @@ struct kallsym_iter { loff_t pos_arch_end; loff_t pos_mod_end; loff_t pos_ftrace_mod_end; + loff_t pos_bpf_end; unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols. */ char type; @@ -496,11 +498,33 @@ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) static int get_ksymbol_bpf(struct kallsym_iter *iter) { + int ret; + strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN); iter->exported = 0; - return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, - &iter->value, &iter->type, - iter->name) < 0 ? 0 : 1; + ret = bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, + &iter->value, &iter->type, + iter->name); + if (ret < 0) { + iter->pos_bpf_end = iter->pos; + return 0; + } + + return 1; +} + +/* + * This uses "__builtin__kprobes" as a module name for symbols for pages + * allocated for kprobes' purposes, even though "__builtin__kprobes" is not a + * module. + */ +static int get_ksymbol_kprobe(struct kallsym_iter *iter) +{ + strlcpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN); + iter->exported = 0; + return kprobe_get_kallsym(iter->pos - iter->pos_bpf_end, + &iter->value, &iter->type, + iter->name) < 0 ? 0 : 1; } /* Returns space to next name. */ @@ -527,6 +551,7 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) iter->pos_arch_end = 0; iter->pos_mod_end = 0; iter->pos_ftrace_mod_end = 0; + iter->pos_bpf_end = 0; } } @@ -551,7 +576,11 @@ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos) get_ksymbol_ftrace_mod(iter)) return 1; - return get_ksymbol_bpf(iter); + if ((!iter->pos_bpf_end || iter->pos_bpf_end > pos) && + get_ksymbol_bpf(iter)) + return 1; + + return get_ksymbol_kprobe(iter); } /* Returns false if pos at or past end of file. */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 50cd84f53df0..058c0be3464b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -118,6 +118,7 @@ struct kprobe_insn_cache kprobe_insn_slots = { .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex), .alloc = alloc_insn_page, .free = free_insn_page, + .sym = KPROBE_INSN_PAGE_SYM, .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), .insn_size = MAX_INSN_SIZE, .nr_garbage = 0, @@ -290,12 +291,34 @@ bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr) return ret; } +int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum, + unsigned long *value, char *type, char *sym) +{ + struct kprobe_insn_page *kip; + int ret = -ERANGE; + + rcu_read_lock(); + list_for_each_entry_rcu(kip, &c->pages, list) { + if ((*symnum)--) + continue; + strlcpy(sym, c->sym, KSYM_NAME_LEN); + *type = 't'; + *value = (unsigned long)kip->insns; + ret = 0; + break; + } + rcu_read_unlock(); + + return ret; +} + #ifdef CONFIG_OPTPROBES /* For optimized_kprobe buffer */ struct kprobe_insn_cache kprobe_optinsn_slots = { .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex), .alloc = alloc_insn_page, .free = free_insn_page, + .sym = KPROBE_OPTINSN_PAGE_SYM, .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), /* .insn_size is initialized later */ .nr_garbage = 0, @@ -2197,6 +2220,28 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry) kprobe_remove_area_blacklist(entry, entry + 1); } +int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, + char *type, char *sym) +{ + return -ERANGE; +} + +int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *sym) +{ +#ifdef __ARCH_WANT_KPROBES_INSN_SLOT + if (!kprobe_cache_get_kallsym(&kprobe_insn_slots, &symnum, value, type, sym)) + return 0; +#ifdef CONFIG_OPTPROBES + if (!kprobe_cache_get_kallsym(&kprobe_optinsn_slots, &symnum, value, type, sym)) + return 0; +#endif +#endif + if (!arch_kprobe_get_kallsym(&symnum, value, type, sym)) + return 0; + return -ERANGE; +} + int __init __weak arch_populate_kprobe_blacklist(void) { return 0; -- cgit v1.2.3 From 69e49088692899d25dedfa22f00dfb9761e86ed7 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 12 May 2020 15:19:11 +0300 Subject: kprobes: Add perf ksymbol events for kprobe insn pages Symbols are needed for tools to describe instruction addresses. Pages allocated for kprobe's purposes need symbols to be created for them. Add such symbols to be visible via perf ksymbol events. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Acked-by: Masami Hiramatsu Link: https://lkml.kernel.org/r/20200512121922.8997-5-adrian.hunter@intel.com --- include/uapi/linux/perf_event.h | 5 +++++ kernel/kprobes.c | 12 ++++++++++++ 2 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index e5bee6c17b86..e1a4179144a1 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1049,6 +1049,11 @@ enum perf_event_type { enum perf_record_ksymbol_type { PERF_RECORD_KSYMBOL_TYPE_UNKNOWN = 0, PERF_RECORD_KSYMBOL_TYPE_BPF = 1, + /* + * Out of line code such as kprobe-replaced instructions or optimized + * kprobes. + */ + PERF_RECORD_KSYMBOL_TYPE_OOL = 2, PERF_RECORD_KSYMBOL_TYPE_MAX /* non-ABI */ }; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 058c0be3464b..2b58740ca0f3 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -184,6 +185,10 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) kip->cache = c; list_add_rcu(&kip->list, &c->pages); slot = kip->insns; + + /* Record the perf ksymbol register event after adding the page */ + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns, + PAGE_SIZE, false, c->sym); out: mutex_unlock(&c->mutex); return slot; @@ -202,6 +207,13 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx) * next time somebody inserts a probe. */ if (!list_is_singular(&kip->list)) { + /* + * Record perf ksymbol unregister event before removing + * the page. + */ + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + (unsigned long)kip->insns, PAGE_SIZE, true, + kip->cache->sym); list_del_rcu(&kip->list); synchronize_rcu(); kip->cache->free(kip->insns); -- cgit v1.2.3 From fc0ea795f53c8d7040fa42471f74fe51d78d0834 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 12 May 2020 15:19:13 +0300 Subject: ftrace: Add symbols for ftrace trampolines Symbols are needed for tools to describe instruction addresses. Pages allocated for ftrace's purposes need symbols to be created for them. Add such symbols to be visible via /proc/kallsyms. Example on x86 with CONFIG_DYNAMIC_FTRACE=y # echo function > /sys/kernel/debug/tracing/current_tracer # cat /proc/kallsyms | grep '\[__builtin__ftrace\]' ffffffffc0238000 t ftrace_trampoline [__builtin__ftrace] Note: This patch adds "__builtin__ftrace" as a module name in /proc/kallsyms for symbols for pages allocated for ftrace's purposes, even though "__builtin__ftrace" is not a module. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200512121922.8997-7-adrian.hunter@intel.com --- include/linux/ftrace.h | 12 +++++--- kernel/kallsyms.c | 5 ++++ kernel/trace/ftrace.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 88 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e339dac91ee6..ce2c06f72e86 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -58,9 +58,6 @@ struct ftrace_direct_func; const char * ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym); -int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, - char *type, char *name, - char *module_name, int *exported); #else static inline const char * ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, @@ -68,6 +65,13 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, { return NULL; } +#endif + +#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) +int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, + char *module_name, int *exported); +#else static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *name, char *module_name, int *exported) @@ -76,7 +80,6 @@ static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *val } #endif - #ifdef CONFIG_FUNCTION_TRACER extern int ftrace_enabled; @@ -207,6 +210,7 @@ struct ftrace_ops { struct ftrace_ops_hash old_hash; unsigned long trampoline; unsigned long trampoline_size; + struct list_head list; #endif }; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index c6cc293c0e67..834bfdc43235 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -482,6 +482,11 @@ static int get_ksymbol_mod(struct kallsym_iter *iter) return 1; } +/* + * ftrace_mod_get_kallsym() may also get symbols for pages allocated for ftrace + * purposes. In that case "__builtin__ftrace" is used as a module name, even + * though "__builtin__ftrace" is not a module. + */ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) { int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c163c3531faf..31675b209db2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2764,6 +2764,38 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops) { } +/* List of trace_ops that have allocated trampolines */ +static LIST_HEAD(ftrace_ops_trampoline_list); + +static void ftrace_add_trampoline_to_kallsyms(struct ftrace_ops *ops) +{ + lockdep_assert_held(&ftrace_lock); + list_add_rcu(&ops->list, &ftrace_ops_trampoline_list); +} + +static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops) +{ + lockdep_assert_held(&ftrace_lock); + list_del_rcu(&ops->list); +} + +/* + * "__builtin__ftrace" is used as a module name in /proc/kallsyms for symbols + * for pages allocated for ftrace purposes, even though "__builtin__ftrace" is + * not a module. + */ +#define FTRACE_TRAMPOLINE_MOD "__builtin__ftrace" +#define FTRACE_TRAMPOLINE_SYM "ftrace_trampoline" + +static void ftrace_trampoline_free(struct ftrace_ops *ops) +{ + if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) && + ops->trampoline) + ftrace_remove_trampoline_from_kallsyms(ops); + + arch_ftrace_trampoline_free(ops); +} + static void ftrace_startup_enable(int command) { if (saved_ftrace_func != ftrace_trace_function) { @@ -2934,7 +2966,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) synchronize_rcu_tasks(); free_ops: - arch_ftrace_trampoline_free(ops); + ftrace_trampoline_free(ops); } return 0; @@ -6178,6 +6210,27 @@ struct ftrace_mod_map { unsigned int num_funcs; }; +static int ftrace_get_trampoline_kallsym(unsigned int symnum, + unsigned long *value, char *type, + char *name, char *module_name, + int *exported) +{ + struct ftrace_ops *op; + + list_for_each_entry_rcu(op, &ftrace_ops_trampoline_list, list) { + if (!op->trampoline || symnum--) + continue; + *value = op->trampoline; + *type = 't'; + strlcpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN); + strlcpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN); + *exported = 0; + return 0; + } + + return -ERANGE; +} + #ifdef CONFIG_MODULES #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) @@ -6514,6 +6567,7 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, { struct ftrace_mod_map *mod_map; struct ftrace_mod_func *mod_func; + int ret; preempt_disable(); list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) { @@ -6540,8 +6594,10 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, WARN_ON(1); break; } + ret = ftrace_get_trampoline_kallsym(symnum, value, type, name, + module_name, exported); preempt_enable(); - return -ERANGE; + return ret; } #else @@ -6553,6 +6609,18 @@ allocate_ftrace_mod_map(struct module *mod, { return NULL; } +int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, char *module_name, + int *exported) +{ + int ret; + + preempt_disable(); + ret = ftrace_get_trampoline_kallsym(symnum, value, type, name, + module_name, exported); + preempt_enable(); + return ret; +} #endif /* CONFIG_MODULES */ struct ftrace_init_func { @@ -6733,7 +6801,12 @@ void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops) static void ftrace_update_trampoline(struct ftrace_ops *ops) { + unsigned long trampoline = ops->trampoline; + arch_ftrace_update_trampoline(ops); + if (ops->trampoline && ops->trampoline != trampoline && + (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + ftrace_add_trampoline_to_kallsyms(ops); } void ftrace_init_trace_array(struct trace_array *tr) -- cgit v1.2.3 From dd9ddf466ad7a5d2e247925d81ebb0b878bf3b76 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 12 May 2020 15:19:14 +0300 Subject: ftrace: Add perf ksymbol events for ftrace trampolines Symbols are needed for tools to describe instruction addresses. Pages allocated for ftrace's purposes need symbols to be created for them. Add such symbols to be visible via perf ksymbol events. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200512121922.8997-8-adrian.hunter@intel.com --- include/uapi/linux/perf_event.h | 2 +- kernel/trace/ftrace.c | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index e1a4179144a1..52ca2093831c 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1051,7 +1051,7 @@ enum perf_record_ksymbol_type { PERF_RECORD_KSYMBOL_TYPE_BPF = 1, /* * Out of line code such as kprobe-replaced instructions or optimized - * kprobes. + * kprobes or ftrace trampolines. */ PERF_RECORD_KSYMBOL_TYPE_OOL = 2, PERF_RECORD_KSYMBOL_TYPE_MAX /* non-ABI */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 31675b209db2..2baaf7716537 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2790,8 +2790,13 @@ static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops) static void ftrace_trampoline_free(struct ftrace_ops *ops) { if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) && - ops->trampoline) + ops->trampoline) { + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + ops->trampoline, ops->trampoline_size, + true, FTRACE_TRAMPOLINE_SYM); + /* Remove from kallsyms after the perf events */ ftrace_remove_trampoline_from_kallsyms(ops); + } arch_ftrace_trampoline_free(ops); } @@ -6805,8 +6810,13 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) arch_ftrace_update_trampoline(ops); if (ops->trampoline && ops->trampoline != trampoline && - (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) { + /* Add to kallsyms before the perf events */ ftrace_add_trampoline_to_kallsyms(ops); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + ops->trampoline, ops->trampoline_size, false, + FTRACE_TRAMPOLINE_SYM); + } } void ftrace_init_trace_array(struct trace_array *tr) -- cgit v1.2.3 From 548e1f6c76e1eb80ba29edd4286b9b9f2c37f5bf Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 12 May 2020 15:19:15 +0300 Subject: ftrace: Add perf text poke events for ftrace trampolines Add perf text poke events for ftrace trampolines when created and when freed. There can be 3 text_poke events for ftrace trampolines: 1. NULL -> trampoline By ftrace_update_trampoline() when !ops->trampoline Trampoline created 2. [e.g. on x86] CALL rel32 -> CALL rel32 By arch_ftrace_update_trampoline() when ops->trampoline and ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP [e.g. on x86] via text_poke_bp() which generates text poke events Trampoline-called function target updated 3. trampoline -> NULL By ftrace_trampoline_free() when ops->trampoline and ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP Trampoline freed Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200512121922.8997-9-adrian.hunter@intel.com --- kernel/trace/ftrace.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2baaf7716537..d6bba734ab72 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2791,6 +2791,13 @@ static void ftrace_trampoline_free(struct ftrace_ops *ops) { if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) && ops->trampoline) { + /* + * Record the text poke event before the ksymbol unregister + * event. + */ + perf_event_text_poke((void *)ops->trampoline, + (void *)ops->trampoline, + ops->trampoline_size, NULL, 0); perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, ops->trampoline, ops->trampoline_size, true, FTRACE_TRAMPOLINE_SYM); @@ -6816,6 +6823,13 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, ops->trampoline, ops->trampoline_size, false, FTRACE_TRAMPOLINE_SYM); + /* + * Record the perf text poke event after the ksymbol register + * event. + */ + perf_event_text_poke((void *)ops->trampoline, NULL, 0, + (void *)ops->trampoline, + ops->trampoline_size); } } -- cgit v1.2.3 From 3dc167ba5729ddd2d8e3fa1841653792c295d3f1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 19 May 2020 19:25:06 +0200 Subject: sched/cputime: Improve cputime_adjust() People report that utime and stime from /proc//stat become very wrong when the numbers are big enough, especially if you watch these counters incrementally. Specifically, the current implementation of: stime*rtime/total, results in a saw-tooth function on top of the desired line, where the teeth grow in size the larger the values become. IOW, it has a relative error. The result is that, when watching incrementally as time progresses (for large values), we'll see periods of pure stime or utime increase, irrespective of the actual ratio we're striving for. Replace scale_stime() with a math64.h helper: mul_u64_u64_div_u64() that is far more accurate. This also allows architectures to override the implementation -- for instance they can opt for the old algorithm if this new one turns out to be too expensive for them. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200519172506.GA317395@hirez.programming.kicks-ass.net --- arch/x86/include/asm/div64.h | 14 ++++++++++++-- include/linux/math64.h | 2 ++ kernel/sched/cputime.c | 46 +------------------------------------------- lib/math/div64.c | 41 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h index 9b8cb50768c2..b8f1dc0761e4 100644 --- a/arch/x86/include/asm/div64.h +++ b/arch/x86/include/asm/div64.h @@ -74,16 +74,26 @@ static inline u64 mul_u32_u32(u32 a, u32 b) #else # include -static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div) +/* + * Will generate an #DE when the result doesn't fit u64, could fix with an + * __ex_table[] entry when it becomes an issue. + */ +static inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div) { u64 q; asm ("mulq %2; divq %3" : "=a" (q) - : "a" (a), "rm" ((u64)mul), "rm" ((u64)div) + : "a" (a), "rm" (mul), "rm" (div) : "rdx"); return q; } +#define mul_u64_u64_div_u64 mul_u64_u64_div_u64 + +static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div) +{ + return mul_u64_u64_div_u64(a, mul, div); +} #define mul_u64_u32_div mul_u64_u32_div #endif /* CONFIG_X86_32 */ diff --git a/include/linux/math64.h b/include/linux/math64.h index 11a267413e8e..d097119419e6 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -263,6 +263,8 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor) } #endif /* mul_u64_u32_div */ +u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div); + #define DIV64_U64_ROUND_UP(ll, d) \ ({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); }) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ff9435dee1df..5a55d2300452 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -519,50 +519,6 @@ void account_idle_ticks(unsigned long ticks) account_idle_time(cputime); } -/* - * Perform (stime * rtime) / total, but avoid multiplication overflow by - * losing precision when the numbers are big. - */ -static u64 scale_stime(u64 stime, u64 rtime, u64 total) -{ - u64 scaled; - - for (;;) { - /* Make sure "rtime" is the bigger of stime/rtime */ - if (stime > rtime) - swap(rtime, stime); - - /* Make sure 'total' fits in 32 bits */ - if (total >> 32) - goto drop_precision; - - /* Does rtime (and thus stime) fit in 32 bits? */ - if (!(rtime >> 32)) - break; - - /* Can we just balance rtime/stime rather than dropping bits? */ - if (stime >> 31) - goto drop_precision; - - /* We can grow stime and shrink rtime and try to make them both fit */ - stime <<= 1; - rtime >>= 1; - continue; - -drop_precision: - /* We drop from rtime, it has more bits than stime */ - rtime >>= 1; - total >>= 1; - } - - /* - * Make sure gcc understands that this is a 32x32->64 multiply, - * followed by a 64/32->64 divide. - */ - scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); - return scaled; -} - /* * Adjust tick based cputime random precision against scheduler runtime * accounting. @@ -622,7 +578,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, goto update; } - stime = scale_stime(stime, rtime, stime + utime); + stime = mul_u64_u64_div_u64(stime, rtime, stime + utime); update: /* diff --git a/lib/math/div64.c b/lib/math/div64.c index 368ca7fd0d82..3952a07130d8 100644 --- a/lib/math/div64.c +++ b/lib/math/div64.c @@ -190,3 +190,44 @@ u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder) return __iter_div_u64_rem(dividend, divisor, remainder); } EXPORT_SYMBOL(iter_div_u64_rem); + +#ifndef mul_u64_u64_div_u64 +u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c) +{ + u64 res = 0, div, rem; + int shift; + + /* can a * b overflow ? */ + if (ilog2(a) + ilog2(b) > 62) { + /* + * (b * a) / c is equal to + * + * (b / c) * a + + * (b % c) * a / c + * + * if nothing overflows. Can the 1st multiplication + * overflow? Yes, but we do not care: this can only + * happen if the end result can't fit in u64 anyway. + * + * So the code below does + * + * res = (b / c) * a; + * b = b % c; + */ + div = div64_u64_rem(b, c, &rem); + res = div * a; + b = rem; + + shift = ilog2(a) + ilog2(b) - 62; + if (shift > 0) { + /* drop precision */ + b >>= shift; + c >>= shift; + if (!c) + return res; + } + } + + return res + div64_u64(a * b, c); +} +#endif -- cgit v1.2.3 From 844eb6458facb09d4871a480d8bda06550927a80 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 3 Jun 2020 10:03:01 +0200 Subject: sched/pelt: Remove redundant cap_scale() definition Besides in PELT cap_scale() is used in the Deadline scheduler class for scale-invariant bandwidth enforcement. Remove the cap_scale() definition in kernel/sched/pelt.c and keep the one in kernel/sched/sched.h. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20200603080304.16548-2-dietmar.eggemann@arm.com --- kernel/sched/pelt.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index b4b1ff96642f..dea5567e4f72 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -83,8 +83,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) return c1 + c2 + c3; } -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) - /* * Accumulate the three separate parts of the sum; d1 the remainder * of the last (incomplete) period, d2 the span of full periods and d3 -- cgit v1.2.3 From 0900acf2d8273f79432a4ded122ad5a265e85783 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 3 Jun 2020 10:03:02 +0200 Subject: sched/core: Remove redundant 'preempt' param from sched_class->yield_to_task() Commit 6d1cafd8b56e ("sched: Resched proper CPU on yield_to()") moved the code to resched the CPU from yield_to_task_fair() to yield_to() making the preempt parameter in sched_class->yield_to_task() unnecessary. Remove it. No other sched_class implements yield_to_task(). Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200603080304.16548-3-dietmar.eggemann@arm.com --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8f360326861e..9c89b0eaf796 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5810,7 +5810,7 @@ again: if (task_running(p_rq, p) || p->state) goto out_unlock; - yielded = curr->sched_class->yield_to_task(rq, p, preempt); + yielded = curr->sched_class->yield_to_task(rq, p); if (yielded) { schedstat_inc(rq->yld_count); /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cbcb2f71599b..6a4dab2f7c07 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7157,7 +7157,7 @@ static void yield_task_fair(struct rq *rq) set_skip_buddy(se); } -static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) +static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1d4e94c1e5fe..8d5d06881294 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1748,7 +1748,7 @@ struct sched_class { void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); - bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt); + bool (*yield_to_task)(struct rq *rq, struct task_struct *p); void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); -- cgit v1.2.3 From e3e76a6a04114ec95b0969cd026e8904c67b431b Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 3 Jun 2020 10:03:03 +0200 Subject: sched/idle,stop: Remove .get_rr_interval from sched_class The idle task and stop task sched_classes return 0 in this function. The single call site in sched_rr_get_interval() calls p->sched_class->get_rr_interval() only conditional in case it is defined. Otherwise time_slice=0 will be used. The deadline sched class does not define it. Commit a57beec5d427 ("sched: Make sched_class::get_rr_interval() optional") introduced the default time-slice=0 for sched classes which do not provide this function. So .get_rr_interval for idle and stop sched_class can be removed to shrink the code a little. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200603080304.16548-4-dietmar.eggemann@arm.com --- kernel/sched/idle.c | 7 ------- kernel/sched/stop_task.c | 8 -------- 2 files changed, 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 05deb81bb3e3..8d75ca201484 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -446,11 +446,6 @@ prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) BUG(); } -static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) -{ - return 0; -} - static void update_curr_idle(struct rq *rq) { } @@ -479,8 +474,6 @@ const struct sched_class idle_sched_class = { .task_tick = task_tick_idle, - .get_rr_interval = get_rr_interval_idle, - .prio_changed = prio_changed_idle, .switched_to = switched_to_idle, .update_curr = update_curr_idle, diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 4c9e9975684f..3e50a6a8f1e5 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -102,12 +102,6 @@ prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) BUG(); /* how!?, what priority? */ } -static unsigned int -get_rr_interval_stop(struct rq *rq, struct task_struct *task) -{ - return 0; -} - static void update_curr_stop(struct rq *rq) { } @@ -136,8 +130,6 @@ const struct sched_class stop_sched_class = { .task_tick = task_tick_stop, - .get_rr_interval = get_rr_interval_stop, - .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, .update_curr = update_curr_stop, -- cgit v1.2.3 From 1ca2034ed798aea72a68d3904bd39a6cbfbdf405 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 3 Jun 2020 10:03:04 +0200 Subject: sched/fair: Remove unused 'sd' parameter from scale_rt_capacity() Since commit 8ec59c0f5f49 ("sched/topology: Remove unused 'sd' parameter from arch_scale_cpu_capacity()") it is no longer needed. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20200603080304.16548-5-dietmar.eggemann@arm.com --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6a4dab2f7c07..69da576f7f48 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8038,7 +8038,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) }; } -static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) +static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long max = arch_scale_cpu_capacity(cpu); @@ -8070,7 +8070,7 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = scale_rt_capacity(sd, cpu); + unsigned long capacity = scale_rt_capacity(cpu); struct sched_group *sdg = sd->groups; cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); -- cgit v1.2.3 From 4581bea8b4ec4de353369775dfef921191e393b3 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 27 May 2020 17:39:14 +0100 Subject: sched/debug: Add new tracepoints to track util_est The util_est signals are key elements for EAS task placement and frequency selection. Having tracepoints to track these signals enables load-tracking and schedutil testing and/or debugging by a toolkit. Signed-off-by: Vincent Donnefort Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/1590597554-370150-1-git-send-email-vincent.donnefort@arm.com --- include/trace/events/sched.h | 8 ++++++++ kernel/sched/core.c | 2 ++ kernel/sched/fair.c | 6 ++++++ 3 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index ed168b0e2c53..04f9a4c7b0d9 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -634,6 +634,14 @@ DECLARE_TRACE(sched_overutilized_tp, TP_PROTO(struct root_domain *rd, bool overutilized), TP_ARGS(rd, overutilized)); +DECLARE_TRACE(sched_util_est_cfs_tp, + TP_PROTO(struct cfs_rq *cfs_rq), + TP_ARGS(cfs_rq)); + +DECLARE_TRACE(sched_util_est_se_tp, + TP_PROTO(struct sched_entity *se), + TP_ARGS(se)); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9c89b0eaf796..0208b71bef80 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -36,6 +36,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 69da576f7f48..a785a9b262dd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3922,6 +3922,8 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, enqueued = cfs_rq->avg.util_est.enqueued; enqueued += _task_util_est(p); WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + + trace_sched_util_est_cfs_tp(cfs_rq); } /* @@ -3952,6 +3954,8 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p)); WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); + trace_sched_util_est_cfs_tp(cfs_rq); + /* * Skip update of task's estimated utilization when the task has not * yet completed an activation, e.g. being migrated. @@ -4017,6 +4021,8 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; done: WRITE_ONCE(p->se.avg.util_est, ue); + + trace_sched_util_est_se_tp(&p->se); } static inline int task_fits_capacity(struct task_struct *p, long capacity) -- cgit v1.2.3 From 461daba06bdcb9c7a3f92b9bbd110e1f7d093ffc Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 28 May 2020 12:54:42 -0700 Subject: psi: eliminate kthread_worker from psi trigger scheduling mechanism Each psi group requires a dedicated kthread_delayed_work and kthread_worker. Since no other work can be performed using psi_group's kthread_worker, the same result can be obtained using a task_struct and a timer directly. This makes psi triggering simpler by removing lists and locks involved with kthread_worker usage and eliminates the need for poll_scheduled atomic use in the hot path. Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200528195442.190116-1-surenb@google.com --- include/linux/psi_types.h | 7 +-- kernel/sched/psi.c | 113 ++++++++++++++++++++++++++-------------------- 2 files changed, 68 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 4b7258495a04..b95f3211566a 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -153,9 +153,10 @@ struct psi_group { unsigned long avg[NR_PSI_STATES - 1][3]; /* Monitor work control */ - atomic_t poll_scheduled; - struct kthread_worker __rcu *poll_kworker; - struct kthread_delayed_work poll_work; + struct task_struct __rcu *poll_task; + struct timer_list poll_timer; + wait_queue_head_t poll_wait; + atomic_t poll_wakeup; /* Protects data used by the monitor */ struct mutex trigger_lock; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 8f45cdb6463b..e53b711bd643 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -190,7 +190,6 @@ static void group_init(struct psi_group *group) INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); mutex_init(&group->avgs_lock); /* Init trigger-related members */ - atomic_set(&group->poll_scheduled, 0); mutex_init(&group->trigger_lock); INIT_LIST_HEAD(&group->triggers); memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); @@ -199,7 +198,7 @@ static void group_init(struct psi_group *group) memset(group->polling_total, 0, sizeof(group->polling_total)); group->polling_next_update = ULLONG_MAX; group->polling_until = 0; - rcu_assign_pointer(group->poll_kworker, NULL); + rcu_assign_pointer(group->poll_task, NULL); } void __init psi_init(void) @@ -547,47 +546,38 @@ static u64 update_triggers(struct psi_group *group, u64 now) return now + group->poll_min_period; } -/* - * Schedule polling if it's not already scheduled. It's safe to call even from - * hotpath because even though kthread_queue_delayed_work takes worker->lock - * spinlock that spinlock is never contended due to poll_scheduled atomic - * preventing such competition. - */ +/* Schedule polling if it's not already scheduled. */ static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) { - struct kthread_worker *kworker; + struct task_struct *task; - /* Do not reschedule if already scheduled */ - if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0) + /* + * Do not reschedule if already scheduled. + * Possible race with a timer scheduled after this check but before + * mod_timer below can be tolerated because group->polling_next_update + * will keep updates on schedule. + */ + if (timer_pending(&group->poll_timer)) return; rcu_read_lock(); - kworker = rcu_dereference(group->poll_kworker); + task = rcu_dereference(group->poll_task); /* * kworker might be NULL in case psi_trigger_destroy races with * psi_task_change (hotpath) which can't use locks */ - if (likely(kworker)) - kthread_queue_delayed_work(kworker, &group->poll_work, delay); - else - atomic_set(&group->poll_scheduled, 0); + if (likely(task)) + mod_timer(&group->poll_timer, jiffies + delay); rcu_read_unlock(); } -static void psi_poll_work(struct kthread_work *work) +static void psi_poll_work(struct psi_group *group) { - struct kthread_delayed_work *dwork; - struct psi_group *group; u32 changed_states; u64 now; - dwork = container_of(work, struct kthread_delayed_work, work); - group = container_of(dwork, struct psi_group, poll_work); - - atomic_set(&group->poll_scheduled, 0); - mutex_lock(&group->trigger_lock); now = sched_clock(); @@ -623,6 +613,35 @@ out: mutex_unlock(&group->trigger_lock); } +static int psi_poll_worker(void *data) +{ + struct psi_group *group = (struct psi_group *)data; + struct sched_param param = { + .sched_priority = 1, + }; + + sched_setscheduler_nocheck(current, SCHED_FIFO, ¶m); + + while (true) { + wait_event_interruptible(group->poll_wait, + atomic_cmpxchg(&group->poll_wakeup, 1, 0) || + kthread_should_stop()); + if (kthread_should_stop()) + break; + + psi_poll_work(group); + } + return 0; +} + +static void poll_timer_fn(struct timer_list *t) +{ + struct psi_group *group = from_timer(group, t, poll_timer); + + atomic_set(&group->poll_wakeup, 1); + wake_up_interruptible(&group->poll_wait); +} + static void record_times(struct psi_group_cpu *groupc, int cpu, bool memstall_tick) { @@ -1099,22 +1118,20 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, mutex_lock(&group->trigger_lock); - if (!rcu_access_pointer(group->poll_kworker)) { - struct sched_param param = { - .sched_priority = 1, - }; - struct kthread_worker *kworker; + if (!rcu_access_pointer(group->poll_task)) { + struct task_struct *task; - kworker = kthread_create_worker(0, "psimon"); - if (IS_ERR(kworker)) { + task = kthread_create(psi_poll_worker, group, "psimon"); + if (IS_ERR(task)) { kfree(t); mutex_unlock(&group->trigger_lock); - return ERR_CAST(kworker); + return ERR_CAST(task); } - sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m); - kthread_init_delayed_work(&group->poll_work, - psi_poll_work); - rcu_assign_pointer(group->poll_kworker, kworker); + atomic_set(&group->poll_wakeup, 0); + init_waitqueue_head(&group->poll_wait); + wake_up_process(task); + timer_setup(&group->poll_timer, poll_timer_fn, 0); + rcu_assign_pointer(group->poll_task, task); } list_add(&t->node, &group->triggers); @@ -1132,7 +1149,7 @@ static void psi_trigger_destroy(struct kref *ref) { struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); struct psi_group *group = t->group; - struct kthread_worker *kworker_to_destroy = NULL; + struct task_struct *task_to_destroy = NULL; if (static_branch_likely(&psi_disabled)) return; @@ -1158,13 +1175,13 @@ static void psi_trigger_destroy(struct kref *ref) period = min(period, div_u64(tmp->win.size, UPDATES_PER_WINDOW)); group->poll_min_period = period; - /* Destroy poll_kworker when the last trigger is destroyed */ + /* Destroy poll_task when the last trigger is destroyed */ if (group->poll_states == 0) { group->polling_until = 0; - kworker_to_destroy = rcu_dereference_protected( - group->poll_kworker, + task_to_destroy = rcu_dereference_protected( + group->poll_task, lockdep_is_held(&group->trigger_lock)); - rcu_assign_pointer(group->poll_kworker, NULL); + rcu_assign_pointer(group->poll_task, NULL); } } @@ -1172,25 +1189,23 @@ static void psi_trigger_destroy(struct kref *ref) /* * Wait for both *trigger_ptr from psi_trigger_replace and - * poll_kworker RCUs to complete their read-side critical sections - * before destroying the trigger and optionally the poll_kworker + * poll_task RCUs to complete their read-side critical sections + * before destroying the trigger and optionally the poll_task */ synchronize_rcu(); /* * Destroy the kworker after releasing trigger_lock to prevent a * deadlock while waiting for psi_poll_work to acquire trigger_lock */ - if (kworker_to_destroy) { + if (task_to_destroy) { /* * After the RCU grace period has expired, the worker - * can no longer be found through group->poll_kworker. + * can no longer be found through group->poll_task. * But it might have been already scheduled before * that - deschedule it cleanly before destroying it. */ - kthread_cancel_delayed_work_sync(&group->poll_work); - atomic_set(&group->poll_scheduled, 0); - - kthread_destroy_worker(kworker_to_destroy); + del_timer_sync(&group->poll_timer); + kthread_stop(task_to_destroy); } kfree(t); } -- cgit v1.2.3 From 043eb8e1051143a24811e6f35c276e35ae8247b6 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 27 May 2020 16:29:08 +0200 Subject: kthread: Switch to cpu_possible_mask Next patch will switch unbound kernel threads mask to housekeeping_cpumask(), a subset of cpu_possible_mask. So in order to ease bisection, lets first switch kthreads default affinity from cpu_all_mask to cpu_possible_mask. It looks safe to do so as cpu_possible_mask seem to be initialized at setup_arch() time, way before kthreadd is created. Suggested-by: Frederic Weisbecker Signed-off-by: Frederic Weisbecker Signed-off-by: Marcelo Tosatti Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200527142909.23372-2-frederic@kernel.org --- kernel/kthread.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 8e3d2d7fdf5e..b86d37cda109 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -383,7 +383,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), * The kernel thread should not inherit these properties. */ sched_setscheduler_nocheck(task, SCHED_NORMAL, ¶m); - set_cpus_allowed_ptr(task, cpu_all_mask); + set_cpus_allowed_ptr(task, cpu_possible_mask); } kfree(create); return task; @@ -608,7 +608,7 @@ int kthreadd(void *unused) /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); - set_cpus_allowed_ptr(tsk, cpu_all_mask); + set_cpus_allowed_ptr(tsk, cpu_possible_mask); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; -- cgit v1.2.3 From 9cc5b8656892a72438ee7deb5e80f5be47643b8b Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 27 May 2020 16:29:09 +0200 Subject: isolcpus: Affine unbound kernel threads to housekeeping cpus This is a kernel enhancement that configures the cpu affinity of kernel threads via kernel boot option nohz_full=. When this option is specified, the cpumask is immediately applied upon kthread launch. This does not affect kernel threads that specify cpu and node. This allows CPU isolation (that is not allowing certain threads to execute on certain CPUs) without using the isolcpus=domain parameter, making it possible to enable load balancing on such CPUs during runtime (see kernel-parameters.txt). Note-1: this is based off on Wind River's patch at https://github.com/starlingx-staging/stx-integ/blob/master/kernel/kernel-std/centos/patches/affine-compute-kernel-threads.patch Difference being that this patch is limited to modifying kernel thread cpumask. Behaviour of other threads can be controlled via cgroups or sched_setaffinity. Note-2: Wind River's patch was based off Christoph Lameter's patch at https://lwn.net/Articles/565932/ with the only difference being the kernel parameter changed from kthread to kthread_cpus. Signed-off-by: Frederic Weisbecker Signed-off-by: Marcelo Tosatti Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200527142909.23372-3-frederic@kernel.org --- include/linux/sched/isolation.h | 1 + kernel/kthread.c | 6 ++++-- kernel/sched/isolation.c | 3 ++- 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index 0fbcbacd1b29..cc9f393e2a70 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -14,6 +14,7 @@ enum hk_flags { HK_FLAG_DOMAIN = (1 << 5), HK_FLAG_WQ = (1 << 6), HK_FLAG_MANAGED_IRQ = (1 << 7), + HK_FLAG_KTHREAD = (1 << 8), }; #ifdef CONFIG_CPU_ISOLATION diff --git a/kernel/kthread.c b/kernel/kthread.c index b86d37cda109..032b610912b0 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -383,7 +384,8 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), * The kernel thread should not inherit these properties. */ sched_setscheduler_nocheck(task, SCHED_NORMAL, ¶m); - set_cpus_allowed_ptr(task, cpu_possible_mask); + set_cpus_allowed_ptr(task, + housekeeping_cpumask(HK_FLAG_KTHREAD)); } kfree(create); return task; @@ -608,7 +610,7 @@ int kthreadd(void *unused) /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); - set_cpus_allowed_ptr(tsk, cpu_possible_mask); + set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_FLAG_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 808244f3ddd9..5a6ea03f9882 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -140,7 +140,8 @@ static int __init housekeeping_nohz_full_setup(char *str) { unsigned int flags; - flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; + flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | + HK_FLAG_MISC | HK_FLAG_KTHREAD; return housekeeping_setup(str, flags); } -- cgit v1.2.3 From b4098bfc5efb1fd7ecf40165132a1283aeea3500 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Jul 2019 16:54:10 +0200 Subject: sched/deadline: Impose global limits on sched_attr::sched_period Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726161357.397880775@infradead.org --- include/linux/sched/sysctl.h | 3 +++ kernel/sched/deadline.c | 23 +++++++++++++++++++++-- kernel/sysctl.c | 14 ++++++++++++++ 3 files changed, 38 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 660ac49f2b53..24be30a40814 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -61,6 +61,9 @@ int sched_proc_update_handler(struct ctl_table *table, int write, extern unsigned int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; +extern unsigned int sysctl_sched_dl_period_max; +extern unsigned int sysctl_sched_dl_period_min; + #ifdef CONFIG_UCLAMP_TASK extern unsigned int sysctl_sched_uclamp_util_min; extern unsigned int sysctl_sched_uclamp_util_max; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 504d2f51b0d6..f31964ad9c2e 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2634,6 +2634,14 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) attr->sched_flags = dl_se->flags; } +/* + * Default limits for DL period; on the top end we guard against small util + * tasks still getting rediculous long effective runtimes, on the bottom end we + * guard against timer DoS. + */ +unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */ +unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */ + /* * This function validates the new parameters of a -deadline task. * We ask for the deadline not being zero, and greater or equal @@ -2646,6 +2654,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) */ bool __checkparam_dl(const struct sched_attr *attr) { + u64 period, max, min; + /* special dl tasks don't actually use any parameter */ if (attr->sched_flags & SCHED_FLAG_SUGOV) return true; @@ -2669,12 +2679,21 @@ bool __checkparam_dl(const struct sched_attr *attr) attr->sched_period & (1ULL << 63)) return false; + period = attr->sched_period; + if (!period) + period = attr->sched_deadline; + /* runtime <= deadline <= period (if period != 0) */ - if ((attr->sched_period != 0 && - attr->sched_period < attr->sched_deadline) || + if (period < attr->sched_deadline || attr->sched_deadline < attr->sched_runtime) return false; + max = (u64)READ_ONCE(sysctl_sched_dl_period_max) * NSEC_PER_USEC; + min = (u64)READ_ONCE(sysctl_sched_dl_period_min) * NSEC_PER_USEC; + + if (period < min || period > max) + return false; + return true; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index db1ce7af2563..4aea67d3d552 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1779,6 +1779,20 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_rt_handler, }, + { + .procname = "sched_deadline_period_max_us", + .data = &sysctl_sched_dl_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_deadline_period_min_us", + .data = &sysctl_sched_dl_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "sched_rr_timeslice_ms", .data = &sysctl_sched_rr_timeslice, -- cgit v1.2.3 From 3ea2f097b17e13a8280f1f9386c331b326a3dbef Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 9 Jun 2020 14:37:48 +0200 Subject: sched/fair: Fix NOHZ next idle balance With commit: 'b7031a02ec75 ("sched/fair: Add NOHZ_STATS_KICK")' rebalance_domains of the local cfs_rq happens before others idle cpus have updated nohz.next_balance and its value is overwritten. Move the update of nohz.next_balance for other idles cpus before balancing and updating the next_balance of local cfs_rq. Also, the nohz.next_balance is now updated only if all idle cpus got a chance to rebalance their domains and the idle balance has not been aborted because of new activities on the CPU. In case of need_resched, the idle load balance will be kick the next jiffie in order to address remaining ilb. Fixes: b7031a02ec75 ("sched/fair: Add NOHZ_STATS_KICK") Reported-by: Peng Liu Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Acked-by: Mel Gorman Link: https://lkml.kernel.org/r/20200609123748.18636-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a785a9b262dd..295c9ffa850b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10022,7 +10022,12 @@ static void kick_ilb(unsigned int flags) { int ilb_cpu; - nohz.next_balance++; + /* + * Increase nohz.next_balance only when if full ilb is triggered but + * not if we only update stats. + */ + if (flags & NOHZ_BALANCE_KICK) + nohz.next_balance = jiffies+1; ilb_cpu = find_new_ilb(); @@ -10343,6 +10348,14 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, } } + /* + * next_balance will be updated only when there is a need. + * When the CPU is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + nohz.next_balance = next_balance; + /* Newly idle CPU doesn't need an update */ if (idle != CPU_NEWLY_IDLE) { update_blocked_averages(this_cpu); @@ -10363,14 +10376,6 @@ abort: if (has_blocked_load) WRITE_ONCE(nohz.has_blocked, 1); - /* - * next_balance will be updated only when there is a need. - * When the CPU is attached to null domain for ex, it will not be - * updated. - */ - if (likely(update_next_balance)) - nohz.next_balance = next_balance; - return ret; } -- cgit v1.2.3 From 9b1b234bb86bcdcdb142e900d39b599185465dbb Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Tue, 9 Jun 2020 23:09:36 +0800 Subject: sched: correct SD_flags returned by tl->sd_flags() During sched domain init, we check whether non-topological SD_flags are returned by tl->sd_flags(), if found, fire a waning and correct the violation, but the code failed to correct the violation. Correct this. Fixes: 143e1e28cb40 ("sched: Rework sched_domain topology definition") Signed-off-by: Peng Liu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20200609150936.GA13060@iZj6chx1xj0e0buvshuecpZ --- kernel/sched/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index ba81187bb7af..9079d865a935 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1328,7 +1328,7 @@ sd_init(struct sched_domain_topology_level *tl, sd_flags = (*tl->sd_flags)(); if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, "wrong sd_flags in topology description\n")) - sd_flags &= ~TOPOLOGY_SD_FLAGS; + sd_flags &= TOPOLOGY_SD_FLAGS; /* Apply detected topology flags */ sd_flags |= dflags; -- cgit v1.2.3 From c81b89329933c6c0be809d4c0d2cb57c49153ee3 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 20 May 2020 15:42:39 +0200 Subject: sched/deadline: Optimize dl_bw_cpus() Return the weight of the root domain (rd) span in case it is a subset of the cpu_active_mask. Continue to compute the number of CPUs over rd span and cpu_active_mask when in hotplug. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: https://lkml.kernel.org/r/20200520134243.19352-2-dietmar.eggemann@arm.com --- kernel/sched/deadline.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f31964ad9c2e..ec90265e9d8e 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -54,10 +54,16 @@ static inline struct dl_bw *dl_bw_of(int i) static inline int dl_bw_cpus(int i) { struct root_domain *rd = cpu_rq(i)->rd; - int cpus = 0; + int cpus; RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), "sched RCU must be held"); + + if (cpumask_subset(rd->span, cpu_active_mask)) + return cpumask_weight(rd->span); + + cpus = 0; + for_each_cpu_and(i, rd->span, cpu_active_mask) cpus++; -- cgit v1.2.3 From fc9dc698472aa460a8b3b036d9b1d0b751f12f58 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 20 May 2020 15:42:40 +0200 Subject: sched/deadline: Add dl_bw_capacity() Capacity-aware SCHED_DEADLINE Admission Control (AC) needs root domain (rd) CPU capacity sum. Introduce dl_bw_capacity() which for a symmetric rd w/ a CPU capacity of SCHED_CAPACITY_SCALE simply relies on dl_bw_cpus() to return #CPUs multiplied by SCHED_CAPACITY_SCALE. For an asymmetric rd or a CPU capacity < SCHED_CAPACITY_SCALE it computes the CPU capacity sum over rd span and cpu_active_mask. A 'XXX Fix:' comment was added to highlight that if 'rq->rd == def_root_domain' AC should be performed against the capacity of the CPU the task is running on rather the rd CPU capacity sum. This issue already exists w/o capacity awareness. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: https://lkml.kernel.org/r/20200520134243.19352-3-dietmar.eggemann@arm.com --- kernel/sched/deadline.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ec90265e9d8e..01f474a5bd14 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -69,6 +69,34 @@ static inline int dl_bw_cpus(int i) return cpus; } + +static inline unsigned long __dl_bw_capacity(int i) +{ + struct root_domain *rd = cpu_rq(i)->rd; + unsigned long cap = 0; + + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + + for_each_cpu_and(i, rd->span, cpu_active_mask) + cap += capacity_orig_of(i); + + return cap; +} + +/* + * XXX Fix: If 'rq->rd == def_root_domain' perform AC against capacity + * of the CPU the task is running on rather rd's \Sum CPU capacity. + */ +static inline unsigned long dl_bw_capacity(int i) +{ + if (!static_branch_unlikely(&sched_asym_cpucapacity) && + capacity_orig_of(i) == SCHED_CAPACITY_SCALE) { + return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT; + } else { + return __dl_bw_capacity(i); + } +} #else static inline struct dl_bw *dl_bw_of(int i) { @@ -79,6 +107,11 @@ static inline int dl_bw_cpus(int i) { return 1; } + +static inline unsigned long dl_bw_capacity(int i) +{ + return SCHED_CAPACITY_SCALE; +} #endif static inline -- cgit v1.2.3 From 60ffd5edc5e4fa69622c125c54ef8e7d5d894af8 Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Wed, 20 May 2020 15:42:41 +0200 Subject: sched/deadline: Improve admission control for asymmetric CPU capacities The current SCHED_DEADLINE (DL) admission control ensures that sum of reserved CPU bandwidth < x * M where x = /proc/sys/kernel/sched_rt_{runtime,period}_us M = # CPUs in root domain. DL admission control works well for homogeneous systems where the capacity of all CPUs are equal (1024). I.e. bounded tardiness for DL and non-starvation of non-DL tasks is guaranteed. But on heterogeneous systems where capacity of CPUs are different it could fail by over-allocating CPU time on smaller capacity CPUs. On an Arm big.LITTLE/DynamIQ system DL tasks can easily starve other tasks making it unusable. Fix this by explicitly considering the CPU capacity in the DL admission test by replacing M with the root domain CPU capacity sum. Signed-off-by: Luca Abeni Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: https://lkml.kernel.org/r/20200520134243.19352-4-dietmar.eggemann@arm.com --- kernel/sched/deadline.c | 30 +++++++++++++++++------------- kernel/sched/sched.h | 6 +++--- 2 files changed, 20 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 01f474a5bd14..9ebd0a9241ed 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2590,11 +2590,12 @@ void sched_dl_do_global(void) int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr) { - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); u64 period = attr->sched_period ?: attr->sched_deadline; u64 runtime = attr->sched_runtime; u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; - int cpus, err = -1; + int cpus, err = -1, cpu = task_cpu(p); + struct dl_bw *dl_b = dl_bw_of(cpu); + unsigned long cap; if (attr->sched_flags & SCHED_FLAG_SUGOV) return 0; @@ -2609,15 +2610,17 @@ int sched_dl_overflow(struct task_struct *p, int policy, * allocated bandwidth of the container. */ raw_spin_lock(&dl_b->lock); - cpus = dl_bw_cpus(task_cpu(p)); + cpus = dl_bw_cpus(cpu); + cap = dl_bw_capacity(cpu); + if (dl_policy(policy) && !task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, 0, new_bw)) { + !__dl_overflow(dl_b, cap, 0, new_bw)) { if (hrtimer_active(&p->dl.inactive_timer)) __dl_sub(dl_b, p->dl.dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); err = 0; } else if (dl_policy(policy) && task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { + !__dl_overflow(dl_b, cap, p->dl.dl_bw, new_bw)) { /* * XXX this is slightly incorrect: when the task * utilization decreases, we should delay the total @@ -2772,19 +2775,19 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) #ifdef CONFIG_SMP int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) { + unsigned long flags, cap; unsigned int dest_cpu; struct dl_bw *dl_b; bool overflow; - int cpus, ret; - unsigned long flags; + int ret; dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); rcu_read_lock_sched(); dl_b = dl_bw_of(dest_cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(dest_cpu); - overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); + cap = dl_bw_capacity(dest_cpu); + overflow = __dl_overflow(dl_b, cap, 0, p->dl.dl_bw); if (overflow) { ret = -EBUSY; } else { @@ -2794,6 +2797,8 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo * We will free resources in the source root_domain * later on (see set_cpus_allowed_dl()). */ + int cpus = dl_bw_cpus(dest_cpu); + __dl_add(dl_b, p->dl.dl_bw, cpus); ret = 0; } @@ -2826,16 +2831,15 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, bool dl_cpu_busy(unsigned int cpu) { - unsigned long flags; + unsigned long flags, cap; struct dl_bw *dl_b; bool overflow; - int cpus; rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(cpu); - overflow = __dl_overflow(dl_b, cpus, 0, 0); + cap = dl_bw_capacity(cpu); + overflow = __dl_overflow(dl_b, cap, 0, 0); raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8d5d06881294..91b250f265c0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -310,11 +310,11 @@ void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) __dl_update(dl_b, -((s32)tsk_bw / cpus)); } -static inline -bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +static inline bool __dl_overflow(struct dl_bw *dl_b, unsigned long cap, + u64 old_bw, u64 new_bw) { return dl_b->bw != -1 && - dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; + cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw; } extern void init_dl_bw(struct dl_bw *dl_b); -- cgit v1.2.3 From b4118988fdcb4554ea6687dd8ff68bcab690b8ea Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Wed, 20 May 2020 15:42:42 +0200 Subject: sched/deadline: Make DL capacity-aware The current SCHED_DEADLINE (DL) scheduler uses a global EDF scheduling algorithm w/o considering CPU capacity or task utilization. This works well on homogeneous systems where DL tasks are guaranteed to have a bounded tardiness but presents issues on heterogeneous systems. A DL task can migrate to a CPU which does not have enough CPU capacity to correctly serve the task (e.g. a task w/ 70ms runtime and 100ms period on a CPU w/ 512 capacity). Add the DL fitness function dl_task_fits_capacity() for DL admission control on heterogeneous systems. A task fits onto a CPU if: CPU original capacity / 1024 >= task runtime / task deadline Use this function on heterogeneous systems to try to find a CPU which meets this criterion during task wakeup, push and offline migration. On homogeneous systems the original behavior of the DL admission control should be retained. Signed-off-by: Luca Abeni Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: https://lkml.kernel.org/r/20200520134243.19352-5-dietmar.eggemann@arm.com --- kernel/sched/cpudeadline.c | 14 +++++++++++++- kernel/sched/deadline.c | 18 ++++++++++++++---- kernel/sched/sched.h | 15 +++++++++++++++ 3 files changed, 42 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5cc4012572ec..8630f2a40a3f 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -121,7 +121,19 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, if (later_mask && cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { - return 1; + int cpu; + + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return 1; + + /* Ensure the capacity of the CPUs fits the task. */ + for_each_cpu(cpu, later_mask) { + if (!dl_task_fits_capacity(p, cpu)) + cpumask_clear_cpu(cpu, later_mask); + } + + if (!cpumask_empty(later_mask)) + return 1; } else { int best_cpu = cpudl_maximum(cp); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9ebd0a9241ed..84e84ba0b00a 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1643,6 +1643,7 @@ static int select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) { struct task_struct *curr; + bool select_rq; struct rq *rq; if (sd_flag != SD_BALANCE_WAKE) @@ -1662,10 +1663,19 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) * other hand, if it has a shorter deadline, we * try to make it stay here, it might be important. */ - if (unlikely(dl_task(curr)) && - (curr->nr_cpus_allowed < 2 || - !dl_entity_preempt(&p->dl, &curr->dl)) && - (p->nr_cpus_allowed > 1)) { + select_rq = unlikely(dl_task(curr)) && + (curr->nr_cpus_allowed < 2 || + !dl_entity_preempt(&p->dl, &curr->dl)) && + p->nr_cpus_allowed > 1; + + /* + * Take the capacity of the CPU into account to + * ensure it fits the requirement of the task. + */ + if (static_branch_unlikely(&sched_asym_cpucapacity)) + select_rq |= !dl_task_fits_capacity(p, cpu); + + if (select_rq) { int target = find_later_rq(p); if (target != -1 && diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 91b250f265c0..336887607b3d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -317,6 +317,21 @@ static inline bool __dl_overflow(struct dl_bw *dl_b, unsigned long cap, cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw; } +/* + * Verify the fitness of task @p to run on @cpu taking into account the + * CPU original capacity and the runtime/deadline ratio of the task. + * + * The function will return true if the CPU original capacity of the + * @cpu scaled by SCHED_CAPACITY_SCALE >= runtime/deadline ratio of the + * task and false otherwise. + */ +static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu) +{ + unsigned long cap = arch_scale_cpu_capacity(cpu); + + return cap_scale(p->dl.dl_deadline, cap) >= p->dl.dl_runtime; +} + extern void init_dl_bw(struct dl_bw *dl_b); extern int sched_dl_global_validate(void); extern void sched_dl_do_global(void); -- cgit v1.2.3 From 23e71d8ba42933bff12e453858fd68c073bc5258 Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Wed, 20 May 2020 15:42:43 +0200 Subject: sched/deadline: Implement fallback mechanism for !fit case When a task has a runtime that cannot be served within the scheduling deadline by any of the idle CPU (later_mask) the task is doomed to miss its deadline. This can happen since the SCHED_DEADLINE admission control guarantees only bounded tardiness and not the hard respect of all deadlines. In this case try to select the idle CPU with the largest CPU capacity to minimize tardiness. Favor task_cpu(p) if it has max capacity of !fitting CPUs so that find_later_rq() can potentially still return it (most likely cache-hot) early. Signed-off-by: Luca Abeni Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: https://lkml.kernel.org/r/20200520134243.19352-6-dietmar.eggemann@arm.com --- kernel/sched/cpudeadline.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 8630f2a40a3f..8cb06c8c7eb1 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -121,19 +121,31 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, if (later_mask && cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { - int cpu; + unsigned long cap, max_cap = 0; + int cpu, max_cpu = -1; if (!static_branch_unlikely(&sched_asym_cpucapacity)) return 1; /* Ensure the capacity of the CPUs fits the task. */ for_each_cpu(cpu, later_mask) { - if (!dl_task_fits_capacity(p, cpu)) + if (!dl_task_fits_capacity(p, cpu)) { cpumask_clear_cpu(cpu, later_mask); + + cap = capacity_orig_of(cpu); + + if (cap > max_cap || + (cpu == task_cpu(p) && cap == max_cap)) { + max_cap = cap; + max_cpu = cpu; + } + } } - if (!cpumask_empty(later_mask)) - return 1; + if (cpumask_empty(later_mask)) + cpumask_set_cpu(max_cpu, later_mask); + + return 1; } else { int best_cpu = cpudl_maximum(cp); -- cgit v1.2.3 From c49694173da004b1b16082f82f28bd625415fbb2 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 2 Jun 2020 21:50:02 +0200 Subject: sched/deadline: Fix a typo in a comment s/deadine/deadline/ Signed-off-by: Christophe JAILLET Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200602195002.677448-1-christophe.jaillet@wanadoo.fr --- kernel/sched/deadline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 84e84ba0b00a..d4708e29008f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1137,7 +1137,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) * cannot use the runtime, and so it replenishes the task. This rule * works fine for implicit deadline tasks (deadline == period), and the * CBS was designed for implicit deadline tasks. However, a task with - * constrained deadline (deadine < period) might be awakened after the + * constrained deadline (deadline < period) might be awakened after the * deadline, but before the next period. In this case, replenishing the * task would allow it to run for runtime / deadline. As in this case * deadline < period, CBS enables a task to run for more than the -- cgit v1.2.3 From 87e867b4269f29dac8190bca13912d08163a277f Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 12 Jun 2020 17:47:03 +0200 Subject: sched/pelt: Cleanup PELT divider Factorize in a single place the calculation of the divider to be used to to compute *_avg from *_sum value Suggested-by: Dietmar Eggemann Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200612154703.23555-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 32 ++++++++++++++++++-------------- kernel/sched/pelt.c | 2 +- kernel/sched/pelt.h | 5 +++++ 3 files changed, 24 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 295c9ffa850b..0424a0af5f87 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3094,7 +3094,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, #ifdef CONFIG_SMP do { - u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib; + u32 divider = get_pelt_divider(&se->avg); se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); } while (0); @@ -3440,16 +3440,18 @@ static inline void update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; - /* - * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. - * See ___update_load_avg() for details. - */ - u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + u32 divider; /* Nothing to update */ if (!delta) return; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + divider = get_pelt_divider(&cfs_rq->avg); + /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; se->avg.util_sum = se->avg.util_avg * divider; @@ -3463,16 +3465,18 @@ static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; - /* - * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. - * See ___update_load_avg() for details. - */ - u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + u32 divider; /* Nothing to update */ if (!delta) return; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + divider = get_pelt_divider(&cfs_rq->avg); + /* Set new sched_entity's runnable */ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; se->avg.runnable_sum = se->avg.runnable_avg * divider; @@ -3500,7 +3504,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. * See ___update_load_avg() for details. */ - divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + divider = get_pelt_divider(&cfs_rq->avg); if (runnable_sum >= 0) { /* @@ -3646,7 +3650,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) if (cfs_rq->removed.nr) { unsigned long r; - u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; + u32 divider = get_pelt_divider(&cfs_rq->avg); raw_spin_lock(&cfs_rq->removed.lock); swap(cfs_rq->removed.util_avg, removed_util); @@ -3701,7 +3705,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. * See ___update_load_avg() for details. */ - u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + u32 divider = get_pelt_divider(&cfs_rq->avg); /* * When we attach the @se to the @cfs_rq, we must align the decay diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index dea5567e4f72..11bea3b08115 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -262,7 +262,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa, static __always_inline void ___update_load_avg(struct sched_avg *sa, unsigned long load) { - u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; + u32 divider = get_pelt_divider(sa); /* * Step 2: update *_avg. diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index eb034d9f024d..795e43e02afc 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -37,6 +37,11 @@ update_irq_load_avg(struct rq *rq, u64 running) } #endif +static inline u32 get_pelt_divider(struct sched_avg *avg) +{ + return LOAD_AVG_MAX - 1024 + avg->period_contrib; +} + /* * When a task is dequeued, its estimated utilization should not be update if * its util_avg has not been updated at least once. -- cgit v1.2.3 From 7318d4cc14c8c8a5dde2b0b72ea50fd2545f0b7a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Apr 2020 12:09:13 +0200 Subject: sched: Provide sched_set_fifo() SCHED_FIFO (or any static priority scheduler) is a broken scheduler model; it is fundamentally incapable of resource management, the one thing an OS is actually supposed to do. It is impossible to compose static priority workloads. One cannot take two well designed and functional static priority workloads and mash them together and still expect them to work. Therefore it doesn't make sense to expose the priority field; the kernel is fundamentally incapable of setting a sensible value, it needs systems knowledge that it doesn't have. Take away sched_setschedule() / sched_setattr() from modules and replace them with: - sched_set_fifo(p); create a FIFO task (at prio 50) - sched_set_fifo_low(p); create a task higher than NORMAL, which ends up being a FIFO task at prio 1. - sched_set_normal(p, nice); (re)set the task to normal This stops the proliferation of randomly chosen, and irrelevant, FIFO priorities that dont't really mean anything anyway. The system administrator/integrator, whoever has insight into the actual system design and requirements (userspace) can set-up appropriate priorities if and when needed. Cc: airlied@redhat.com Cc: alexander.deucher@amd.com Cc: awalls@md.metrocast.net Cc: axboe@kernel.dk Cc: broonie@kernel.org Cc: daniel.lezcano@linaro.org Cc: gregkh@linuxfoundation.org Cc: hannes@cmpxchg.org Cc: herbert@gondor.apana.org.au Cc: hverkuil@xs4all.nl Cc: john.stultz@linaro.org Cc: nico@fluxnic.net Cc: paulmck@kernel.org Cc: rafael.j.wysocki@intel.com Cc: rmk+kernel@arm.linux.org.uk Cc: sudeep.holla@arm.com Cc: tglx@linutronix.de Cc: ulf.hansson@linaro.org Cc: wim@linux-watchdog.org Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Tested-by: Paul E. McKenney --- include/linux/sched.h | 3 +++ kernel/sched/core.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index b62e6aaf28f0..b792b8f0f4cf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1653,6 +1653,9 @@ extern int idle_cpu(int cpu); extern int available_idle_cpu(int cpu); extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); +extern int sched_set_fifo(struct task_struct *p); +extern int sched_set_fifo_low(struct task_struct *p); +extern int sched_set_normal(struct task_struct *p, int nice); extern int sched_setattr(struct task_struct *, const struct sched_attr *); extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); extern struct task_struct *idle_task(int cpu); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0208b71bef80..40d3939b0520 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5124,6 +5124,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy, * @policy: new policy. * @param: structure containing the new RT priority. * + * Use sched_set_fifo(), read its comment. + * * Return: 0 on success. An error code otherwise. * * NOTE that the task may be already dead. @@ -5166,6 +5168,51 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy, } EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); +/* + * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally + * incapable of resource management, which is the one thing an OS really should + * be doing. + * + * This is of course the reason it is limited to privileged users only. + * + * Worse still; it is fundamentally impossible to compose static priority + * workloads. You cannot take two correctly working static prio workloads + * and smash them together and still expect them to work. + * + * For this reason 'all' FIFO tasks the kernel creates are basically at: + * + * MAX_RT_PRIO / 2 + * + * The administrator _MUST_ configure the system, the kernel simply doesn't + * know enough information to make a sensible choice. + */ +int sched_set_fifo(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; + return sched_setscheduler_nocheck(p, SCHED_FIFO, &sp); +} +EXPORT_SYMBOL_GPL(sched_set_fifo); + +/* + * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. + */ +int sched_set_fifo_low(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = 1 }; + return sched_setscheduler_nocheck(p, SCHED_FIFO, &sp); +} +EXPORT_SYMBOL_GPL(sched_set_fifo_low); + +int sched_set_normal(struct task_struct *p, int nice) +{ + struct sched_attr attr = { + .sched_policy = SCHED_NORMAL, + .sched_nice = nice, + }; + return sched_setattr_nocheck(p, &attr); +} +EXPORT_SYMBOL_GPL(sched_set_normal); + static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { -- cgit v1.2.3 From 7a40798c714ff462863352d490b382515daba49e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Apr 2020 12:09:13 +0200 Subject: sched,irq: Convert to sched_set_fifo() Because SCHED_FIFO is a broken scheduler model (see previous patches) take away the priority field, the kernel can't possibly make an informed decision. Effectively no change. Cc: tglx@linutronix.de Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar --- kernel/irq/manage.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 761911168438..06b62746e1df 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1271,9 +1271,6 @@ static int setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) { struct task_struct *t; - struct sched_param param = { - .sched_priority = MAX_USER_RT_PRIO/2, - }; if (!secondary) { t = kthread_create(irq_thread, new, "irq/%d-%s", irq, @@ -1281,13 +1278,12 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) } else { t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq, new->name); - param.sched_priority -= 1; } if (IS_ERR(t)) return PTR_ERR(t); - sched_setscheduler_nocheck(t, SCHED_FIFO, ¶m); + sched_set_fifo(t); /* * We keep the reference to the task struct even if -- cgit v1.2.3 From 93db9129fa4beb78e2227554d237e8db04ca514c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Apr 2020 12:09:13 +0200 Subject: sched,locktorture: Convert to sched_set_fifo() Because SCHED_FIFO is a broken scheduler model (see previous patches) take away the priority field, the kernel can't possibly make an informed decision. Effectively changes prio from 99 to 50. Cc: paulmck@kernel.org Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Reviewed-by: Paul E. McKenney --- kernel/locking/locktorture.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 5efbfc68ce99..fca41a7ded0c 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -436,8 +436,6 @@ static int torture_rtmutex_lock(void) __acquires(torture_rtmutex) static void torture_rtmutex_boost(struct torture_random_state *trsp) { - int policy; - struct sched_param param; const unsigned int factor = 50000; /* yes, quite arbitrary */ if (!rt_task(current)) { @@ -448,8 +446,7 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp) */ if (trsp && !(torture_random(trsp) % (cxt.nrealwriters_stress * factor))) { - policy = SCHED_FIFO; - param.sched_priority = MAX_RT_PRIO - 1; + sched_set_fifo(current); } else /* common case, do nothing */ return; } else { @@ -462,13 +459,10 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp) */ if (!trsp || !(torture_random(trsp) % (cxt.nrealwriters_stress * factor * 2))) { - policy = SCHED_NORMAL; - param.sched_priority = 0; + sched_set_normal(current, 0); } else /* common case, do nothing */ return; } - - sched_setscheduler_nocheck(current, policy, ¶m); } static void torture_rtmutex_delay(struct torture_random_state *trsp) -- cgit v1.2.3 From b1433395c4cc078b4382429e6d7dd1721714094f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Apr 2020 12:09:13 +0200 Subject: sched,rcuperf: Convert to sched_set_fifo_low() Because SCHED_FIFO is a broken scheduler model (see previous patches) take away the priority field, the kernel can't possibly make an informed decision. Effectively no change. Cc: paulmck@kernel.org Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Reviewed-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 16dd1e6b7c09..64249a045232 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -354,7 +354,6 @@ rcu_perf_writer(void *arg) int i_max; long me = (long)arg; struct rcu_head *rhp = NULL; - struct sched_param sp; bool started = false, done = false, alldone = false; u64 t; u64 *wdp; @@ -363,8 +362,7 @@ rcu_perf_writer(void *arg) VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); WARN_ON(!wdpp); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); - sp.sched_priority = 1; - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + sched_set_fifo_low(current); if (holdoff) schedule_timeout_uninterruptible(holdoff * HZ); @@ -420,9 +418,7 @@ retry: started = true; if (!done && i >= MIN_MEAS) { done = true; - sp.sched_priority = 0; - sched_setscheduler_nocheck(current, - SCHED_NORMAL, &sp); + sched_set_normal(current, 0); pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n", perf_type, PERF_FLAG, me, MIN_MEAS); if (atomic_inc_return(&n_rcu_perf_writer_finished) >= -- cgit v1.2.3 From ce1c8afd3f3119ddaddcdff27579f5723f55c75e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Apr 2020 12:09:13 +0200 Subject: sched,rcutorture: Convert to sched_set_fifo_low() Because SCHED_FIFO is a broken scheduler model (see previous patches) take away the priority field, the kernel can't possibly make an informed decision. Effectively no change. Cc: paulmck@kernel.org Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Reviewed-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index efb792e13fca..bbc3c8a9e7b9 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -889,13 +889,11 @@ static int rcu_torture_boost(void *arg) unsigned long endtime; unsigned long oldstarttime; struct rcu_boost_inflight rbi = { .inflight = 0 }; - struct sched_param sp; VERBOSE_TOROUT_STRING("rcu_torture_boost started"); /* Set real-time priority. */ - sp.sched_priority = 1; - if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { + if (sched_set_fifo_low(current) < 0) { VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!"); n_rcu_torture_boost_rterror++; } -- cgit v1.2.3 From 2cca5426b9c108998bc03230cd6ae440f3e205ed Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Apr 2020 12:09:13 +0200 Subject: sched,psi: Convert to sched_set_fifo_low() Because SCHED_FIFO is a broken scheduler model (see previous patches) take away the priority field, the kernel can't possibly make an informed decision. Effectively no change. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Acked-by: Johannes Weiner --- kernel/sched/psi.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index e53b711bd643..967732c0766c 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -616,11 +616,8 @@ out: static int psi_poll_worker(void *data) { struct psi_group *group = (struct psi_group *)data; - struct sched_param param = { - .sched_priority = 1, - }; - sched_setscheduler_nocheck(current, SCHED_FIFO, ¶m); + sched_set_fifo_low(current); while (true) { wait_event_interruptible(group->poll_wait, -- cgit v1.2.3 From 616d91b68cd56bcb1954b6a5af7d542401fde772 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Apr 2020 12:09:13 +0200 Subject: sched: Remove sched_setscheduler*() EXPORTs Now that nothing (modular) still uses sched_setscheduler(), remove the exports. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar --- kernel/sched/core.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 40d3939b0520..f882d3d74dad 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5135,13 +5135,11 @@ int sched_setscheduler(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, true); } -EXPORT_SYMBOL_GPL(sched_setscheduler); int sched_setattr(struct task_struct *p, const struct sched_attr *attr) { return __sched_setscheduler(p, attr, true, true); } -EXPORT_SYMBOL_GPL(sched_setattr); int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) { @@ -5166,7 +5164,6 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, false); } -EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); /* * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally -- cgit v1.2.3 From 8b700983de82f79e05b2c1136d6513ea4c9b22c4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Apr 2020 13:10:04 +0200 Subject: sched: Remove sched_set_*() return value Ingo suggested that since the new sched_set_*() functions are implemented using the 'nocheck' variants, they really shouldn't ever fail, so remove the return value. Cc: axboe@kernel.dk Cc: daniel.lezcano@linaro.org Cc: sudeep.holla@arm.com Cc: airlied@redhat.com Cc: broonie@kernel.org Cc: paulmck@kernel.org Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar --- drivers/block/drbd/drbd_receiver.c | 4 +--- drivers/firmware/psci/psci_checker.c | 3 +-- drivers/gpu/drm/msm/msm_drv.c | 5 +---- drivers/platform/chrome/cros_ec_spi.c | 7 +++---- include/linux/sched.h | 6 +++--- kernel/rcu/rcutorture.c | 5 +---- kernel/sched/core.c | 12 ++++++------ 7 files changed, 16 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 140fd98274b1..280615efef74 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -6020,9 +6020,7 @@ int drbd_ack_receiver(struct drbd_thread *thi) int expect = header_size; bool ping_timeout_active = false; - rv = sched_set_fifo_low(current); - if (rv < 0) - drbd_err(connection, "drbd_ack_receiver: ERROR set priority, ret=%d\n", rv); + sched_set_fifo_low(current); while (get_t_state(thi) == RUNNING) { drbd_thread_current_set_cpu(thi); diff --git a/drivers/firmware/psci/psci_checker.c b/drivers/firmware/psci/psci_checker.c index a5279a430274..6fff482847e7 100644 --- a/drivers/firmware/psci/psci_checker.c +++ b/drivers/firmware/psci/psci_checker.c @@ -281,8 +281,7 @@ static int suspend_test_thread(void *arg) wait_for_completion(&suspend_threads_started); /* Set maximum priority to preempt all other threads on this CPU. */ - if (sched_set_fifo(current)) - pr_warn("Failed to set suspend thread scheduler on CPU %d\n", cpu); + sched_set_fifo(current); dev = this_cpu_read(cpuidle_devices); drv = cpuidle_get_cpu_driver(dev); diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 89a8b9c7e044..556cca38487c 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -509,10 +509,7 @@ static int msm_drm_init(struct device *dev, struct drm_driver *drv) goto err_msm_uninit; } - ret = sched_set_fifo(priv->event_thread[i].thread); - if (ret) - dev_warn(dev, "event_thread set priority failed:%d\n", - ret); + sched_set_fifo(priv->event_thread[i].thread); } ret = drm_vblank_init(ddev, priv->num_crtcs); diff --git a/drivers/platform/chrome/cros_ec_spi.c b/drivers/platform/chrome/cros_ec_spi.c index c20a43a97040..d09260382550 100644 --- a/drivers/platform/chrome/cros_ec_spi.c +++ b/drivers/platform/chrome/cros_ec_spi.c @@ -725,10 +725,9 @@ static int cros_ec_spi_devm_high_pri_alloc(struct device *dev, if (err) return err; - err = sched_set_fifo(ec_spi->high_pri_worker->task); - if (err) - dev_err(dev, "Can't set cros_ec high pri priority: %d\n", err); - return err; + sched_set_fifo(ec_spi->high_pri_worker->task); + + return 0; } static int cros_ec_spi_probe(struct spi_device *spi) diff --git a/include/linux/sched.h b/include/linux/sched.h index b792b8f0f4cf..ae7664492af2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1653,9 +1653,9 @@ extern int idle_cpu(int cpu); extern int available_idle_cpu(int cpu); extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); -extern int sched_set_fifo(struct task_struct *p); -extern int sched_set_fifo_low(struct task_struct *p); -extern int sched_set_normal(struct task_struct *p, int nice); +extern void sched_set_fifo(struct task_struct *p); +extern void sched_set_fifo_low(struct task_struct *p); +extern void sched_set_normal(struct task_struct *p, int nice); extern int sched_setattr(struct task_struct *, const struct sched_attr *); extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); extern struct task_struct *idle_task(int cpu); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bbc3c8a9e7b9..b4c1146de414 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -893,10 +893,7 @@ static int rcu_torture_boost(void *arg) VERBOSE_TOROUT_STRING("rcu_torture_boost started"); /* Set real-time priority. */ - if (sched_set_fifo_low(current) < 0) { - VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!"); - n_rcu_torture_boost_rterror++; - } + sched_set_fifo_low(current); init_rcu_head_on_stack(&rbi.rcu); /* Each pass through the following loop does one boost-test cycle. */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f882d3d74dad..41d3778ea80e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5183,30 +5183,30 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy, * The administrator _MUST_ configure the system, the kernel simply doesn't * know enough information to make a sensible choice. */ -int sched_set_fifo(struct task_struct *p) +void sched_set_fifo(struct task_struct *p) { struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; - return sched_setscheduler_nocheck(p, SCHED_FIFO, &sp); + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); } EXPORT_SYMBOL_GPL(sched_set_fifo); /* * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. */ -int sched_set_fifo_low(struct task_struct *p) +void sched_set_fifo_low(struct task_struct *p) { struct sched_param sp = { .sched_priority = 1 }; - return sched_setscheduler_nocheck(p, SCHED_FIFO, &sp); + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); } EXPORT_SYMBOL_GPL(sched_set_fifo_low); -int sched_set_normal(struct task_struct *p, int nice) +void sched_set_normal(struct task_struct *p, int nice) { struct sched_attr attr = { .sched_policy = SCHED_NORMAL, .sched_nice = nice, }; - return sched_setattr_nocheck(p, &attr); + WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); } EXPORT_SYMBOL_GPL(sched_set_normal); -- cgit v1.2.3 From 60997c3d45d9a67daf01c56d805ae4fec37e0bd8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 3 Jun 2020 21:48:55 +0200 Subject: close_range: add CLOSE_RANGE_UNSHARE One of the use-cases of close_range() is to drop file descriptors just before execve(). This would usually be expressed in the sequence: unshare(CLONE_FILES); close_range(3, ~0U); as pointed out by Linus it might be desirable to have this be a part of close_range() itself under a new flag CLOSE_RANGE_UNSHARE. This expands {dup,unshare)_fd() to take a max_fds argument that indicates the maximum number of file descriptors to copy from the old struct files. When the user requests that all file descriptors are supposed to be closed via close_range(min, max) then we can cap via unshare_fd(min) and hence don't need to do any of the heavy fput() work for everything above min. The patch makes it so that if CLOSE_RANGE_UNSHARE is requested and we do in fact currently share our file descriptor table we create a new private copy. We then close all fds in the requested range and finally after we're done we install the new fd table. Suggested-by: Linus Torvalds Signed-off-by: Christian Brauner --- fs/file.c | 65 +++++++++++++++++++++++++++++++++++----- fs/open.c | 5 +--- include/linux/fdtable.h | 8 +++-- include/uapi/linux/close_range.h | 9 ++++++ kernel/fork.c | 11 +++---- 5 files changed, 79 insertions(+), 19 deletions(-) create mode 100644 include/uapi/linux/close_range.h (limited to 'kernel') diff --git a/fs/file.c b/fs/file.c index 1b8ff05e8311..340bc9569f9d 100644 --- a/fs/file.c +++ b/fs/file.c @@ -19,6 +19,7 @@ #include #include #include +#include unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; @@ -265,12 +266,22 @@ static unsigned int count_open_files(struct fdtable *fdt) return i; } +static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds) +{ + unsigned int count; + + count = count_open_files(fdt); + if (max_fds < NR_OPEN_DEFAULT) + max_fds = NR_OPEN_DEFAULT; + return min(count, max_fds); +} + /* * Allocate a new files structure and copy contents from the * passed in files structure. * errorp will be valid only when the returned files_struct is NULL. */ -struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) +struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp) { struct files_struct *newf; struct file **old_fds, **new_fds; @@ -297,7 +308,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); - open_files = count_open_files(old_fdt); + open_files = sane_fdtable_size(old_fdt, max_fds); /* * Check whether we need to allocate a larger fd array and fd set. @@ -328,7 +339,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) */ spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); - open_files = count_open_files(old_fdt); + open_files = sane_fdtable_size(old_fdt, max_fds); } copy_fd_bitmaps(new_fdt, old_fdt, open_files); @@ -665,32 +676,72 @@ EXPORT_SYMBOL(__close_fd); /* for ksys_close() */ * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. */ -int __close_range(struct files_struct *files, unsigned fd, unsigned max_fd) +int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) { unsigned int cur_max; + struct task_struct *me = current; + struct files_struct *cur_fds = me->files, *fds = NULL; + + if (flags & ~CLOSE_RANGE_UNSHARE) + return -EINVAL; if (fd > max_fd) return -EINVAL; rcu_read_lock(); - cur_max = files_fdtable(files)->max_fds; + cur_max = files_fdtable(cur_fds)->max_fds; rcu_read_unlock(); /* cap to last valid index into fdtable */ cur_max--; + if (flags & CLOSE_RANGE_UNSHARE) { + int ret; + unsigned int max_unshare_fds = NR_OPEN_MAX; + + /* + * If the requested range is greater than the current maximum, + * we're closing everything so only copy all file descriptors + * beneath the lowest file descriptor. + */ + if (max_fd >= cur_max) + max_unshare_fds = fd; + + ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds); + if (ret) + return ret; + + /* + * We used to share our file descriptor table, and have now + * created a private one, make sure we're using it below. + */ + if (fds) + swap(cur_fds, fds); + } + max_fd = min(max_fd, cur_max); while (fd <= max_fd) { struct file *file; - file = pick_file(files, fd++); + file = pick_file(cur_fds, fd++); if (!file) continue; - filp_close(file, files); + filp_close(file, cur_fds); cond_resched(); } + if (fds) { + /* + * We're done closing the files we were supposed to. Time to install + * the new file descriptor table and drop the old one. + */ + task_lock(me); + me->files = cur_fds; + task_unlock(me); + put_files_struct(fds); + } + return 0; } diff --git a/fs/open.c b/fs/open.c index 073ea3c45347..5e62f18adc5b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1324,10 +1324,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd) SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, unsigned int, flags) { - if (flags) - return -EINVAL; - - return __close_range(current->files, fd, max_fd); + return __close_range(fd, max_fd, flags); } /* diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index fcd07181a365..a32bf47c593e 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -22,6 +22,7 @@ * as this is the granularity returned by copy_fdset(). */ #define NR_OPEN_DEFAULT BITS_PER_LONG +#define NR_OPEN_MAX ~0U struct fdtable { unsigned int max_fds; @@ -109,7 +110,7 @@ struct files_struct *get_files_struct(struct task_struct *); void put_files_struct(struct files_struct *fs); void reset_files_struct(struct files_struct *); int unshare_files(struct files_struct **); -struct files_struct *dup_fd(struct files_struct *, int *) __latent_entropy; +struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy; void do_close_on_exec(struct files_struct *); int iterate_fd(struct files_struct *, unsigned, int (*)(const void *, struct file *, unsigned), @@ -121,9 +122,10 @@ extern void __fd_install(struct files_struct *files, unsigned int fd, struct file *file); extern int __close_fd(struct files_struct *files, unsigned int fd); -extern int __close_range(struct files_struct *files, unsigned int fd, - unsigned int max_fd); +extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); extern int __close_fd_get_file(unsigned int fd, struct file **res); +extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, + struct files_struct **new_fdp); extern struct kmem_cache *files_cachep; diff --git a/include/uapi/linux/close_range.h b/include/uapi/linux/close_range.h new file mode 100644 index 000000000000..6928a9fdee3c --- /dev/null +++ b/include/uapi/linux/close_range.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_CLOSE_RANGE_H +#define _UAPI_LINUX_CLOSE_RANGE_H + +/* Unshare the file descriptor table before closing file descriptors. */ +#define CLOSE_RANGE_UNSHARE (1U << 1) + +#endif /* _UAPI_LINUX_CLOSE_RANGE_H */ + diff --git a/kernel/fork.c b/kernel/fork.c index 142b23645d82..8948121c8454 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1474,7 +1474,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk) goto out; } - newf = dup_fd(oldf, &error); + newf = dup_fd(oldf, NR_OPEN_MAX, &error); if (!newf) goto out; @@ -2907,14 +2907,15 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) /* * Unshare file descriptor table if it is being shared */ -static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) +int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, + struct files_struct **new_fdp) { struct files_struct *fd = current->files; int error = 0; if ((unshare_flags & CLONE_FILES) && (fd && atomic_read(&fd->count) > 1)) { - *new_fdp = dup_fd(fd, &error); + *new_fdp = dup_fd(fd, max_fds, &error); if (!*new_fdp) return error; } @@ -2974,7 +2975,7 @@ int ksys_unshare(unsigned long unshare_flags) err = unshare_fs(unshare_flags, &new_fs); if (err) goto bad_unshare_out; - err = unshare_fd(unshare_flags, &new_fd); + err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd); if (err) goto bad_unshare_cleanup_fs; err = unshare_userns(unshare_flags, &new_cred); @@ -3063,7 +3064,7 @@ int unshare_files(struct files_struct **displaced) struct files_struct *copy = NULL; int error; - error = unshare_fd(CLONE_FILES, ©); + error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, ©); if (error || !copy) { *displaced = NULL; return error; -- cgit v1.2.3 From bbccc11bc8848926065915e6193fd4c6e33c85ef Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sun, 24 May 2020 15:52:38 -0500 Subject: audit: Use struct_size() helper in alloc_chunk One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct audit_chunk { ... struct node { struct list_head list; struct audit_tree *owner; unsigned index; /* index; upper bit indicates 'will prune' */ } owners[]; }; Make use of the struct_size() helper instead of an open-coded version in order to avoid any potential type mistakes. So, replace the following form: offsetof(struct audit_chunk, owners) + count * sizeof(struct node); with: struct_size(chunk, owners, count) This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Paul Moore --- kernel/audit_tree.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index e49c912f862d..1b7a2f041793 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -188,11 +188,9 @@ static struct fsnotify_mark *alloc_mark(void) static struct audit_chunk *alloc_chunk(int count) { struct audit_chunk *chunk; - size_t size; int i; - size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node); - chunk = kzalloc(size, GFP_KERNEL); + chunk = kzalloc(struct_size(chunk, owners, count), GFP_KERNEL); if (!chunk) return NULL; -- cgit v1.2.3 From 6c6935419e2fd8ef1fcde71c4e50b9095e520900 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 18 Jun 2020 16:46:31 -0700 Subject: bpf: Avoid verifier failure for 32bit pointer arithmetic When do experiments with llvm (disabling instcombine and simplifyCFG), I hit the following error with test_seg6_loop.o. ; R1=pkt(id=0,off=0,r=48,imm=0), R7=pkt(id=0,off=40,r=48,imm=0) w2 = w7 ; R2_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) w2 -= w1 R2 32-bit pointer arithmetic prohibited The corresponding source code is: uint32_t srh_off // srh and skb->data are all packet pointers srh_off = (char *)srh - (char *)(long)skb->data; The verifier does not support 32-bit pointer/scalar arithmetic. Without my llvm change, the code looks like ; R3=pkt(id=0,off=40,r=48,imm=0), R8=pkt(id=0,off=0,r=48,imm=0) w3 -= w8 ; R3_w=inv(id=0) This is explicitly allowed in verifier if both registers are pointers and the opcode is BPF_SUB. To fix this problem, I changed the verifier to allow 32-bit pointer/scaler BPF_SUB operations. At the source level, the issue could be workarounded with inline asm or changing "uint32_t srh_off" to "uint64_t srh_off". But I feel that verifier change might be the right thing to do. Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200618234631.3321118-1-yhs@fb.com --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 34cde841ab68..a1857c4ffaaf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5031,6 +5031,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops on pointers produce (meaningless) scalars */ + if (opcode == BPF_SUB && env->allow_ptr_leaks) { + __mark_reg_unknown(env, dst_reg); + return 0; + } + verbose(env, "R%d 32-bit pointer arithmetic prohibited\n", dst); -- cgit v1.2.3 From 3af8588c77186bf08e55e7281da83d88373481d7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 8 Jun 2020 17:28:50 +0200 Subject: fork: fold legacy_clone_args_valid() into _do_fork() This separate helper only existed to guarantee the mutual exclusivity of CLONE_PIDFD and CLONE_PARENT_SETTID for legacy clone since CLONE_PIDFD abuses the parent_tid field to return the pidfd. But we can actually handle this uniformely thus removing the helper. For legacy clone we can detect that CLONE_PIDFD is specified in conjunction with CLONE_PARENT_SETTID because they will share the same memory which is invalid and for clone3() setting the separate pidfd and parent_tid fields to the same memory is bogus as well. So fold that helper directly into _do_fork() by detecting this case. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Al Viro Cc: Geert Uytterhoeven Cc: "Matthew Wilcox (Oracle)" Cc: "Peter Zijlstra (Intel)" Cc: linux-m68k@lists.linux-m68k.org Cc: x86@kernel.org Signed-off-by: Christian Brauner --- arch/m68k/kernel/process.c | 3 --- arch/x86/kernel/sys_ia32.c | 3 --- include/linux/sched/task.h | 1 - kernel/fork.c | 30 ++++++++++++++---------------- 4 files changed, 14 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 90ae376b7ab1..0608439ba452 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -125,9 +125,6 @@ asmlinkage int m68k_clone(struct pt_regs *regs) .tls = regs->d5, }; - if (!legacy_clone_args_valid(&args)) - return -EINVAL; - return _do_fork(&args); } diff --git a/arch/x86/kernel/sys_ia32.c b/arch/x86/kernel/sys_ia32.c index f8d65c99feb8..720cde885042 100644 --- a/arch/x86/kernel/sys_ia32.c +++ b/arch/x86/kernel/sys_ia32.c @@ -251,9 +251,6 @@ COMPAT_SYSCALL_DEFINE5(ia32_clone, unsigned long, clone_flags, .tls = tls_val, }; - if (!legacy_clone_args_valid(&args)) - return -EINVAL; - return _do_fork(&args); } #endif /* CONFIG_IA32_EMULATION */ diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 38359071236a..ddce0ea515d1 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -96,7 +96,6 @@ extern void exit_files(struct task_struct *); extern void exit_itimers(struct signal_struct *); extern long _do_fork(struct kernel_clone_args *kargs); -extern bool legacy_clone_args_valid(const struct kernel_clone_args *kargs); extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); struct task_struct *fork_idle(int); struct mm_struct *copy_init_mm(void); diff --git a/kernel/fork.c b/kernel/fork.c index 142b23645d82..9875aeb2ba41 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2422,6 +2422,20 @@ long _do_fork(struct kernel_clone_args *args) int trace = 0; long nr; + /* + * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument + * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are + * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate + * field in struct clone_args and it still doesn't make sense to have + * them both point at the same memory location. Performing this check + * here has the advantage that we don't need to have a separate helper + * to check for legacy clone(). + */ + if ((args->flags & CLONE_PIDFD) && + (args->flags & CLONE_PARENT_SETTID) && + (args->pidfd == args->parent_tid)) + return -EINVAL; + /* * Determine whether and which event to report to ptracer. When * called from kernel_thread or CLONE_UNTRACED is explicitly @@ -2479,16 +2493,6 @@ long _do_fork(struct kernel_clone_args *args) return nr; } -bool legacy_clone_args_valid(const struct kernel_clone_args *kargs) -{ - /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */ - if ((kargs->flags & CLONE_PIDFD) && - (kargs->flags & CLONE_PARENT_SETTID)) - return false; - - return true; -} - #ifndef CONFIG_HAVE_COPY_THREAD_TLS /* For compatibility with architectures that call do_fork directly rather than * using the syscall entry points below. */ @@ -2508,9 +2512,6 @@ long do_fork(unsigned long clone_flags, .stack_size = stack_size, }; - if (!legacy_clone_args_valid(&args)) - return -EINVAL; - return _do_fork(&args); } #endif @@ -2593,9 +2594,6 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, .tls = tls, }; - if (!legacy_clone_args_valid(&args)) - return -EINVAL; - return _do_fork(&args); } #endif -- cgit v1.2.3 From a2d0d62f4d9e3546134ba08e102ca7decd4ed836 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 19 Jun 2020 14:11:41 -0700 Subject: bpf: Switch btf_parse_vmlinux to btf_find_by_name_kind btf_parse_vmlinux() implements manual search for struct bpf_ctx_convert since at the time of implementing btf_find_by_name_kind() was not available. Later btf_find_by_name_kind() was introduced in 27ae7997a661 ("bpf: Introduce BPF_PROG_TYPE_STRUCT_OPS"). It provides similar search functionality and can be leveraged in btf_parse_vmlinux(). Do it. Signed-off-by: Andrey Ignatov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: John Fastabend Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/6e12d5c3e8a3d552925913ef73a695dd1bb27800.1592600985.git.rdna@fb.com --- kernel/bpf/btf.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 58c9af1d4808..3eb804618a53 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3591,7 +3591,7 @@ struct btf *btf_parse_vmlinux(void) struct btf_verifier_env *env = NULL; struct bpf_verifier_log *log; struct btf *btf = NULL; - int err, i; + int err, btf_id; env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); if (!env) @@ -3625,24 +3625,13 @@ struct btf *btf_parse_vmlinux(void) goto errout; /* find struct bpf_ctx_convert for type checking later */ - for (i = 1; i <= btf->nr_types; i++) { - const struct btf_type *t; - const char *tname; - - t = btf_type_by_id(btf, i); - if (!__btf_type_is_struct(t)) - continue; - tname = __btf_name_by_offset(btf, t->name_off); - if (!strcmp(tname, "bpf_ctx_convert")) { - /* btf_parse_vmlinux() runs under bpf_verifier_lock */ - bpf_ctx_convert.t = t; - break; - } - } - if (i > btf->nr_types) { - err = -ENOENT; + btf_id = btf_find_by_name_kind(btf, "bpf_ctx_convert", BTF_KIND_STRUCT); + if (btf_id < 0) { + err = btf_id; goto errout; } + /* btf_parse_vmlinux() runs under bpf_verifier_lock */ + bpf_ctx_convert.t = btf_type_by_id(btf, btf_id); bpf_struct_ops_init(btf, log); -- cgit v1.2.3 From 41c48f3a98231738c5ce79f6f2aa6e40ba924d18 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 19 Jun 2020 14:11:43 -0700 Subject: bpf: Support access to bpf map fields There are multiple use-cases when it's convenient to have access to bpf map fields, both `struct bpf_map` and map type specific struct-s such as `struct bpf_array`, `struct bpf_htab`, etc. For example while working with sock arrays it can be necessary to calculate the key based on map->max_entries (some_hash % max_entries). Currently this is solved by communicating max_entries via "out-of-band" channel, e.g. via additional map with known key to get info about target map. That works, but is not very convenient and error-prone while working with many maps. In other cases necessary data is dynamic (i.e. unknown at loading time) and it's impossible to get it at all. For example while working with a hash table it can be convenient to know how much capacity is already used (bpf_htab.count.counter for BPF_F_NO_PREALLOC case). At the same time kernel knows this info and can provide it to bpf program. Fill this gap by adding support to access bpf map fields from bpf program for both `struct bpf_map` and map type specific fields. Support is implemented via btf_struct_access() so that a user can define their own `struct bpf_map` or map type specific struct in their program with only necessary fields and preserve_access_index attribute, cast a map to this struct and use a field. For example: struct bpf_map { __u32 max_entries; } __attribute__((preserve_access_index)); struct bpf_array { struct bpf_map map; __u32 elem_size; } __attribute__((preserve_access_index)); struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 4); __type(key, __u32); __type(value, __u32); } m_array SEC(".maps"); SEC("cgroup_skb/egress") int cg_skb(void *ctx) { struct bpf_array *array = (struct bpf_array *)&m_array; struct bpf_map *map = (struct bpf_map *)&m_array; /* .. use map->max_entries or array->map.max_entries .. */ } Similarly to other btf_struct_access() use-cases (e.g. struct tcp_sock in net/ipv4/bpf_tcp_ca.c) the patch allows access to any fields of corresponding struct. Only reading from map fields is supported. For btf_struct_access() to work there should be a way to know btf id of a struct that corresponds to a map type. To get btf id there should be a way to get a stringified name of map-specific struct, such as "bpf_array", "bpf_htab", etc for a map type. Two new fields are added to `struct bpf_map_ops` to handle it: * .map_btf_name keeps a btf name of a struct returned by map_alloc(); * .map_btf_id is used to cache btf id of that struct. To make btf ids calculation cheaper they're calculated once while preparing btf_vmlinux and cached same way as it's done for btf_id field of `struct bpf_func_proto` While calculating btf ids, struct names are NOT checked for collision. Collisions will be checked as a part of the work to prepare btf ids used in verifier in compile time that should land soon. The only known collision for `struct bpf_htab` (kernel/bpf/hashtab.c vs net/core/sock_map.c) was fixed earlier. Both new fields .map_btf_name and .map_btf_id must be set for a map type for the feature to work. If neither is set for a map type, verifier will return ENOTSUPP on a try to access map_ptr of corresponding type. If just one of them set, it's verifier misconfiguration. Only `struct bpf_array` for BPF_MAP_TYPE_ARRAY and `struct bpf_htab` for BPF_MAP_TYPE_HASH are supported by this patch. Other map types will be supported separately. The feature is available only for CONFIG_DEBUG_INFO_BTF=y and gated by perfmon_capable() so that unpriv programs won't have access to bpf map fields. Signed-off-by: Andrey Ignatov Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/6479686a0cd1e9067993df57b4c3eef0e276fec9.1592600985.git.rdna@fb.com --- include/linux/bpf.h | 9 +++ include/linux/bpf_verifier.h | 1 + kernel/bpf/arraymap.c | 3 + kernel/bpf/btf.c | 40 +++++++++++ kernel/bpf/hashtab.c | 3 + kernel/bpf/verifier.c | 82 +++++++++++++++++++--- .../selftests/bpf/verifier/map_ptr_mixing.c | 2 +- 7 files changed, 131 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 07052d44bca1..1e1501ee53ce 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -92,6 +92,10 @@ struct bpf_map_ops { int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma); __poll_t (*map_poll)(struct bpf_map *map, struct file *filp, struct poll_table_struct *pts); + + /* BTF name and id of struct allocated by map_alloc */ + const char * const map_btf_name; + int *map_btf_id; }; struct bpf_map_memory { @@ -1109,6 +1113,11 @@ static inline bool bpf_allow_ptr_leaks(void) return perfmon_capable(); } +static inline bool bpf_allow_ptr_to_map_access(void) +{ + return perfmon_capable(); +} + static inline bool bpf_bypass_spec_v1(void) { return perfmon_capable(); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index ca08db4ffb5f..53c7bd568c5d 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -379,6 +379,7 @@ struct bpf_verifier_env { u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; + bool allow_ptr_to_map_access; bool bpf_capable; bool bypass_spec_v1; bool bypass_spec_v4; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 11584618e861..e7caa48812fb 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -494,6 +494,7 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) vma->vm_pgoff + pgoff); } +static int array_map_btf_id; const struct bpf_map_ops array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, @@ -510,6 +511,8 @@ const struct bpf_map_ops array_map_ops = { .map_check_btf = array_map_check_btf, .map_lookup_batch = generic_map_lookup_batch, .map_update_batch = generic_map_update_batch, + .map_btf_name = "bpf_array", + .map_btf_id = &array_map_btf_id, }; const struct bpf_map_ops percpu_array_map_ops = { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 3eb804618a53..e377d1981730 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3571,6 +3571,41 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, return ctx_type; } +static const struct bpf_map_ops * const btf_vmlinux_map_ops[] = { +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) +#define BPF_LINK_TYPE(_id, _name) +#define BPF_MAP_TYPE(_id, _ops) \ + [_id] = &_ops, +#include +#undef BPF_PROG_TYPE +#undef BPF_LINK_TYPE +#undef BPF_MAP_TYPE +}; + +static int btf_vmlinux_map_ids_init(const struct btf *btf, + struct bpf_verifier_log *log) +{ + const struct bpf_map_ops *ops; + int i, btf_id; + + for (i = 0; i < ARRAY_SIZE(btf_vmlinux_map_ops); ++i) { + ops = btf_vmlinux_map_ops[i]; + if (!ops || (!ops->map_btf_name && !ops->map_btf_id)) + continue; + if (!ops->map_btf_name || !ops->map_btf_id) { + bpf_log(log, "map type %d is misconfigured\n", i); + return -EINVAL; + } + btf_id = btf_find_by_name_kind(btf, ops->map_btf_name, + BTF_KIND_STRUCT); + if (btf_id < 0) + return btf_id; + *ops->map_btf_id = btf_id; + } + + return 0; +} + static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *t, @@ -3633,6 +3668,11 @@ struct btf *btf_parse_vmlinux(void) /* btf_parse_vmlinux() runs under bpf_verifier_lock */ bpf_ctx_convert.t = btf_type_by_id(btf, btf_id); + /* find bpf map structs for map_ptr access checking */ + err = btf_vmlinux_map_ids_init(btf, log); + if (err < 0) + goto errout; + bpf_struct_ops_init(btf, log); btf_verifier_env_free(env); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b4b288a3c3c9..2c5999e02060 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1614,6 +1614,7 @@ htab_lru_map_lookup_and_delete_batch(struct bpf_map *map, true, false); } +static int htab_map_btf_id; const struct bpf_map_ops htab_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1625,6 +1626,8 @@ const struct bpf_map_ops htab_map_ops = { .map_gen_lookup = htab_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, BATCH_OPS(htab), + .map_btf_name = "bpf_htab", + .map_btf_id = &htab_map_btf_id, }; const struct bpf_map_ops htab_lru_map_ops = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a1857c4ffaaf..7460f967cb75 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1351,6 +1351,19 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, __mark_reg_not_init(env, regs + regno); } +static void mark_btf_ld_reg(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno, + enum bpf_reg_type reg_type, u32 btf_id) +{ + if (reg_type == SCALAR_VALUE) { + mark_reg_unknown(env, regs, regno); + return; + } + mark_reg_known_zero(env, regs, regno); + regs[regno].type = PTR_TO_BTF_ID; + regs[regno].btf_id = btf_id; +} + #define DEF_NOT_SUBREG (0) static void init_reg_state(struct bpf_verifier_env *env, struct bpf_func_state *state) @@ -3182,19 +3195,68 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (ret < 0) return ret; - if (atype == BPF_READ && value_regno >= 0) { - if (ret == SCALAR_VALUE) { - mark_reg_unknown(env, regs, value_regno); - return 0; - } - mark_reg_known_zero(env, regs, value_regno); - regs[value_regno].type = PTR_TO_BTF_ID; - regs[value_regno].btf_id = btf_id; + if (atype == BPF_READ && value_regno >= 0) + mark_btf_ld_reg(env, regs, value_regno, ret, btf_id); + + return 0; +} + +static int check_ptr_to_map_access(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, + int regno, int off, int size, + enum bpf_access_type atype, + int value_regno) +{ + struct bpf_reg_state *reg = regs + regno; + struct bpf_map *map = reg->map_ptr; + const struct btf_type *t; + const char *tname; + u32 btf_id; + int ret; + + if (!btf_vmlinux) { + verbose(env, "map_ptr access not supported without CONFIG_DEBUG_INFO_BTF\n"); + return -ENOTSUPP; + } + + if (!map->ops->map_btf_id || !*map->ops->map_btf_id) { + verbose(env, "map_ptr access not supported for map type %d\n", + map->map_type); + return -ENOTSUPP; + } + + t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id); + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + + if (!env->allow_ptr_to_map_access) { + verbose(env, + "%s access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n", + tname); + return -EPERM; } + if (off < 0) { + verbose(env, "R%d is %s invalid negative access: off=%d\n", + regno, tname, off); + return -EACCES; + } + + if (atype != BPF_READ) { + verbose(env, "only read from %s is supported\n", tname); + return -EACCES; + } + + ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id); + if (ret < 0) + return ret; + + if (value_regno >= 0) + mark_btf_ld_reg(env, regs, value_regno, ret, btf_id); + return 0; } + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -3363,6 +3425,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (reg->type == PTR_TO_BTF_ID) { err = check_ptr_to_btf_access(env, regs, regno, off, size, t, value_regno); + } else if (reg->type == CONST_PTR_TO_MAP) { + err = check_ptr_to_map_access(env, regs, regno, off, size, t, + value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -10951,6 +11016,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->strict_alignment = false; env->allow_ptr_leaks = bpf_allow_ptr_leaks(); + env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access(); env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); env->bpf_capable = bpf_capable(); diff --git a/tools/testing/selftests/bpf/verifier/map_ptr_mixing.c b/tools/testing/selftests/bpf/verifier/map_ptr_mixing.c index cd26ee6b7b1d..1f2b8c4cb26d 100644 --- a/tools/testing/selftests/bpf/verifier/map_ptr_mixing.c +++ b/tools/testing/selftests/bpf/verifier/map_ptr_mixing.c @@ -56,7 +56,7 @@ .fixup_map_in_map = { 16 }, .fixup_map_array_48b = { 13 }, .result = REJECT, - .errstr = "R0 invalid mem access 'map_ptr'", + .errstr = "only read from bpf_array is supported", }, { "cond: two branches returning different map pointers for lookup (tail, tail)", -- cgit v1.2.3 From 2872e9ac33a4440173418147351ed4f93177e763 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 19 Jun 2020 14:11:44 -0700 Subject: bpf: Set map_btf_{name, id} for all map types Set map_btf_name and map_btf_id for all map types so that map fields can be accessed by bpf programs. Signed-off-by: Andrey Ignatov Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/a825f808f22af52b018dbe82f1c7d29dab5fc978.1592600985.git.rdna@fb.com --- kernel/bpf/arraymap.c | 15 +++++++++++++++ kernel/bpf/bpf_struct_ops.c | 3 +++ kernel/bpf/cpumap.c | 3 +++ kernel/bpf/devmap.c | 6 ++++++ kernel/bpf/hashtab.c | 12 ++++++++++++ kernel/bpf/local_storage.c | 3 +++ kernel/bpf/lpm_trie.c | 3 +++ kernel/bpf/queue_stack_maps.c | 6 ++++++ kernel/bpf/reuseport_array.c | 3 +++ kernel/bpf/ringbuf.c | 3 +++ kernel/bpf/stackmap.c | 3 +++ net/core/bpf_sk_storage.c | 3 +++ net/core/sock_map.c | 6 ++++++ net/xdp/xskmap.c | 3 +++ 14 files changed, 72 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index e7caa48812fb..ec5cd11032aa 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -515,6 +515,7 @@ const struct bpf_map_ops array_map_ops = { .map_btf_id = &array_map_btf_id, }; +static int percpu_array_map_btf_id; const struct bpf_map_ops percpu_array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, @@ -525,6 +526,8 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_delete_elem = array_map_delete_elem, .map_seq_show_elem = percpu_array_map_seq_show_elem, .map_check_btf = array_map_check_btf, + .map_btf_name = "bpf_array", + .map_btf_id = &percpu_array_map_btf_id, }; static int fd_array_map_alloc_check(union bpf_attr *attr) @@ -871,6 +874,7 @@ static void prog_array_map_free(struct bpf_map *map) fd_array_map_free(map); } +static int prog_array_map_btf_id; const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = prog_array_map_alloc, @@ -886,6 +890,8 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = prog_array_map_clear, .map_seq_show_elem = prog_array_map_seq_show_elem, + .map_btf_name = "bpf_array", + .map_btf_id = &prog_array_map_btf_id, }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, @@ -964,6 +970,7 @@ static void perf_event_fd_array_release(struct bpf_map *map, rcu_read_unlock(); } +static int perf_event_array_map_btf_id; const struct bpf_map_ops perf_event_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, @@ -975,6 +982,8 @@ const struct bpf_map_ops perf_event_array_map_ops = { .map_fd_put_ptr = perf_event_fd_array_put_ptr, .map_release = perf_event_fd_array_release, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_array", + .map_btf_id = &perf_event_array_map_btf_id, }; #ifdef CONFIG_CGROUPS @@ -997,6 +1006,7 @@ static void cgroup_fd_array_free(struct bpf_map *map) fd_array_map_free(map); } +static int cgroup_array_map_btf_id; const struct bpf_map_ops cgroup_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, @@ -1007,6 +1017,8 @@ const struct bpf_map_ops cgroup_array_map_ops = { .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_array", + .map_btf_id = &cgroup_array_map_btf_id, }; #endif @@ -1080,6 +1092,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, return insn - insn_buf; } +static int array_of_maps_map_btf_id; const struct bpf_map_ops array_of_maps_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_of_map_alloc, @@ -1092,4 +1105,6 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = array_of_map_gen_lookup, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_array", + .map_btf_id = &array_of_maps_map_btf_id, }; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index c6b0decaa46a..969c5d47f81f 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -611,6 +611,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) return map; } +static int bpf_struct_ops_map_btf_id; const struct bpf_map_ops bpf_struct_ops_map_ops = { .map_alloc_check = bpf_struct_ops_map_alloc_check, .map_alloc = bpf_struct_ops_map_alloc, @@ -620,6 +621,8 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = { .map_delete_elem = bpf_struct_ops_map_delete_elem, .map_update_elem = bpf_struct_ops_map_update_elem, .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem, + .map_btf_name = "bpf_struct_ops_map", + .map_btf_id = &bpf_struct_ops_map_btf_id, }; /* "const void *" because some subsystem is diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 27595fc6da56..bd8658055c16 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -543,6 +543,7 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +static int cpu_map_btf_id; const struct bpf_map_ops cpu_map_ops = { .map_alloc = cpu_map_alloc, .map_free = cpu_map_free, @@ -551,6 +552,8 @@ const struct bpf_map_ops cpu_map_ops = { .map_lookup_elem = cpu_map_lookup_elem, .map_get_next_key = cpu_map_get_next_key, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_cpu_map", + .map_btf_id = &cpu_map_btf_id, }; static int bq_flush_to_queue(struct xdp_bulk_queue *bq) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 0cbb72cdaf63..58acc46861ef 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -747,6 +747,7 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, map, key, value, map_flags); } +static int dev_map_btf_id; const struct bpf_map_ops dev_map_ops = { .map_alloc = dev_map_alloc, .map_free = dev_map_free, @@ -755,8 +756,11 @@ const struct bpf_map_ops dev_map_ops = { .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_dtab", + .map_btf_id = &dev_map_btf_id, }; +static int dev_map_hash_map_btf_id; const struct bpf_map_ops dev_map_hash_ops = { .map_alloc = dev_map_alloc, .map_free = dev_map_free, @@ -765,6 +769,8 @@ const struct bpf_map_ops dev_map_hash_ops = { .map_update_elem = dev_map_hash_update_elem, .map_delete_elem = dev_map_hash_delete_elem, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_dtab", + .map_btf_id = &dev_map_hash_map_btf_id, }; static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 2c5999e02060..acd06081d81d 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1630,6 +1630,7 @@ const struct bpf_map_ops htab_map_ops = { .map_btf_id = &htab_map_btf_id, }; +static int htab_lru_map_btf_id; const struct bpf_map_ops htab_lru_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1642,6 +1643,8 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_gen_lookup = htab_lru_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, BATCH_OPS(htab_lru), + .map_btf_name = "bpf_htab", + .map_btf_id = &htab_lru_map_btf_id, }; /* Called from eBPF program */ @@ -1746,6 +1749,7 @@ static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +static int htab_percpu_map_btf_id; const struct bpf_map_ops htab_percpu_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1756,8 +1760,11 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_delete_elem = htab_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, BATCH_OPS(htab_percpu), + .map_btf_name = "bpf_htab", + .map_btf_id = &htab_percpu_map_btf_id, }; +static int htab_lru_percpu_map_btf_id; const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1768,6 +1775,8 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_delete_elem = htab_lru_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, BATCH_OPS(htab_lru_percpu), + .map_btf_name = "bpf_htab", + .map_btf_id = &htab_lru_percpu_map_btf_id, }; static int fd_htab_map_alloc_check(union bpf_attr *attr) @@ -1890,6 +1899,7 @@ static void htab_of_map_free(struct bpf_map *map) fd_htab_map_free(map); } +static int htab_of_maps_map_btf_id; const struct bpf_map_ops htab_of_maps_map_ops = { .map_alloc_check = fd_htab_map_alloc_check, .map_alloc = htab_of_map_alloc, @@ -1902,4 +1912,6 @@ const struct bpf_map_ops htab_of_maps_map_ops = { .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = htab_of_map_gen_lookup, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_htab", + .map_btf_id = &htab_of_maps_map_btf_id, }; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 33d01866bcc2..51bd5a8cb01b 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -409,6 +409,7 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *_key, rcu_read_unlock(); } +static int cgroup_storage_map_btf_id; const struct bpf_map_ops cgroup_storage_map_ops = { .map_alloc = cgroup_storage_map_alloc, .map_free = cgroup_storage_map_free, @@ -418,6 +419,8 @@ const struct bpf_map_ops cgroup_storage_map_ops = { .map_delete_elem = cgroup_storage_delete_elem, .map_check_btf = cgroup_storage_check_btf, .map_seq_show_elem = cgroup_storage_seq_show_elem, + .map_btf_name = "bpf_cgroup_storage_map", + .map_btf_id = &cgroup_storage_map_btf_id, }; int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index c8cc4e4cf98d..1abd4e3f906d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -735,6 +735,7 @@ static int trie_check_btf(const struct bpf_map *map, -EINVAL : 0; } +static int trie_map_btf_id; const struct bpf_map_ops trie_map_ops = { .map_alloc = trie_alloc, .map_free = trie_free, @@ -743,4 +744,6 @@ const struct bpf_map_ops trie_map_ops = { .map_update_elem = trie_update_elem, .map_delete_elem = trie_delete_elem, .map_check_btf = trie_check_btf, + .map_btf_name = "lpm_trie", + .map_btf_id = &trie_map_btf_id, }; diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 05c8e043b9d2..80c66a6d7c54 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -262,6 +262,7 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, return -EINVAL; } +static int queue_map_btf_id; const struct bpf_map_ops queue_map_ops = { .map_alloc_check = queue_stack_map_alloc_check, .map_alloc = queue_stack_map_alloc, @@ -273,8 +274,11 @@ const struct bpf_map_ops queue_map_ops = { .map_pop_elem = queue_map_pop_elem, .map_peek_elem = queue_map_peek_elem, .map_get_next_key = queue_stack_map_get_next_key, + .map_btf_name = "bpf_queue_stack", + .map_btf_id = &queue_map_btf_id, }; +static int stack_map_btf_id; const struct bpf_map_ops stack_map_ops = { .map_alloc_check = queue_stack_map_alloc_check, .map_alloc = queue_stack_map_alloc, @@ -286,4 +290,6 @@ const struct bpf_map_ops stack_map_ops = { .map_pop_elem = stack_map_pop_elem, .map_peek_elem = stack_map_peek_elem, .map_get_next_key = queue_stack_map_get_next_key, + .map_btf_name = "bpf_queue_stack", + .map_btf_id = &stack_map_btf_id, }; diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 21cde24386db..a09922f656e4 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -345,6 +345,7 @@ static int reuseport_array_get_next_key(struct bpf_map *map, void *key, return 0; } +static int reuseport_array_map_btf_id; const struct bpf_map_ops reuseport_array_ops = { .map_alloc_check = reuseport_array_alloc_check, .map_alloc = reuseport_array_alloc, @@ -352,4 +353,6 @@ const struct bpf_map_ops reuseport_array_ops = { .map_lookup_elem = reuseport_array_lookup_elem, .map_get_next_key = reuseport_array_get_next_key, .map_delete_elem = reuseport_array_delete_elem, + .map_btf_name = "reuseport_array", + .map_btf_id = &reuseport_array_map_btf_id, }; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 180414bb0d3e..dbf37aff4827 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -294,6 +294,7 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, return 0; } +static int ringbuf_map_btf_id; const struct bpf_map_ops ringbuf_map_ops = { .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, @@ -303,6 +304,8 @@ const struct bpf_map_ops ringbuf_map_ops = { .map_update_elem = ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, .map_get_next_key = ringbuf_map_get_next_key, + .map_btf_name = "bpf_ringbuf_map", + .map_btf_id = &ringbuf_map_btf_id, }; /* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 599488f25e40..27dc9b1b08a5 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -613,6 +613,7 @@ static void stack_map_free(struct bpf_map *map) put_callchain_buffers(); } +static int stack_trace_map_btf_id; const struct bpf_map_ops stack_trace_map_ops = { .map_alloc = stack_map_alloc, .map_free = stack_map_free, @@ -621,6 +622,8 @@ const struct bpf_map_ops stack_trace_map_ops = { .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_stack_map", + .map_btf_id = &stack_trace_map_btf_id, }; static int __init stack_map_init(void) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 1dae4b543243..6f921c4ddc2c 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -919,6 +919,7 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) return -ENOENT; } +static int sk_storage_map_btf_id; const struct bpf_map_ops sk_storage_map_ops = { .map_alloc_check = bpf_sk_storage_map_alloc_check, .map_alloc = bpf_sk_storage_map_alloc, @@ -928,6 +929,8 @@ const struct bpf_map_ops sk_storage_map_ops = { .map_update_elem = bpf_fd_sk_storage_update_elem, .map_delete_elem = bpf_fd_sk_storage_delete_elem, .map_check_btf = bpf_sk_storage_map_check_btf, + .map_btf_name = "bpf_sk_storage_map", + .map_btf_id = &sk_storage_map_btf_id, }; const struct bpf_func_proto bpf_sk_storage_get_proto = { diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 2b884f2d562a..4c1123c749bb 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -643,6 +643,7 @@ const struct bpf_func_proto bpf_msg_redirect_map_proto = { .arg4_type = ARG_ANYTHING, }; +static int sock_map_btf_id; const struct bpf_map_ops sock_map_ops = { .map_alloc = sock_map_alloc, .map_free = sock_map_free, @@ -653,6 +654,8 @@ const struct bpf_map_ops sock_map_ops = { .map_lookup_elem = sock_map_lookup, .map_release_uref = sock_map_release_progs, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_stab", + .map_btf_id = &sock_map_btf_id, }; struct bpf_shtab_elem { @@ -1176,6 +1179,7 @@ const struct bpf_func_proto bpf_msg_redirect_hash_proto = { .arg4_type = ARG_ANYTHING, }; +static int sock_hash_map_btf_id; const struct bpf_map_ops sock_hash_ops = { .map_alloc = sock_hash_alloc, .map_free = sock_hash_free, @@ -1186,6 +1190,8 @@ const struct bpf_map_ops sock_hash_ops = { .map_lookup_elem_sys_only = sock_hash_lookup_sys, .map_release_uref = sock_hash_release_progs, .map_check_btf = map_check_no_btf, + .map_btf_name = "bpf_shtab", + .map_btf_id = &sock_hash_map_btf_id, }; static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 1dc7208c71ba..8367adbbe9df 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -254,6 +254,7 @@ void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, spin_unlock_bh(&map->lock); } +static int xsk_map_btf_id; const struct bpf_map_ops xsk_map_ops = { .map_alloc = xsk_map_alloc, .map_free = xsk_map_free, @@ -264,4 +265,6 @@ const struct bpf_map_ops xsk_map_ops = { .map_update_elem = xsk_map_update_elem, .map_delete_elem = xsk_map_delete_elem, .map_check_btf = map_check_no_btf, + .map_btf_name = "xsk_map", + .map_btf_id = &xsk_map_btf_id, }; -- cgit v1.2.3 From 8e6cf365e1d5c70e275a77a3c5ad7e3dc685474c Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 4 Jun 2020 09:20:49 -0400 Subject: audit: log nftables configuration change events iptables, ip6tables, arptables and ebtables table registration, replacement and unregistration configuration events are logged for the native (legacy) iptables setsockopt api, but not for the nftables netlink api which is used by the nft-variant of iptables in addition to nftables itself. Add calls to log the configuration actions in the nftables netlink api. This uses the same NETFILTER_CFG record format but overloads the table field. type=NETFILTER_CFG msg=audit(2020-05-28 17:46:41.878:162) : table=?:0;?:0 family=unspecified entries=2 op=nft_register_gen pid=396 subj=system_u:system_r:firewalld_t:s0 comm=firewalld ... type=NETFILTER_CFG msg=audit(2020-05-28 17:46:41.878:162) : table=firewalld:1;?:0 family=inet entries=0 op=nft_register_table pid=396 subj=system_u:system_r:firewalld_t:s0 comm=firewalld ... type=NETFILTER_CFG msg=audit(2020-05-28 17:46:41.911:163) : table=firewalld:1;filter_FORWARD:85 family=inet entries=8 op=nft_register_chain pid=396 subj=system_u:system_r:firewalld_t:s0 comm=firewalld ... type=NETFILTER_CFG msg=audit(2020-05-28 17:46:41.911:163) : table=firewalld:1;filter_FORWARD:85 family=inet entries=101 op=nft_register_rule pid=396 subj=system_u:system_r:firewalld_t:s0 comm=firewalld ... type=NETFILTER_CFG msg=audit(2020-05-28 17:46:41.911:163) : table=firewalld:1;__set0:87 family=inet entries=87 op=nft_register_setelem pid=396 subj=system_u:system_r:firewalld_t:s0 comm=firewalld ... type=NETFILTER_CFG msg=audit(2020-05-28 17:46:41.911:163) : table=firewalld:1;__set0:87 family=inet entries=0 op=nft_register_set pid=396 subj=system_u:system_r:firewalld_t:s0 comm=firewalld For further information please see issue https://github.com/linux-audit/audit-kernel/issues/124 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- include/linux/audit.h | 18 ++++++++ kernel/auditsc.c | 24 ++++++++-- net/netfilter/nf_tables_api.c | 103 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 142 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 3fcd9ee49734..604ede630580 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -12,6 +12,7 @@ #include #include #include +#include #define AUDIT_INO_UNSET ((unsigned long)-1) #define AUDIT_DEV_UNSET ((dev_t)-1) @@ -98,6 +99,23 @@ enum audit_nfcfgop { AUDIT_XT_OP_REGISTER, AUDIT_XT_OP_REPLACE, AUDIT_XT_OP_UNREGISTER, + AUDIT_NFT_OP_TABLE_REGISTER, + AUDIT_NFT_OP_TABLE_UNREGISTER, + AUDIT_NFT_OP_CHAIN_REGISTER, + AUDIT_NFT_OP_CHAIN_UNREGISTER, + AUDIT_NFT_OP_RULE_REGISTER, + AUDIT_NFT_OP_RULE_UNREGISTER, + AUDIT_NFT_OP_SET_REGISTER, + AUDIT_NFT_OP_SET_UNREGISTER, + AUDIT_NFT_OP_SETELEM_REGISTER, + AUDIT_NFT_OP_SETELEM_UNREGISTER, + AUDIT_NFT_OP_GEN_REGISTER, + AUDIT_NFT_OP_OBJ_REGISTER, + AUDIT_NFT_OP_OBJ_UNREGISTER, + AUDIT_NFT_OP_OBJ_RESET, + AUDIT_NFT_OP_FLOWTABLE_REGISTER, + AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, + AUDIT_NFT_OP_INVALID, }; extern int is_audit_feature_set(int which); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 468a23390457..3a9100e95fda 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -75,6 +75,7 @@ #include #include #include +#include #include "audit.h" @@ -136,9 +137,26 @@ struct audit_nfcfgop_tab { }; static const struct audit_nfcfgop_tab audit_nfcfgs[] = { - { AUDIT_XT_OP_REGISTER, "register" }, - { AUDIT_XT_OP_REPLACE, "replace" }, - { AUDIT_XT_OP_UNREGISTER, "unregister" }, + { AUDIT_XT_OP_REGISTER, "xt_register" }, + { AUDIT_XT_OP_REPLACE, "xt_replace" }, + { AUDIT_XT_OP_UNREGISTER, "xt_unregister" }, + { AUDIT_NFT_OP_TABLE_REGISTER, "nft_register_table" }, + { AUDIT_NFT_OP_TABLE_UNREGISTER, "nft_unregister_table" }, + { AUDIT_NFT_OP_CHAIN_REGISTER, "nft_register_chain" }, + { AUDIT_NFT_OP_CHAIN_UNREGISTER, "nft_unregister_chain" }, + { AUDIT_NFT_OP_RULE_REGISTER, "nft_register_rule" }, + { AUDIT_NFT_OP_RULE_UNREGISTER, "nft_unregister_rule" }, + { AUDIT_NFT_OP_SET_REGISTER, "nft_register_set" }, + { AUDIT_NFT_OP_SET_UNREGISTER, "nft_unregister_set" }, + { AUDIT_NFT_OP_SETELEM_REGISTER, "nft_register_setelem" }, + { AUDIT_NFT_OP_SETELEM_UNREGISTER, "nft_unregister_setelem" }, + { AUDIT_NFT_OP_GEN_REGISTER, "nft_register_gen" }, + { AUDIT_NFT_OP_OBJ_REGISTER, "nft_register_obj" }, + { AUDIT_NFT_OP_OBJ_UNREGISTER, "nft_unregister_obj" }, + { AUDIT_NFT_OP_OBJ_RESET, "nft_reset_obj" }, + { AUDIT_NFT_OP_FLOWTABLE_REGISTER, "nft_register_flowtable" }, + { AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, "nft_unregister_flowtable" }, + { AUDIT_NFT_OP_INVALID, "nft_invalid" }, }; static int audit_match_perm(struct audit_context *ctx, int mask) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 073aa1051d43..164700273947 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -693,6 +694,16 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) { struct sk_buff *skb; int err; + char *buf = kasprintf(GFP_KERNEL, "%s:%llu;?:0", + ctx->table->name, ctx->table->handle); + + audit_log_nfcfg(buf, + ctx->family, + ctx->table->use, + event == NFT_MSG_NEWTABLE ? + AUDIT_NFT_OP_TABLE_REGISTER : + AUDIT_NFT_OP_TABLE_UNREGISTER); + kfree(buf); if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) @@ -1428,6 +1439,17 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) { struct sk_buff *skb; int err; + char *buf = kasprintf(GFP_KERNEL, "%s:%llu;%s:%llu", + ctx->table->name, ctx->table->handle, + ctx->chain->name, ctx->chain->handle); + + audit_log_nfcfg(buf, + ctx->family, + ctx->chain->use, + event == NFT_MSG_NEWCHAIN ? + AUDIT_NFT_OP_CHAIN_REGISTER : + AUDIT_NFT_OP_CHAIN_UNREGISTER); + kfree(buf); if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) @@ -2693,6 +2715,17 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx, { struct sk_buff *skb; int err; + char *buf = kasprintf(GFP_KERNEL, "%s:%llu;%s:%llu", + ctx->table->name, ctx->table->handle, + ctx->chain->name, ctx->chain->handle); + + audit_log_nfcfg(buf, + ctx->family, + rule->handle, + event == NFT_MSG_NEWRULE ? + AUDIT_NFT_OP_RULE_REGISTER : + AUDIT_NFT_OP_RULE_UNREGISTER); + kfree(buf); if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) @@ -3695,6 +3728,17 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx, struct sk_buff *skb; u32 portid = ctx->portid; int err; + char *buf = kasprintf(gfp_flags, "%s:%llu;%s:%llu", + ctx->table->name, ctx->table->handle, + set->name, set->handle); + + audit_log_nfcfg(buf, + ctx->family, + set->field_count, + event == NFT_MSG_NEWSET ? + AUDIT_NFT_OP_SET_REGISTER : + AUDIT_NFT_OP_SET_UNREGISTER); + kfree(buf); if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) @@ -4811,6 +4855,17 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx, u32 portid = ctx->portid; struct sk_buff *skb; int err; + char *buf = kasprintf(GFP_KERNEL, "%s:%llu;%s:%llu", + ctx->table->name, ctx->table->handle, + set->name, set->handle); + + audit_log_nfcfg(buf, + ctx->family, + set->handle, + event == NFT_MSG_NEWSETELEM ? + AUDIT_NFT_OP_SETELEM_REGISTER : + AUDIT_NFT_OP_SETELEM_UNREGISTER); + kfree(buf); if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) return; @@ -5892,6 +5947,19 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) obj->ops->type->type != filter->type) goto cont; + if (reset) { + char *buf = kasprintf(GFP_KERNEL, + "%s:%llu;?:0", + table->name, + table->handle); + + audit_log_nfcfg(buf, + family, + obj->handle, + AUDIT_NFT_OP_OBJ_RESET); + kfree(buf); + } + if (nf_tables_fill_obj_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, @@ -6002,6 +6070,17 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, if (NFNL_MSG_TYPE(nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) reset = true; + if (reset) { + char *buf = kasprintf(GFP_KERNEL, "%s:%llu;?:0", + table->name, table->handle); + + audit_log_nfcfg(buf, + family, + obj->handle, + AUDIT_NFT_OP_OBJ_RESET); + kfree(buf); + } + err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0, family, table, obj, reset); @@ -6077,6 +6156,16 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, { struct sk_buff *skb; int err; + char *buf = kasprintf(GFP_KERNEL, "%s:%llu;?:0", + table->name, table->handle); + + audit_log_nfcfg(buf, + family, + obj->handle, + event == NFT_MSG_NEWOBJ ? + AUDIT_NFT_OP_OBJ_REGISTER : + AUDIT_NFT_OP_OBJ_UNREGISTER); + kfree(buf); if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) @@ -6856,6 +6945,17 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, { struct sk_buff *skb; int err; + char *buf = kasprintf(GFP_KERNEL, "%s:%llu;%s:%llu", + flowtable->table->name, flowtable->table->handle, + flowtable->name, flowtable->handle); + + audit_log_nfcfg(buf, + ctx->family, + flowtable->hooknum, + event == NFT_MSG_NEWFLOWTABLE ? + AUDIT_NFT_OP_FLOWTABLE_REGISTER : + AUDIT_NFT_OP_FLOWTABLE_UNREGISTER); + kfree(buf); if (ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) @@ -6977,6 +7077,9 @@ static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb, struct sk_buff *skb2; int err; + audit_log_nfcfg("?:0;?:0", 0, net->nft.base_seq, + AUDIT_NFT_OP_GEN_REGISTER); + if (nlmsg_report(nlh) && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) return; -- cgit v1.2.3 From 005e696df65d0ff90468ecf38a50aa584dc82421 Mon Sep 17 00:00:00 2001 From: Alexander Popov Date: Wed, 24 Jun 2020 15:33:26 +0300 Subject: gcc-plugins/stackleak: Don't instrument itself There is no need to try instrumenting functions in kernel/stackleak.c. Otherwise that can cause issues if the cleanup pass of stackleak gcc plugin is disabled. Signed-off-by: Alexander Popov Link: https://lore.kernel.org/r/20200624123330.83226-2-alex.popov@linux.com Signed-off-by: Kees Cook --- kernel/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index f3218bc5ec69..155b5380500a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -125,6 +125,7 @@ obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o +CFLAGS_stackleak.o += $(DISABLE_STACKLEAK_PLUGIN) obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o KASAN_SANITIZE_stackleak.o := n KCSAN_SANITIZE_stackleak.o := n -- cgit v1.2.3 From feee1b8c490821f29aae416a5422795f5a29263d Mon Sep 17 00:00:00 2001 From: Alexander Popov Date: Wed, 24 Jun 2020 15:33:29 +0300 Subject: gcc-plugins/stackleak: Use asm instrumentation to avoid useless register saving The kernel code instrumentation in stackleak gcc plugin works in two stages. At first, stack tracking is added to GIMPLE representation of every function (except some special cases). And later, when stack frame size info is available, stack tracking is removed from the RTL representation of the functions with small stack frame. There is an unwanted side-effect for these functions: some of them do useless work with caller-saved registers. As an example of such case, proc_sys_write without() instrumentation: 55 push %rbp 41 b8 01 00 00 00 mov $0x1,%r8d 48 89 e5 mov %rsp,%rbp e8 11 ff ff ff callq ffffffff81284610 5d pop %rbp c3 retq 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1) 00 00 00 proc_sys_write() with instrumentation: 55 push %rbp 48 89 e5 mov %rsp,%rbp 41 56 push %r14 41 55 push %r13 41 54 push %r12 53 push %rbx 49 89 f4 mov %rsi,%r12 48 89 fb mov %rdi,%rbx 49 89 d5 mov %rdx,%r13 49 89 ce mov %rcx,%r14 4c 89 f1 mov %r14,%rcx 4c 89 ea mov %r13,%rdx 4c 89 e6 mov %r12,%rsi 48 89 df mov %rbx,%rdi 41 b8 01 00 00 00 mov $0x1,%r8d e8 f2 fe ff ff callq ffffffff81298e80 5b pop %rbx 41 5c pop %r12 41 5d pop %r13 41 5e pop %r14 5d pop %rbp c3 retq 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1) 00 00 Let's improve the instrumentation to avoid this: 1. Make stackleak_track_stack() save all register that it works with. Use no_caller_saved_registers attribute for that function. This attribute is available for x86_64 and i386 starting from gcc-7. 2. Insert calling stackleak_track_stack() in asm: asm volatile("call stackleak_track_stack" :: "r" (current_stack_pointer)) Here we use ASM_CALL_CONSTRAINT trick from arch/x86/include/asm/asm.h. The input constraint is taken into account during gcc shrink-wrapping optimization. It is needed to be sure that stackleak_track_stack() call is inserted after the prologue of the containing function, when the stack frame is prepared. This work is a deep reengineering of the idea described on grsecurity blog https://grsecurity.net/resolving_an_unfortunate_stackleak_interaction Signed-off-by: Alexander Popov Acked-by: Miguel Ojeda Link: https://lore.kernel.org/r/20200624123330.83226-5-alex.popov@linux.com Signed-off-by: Kees Cook --- include/linux/compiler_attributes.h | 13 +++ kernel/stackleak.c | 16 +-- scripts/Makefile.gcc-plugins | 2 + scripts/gcc-plugins/stackleak_plugin.c | 205 ++++++++++++++++++++++++++++----- 4 files changed, 196 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index cdf016596659..551ea8cb70b1 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -37,6 +37,7 @@ # define __GCC4_has_attribute___copy__ 0 # define __GCC4_has_attribute___designated_init__ 0 # define __GCC4_has_attribute___externally_visible__ 1 +# define __GCC4_has_attribute___no_caller_saved_registers__ 0 # define __GCC4_has_attribute___noclone__ 1 # define __GCC4_has_attribute___nonstring__ 0 # define __GCC4_has_attribute___no_sanitize_address__ (__GNUC_MINOR__ >= 8) @@ -175,6 +176,18 @@ */ #define __mode(x) __attribute__((__mode__(x))) +/* + * Optional: only supported since gcc >= 7 + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/x86-Function-Attributes.html#index-no_005fcaller_005fsaved_005fregisters-function-attribute_002c-x86 + * clang: https://clang.llvm.org/docs/AttributeReference.html#no-caller-saved-registers + */ +#if __has_attribute(__no_caller_saved_registers__) +# define __no_caller_saved_registers __attribute__((__no_caller_saved_registers__)) +#else +# define __no_caller_saved_registers +#endif + /* * Optional: not supported by clang * diff --git a/kernel/stackleak.c b/kernel/stackleak.c index b193a59fc05b..a8fc9ae1d03d 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -104,19 +104,9 @@ asmlinkage void notrace stackleak_erase(void) } NOKPROBE_SYMBOL(stackleak_erase); -void __used notrace stackleak_track_stack(void) +void __used __no_caller_saved_registers notrace stackleak_track_stack(void) { - /* - * N.B. stackleak_erase() fills the kernel stack with the poison value, - * which has the register width. That code assumes that the value - * of 'lowest_stack' is aligned on the register width boundary. - * - * That is true for x86 and x86_64 because of the kernel stack - * alignment on these platforms (for details, see 'cc_stack_align' in - * arch/x86/Makefile). Take care of that when you port STACKLEAK to - * new platforms. - */ - unsigned long sp = (unsigned long)&sp; + unsigned long sp = current_stack_pointer; /* * Having CONFIG_STACKLEAK_TRACK_MIN_SIZE larger than @@ -125,6 +115,8 @@ void __used notrace stackleak_track_stack(void) */ BUILD_BUG_ON(CONFIG_STACKLEAK_TRACK_MIN_SIZE > STACKLEAK_SEARCH_DEPTH); + /* 'lowest_stack' should be aligned on the register width boundary */ + sp = ALIGN(sp, sizeof(unsigned long)); if (sp < current->lowest_stack && sp >= (unsigned long)task_stack_page(current) + sizeof(unsigned long)) { diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins index 5f7df50cfe7a..952e46876329 100644 --- a/scripts/Makefile.gcc-plugins +++ b/scripts/Makefile.gcc-plugins @@ -33,6 +33,8 @@ gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STACKLEAK) \ += -DSTACKLEAK_PLUGIN gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STACKLEAK) \ += -fplugin-arg-stackleak_plugin-track-min-size=$(CONFIG_STACKLEAK_TRACK_MIN_SIZE) +gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STACKLEAK) \ + += -fplugin-arg-stackleak_plugin-arch=$(SRCARCH) ifdef CONFIG_GCC_PLUGIN_STACKLEAK DISABLE_STACKLEAK_PLUGIN += -fplugin-arg-stackleak_plugin-disable endif diff --git a/scripts/gcc-plugins/stackleak_plugin.c b/scripts/gcc-plugins/stackleak_plugin.c index cc75eeba0be1..a18b0d4af456 100644 --- a/scripts/gcc-plugins/stackleak_plugin.c +++ b/scripts/gcc-plugins/stackleak_plugin.c @@ -20,7 +20,7 @@ * * Debugging: * - use fprintf() to stderr, debug_generic_expr(), debug_gimple_stmt(), - * print_rtl() and print_simple_rtl(); + * print_rtl_single() and debug_rtx(); * - add "-fdump-tree-all -fdump-rtl-all" to the plugin CFLAGS in * Makefile.gcc-plugins to see the verbose dumps of the gcc passes; * - use gcc -E to understand the preprocessing shenanigans; @@ -32,6 +32,7 @@ __visible int plugin_is_GPL_compatible; static int track_frame_size = -1; +static bool build_for_x86 = false; static const char track_function[] = "stackleak_track_stack"; /* @@ -43,32 +44,31 @@ static GTY(()) tree track_function_decl; static struct plugin_info stackleak_plugin_info = { .version = "201707101337", .help = "track-min-size=nn\ttrack stack for functions with a stack frame size >= nn bytes\n" + "arch=target_arch\tspecify target build arch\n" "disable\t\tdo not activate the plugin\n" }; -static void stackleak_add_track_stack(gimple_stmt_iterator *gsi, bool after) +static void add_stack_tracking_gcall(gimple_stmt_iterator *gsi, bool after) { gimple stmt; - gcall *stackleak_track_stack; + gcall *gimple_call; cgraph_node_ptr node; basic_block bb; - /* Insert call to void stackleak_track_stack(void) */ + /* Insert calling stackleak_track_stack() */ stmt = gimple_build_call(track_function_decl, 0); - stackleak_track_stack = as_a_gcall(stmt); - if (after) { - gsi_insert_after(gsi, stackleak_track_stack, - GSI_CONTINUE_LINKING); - } else { - gsi_insert_before(gsi, stackleak_track_stack, GSI_SAME_STMT); - } + gimple_call = as_a_gcall(stmt); + if (after) + gsi_insert_after(gsi, gimple_call, GSI_CONTINUE_LINKING); + else + gsi_insert_before(gsi, gimple_call, GSI_SAME_STMT); /* Update the cgraph */ - bb = gimple_bb(stackleak_track_stack); + bb = gimple_bb(gimple_call); node = cgraph_get_create_node(track_function_decl); gcc_assert(node); cgraph_create_edge(cgraph_get_node(current_function_decl), node, - stackleak_track_stack, bb->count, + gimple_call, bb->count, compute_call_stmt_bb_frequency(current_function_decl, bb)); } @@ -85,6 +85,79 @@ static bool is_alloca(gimple stmt) return false; } +static tree get_current_stack_pointer_decl(void) +{ + varpool_node_ptr node; + + FOR_EACH_VARIABLE(node) { + tree var = NODE_DECL(node); + tree name = DECL_NAME(var); + + if (DECL_NAME_LENGTH(var) != sizeof("current_stack_pointer") - 1) + continue; + + if (strcmp(IDENTIFIER_POINTER(name), "current_stack_pointer")) + continue; + + return var; + } + + return NULL_TREE; +} + +static void add_stack_tracking_gasm(gimple_stmt_iterator *gsi, bool after) +{ + gasm *asm_call = NULL; + tree sp_decl, input; + vec *inputs = NULL; + + /* 'no_caller_saved_registers' is currently supported only for x86 */ + gcc_assert(build_for_x86); + + /* + * Insert calling stackleak_track_stack() in asm: + * asm volatile("call stackleak_track_stack" + * :: "r" (current_stack_pointer)) + * Use ASM_CALL_CONSTRAINT trick from arch/x86/include/asm/asm.h. + * This constraint is taken into account during gcc shrink-wrapping + * optimization. It is needed to be sure that stackleak_track_stack() + * call is inserted after the prologue of the containing function, + * when the stack frame is prepared. + */ + sp_decl = get_current_stack_pointer_decl(); + if (sp_decl == NULL_TREE) { + add_stack_tracking_gcall(gsi, after); + return; + } + input = build_tree_list(NULL_TREE, build_const_char_string(2, "r")); + input = chainon(NULL_TREE, build_tree_list(input, sp_decl)); + vec_safe_push(inputs, input); + asm_call = gimple_build_asm_vec("call stackleak_track_stack", + inputs, NULL, NULL, NULL); + gimple_asm_set_volatile(asm_call, true); + if (after) + gsi_insert_after(gsi, asm_call, GSI_CONTINUE_LINKING); + else + gsi_insert_before(gsi, asm_call, GSI_SAME_STMT); + update_stmt(asm_call); +} + +static void add_stack_tracking(gimple_stmt_iterator *gsi, bool after) +{ + /* + * The 'no_caller_saved_registers' attribute is used for + * stackleak_track_stack(). If the compiler supports this attribute for + * the target arch, we can add calling stackleak_track_stack() in asm. + * That improves performance: we avoid useless operations with the + * caller-saved registers in the functions from which we will remove + * stackleak_track_stack() call during the stackleak_cleanup pass. + */ + if (lookup_attribute_spec(get_identifier("no_caller_saved_registers"))) + add_stack_tracking_gasm(gsi, after); + else + add_stack_tracking_gcall(gsi, after); +} + /* * Work with the GIMPLE representation of the code. Insert the * stackleak_track_stack() call after alloca() and into the beginning @@ -94,7 +167,7 @@ static unsigned int stackleak_instrument_execute(void) { basic_block bb, entry_bb; bool prologue_instrumented = false, is_leaf = true; - gimple_stmt_iterator gsi; + gimple_stmt_iterator gsi = { 0 }; /* * ENTRY_BLOCK_PTR is a basic block which represents possible entry @@ -123,7 +196,7 @@ static unsigned int stackleak_instrument_execute(void) continue; /* Insert stackleak_track_stack() call after alloca() */ - stackleak_add_track_stack(&gsi, true); + add_stack_tracking(&gsi, true); if (bb == entry_bb) prologue_instrumented = true; } @@ -168,7 +241,7 @@ static unsigned int stackleak_instrument_execute(void) bb = single_succ(ENTRY_BLOCK_PTR_FOR_FN(cfun)); } gsi = gsi_after_labels(bb); - stackleak_add_track_stack(&gsi, false); + add_stack_tracking(&gsi, false); return 0; } @@ -182,21 +255,10 @@ static bool large_stack_frame(void) #endif } -/* - * Work with the RTL representation of the code. - * Remove the unneeded stackleak_track_stack() calls from the functions - * which don't call alloca() and don't have a large enough stack frame size. - */ -static unsigned int stackleak_cleanup_execute(void) +static void remove_stack_tracking_gcall(void) { rtx_insn *insn, *next; - if (cfun->calls_alloca) - return 0; - - if (large_stack_frame()) - return 0; - /* * Find stackleak_track_stack() calls. Loop through the chain of insns, * which is an RTL representation of the code for a function. @@ -257,6 +319,84 @@ static unsigned int stackleak_cleanup_execute(void) } #endif } +} + +static bool remove_stack_tracking_gasm(void) +{ + bool removed = false; + rtx_insn *insn, *next; + + /* 'no_caller_saved_registers' is currently supported only for x86 */ + gcc_assert(build_for_x86); + + /* + * Find stackleak_track_stack() asm calls. Loop through the chain of + * insns, which is an RTL representation of the code for a function. + * + * The example of a matching insn: + * (insn 11 5 12 2 (parallel [ (asm_operands/v + * ("call stackleak_track_stack") ("") 0 + * [ (reg/v:DI 7 sp [ current_stack_pointer ]) ] + * [ (asm_input:DI ("r")) ] []) + * (clobber (reg:CC 17 flags)) ]) -1 (nil)) + */ + for (insn = get_insns(); insn; insn = next) { + rtx body; + + next = NEXT_INSN(insn); + + /* Check the expression code of the insn */ + if (!NONJUMP_INSN_P(insn)) + continue; + + /* + * Check the expression code of the insn body, which is an RTL + * Expression (RTX) describing the side effect performed by + * that insn. + */ + body = PATTERN(insn); + + if (GET_CODE(body) != PARALLEL) + continue; + + body = XVECEXP(body, 0, 0); + + if (GET_CODE(body) != ASM_OPERANDS) + continue; + + if (strcmp(ASM_OPERANDS_TEMPLATE(body), + "call stackleak_track_stack")) { + continue; + } + + delete_insn_and_edges(insn); + gcc_assert(!removed); + removed = true; + } + + return removed; +} + +/* + * Work with the RTL representation of the code. + * Remove the unneeded stackleak_track_stack() calls from the functions + * which don't call alloca() and don't have a large enough stack frame size. + */ +static unsigned int stackleak_cleanup_execute(void) +{ + bool removed = false; + + if (cfun->calls_alloca) + return 0; + + if (large_stack_frame()) + return 0; + + if (lookup_attribute_spec(get_identifier("no_caller_saved_registers"))) + removed = remove_stack_tracking_gasm(); + + if (!removed) + remove_stack_tracking_gcall(); return 0; } @@ -392,6 +532,15 @@ __visible int plugin_init(struct plugin_name_args *plugin_info, plugin_name, argv[i].key, argv[i].value); return 1; } + } else if (!strcmp(argv[i].key, "arch")) { + if (!argv[i].value) { + error(G_("no value supplied for option '-fplugin-arg-%s-%s'"), + plugin_name, argv[i].key); + return 1; + } + + if (!strcmp(argv[i].value, "x86")) + build_for_x86 = true; } else { error(G_("unknown option '-fplugin-arg-%s-%s'"), plugin_name, argv[i].key); -- cgit v1.2.3 From 521b512b157a1315ff2bf11c11ab184c79515aea Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 27 May 2020 10:58:47 +0100 Subject: PM / EM: change naming convention from 'capacity' to 'performance' The Energy Model uses concept of performance domain and capacity states in order to calculate power used by CPUs. Change naming convention from capacity to performance state would enable wider usage in future, e.g. upcoming support for other devices other than CPUs. Acked-by: Daniel Lezcano Acked-by: Quentin Perret Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- drivers/thermal/cpufreq_cooling.c | 12 +++--- include/linux/energy_model.h | 86 +++++++++++++++++++++------------------ kernel/power/energy_model.c | 44 ++++++++++---------- kernel/sched/topology.c | 20 ++++----- 4 files changed, 84 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c index 9e124020519f..641995ebc107 100644 --- a/drivers/thermal/cpufreq_cooling.c +++ b/drivers/thermal/cpufreq_cooling.c @@ -333,18 +333,18 @@ static inline bool em_is_sane(struct cpufreq_cooling_device *cpufreq_cdev, return false; policy = cpufreq_cdev->policy; - if (!cpumask_equal(policy->related_cpus, to_cpumask(em->cpus))) { + if (!cpumask_equal(policy->related_cpus, em_span_cpus(em))) { pr_err("The span of pd %*pbl is misaligned with cpufreq policy %*pbl\n", - cpumask_pr_args(to_cpumask(em->cpus)), + cpumask_pr_args(em_span_cpus(em)), cpumask_pr_args(policy->related_cpus)); return false; } nr_levels = cpufreq_cdev->max_level + 1; - if (em->nr_cap_states != nr_levels) { - pr_err("The number of cap states in pd %*pbl (%u) doesn't match the number of cooling levels (%u)\n", - cpumask_pr_args(to_cpumask(em->cpus)), - em->nr_cap_states, nr_levels); + if (em_pd_nr_perf_states(em) != nr_levels) { + pr_err("The number of performance states in pd %*pbl (%u) doesn't match the number of cooling levels (%u)\n", + cpumask_pr_args(em_span_cpus(em)), + em_pd_nr_perf_states(em), nr_levels); return false; } diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index ade6486a3382..fe336a9eb5d4 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -10,13 +10,13 @@ #include /** - * em_cap_state - Capacity state of a performance domain + * em_perf_state - Performance state of a performance domain * @frequency: The CPU frequency in KHz, for consistency with CPUFreq * @power: The power consumed by 1 CPU at this level, in milli-watts * @cost: The cost coefficient associated with this level, used during * energy calculation. Equal to: power * max_frequency / frequency */ -struct em_cap_state { +struct em_perf_state { unsigned long frequency; unsigned long power; unsigned long cost; @@ -24,8 +24,8 @@ struct em_cap_state { /** * em_perf_domain - Performance domain - * @table: List of capacity states, in ascending order - * @nr_cap_states: Number of capacity states + * @table: List of performance states, in ascending order + * @nr_perf_states: Number of performance states * @cpus: Cpumask covering the CPUs of the domain * * A "performance domain" represents a group of CPUs whose performance is @@ -34,22 +34,27 @@ struct em_cap_state { * CPUFreq policies. */ struct em_perf_domain { - struct em_cap_state *table; - int nr_cap_states; + struct em_perf_state *table; + int nr_perf_states; unsigned long cpus[]; }; +#define em_span_cpus(em) (to_cpumask((em)->cpus)) + #ifdef CONFIG_ENERGY_MODEL #define EM_CPU_MAX_POWER 0xFFFF struct em_data_callback { /** - * active_power() - Provide power at the next capacity state of a CPU - * @power : Active power at the capacity state in mW (modified) - * @freq : Frequency at the capacity state in kHz (modified) + * active_power() - Provide power at the next performance state of + * a CPU + * @power : Active power at the performance state in mW + * (modified) + * @freq : Frequency at the performance state in kHz + * (modified) * @cpu : CPU for which we do this operation * - * active_power() must find the lowest capacity state of 'cpu' above + * active_power() must find the lowest performance state of 'cpu' above * 'freq' and update 'power' and 'freq' to the matching active power * and frequency. * @@ -80,46 +85,46 @@ static inline unsigned long em_pd_energy(struct em_perf_domain *pd, unsigned long max_util, unsigned long sum_util) { unsigned long freq, scale_cpu; - struct em_cap_state *cs; + struct em_perf_state *ps; int i, cpu; /* - * In order to predict the capacity state, map the utilization of the - * most utilized CPU of the performance domain to a requested frequency, - * like schedutil. + * In order to predict the performance state, map the utilization of + * the most utilized CPU of the performance domain to a requested + * frequency, like schedutil. */ cpu = cpumask_first(to_cpumask(pd->cpus)); scale_cpu = arch_scale_cpu_capacity(cpu); - cs = &pd->table[pd->nr_cap_states - 1]; - freq = map_util_freq(max_util, cs->frequency, scale_cpu); + ps = &pd->table[pd->nr_perf_states - 1]; + freq = map_util_freq(max_util, ps->frequency, scale_cpu); /* - * Find the lowest capacity state of the Energy Model above the + * Find the lowest performance state of the Energy Model above the * requested frequency. */ - for (i = 0; i < pd->nr_cap_states; i++) { - cs = &pd->table[i]; - if (cs->frequency >= freq) + for (i = 0; i < pd->nr_perf_states; i++) { + ps = &pd->table[i]; + if (ps->frequency >= freq) break; } /* - * The capacity of a CPU in the domain at that capacity state (cs) + * The capacity of a CPU in the domain at the performance state (ps) * can be computed as: * - * cs->freq * scale_cpu - * cs->cap = -------------------- (1) + * ps->freq * scale_cpu + * ps->cap = -------------------- (1) * cpu_max_freq * * So, ignoring the costs of idle states (which are not available in - * the EM), the energy consumed by this CPU at that capacity state is - * estimated as: + * the EM), the energy consumed by this CPU at that performance state + * is estimated as: * - * cs->power * cpu_util + * ps->power * cpu_util * cpu_nrg = -------------------- (2) - * cs->cap + * ps->cap * - * since 'cpu_util / cs->cap' represents its percentage of busy time. + * since 'cpu_util / ps->cap' represents its percentage of busy time. * * NOTE: Although the result of this computation actually is in * units of power, it can be manipulated as an energy value @@ -129,34 +134,35 @@ static inline unsigned long em_pd_energy(struct em_perf_domain *pd, * By injecting (1) in (2), 'cpu_nrg' can be re-expressed as a product * of two terms: * - * cs->power * cpu_max_freq cpu_util + * ps->power * cpu_max_freq cpu_util * cpu_nrg = ------------------------ * --------- (3) - * cs->freq scale_cpu + * ps->freq scale_cpu * - * The first term is static, and is stored in the em_cap_state struct - * as 'cs->cost'. + * The first term is static, and is stored in the em_perf_state struct + * as 'ps->cost'. * * Since all CPUs of the domain have the same micro-architecture, they - * share the same 'cs->cost', and the same CPU capacity. Hence, the + * share the same 'ps->cost', and the same CPU capacity. Hence, the * total energy of the domain (which is the simple sum of the energy of * all of its CPUs) can be factorized as: * - * cs->cost * \Sum cpu_util + * ps->cost * \Sum cpu_util * pd_nrg = ------------------------ (4) * scale_cpu */ - return cs->cost * sum_util / scale_cpu; + return ps->cost * sum_util / scale_cpu; } /** - * em_pd_nr_cap_states() - Get the number of capacity states of a perf. domain + * em_pd_nr_perf_states() - Get the number of performance states of a perf. + * domain * @pd : performance domain for which this must be done * - * Return: the number of capacity states in the performance domain table + * Return: the number of performance states in the performance domain table */ -static inline int em_pd_nr_cap_states(struct em_perf_domain *pd) +static inline int em_pd_nr_perf_states(struct em_perf_domain *pd) { - return pd->nr_cap_states; + return pd->nr_perf_states; } #else @@ -177,7 +183,7 @@ static inline unsigned long em_pd_energy(struct em_perf_domain *pd, { return 0; } -static inline int em_pd_nr_cap_states(struct em_perf_domain *pd) +static inline int em_pd_nr_perf_states(struct em_perf_domain *pd) { return 0; } diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 0a9326f5f421..9892d548a0fa 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -27,18 +27,18 @@ static DEFINE_MUTEX(em_pd_mutex); #ifdef CONFIG_DEBUG_FS static struct dentry *rootdir; -static void em_debug_create_cs(struct em_cap_state *cs, struct dentry *pd) +static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd) { struct dentry *d; char name[24]; - snprintf(name, sizeof(name), "cs:%lu", cs->frequency); + snprintf(name, sizeof(name), "ps:%lu", ps->frequency); - /* Create per-cs directory */ + /* Create per-ps directory */ d = debugfs_create_dir(name, pd); - debugfs_create_ulong("frequency", 0444, d, &cs->frequency); - debugfs_create_ulong("power", 0444, d, &cs->power); - debugfs_create_ulong("cost", 0444, d, &cs->cost); + debugfs_create_ulong("frequency", 0444, d, &ps->frequency); + debugfs_create_ulong("power", 0444, d, &ps->power); + debugfs_create_ulong("cost", 0444, d, &ps->cost); } static int em_debug_cpus_show(struct seq_file *s, void *unused) @@ -62,9 +62,9 @@ static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) debugfs_create_file("cpus", 0444, d, pd->cpus, &em_debug_cpus_fops); - /* Create a sub-directory for each capacity state */ - for (i = 0; i < pd->nr_cap_states; i++) - em_debug_create_cs(&pd->table[i], d); + /* Create a sub-directory for each performance state */ + for (i = 0; i < pd->nr_perf_states; i++) + em_debug_create_ps(&pd->table[i], d); } static int __init em_debug_init(void) @@ -84,7 +84,7 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, unsigned long opp_eff, prev_opp_eff = ULONG_MAX; unsigned long power, freq, prev_freq = 0; int i, ret, cpu = cpumask_first(span); - struct em_cap_state *table; + struct em_perf_state *table; struct em_perf_domain *pd; u64 fmax; @@ -99,26 +99,26 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, if (!table) goto free_pd; - /* Build the list of capacity states for this performance domain */ + /* Build the list of performance states for this performance domain */ for (i = 0, freq = 0; i < nr_states; i++, freq++) { /* * active_power() is a driver callback which ceils 'freq' to - * lowest capacity state of 'cpu' above 'freq' and updates + * lowest performance state of 'cpu' above 'freq' and updates * 'power' and 'freq' accordingly. */ ret = cb->active_power(&power, &freq, cpu); if (ret) { - pr_err("pd%d: invalid cap. state: %d\n", cpu, ret); - goto free_cs_table; + pr_err("pd%d: invalid perf. state: %d\n", cpu, ret); + goto free_ps_table; } /* * We expect the driver callback to increase the frequency for - * higher capacity states. + * higher performance states. */ if (freq <= prev_freq) { pr_err("pd%d: non-increasing freq: %lu\n", cpu, freq); - goto free_cs_table; + goto free_ps_table; } /* @@ -127,7 +127,7 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, */ if (!power || power > EM_CPU_MAX_POWER) { pr_err("pd%d: invalid power: %lu\n", cpu, power); - goto free_cs_table; + goto free_ps_table; } table[i].power = power; @@ -141,12 +141,12 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, */ opp_eff = freq / power; if (opp_eff >= prev_opp_eff) - pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_cap_state %d >= em_cap_state%d\n", + pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state%d\n", cpu, i, i - 1); prev_opp_eff = opp_eff; } - /* Compute the cost of each capacity_state. */ + /* Compute the cost of each performance state. */ fmax = (u64) table[nr_states - 1].frequency; for (i = 0; i < nr_states; i++) { table[i].cost = div64_u64(fmax * table[i].power, @@ -154,14 +154,14 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, } pd->table = table; - pd->nr_cap_states = nr_states; + pd->nr_perf_states = nr_states; cpumask_copy(to_cpumask(pd->cpus), span); em_debug_create_pd(pd, cpu); return pd; -free_cs_table: +free_ps_table: kfree(table); free_pd: kfree(pd); @@ -185,7 +185,7 @@ EXPORT_SYMBOL_GPL(em_cpu_get); /** * em_register_perf_domain() - Register the Energy Model of a performance domain * @span : Mask of CPUs in the performance domain - * @nr_states : Number of capacity states to register + * @nr_states : Number of performance states to register * @cb : Callback functions providing the data of the Energy Model * * Create Energy Model tables for a performance domain using the callbacks diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index ba81187bb7af..2f91d3126365 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -272,10 +272,10 @@ static void perf_domain_debug(const struct cpumask *cpu_map, printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map)); while (pd) { - printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }", + printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }", cpumask_first(perf_domain_span(pd)), cpumask_pr_args(perf_domain_span(pd)), - em_pd_nr_cap_states(pd->em_pd)); + em_pd_nr_perf_states(pd->em_pd)); pd = pd->next; } @@ -313,26 +313,26 @@ static void sched_energy_set(bool has_eas) * * The complexity of the Energy Model is defined as: * - * C = nr_pd * (nr_cpus + nr_cs) + * C = nr_pd * (nr_cpus + nr_ps) * * with parameters defined as: * - nr_pd: the number of performance domains * - nr_cpus: the number of CPUs - * - nr_cs: the sum of the number of capacity states of all performance + * - nr_ps: the sum of the number of performance states of all performance * domains (for example, on a system with 2 performance domains, - * with 10 capacity states each, nr_cs = 2 * 10 = 20). + * with 10 performance states each, nr_ps = 2 * 10 = 20). * * It is generally not a good idea to use such a model in the wake-up path on * very complex platforms because of the associated scheduling overheads. The * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs - * with per-CPU DVFS and less than 8 capacity states each, for example. + * with per-CPU DVFS and less than 8 performance states each, for example. */ #define EM_MAX_COMPLEXITY 2048 extern struct cpufreq_governor schedutil_gov; static bool build_perf_domains(const struct cpumask *cpu_map) { - int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map); + int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map); struct perf_domain *pd = NULL, *tmp; int cpu = cpumask_first(cpu_map); struct root_domain *rd = cpu_rq(cpu)->rd; @@ -384,15 +384,15 @@ static bool build_perf_domains(const struct cpumask *cpu_map) pd = tmp; /* - * Count performance domains and capacity states for the + * Count performance domains and performance states for the * complexity check. */ nr_pd++; - nr_cs += em_pd_nr_cap_states(pd->em_pd); + nr_ps += em_pd_nr_perf_states(pd->em_pd); } /* Bail out if the Energy Model complexity is too high. */ - if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) { + if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) { WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n", cpumask_pr_args(cpu_map)); goto free; -- cgit v1.2.3 From 7d9895c7fbfc9c70afce7029b7de0f3f974adb88 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 27 May 2020 10:58:48 +0100 Subject: PM / EM: introduce em_dev_register_perf_domain function Add now function in the Energy Model framework which is going to support new devices. This function will help in transition and make it smoother. For now it still checks if the cpumask is a valid pointer, which will be removed later when the new structures and infrastructure will be ready. Acked-by: Daniel Lezcano Acked-by: Quentin Perret Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 13 +++++++++++-- kernel/power/energy_model.c | 40 ++++++++++++++++++++++++++++++++++------ 2 files changed, 45 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index fe336a9eb5d4..7c048df98447 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -2,6 +2,7 @@ #ifndef _LINUX_ENERGY_MODEL_H #define _LINUX_ENERGY_MODEL_H #include +#include #include #include #include @@ -42,7 +43,7 @@ struct em_perf_domain { #define em_span_cpus(em) (to_cpumask((em)->cpus)) #ifdef CONFIG_ENERGY_MODEL -#define EM_CPU_MAX_POWER 0xFFFF +#define EM_MAX_POWER 0xFFFF struct em_data_callback { /** @@ -59,7 +60,7 @@ struct em_data_callback { * and frequency. * * The power is the one of a single CPU in the domain, expressed in - * milli-watts. It is expected to fit in the [0, EM_CPU_MAX_POWER] + * milli-watts. It is expected to fit in the [0, EM_MAX_POWER] * range. * * Return 0 on success. @@ -71,6 +72,8 @@ struct em_data_callback { struct em_perf_domain *em_cpu_get(int cpu); int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, struct em_data_callback *cb); +int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, + struct em_data_callback *cb, cpumask_t *span); /** * em_pd_energy() - Estimates the energy consumed by the CPUs of a perf. domain @@ -174,6 +177,12 @@ static inline int em_register_perf_domain(cpumask_t *span, { return -EINVAL; } +static inline +int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, + struct em_data_callback *cb, cpumask_t *span) +{ + return -EINVAL; +} static inline struct em_perf_domain *em_cpu_get(int cpu) { return NULL; diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 9892d548a0fa..875b163e54ab 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -125,7 +125,7 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, * The power returned by active_state() is expected to be * positive, in milli-watts and to fit into 16 bits. */ - if (!power || power > EM_CPU_MAX_POWER) { + if (!power || power > EM_MAX_POWER) { pr_err("pd%d: invalid power: %lu\n", cpu, power); goto free_ps_table; } @@ -183,10 +183,13 @@ struct em_perf_domain *em_cpu_get(int cpu) EXPORT_SYMBOL_GPL(em_cpu_get); /** - * em_register_perf_domain() - Register the Energy Model of a performance domain - * @span : Mask of CPUs in the performance domain + * em_dev_register_perf_domain() - Register the Energy Model (EM) for a device + * @dev : Device for which the EM is to register * @nr_states : Number of performance states to register * @cb : Callback functions providing the data of the Energy Model + * @span : Pointer to cpumask_t, which in case of a CPU device is + * obligatory. It can be taken from i.e. 'policy->cpus'. For other + * type of devices this should be set to NULL. * * Create Energy Model tables for a performance domain using the callbacks * defined in cb. @@ -196,14 +199,14 @@ EXPORT_SYMBOL_GPL(em_cpu_get); * * Return 0 on success */ -int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, - struct em_data_callback *cb) +int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, + struct em_data_callback *cb, cpumask_t *span) { unsigned long cap, prev_cap = 0; struct em_perf_domain *pd; int cpu, ret = 0; - if (!span || !nr_states || !cb) + if (!dev || !span || !nr_states || !cb) return -EINVAL; /* @@ -255,4 +258,29 @@ unlock: return ret; } +EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); + +/** + * em_register_perf_domain() - Register the Energy Model of a performance domain + * @span : Mask of CPUs in the performance domain + * @nr_states : Number of capacity states to register + * @cb : Callback functions providing the data of the Energy Model + * + * Create Energy Model tables for a performance domain using the callbacks + * defined in cb. + * + * If multiple clients register the same performance domain, all but the first + * registration will be ignored. + * + * Return 0 on success + */ +int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, + struct em_data_callback *cb) +{ + struct device *cpu_dev; + + cpu_dev = get_cpu_device(cpumask_first(span)); + + return em_dev_register_perf_domain(cpu_dev, nr_states, cb, span); +} EXPORT_SYMBOL_GPL(em_register_perf_domain); -- cgit v1.2.3 From d0351cc3b0f57214d157e4d589564730af2aedae Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 27 May 2020 10:58:49 +0100 Subject: PM / EM: update callback structure and add device pointer The Energy Model framework is going to support devices other that CPUs. In order to make this happen change the callback function and add pointer to a device as an argument. Update the related users to use new function and new callback from the Energy Model. Acked-by: Quentin Perret Signed-off-by: Lukasz Luba Acked-by: Daniel Lezcano Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/scmi-cpufreq.c | 11 +++-------- drivers/opp/of.c | 9 ++------- include/linux/energy_model.h | 15 ++++++++------- kernel/power/energy_model.c | 9 +++++---- 4 files changed, 18 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c index 61623e2ff149..11ee24e06d12 100644 --- a/drivers/cpufreq/scmi-cpufreq.c +++ b/drivers/cpufreq/scmi-cpufreq.c @@ -103,17 +103,12 @@ scmi_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask) } static int __maybe_unused -scmi_get_cpu_power(unsigned long *power, unsigned long *KHz, int cpu) +scmi_get_cpu_power(unsigned long *power, unsigned long *KHz, + struct device *cpu_dev) { - struct device *cpu_dev = get_cpu_device(cpu); unsigned long Hz; int ret, domain; - if (!cpu_dev) { - pr_err("failed to get cpu%d device\n", cpu); - return -ENODEV; - } - domain = handle->perf_ops->device_domain_id(cpu_dev); if (domain < 0) return domain; @@ -200,7 +195,7 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy) policy->fast_switch_possible = true; - em_register_perf_domain(policy->cpus, nr_opp, &em_cb); + em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, policy->cpus); return 0; diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 9a5873591a40..e273f419a4bf 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -1216,9 +1216,8 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_of_node); * calculation failed because of missing parameters, 0 otherwise. */ static int __maybe_unused _get_cpu_power(unsigned long *mW, unsigned long *kHz, - int cpu) + struct device *cpu_dev) { - struct device *cpu_dev; struct dev_pm_opp *opp; struct device_node *np; unsigned long mV, Hz; @@ -1226,10 +1225,6 @@ static int __maybe_unused _get_cpu_power(unsigned long *mW, unsigned long *kHz, u64 tmp; int ret; - cpu_dev = get_cpu_device(cpu); - if (!cpu_dev) - return -ENODEV; - np = of_node_get(cpu_dev->of_node); if (!np) return -EINVAL; @@ -1297,6 +1292,6 @@ void dev_pm_opp_of_register_em(struct cpumask *cpus) if (ret || !cap) return; - em_register_perf_domain(cpus, nr_opp, &em_cb); + em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, cpus); } EXPORT_SYMBOL_GPL(dev_pm_opp_of_register_em); diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 7c048df98447..7076cb22b247 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -48,24 +48,25 @@ struct em_perf_domain { struct em_data_callback { /** * active_power() - Provide power at the next performance state of - * a CPU + * a device * @power : Active power at the performance state in mW * (modified) * @freq : Frequency at the performance state in kHz * (modified) - * @cpu : CPU for which we do this operation + * @dev : Device for which we do this operation (can be a CPU) * - * active_power() must find the lowest performance state of 'cpu' above + * active_power() must find the lowest performance state of 'dev' above * 'freq' and update 'power' and 'freq' to the matching active power * and frequency. * - * The power is the one of a single CPU in the domain, expressed in - * milli-watts. It is expected to fit in the [0, EM_MAX_POWER] - * range. + * In case of CPUs, the power is the one of a single CPU in the domain, + * expressed in milli-watts. It is expected to fit in the + * [0, EM_MAX_POWER] range. * * Return 0 on success. */ - int (*active_power)(unsigned long *power, unsigned long *freq, int cpu); + int (*active_power)(unsigned long *power, unsigned long *freq, + struct device *dev); }; #define EM_DATA_CB(_active_power_cb) { .active_power = &_active_power_cb } diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 875b163e54ab..5b8a1566526a 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -78,8 +78,9 @@ core_initcall(em_debug_init); #else /* CONFIG_DEBUG_FS */ static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) {} #endif -static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, - struct em_data_callback *cb) +static struct em_perf_domain * +em_create_pd(struct device *dev, int nr_states, struct em_data_callback *cb, + cpumask_t *span) { unsigned long opp_eff, prev_opp_eff = ULONG_MAX; unsigned long power, freq, prev_freq = 0; @@ -106,7 +107,7 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, * lowest performance state of 'cpu' above 'freq' and updates * 'power' and 'freq' accordingly. */ - ret = cb->active_power(&power, &freq, cpu); + ret = cb->active_power(&power, &freq, dev); if (ret) { pr_err("pd%d: invalid perf. state: %d\n", cpu, ret); goto free_ps_table; @@ -237,7 +238,7 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, } /* Create the performance domain and add it to the Energy Model. */ - pd = em_create_pd(span, nr_states, cb); + pd = em_create_pd(dev, nr_states, cb, span); if (!pd) { ret = -EINVAL; goto unlock; -- cgit v1.2.3 From a67549c8e568627290234e9fbe833cb9dfd36b55 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 19 Jun 2020 20:47:26 +0000 Subject: blktrace: annotate required lock on do_blk_trace_setup() Ensure it is clear which lock is required on do_blk_trace_setup(). Suggested-by: Bart Van Assche Signed-off-by: Luis Chamberlain Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 5ef0484513ec..5a88a6b55933 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -483,6 +483,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct dentry *dir = NULL; int ret; + lockdep_assert_held(&q->blk_trace_mutex); + if (!buts->buf_size || !buts->buf_nr) return -EINVAL; -- cgit v1.2.3 From bad8e64fb19d3a0de5e564d9a7271c31bd684369 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 19 Jun 2020 20:47:28 +0000 Subject: blktrace: fix debugfs use after free On commit 6ac93117ab00 ("blktrace: use existing disk debugfs directory") merged on v4.12 Omar fixed the original blktrace code for request-based drivers (multiqueue). This however left in place a possible crash, if you happen to abuse blktrace while racing to remove / add a device. We used to use asynchronous removal of the request_queue, and with that the issue was easier to reproduce. Now that we have reverted to synchronous removal of the request_queue, the issue is still possible to reproduce, its however just a bit more difficult. We essentially run two instances of break-blktrace which add/remove a loop device, and setup a blktrace and just never tear the blktrace down. We do this twice in parallel. This is easily reproduced with the script run_0004.sh from break-blktrace [0]. We can end up with two types of panics each reflecting where we race, one a failed blktrace setup: [ 252.426751] debugfs: Directory 'loop0' with parent 'block' already present! [ 252.432265] BUG: kernel NULL pointer dereference, address: 00000000000000a0 [ 252.436592] #PF: supervisor write access in kernel mode [ 252.439822] #PF: error_code(0x0002) - not-present page [ 252.442967] PGD 0 P4D 0 [ 252.444656] Oops: 0002 [#1] SMP NOPTI [ 252.446972] CPU: 10 PID: 1153 Comm: break-blktrace Tainted: G E 5.7.0-rc2-next-20200420+ #164 [ 252.452673] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 [ 252.456343] RIP: 0010:down_write+0x15/0x40 [ 252.458146] Code: eb ca e8 ae 22 8d ff cc cc cc cc cc cc cc cc cc cc cc cc cc cc 0f 1f 44 00 00 55 48 89 fd e8 52 db ff ff 31 c0 ba 01 00 00 00 48 0f b1 55 00 75 0f 48 8b 04 25 c0 8b 01 00 48 89 45 08 5d [ 252.463638] RSP: 0018:ffffa626415abcc8 EFLAGS: 00010246 [ 252.464950] RAX: 0000000000000000 RBX: ffff958c25f0f5c0 RCX: ffffff8100000000 [ 252.466727] RDX: 0000000000000001 RSI: ffffff8100000000 RDI: 00000000000000a0 [ 252.468482] RBP: 00000000000000a0 R08: 0000000000000000 R09: 0000000000000001 [ 252.470014] R10: 0000000000000000 R11: ffff958d1f9227ff R12: 0000000000000000 [ 252.471473] R13: ffff958c25ea5380 R14: ffffffff8cce15f1 R15: 00000000000000a0 [ 252.473346] FS: 00007f2e69dee540(0000) GS:ffff958c2fc80000(0000) knlGS:0000000000000000 [ 252.475225] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 252.476267] CR2: 00000000000000a0 CR3: 0000000427d10004 CR4: 0000000000360ee0 [ 252.477526] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 252.478776] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 252.479866] Call Trace: [ 252.480322] simple_recursive_removal+0x4e/0x2e0 [ 252.481078] ? debugfs_remove+0x60/0x60 [ 252.481725] ? relay_destroy_buf+0x77/0xb0 [ 252.482662] debugfs_remove+0x40/0x60 [ 252.483518] blk_remove_buf_file_callback+0x5/0x10 [ 252.484328] relay_close_buf+0x2e/0x60 [ 252.484930] relay_open+0x1ce/0x2c0 [ 252.485520] do_blk_trace_setup+0x14f/0x2b0 [ 252.486187] __blk_trace_setup+0x54/0xb0 [ 252.486803] blk_trace_ioctl+0x90/0x140 [ 252.487423] ? do_sys_openat2+0x1ab/0x2d0 [ 252.488053] blkdev_ioctl+0x4d/0x260 [ 252.488636] block_ioctl+0x39/0x40 [ 252.489139] ksys_ioctl+0x87/0xc0 [ 252.489675] __x64_sys_ioctl+0x16/0x20 [ 252.490380] do_syscall_64+0x52/0x180 [ 252.491032] entry_SYSCALL_64_after_hwframe+0x44/0xa9 And the other on the device removal: [ 128.528940] debugfs: Directory 'loop0' with parent 'block' already present! [ 128.615325] BUG: kernel NULL pointer dereference, address: 00000000000000a0 [ 128.619537] #PF: supervisor write access in kernel mode [ 128.622700] #PF: error_code(0x0002) - not-present page [ 128.625842] PGD 0 P4D 0 [ 128.627585] Oops: 0002 [#1] SMP NOPTI [ 128.629871] CPU: 12 PID: 544 Comm: break-blktrace Tainted: G E 5.7.0-rc2-next-20200420+ #164 [ 128.635595] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 [ 128.640471] RIP: 0010:down_write+0x15/0x40 [ 128.643041] Code: eb ca e8 ae 22 8d ff cc cc cc cc cc cc cc cc cc cc cc cc cc cc 0f 1f 44 00 00 55 48 89 fd e8 52 db ff ff 31 c0 ba 01 00 00 00 48 0f b1 55 00 75 0f 65 48 8b 04 25 c0 8b 01 00 48 89 45 08 5d [ 128.650180] RSP: 0018:ffffa9c3c05ebd78 EFLAGS: 00010246 [ 128.651820] RAX: 0000000000000000 RBX: ffff8ae9a6370240 RCX: ffffff8100000000 [ 128.653942] RDX: 0000000000000001 RSI: ffffff8100000000 RDI: 00000000000000a0 [ 128.655720] RBP: 00000000000000a0 R08: 0000000000000002 R09: ffff8ae9afd2d3d0 [ 128.657400] R10: 0000000000000056 R11: 0000000000000000 R12: 0000000000000000 [ 128.659099] R13: 0000000000000000 R14: 0000000000000003 R15: 00000000000000a0 [ 128.660500] FS: 00007febfd995540(0000) GS:ffff8ae9afd00000(0000) knlGS:0000000000000000 [ 128.662204] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 128.663426] CR2: 00000000000000a0 CR3: 0000000420042003 CR4: 0000000000360ee0 [ 128.664776] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 128.666022] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 128.667282] Call Trace: [ 128.667801] simple_recursive_removal+0x4e/0x2e0 [ 128.668663] ? debugfs_remove+0x60/0x60 [ 128.669368] debugfs_remove+0x40/0x60 [ 128.669985] blk_trace_free+0xd/0x50 [ 128.670593] __blk_trace_remove+0x27/0x40 [ 128.671274] blk_trace_shutdown+0x30/0x40 [ 128.671935] blk_release_queue+0x95/0xf0 [ 128.672589] kobject_put+0xa5/0x1b0 [ 128.673188] disk_release+0xa2/0xc0 [ 128.673786] device_release+0x28/0x80 [ 128.674376] kobject_put+0xa5/0x1b0 [ 128.674915] loop_remove+0x39/0x50 [loop] [ 128.675511] loop_control_ioctl+0x113/0x130 [loop] [ 128.676199] ksys_ioctl+0x87/0xc0 [ 128.676708] __x64_sys_ioctl+0x16/0x20 [ 128.677274] do_syscall_64+0x52/0x180 [ 128.677823] entry_SYSCALL_64_after_hwframe+0x44/0xa9 The common theme here is: debugfs: Directory 'loop0' with parent 'block' already present This crash happens because of how blktrace uses the debugfs directory where it places its files. Upon init we always create the same directory which would be needed by blktrace but we only do this for make_request drivers (multiqueue) block drivers. When you race a removal of these devices with a blktrace setup you end up in a situation where the make_request recursive debugfs removal will sweep away the blktrace files and then later blktrace will also try to remove individual dentries which are already NULL. The inverse is also possible and hence the two types of use after frees. We don't create the block debugfs directory on init for these types of block devices: * request-based block driver block devices * every possible partition * scsi-generic And so, this race should in theory only be possible with make_request drivers. We can fix the UAF by simply re-using the debugfs directory for make_request drivers (multiqueue) and only creating the ephemeral directory for the other type of block devices. The new clarifications on relying on the q->blk_trace_mutex *and* also checking for q->blk_trace *prior* to processing a blktrace ensures the debugfs directories are only created if no possible directory name clashes are possible. This goes tested with: o nvme partitions o ISCSI with tgt, and blktracing against scsi-generic with: o block o tape o cdrom o media changer o blktests This patch is part of the work which disputes the severity of CVE-2019-19770 which shows this issue is not a core debugfs issue, but a misuse of debugfs within blktace. Fixes: 6ac93117ab00 ("blktrace: use existing disk debugfs directory") Reported-by: syzbot+603294af2d01acfdd6da@syzkaller.appspotmail.com Signed-off-by: Luis Chamberlain Reviewed-by: Christoph Hellwig Cc: Bart Van Assche Cc: Omar Sandoval Cc: Hannes Reinecke Cc: Nicolai Stange Cc: Greg Kroah-Hartman Cc: Michal Hocko Cc: "Martin K. Petersen" Cc: "James E.J. Bottomley" Cc: yu kuai Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 5a88a6b55933..e27dee345d81 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -524,10 +524,18 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->msg_data) goto err; - ret = -ENOENT; - - dir = debugfs_lookup(buts->name, blk_debugfs_root); - if (!dir) +#ifdef CONFIG_BLK_DEBUG_FS + /* + * When tracing whole make_request drivers (multiqueue) block devices, + * reuse the existing debugfs directory created by the block layer on + * init. For request-based block devices, all partitions block devices, + * and scsi-generic block devices we create a temporary new debugfs + * directory that will be removed once the trace ends. + */ + if (queue_is_mq(q) && bdev && bdev == bdev->bd_contains) + dir = q->debugfs_dir; + else +#endif bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); bt->dev = dev; @@ -565,8 +573,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = 0; err: - if (dir && !bt->dir) - dput(dir); if (ret) blk_trace_free(bt); return ret; -- cgit v1.2.3 From b431ef837e3374da0db8ff6683170359aaa0859c Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 19 Jun 2020 20:47:29 +0000 Subject: blktrace: ensure our debugfs dir exists We make an assumption that a debugfs directory exists, but since this can fail ensure it exists before allowing blktrace setup to complete. Otherwise we end up stuffing blktrace files on the debugfs root directory. In the worst case scenario this *in theory* can create an eventual panic *iff* in the future a similarly named file is created prior on the debugfs root directory. This theoretical crash can happen due to a recursive removal followed by a specific dentry removal. This doesn't fix any known crash, however I have seen the files go into the main debugfs root directory in cases where the debugfs directory was not created due to other internal bugs with blktrace now fixed. blktrace is also completely useless without this directory, so this ensures to userspace we only setup blktrace if the kernel can stuff files where they are supposed to go into. debugfs directory creations typically aren't checked for, and we have maintainers doing sweep removals of these checks, but since we need this check to ensure proper userspace blktrace functionality we make sure to annotate the justification for the check. Signed-off-by: Luis Chamberlain Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e27dee345d81..46c273f4dec5 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -538,6 +538,18 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, #endif bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); + /* + * As blktrace relies on debugfs for its interface the debugfs directory + * is required, contrary to the usual mantra of not checking for debugfs + * files or directories. + */ + if (IS_ERR_OR_NULL(dir)) { + pr_warn("debugfs_dir not present for %s so skipping\n", + buts->name); + ret = -ENOENT; + goto err; + } + bt->dev = dev; atomic_set(&bt->dropped, 0); INIT_LIST_HEAD(&bt->running_list); -- cgit v1.2.3 From 85e0cbbb8a79537dbc465e9deb449a08b2b092a6 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 19 Jun 2020 20:47:30 +0000 Subject: block: create the request_queue debugfs_dir on registration We were only creating the request_queue debugfs_dir only for make_request block drivers (multiqueue), but never for request-based block drivers. We did this as we were only creating non-blktrace additional debugfs files on that directory for make_request drivers. However, since blktrace *always* creates that directory anyway, we special-case the use of that directory on blktrace. Other than this being an eye-sore, this exposes request-based block drivers to the same debugfs fragile race that used to exist with make_request block drivers where if we start adding files onto that directory we can later run a race with a double removal of dentries on the directory if we don't deal with this carefully on blktrace. Instead, just simplify things by always creating the request_queue debugfs_dir on request_queue registration. Rename the mutex also to reflect the fact that this is used outside of the blktrace context. Signed-off-by: Luis Chamberlain Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 8 +------ block/blk-mq-debugfs.c | 5 ----- block/blk-sysfs.c | 9 ++++++++ block/blk.h | 2 -- include/linux/blkdev.h | 5 +++-- kernel/trace/blktrace.c | 58 ++++++++++++++++++++++--------------------------- 6 files changed, 39 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/block/blk-core.c b/block/blk-core.c index a99b22fac38a..a9769c1a2875 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -51,9 +51,7 @@ #include "blk-pm.h" #include "blk-rq-qos.h" -#ifdef CONFIG_DEBUG_FS struct dentry *blk_debugfs_root; -#endif EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); @@ -555,9 +553,7 @@ struct request_queue *__blk_alloc_queue(int node_id) kobject_init(&q->kobj, &blk_queue_ktype); -#ifdef CONFIG_BLK_DEV_IO_TRACE - mutex_init(&q->blk_trace_mutex); -#endif + mutex_init(&q->debugfs_mutex); mutex_init(&q->sysfs_lock); mutex_init(&q->sysfs_dir_lock); spin_lock_init(&q->queue_lock); @@ -1931,9 +1927,7 @@ int __init blk_dev_init(void) blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); -#ifdef CONFIG_DEBUG_FS blk_debugfs_root = debugfs_create_dir("block", NULL); -#endif return 0; } diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 15df3a36e9fa..a2800bc56fb4 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -824,9 +824,6 @@ void blk_mq_debugfs_register(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; - q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent), - blk_debugfs_root); - debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs); /* @@ -857,9 +854,7 @@ void blk_mq_debugfs_register(struct request_queue *q) void blk_mq_debugfs_unregister(struct request_queue *q) { - debugfs_remove_recursive(q->debugfs_dir); q->sched_debugfs_dir = NULL; - q->debugfs_dir = NULL; } static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx, diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 561624d4cc4e..be67952e7be2 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "blk.h" #include "blk-mq.h" @@ -917,6 +918,9 @@ static void blk_release_queue(struct kobject *kobj) blk_mq_release(q); blk_trace_shutdown(q); + mutex_lock(&q->debugfs_mutex); + debugfs_remove_recursive(q->debugfs_dir); + mutex_unlock(&q->debugfs_mutex); if (queue_is_mq(q)) blk_mq_debugfs_unregister(q); @@ -989,6 +993,11 @@ int blk_register_queue(struct gendisk *disk) goto unlock; } + mutex_lock(&q->debugfs_mutex); + q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent), + blk_debugfs_root); + mutex_unlock(&q->debugfs_mutex); + if (queue_is_mq(q)) { __blk_mq_register_dev(dev, q); blk_mq_debugfs_register(q); diff --git a/block/blk.h b/block/blk.h index 8ba4a5e4fe07..3a120a070dac 100644 --- a/block/blk.h +++ b/block/blk.h @@ -14,9 +14,7 @@ /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) -#ifdef CONFIG_DEBUG_FS extern struct dentry *blk_debugfs_root; -#endif struct blk_flush_queue { unsigned int flush_pending_idx:1; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e214e0e9f868..c0701237116d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -528,9 +528,9 @@ struct request_queue { unsigned int sg_timeout; unsigned int sg_reserved_size; int node; + struct mutex debugfs_mutex; #ifdef CONFIG_BLK_DEV_IO_TRACE struct blk_trace __rcu *blk_trace; - struct mutex blk_trace_mutex; #endif /* * for flush operations @@ -574,8 +574,9 @@ struct request_queue { struct list_head tag_set_list; struct bio_set bio_split; -#ifdef CONFIG_BLK_DEBUG_FS struct dentry *debugfs_dir; + +#ifdef CONFIG_BLK_DEBUG_FS struct dentry *sched_debugfs_dir; struct dentry *rqos_debugfs_dir; #endif diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 46c273f4dec5..c086c38f4954 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -348,7 +348,7 @@ static int __blk_trace_remove(struct request_queue *q) struct blk_trace *bt; bt = rcu_replace_pointer(q->blk_trace, NULL, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); if (!bt) return -EINVAL; @@ -362,9 +362,9 @@ int blk_trace_remove(struct request_queue *q) { int ret; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); ret = __blk_trace_remove(q); - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); return ret; } @@ -483,14 +483,11 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct dentry *dir = NULL; int ret; - lockdep_assert_held(&q->blk_trace_mutex); + lockdep_assert_held(&q->debugfs_mutex); if (!buts->buf_size || !buts->buf_nr) return -EINVAL; - if (!blk_debugfs_root) - return -ENOENT; - strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; @@ -505,7 +502,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, * we can be. */ if (rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex))) { + lockdep_is_held(&q->debugfs_mutex))) { pr_warn("Concurrent blktraces are not allowed on %s\n", buts->name); return -EBUSY; @@ -524,18 +521,15 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->msg_data) goto err; -#ifdef CONFIG_BLK_DEBUG_FS /* - * When tracing whole make_request drivers (multiqueue) block devices, - * reuse the existing debugfs directory created by the block layer on - * init. For request-based block devices, all partitions block devices, + * When tracing the whole disk reuse the existing debugfs directory + * created by the block layer on init. For partitions block devices, * and scsi-generic block devices we create a temporary new debugfs * directory that will be removed once the trace ends. */ - if (queue_is_mq(q) && bdev && bdev == bdev->bd_contains) + if (bdev && bdev == bdev->bd_contains) dir = q->debugfs_dir; else -#endif bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); /* @@ -617,9 +611,9 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, { int ret; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); ret = __blk_trace_setup(q, name, dev, bdev, arg); - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); return ret; } @@ -665,7 +659,7 @@ static int __blk_trace_startstop(struct request_queue *q, int start) struct blk_trace *bt; bt = rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); if (bt == NULL) return -EINVAL; @@ -705,9 +699,9 @@ int blk_trace_startstop(struct request_queue *q, int start) { int ret; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); ret = __blk_trace_startstop(q, start); - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); return ret; } @@ -736,7 +730,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) if (!q) return -ENXIO; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); switch (cmd) { case BLKTRACESETUP: @@ -763,7 +757,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) break; } - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); return ret; } @@ -774,14 +768,14 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) **/ void blk_trace_shutdown(struct request_queue *q) { - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); if (rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex))) { + lockdep_is_held(&q->debugfs_mutex))) { __blk_trace_startstop(q, 0); __blk_trace_remove(q); } - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); } #ifdef CONFIG_BLK_CGROUP @@ -1662,7 +1656,7 @@ static int blk_trace_remove_queue(struct request_queue *q) struct blk_trace *bt; bt = rcu_replace_pointer(q->blk_trace, NULL, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); if (bt == NULL) return -EINVAL; @@ -1837,10 +1831,10 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); bt = rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); if (attr == &dev_attr_enable) { ret = sprintf(buf, "%u\n", !!bt); goto out_unlock_bdev; @@ -1858,7 +1852,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, ret = sprintf(buf, "%llu\n", bt->end_lba); out_unlock_bdev: - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); out_bdput: bdput(bdev); out: @@ -1901,10 +1895,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); bt = rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); if (attr == &dev_attr_enable) { if (!!value == !!bt) { ret = 0; @@ -1921,7 +1915,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (bt == NULL) { ret = blk_trace_setup_queue(q, bdev); bt = rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); } if (ret == 0) { @@ -1936,7 +1930,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } out_unlock_bdev: - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); out_bdput: bdput(bdev); out: -- cgit v1.2.3 From 1bc138c622959979eb547be2d3bbc6442a5c80b0 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 10 Jun 2020 11:12:23 +0100 Subject: PM / EM: add support for other devices than CPUs in Energy Model Add support for other devices than CPUs. The registration function does not require a valid cpumask pointer and is ready to handle new devices. Some of the internal structures has been reorganized in order to keep consistent view (like removing per_cpu pd pointers). Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/device.h | 5 + include/linux/energy_model.h | 29 +++-- kernel/power/energy_model.c | 244 +++++++++++++++++++++++++++++-------------- 3 files changed, 194 insertions(+), 84 deletions(-) (limited to 'kernel') diff --git a/include/linux/device.h b/include/linux/device.h index 15460a5ac024..b72e6f9ad845 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -13,6 +13,7 @@ #define _DEVICE_H_ #include +#include #include #include #include @@ -559,6 +560,10 @@ struct device { struct dev_pm_info power; struct dev_pm_domain *pm_domain; +#ifdef CONFIG_ENERGY_MODEL + struct em_perf_domain *em_pd; +#endif + #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN struct irq_domain *msi_domain; #endif diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 7076cb22b247..2d4689964029 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -12,8 +12,10 @@ /** * em_perf_state - Performance state of a performance domain - * @frequency: The CPU frequency in KHz, for consistency with CPUFreq - * @power: The power consumed by 1 CPU at this level, in milli-watts + * @frequency: The frequency in KHz, for consistency with CPUFreq + * @power: The power consumed at this level, in milli-watts (by 1 CPU or + by a registered device). It can be a total power: static and + dynamic. * @cost: The cost coefficient associated with this level, used during * energy calculation. Equal to: power * max_frequency / frequency */ @@ -27,12 +29,16 @@ struct em_perf_state { * em_perf_domain - Performance domain * @table: List of performance states, in ascending order * @nr_perf_states: Number of performance states - * @cpus: Cpumask covering the CPUs of the domain + * @cpus: Cpumask covering the CPUs of the domain. It's here + * for performance reasons to avoid potential cache + * misses during energy calculations in the scheduler + * and simplifies allocating/freeing that memory region. * - * A "performance domain" represents a group of CPUs whose performance is - * scaled together. All CPUs of a performance domain must have the same - * micro-architecture. Performance domains often have a 1-to-1 mapping with - * CPUFreq policies. + * In case of CPU device, a "performance domain" represents a group of CPUs + * whose performance is scaled together. All CPUs of a performance domain + * must have the same micro-architecture. Performance domains often have + * a 1-to-1 mapping with CPUFreq policies. In case of other devices the @cpus + * field is unused. */ struct em_perf_domain { struct em_perf_state *table; @@ -71,10 +77,12 @@ struct em_data_callback { #define EM_DATA_CB(_active_power_cb) { .active_power = &_active_power_cb } struct em_perf_domain *em_cpu_get(int cpu); +struct em_perf_domain *em_pd_get(struct device *dev); int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, struct em_data_callback *cb); int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, struct em_data_callback *cb, cpumask_t *span); +void em_dev_unregister_perf_domain(struct device *dev); /** * em_pd_energy() - Estimates the energy consumed by the CPUs of a perf. domain @@ -184,10 +192,17 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, { return -EINVAL; } +static inline void em_dev_unregister_perf_domain(struct device *dev) +{ +} static inline struct em_perf_domain *em_cpu_get(int cpu) { return NULL; } +static inline struct em_perf_domain *em_pd_get(struct device *dev) +{ + return NULL; +} static inline unsigned long em_pd_energy(struct em_perf_domain *pd, unsigned long max_util, unsigned long sum_util) { diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 5b8a1566526a..32d76e78f992 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -1,9 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Energy Model of CPUs + * Energy Model of devices * - * Copyright (c) 2018, Arm ltd. + * Copyright (c) 2018-2020, Arm ltd. * Written by: Quentin Perret, Arm ltd. + * Improvements provided by: Lukasz Luba, Arm ltd. */ #define pr_fmt(fmt) "energy_model: " fmt @@ -15,15 +16,17 @@ #include #include -/* Mapping of each CPU to the performance domain to which it belongs. */ -static DEFINE_PER_CPU(struct em_perf_domain *, em_data); - /* * Mutex serializing the registrations of performance domains and letting * callbacks defined by drivers sleep. */ static DEFINE_MUTEX(em_pd_mutex); +static bool _is_cpu_device(struct device *dev) +{ + return (dev->bus == &cpu_subsys); +} + #ifdef CONFIG_DEBUG_FS static struct dentry *rootdir; @@ -49,22 +52,30 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused) } DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); -static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) +static void em_debug_create_pd(struct device *dev) { struct dentry *d; - char name[8]; int i; - snprintf(name, sizeof(name), "pd%d", cpu); - /* Create the directory of the performance domain */ - d = debugfs_create_dir(name, rootdir); + d = debugfs_create_dir(dev_name(dev), rootdir); - debugfs_create_file("cpus", 0444, d, pd->cpus, &em_debug_cpus_fops); + if (_is_cpu_device(dev)) + debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus, + &em_debug_cpus_fops); /* Create a sub-directory for each performance state */ - for (i = 0; i < pd->nr_perf_states; i++) - em_debug_create_ps(&pd->table[i], d); + for (i = 0; i < dev->em_pd->nr_perf_states; i++) + em_debug_create_ps(&dev->em_pd->table[i], d); + +} + +static void em_debug_remove_pd(struct device *dev) +{ + struct dentry *debug_dir; + + debug_dir = debugfs_lookup(dev_name(dev), rootdir); + debugfs_remove_recursive(debug_dir); } static int __init em_debug_init(void) @@ -76,40 +87,34 @@ static int __init em_debug_init(void) } core_initcall(em_debug_init); #else /* CONFIG_DEBUG_FS */ -static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) {} +static void em_debug_create_pd(struct device *dev) {} +static void em_debug_remove_pd(struct device *dev) {} #endif -static struct em_perf_domain * -em_create_pd(struct device *dev, int nr_states, struct em_data_callback *cb, - cpumask_t *span) + +static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, + int nr_states, struct em_data_callback *cb) { unsigned long opp_eff, prev_opp_eff = ULONG_MAX; unsigned long power, freq, prev_freq = 0; - int i, ret, cpu = cpumask_first(span); struct em_perf_state *table; - struct em_perf_domain *pd; + int i, ret; u64 fmax; - if (!cb->active_power) - return NULL; - - pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL); - if (!pd) - return NULL; - table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL); if (!table) - goto free_pd; + return -ENOMEM; /* Build the list of performance states for this performance domain */ for (i = 0, freq = 0; i < nr_states; i++, freq++) { /* * active_power() is a driver callback which ceils 'freq' to - * lowest performance state of 'cpu' above 'freq' and updates + * lowest performance state of 'dev' above 'freq' and updates * 'power' and 'freq' accordingly. */ ret = cb->active_power(&power, &freq, dev); if (ret) { - pr_err("pd%d: invalid perf. state: %d\n", cpu, ret); + dev_err(dev, "EM: invalid perf. state: %d\n", + ret); goto free_ps_table; } @@ -118,7 +123,8 @@ em_create_pd(struct device *dev, int nr_states, struct em_data_callback *cb, * higher performance states. */ if (freq <= prev_freq) { - pr_err("pd%d: non-increasing freq: %lu\n", cpu, freq); + dev_err(dev, "EM: non-increasing freq: %lu\n", + freq); goto free_ps_table; } @@ -127,7 +133,8 @@ em_create_pd(struct device *dev, int nr_states, struct em_data_callback *cb, * positive, in milli-watts and to fit into 16 bits. */ if (!power || power > EM_MAX_POWER) { - pr_err("pd%d: invalid power: %lu\n", cpu, power); + dev_err(dev, "EM: invalid power: %lu\n", + power); goto free_ps_table; } @@ -142,8 +149,8 @@ em_create_pd(struct device *dev, int nr_states, struct em_data_callback *cb, */ opp_eff = freq / power; if (opp_eff >= prev_opp_eff) - pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state%d\n", - cpu, i, i - 1); + dev_dbg(dev, "EM: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state%d\n", + i, i - 1); prev_opp_eff = opp_eff; } @@ -156,30 +163,82 @@ em_create_pd(struct device *dev, int nr_states, struct em_data_callback *cb, pd->table = table; pd->nr_perf_states = nr_states; - cpumask_copy(to_cpumask(pd->cpus), span); - em_debug_create_pd(pd, cpu); - - return pd; + return 0; free_ps_table: kfree(table); -free_pd: - kfree(pd); + return -EINVAL; +} + +static int em_create_pd(struct device *dev, int nr_states, + struct em_data_callback *cb, cpumask_t *cpus) +{ + struct em_perf_domain *pd; + struct device *cpu_dev; + int cpu, ret; + + if (_is_cpu_device(dev)) { + pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL); + if (!pd) + return -ENOMEM; + + cpumask_copy(em_span_cpus(pd), cpus); + } else { + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return -ENOMEM; + } + + ret = em_create_perf_table(dev, pd, nr_states, cb); + if (ret) { + kfree(pd); + return ret; + } + + if (_is_cpu_device(dev)) + for_each_cpu(cpu, cpus) { + cpu_dev = get_cpu_device(cpu); + cpu_dev->em_pd = pd; + } + + dev->em_pd = pd; + + return 0; +} + +/** + * em_pd_get() - Return the performance domain for a device + * @dev : Device to find the performance domain for + * + * Returns the performance domain to which @dev belongs, or NULL if it doesn't + * exist. + */ +struct em_perf_domain *em_pd_get(struct device *dev) +{ + if (IS_ERR_OR_NULL(dev)) + return NULL; - return NULL; + return dev->em_pd; } +EXPORT_SYMBOL_GPL(em_pd_get); /** * em_cpu_get() - Return the performance domain for a CPU * @cpu : CPU to find the performance domain for * - * Return: the performance domain to which 'cpu' belongs, or NULL if it doesn't + * Returns the performance domain to which @cpu belongs, or NULL if it doesn't * exist. */ struct em_perf_domain *em_cpu_get(int cpu) { - return READ_ONCE(per_cpu(em_data, cpu)); + struct device *cpu_dev; + + cpu_dev = get_cpu_device(cpu); + if (!cpu_dev) + return NULL; + + return em_pd_get(cpu_dev); } EXPORT_SYMBOL_GPL(em_cpu_get); @@ -188,7 +247,7 @@ EXPORT_SYMBOL_GPL(em_cpu_get); * @dev : Device for which the EM is to register * @nr_states : Number of performance states to register * @cb : Callback functions providing the data of the Energy Model - * @span : Pointer to cpumask_t, which in case of a CPU device is + * @cpus : Pointer to cpumask_t, which in case of a CPU device is * obligatory. It can be taken from i.e. 'policy->cpus'. For other * type of devices this should be set to NULL. * @@ -201,13 +260,12 @@ EXPORT_SYMBOL_GPL(em_cpu_get); * Return 0 on success */ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, - struct em_data_callback *cb, cpumask_t *span) + struct em_data_callback *cb, cpumask_t *cpus) { unsigned long cap, prev_cap = 0; - struct em_perf_domain *pd; - int cpu, ret = 0; + int cpu, ret; - if (!dev || !span || !nr_states || !cb) + if (!dev || !nr_states || !cb) return -EINVAL; /* @@ -216,47 +274,50 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, */ mutex_lock(&em_pd_mutex); - for_each_cpu(cpu, span) { - /* Make sure we don't register again an existing domain. */ - if (READ_ONCE(per_cpu(em_data, cpu))) { - ret = -EEXIST; - goto unlock; - } + if (dev->em_pd) { + ret = -EEXIST; + goto unlock; + } - /* - * All CPUs of a domain must have the same micro-architecture - * since they all share the same table. - */ - cap = arch_scale_cpu_capacity(cpu); - if (prev_cap && prev_cap != cap) { - pr_err("CPUs of %*pbl must have the same capacity\n", - cpumask_pr_args(span)); + if (_is_cpu_device(dev)) { + if (!cpus) { + dev_err(dev, "EM: invalid CPU mask\n"); ret = -EINVAL; goto unlock; } - prev_cap = cap; + + for_each_cpu(cpu, cpus) { + if (em_cpu_get(cpu)) { + dev_err(dev, "EM: exists for CPU%d\n", cpu); + ret = -EEXIST; + goto unlock; + } + /* + * All CPUs of a domain must have the same + * micro-architecture since they all share the same + * table. + */ + cap = arch_scale_cpu_capacity(cpu); + if (prev_cap && prev_cap != cap) { + dev_err(dev, "EM: CPUs of %*pbl must have the same capacity\n", + cpumask_pr_args(cpus)); + + ret = -EINVAL; + goto unlock; + } + prev_cap = cap; + } } - /* Create the performance domain and add it to the Energy Model. */ - pd = em_create_pd(dev, nr_states, cb, span); - if (!pd) { - ret = -EINVAL; + ret = em_create_pd(dev, nr_states, cb, cpus); + if (ret) goto unlock; - } - for_each_cpu(cpu, span) { - /* - * The per-cpu array can be read concurrently from em_cpu_get(). - * The barrier enforces the ordering needed to make sure readers - * can only access well formed em_perf_domain structs. - */ - smp_store_release(per_cpu_ptr(&em_data, cpu), pd); - } + em_debug_create_pd(dev); + dev_info(dev, "EM: created perf domain\n"); - pr_debug("Created perf domain %*pbl\n", cpumask_pr_args(span)); unlock: mutex_unlock(&em_pd_mutex); - return ret; } EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); @@ -285,3 +346,32 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, return em_dev_register_perf_domain(cpu_dev, nr_states, cb, span); } EXPORT_SYMBOL_GPL(em_register_perf_domain); + +/** + * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device + * @dev : Device for which the EM is registered + * + * Unregister the EM for the specified @dev (but not a CPU device). + */ +void em_dev_unregister_perf_domain(struct device *dev) +{ + if (IS_ERR_OR_NULL(dev) || !dev->em_pd) + return; + + if (_is_cpu_device(dev)) + return; + + /* + * The mutex separates all register/unregister requests and protects + * from potential clean-up/setup issues in the debugfs directories. + * The debugfs directory name is the same as device's name. + */ + mutex_lock(&em_pd_mutex); + em_debug_remove_pd(dev); + + kfree(dev->em_pd->table); + kfree(dev->em_pd); + dev->em_pd = NULL; + mutex_unlock(&em_pd_mutex); +} +EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain); -- cgit v1.2.3 From 07891f15d91317b2220a0b610a2d7e324a88105d Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 27 May 2020 10:58:51 +0100 Subject: PM / EM: remove em_register_perf_domain Remove old function em_register_perf_domain which is no longer needed. There is em_dev_register_perf_domain that covers old use cases and new as well. Acked-by: Daniel Lezcano Acked-by: Quentin Perret Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 7 ------- kernel/power/energy_model.c | 25 ------------------------- 2 files changed, 32 deletions(-) (limited to 'kernel') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 2d4689964029..0f94e871a202 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -78,8 +78,6 @@ struct em_data_callback { struct em_perf_domain *em_cpu_get(int cpu); struct em_perf_domain *em_pd_get(struct device *dev); -int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, - struct em_data_callback *cb); int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, struct em_data_callback *cb, cpumask_t *span); void em_dev_unregister_perf_domain(struct device *dev); @@ -181,11 +179,6 @@ static inline int em_pd_nr_perf_states(struct em_perf_domain *pd) struct em_data_callback {}; #define EM_DATA_CB(_active_power_cb) { } -static inline int em_register_perf_domain(cpumask_t *span, - unsigned int nr_states, struct em_data_callback *cb) -{ - return -EINVAL; -} static inline int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, struct em_data_callback *cb, cpumask_t *span) diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 32d76e78f992..c1ff7fa030ab 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -322,31 +322,6 @@ unlock: } EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); -/** - * em_register_perf_domain() - Register the Energy Model of a performance domain - * @span : Mask of CPUs in the performance domain - * @nr_states : Number of capacity states to register - * @cb : Callback functions providing the data of the Energy Model - * - * Create Energy Model tables for a performance domain using the callbacks - * defined in cb. - * - * If multiple clients register the same performance domain, all but the first - * registration will be ignored. - * - * Return 0 on success - */ -int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, - struct em_data_callback *cb) -{ - struct device *cpu_dev; - - cpu_dev = get_cpu_device(cpumask_first(span)); - - return em_dev_register_perf_domain(cpu_dev, nr_states, cb, span); -} -EXPORT_SYMBOL_GPL(em_register_perf_domain); - /** * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device * @dev : Device for which the EM is registered -- cgit v1.2.3 From f0b5694791ce70dba16758c3b838d5ddc7731b02 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 27 May 2020 10:58:52 +0100 Subject: PM / EM: change name of em_pd_energy to em_cpu_energy Energy Model framework now supports other devices than CPUs. Refactor some of the functions in order to prevent wrong usage. The old function em_pd_energy has to generic name. It must not be used without proper cpumask pointer, which is possible only for CPU devices. Thus, rename it and add proper description to warn of potential wrong usage for other devices. Acked-by: Daniel Lezcano Acked-by: Quentin Perret Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 11 ++++++++--- kernel/sched/fair.c | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 0f94e871a202..b67a51c574b9 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -83,15 +83,20 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, void em_dev_unregister_perf_domain(struct device *dev); /** - * em_pd_energy() - Estimates the energy consumed by the CPUs of a perf. domain + * em_cpu_energy() - Estimates the energy consumed by the CPUs of a + performance domain * @pd : performance domain for which energy has to be estimated * @max_util : highest utilization among CPUs of the domain * @sum_util : sum of the utilization of all CPUs in the domain * + * This function must be used only for CPU devices. There is no validation, + * i.e. if the EM is a CPU type and has cpumask allocated. It is called from + * the scheduler code quite frequently and that is why there is not checks. + * * Return: the sum of the energy consumed by the CPUs of the domain assuming * a capacity state satisfying the max utilization of the domain. */ -static inline unsigned long em_pd_energy(struct em_perf_domain *pd, +static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, unsigned long max_util, unsigned long sum_util) { unsigned long freq, scale_cpu; @@ -196,7 +201,7 @@ static inline struct em_perf_domain *em_pd_get(struct device *dev) { return NULL; } -static inline unsigned long em_pd_energy(struct em_perf_domain *pd, +static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, unsigned long max_util, unsigned long sum_util) { return 0; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cbcb2f71599b..6da601c8d383 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6497,7 +6497,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) max_util = max(max_util, cpu_util); } - return em_pd_energy(pd->em_pd, max_util, sum_util); + return em_cpu_energy(pd->em_pd, max_util, sum_util); } /* -- cgit v1.2.3 From c06b0229579806f5b5e14af64cf9c5dc771445b3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 16:08:07 -0700 Subject: bpf: Support 'X' in bpf_seq_printf() helper 'X' tells kernel to print hex with upper case letters. /proc/net/tcp{4,6} seq_file show() used this, and supports it in bpf_seq_printf() helper too. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200623230807.3988014-1-yhs@fb.com --- kernel/trace/bpf_trace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a3ac7de98baa..b44505f56b6d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -681,7 +681,8 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, } if (fmt[i] != 'i' && fmt[i] != 'd' && - fmt[i] != 'u' && fmt[i] != 'x') { + fmt[i] != 'u' && fmt[i] != 'x' && + fmt[i] != 'X') { err = -EINVAL; goto out; } -- cgit v1.2.3 From 72e2b2b66f9c1225e51fc4a1c1e8512959195b76 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 16:08:08 -0700 Subject: bpf: Allow tracing programs to use bpf_jiffies64() helper /proc/net/tcp{4,6} uses jiffies for various computations. Let us add bpf_jiffies64() helper to tracing program so bpf_iter and other programs can use it. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200623230808.3988073-1-yhs@fb.com --- kernel/trace/bpf_trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b44505f56b6d..0159f12d2af5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1135,6 +1135,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; + case BPF_FUNC_jiffies64: + return &bpf_jiffies64_proto; default: return NULL; } -- cgit v1.2.3 From af7ec13833619e17f03aa73a785a2f871da6d66b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 16:08:09 -0700 Subject: bpf: Add bpf_skc_to_tcp6_sock() helper The helper is used in tracing programs to cast a socket pointer to a tcp6_sock pointer. The return value could be NULL if the casting is illegal. A new helper return type RET_PTR_TO_BTF_ID_OR_NULL is added so the verifier is able to deduce proper return types for the helper. Different from the previous BTF_ID based helpers, the bpf_skc_to_tcp6_sock() argument can be several possible btf_ids. More specifically, all possible socket data structures with sock_common appearing in the first in the memory layout. This patch only added socket types related to tcp and udp. All possible argument btf_id and return value btf_id for helper bpf_skc_to_tcp6_sock() are pre-calculcated and cached. In the future, it is even possible to precompute these btf_id's at kernel build time. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200623230809.3988195-1-yhs@fb.com --- include/linux/bpf.h | 12 +++++++ include/uapi/linux/bpf.h | 9 ++++- kernel/bpf/btf.c | 1 + kernel/bpf/verifier.c | 43 ++++++++++++++++------ kernel/trace/bpf_trace.c | 2 ++ net/core/filter.c | 82 ++++++++++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 9 ++++- 8 files changed, 148 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1e1501ee53ce..c0e38ad07848 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -265,6 +265,7 @@ enum bpf_return_type { RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ + RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -287,6 +288,12 @@ struct bpf_func_proto { enum bpf_arg_type arg_type[5]; }; int *btf_id; /* BTF ids of arguments */ + bool (*check_btf_id)(u32 btf_id, u32 arg); /* if the argument btf_id is + * valid. Often used if more + * than one btf id is permitted + * for this argument. + */ + int *ret_btf_id; /* return value btf_id */ }; /* bpf_context is intentionally undefined structure. Pointer to bpf_context is @@ -1524,6 +1531,7 @@ static inline bool bpf_map_is_dev_bound(struct bpf_map *map) struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr); void bpf_map_offload_map_free(struct bpf_map *map); +void init_btf_sock_ids(struct btf *btf); #else static inline int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) @@ -1549,6 +1557,9 @@ static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) static inline void bpf_map_offload_map_free(struct bpf_map *map) { } +static inline void init_btf_sock_ids(struct btf *btf) +{ +} #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ #if defined(CONFIG_BPF_STREAM_PARSER) @@ -1638,6 +1649,7 @@ extern const struct bpf_func_proto bpf_ringbuf_reserve_proto; extern const struct bpf_func_proto bpf_ringbuf_submit_proto; extern const struct bpf_func_proto bpf_ringbuf_discard_proto; extern const struct bpf_func_proto bpf_ringbuf_query_proto; +extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d9737d51dd19..e90ad07b291a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3255,6 +3255,12 @@ union bpf_attr { * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level * is returned or the error code -EACCES in case the skb is not * subject to CHECKSUM_UNNECESSARY. + * + * struct tcp6_sock *bpf_skc_to_tcp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3392,7 +3398,8 @@ union bpf_attr { FN(ringbuf_submit), \ FN(ringbuf_discard), \ FN(ringbuf_query), \ - FN(csum_level), + FN(csum_level), \ + FN(skc_to_tcp6_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index e377d1981730..4c3007f428b1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3674,6 +3674,7 @@ struct btf *btf_parse_vmlinux(void) goto errout; bpf_struct_ops_init(btf, log); + init_btf_sock_ids(btf); btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7460f967cb75..7de98906ddf4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3800,12 +3800,14 @@ static int int_ptr_type_to_size(enum bpf_arg_type type) return -EINVAL; } -static int check_func_arg(struct bpf_verifier_env *env, u32 regno, - enum bpf_arg_type arg_type, - struct bpf_call_arg_meta *meta) +static int check_func_arg(struct bpf_verifier_env *env, u32 arg, + struct bpf_call_arg_meta *meta, + const struct bpf_func_proto *fn) { + u32 regno = BPF_REG_1 + arg; struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; enum bpf_reg_type expected_type, type = reg->type; + enum bpf_arg_type arg_type = fn->arg_type[arg]; int err = 0; if (arg_type == ARG_DONTCARE) @@ -3885,9 +3887,16 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_BTF_ID; if (type != expected_type) goto err_type; - if (reg->btf_id != meta->btf_id) { - verbose(env, "Helper has type %s got %s in R%d\n", - kernel_type_name(meta->btf_id), + if (!fn->check_btf_id) { + if (reg->btf_id != meta->btf_id) { + verbose(env, "Helper has type %s got %s in R%d\n", + kernel_type_name(meta->btf_id), + kernel_type_name(reg->btf_id), regno); + + return -EACCES; + } + } else if (!fn->check_btf_id(reg->btf_id, arg)) { + verbose(env, "Helper does not support %s in R%d\n", kernel_type_name(reg->btf_id), regno); return -EACCES; @@ -4709,10 +4718,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn meta.func_id = func_id; /* check args */ for (i = 0; i < 5; i++) { - err = btf_resolve_helper_id(&env->log, fn, i); - if (err > 0) - meta.btf_id = err; - err = check_func_arg(env, BPF_REG_1 + i, fn->arg_type[i], &meta); + if (!fn->check_btf_id) { + err = btf_resolve_helper_id(&env->log, fn, i); + if (err > 0) + meta.btf_id = err; + } + err = check_func_arg(env, i, &meta, fn); if (err) return err; } @@ -4815,6 +4826,18 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; regs[BPF_REG_0].mem_size = meta.mem_size; + } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { + int ret_btf_id; + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; + ret_btf_id = *fn->ret_btf_id; + if (ret_btf_id == 0) { + verbose(env, "invalid return type %d of func %s#%d\n", + fn->ret_type, func_id_name(func_id), func_id); + return -EINVAL; + } + regs[BPF_REG_0].btf_id = ret_btf_id; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0159f12d2af5..2a97a268f533 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1515,6 +1515,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_output_proto; case BPF_FUNC_xdp_output: return &bpf_xdp_output_proto; + case BPF_FUNC_skc_to_tcp6_sock: + return &bpf_skc_to_tcp6_sock_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? diff --git a/net/core/filter.c b/net/core/filter.c index c713b6b8938f..176e27d75c51 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -9225,3 +9226,84 @@ void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) { bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog); } + +/* Define a list of socket types which can be the argument for + * skc_to_*_sock() helpers. All these sockets should have + * sock_common as the first argument in its memory layout. + */ +#define BTF_SOCK_TYPE_xxx \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET, "inet_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_CONN, "inet_connection_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_REQ, "inet_request_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_TW, "inet_timewait_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_REQ, "request_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK, "sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK_COMMON, "sock_common") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP, "tcp_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_REQ, "tcp_request_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_TW, "tcp_timewait_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, "tcp6_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, "udp_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, "udp6_sock") + +enum { +#define BTF_SOCK_TYPE(name, str) name, +BTF_SOCK_TYPE_xxx +#undef BTF_SOCK_TYPE +MAX_BTF_SOCK_TYPE, +}; + +static int btf_sock_ids[MAX_BTF_SOCK_TYPE]; + +#ifdef CONFIG_BPF_SYSCALL +static const char *bpf_sock_types[] = { +#define BTF_SOCK_TYPE(name, str) str, +BTF_SOCK_TYPE_xxx +#undef BTF_SOCK_TYPE +}; + +void init_btf_sock_ids(struct btf *btf) +{ + int i, btf_id; + + for (i = 0; i < MAX_BTF_SOCK_TYPE; i++) { + btf_id = btf_find_by_name_kind(btf, bpf_sock_types[i], + BTF_KIND_STRUCT); + if (btf_id > 0) + btf_sock_ids[i] = btf_id; + } +} +#endif + +static bool check_arg_btf_id(u32 btf_id, u32 arg) +{ + int i; + + /* only one argument, no need to check arg */ + for (i = 0; i < MAX_BTF_SOCK_TYPE; i++) + if (btf_sock_ids[i] == btf_id) + return true; + return false; +} + +BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) +{ + /* tcp6_sock type is not generated in dwarf and hence btf, + * trigger an explicit type generation here. + */ + BTF_TYPE_EMIT(struct tcp6_sock); + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && + sk->sk_family == AF_INET6) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { + .func = bpf_skc_to_tcp6_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6], +}; diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 91fa668fa860..6c2f64118651 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -421,6 +421,7 @@ class PrinterHelpers(Printer): 'struct sockaddr', 'struct tcphdr', 'struct seq_file', + 'struct tcp6_sock', 'struct __sk_buff', 'struct sk_msg_md', @@ -458,6 +459,7 @@ class PrinterHelpers(Printer): 'struct sockaddr', 'struct tcphdr', 'struct seq_file', + 'struct tcp6_sock', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d9737d51dd19..e90ad07b291a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3255,6 +3255,12 @@ union bpf_attr { * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level * is returned or the error code -EACCES in case the skb is not * subject to CHECKSUM_UNNECESSARY. + * + * struct tcp6_sock *bpf_skc_to_tcp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3392,7 +3398,8 @@ union bpf_attr { FN(ringbuf_submit), \ FN(ringbuf_discard), \ FN(ringbuf_query), \ - FN(csum_level), + FN(csum_level), \ + FN(skc_to_tcp6_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 478cfbdf5f13dfe09cfd0b1cbac821f5e27f6108 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 16:08:11 -0700 Subject: bpf: Add bpf_skc_to_{tcp, tcp_timewait, tcp_request}_sock() helpers Three more helpers are added to cast a sock_common pointer to an tcp_sock, tcp_timewait_sock or a tcp_request_sock for tracing programs. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200623230811.3988277-1-yhs@fb.com --- include/linux/bpf.h | 3 ++ include/uapi/linux/bpf.h | 23 +++++++++++++++- kernel/trace/bpf_trace.c | 6 ++++ net/core/filter.c | 62 ++++++++++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 6 ++++ tools/include/uapi/linux/bpf.h | 23 +++++++++++++++- 6 files changed, 121 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c0e38ad07848..c23998cf6699 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1650,6 +1650,9 @@ extern const struct bpf_func_proto bpf_ringbuf_submit_proto; extern const struct bpf_func_proto bpf_ringbuf_discard_proto; extern const struct bpf_func_proto bpf_ringbuf_query_proto; extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto; +extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; +extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; +extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e90ad07b291a..b9412ab275f3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3261,6 +3261,24 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3399,7 +3417,10 @@ union bpf_attr { FN(ringbuf_discard), \ FN(ringbuf_query), \ FN(csum_level), \ - FN(skc_to_tcp6_sock), + FN(skc_to_tcp6_sock), \ + FN(skc_to_tcp_sock), \ + FN(skc_to_tcp_timewait_sock), \ + FN(skc_to_tcp_request_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 2a97a268f533..48d935b0d87c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1517,6 +1517,12 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_output_proto; case BPF_FUNC_skc_to_tcp6_sock: return &bpf_skc_to_tcp6_sock_proto; + case BPF_FUNC_skc_to_tcp_sock: + return &bpf_skc_to_tcp_sock_proto; + case BPF_FUNC_skc_to_tcp_timewait_sock: + return &bpf_skc_to_tcp_timewait_sock_proto; + case BPF_FUNC_skc_to_tcp_request_sock: + return &bpf_skc_to_tcp_request_sock_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? diff --git a/net/core/filter.c b/net/core/filter.c index 176e27d75c51..0b4e5aed7e20 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -74,6 +74,7 @@ #include #include #include +#include /** * sk_filter_trim_cap - run a packet through a socket filter @@ -9307,3 +9308,64 @@ const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { .check_btf_id = check_arg_btf_id, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6], }; + +BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk) +{ + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = { + .func = bpf_skc_to_tcp_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP], +}; + +BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk) +{ + if (sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT) + return (unsigned long)sk; + +#if IS_BUILTIN(CONFIG_IPV6) + if (sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT) + return (unsigned long)sk; +#endif + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = { + .func = bpf_skc_to_tcp_timewait_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW], +}; + +BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk) +{ + if (sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV) + return (unsigned long)sk; + +#if IS_BUILTIN(CONFIG_IPV6) + if (sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV) + return (unsigned long)sk; +#endif + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = { + .func = bpf_skc_to_tcp_request_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ], +}; diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 6c2f64118651..d886657c6aaa 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -422,6 +422,9 @@ class PrinterHelpers(Printer): 'struct tcphdr', 'struct seq_file', 'struct tcp6_sock', + 'struct tcp_sock', + 'struct tcp_timewait_sock', + 'struct tcp_request_sock', 'struct __sk_buff', 'struct sk_msg_md', @@ -460,6 +463,9 @@ class PrinterHelpers(Printer): 'struct tcphdr', 'struct seq_file', 'struct tcp6_sock', + 'struct tcp_sock', + 'struct tcp_timewait_sock', + 'struct tcp_request_sock', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e90ad07b291a..b9412ab275f3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3261,6 +3261,24 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3399,7 +3417,10 @@ union bpf_attr { FN(ringbuf_discard), \ FN(ringbuf_query), \ FN(csum_level), \ - FN(skc_to_tcp6_sock), + FN(skc_to_tcp6_sock), \ + FN(skc_to_tcp_sock), \ + FN(skc_to_tcp_timewait_sock), \ + FN(skc_to_tcp_request_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 0d4fad3e57df2bf61e8ffc8d12a34b1caf9b8835 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 16:08:15 -0700 Subject: bpf: Add bpf_skc_to_udp6_sock() helper The helper is used in tracing programs to cast a socket pointer to a udp6_sock pointer. The return value could be NULL if the casting is illegal. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Cc: Eric Dumazet Link: https://lore.kernel.org/bpf/20200623230815.3988481-1-yhs@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 9 ++++++++- kernel/trace/bpf_trace.c | 2 ++ net/core/filter.c | 22 ++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 9 ++++++++- 6 files changed, 43 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c23998cf6699..3d2ade703a35 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1653,6 +1653,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; +extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b9412ab275f3..0cb8ec948816 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3279,6 +3279,12 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3420,7 +3426,8 @@ union bpf_attr { FN(skc_to_tcp6_sock), \ FN(skc_to_tcp_sock), \ FN(skc_to_tcp_timewait_sock), \ - FN(skc_to_tcp_request_sock), + FN(skc_to_tcp_request_sock), \ + FN(skc_to_udp6_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 48d935b0d87c..5d59dda5f661 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1523,6 +1523,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_to_tcp_timewait_sock_proto; case BPF_FUNC_skc_to_tcp_request_sock: return &bpf_skc_to_tcp_request_sock_proto; + case BPF_FUNC_skc_to_udp6_sock: + return &bpf_skc_to_udp6_sock_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? diff --git a/net/core/filter.c b/net/core/filter.c index 0b4e5aed7e20..c796e141ea8e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9369,3 +9369,25 @@ const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = { .check_btf_id = check_arg_btf_id, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ], }; + +BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk) +{ + /* udp6_sock type is not generated in dwarf and hence btf, + * trigger an explicit type generation here. + */ + BTF_TYPE_EMIT(struct udp6_sock); + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_UDP && + sk->sk_type == SOCK_DGRAM && sk->sk_family == AF_INET6) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = { + .func = bpf_skc_to_udp6_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6], +}; diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index d886657c6aaa..6bab40ff442e 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -425,6 +425,7 @@ class PrinterHelpers(Printer): 'struct tcp_sock', 'struct tcp_timewait_sock', 'struct tcp_request_sock', + 'struct udp6_sock', 'struct __sk_buff', 'struct sk_msg_md', @@ -466,6 +467,7 @@ class PrinterHelpers(Printer): 'struct tcp_sock', 'struct tcp_timewait_sock', 'struct tcp_request_sock', + 'struct udp6_sock', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b9412ab275f3..0cb8ec948816 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3279,6 +3279,12 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3420,7 +3426,8 @@ union bpf_attr { FN(skc_to_tcp6_sock), \ FN(skc_to_tcp_sock), \ FN(skc_to_tcp_timewait_sock), \ - FN(skc_to_tcp_request_sock), + FN(skc_to_tcp_request_sock), \ + FN(skc_to_udp6_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 590d69796346353878b275c5512c664e3f875f24 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 19 Dec 2019 16:44:52 -0500 Subject: sched: Force the address order of each sched class descriptor In order to make a micro optimization in pick_next_task(), the order of the sched class descriptor address must be in the same order as their priority to each other. That is: &idle_sched_class < &fair_sched_class < &rt_sched_class < &dl_sched_class < &stop_sched_class In order to guarantee this order of the sched class descriptors, add each one into their own data section and force the order in the linker script. Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/157675913272.349305.8936736338884044103.stgit@localhost.localdomain --- include/asm-generic/vmlinux.lds.h | 13 +++++++++++++ kernel/sched/deadline.c | 3 ++- kernel/sched/fair.c | 3 ++- kernel/sched/idle.c | 3 ++- kernel/sched/rt.c | 3 ++- kernel/sched/stop_task.c | 3 ++- 6 files changed, 23 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index db600ef218d7..2186d7b01af6 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -108,6 +108,18 @@ #define SBSS_MAIN .sbss #endif +/* + * The order of the sched class addresses are important, as they are + * used to determine the order of the priority of each sched class in + * relation to each other. + */ +#define SCHED_DATA \ + *(__idle_sched_class) \ + *(__fair_sched_class) \ + *(__rt_sched_class) \ + *(__dl_sched_class) \ + *(__stop_sched_class) + /* * Align to a 32 byte boundary equal to the * alignment gcc 4.5 uses for a struct @@ -388,6 +400,7 @@ .rodata : AT(ADDR(.rodata) - LOAD_OFFSET) { \ __start_rodata = .; \ *(.rodata) *(.rodata.*) \ + SCHED_DATA \ RO_AFTER_INIT_DATA /* Read only after init */ \ . = ALIGN(8); \ __start___tracepoints_ptrs = .; \ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d4708e29008f..d9e79462993b 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2479,7 +2479,8 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, } } -const struct sched_class dl_sched_class = { +const struct sched_class dl_sched_class + __attribute__((section("__dl_sched_class"))) = { .next = &rt_sched_class, .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0424a0af5f87..3365f6b07c36 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11122,7 +11122,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task /* * All the scheduling class methods: */ -const struct sched_class fair_sched_class = { +const struct sched_class fair_sched_class + __attribute__((section("__fair_sched_class"))) = { .next = &idle_sched_class, .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8d75ca201484..f5806295356b 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -453,7 +453,8 @@ static void update_curr_idle(struct rq *rq) /* * Simple, special scheduling class for the per-CPU idle tasks: */ -const struct sched_class idle_sched_class = { +const struct sched_class idle_sched_class + __attribute__((section("__idle_sched_class"))) = { /* .next is NULL */ /* no enqueue/yield_task for idle tasks */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f395ddb75f38..6543d4430331 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2429,7 +2429,8 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) return 0; } -const struct sched_class rt_sched_class = { +const struct sched_class rt_sched_class + __attribute__((section("__rt_sched_class"))) = { .next = &fair_sched_class, .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 3e50a6a8f1e5..f4bbd54caae0 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -109,7 +109,8 @@ static void update_curr_stop(struct rq *rq) /* * Simple, special scheduling class for the per-CPU stop tasks: */ -const struct sched_class stop_sched_class = { +const struct sched_class stop_sched_class + __attribute__((section("__stop_sched_class"))) = { .next = &dl_sched_class, .enqueue_task = enqueue_task_stop, -- cgit v1.2.3 From c3a340f7e7eadac7662ab104ceb16432e5a4c6b2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 19 Dec 2019 16:44:53 -0500 Subject: sched: Have sched_class_highest define by vmlinux.lds.h Now that the sched_class descriptors are defined by the linker script, and this needs to be aware of the existance of stop_sched_class when SMP is enabled or not, as it is used as the "highest" priority when defined. Move the declaration of sched_class_highest to the same location in the linker script that inserts stop_sched_class, and this will also make it easier to see what should be defined as the highest class, as this linker script location defines the priorities as well. Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191219214558.682913590@goodmis.org --- include/asm-generic/vmlinux.lds.h | 5 ++++- kernel/sched/core.c | 8 ++++++++ kernel/sched/sched.h | 17 +++++++++-------- 3 files changed, 21 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 2186d7b01af6..66fb84c3dc7e 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -114,11 +114,14 @@ * relation to each other. */ #define SCHED_DATA \ + STRUCT_ALIGN(); \ + __begin_sched_classes = .; \ *(__idle_sched_class) \ *(__fair_sched_class) \ *(__rt_sched_class) \ *(__dl_sched_class) \ - *(__stop_sched_class) + *(__stop_sched_class) \ + __end_sched_classes = .; /* * Align to a 32 byte boundary equal to the diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0208b71bef80..81640fe0eae8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6646,6 +6646,14 @@ void __init sched_init(void) unsigned long ptr = 0; int i; + /* Make sure the linker didn't screw up */ + BUG_ON(&idle_sched_class + 1 != &fair_sched_class || + &fair_sched_class + 1 != &rt_sched_class || + &rt_sched_class + 1 != &dl_sched_class); +#ifdef CONFIG_SMP + BUG_ON(&dl_sched_class + 1 != &stop_sched_class); +#endif + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 336887607b3d..4165c06d1d7b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1811,7 +1811,7 @@ struct sched_class { #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_change_group)(struct task_struct *p, int type); #endif -}; +} __aligned(32); /* STRUCT_ALIGN(), vmlinux.lds.h */ static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { @@ -1825,17 +1825,18 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) next->sched_class->set_next_task(rq, next, false); } -#ifdef CONFIG_SMP -#define sched_class_highest (&stop_sched_class) -#else -#define sched_class_highest (&dl_sched_class) -#endif +/* Defined in include/asm-generic/vmlinux.lds.h */ +extern struct sched_class __begin_sched_classes[]; +extern struct sched_class __end_sched_classes[]; + +#define sched_class_highest (__end_sched_classes - 1) +#define sched_class_lowest (__begin_sched_classes - 1) #define for_class_range(class, _from, _to) \ - for (class = (_from); class != (_to); class = class->next) + for (class = (_from); class != (_to); class--) #define for_each_class(class) \ - for_class_range(class, sched_class_highest, NULL) + for_class_range(class, sched_class_highest, sched_class_lowest) extern const struct sched_class stop_sched_class; extern const struct sched_class dl_sched_class; -- cgit v1.2.3 From a87e749e8fa1aaef9b4db32e21c2795e69ce67bf Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 19 Dec 2019 16:44:54 -0500 Subject: sched: Remove struct sched_class::next field Now that the sched_class descriptors are defined in order via the linker script vmlinux.lds.h, there's no reason to have a "next" pointer to the previous priroity structure. The order of the sturctures can be aligned as an array, and used to index and find the next sched_class descriptor. Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20191219214558.845353593@goodmis.org --- kernel/sched/deadline.c | 1 - kernel/sched/fair.c | 1 - kernel/sched/idle.c | 1 - kernel/sched/rt.c | 1 - kernel/sched/sched.h | 1 - kernel/sched/stop_task.c | 1 - 6 files changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d9e79462993b..c9cc1d6fa363 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2481,7 +2481,6 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, const struct sched_class dl_sched_class __attribute__((section("__dl_sched_class"))) = { - .next = &rt_sched_class, .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3365f6b07c36..a63f400013de 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11124,7 +11124,6 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task */ const struct sched_class fair_sched_class __attribute__((section("__fair_sched_class"))) = { - .next = &idle_sched_class, .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index f5806295356b..336d478bddc8 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -455,7 +455,6 @@ static void update_curr_idle(struct rq *rq) */ const struct sched_class idle_sched_class __attribute__((section("__idle_sched_class"))) = { - /* .next is NULL */ /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 6543d4430331..f215eea6a966 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2431,7 +2431,6 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) const struct sched_class rt_sched_class __attribute__((section("__rt_sched_class"))) = { - .next = &fair_sched_class, .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4165c06d1d7b..549e7e6e0a66 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1754,7 +1754,6 @@ extern const u32 sched_prio_to_wmult[40]; #define RETRY_TASK ((void *)-1UL) struct sched_class { - const struct sched_class *next; #ifdef CONFIG_UCLAMP_TASK int uclamp_enabled; diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index f4bbd54caae0..394bc8126a1e 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -111,7 +111,6 @@ static void update_curr_stop(struct rq *rq) */ const struct sched_class stop_sched_class __attribute__((section("__stop_sched_class"))) = { - .next = &dl_sched_class, .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, -- cgit v1.2.3 From aa93cd53bc1b91b5f99c7b55e3dcc1ac98e99558 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 19 Dec 2019 16:44:55 -0500 Subject: sched: Micro optimization in pick_next_task() and in check_preempt_curr() This introduces an optimization based on xxx_sched_class addresses in two hot scheduler functions: pick_next_task() and check_preempt_curr(). It is possible to compare pointers to sched classes to check, which of them has a higher priority, instead of current iterations using for_each_class(). One more result of the patch is that size of object file becomes a little less (excluding added BUG_ON(), which goes in __init section): $size kernel/sched/core.o text data bss dec hex filename before: 66446 18957 676 86079 1503f kernel/sched/core.o after: 66398 18957 676 86031 1500f kernel/sched/core.o Signed-off-by: Kirill Tkhai Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/711a9c4b-ff32-1136-b848-17c622d548f3@yandex.ru --- kernel/sched/core.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 81640fe0eae8..0a08d0c235d4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1412,20 +1412,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { - const struct sched_class *class; - - if (p->sched_class == rq->curr->sched_class) { + if (p->sched_class == rq->curr->sched_class) rq->curr->sched_class->check_preempt_curr(rq, p, flags); - } else { - for_each_class(class) { - if (class == rq->curr->sched_class) - break; - if (class == p->sched_class) { - resched_curr(rq); - break; - } - } - } + else if (p->sched_class > rq->curr->sched_class) + resched_curr(rq); /* * A queue event has occurred, and we're going to schedule. In @@ -4003,8 +3993,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * higher scheduling class, because otherwise those loose the * opportunity to pull in more work from other CPUs. */ - if (likely((prev->sched_class == &idle_sched_class || - prev->sched_class == &fair_sched_class) && + if (likely(prev->sched_class <= &fair_sched_class && rq->nr_running == rq->cfs.h_nr_running)) { p = pick_next_task_fair(rq, prev, rf); -- cgit v1.2.3 From 423d02e1463b21109106f52d94f7396b63731f3b Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Tue, 16 Jun 2020 14:04:07 +0800 Subject: sched/fair: Optimize dequeue_task_fair() While looking at enqueue_task_fair and dequeue_task_fair, it occurred to me that dequeue_task_fair can also be optimized as Vincent described in commit 7d148be69e3a ("sched/fair: Optimize enqueue_task_fair()"). When encountering throttled cfs_rq, dequeue_throttle label can ensure se not to be NULL, and rq->nr_running remains unchanged, so we can also skip the early balance check. Signed-off-by: Peng Wang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/701eef9a40de93dcf5fe7063fd607bca5db38e05.1592287263.git.rocking@linux.alibaba.com --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a63f400013de..b9b9f19e80c1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5624,14 +5624,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } -dequeue_throttle: - if (!se) - sub_nr_running(rq, 1); + /* At this point se is NULL and we are at root level*/ + sub_nr_running(rq, 1); /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) rq->next_balance = jiffies; +dequeue_throttle: util_est_dequeue(&rq->cfs, p, task_sleep); hrtick_update(rq); } -- cgit v1.2.3 From bba18a1af33e0f0a1ad7d028208b306ef3f3df12 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 18 Jun 2020 19:47:50 +0300 Subject: console: Propagate error code from console ->setup() Since console ->setup() hook returns meaningful error codes, propagate it to the caller of try_enable_new_console(). Signed-off-by: Andy Shevchenko Reviewed-by: Petr Mladek Acked-by: Benjamin Herrenschmidt Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200618164751.56828-6-andriy.shevchenko@linux.intel.com --- kernel/printk/printk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 8c14835be46c..aaea3ad182e1 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2668,7 +2668,7 @@ early_param("keep_bootcon", keep_bootcon_setup); static int try_enable_new_console(struct console *newcon, bool user_specified) { struct console_cmdline *c; - int i; + int i, err; for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && c->name[0]; @@ -2691,8 +2691,8 @@ static int try_enable_new_console(struct console *newcon, bool user_specified) return 0; if (newcon->setup && - newcon->setup(newcon, c->options) != 0) - return -EIO; + (err = newcon->setup(newcon, c->options)) != 0) + return err; } newcon->flags |= CON_ENABLED; if (i == preferred_console) { -- cgit v1.2.3 From 504603767ab639c47912be08d62e02b43125e82e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 18 Jun 2020 19:47:51 +0300 Subject: console: Fix trivia typo 'change' -> 'chance' I bet the word 'chance' has to be used in 'had a chance to be called', but, alas, I'm not native speaker... Signed-off-by: Andy Shevchenko Reviewed-by: Petr Mladek Acked-by: Benjamin Herrenschmidt Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200618164751.56828-7-andriy.shevchenko@linux.intel.com --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index aaea3ad182e1..6623e975675a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2705,7 +2705,7 @@ static int try_enable_new_console(struct console *newcon, bool user_specified) /* * Some consoles, such as pstore and netconsole, can be enabled even * without matching. Accept the pre-enabled consoles only when match() - * and setup() had a change to be called. + * and setup() had a chance to be called. */ if (newcon->flags & CON_ENABLED && c->user_specified == user_specified) return 0; -- cgit v1.2.3 From f3bdc62fd82ed93dbe4d049eacba310de7eb2a6a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 17 Jun 2020 15:58:23 +0200 Subject: blktrace: Provide event for request merging Currently blk-mq does not report any event when two requests get merged in the elevator. This then results in difficult to understand sequence of events like: ... 8,0 34 1579 0.608765271 2718 I WS 215023504 + 40 [dbench] 8,0 34 1584 0.609184613 2719 A WS 215023544 + 56 <- (8,4) 2160568 8,0 34 1585 0.609184850 2719 Q WS 215023544 + 56 [dbench] 8,0 34 1586 0.609188524 2719 G WS 215023544 + 56 [dbench] 8,0 3 602 0.609684162 773 D WS 215023504 + 96 [kworker/3:1H] 8,0 34 1591 0.609843593 0 C WS 215023504 + 96 [0] and you can only guess (after quite some headscratching since the above excerpt is intermixed with a lot of other IO) that request 215023544+56 got merged to request 215023504+40. Provide proper event for request merging like we used to do in the legacy block layer. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 ++ include/trace/events/block.h | 15 +++++++++++++++ kernel/trace/blktrace.c | 10 ++++++++++ 3 files changed, 27 insertions(+) (limited to 'kernel') diff --git a/block/blk-merge.c b/block/blk-merge.c index f0b0bae075a0..9c9fb21584b6 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -793,6 +793,8 @@ static struct request *attempt_merge(struct request_queue *q, */ blk_account_io_merge_request(next); + trace_block_rq_merge(q, next); + /* * ownership of bio passed from next to req, return 'next' for * the caller to free diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 93b114226af8..34d64ca306b1 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -211,6 +211,21 @@ DEFINE_EVENT(block_rq, block_rq_issue, TP_ARGS(q, rq) ); +/** + * block_rq_merge - merge request with another one in the elevator + * @q: queue holding operation + * @rq: block IO operation operation request + * + * Called when block operation request @rq from queue @q is merged to another + * request queued in the elevator. + */ +DEFINE_EVENT(block_rq, block_rq_merge, + + TP_PROTO(struct request_queue *q, struct request *rq), + + TP_ARGS(q, rq) +); + /** * block_bio_bounce - used bounce buffer when processing block operation * @q: queue holding the block operation diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c086c38f4954..7ba62d68885a 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -860,6 +860,13 @@ static void blk_add_trace_rq_issue(void *ignore, blk_trace_request_get_cgid(q, rq)); } +static void blk_add_trace_rq_merge(void *ignore, + struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE, + blk_trace_request_get_cgid(q, rq)); +} + static void blk_add_trace_rq_requeue(void *ignore, struct request_queue *q, struct request *rq) @@ -1144,6 +1151,8 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); WARN_ON(ret); + ret = register_trace_block_rq_merge(blk_add_trace_rq_merge, NULL); + WARN_ON(ret); ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); WARN_ON(ret); ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); @@ -1190,6 +1199,7 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); + unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL); unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); -- cgit v1.2.3 From 985098a05eee6bf5caca7e997e02a5b15242cfa0 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 23 Jun 2020 09:09:10 +0200 Subject: docs: fix references for DMA*.txt files As we moved those files to core-api, fix references to point to their newer locations. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/37b2fd159fbc7655dbf33b3eb1215396a25f6344.1592895969.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/PCI/pci.rst | 6 +++--- Documentation/block/biodoc.rst | 2 +- Documentation/bus-virt-phys-mapping.txt | 2 +- Documentation/core-api/dma-api.rst | 6 +++--- Documentation/core-api/dma-isa-lpc.rst | 2 +- Documentation/driver-api/usb/dma.rst | 6 +++--- Documentation/translations/ko_KR/memory-barriers.txt | 6 +++--- arch/ia64/hp/common/sba_iommu.c | 12 ++++++------ arch/parisc/kernel/pci-dma.c | 2 +- arch/x86/include/asm/dma-mapping.h | 4 ++-- arch/x86/kernel/amd_gart_64.c | 2 +- drivers/parisc/sba_iommu.c | 14 +++++++------- include/linux/dma-mapping.h | 2 +- include/media/videobuf-dma-sg.h | 2 +- kernel/dma/debug.c | 2 +- 15 files changed, 35 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/Documentation/PCI/pci.rst b/Documentation/PCI/pci.rst index 8c016d8c9862..d10d3fe604c5 100644 --- a/Documentation/PCI/pci.rst +++ b/Documentation/PCI/pci.rst @@ -265,7 +265,7 @@ Set the DMA mask size --------------------- .. note:: If anything below doesn't make sense, please refer to - Documentation/DMA-API.txt. This section is just a reminder that + :doc:`/core-api/dma-api`. This section is just a reminder that drivers need to indicate DMA capabilities of the device and is not an authoritative source for DMA interfaces. @@ -291,7 +291,7 @@ Many 64-bit "PCI" devices (before PCI-X) and some PCI-X devices are Setup shared control data ------------------------- Once the DMA masks are set, the driver can allocate "consistent" (a.k.a. shared) -memory. See Documentation/DMA-API.txt for a full description of +memory. See :doc:`/core-api/dma-api` for a full description of the DMA APIs. This section is just a reminder that it needs to be done before enabling DMA on the device. @@ -421,7 +421,7 @@ owners if there is one. Then clean up "consistent" buffers which contain the control data. -See Documentation/DMA-API.txt for details on unmapping interfaces. +See :doc:`/core-api/dma-api` for details on unmapping interfaces. Unregister from other subsystems diff --git a/Documentation/block/biodoc.rst b/Documentation/block/biodoc.rst index b964796ec9c7..ba7f45d0271c 100644 --- a/Documentation/block/biodoc.rst +++ b/Documentation/block/biodoc.rst @@ -196,7 +196,7 @@ a virtual address mapping (unlike the earlier scheme of virtual address do not have a corresponding kernel virtual address space mapping) and low-memory pages. -Note: Please refer to Documentation/DMA-API-HOWTO.txt for a discussion +Note: Please refer to :doc:`/core-api/dma-api-howto` for a discussion on PCI high mem DMA aspects and mapping of scatter gather lists, and support for 64 bit PCI. diff --git a/Documentation/bus-virt-phys-mapping.txt b/Documentation/bus-virt-phys-mapping.txt index 4bb07c2f3e7d..c7bc99cd2e21 100644 --- a/Documentation/bus-virt-phys-mapping.txt +++ b/Documentation/bus-virt-phys-mapping.txt @@ -8,7 +8,7 @@ How to access I/O mapped memory from within device drivers The virt_to_bus() and bus_to_virt() functions have been superseded by the functionality provided by the PCI DMA interface - (see Documentation/DMA-API-HOWTO.txt). They continue + (see :doc:`/core-api/dma-api-howto`). They continue to be documented below for historical purposes, but new code must not use them. --davidm 00/12/12 diff --git a/Documentation/core-api/dma-api.rst b/Documentation/core-api/dma-api.rst index 2d8d2fed7317..63b4a2f20867 100644 --- a/Documentation/core-api/dma-api.rst +++ b/Documentation/core-api/dma-api.rst @@ -5,7 +5,7 @@ Dynamic DMA mapping using the generic device :Author: James E.J. Bottomley This document describes the DMA API. For a more gentle introduction -of the API (and actual examples), see Documentation/DMA-API-HOWTO.txt. +of the API (and actual examples), see :doc:`/core-api/dma-api-howto`. This API is split into two pieces. Part I describes the basic API. Part II describes extensions for supporting non-consistent memory @@ -471,7 +471,7 @@ without the _attrs suffixes, except that they pass an optional dma_attrs. The interpretation of DMA attributes is architecture-specific, and -each attribute should be documented in Documentation/DMA-attributes.txt. +each attribute should be documented in :doc:`/core-api/dma-attributes`. If dma_attrs are 0, the semantics of each of these functions is identical to those of the corresponding function @@ -484,7 +484,7 @@ for DMA:: #include /* DMA_ATTR_FOO should be defined in linux/dma-mapping.h and - * documented in Documentation/DMA-attributes.txt */ + * documented in Documentation/core-api/dma-attributes.rst */ ... unsigned long attr; diff --git a/Documentation/core-api/dma-isa-lpc.rst b/Documentation/core-api/dma-isa-lpc.rst index b1ec7b16c21f..e59a3d35a93d 100644 --- a/Documentation/core-api/dma-isa-lpc.rst +++ b/Documentation/core-api/dma-isa-lpc.rst @@ -17,7 +17,7 @@ To do ISA style DMA you need to include two headers:: #include The first is the generic DMA API used to convert virtual addresses to -bus addresses (see Documentation/DMA-API.txt for details). +bus addresses (see :doc:`/core-api/dma-api` for details). The second contains the routines specific to ISA DMA transfers. Since this is not present on all platforms make sure you construct your diff --git a/Documentation/driver-api/usb/dma.rst b/Documentation/driver-api/usb/dma.rst index 59d5aee89e37..2b3dbd3265b4 100644 --- a/Documentation/driver-api/usb/dma.rst +++ b/Documentation/driver-api/usb/dma.rst @@ -10,7 +10,7 @@ API overview The big picture is that USB drivers can continue to ignore most DMA issues, though they still must provide DMA-ready buffers (see -``Documentation/DMA-API-HOWTO.txt``). That's how they've worked through +:doc:`/core-api/dma-api-howto`). That's how they've worked through the 2.4 (and earlier) kernels, or they can now be DMA-aware. DMA-aware usb drivers: @@ -60,7 +60,7 @@ and effects like cache-trashing can impose subtle penalties. force a consistent memory access ordering by using memory barriers. It's not using a streaming DMA mapping, so it's good for small transfers on systems where the I/O would otherwise thrash an IOMMU mapping. (See - ``Documentation/DMA-API-HOWTO.txt`` for definitions of "coherent" and + :doc:`/core-api/dma-api-howto` for definitions of "coherent" and "streaming" DMA mappings.) Asking for 1/Nth of a page (as well as asking for N pages) is reasonably @@ -91,7 +91,7 @@ Working with existing buffers Existing buffers aren't usable for DMA without first being mapped into the DMA address space of the device. However, most buffers passed to your driver can safely be used with such DMA mapping. (See the first section -of Documentation/DMA-API-HOWTO.txt, titled "What memory is DMA-able?") +of :doc:`/core-api/dma-api-howto`, titled "What memory is DMA-able?") - When you're using scatterlists, you can map everything at once. On some systems, this kicks in an IOMMU and turns the scatterlists into single diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt index 34d041d68f78..604cee350e53 100644 --- a/Documentation/translations/ko_KR/memory-barriers.txt +++ b/Documentation/translations/ko_KR/memory-barriers.txt @@ -570,8 +570,8 @@ ACQUIRE 는 해당 오퍼레이션의 로드 부분에만 적용되고 RELEASE [*] 버스 마스터링 DMA 와 일관성에 대해서는 다음을 참고하시기 바랍니다: Documentation/driver-api/pci/pci.rst - Documentation/DMA-API-HOWTO.txt - Documentation/DMA-API.txt + Documentation/core-api/dma-api-howto.rst + Documentation/core-api/dma-api.rst 데이터 의존성 배리어 (역사적) @@ -1907,7 +1907,7 @@ Mandatory 배리어들은 SMP 시스템에서도 UP 시스템에서도 SMP 효 writel_relaxed() 와 같은 완화된 I/O 접근자들에 대한 자세한 내용을 위해서는 "커널 I/O 배리어의 효과" 섹션을, consistent memory 에 대한 자세한 내용을 - 위해선 Documentation/DMA-API.txt 문서를 참고하세요. + 위해선 Documentation/core-api/dma-api.rst 문서를 참고하세요. ========================= diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index a806227c1fad..656a4888c300 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -907,7 +907,7 @@ sba_mark_invalid(struct ioc *ioc, dma_addr_t iova, size_t byte_cnt) * @dir: dma direction * @attrs: optional dma attributes * - * See Documentation/DMA-API-HOWTO.txt + * See Documentation/core-api/dma-api-howto.rst */ static dma_addr_t sba_map_page(struct device *dev, struct page *page, unsigned long poff, size_t size, @@ -1028,7 +1028,7 @@ sba_mark_clean(struct ioc *ioc, dma_addr_t iova, size_t size) * @dir: R/W or both. * @attrs: optional dma attributes * - * See Documentation/DMA-API-HOWTO.txt + * See Documentation/core-api/dma-api-howto.rst */ static void sba_unmap_page(struct device *dev, dma_addr_t iova, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -1105,7 +1105,7 @@ static void sba_unmap_page(struct device *dev, dma_addr_t iova, size_t size, * @size: number of bytes mapped in driver buffer. * @dma_handle: IOVA of new buffer. * - * See Documentation/DMA-API-HOWTO.txt + * See Documentation/core-api/dma-api-howto.rst */ static void * sba_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, @@ -1162,7 +1162,7 @@ sba_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, * @vaddr: virtual address IOVA of "consistent" buffer. * @dma_handler: IO virtual address of "consistent" buffer. * - * See Documentation/DMA-API-HOWTO.txt + * See Documentation/core-api/dma-api-howto.rst */ static void sba_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs) @@ -1425,7 +1425,7 @@ static void sba_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist, * @dir: R/W or both. * @attrs: optional dma attributes * - * See Documentation/DMA-API-HOWTO.txt + * See Documentation/core-api/dma-api-howto.rst */ static int sba_map_sg_attrs(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction dir, @@ -1524,7 +1524,7 @@ static int sba_map_sg_attrs(struct device *dev, struct scatterlist *sglist, * @dir: R/W or both. * @attrs: optional dma attributes * - * See Documentation/DMA-API-HOWTO.txt + * See Documentation/core-api/dma-api-howto.rst */ static void sba_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction dir, diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c index 70cd24bdcfec..4f1596bb1936 100644 --- a/arch/parisc/kernel/pci-dma.c +++ b/arch/parisc/kernel/pci-dma.c @@ -3,7 +3,7 @@ ** PARISC 1.1 Dynamic DMA mapping support. ** This implementation is for PA-RISC platforms that do not support ** I/O TLBs (aka DMA address translation hardware). -** See Documentation/DMA-API-HOWTO.txt for interface definitions. +** See Documentation/core-api/dma-api-howto.rst for interface definitions. ** ** (c) Copyright 1999,2000 Hewlett-Packard Company ** (c) Copyright 2000 Grant Grundler diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 6b15a24930e0..fed67eafcacc 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -3,8 +3,8 @@ #define _ASM_X86_DMA_MAPPING_H /* - * IOMMU interface. See Documentation/DMA-API-HOWTO.txt and - * Documentation/DMA-API.txt for documentation. + * IOMMU interface. See Documentation/core-api/dma-api-howto.rst and + * Documentation/core-api/dma-api.rst for documentation. */ #include diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 17cb5b933dcf..e89031e9c847 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -6,7 +6,7 @@ * This allows to use PCI devices that only support 32bit addresses on systems * with more than 4GB. * - * See Documentation/DMA-API-HOWTO.txt for the interface specification. + * See Documentation/core-api/dma-api-howto.rst for the interface specification. * * Copyright 2002 Andi Kleen, SuSE Labs. */ diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c index 7e112829d250..5368452eb5a6 100644 --- a/drivers/parisc/sba_iommu.c +++ b/drivers/parisc/sba_iommu.c @@ -666,7 +666,7 @@ sba_mark_invalid(struct ioc *ioc, dma_addr_t iova, size_t byte_cnt) * @dev: instance of PCI owned by the driver that's asking * @mask: number of address bits this PCI device can handle * - * See Documentation/DMA-API-HOWTO.txt + * See Documentation/core-api/dma-api-howto.rst */ static int sba_dma_supported( struct device *dev, u64 mask) { @@ -698,7 +698,7 @@ static int sba_dma_supported( struct device *dev, u64 mask) * @size: number of bytes to map in driver buffer. * @direction: R/W or both. * - * See Documentation/DMA-API-HOWTO.txt + * See Documentation/core-api/dma-api-howto.rst */ static dma_addr_t sba_map_single(struct device *dev, void *addr, size_t size, @@ -788,7 +788,7 @@ sba_map_page(struct device *dev, struct page *page, unsigned long offset, * @size: number of bytes mapped in driver buffer. * @direction: R/W or both. * - * See Documentation/DMA-API-HOWTO.txt + * See Documentation/core-api/dma-api-howto.rst */ static void sba_unmap_page(struct device *dev, dma_addr_t iova, size_t size, @@ -867,7 +867,7 @@ sba_unmap_page(struct device *dev, dma_addr_t iova, size_t size, * @size: number of bytes mapped in driver buffer. * @dma_handle: IOVA of new buffer. * - * See Documentation/DMA-API-HOWTO.txt + * See Documentation/core-api/dma-api-howto.rst */ static void *sba_alloc(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) @@ -898,7 +898,7 @@ static void *sba_alloc(struct device *hwdev, size_t size, dma_addr_t *dma_handle * @vaddr: virtual address IOVA of "consistent" buffer. * @dma_handler: IO virtual address of "consistent" buffer. * - * See Documentation/DMA-API-HOWTO.txt + * See Documentation/core-api/dma-api-howto.rst */ static void sba_free(struct device *hwdev, size_t size, void *vaddr, @@ -933,7 +933,7 @@ int dump_run_sg = 0; * @nents: number of entries in list * @direction: R/W or both. * - * See Documentation/DMA-API-HOWTO.txt + * See Documentation/core-api/dma-api-howto.rst */ static int sba_map_sg(struct device *dev, struct scatterlist *sglist, int nents, @@ -1017,7 +1017,7 @@ sba_map_sg(struct device *dev, struct scatterlist *sglist, int nents, * @nents: number of entries in list * @direction: R/W or both. * - * See Documentation/DMA-API-HOWTO.txt + * See Documentation/core-api/dma-api-howto.rst */ static void sba_unmap_sg(struct device *dev, struct scatterlist *sglist, int nents, diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 78f677cf45ab..ef2b153ddbd9 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -14,7 +14,7 @@ /** * List of possible attributes associated with a DMA mapping. The semantics - * of each attribute should be defined in Documentation/DMA-attributes.txt. + * of each attribute should be defined in Documentation/core-api/dma-attributes.rst. */ /* diff --git a/include/media/videobuf-dma-sg.h b/include/media/videobuf-dma-sg.h index b89d5e31f172..34450f7ad510 100644 --- a/include/media/videobuf-dma-sg.h +++ b/include/media/videobuf-dma-sg.h @@ -31,7 +31,7 @@ * does memory allocation too using vmalloc_32(). * * videobuf_dma_*() - * see Documentation/DMA-API-HOWTO.txt, these functions to + * see Documentation/core-api/dma-api-howto.rst, these functions to * basically the same. The map function does also build a * scatterlist for the buffer (and unmap frees it ...) * diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 36c962a86bf2..f97f088ace7e 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -1071,7 +1071,7 @@ static void check_unmap(struct dma_debug_entry *ref) /* * Drivers should use dma_mapping_error() to check the returned * addresses of dma_map_single() and dma_map_page(). - * If not, print this warning message. See Documentation/DMA-API.txt. + * If not, print this warning message. See Documentation/core-api/dma-api.rst. */ if (entry->map_err_type == MAP_ERR_NOT_CHECKED) { err_printk(ref->dev, entry, -- cgit v1.2.3 From c791cc4b1feb881b3644c059dba5464b076bf592 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 16 Jun 2020 14:53:55 -0400 Subject: tracing: Only allow trace_array_printk() to be used by instances To prevent default "trace_printks()" from spamming the top level tracing ring buffer, only allow trace instances to use trace_array_printk() (which can be used without the trace_printk() start up warning). Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bb62269724d5..8241d1448d70 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3346,12 +3346,16 @@ int trace_array_printk(struct trace_array *tr, int ret; va_list ap; - if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) - return 0; - if (!tr) return -ENOENT; + /* This is only allowed for created instances */ + if (tr == &global_trace) + return 0; + + if (!(tr->trace_flags & TRACE_ITER_PRINTK)) + return 0; + va_start(ap, fmt); ret = trace_array_vprintk(tr, ip, fmt, ap); va_end(ap); -- cgit v1.2.3 From 7582f30cc9fbcdcd630398bea010e5f6a2e2fcab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 27 Jun 2020 09:31:56 +0200 Subject: cgroup: unexport cgroup_rstat_updated cgroup_rstat_updated is only used by core block code, no need to export it. Acked-by: Tejun Heo Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- kernel/cgroup/rstat.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index b6397a186ce9..d51175cedfca 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -64,7 +64,6 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) raw_spin_unlock_irqrestore(cpu_lock, flags); } -EXPORT_SYMBOL_GPL(cgroup_rstat_updated); /** * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree -- cgit v1.2.3 From 5da7cd11d0811c35a6988d416053b5421bc61521 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 22 Apr 2020 12:25:41 -0400 Subject: x86/ftrace: Only have the builtin ftrace_regs_caller call direct hooks If a direct hook is attached to a function that ftrace also has a function attached to it, then it is required that the ftrace_ops_list_func() is used to iterate over the registered ftrace callbacks. This will also include the direct ftrace_ops helper, that tells ftrace_regs_caller where to return to (the direct callback and not the function that called it). As this direct helper is only to handle the case of ftrace callbacks attached to the same function as the direct callback, the ftrace callback allocated trampolines (used to only call them), should never be used to return back to a direct callback. Only copy the portion of the ftrace_regs_caller that will return back to what called it, and not the portion that returns back to the direct caller. The direct ftrace_ops must then pick the ftrace_regs_caller builtin function as its own trampoline to ensure that it will never have one allocated for it (which would not include the handling of direct callbacks). Link: http://lkml.kernel.org/r/20200422162750.495903799@goodmis.org Cc: Peter Zijlstra Signed-off-by: Steven Rostedt (VMware) --- arch/x86/kernel/ftrace.c | 7 ------- arch/x86/kernel/ftrace_64.S | 3 +-- kernel/trace/ftrace.c | 8 ++++++++ 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 51504566b3a6..d1a0190fef5b 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -367,13 +367,6 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) if (WARN_ON(ret < 0)) goto fail; - if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { - ip = trampoline + (ftrace_regs_caller_ret - ftrace_regs_caller); - ret = copy_from_kernel_nofault(ip, (void *)retq, RET_SIZE); - if (WARN_ON(ret < 0)) - goto fail; - } - /* * The address of the ftrace_ops that is used for this trampoline * is stored at the end of the trampoline. This will be used to diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 3ba32cc58f01..43cf9a2b52c7 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -253,7 +253,7 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) * The trampoline will add the code to jump * to the return. */ -SYM_INNER_LABEL(ftrace_regs_caller_ret, SYM_L_GLOBAL) +SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) jmp ftrace_epilogue /* Swap the flags with orig_rax */ @@ -264,7 +264,6 @@ SYM_INNER_LABEL(ftrace_regs_caller_ret, SYM_L_GLOBAL) restore_mcount_regs 8 /* Restore flags */ popfq -SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) UNWIND_HINT_RET_OFFSET jmp ftrace_epilogue diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1903b80db6eb..c141d347f71a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2388,6 +2388,14 @@ struct ftrace_ops direct_ops = { .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_PERMANENT, + /* + * By declaring the main trampoline as this trampoline + * it will never have one allocated for it. Allocated + * trampolines should not call direct functions. + * The direct_ops should only be called by the builtin + * ftrace_regs_caller trampoline. + */ + .trampoline = FTRACE_REGS_ADDR, }; #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ -- cgit v1.2.3 From 43cb5451dffe0bc5d59688d4898c9a1f7c40d3b4 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 21 Apr 2020 19:04:06 +0200 Subject: docs: RCU: Convert torture.txt to ReST - Add a SPDX header; - Adjust document and section titles; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add it to RCU/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul E. McKenney --- Documentation/RCU/index.rst | 1 + Documentation/RCU/torture.rst | 293 ++++++++++++++++++++++++++++++++++ Documentation/RCU/torture.txt | 282 -------------------------------- Documentation/locking/locktorture.rst | 2 +- MAINTAINERS | 4 +- kernel/rcu/rcutorture.c | 2 +- 6 files changed, 298 insertions(+), 286 deletions(-) create mode 100644 Documentation/RCU/torture.rst delete mode 100644 Documentation/RCU/torture.txt (limited to 'kernel') diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst index 577a47e27f5d..5d5f9a1ab8f9 100644 --- a/Documentation/RCU/index.rst +++ b/Documentation/RCU/index.rst @@ -18,6 +18,7 @@ RCU concepts whatisRCU rcu rculist_nulls + torture listRCU NMI-RCU UP diff --git a/Documentation/RCU/torture.rst b/Documentation/RCU/torture.rst new file mode 100644 index 000000000000..a90147713062 --- /dev/null +++ b/Documentation/RCU/torture.rst @@ -0,0 +1,293 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================== +RCU Torture Test Operation +========================== + + +CONFIG_RCU_TORTURE_TEST +======================= + +The CONFIG_RCU_TORTURE_TEST config option is available for all RCU +implementations. It creates an rcutorture kernel module that can +be loaded to run a torture test. The test periodically outputs +status messages via printk(), which can be examined via the dmesg +command (perhaps grepping for "torture"). The test is started +when the module is loaded, and stops when the module is unloaded. + +Module parameters are prefixed by "rcutorture." in +Documentation/admin-guide/kernel-parameters.txt. + +Output +====== + +The statistics output is as follows:: + + rcu-torture:--- Start of test: nreaders=16 nfakewriters=4 stat_interval=30 verbose=0 test_no_idle_hz=1 shuffle_interval=3 stutter=5 irqreader=1 fqs_duration=0 fqs_holdoff=0 fqs_stutter=3 test_boost=1/0 test_boost_interval=7 test_boost_duration=4 + rcu-torture: rtc: (null) ver: 155441 tfle: 0 rta: 155441 rtaf: 8884 rtf: 155440 rtmbe: 0 rtbe: 0 rtbke: 0 rtbre: 0 rtbf: 0 rtb: 0 nt: 3055767 + rcu-torture: Reader Pipe: 727860534 34213 0 0 0 0 0 0 0 0 0 + rcu-torture: Reader Batch: 727877838 17003 0 0 0 0 0 0 0 0 0 + rcu-torture: Free-Block Circulation: 155440 155440 155440 155440 155440 155440 155440 155440 155440 155440 0 + rcu-torture:--- End of test: SUCCESS: nreaders=16 nfakewriters=4 stat_interval=30 verbose=0 test_no_idle_hz=1 shuffle_interval=3 stutter=5 irqreader=1 fqs_duration=0 fqs_holdoff=0 fqs_stutter=3 test_boost=1/0 test_boost_interval=7 test_boost_duration=4 + +The command "dmesg | grep torture:" will extract this information on +most systems. On more esoteric configurations, it may be necessary to +use other commands to access the output of the printk()s used by +the RCU torture test. The printk()s use KERN_ALERT, so they should +be evident. ;-) + +The first and last lines show the rcutorture module parameters, and the +last line shows either "SUCCESS" or "FAILURE", based on rcutorture's +automatic determination as to whether RCU operated correctly. + +The entries are as follows: + +* "rtc": The hexadecimal address of the structure currently visible + to readers. + +* "ver": The number of times since boot that the RCU writer task + has changed the structure visible to readers. + +* "tfle": If non-zero, indicates that the "torture freelist" + containing structures to be placed into the "rtc" area is empty. + This condition is important, since it can fool you into thinking + that RCU is working when it is not. :-/ + +* "rta": Number of structures allocated from the torture freelist. + +* "rtaf": Number of allocations from the torture freelist that have + failed due to the list being empty. It is not unusual for this + to be non-zero, but it is bad for it to be a large fraction of + the value indicated by "rta". + +* "rtf": Number of frees into the torture freelist. + +* "rtmbe": A non-zero value indicates that rcutorture believes that + rcu_assign_pointer() and rcu_dereference() are not working + correctly. This value should be zero. + +* "rtbe": A non-zero value indicates that one of the rcu_barrier() + family of functions is not working correctly. + +* "rtbke": rcutorture was unable to create the real-time kthreads + used to force RCU priority inversion. This value should be zero. + +* "rtbre": Although rcutorture successfully created the kthreads + used to force RCU priority inversion, it was unable to set them + to the real-time priority level of 1. This value should be zero. + +* "rtbf": The number of times that RCU priority boosting failed + to resolve RCU priority inversion. + +* "rtb": The number of times that rcutorture attempted to force + an RCU priority inversion condition. If you are testing RCU + priority boosting via the "test_boost" module parameter, this + value should be non-zero. + +* "nt": The number of times rcutorture ran RCU read-side code from + within a timer handler. This value should be non-zero only + if you specified the "irqreader" module parameter. + +* "Reader Pipe": Histogram of "ages" of structures seen by readers. + If any entries past the first two are non-zero, RCU is broken. + And rcutorture prints the error flag string "!!!" to make sure + you notice. The age of a newly allocated structure is zero, + it becomes one when removed from reader visibility, and is + incremented once per grace period subsequently -- and is freed + after passing through (RCU_TORTURE_PIPE_LEN-2) grace periods. + + The output displayed above was taken from a correctly working + RCU. If you want to see what it looks like when broken, break + it yourself. ;-) + +* "Reader Batch": Another histogram of "ages" of structures seen + by readers, but in terms of counter flips (or batches) rather + than in terms of grace periods. The legal number of non-zero + entries is again two. The reason for this separate view is that + it is sometimes easier to get the third entry to show up in the + "Reader Batch" list than in the "Reader Pipe" list. + +* "Free-Block Circulation": Shows the number of torture structures + that have reached a given point in the pipeline. The first element + should closely correspond to the number of structures allocated, + the second to the number that have been removed from reader view, + and all but the last remaining to the corresponding number of + passes through a grace period. The last entry should be zero, + as it is only incremented if a torture structure's counter + somehow gets incremented farther than it should. + +Different implementations of RCU can provide implementation-specific +additional information. For example, Tree SRCU provides the following +additional line:: + + srcud-torture: Tree SRCU per-CPU(idx=0): 0(35,-21) 1(-4,24) 2(1,1) 3(-26,20) 4(28,-47) 5(-9,4) 6(-10,14) 7(-14,11) T(1,6) + +This line shows the per-CPU counter state, in this case for Tree SRCU +using a dynamically allocated srcu_struct (hence "srcud-" rather than +"srcu-"). The numbers in parentheses are the values of the "old" and +"current" counters for the corresponding CPU. The "idx" value maps the +"old" and "current" values to the underlying array, and is useful for +debugging. The final "T" entry contains the totals of the counters. + +Usage on Specific Kernel Builds +=============================== + +It is sometimes desirable to torture RCU on a specific kernel build, +for example, when preparing to put that kernel build into production. +In that case, the kernel should be built with CONFIG_RCU_TORTURE_TEST=m +so that the test can be started using modprobe and terminated using rmmod. + +For example, the following script may be used to torture RCU:: + + #!/bin/sh + + modprobe rcutorture + sleep 3600 + rmmod rcutorture + dmesg | grep torture: + +The output can be manually inspected for the error flag of "!!!". +One could of course create a more elaborate script that automatically +checked for such errors. The "rmmod" command forces a "SUCCESS", +"FAILURE", or "RCU_HOTPLUG" indication to be printk()ed. The first +two are self-explanatory, while the last indicates that while there +were no RCU failures, CPU-hotplug problems were detected. + + +Usage on Mainline Kernels +========================= + +When using rcutorture to test changes to RCU itself, it is often +necessary to build a number of kernels in order to test that change +across a broad range of combinations of the relevant Kconfig options +and of the relevant kernel boot parameters. In this situation, use +of modprobe and rmmod can be quite time-consuming and error-prone. + +Therefore, the tools/testing/selftests/rcutorture/bin/kvm.sh +script is available for mainline testing for x86, arm64, and +powerpc. By default, it will run the series of tests specified by +tools/testing/selftests/rcutorture/configs/rcu/CFLIST, with each test +running for 30 minutes within a guest OS using a minimal userspace +supplied by an automatically generated initrd. After the tests are +complete, the resulting build products and console output are analyzed +for errors and the results of the runs are summarized. + +On larger systems, rcutorture testing can be accelerated by passing the +--cpus argument to kvm.sh. For example, on a 64-CPU system, "--cpus 43" +would use up to 43 CPUs to run tests concurrently, which as of v5.4 would +complete all the scenarios in two batches, reducing the time to complete +from about eight hours to about one hour (not counting the time to build +the sixteen kernels). The "--dryrun sched" argument will not run tests, +but rather tell you how the tests would be scheduled into batches. This +can be useful when working out how many CPUs to specify in the --cpus +argument. + +Not all changes require that all scenarios be run. For example, a change +to Tree SRCU might run only the SRCU-N and SRCU-P scenarios using the +--configs argument to kvm.sh as follows: "--configs 'SRCU-N SRCU-P'". +Large systems can run multiple copies of of the full set of scenarios, +for example, a system with 448 hardware threads can run five instances +of the full set concurrently. To make this happen:: + + kvm.sh --cpus 448 --configs '5*CFLIST' + +Alternatively, such a system can run 56 concurrent instances of a single +eight-CPU scenario:: + + kvm.sh --cpus 448 --configs '56*TREE04' + +Or 28 concurrent instances of each of two eight-CPU scenarios:: + + kvm.sh --cpus 448 --configs '28*TREE03 28*TREE04' + +Of course, each concurrent instance will use memory, which can be +limited using the --memory argument, which defaults to 512M. Small +values for memory may require disabling the callback-flooding tests +using the --bootargs parameter discussed below. + +Sometimes additional debugging is useful, and in such cases the --kconfig +parameter to kvm.sh may be used, for example, ``--kconfig 'CONFIG_KASAN=y'``. + +Kernel boot arguments can also be supplied, for example, to control +rcutorture's module parameters. For example, to test a change to RCU's +CPU stall-warning code, use "--bootargs 'rcutorture.stall_cpu=30'". +This will of course result in the scripting reporting a failure, namely +the resuling RCU CPU stall warning. As noted above, reducing memory may +require disabling rcutorture's callback-flooding tests:: + + kvm.sh --cpus 448 --configs '56*TREE04' --memory 128M \ + --bootargs 'rcutorture.fwd_progress=0' + +Sometimes all that is needed is a full set of kernel builds. This is +what the --buildonly argument does. + +Finally, the --trust-make argument allows each kernel build to reuse what +it can from the previous kernel build. + +There are additional more arcane arguments that are documented in the +source code of the kvm.sh script. + +If a run contains failures, the number of buildtime and runtime failures +is listed at the end of the kvm.sh output, which you really should redirect +to a file. The build products and console output of each run is kept in +tools/testing/selftests/rcutorture/res in timestamped directories. A +given directory can be supplied to kvm-find-errors.sh in order to have +it cycle you through summaries of errors and full error logs. For example:: + + tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh \ + tools/testing/selftests/rcutorture/res/2020.01.20-15.54.23 + +However, it is often more convenient to access the files directly. +Files pertaining to all scenarios in a run reside in the top-level +directory (2020.01.20-15.54.23 in the example above), while per-scenario +files reside in a subdirectory named after the scenario (for example, +"TREE04"). If a given scenario ran more than once (as in "--configs +'56*TREE04'" above), the directories corresponding to the second and +subsequent runs of that scenario include a sequence number, for example, +"TREE04.2", "TREE04.3", and so on. + +The most frequently used file in the top-level directory is testid.txt. +If the test ran in a git repository, then this file contains the commit +that was tested and any uncommitted changes in diff format. + +The most frequently used files in each per-scenario-run directory are: + +.config: + This file contains the Kconfig options. + +Make.out: + This contains build output for a specific scenario. + +console.log: + This contains the console output for a specific scenario. + This file may be examined once the kernel has booted, but + it might not exist if the build failed. + +vmlinux: + This contains the kernel, which can be useful with tools like + objdump and gdb. + +A number of additional files are available, but are less frequently used. +Many are intended for debugging of rcutorture itself or of its scripting. + +As of v5.4, a successful run with the default set of scenarios produces +the following summary at the end of the run on a 12-CPU system:: + + SRCU-N ------- 804233 GPs (148.932/s) [srcu: g10008272 f0x0 ] + SRCU-P ------- 202320 GPs (37.4667/s) [srcud: g1809476 f0x0 ] + SRCU-t ------- 1122086 GPs (207.794/s) [srcu: g0 f0x0 ] + SRCU-u ------- 1111285 GPs (205.794/s) [srcud: g1 f0x0 ] + TASKS01 ------- 19666 GPs (3.64185/s) [tasks: g0 f0x0 ] + TASKS02 ------- 20541 GPs (3.80389/s) [tasks: g0 f0x0 ] + TASKS03 ------- 19416 GPs (3.59556/s) [tasks: g0 f0x0 ] + TINY01 ------- 836134 GPs (154.84/s) [rcu: g0 f0x0 ] n_max_cbs: 34198 + TINY02 ------- 850371 GPs (157.476/s) [rcu: g0 f0x0 ] n_max_cbs: 2631 + TREE01 ------- 162625 GPs (30.1157/s) [rcu: g1124169 f0x0 ] + TREE02 ------- 333003 GPs (61.6672/s) [rcu: g2647753 f0x0 ] n_max_cbs: 35844 + TREE03 ------- 306623 GPs (56.782/s) [rcu: g2975325 f0x0 ] n_max_cbs: 1496497 + CPU count limited from 16 to 12 + TREE04 ------- 246149 GPs (45.5831/s) [rcu: g1695737 f0x0 ] n_max_cbs: 434961 + TREE05 ------- 314603 GPs (58.2598/s) [rcu: g2257741 f0x2 ] n_max_cbs: 193997 + TREE07 ------- 167347 GPs (30.9902/s) [rcu: g1079021 f0x0 ] n_max_cbs: 478732 + CPU count limited from 16 to 12 + TREE09 ------- 752238 GPs (139.303/s) [rcu: g13075057 f0x0 ] n_max_cbs: 99011 diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt deleted file mode 100644 index af712a3c5b6a..000000000000 --- a/Documentation/RCU/torture.txt +++ /dev/null @@ -1,282 +0,0 @@ -RCU Torture Test Operation - - -CONFIG_RCU_TORTURE_TEST - -The CONFIG_RCU_TORTURE_TEST config option is available for all RCU -implementations. It creates an rcutorture kernel module that can -be loaded to run a torture test. The test periodically outputs -status messages via printk(), which can be examined via the dmesg -command (perhaps grepping for "torture"). The test is started -when the module is loaded, and stops when the module is unloaded. - -Module parameters are prefixed by "rcutorture." in -Documentation/admin-guide/kernel-parameters.txt. - -OUTPUT - -The statistics output is as follows: - - rcu-torture:--- Start of test: nreaders=16 nfakewriters=4 stat_interval=30 verbose=0 test_no_idle_hz=1 shuffle_interval=3 stutter=5 irqreader=1 fqs_duration=0 fqs_holdoff=0 fqs_stutter=3 test_boost=1/0 test_boost_interval=7 test_boost_duration=4 - rcu-torture: rtc: (null) ver: 155441 tfle: 0 rta: 155441 rtaf: 8884 rtf: 155440 rtmbe: 0 rtbe: 0 rtbke: 0 rtbre: 0 rtbf: 0 rtb: 0 nt: 3055767 - rcu-torture: Reader Pipe: 727860534 34213 0 0 0 0 0 0 0 0 0 - rcu-torture: Reader Batch: 727877838 17003 0 0 0 0 0 0 0 0 0 - rcu-torture: Free-Block Circulation: 155440 155440 155440 155440 155440 155440 155440 155440 155440 155440 0 - rcu-torture:--- End of test: SUCCESS: nreaders=16 nfakewriters=4 stat_interval=30 verbose=0 test_no_idle_hz=1 shuffle_interval=3 stutter=5 irqreader=1 fqs_duration=0 fqs_holdoff=0 fqs_stutter=3 test_boost=1/0 test_boost_interval=7 test_boost_duration=4 - -The command "dmesg | grep torture:" will extract this information on -most systems. On more esoteric configurations, it may be necessary to -use other commands to access the output of the printk()s used by -the RCU torture test. The printk()s use KERN_ALERT, so they should -be evident. ;-) - -The first and last lines show the rcutorture module parameters, and the -last line shows either "SUCCESS" or "FAILURE", based on rcutorture's -automatic determination as to whether RCU operated correctly. - -The entries are as follows: - -o "rtc": The hexadecimal address of the structure currently visible - to readers. - -o "ver": The number of times since boot that the RCU writer task - has changed the structure visible to readers. - -o "tfle": If non-zero, indicates that the "torture freelist" - containing structures to be placed into the "rtc" area is empty. - This condition is important, since it can fool you into thinking - that RCU is working when it is not. :-/ - -o "rta": Number of structures allocated from the torture freelist. - -o "rtaf": Number of allocations from the torture freelist that have - failed due to the list being empty. It is not unusual for this - to be non-zero, but it is bad for it to be a large fraction of - the value indicated by "rta". - -o "rtf": Number of frees into the torture freelist. - -o "rtmbe": A non-zero value indicates that rcutorture believes that - rcu_assign_pointer() and rcu_dereference() are not working - correctly. This value should be zero. - -o "rtbe": A non-zero value indicates that one of the rcu_barrier() - family of functions is not working correctly. - -o "rtbke": rcutorture was unable to create the real-time kthreads - used to force RCU priority inversion. This value should be zero. - -o "rtbre": Although rcutorture successfully created the kthreads - used to force RCU priority inversion, it was unable to set them - to the real-time priority level of 1. This value should be zero. - -o "rtbf": The number of times that RCU priority boosting failed - to resolve RCU priority inversion. - -o "rtb": The number of times that rcutorture attempted to force - an RCU priority inversion condition. If you are testing RCU - priority boosting via the "test_boost" module parameter, this - value should be non-zero. - -o "nt": The number of times rcutorture ran RCU read-side code from - within a timer handler. This value should be non-zero only - if you specified the "irqreader" module parameter. - -o "Reader Pipe": Histogram of "ages" of structures seen by readers. - If any entries past the first two are non-zero, RCU is broken. - And rcutorture prints the error flag string "!!!" to make sure - you notice. The age of a newly allocated structure is zero, - it becomes one when removed from reader visibility, and is - incremented once per grace period subsequently -- and is freed - after passing through (RCU_TORTURE_PIPE_LEN-2) grace periods. - - The output displayed above was taken from a correctly working - RCU. If you want to see what it looks like when broken, break - it yourself. ;-) - -o "Reader Batch": Another histogram of "ages" of structures seen - by readers, but in terms of counter flips (or batches) rather - than in terms of grace periods. The legal number of non-zero - entries is again two. The reason for this separate view is that - it is sometimes easier to get the third entry to show up in the - "Reader Batch" list than in the "Reader Pipe" list. - -o "Free-Block Circulation": Shows the number of torture structures - that have reached a given point in the pipeline. The first element - should closely correspond to the number of structures allocated, - the second to the number that have been removed from reader view, - and all but the last remaining to the corresponding number of - passes through a grace period. The last entry should be zero, - as it is only incremented if a torture structure's counter - somehow gets incremented farther than it should. - -Different implementations of RCU can provide implementation-specific -additional information. For example, Tree SRCU provides the following -additional line: - - srcud-torture: Tree SRCU per-CPU(idx=0): 0(35,-21) 1(-4,24) 2(1,1) 3(-26,20) 4(28,-47) 5(-9,4) 6(-10,14) 7(-14,11) T(1,6) - -This line shows the per-CPU counter state, in this case for Tree SRCU -using a dynamically allocated srcu_struct (hence "srcud-" rather than -"srcu-"). The numbers in parentheses are the values of the "old" and -"current" counters for the corresponding CPU. The "idx" value maps the -"old" and "current" values to the underlying array, and is useful for -debugging. The final "T" entry contains the totals of the counters. - - -USAGE ON SPECIFIC KERNEL BUILDS - -It is sometimes desirable to torture RCU on a specific kernel build, -for example, when preparing to put that kernel build into production. -In that case, the kernel should be built with CONFIG_RCU_TORTURE_TEST=m -so that the test can be started using modprobe and terminated using rmmod. - -For example, the following script may be used to torture RCU: - - #!/bin/sh - - modprobe rcutorture - sleep 3600 - rmmod rcutorture - dmesg | grep torture: - -The output can be manually inspected for the error flag of "!!!". -One could of course create a more elaborate script that automatically -checked for such errors. The "rmmod" command forces a "SUCCESS", -"FAILURE", or "RCU_HOTPLUG" indication to be printk()ed. The first -two are self-explanatory, while the last indicates that while there -were no RCU failures, CPU-hotplug problems were detected. - - -USAGE ON MAINLINE KERNELS - -When using rcutorture to test changes to RCU itself, it is often -necessary to build a number of kernels in order to test that change -across a broad range of combinations of the relevant Kconfig options -and of the relevant kernel boot parameters. In this situation, use -of modprobe and rmmod can be quite time-consuming and error-prone. - -Therefore, the tools/testing/selftests/rcutorture/bin/kvm.sh -script is available for mainline testing for x86, arm64, and -powerpc. By default, it will run the series of tests specified by -tools/testing/selftests/rcutorture/configs/rcu/CFLIST, with each test -running for 30 minutes within a guest OS using a minimal userspace -supplied by an automatically generated initrd. After the tests are -complete, the resulting build products and console output are analyzed -for errors and the results of the runs are summarized. - -On larger systems, rcutorture testing can be accelerated by passing the ---cpus argument to kvm.sh. For example, on a 64-CPU system, "--cpus 43" -would use up to 43 CPUs to run tests concurrently, which as of v5.4 would -complete all the scenarios in two batches, reducing the time to complete -from about eight hours to about one hour (not counting the time to build -the sixteen kernels). The "--dryrun sched" argument will not run tests, -but rather tell you how the tests would be scheduled into batches. This -can be useful when working out how many CPUs to specify in the --cpus -argument. - -Not all changes require that all scenarios be run. For example, a change -to Tree SRCU might run only the SRCU-N and SRCU-P scenarios using the ---configs argument to kvm.sh as follows: "--configs 'SRCU-N SRCU-P'". -Large systems can run multiple copies of of the full set of scenarios, -for example, a system with 448 hardware threads can run five instances -of the full set concurrently. To make this happen: - - kvm.sh --cpus 448 --configs '5*CFLIST' - -Alternatively, such a system can run 56 concurrent instances of a single -eight-CPU scenario: - - kvm.sh --cpus 448 --configs '56*TREE04' - -Or 28 concurrent instances of each of two eight-CPU scenarios: - - kvm.sh --cpus 448 --configs '28*TREE03 28*TREE04' - -Of course, each concurrent instance will use memory, which can be -limited using the --memory argument, which defaults to 512M. Small -values for memory may require disabling the callback-flooding tests -using the --bootargs parameter discussed below. - -Sometimes additional debugging is useful, and in such cases the --kconfig -parameter to kvm.sh may be used, for example, "--kconfig 'CONFIG_KASAN=y'". - -Kernel boot arguments can also be supplied, for example, to control -rcutorture's module parameters. For example, to test a change to RCU's -CPU stall-warning code, use "--bootargs 'rcutorture.stall_cpu=30'". -This will of course result in the scripting reporting a failure, namely -the resuling RCU CPU stall warning. As noted above, reducing memory may -require disabling rcutorture's callback-flooding tests: - - kvm.sh --cpus 448 --configs '56*TREE04' --memory 128M \ - --bootargs 'rcutorture.fwd_progress=0' - -Sometimes all that is needed is a full set of kernel builds. This is -what the --buildonly argument does. - -Finally, the --trust-make argument allows each kernel build to reuse what -it can from the previous kernel build. - -There are additional more arcane arguments that are documented in the -source code of the kvm.sh script. - -If a run contains failures, the number of buildtime and runtime failures -is listed at the end of the kvm.sh output, which you really should redirect -to a file. The build products and console output of each run is kept in -tools/testing/selftests/rcutorture/res in timestamped directories. A -given directory can be supplied to kvm-find-errors.sh in order to have -it cycle you through summaries of errors and full error logs. For example: - - tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh \ - tools/testing/selftests/rcutorture/res/2020.01.20-15.54.23 - -However, it is often more convenient to access the files directly. -Files pertaining to all scenarios in a run reside in the top-level -directory (2020.01.20-15.54.23 in the example above), while per-scenario -files reside in a subdirectory named after the scenario (for example, -"TREE04"). If a given scenario ran more than once (as in "--configs -'56*TREE04'" above), the directories corresponding to the second and -subsequent runs of that scenario include a sequence number, for example, -"TREE04.2", "TREE04.3", and so on. - -The most frequently used file in the top-level directory is testid.txt. -If the test ran in a git repository, then this file contains the commit -that was tested and any uncommitted changes in diff format. - -The most frequently used files in each per-scenario-run directory are: - -.config: This file contains the Kconfig options. - -Make.out: This contains build output for a specific scenario. - -console.log: This contains the console output for a specific scenario. - This file may be examined once the kernel has booted, but - it might not exist if the build failed. - -vmlinux: This contains the kernel, which can be useful with tools like - objdump and gdb. - -A number of additional files are available, but are less frequently used. -Many are intended for debugging of rcutorture itself or of its scripting. - -As of v5.4, a successful run with the default set of scenarios produces -the following summary at the end of the run on a 12-CPU system: - -SRCU-N ------- 804233 GPs (148.932/s) [srcu: g10008272 f0x0 ] -SRCU-P ------- 202320 GPs (37.4667/s) [srcud: g1809476 f0x0 ] -SRCU-t ------- 1122086 GPs (207.794/s) [srcu: g0 f0x0 ] -SRCU-u ------- 1111285 GPs (205.794/s) [srcud: g1 f0x0 ] -TASKS01 ------- 19666 GPs (3.64185/s) [tasks: g0 f0x0 ] -TASKS02 ------- 20541 GPs (3.80389/s) [tasks: g0 f0x0 ] -TASKS03 ------- 19416 GPs (3.59556/s) [tasks: g0 f0x0 ] -TINY01 ------- 836134 GPs (154.84/s) [rcu: g0 f0x0 ] n_max_cbs: 34198 -TINY02 ------- 850371 GPs (157.476/s) [rcu: g0 f0x0 ] n_max_cbs: 2631 -TREE01 ------- 162625 GPs (30.1157/s) [rcu: g1124169 f0x0 ] -TREE02 ------- 333003 GPs (61.6672/s) [rcu: g2647753 f0x0 ] n_max_cbs: 35844 -TREE03 ------- 306623 GPs (56.782/s) [rcu: g2975325 f0x0 ] n_max_cbs: 1496497 -CPU count limited from 16 to 12 -TREE04 ------- 246149 GPs (45.5831/s) [rcu: g1695737 f0x0 ] n_max_cbs: 434961 -TREE05 ------- 314603 GPs (58.2598/s) [rcu: g2257741 f0x2 ] n_max_cbs: 193997 -TREE07 ------- 167347 GPs (30.9902/s) [rcu: g1079021 f0x0 ] n_max_cbs: 478732 -CPU count limited from 16 to 12 -TREE09 ------- 752238 GPs (139.303/s) [rcu: g13075057 f0x0 ] n_max_cbs: 99011 diff --git a/Documentation/locking/locktorture.rst b/Documentation/locking/locktorture.rst index 8012a74555e7..dfaf9fc883f4 100644 --- a/Documentation/locking/locktorture.rst +++ b/Documentation/locking/locktorture.rst @@ -166,4 +166,4 @@ checked for such errors. The "rmmod" command forces a "SUCCESS", two are self-explanatory, while the last indicates that while there were no locking failures, CPU-hotplug problems were detected. -Also see: Documentation/RCU/torture.txt +Also see: Documentation/RCU/torture.rst diff --git a/MAINTAINERS b/MAINTAINERS index 496fd4eafb68..4429ce965b3a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14437,7 +14437,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git dev F: Documentation/RCU/ F: include/linux/rcu* F: kernel/rcu/ -X: Documentation/RCU/torture.txt +X: Documentation/RCU/torture.rst X: include/linux/srcu*.h X: kernel/rcu/srcu*.c @@ -17288,7 +17288,7 @@ M: Josh Triplett L: linux-kernel@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git dev -F: Documentation/RCU/torture.txt +F: Documentation/RCU/torture.rst F: kernel/locking/locktorture.c F: kernel/rcu/rcuperf.c F: kernel/rcu/rcutorture.c diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index efb792e13fca..8205295fc33e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -7,7 +7,7 @@ * Authors: Paul E. McKenney * Josh Triplett * - * See also: Documentation/RCU/torture.txt + * See also: Documentation/RCU/torture.rst */ #define pr_fmt(fmt) fmt -- cgit v1.2.3 From f2286ab99549271f3cec73e305b9ecca95d91394 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 21 Apr 2020 19:04:10 +0200 Subject: docs: RCU: Convert stallwarn.txt to ReST - Add a SPDX header; - Adjust document and section titles; - Fix list markups; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add it to RCU/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul E. McKenney --- Documentation/RCU/index.rst | 1 + Documentation/RCU/stallwarn.rst | 329 ++++++++++++++++++++++++++++++++++++++++ Documentation/RCU/stallwarn.txt | 316 -------------------------------------- kernel/rcu/tree_stall.h | 4 +- 4 files changed, 332 insertions(+), 318 deletions(-) create mode 100644 Documentation/RCU/stallwarn.rst delete mode 100644 Documentation/RCU/stallwarn.txt (limited to 'kernel') diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst index 9a1d51f394dc..e703d3dbe60c 100644 --- a/Documentation/RCU/index.rst +++ b/Documentation/RCU/index.rst @@ -20,6 +20,7 @@ RCU concepts rculist_nulls rcuref torture + stallwarn listRCU NMI-RCU UP diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst new file mode 100644 index 000000000000..08bc9aec4606 --- /dev/null +++ b/Documentation/RCU/stallwarn.rst @@ -0,0 +1,329 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================== +Using RCU's CPU Stall Detector +============================== + +This document first discusses what sorts of issues RCU's CPU stall +detector can locate, and then discusses kernel parameters and Kconfig +options that can be used to fine-tune the detector's operation. Finally, +this document explains the stall detector's "splat" format. + + +What Causes RCU CPU Stall Warnings? +=================================== + +So your kernel printed an RCU CPU stall warning. The next question is +"What caused it?" The following problems can result in RCU CPU stall +warnings: + +- A CPU looping in an RCU read-side critical section. + +- A CPU looping with interrupts disabled. + +- A CPU looping with preemption disabled. + +- A CPU looping with bottom halves disabled. + +- For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel + without invoking schedule(). If the looping in the kernel is + really expected and desirable behavior, you might need to add + some calls to cond_resched(). + +- Booting Linux using a console connection that is too slow to + keep up with the boot-time console-message rate. For example, + a 115Kbaud serial console can be -way- too slow to keep up + with boot-time message rates, and will frequently result in + RCU CPU stall warning messages. Especially if you have added + debug printk()s. + +- Anything that prevents RCU's grace-period kthreads from running. + This can result in the "All QSes seen" console-log message. + This message will include information on when the kthread last + ran and how often it should be expected to run. It can also + result in the ``rcu_.*kthread starved for`` console-log message, + which will include additional debugging information. + +- A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might + happen to preempt a low-priority task in the middle of an RCU + read-side critical section. This is especially damaging if + that low-priority task is not permitted to run on any other CPU, + in which case the next RCU grace period can never complete, which + will eventually cause the system to run out of memory and hang. + While the system is in the process of running itself out of + memory, you might see stall-warning messages. + +- A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that + is running at a higher priority than the RCU softirq threads. + This will prevent RCU callbacks from ever being invoked, + and in a CONFIG_PREEMPT_RCU kernel will further prevent + RCU grace periods from ever completing. Either way, the + system will eventually run out of memory and hang. In the + CONFIG_PREEMPT_RCU case, you might see stall-warning + messages. + + You can use the rcutree.kthread_prio kernel boot parameter to + increase the scheduling priority of RCU's kthreads, which can + help avoid this problem. However, please note that doing this + can increase your system's context-switch rate and thus degrade + performance. + +- A periodic interrupt whose handler takes longer than the time + interval between successive pairs of interrupts. This can + prevent RCU's kthreads and softirq handlers from running. + Note that certain high-overhead debugging options, for example + the function_graph tracer, can result in interrupt handler taking + considerably longer than normal, which can in turn result in + RCU CPU stall warnings. + +- Testing a workload on a fast system, tuning the stall-warning + timeout down to just barely avoid RCU CPU stall warnings, and then + running the same workload with the same stall-warning timeout on a + slow system. Note that thermal throttling and on-demand governors + can cause a single system to be sometimes fast and sometimes slow! + +- A hardware or software issue shuts off the scheduler-clock + interrupt on a CPU that is not in dyntick-idle mode. This + problem really has happened, and seems to be most likely to + result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels. + +- A bug in the RCU implementation. + +- A hardware failure. This is quite unlikely, but has occurred + at least once in real life. A CPU failed in a running system, + becoming unresponsive, but not causing an immediate crash. + This resulted in a series of RCU CPU stall warnings, eventually + leading the realization that the CPU had failed. + +The RCU, RCU-sched, and RCU-tasks implementations have CPU stall warning. +Note that SRCU does -not- have CPU stall warnings. Please note that +RCU only detects CPU stalls when there is a grace period in progress. +No grace period, no CPU stall warnings. + +To diagnose the cause of the stall, inspect the stack traces. +The offending function will usually be near the top of the stack. +If you have a series of stall warnings from a single extended stall, +comparing the stack traces can often help determine where the stall +is occurring, which will usually be in the function nearest the top of +that portion of the stack which remains the same from trace to trace. +If you can reliably trigger the stall, ftrace can be quite helpful. + +RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE +and with RCU's event tracing. For information on RCU's event tracing, +see include/trace/events/rcu.h. + + +Fine-Tuning the RCU CPU Stall Detector +====================================== + +The rcuupdate.rcu_cpu_stall_suppress module parameter disables RCU's +CPU stall detector, which detects conditions that unduly delay RCU grace +periods. This module parameter enables CPU stall detection by default, +but may be overridden via boot-time parameter or at runtime via sysfs. +The stall detector's idea of what constitutes "unduly delayed" is +controlled by a set of kernel configuration variables and cpp macros: + +CONFIG_RCU_CPU_STALL_TIMEOUT +---------------------------- + + This kernel configuration parameter defines the period of time + that RCU will wait from the beginning of a grace period until it + issues an RCU CPU stall warning. This time period is normally + 21 seconds. + + This configuration parameter may be changed at runtime via the + /sys/module/rcupdate/parameters/rcu_cpu_stall_timeout, however + this parameter is checked only at the beginning of a cycle. + So if you are 10 seconds into a 40-second stall, setting this + sysfs parameter to (say) five will shorten the timeout for the + -next- stall, or the following warning for the current stall + (assuming the stall lasts long enough). It will not affect the + timing of the next warning for the current stall. + + Stall-warning messages may be enabled and disabled completely via + /sys/module/rcupdate/parameters/rcu_cpu_stall_suppress. + +RCU_STALL_DELAY_DELTA +--------------------- + + Although the lockdep facility is extremely useful, it does add + some overhead. Therefore, under CONFIG_PROVE_RCU, the + RCU_STALL_DELAY_DELTA macro allows five extra seconds before + giving an RCU CPU stall warning message. (This is a cpp + macro, not a kernel configuration parameter.) + +RCU_STALL_RAT_DELAY +------------------- + + The CPU stall detector tries to make the offending CPU print its + own warnings, as this often gives better-quality stack traces. + However, if the offending CPU does not detect its own stall in + the number of jiffies specified by RCU_STALL_RAT_DELAY, then + some other CPU will complain. This delay is normally set to + two jiffies. (This is a cpp macro, not a kernel configuration + parameter.) + +rcupdate.rcu_task_stall_timeout +------------------------------- + + This boot/sysfs parameter controls the RCU-tasks stall warning + interval. A value of zero or less suppresses RCU-tasks stall + warnings. A positive value sets the stall-warning interval + in seconds. An RCU-tasks stall warning starts with the line: + + INFO: rcu_tasks detected stalls on tasks: + + And continues with the output of sched_show_task() for each + task stalling the current RCU-tasks grace period. + + +Interpreting RCU's CPU Stall-Detector "Splats" +============================================== + +For non-RCU-tasks flavors of RCU, when a CPU detects that it is stalling, +it will print a message similar to the following:: + + INFO: rcu_sched detected stalls on CPUs/tasks: + 2-...: (3 GPs behind) idle=06c/0/0 softirq=1453/1455 fqs=0 + 16-...: (0 ticks this GP) idle=81c/0/0 softirq=764/764 fqs=0 + (detected by 32, t=2603 jiffies, g=7075, q=625) + +This message indicates that CPU 32 detected that CPUs 2 and 16 were both +causing stalls, and that the stall was affecting RCU-sched. This message +will normally be followed by stack dumps for each CPU. Please note that +PREEMPT_RCU builds can be stalled by tasks as well as by CPUs, and that +the tasks will be indicated by PID, for example, "P3421". It is even +possible for an rcu_state stall to be caused by both CPUs -and- tasks, +in which case the offending CPUs and tasks will all be called out in the list. + +CPU 2's "(3 GPs behind)" indicates that this CPU has not interacted with +the RCU core for the past three grace periods. In contrast, CPU 16's "(0 +ticks this GP)" indicates that this CPU has not taken any scheduling-clock +interrupts during the current stalled grace period. + +The "idle=" portion of the message prints the dyntick-idle state. +The hex number before the first "/" is the low-order 12 bits of the +dynticks counter, which will have an even-numbered value if the CPU +is in dyntick-idle mode and an odd-numbered value otherwise. The hex +number between the two "/"s is the value of the nesting, which will be +a small non-negative number if in the idle loop (as shown above) and a +very large positive number otherwise. + +The "softirq=" portion of the message tracks the number of RCU softirq +handlers that the stalled CPU has executed. The number before the "/" +is the number that had executed since boot at the time that this CPU +last noted the beginning of a grace period, which might be the current +(stalled) grace period, or it might be some earlier grace period (for +example, if the CPU might have been in dyntick-idle mode for an extended +time period. The number after the "/" is the number that have executed +since boot until the current time. If this latter number stays constant +across repeated stall-warning messages, it is possible that RCU's softirq +handlers are no longer able to execute on this CPU. This can happen if +the stalled CPU is spinning with interrupts are disabled, or, in -rt +kernels, if a high-priority process is starving RCU's softirq handler. + +The "fqs=" shows the number of force-quiescent-state idle/offline +detection passes that the grace-period kthread has made across this +CPU since the last time that this CPU noted the beginning of a grace +period. + +The "detected by" line indicates which CPU detected the stall (in this +case, CPU 32), how many jiffies have elapsed since the start of the grace +period (in this case 2603), the grace-period sequence number (7075), and +an estimate of the total number of RCU callbacks queued across all CPUs +(625 in this case). + +In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed +for each CPU:: + + 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 dyntick_enabled: 1 + +The "last_accelerate:" prints the low-order 16 bits (in hex) of the +jiffies counter when this CPU last invoked rcu_try_advance_all_cbs() +from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from +rcu_prepare_for_idle(). "dyntick_enabled: 1" indicates that dyntick-idle +processing is enabled. + +If the grace period ends just as the stall warning starts printing, +there will be a spurious stall-warning message, which will include +the following:: + + INFO: Stall ended before state dump start + +This is rare, but does happen from time to time in real life. It is also +possible for a zero-jiffy stall to be flagged in this case, depending +on how the stall warning and the grace-period initialization happen to +interact. Please note that it is not possible to entirely eliminate this +sort of false positive without resorting to things like stop_machine(), +which is overkill for this sort of problem. + +If all CPUs and tasks have passed through quiescent states, but the +grace period has nevertheless failed to end, the stall-warning splat +will include something like the following:: + + All QSes seen, last rcu_preempt kthread activity 23807 (4297905177-4297881370), jiffies_till_next_fqs=3, root ->qsmask 0x0 + +The "23807" indicates that it has been more than 23 thousand jiffies +since the grace-period kthread ran. The "jiffies_till_next_fqs" +indicates how frequently that kthread should run, giving the number +of jiffies between force-quiescent-state scans, in this case three, +which is way less than 23807. Finally, the root rcu_node structure's +->qsmask field is printed, which will normally be zero. + +If the relevant grace-period kthread has been unable to run prior to +the stall warning, as was the case in the "All QSes seen" line above, +the following additional line is printed:: + + kthread starved for 23807 jiffies! g7075 f0x0 RCU_GP_WAIT_FQS(3) ->state=0x1 ->cpu=5 + +Starving the grace-period kthreads of CPU time can of course result +in RCU CPU stall warnings even when all CPUs and tasks have passed +through the required quiescent states. The "g" number shows the current +grace-period sequence number, the "f" precedes the ->gp_flags command +to the grace-period kthread, the "RCU_GP_WAIT_FQS" indicates that the +kthread is waiting for a short timeout, the "state" precedes value of the +task_struct ->state field, and the "cpu" indicates that the grace-period +kthread last ran on CPU 5. + + +Multiple Warnings From One Stall +================================ + +If a stall lasts long enough, multiple stall-warning messages will be +printed for it. The second and subsequent messages are printed at +longer intervals, so that the time between (say) the first and second +message will be about three times the interval between the beginning +of the stall and the first message. + + +Stall Warnings for Expedited Grace Periods +========================================== + +If an expedited grace period detects a stall, it will place a message +like the following in dmesg:: + + INFO: rcu_sched detected expedited stalls on CPUs/tasks: { 7-... } 21119 jiffies s: 73 root: 0x2/. + +This indicates that CPU 7 has failed to respond to a reschedule IPI. +The three periods (".") following the CPU number indicate that the CPU +is online (otherwise the first period would instead have been "O"), +that the CPU was online at the beginning of the expedited grace period +(otherwise the second period would have instead been "o"), and that +the CPU has been online at least once since boot (otherwise, the third +period would instead have been "N"). The number before the "jiffies" +indicates that the expedited grace period has been going on for 21,119 +jiffies. The number following the "s:" indicates that the expedited +grace-period sequence counter is 73. The fact that this last value is +odd indicates that an expedited grace period is in flight. The number +following "root:" is a bitmask that indicates which children of the root +rcu_node structure correspond to CPUs and/or tasks that are blocking the +current expedited grace period. If the tree had more than one level, +additional hex numbers would be printed for the states of the other +rcu_node structures in the tree. + +As with normal grace periods, PREEMPT_RCU builds can be stalled by +tasks as well as by CPUs, and that the tasks will be indicated by PID, +for example, "P3421". + +It is entirely possible to see stall warnings from normal and from +expedited grace periods at about the same time during the same run. diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt deleted file mode 100644 index a360a8796710..000000000000 --- a/Documentation/RCU/stallwarn.txt +++ /dev/null @@ -1,316 +0,0 @@ -Using RCU's CPU Stall Detector - -This document first discusses what sorts of issues RCU's CPU stall -detector can locate, and then discusses kernel parameters and Kconfig -options that can be used to fine-tune the detector's operation. Finally, -this document explains the stall detector's "splat" format. - - -What Causes RCU CPU Stall Warnings? - -So your kernel printed an RCU CPU stall warning. The next question is -"What caused it?" The following problems can result in RCU CPU stall -warnings: - -o A CPU looping in an RCU read-side critical section. - -o A CPU looping with interrupts disabled. - -o A CPU looping with preemption disabled. - -o A CPU looping with bottom halves disabled. - -o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel - without invoking schedule(). If the looping in the kernel is - really expected and desirable behavior, you might need to add - some calls to cond_resched(). - -o Booting Linux using a console connection that is too slow to - keep up with the boot-time console-message rate. For example, - a 115Kbaud serial console can be -way- too slow to keep up - with boot-time message rates, and will frequently result in - RCU CPU stall warning messages. Especially if you have added - debug printk()s. - -o Anything that prevents RCU's grace-period kthreads from running. - This can result in the "All QSes seen" console-log message. - This message will include information on when the kthread last - ran and how often it should be expected to run. It can also - result in the "rcu_.*kthread starved for" console-log message, - which will include additional debugging information. - -o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might - happen to preempt a low-priority task in the middle of an RCU - read-side critical section. This is especially damaging if - that low-priority task is not permitted to run on any other CPU, - in which case the next RCU grace period can never complete, which - will eventually cause the system to run out of memory and hang. - While the system is in the process of running itself out of - memory, you might see stall-warning messages. - -o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that - is running at a higher priority than the RCU softirq threads. - This will prevent RCU callbacks from ever being invoked, - and in a CONFIG_PREEMPT_RCU kernel will further prevent - RCU grace periods from ever completing. Either way, the - system will eventually run out of memory and hang. In the - CONFIG_PREEMPT_RCU case, you might see stall-warning - messages. - - You can use the rcutree.kthread_prio kernel boot parameter to - increase the scheduling priority of RCU's kthreads, which can - help avoid this problem. However, please note that doing this - can increase your system's context-switch rate and thus degrade - performance. - -o A periodic interrupt whose handler takes longer than the time - interval between successive pairs of interrupts. This can - prevent RCU's kthreads and softirq handlers from running. - Note that certain high-overhead debugging options, for example - the function_graph tracer, can result in interrupt handler taking - considerably longer than normal, which can in turn result in - RCU CPU stall warnings. - -o Testing a workload on a fast system, tuning the stall-warning - timeout down to just barely avoid RCU CPU stall warnings, and then - running the same workload with the same stall-warning timeout on a - slow system. Note that thermal throttling and on-demand governors - can cause a single system to be sometimes fast and sometimes slow! - -o A hardware or software issue shuts off the scheduler-clock - interrupt on a CPU that is not in dyntick-idle mode. This - problem really has happened, and seems to be most likely to - result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels. - -o A bug in the RCU implementation. - -o A hardware failure. This is quite unlikely, but has occurred - at least once in real life. A CPU failed in a running system, - becoming unresponsive, but not causing an immediate crash. - This resulted in a series of RCU CPU stall warnings, eventually - leading the realization that the CPU had failed. - -The RCU, RCU-sched, and RCU-tasks implementations have CPU stall warning. -Note that SRCU does -not- have CPU stall warnings. Please note that -RCU only detects CPU stalls when there is a grace period in progress. -No grace period, no CPU stall warnings. - -To diagnose the cause of the stall, inspect the stack traces. -The offending function will usually be near the top of the stack. -If you have a series of stall warnings from a single extended stall, -comparing the stack traces can often help determine where the stall -is occurring, which will usually be in the function nearest the top of -that portion of the stack which remains the same from trace to trace. -If you can reliably trigger the stall, ftrace can be quite helpful. - -RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE -and with RCU's event tracing. For information on RCU's event tracing, -see include/trace/events/rcu.h. - - -Fine-Tuning the RCU CPU Stall Detector - -The rcuupdate.rcu_cpu_stall_suppress module parameter disables RCU's -CPU stall detector, which detects conditions that unduly delay RCU grace -periods. This module parameter enables CPU stall detection by default, -but may be overridden via boot-time parameter or at runtime via sysfs. -The stall detector's idea of what constitutes "unduly delayed" is -controlled by a set of kernel configuration variables and cpp macros: - -CONFIG_RCU_CPU_STALL_TIMEOUT - - This kernel configuration parameter defines the period of time - that RCU will wait from the beginning of a grace period until it - issues an RCU CPU stall warning. This time period is normally - 21 seconds. - - This configuration parameter may be changed at runtime via the - /sys/module/rcupdate/parameters/rcu_cpu_stall_timeout, however - this parameter is checked only at the beginning of a cycle. - So if you are 10 seconds into a 40-second stall, setting this - sysfs parameter to (say) five will shorten the timeout for the - -next- stall, or the following warning for the current stall - (assuming the stall lasts long enough). It will not affect the - timing of the next warning for the current stall. - - Stall-warning messages may be enabled and disabled completely via - /sys/module/rcupdate/parameters/rcu_cpu_stall_suppress. - -RCU_STALL_DELAY_DELTA - - Although the lockdep facility is extremely useful, it does add - some overhead. Therefore, under CONFIG_PROVE_RCU, the - RCU_STALL_DELAY_DELTA macro allows five extra seconds before - giving an RCU CPU stall warning message. (This is a cpp - macro, not a kernel configuration parameter.) - -RCU_STALL_RAT_DELAY - - The CPU stall detector tries to make the offending CPU print its - own warnings, as this often gives better-quality stack traces. - However, if the offending CPU does not detect its own stall in - the number of jiffies specified by RCU_STALL_RAT_DELAY, then - some other CPU will complain. This delay is normally set to - two jiffies. (This is a cpp macro, not a kernel configuration - parameter.) - -rcupdate.rcu_task_stall_timeout - - This boot/sysfs parameter controls the RCU-tasks stall warning - interval. A value of zero or less suppresses RCU-tasks stall - warnings. A positive value sets the stall-warning interval - in seconds. An RCU-tasks stall warning starts with the line: - - INFO: rcu_tasks detected stalls on tasks: - - And continues with the output of sched_show_task() for each - task stalling the current RCU-tasks grace period. - - -Interpreting RCU's CPU Stall-Detector "Splats" - -For non-RCU-tasks flavors of RCU, when a CPU detects that it is stalling, -it will print a message similar to the following: - - INFO: rcu_sched detected stalls on CPUs/tasks: - 2-...: (3 GPs behind) idle=06c/0/0 softirq=1453/1455 fqs=0 - 16-...: (0 ticks this GP) idle=81c/0/0 softirq=764/764 fqs=0 - (detected by 32, t=2603 jiffies, g=7075, q=625) - -This message indicates that CPU 32 detected that CPUs 2 and 16 were both -causing stalls, and that the stall was affecting RCU-sched. This message -will normally be followed by stack dumps for each CPU. Please note that -PREEMPT_RCU builds can be stalled by tasks as well as by CPUs, and that -the tasks will be indicated by PID, for example, "P3421". It is even -possible for an rcu_state stall to be caused by both CPUs -and- tasks, -in which case the offending CPUs and tasks will all be called out in the list. - -CPU 2's "(3 GPs behind)" indicates that this CPU has not interacted with -the RCU core for the past three grace periods. In contrast, CPU 16's "(0 -ticks this GP)" indicates that this CPU has not taken any scheduling-clock -interrupts during the current stalled grace period. - -The "idle=" portion of the message prints the dyntick-idle state. -The hex number before the first "/" is the low-order 12 bits of the -dynticks counter, which will have an even-numbered value if the CPU -is in dyntick-idle mode and an odd-numbered value otherwise. The hex -number between the two "/"s is the value of the nesting, which will be -a small non-negative number if in the idle loop (as shown above) and a -very large positive number otherwise. - -The "softirq=" portion of the message tracks the number of RCU softirq -handlers that the stalled CPU has executed. The number before the "/" -is the number that had executed since boot at the time that this CPU -last noted the beginning of a grace period, which might be the current -(stalled) grace period, or it might be some earlier grace period (for -example, if the CPU might have been in dyntick-idle mode for an extended -time period. The number after the "/" is the number that have executed -since boot until the current time. If this latter number stays constant -across repeated stall-warning messages, it is possible that RCU's softirq -handlers are no longer able to execute on this CPU. This can happen if -the stalled CPU is spinning with interrupts are disabled, or, in -rt -kernels, if a high-priority process is starving RCU's softirq handler. - -The "fqs=" shows the number of force-quiescent-state idle/offline -detection passes that the grace-period kthread has made across this -CPU since the last time that this CPU noted the beginning of a grace -period. - -The "detected by" line indicates which CPU detected the stall (in this -case, CPU 32), how many jiffies have elapsed since the start of the grace -period (in this case 2603), the grace-period sequence number (7075), and -an estimate of the total number of RCU callbacks queued across all CPUs -(625 in this case). - -In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed -for each CPU: - - 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 dyntick_enabled: 1 - -The "last_accelerate:" prints the low-order 16 bits (in hex) of the -jiffies counter when this CPU last invoked rcu_try_advance_all_cbs() -from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from -rcu_prepare_for_idle(). "dyntick_enabled: 1" indicates that dyntick-idle -processing is enabled. - -If the grace period ends just as the stall warning starts printing, -there will be a spurious stall-warning message, which will include -the following: - - INFO: Stall ended before state dump start - -This is rare, but does happen from time to time in real life. It is also -possible for a zero-jiffy stall to be flagged in this case, depending -on how the stall warning and the grace-period initialization happen to -interact. Please note that it is not possible to entirely eliminate this -sort of false positive without resorting to things like stop_machine(), -which is overkill for this sort of problem. - -If all CPUs and tasks have passed through quiescent states, but the -grace period has nevertheless failed to end, the stall-warning splat -will include something like the following: - - All QSes seen, last rcu_preempt kthread activity 23807 (4297905177-4297881370), jiffies_till_next_fqs=3, root ->qsmask 0x0 - -The "23807" indicates that it has been more than 23 thousand jiffies -since the grace-period kthread ran. The "jiffies_till_next_fqs" -indicates how frequently that kthread should run, giving the number -of jiffies between force-quiescent-state scans, in this case three, -which is way less than 23807. Finally, the root rcu_node structure's -->qsmask field is printed, which will normally be zero. - -If the relevant grace-period kthread has been unable to run prior to -the stall warning, as was the case in the "All QSes seen" line above, -the following additional line is printed: - - kthread starved for 23807 jiffies! g7075 f0x0 RCU_GP_WAIT_FQS(3) ->state=0x1 ->cpu=5 - -Starving the grace-period kthreads of CPU time can of course result -in RCU CPU stall warnings even when all CPUs and tasks have passed -through the required quiescent states. The "g" number shows the current -grace-period sequence number, the "f" precedes the ->gp_flags command -to the grace-period kthread, the "RCU_GP_WAIT_FQS" indicates that the -kthread is waiting for a short timeout, the "state" precedes value of the -task_struct ->state field, and the "cpu" indicates that the grace-period -kthread last ran on CPU 5. - - -Multiple Warnings From One Stall - -If a stall lasts long enough, multiple stall-warning messages will be -printed for it. The second and subsequent messages are printed at -longer intervals, so that the time between (say) the first and second -message will be about three times the interval between the beginning -of the stall and the first message. - - -Stall Warnings for Expedited Grace Periods - -If an expedited grace period detects a stall, it will place a message -like the following in dmesg: - - INFO: rcu_sched detected expedited stalls on CPUs/tasks: { 7-... } 21119 jiffies s: 73 root: 0x2/. - -This indicates that CPU 7 has failed to respond to a reschedule IPI. -The three periods (".") following the CPU number indicate that the CPU -is online (otherwise the first period would instead have been "O"), -that the CPU was online at the beginning of the expedited grace period -(otherwise the second period would have instead been "o"), and that -the CPU has been online at least once since boot (otherwise, the third -period would instead have been "N"). The number before the "jiffies" -indicates that the expedited grace period has been going on for 21,119 -jiffies. The number following the "s:" indicates that the expedited -grace-period sequence counter is 73. The fact that this last value is -odd indicates that an expedited grace period is in flight. The number -following "root:" is a bitmask that indicates which children of the root -rcu_node structure correspond to CPUs and/or tasks that are blocking the -current expedited grace period. If the tree had more than one level, -additional hex numbers would be printed for the states of the other -rcu_node structures in the tree. - -As with normal grace periods, PREEMPT_RCU builds can be stalled by -tasks as well as by CPUs, and that the tasks will be indicated by PID, -for example, "P3421". - -It is entirely possible to see stall warnings from normal and from -expedited grace periods at about the same time during the same run. diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 54a6dba0280d..b04256cd7e12 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -468,7 +468,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) /* * OK, time to rat on our buddy... - * See Documentation/RCU/stallwarn.txt for info on how to debug + * See Documentation/RCU/stallwarn.rst for info on how to debug * RCU CPU stall warnings. */ pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name); @@ -535,7 +535,7 @@ static void print_cpu_stall(unsigned long gps) /* * OK, time to rat on ourselves... - * See Documentation/RCU/stallwarn.txt for info on how to debug + * See Documentation/RCU/stallwarn.rst for info on how to debug * RCU CPU stall warnings. */ pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name); -- cgit v1.2.3 From 7ee880b7bf1dea88d0a472b775aebdb4fb6bf860 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 15 Apr 2020 22:26:55 +0000 Subject: rcu: Initialize and destroy rcu_synchronize only when necessary The __wait_rcu_gp() function unconditionally initializes and cleans up each element of rs_array[], whether used or not. This is slightly wasteful and rather confusing, so this commit skips both initialization and cleanup for duplicate callback functions. Signed-off-by: Wei Yang Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 84843adfd939..f5a82e107bcb 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -390,13 +390,14 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, might_sleep(); continue; } - init_rcu_head_on_stack(&rs_array[i].head); - init_completion(&rs_array[i].completion); for (j = 0; j < i; j++) if (crcu_array[j] == crcu_array[i]) break; - if (j == i) + if (j == i) { + init_rcu_head_on_stack(&rs_array[i].head); + init_completion(&rs_array[i].completion); (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); + } } /* Wait for all callbacks to be invoked. */ @@ -407,9 +408,10 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, for (j = 0; j < i; j++) if (crcu_array[j] == crcu_array[i]) break; - if (j == i) + if (j == i) { wait_for_completion(&rs_array[i].completion); - destroy_rcu_head_on_stack(&rs_array[i].head); + destroy_rcu_head_on_stack(&rs_array[i].head); + } } } EXPORT_SYMBOL_GPL(__wait_rcu_gp); -- cgit v1.2.3 From abfce0414814149f716e1d30da1fb3140d1b3473 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sun, 19 Apr 2020 21:57:15 +0000 Subject: rcu: Simplify the calculation of rcu_state.ncpus There is only 1 bit set in mask, which means that the only difference between oldmask and the new one will be at the position where the bit is set in mask. This commit therefore updates rcu_state.ncpus by checking whether the bit in mask is already set in rnp->expmaskinitnext. Signed-off-by: Wei Yang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6c6569e0586c..bef1dc91bfbe 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3842,10 +3842,9 @@ void rcu_cpu_starting(unsigned int cpu) { unsigned long flags; unsigned long mask; - int nbits; - unsigned long oldmask; struct rcu_data *rdp; struct rcu_node *rnp; + bool newcpu; if (per_cpu(rcu_cpu_started, cpu)) return; @@ -3857,12 +3856,10 @@ void rcu_cpu_starting(unsigned int cpu) mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); - oldmask = rnp->expmaskinitnext; + newcpu = !(rnp->expmaskinitnext & mask); rnp->expmaskinitnext |= mask; - oldmask ^= rnp->expmaskinitnext; - nbits = bitmap_weight(&oldmask, BITS_PER_LONG); /* Allow lockless access for expedited grace periods. */ - smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */ + smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + newcpu); /* ^^^ */ ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus); rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); -- cgit v1.2.3 From e816d56fad57ba9817cef6606b12f5e14647c3bf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 1 May 2020 16:49:48 -0700 Subject: rcu: Add callbacks-invoked counters This commit adds a count of the callbacks invoked to the per-CPU rcu_data structure. This count is printed by the show_rcu_gp_kthreads() that is invoked by rcutorture and the RCU CPU stall-warning code. It is also intended for use by drgn. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + kernel/rcu/tree.h | 1 + kernel/rcu/tree_stall.h | 3 +++ 3 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bef1dc91bfbe..874c831bcc45 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2443,6 +2443,7 @@ static void rcu_do_batch(struct rcu_data *rdp) local_irq_save(flags); rcu_nocb_lock(rdp); count = -rcl.len; + rdp->n_cbs_invoked += count; trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 43991a40b084..9c6f7343bec0 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -171,6 +171,7 @@ struct rcu_data { /* different grace periods. */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ + unsigned long n_cbs_invoked; /* # callbacks invoked since boot. */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 54a6dba0280d..2768ce6bf657 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -649,6 +649,7 @@ static void check_cpu_stall(struct rcu_data *rdp) */ void show_rcu_gp_kthreads(void) { + unsigned long cbs = 0; int cpu; unsigned long j; unsigned long ja; @@ -690,9 +691,11 @@ void show_rcu_gp_kthreads(void) } for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); + cbs += data_race(rdp->n_cbs_invoked); if (rcu_segcblist_is_offloaded(&rdp->cblist)) show_rcu_nocb_state(rdp); } + pr_info("RCU callbacks invoked since boot: %lu\n", cbs); show_rcu_tasks_gp_kthreads(); } EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); -- cgit v1.2.3 From f8466f94685b5bd931384526cf51e090fd2ac706 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 3 May 2020 19:16:09 -0700 Subject: rcu: Add comment documenting rcu_callback_map's purpose The rcu_callback_map lockdep_map structure was added back in 2013, but its purpose has become obscure. This commit therefore documments that the purpose of rcu_callback map is, in the words of commit 24ef659a857 ("rcu: Provide better diagnostics for blocking in RCU callback functions"), to help lockdep to tie an "inappropriate voluntary context switch back to the fact that the function is being invoked from within a callback." Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f5a82e107bcb..ca17b771ad60 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -279,6 +279,7 @@ struct lockdep_map rcu_sched_lock_map = { }; EXPORT_SYMBOL_GPL(rcu_sched_lock_map); +// Tell lockdep when RCU callbacks are being invoked. static struct lock_class_key rcu_callback_key; struct lockdep_map rcu_callback_map = STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key); -- cgit v1.2.3 From 77865dea25c4f45ce0c5bf61a8470af01fccd944 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 7 May 2020 15:44:46 -0700 Subject: rcu: Grace-period-kthread related sleeps to idle priority This commit converts the long-standing schedule_timeout_interruptible() and schedule_timeout_uninterruptible() calls used by RCU's grace-period kthread to schedule_timeout_idle(). This conversion avoids polluting the load-average with RCU-related sleeping. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 874c831bcc45..feb31c201dee 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1638,7 +1638,7 @@ static void rcu_gp_slow(int delay) if (delay > 0 && !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) - schedule_timeout_uninterruptible(delay); + schedule_timeout_idle(delay); } static unsigned long sleep_duration; @@ -1661,7 +1661,7 @@ static void rcu_gp_torture_wait(void) duration = xchg(&sleep_duration, 0UL); if (duration > 0) { pr_alert("%s: Waiting %lu jiffies\n", __func__, duration); - schedule_timeout_uninterruptible(duration); + schedule_timeout_idle(duration); pr_alert("%s: Wait complete\n", __func__); } } @@ -2727,7 +2727,7 @@ static void rcu_cpu_kthread(unsigned int cpu) } *statusp = RCU_KTHREAD_YIELDING; trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); - schedule_timeout_interruptible(2); + schedule_timeout_idle(2); trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); *statusp = RCU_KTHREAD_WAITING; } -- cgit v1.2.3 From a9352f72d6a9e8fe4840b9f0d97af8f5a6c52c79 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 7 May 2020 16:34:38 -0700 Subject: rcu: Priority-boost-related sleeps to idle priority This commit converts the long-standing schedule_timeout_interruptible() call used by RCU's priority-boosting kthreads to schedule_timeout_idle(). This conversion avoids polluting the load-average with RCU-related sleeping. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 352223664ebd..25296c17a30d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1033,7 +1033,7 @@ static int rcu_boost_kthread(void *arg) if (spincnt > 10) { WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING); trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); - schedule_timeout_interruptible(2); + schedule_timeout_idle(2); trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); spincnt = 0; } -- cgit v1.2.3 From f5ca34643bbd84f514bdeee194c45dd1fb066ef2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 7 May 2020 16:36:10 -0700 Subject: rcu: No-CBs-related sleeps to idle priority This commit converts the schedule_timeout_interruptible() call used by RCU's no-CBs grace-period kthreads to schedule_timeout_idle(). This conversion avoids polluting the load-average with RCU-related sleeping. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 25296c17a30d..982fc5be5269 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2005,7 +2005,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) /* Polling, so trace if first poll in the series. */ if (gotcbs) trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll")); - schedule_timeout_interruptible(1); + schedule_timeout_idle(1); } else if (!needwait_gp) { /* Wait for callbacks to appear. */ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); -- cgit v1.2.3 From 68c2f27e01f61760e6ae76fff9682e1ffe9bacb6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 7 May 2020 16:38:29 -0700 Subject: rcu: Expedited grace-period sleeps to idle priority This commit converts the schedule_timeout_uninterruptible() call used by RCU's expedited grace-period processing to schedule_timeout_idle(). This conversion avoids polluting the load-average with RCU-related sleeping. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 72952edad1e4..1888c0eb1216 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -403,7 +403,7 @@ retry_ipi: /* Online, so delay for a bit and try again. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("selectofl")); - schedule_timeout_uninterruptible(1); + schedule_timeout_idle(1); goto retry_ipi; } /* CPU really is offline, so we must report its QS. */ -- cgit v1.2.3 From 360fbbb4897c98971e8955b063c01250817a2191 Mon Sep 17 00:00:00 2001 From: Lihao Liang Date: Thu, 14 May 2020 21:34:34 +0100 Subject: rcu: Update comment from rsp->rcu_gp_seq to rsp->gp_seq Signed-off-by: Lihao Liang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9c6f7343bec0..575745f0a464 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -41,7 +41,7 @@ struct rcu_node { raw_spinlock_t __private lock; /* Root rcu_node's lock protects */ /* some rcu_state fields as well as */ /* following. */ - unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */ + unsigned long gp_seq; /* Track rsp->gp_seq. */ unsigned long gp_seq_needed; /* Track furthest future GP request. */ unsigned long completedqs; /* All QSes done for this node. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ @@ -149,7 +149,7 @@ union rcu_noqs { /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ - unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */ + unsigned long gp_seq; /* Track rsp->gp_seq counter. */ unsigned long gp_seq_needed; /* Track furthest future GP request. */ union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ bool core_needs_qs; /* Core waits for quiesc state. */ -- cgit v1.2.3 From 3c8920e2dbd1a55f72dc14d656df9d0097cf5c72 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 15 May 2020 02:34:29 +0200 Subject: tick/nohz: Narrow down noise while setting current task's tick dependency Setting a tick dependency on any task, including the case where a task sets that dependency on itself, triggers an IPI to all CPUs. That is of course suboptimal but it had previously not been an issue because it was only used by POSIX CPU timers on nohz_full, which apparently never occurs in latency-sensitive workloads in production. (Or users of such systems are suffering in silence on the one hand or venting their ire on the wrong people on the other.) But RCU now sets a task tick dependency on the current task in order to fix stall issues that can occur during RCU callback processing. Thus, RCU callback processing triggers frequent system-wide IPIs from nohz_full CPUs. This is quite counter-productive, after all, avoiding IPIs is what nohz_full is supposed to be all about. This commit therefore optimizes tasks' self-setting of a task tick dependency by using tick_nohz_full_kick() to avoid the system-wide IPI. Instead, only the execution of the one task is disturbed, which is acceptable given that this disturbance is well down into the noise compared to the degree to which the RCU callback processing itself disturbs execution. Fixes: 6a949b7af82d (rcu: Force on tick when invoking lots of callbacks) Reported-by: Matt Fleming Signed-off-by: Frederic Weisbecker Cc: stable@kernel.org Cc: Paul E. McKenney Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/time/tick-sched.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3e2dc9b8858c..f0199a4ba1ad 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -351,16 +351,24 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); /* - * Set a per-task tick dependency. Posix CPU timers need this in order to elapse - * per task timers. + * Set a per-task tick dependency. RCU need this. Also posix CPU timers + * in order to elapse per task timers. */ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) { - /* - * We could optimize this with just kicking the target running the task - * if that noise matters for nohz full users. - */ - tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit); + if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) { + if (tsk == current) { + preempt_disable(); + tick_nohz_full_kick(); + preempt_enable(); + } else { + /* + * Some future tick_nohz_full_kick_task() + * should optimize this. + */ + tick_nohz_full_kick_all(); + } + } } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task); -- cgit v1.2.3 From 55fbe86ef303bc8ab040e579fba34a750c08200e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 May 2020 15:02:02 -0700 Subject: rcu: Remove initialized but unused rnp from check_slow_task() This commit removes the variable rnp from check_slow_task(), which is defined, assigned to, but not otherwise used. Reported-by: kbuild test robot Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 2768ce6bf657..d203f82a380a 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -237,14 +237,12 @@ struct rcu_stall_chk_rdr { */ static bool check_slow_task(struct task_struct *t, void *arg) { - struct rcu_node *rnp; struct rcu_stall_chk_rdr *rscrp = arg; if (task_curr(t)) return false; // It is running, so decline to inspect it. rscrp->nesting = t->rcu_read_lock_nesting; rscrp->rs = t->rcu_read_unlock_special; - rnp = t->rcu_blocked_node; rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry); return true; } -- cgit v1.2.3 From 04b25a495bd68c1dad07263fb91e8b5a31c00a9e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 May 2020 17:00:54 -0700 Subject: rcu: Mark rcu_nmi_enter() call to rcu_cleanup_after_idle() noinstr The objtool complains about the call to rcu_cleanup_after_idle() from rcu_nmi_enter(), so this commit adds instrumentation_begin() before that call and instrumentation_end() after it. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index feb31c201dee..d17e5a08bf43 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -990,8 +990,11 @@ noinstr void rcu_nmi_enter(void) rcu_dynticks_eqs_exit(); // ... but is watching here. - if (!in_nmi()) + if (!in_nmi()) { + instrumentation_begin(); rcu_cleanup_after_idle(); + instrumentation_end(); + } instrumentation_begin(); // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() -- cgit v1.2.3 From d29e0b26b020422cc51b5b51733cc50fcf443965 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 May 2020 08:49:29 -0700 Subject: lockdep: Complain only once about RCU in extended quiescent state Currently, lockdep_rcu_suspicious() complains twice about RCU read-side critical sections being invoked from within extended quiescent states, for example: RCU used illegally from idle CPU! rcu_scheduler_active = 2, debug_locks = 1 RCU used illegally from extended quiescent state! This commit therefore saves a couple lines of code and one line of console-log output by eliminating the first of these two complaints. Link: https://lore.kernel.org/lkml/87wo4wnpzb.fsf@nanos.tec.linutronix.de Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Will Deacon Signed-off-by: Paul E. McKenney --- kernel/locking/lockdep.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 29a8de4c50b9..0a7549d159ed 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -5851,9 +5851,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n", !rcu_lockdep_current_cpu_online() ? "RCU used illegally from offline CPU!\n" - : !rcu_is_watching() - ? "RCU used illegally from idle CPU!\n" - : "", + : "", rcu_scheduler_active, debug_locks); /* -- cgit v1.2.3 From e40bb921119814c6f746891af9cd37eccda616a4 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Mon, 1 Jun 2020 19:45:49 +0100 Subject: rcu: Replace 1 with true Coccinelle reports a warning WARNING: Assignment of 0/1 to bool variable The root cause is that the variable lastphase is a bool, but is initialised with integer 1. This commit therefore replaces the 1 with a true. Signed-off-by: Jules Irenge Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index ca17b771ad60..a0ba8858dd35 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -207,7 +207,7 @@ void rcu_end_inkernel_boot(void) rcu_unexpedite_gp(); if (rcu_normal_after_boot) WRITE_ONCE(rcu_normal, 1); - rcu_boot_ended = 1; + rcu_boot_ended = true; } /* -- cgit v1.2.3 From c6dfd72b7a3b70a2054db0f73245ea2f762a8452 Mon Sep 17 00:00:00 2001 From: Peter Enderborg Date: Thu, 4 Jun 2020 12:23:20 +0200 Subject: rcu: Stop shrinker loop The count and scan can be separated in time, and there is a fair chance that all work is already done when the scan starts, which might in turn result in a needless retry. This commit therefore avoids this retry by returning SHRINK_STOP. Reviewed-by: Uladzislau Rezki (Sony) Signed-off-by: Peter Enderborg Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d17e5a08bf43..c8196fab563c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3332,7 +3332,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) break; } - return freed; + return freed == 0 ? SHRINK_STOP : freed; } static struct shrinker kfree_rcu_shrinker = { -- cgit v1.2.3 From 00943a609d7ad0f08e58bc9c214f38b0ba163c88 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 12 Jun 2020 10:07:52 +0800 Subject: rcu: gp_max is protected by root rcu_node's lock Because gp_max is protected by root rcu_node's lock, this commit moves the gp_max definition to the region of the rcu_node structure containing fields protected by this lock. Signed-off-by: Wei Yang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 575745f0a464..09ec93b16f28 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -302,6 +302,8 @@ struct rcu_state { u8 boost ____cacheline_internodealigned_in_smp; /* Subject to priority boost. */ unsigned long gp_seq; /* Grace-period sequence #. */ + unsigned long gp_max; /* Maximum GP duration in */ + /* jiffies. */ struct task_struct *gp_kthread; /* Task for grace periods. */ struct swait_queue_head gp_wq; /* Where GP task waits. */ short gp_flags; /* Commands for GP task. */ @@ -347,8 +349,6 @@ struct rcu_state { /* a reluctant CPU. */ unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */ /* GP start. */ - unsigned long gp_max; /* Maximum GP duration in */ - /* jiffies. */ const char *name; /* Name of structure. */ char abbr; /* Abbreviated name. */ -- cgit v1.2.3 From a2dae43088d51c4869e7fa91ca09bcc890e277fc Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 12 Jun 2020 10:07:53 +0800 Subject: rcu: grplo/grphi just records CPU number The ->grplo and ->grphi fields store the lowest and highest CPU number covered by to a rcu_node structure, which is not the group number. This commit therefore adjusts these fields' comments to match reality. Signed-off-by: Wei Yang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 09ec93b16f28..9f903f5c9fa1 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -73,8 +73,8 @@ struct rcu_node { unsigned long ffmask; /* Fully functional CPUs. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ - int grplo; /* lowest-numbered CPU or group here. */ - int grphi; /* highest-numbered CPU or group here. */ + int grplo; /* lowest-numbered CPU here. */ + int grphi; /* highest-numbered CPU here. */ u8 grpnum; /* CPU/group number for next level up. */ u8 level; /* root is at level 0. */ bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */ -- cgit v1.2.3 From 7a0c2b0940c13a06573320ab7118375b35feef8b Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 12 Jun 2020 10:07:54 +0800 Subject: rcu: grpnum just records group number The ->grpnum field in the rcu_node structure contains the bit position in this structure's parent's bitmasks, which is not the CPU number. This commit therefore adjusts this field's comment accordingly. Signed-off-by: Wei Yang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9f903f5c9fa1..c96ae351688b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -75,7 +75,7 @@ struct rcu_node { /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU here. */ int grphi; /* highest-numbered CPU here. */ - u8 grpnum; /* CPU/group number for next level up. */ + u8 grpnum; /* group number for next level up. */ u8 level; /* root is at level 0. */ bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */ /* exit RCU read-side critical sections */ -- cgit v1.2.3 From c3cb47a6cc74af0b79579ba167d7124eb669fbaa Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 15 Jun 2020 12:28:05 -0700 Subject: kernel/rcu/tree.c: Fix kernel-doc warnings Fix kernel-doc warning: ../kernel/rcu/tree.c:959: warning: Excess function parameter 'irq' description in 'rcu_nmi_enter' Fixes: cf7614e13c8f ("rcu: Refactor rcu_{nmi,irq}_{enter,exit}()") Signed-off-by: Randy Dunlap Cc: Byungchul Park Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c8196fab563c..ef05aac7f9d3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -954,7 +954,6 @@ void __rcu_irq_enter_check_tick(void) /** * rcu_nmi_enter - inform RCU of entry to NMI context - * @irq: Is this call from rcu_irq_enter? * * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know -- cgit v1.2.3 From 8e11690d2f5a9823d66f68918c3986b4e9e160ab Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 4 May 2020 14:35:00 +0200 Subject: rcu: Fix a kernel-doc warnings for "count" There are some kernel-doc warnings: ./kernel/rcu/tree.c:2915: warning: Function parameter or member 'count' not described in 'kfree_rcu_cpu' This commit therefore moves the comment for "count" to the kernel-doc markup. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6c6569e0586c..ba4c477495b5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3004,6 +3004,7 @@ struct kfree_rcu_cpu_work { * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES * @monitor_todo: Tracks whether a @monitor_work delayed work is pending * @initialized: The @lock and @rcu_work fields have been initialized + * @count: Number of objects for which GP not started * * This is a per-CPU structure. The reason that it is not included in * the rcu_data structure is to permit this code to be extracted from @@ -3019,7 +3020,6 @@ struct kfree_rcu_cpu { struct delayed_work monitor_work; bool monitor_todo; bool initialized; - // Number of objects for which GP not started int count; }; -- cgit v1.2.3 From 8ac88f7177c75bf9b7b8c29a8054115e1c712baf Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 25 May 2020 23:47:45 +0200 Subject: rcu/tree: Keep kfree_rcu() awake during lock contention On PREEMPT_RT kernels, the krcp spinlock gets converted to an rt-mutex and causes kfree_rcu() callers to sleep. This makes it unusable for callers in purely atomic sections such as non-threaded IRQ handlers and raw spinlock sections. Fix it by converting the spinlock to a raw spinlock. Vetting all code paths, there is no reason to believe that the raw spinlock will hurt RT latencies as it is not held for a long time. Cc: bigeasy@linutronix.de Cc: Uladzislau Rezki Reviewed-by: Uladzislau Rezki Signed-off-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ba4c477495b5..c5de5adca0dd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3016,7 +3016,7 @@ struct kfree_rcu_cpu { struct kfree_rcu_bulk_data *bhead; struct kfree_rcu_bulk_data *bcached; struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; - spinlock_t lock; + raw_spinlock_t lock; struct delayed_work monitor_work; bool monitor_todo; bool initialized; @@ -3049,12 +3049,12 @@ static void kfree_rcu_work(struct work_struct *work) krwp = container_of(to_rcu_work(work), struct kfree_rcu_cpu_work, rcu_work); krcp = krwp->krcp; - spin_lock_irqsave(&krcp->lock, flags); + raw_spin_lock_irqsave(&krcp->lock, flags); head = krwp->head_free; krwp->head_free = NULL; bhead = krwp->bhead_free; krwp->bhead_free = NULL; - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); /* "bhead" is now private, so traverse locklessly. */ for (; bhead; bhead = bnext) { @@ -3157,14 +3157,14 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, krcp->monitor_todo = false; if (queue_kfree_rcu_work(krcp)) { // Success! Our job is done here. - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); return; } // Previous RCU batch still in progress, try again later. krcp->monitor_todo = true; schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); } /* @@ -3177,11 +3177,11 @@ static void kfree_rcu_monitor(struct work_struct *work) struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu, monitor_work.work); - spin_lock_irqsave(&krcp->lock, flags); + raw_spin_lock_irqsave(&krcp->lock, flags); if (krcp->monitor_todo) kfree_rcu_drain_unlock(krcp, flags); else - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); } static inline bool @@ -3252,7 +3252,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) local_irq_save(flags); // For safely calling this_cpu_ptr(). krcp = this_cpu_ptr(&krc); if (krcp->initialized) - spin_lock(&krcp->lock); + raw_spin_lock(&krcp->lock); // Queue the object but don't yet schedule the batch. if (debug_rcu_head_queue(head)) { @@ -3283,7 +3283,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) unlock_return: if (krcp->initialized) - spin_unlock(&krcp->lock); + raw_spin_unlock(&krcp->lock); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(kfree_call_rcu); @@ -3315,11 +3315,11 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); count = krcp->count; - spin_lock_irqsave(&krcp->lock, flags); + raw_spin_lock_irqsave(&krcp->lock, flags); if (krcp->monitor_todo) kfree_rcu_drain_unlock(krcp, flags); else - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); sc->nr_to_scan -= count; freed += count; @@ -3346,15 +3346,15 @@ void __init kfree_rcu_scheduler_running(void) for_each_online_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - spin_lock_irqsave(&krcp->lock, flags); + raw_spin_lock_irqsave(&krcp->lock, flags); if (!krcp->head || krcp->monitor_todo) { - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); continue; } krcp->monitor_todo = true; schedule_delayed_work_on(cpu, &krcp->monitor_work, KFREE_DRAIN_JIFFIES); - spin_unlock_irqrestore(&krcp->lock, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); } } @@ -4250,7 +4250,7 @@ static void __init kfree_rcu_batch_init(void) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - spin_lock_init(&krcp->lock); + raw_spin_lock_init(&krcp->lock); for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); krcp->krw_arr[i].krcp = krcp; -- cgit v1.2.3 From 4d2919411867848fab78c7cb13139e17ad8b85bc Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 25 May 2020 23:47:46 +0200 Subject: rcu/tree: Skip entry into the page allocator for PREEMPT_RT To keep the kfree_rcu() code working in purely atomic sections on RT, such as non-threaded IRQ handlers and raw spinlock sections, avoid calling into the page allocator which uses sleeping locks on RT. In fact, even if the caller is preemptible, the kfree_rcu() code is not, as the krcp->lock is a raw spinlock. Calling into the page allocator is optional and avoiding it should be Ok, especially with the page pre-allocation support in future patches. Such pre-allocation would further avoid the a need for a dynamically allocated page in the first place. Cc: Sebastian Andrzej Siewior Reviewed-by: Uladzislau Rezki Co-developed-by: Uladzislau Rezki Signed-off-by: Uladzislau Rezki Signed-off-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c5de5adca0dd..e0425faf3b3b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3202,6 +3202,18 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, if (!bnode) { WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE); + /* + * To keep this path working on raw non-preemptible + * sections, prevent the optional entry into the + * allocator as it uses sleeping locks. In fact, even + * if the caller of kfree_rcu() is preemptible, this + * path still is not, as krcp->lock is a raw spinlock. + * With additional page pre-allocation in the works, + * hitting this return is going to be much less likely. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return false; + bnode = (struct kfree_rcu_bulk_data *) __get_free_page(GFP_NOWAIT | __GFP_NOWARN); } -- cgit v1.2.3 From 594aa5975b9b5cfe9edaec06170e43b8c0607377 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:47 +0200 Subject: rcu/tree: Repeat the monitor if any free channel is busy It is possible that one of the channels cannot be detached because its free channel is busy and previously queued data has not been processed yet. On the other hand, another channel can be successfully detached causing the monitor work to stop. Prevent that by rescheduling the monitor work if there are any channels in the pending state after a detach attempt. Fixes: 34c881745549e ("rcu: Support kfree_bulk() interface in kfree_rcu()") Acked-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e0425faf3b3b..5151fe4e1429 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3105,7 +3105,7 @@ static void kfree_rcu_work(struct work_struct *work) static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) { struct kfree_rcu_cpu_work *krwp; - bool queued = false; + bool repeat = false; int i; lockdep_assert_held(&krcp->lock); @@ -3143,11 +3143,14 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) * been detached following each other, one by one. */ queue_rcu_work(system_wq, &krwp->rcu_work); - queued = true; } + + /* Repeat if any "free" corresponding channel is still busy. */ + if (krcp->bhead || krcp->head) + repeat = true; } - return queued; + return !repeat; } static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, -- cgit v1.2.3 From 446044eb9c9c335d3ae1be4665193ab43ebb284e Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 25 May 2020 23:47:48 +0200 Subject: rcu/tree: Make debug_objects logic independent of rcu_head kfree_rcu()'s debug_objects logic uses the address of the object's embedded rcu_head to queue/unqueue. Instead of this, make use of the object's address itself as preparation for future headless kfree_rcu() support. Reviewed-by: Uladzislau Rezki Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5151fe4e1429..143c1e9265b6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2970,13 +2970,11 @@ EXPORT_SYMBOL_GPL(call_rcu); * @nr_records: Number of active pointers in the array * @records: Array of the kfree_rcu() pointers * @next: Next bulk object in the block chain - * @head_free_debug: For debug, when CONFIG_DEBUG_OBJECTS_RCU_HEAD is set */ struct kfree_rcu_bulk_data { unsigned long nr_records; void *records[KFREE_BULK_MAX_ENTR]; struct kfree_rcu_bulk_data *next; - struct rcu_head *head_free_debug; }; /** @@ -3026,11 +3024,13 @@ struct kfree_rcu_cpu { static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); static __always_inline void -debug_rcu_head_unqueue_bulk(struct rcu_head *head) +debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead) { #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD - for (; head; head = head->next) - debug_rcu_head_unqueue(head); + int i; + + for (i = 0; i < bhead->nr_records; i++) + debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i])); #endif } @@ -3060,7 +3060,7 @@ static void kfree_rcu_work(struct work_struct *work) for (; bhead; bhead = bnext) { bnext = bhead->next; - debug_rcu_head_unqueue_bulk(bhead->head_free_debug); + debug_rcu_bhead_unqueue(bhead); rcu_lock_acquire(&rcu_callback_map); trace_rcu_invoke_kfree_bulk_callback(rcu_state.name, @@ -3082,14 +3082,15 @@ static void kfree_rcu_work(struct work_struct *work) */ for (; head; head = next) { unsigned long offset = (unsigned long)head->func; + void *ptr = (void *)head - offset; next = head->next; - debug_rcu_head_unqueue(head); + debug_rcu_head_unqueue((struct rcu_head *)ptr); rcu_lock_acquire(&rcu_callback_map); trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) - kfree((void *)head - offset); + kfree(ptr); rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); @@ -3228,18 +3229,11 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, /* Initialize the new block. */ bnode->nr_records = 0; bnode->next = krcp->bhead; - bnode->head_free_debug = NULL; /* Attach it to the head. */ krcp->bhead = bnode; } -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD - head->func = func; - head->next = krcp->bhead->head_free_debug; - krcp->bhead->head_free_debug = head; -#endif - /* Finally insert. */ krcp->bhead->records[krcp->bhead->nr_records++] = (void *) head - (unsigned long) func; @@ -3263,14 +3257,17 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { unsigned long flags; struct kfree_rcu_cpu *krcp; + void *ptr; local_irq_save(flags); // For safely calling this_cpu_ptr(). krcp = this_cpu_ptr(&krc); if (krcp->initialized) raw_spin_lock(&krcp->lock); + ptr = (void *)head - (unsigned long)func; + // Queue the object but don't yet schedule the batch. - if (debug_rcu_head_queue(head)) { + if (debug_rcu_head_queue(ptr)) { // Probable double kfree_rcu(), just leak. WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", __func__, head); -- cgit v1.2.3 From 3af84862817403d317dc33312e7a88d76e79401a Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:49 +0200 Subject: rcu/tree: Simplify KFREE_BULK_MAX_ENTR macro We can simplify KFREE_BULK_MAX_ENTR macro and get rid of magic numbers which were used to make the structure to be exactly one page. Suggested-by: Boqun Feng Reviewed-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 143c1e9265b6..bcdc06364426 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2958,13 +2958,6 @@ EXPORT_SYMBOL_GPL(call_rcu); #define KFREE_DRAIN_JIFFIES (HZ / 50) #define KFREE_N_BATCHES 2 -/* - * This macro defines how many entries the "records" array - * will contain. It is based on the fact that the size of - * kfree_rcu_bulk_data structure becomes exactly one page. - */ -#define KFREE_BULK_MAX_ENTR ((PAGE_SIZE / sizeof(void *)) - 3) - /** * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers * @nr_records: Number of active pointers in the array @@ -2973,10 +2966,18 @@ EXPORT_SYMBOL_GPL(call_rcu); */ struct kfree_rcu_bulk_data { unsigned long nr_records; - void *records[KFREE_BULK_MAX_ENTR]; struct kfree_rcu_bulk_data *next; + void *records[]; }; +/* + * This macro defines how many entries the "records" array + * will contain. It is based on the fact that the size of + * kfree_rcu_bulk_data structure becomes exactly one page. + */ +#define KFREE_BULK_MAX_ENTR \ + ((PAGE_SIZE - sizeof(struct kfree_rcu_bulk_data)) / sizeof(void *)) + /** * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period -- cgit v1.2.3 From 952371d6fc0bc360d1d5780f86bb355836117ca2 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:50 +0200 Subject: rcu/tree: Move kfree_rcu_cpu locking/unlocking to separate functions Introduce helpers to lock and unlock per-cpu "kfree_rcu_cpu" structures. That will make kfree_call_rcu() more readable and prevent programming errors. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bcdc06364426..368bdc441ffb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3035,6 +3035,27 @@ debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead) #endif } +static inline struct kfree_rcu_cpu * +krc_this_cpu_lock(unsigned long *flags) +{ + struct kfree_rcu_cpu *krcp; + + local_irq_save(*flags); // For safely calling this_cpu_ptr(). + krcp = this_cpu_ptr(&krc); + if (likely(krcp->initialized)) + raw_spin_lock(&krcp->lock); + + return krcp; +} + +static inline void +krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) +{ + if (likely(krcp->initialized)) + raw_spin_unlock(&krcp->lock); + local_irq_restore(flags); +} + /* * This function is invoked in workqueue context after a grace period. * It frees all the objects queued on ->bhead_free or ->head_free. @@ -3260,11 +3281,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) struct kfree_rcu_cpu *krcp; void *ptr; - local_irq_save(flags); // For safely calling this_cpu_ptr(). - krcp = this_cpu_ptr(&krc); - if (krcp->initialized) - raw_spin_lock(&krcp->lock); - + krcp = krc_this_cpu_lock(&flags); ptr = (void *)head - (unsigned long)func; // Queue the object but don't yet schedule the batch. @@ -3295,9 +3312,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) } unlock_return: - if (krcp->initialized) - raw_spin_unlock(&krcp->lock); - local_irq_restore(flags); + krc_this_cpu_unlock(krcp, flags); } EXPORT_SYMBOL_GPL(kfree_call_rcu); -- cgit v1.2.3 From 69f08d3999dbef1553a3332b8055282dd3893b6c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 25 May 2020 23:47:51 +0200 Subject: rcu/tree: Use static initializer for krc.lock The per-CPU variable is initialized at runtime in kfree_rcu_batch_init(). This function is invoked before 'rcu_scheduler_active' is set to 'RCU_SCHEDULER_RUNNING'. After the initialisation, '->initialized' is to true. The raw_spin_lock is only acquired if '->initialized' is set to true. The worqueue item is only used if 'rcu_scheduler_active' set to RCU_SCHEDULER_RUNNING which happens after initialisation. Use a static initializer for krc.lock and remove the runtime initialisation of the lock. Since the lock can now be always acquired, remove the '->initialized' check. Cc: Sebastian Andrzej Siewior Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 368bdc441ffb..a42a4693f161 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3002,7 +3002,7 @@ struct kfree_rcu_cpu_work { * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES * @monitor_todo: Tracks whether a @monitor_work delayed work is pending - * @initialized: The @lock and @rcu_work fields have been initialized + * @initialized: The @rcu_work fields have been initialized * @count: Number of objects for which GP not started * * This is a per-CPU structure. The reason that it is not included in @@ -3022,7 +3022,9 @@ struct kfree_rcu_cpu { int count; }; -static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); +static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { + .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock), +}; static __always_inline void debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead) @@ -3042,8 +3044,7 @@ krc_this_cpu_lock(unsigned long *flags) local_irq_save(*flags); // For safely calling this_cpu_ptr(). krcp = this_cpu_ptr(&krc); - if (likely(krcp->initialized)) - raw_spin_lock(&krcp->lock); + raw_spin_lock(&krcp->lock); return krcp; } @@ -3051,8 +3052,7 @@ krc_this_cpu_lock(unsigned long *flags) static inline void krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) { - if (likely(krcp->initialized)) - raw_spin_unlock(&krcp->lock); + raw_spin_unlock(&krcp->lock); local_irq_restore(flags); } @@ -4278,7 +4278,6 @@ static void __init kfree_rcu_batch_init(void) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - raw_spin_lock_init(&krcp->lock); for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); krcp->krw_arr[i].krcp = krcp; -- cgit v1.2.3 From 53c72b590b3a0afd6747d6f7957e6838003e90a4 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:52 +0200 Subject: rcu/tree: cache specified number of objects In order to reduce the dynamic need for pages in kfree_rcu(), pre-allocate a configurable number of pages per CPU and link them in a list. When kfree_rcu() reclaims objects, the object's container page is cached into a list instead of being released to the low-level page allocator. Such an approach provides O(1) access to free pages while also reducing the number of requests to the page allocator. It also makes the kfree_rcu() code to have free pages available during a low memory condition. A read-only sysfs parameter (rcu_min_cached_objs) reflects the minimum number of allowed cached pages per CPU. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 8 +++ kernel/rcu/tree.c | 66 +++++++++++++++++++++++-- 2 files changed, 70 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index fb95fad81c79..befaa63652ff 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4038,6 +4038,14 @@ latencies, which will choose a value aligned with the appropriate hardware boundaries. + rcutree.rcu_min_cached_objs= [KNL] + Minimum number of objects which are cached and + maintained per one CPU. Object size is equal + to PAGE_SIZE. The cache allows to reduce the + pressure to page allocator, also it makes the + whole algorithm to behave better in low memory + condition. + rcutree.jiffies_till_first_fqs= [KNL] Set delay from grace-period initialization to first attempt to force quiescent states. diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a42a4693f161..37c0cd0332f8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -175,6 +175,15 @@ module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +/* + * This rcu parameter is runtime-read-only. It reflects + * a minimum allowed number of objects which can be cached + * per-CPU. Object size is equal to one page. This value + * can be changed at boot time. + */ +static int rcu_min_cached_objs = 2; +module_param(rcu_min_cached_objs, int, 0444); + /* Retrieve RCU kthreads priority for rcutorture */ int rcu_get_gp_kthreads_prio(void) { @@ -2997,7 +3006,6 @@ struct kfree_rcu_cpu_work { * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period * @head: List of kfree_rcu() objects not yet waiting for a grace period * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period - * @bcached: Keeps at most one object for later reuse when build chain blocks * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES @@ -3013,13 +3021,22 @@ struct kfree_rcu_cpu_work { struct kfree_rcu_cpu { struct rcu_head *head; struct kfree_rcu_bulk_data *bhead; - struct kfree_rcu_bulk_data *bcached; struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; raw_spinlock_t lock; struct delayed_work monitor_work; bool monitor_todo; bool initialized; int count; + + /* + * A simple cache list that contains objects for + * reuse purpose. In order to save some per-cpu + * space the list is singular. Even though it is + * lockless an access has to be protected by the + * per-cpu lock. + */ + struct llist_head bkvcache; + int nr_bkv_objs; }; static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { @@ -3056,6 +3073,31 @@ krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) local_irq_restore(flags); } +static inline struct kfree_rcu_bulk_data * +get_cached_bnode(struct kfree_rcu_cpu *krcp) +{ + if (!krcp->nr_bkv_objs) + return NULL; + + krcp->nr_bkv_objs--; + return (struct kfree_rcu_bulk_data *) + llist_del_first(&krcp->bkvcache); +} + +static inline bool +put_cached_bnode(struct kfree_rcu_cpu *krcp, + struct kfree_rcu_bulk_data *bnode) +{ + // Check the limit. + if (krcp->nr_bkv_objs >= rcu_min_cached_objs) + return false; + + llist_add((struct llist_node *) bnode, &krcp->bkvcache); + krcp->nr_bkv_objs++; + return true; + +} + /* * This function is invoked in workqueue context after a grace period. * It frees all the objects queued on ->bhead_free or ->head_free. @@ -3091,7 +3133,12 @@ static void kfree_rcu_work(struct work_struct *work) kfree_bulk(bhead->nr_records, bhead->records); rcu_lock_release(&rcu_callback_map); - if (cmpxchg(&krcp->bcached, NULL, bhead)) + krcp = krc_this_cpu_lock(&flags); + if (put_cached_bnode(krcp, bhead)) + bhead = NULL; + krc_this_cpu_unlock(krcp, flags); + + if (bhead) free_page((unsigned long) bhead); cond_resched_tasks_rcu_qs(); @@ -3224,7 +3271,7 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, /* Check if a new block is required. */ if (!krcp->bhead || krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) { - bnode = xchg(&krcp->bcached, NULL); + bnode = get_cached_bnode(krcp); if (!bnode) { WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE); @@ -4277,12 +4324,23 @@ static void __init kfree_rcu_batch_init(void) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + struct kfree_rcu_bulk_data *bnode; for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); krcp->krw_arr[i].krcp = krcp; } + for (i = 0; i < rcu_min_cached_objs; i++) { + bnode = (struct kfree_rcu_bulk_data *) + __get_free_page(GFP_NOWAIT | __GFP_NOWARN); + + if (bnode) + put_cached_bnode(krcp, bnode); + else + pr_err("Failed to preallocate for %d CPU!\n", cpu); + } + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); krcp->initialized = true; } -- cgit v1.2.3 From 5f3c8d620447d509e534962e23f7edfb85f4e533 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:53 +0200 Subject: rcu/tree: Maintain separate array for vmalloc ptrs To do so, we use an array of kvfree_rcu_bulk_data structures. It consists of two elements: - index number 0 corresponds to slab pointers. - index number 1 corresponds to vmalloc pointers. Keeping vmalloc pointers separated from slab pointers makes it possible to invoke the right freeing API for the right kind of pointer. It also prepares us for future headless support for vmalloc and SLAB objects. Such objects cannot be queued on a linked list and are instead directly into an array. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Joel Fernandes (Google) Reviewed-by: Joel Fernandes (Google) Co-developed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 173 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 100 insertions(+), 73 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 37c0cd0332f8..67c4b984c499 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -57,6 +57,8 @@ #include #include #include +#include +#include #include "../time/tick-internal.h" #include "tree.h" @@ -2966,46 +2968,47 @@ EXPORT_SYMBOL_GPL(call_rcu); /* Maximum number of jiffies to wait before draining a batch. */ #define KFREE_DRAIN_JIFFIES (HZ / 50) #define KFREE_N_BATCHES 2 +#define FREE_N_CHANNELS 2 /** - * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers + * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers * @nr_records: Number of active pointers in the array - * @records: Array of the kfree_rcu() pointers * @next: Next bulk object in the block chain + * @records: Array of the kvfree_rcu() pointers */ -struct kfree_rcu_bulk_data { +struct kvfree_rcu_bulk_data { unsigned long nr_records; - struct kfree_rcu_bulk_data *next; + struct kvfree_rcu_bulk_data *next; void *records[]; }; /* * This macro defines how many entries the "records" array * will contain. It is based on the fact that the size of - * kfree_rcu_bulk_data structure becomes exactly one page. + * kvfree_rcu_bulk_data structure becomes exactly one page. */ -#define KFREE_BULK_MAX_ENTR \ - ((PAGE_SIZE - sizeof(struct kfree_rcu_bulk_data)) / sizeof(void *)) +#define KVFREE_BULK_MAX_ENTR \ + ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *)) /** * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period * @head_free: List of kfree_rcu() objects waiting for a grace period - * @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period + * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period * @krcp: Pointer to @kfree_rcu_cpu structure */ struct kfree_rcu_cpu_work { struct rcu_work rcu_work; struct rcu_head *head_free; - struct kfree_rcu_bulk_data *bhead_free; + struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS]; struct kfree_rcu_cpu *krcp; }; /** * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period * @head: List of kfree_rcu() objects not yet waiting for a grace period - * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period + * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES @@ -3020,7 +3023,7 @@ struct kfree_rcu_cpu_work { */ struct kfree_rcu_cpu { struct rcu_head *head; - struct kfree_rcu_bulk_data *bhead; + struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS]; struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; raw_spinlock_t lock; struct delayed_work monitor_work; @@ -3044,7 +3047,7 @@ static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { }; static __always_inline void -debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead) +debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead) { #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD int i; @@ -3073,20 +3076,20 @@ krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) local_irq_restore(flags); } -static inline struct kfree_rcu_bulk_data * +static inline struct kvfree_rcu_bulk_data * get_cached_bnode(struct kfree_rcu_cpu *krcp) { if (!krcp->nr_bkv_objs) return NULL; krcp->nr_bkv_objs--; - return (struct kfree_rcu_bulk_data *) + return (struct kvfree_rcu_bulk_data *) llist_del_first(&krcp->bkvcache); } static inline bool put_cached_bnode(struct kfree_rcu_cpu *krcp, - struct kfree_rcu_bulk_data *bnode) + struct kvfree_rcu_bulk_data *bnode) { // Check the limit. if (krcp->nr_bkv_objs >= rcu_min_cached_objs) @@ -3105,43 +3108,63 @@ put_cached_bnode(struct kfree_rcu_cpu *krcp, static void kfree_rcu_work(struct work_struct *work) { unsigned long flags; + struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext; struct rcu_head *head, *next; - struct kfree_rcu_bulk_data *bhead, *bnext; struct kfree_rcu_cpu *krcp; struct kfree_rcu_cpu_work *krwp; + int i, j; krwp = container_of(to_rcu_work(work), struct kfree_rcu_cpu_work, rcu_work); krcp = krwp->krcp; + raw_spin_lock_irqsave(&krcp->lock, flags); + // Channels 1 and 2. + for (i = 0; i < FREE_N_CHANNELS; i++) { + bkvhead[i] = krwp->bkvhead_free[i]; + krwp->bkvhead_free[i] = NULL; + } + + // Channel 3. head = krwp->head_free; krwp->head_free = NULL; - bhead = krwp->bhead_free; - krwp->bhead_free = NULL; raw_spin_unlock_irqrestore(&krcp->lock, flags); - /* "bhead" is now private, so traverse locklessly. */ - for (; bhead; bhead = bnext) { - bnext = bhead->next; - - debug_rcu_bhead_unqueue(bhead); - - rcu_lock_acquire(&rcu_callback_map); - trace_rcu_invoke_kfree_bulk_callback(rcu_state.name, - bhead->nr_records, bhead->records); - - kfree_bulk(bhead->nr_records, bhead->records); - rcu_lock_release(&rcu_callback_map); + // Handle two first channels. + for (i = 0; i < FREE_N_CHANNELS; i++) { + for (; bkvhead[i]; bkvhead[i] = bnext) { + bnext = bkvhead[i]->next; + debug_rcu_bhead_unqueue(bkvhead[i]); + + rcu_lock_acquire(&rcu_callback_map); + if (i == 0) { // kmalloc() / kfree(). + trace_rcu_invoke_kfree_bulk_callback( + rcu_state.name, bkvhead[i]->nr_records, + bkvhead[i]->records); + + kfree_bulk(bkvhead[i]->nr_records, + bkvhead[i]->records); + } else { // vmalloc() / vfree(). + for (j = 0; j < bkvhead[i]->nr_records; j++) { + trace_rcu_invoke_kfree_callback( + rcu_state.name, + bkvhead[i]->records[j], 0); + + vfree(bkvhead[i]->records[j]); + } + } + rcu_lock_release(&rcu_callback_map); - krcp = krc_this_cpu_lock(&flags); - if (put_cached_bnode(krcp, bhead)) - bhead = NULL; - krc_this_cpu_unlock(krcp, flags); + krcp = krc_this_cpu_lock(&flags); + if (put_cached_bnode(krcp, bkvhead[i])) + bkvhead[i] = NULL; + krc_this_cpu_unlock(krcp, flags); - if (bhead) - free_page((unsigned long) bhead); + if (bkvhead[i]) + free_page((unsigned long) bkvhead[i]); - cond_resched_tasks_rcu_qs(); + cond_resched_tasks_rcu_qs(); + } } /* @@ -3159,7 +3182,7 @@ static void kfree_rcu_work(struct work_struct *work) trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) - kfree(ptr); + kvfree(ptr); rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); @@ -3176,7 +3199,7 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) { struct kfree_rcu_cpu_work *krwp; bool repeat = false; - int i; + int i, j; lockdep_assert_held(&krcp->lock); @@ -3184,21 +3207,25 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) krwp = &(krcp->krw_arr[i]); /* - * Try to detach bhead or head and attach it over any + * Try to detach bkvhead or head and attach it over any * available corresponding free channel. It can be that * a previous RCU batch is in progress, it means that * immediately to queue another one is not possible so * return false to tell caller to retry. */ - if ((krcp->bhead && !krwp->bhead_free) || + if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) || + (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) || (krcp->head && !krwp->head_free)) { - /* Channel 1. */ - if (!krwp->bhead_free) { - krwp->bhead_free = krcp->bhead; - krcp->bhead = NULL; + // Channel 1 corresponds to SLAB ptrs. + // Channel 2 corresponds to vmalloc ptrs. + for (j = 0; j < FREE_N_CHANNELS; j++) { + if (!krwp->bkvhead_free[j]) { + krwp->bkvhead_free[j] = krcp->bkvhead[j]; + krcp->bkvhead[j] = NULL; + } } - /* Channel 2. */ + // Channel 3 corresponds to emergency path. if (!krwp->head_free) { krwp->head_free = krcp->head; krcp->head = NULL; @@ -3207,16 +3234,17 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) WRITE_ONCE(krcp->count, 0); /* - * One work is per one batch, so there are two "free channels", - * "bhead_free" and "head_free" the batch can handle. It can be - * that the work is in the pending state when two channels have - * been detached following each other, one by one. + * One work is per one batch, so there are three + * "free channels", the batch can handle. It can + * be that the work is in the pending state when + * channels have been detached following by each + * other. */ queue_rcu_work(system_wq, &krwp->rcu_work); } - /* Repeat if any "free" corresponding channel is still busy. */ - if (krcp->bhead || krcp->head) + // Repeat if any "free" corresponding channel is still busy. + if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head) repeat = true; } @@ -3258,23 +3286,22 @@ static void kfree_rcu_monitor(struct work_struct *work) } static inline bool -kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, - struct rcu_head *head, rcu_callback_t func) +kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) { - struct kfree_rcu_bulk_data *bnode; + struct kvfree_rcu_bulk_data *bnode; + int idx; if (unlikely(!krcp->initialized)) return false; lockdep_assert_held(&krcp->lock); + idx = !!is_vmalloc_addr(ptr); /* Check if a new block is required. */ - if (!krcp->bhead || - krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) { + if (!krcp->bkvhead[idx] || + krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) { bnode = get_cached_bnode(krcp); if (!bnode) { - WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE); - /* * To keep this path working on raw non-preemptible * sections, prevent the optional entry into the @@ -3287,7 +3314,7 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, if (IS_ENABLED(CONFIG_PREEMPT_RT)) return false; - bnode = (struct kfree_rcu_bulk_data *) + bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_NOWAIT | __GFP_NOWARN); } @@ -3297,30 +3324,30 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, /* Initialize the new block. */ bnode->nr_records = 0; - bnode->next = krcp->bhead; + bnode->next = krcp->bkvhead[idx]; /* Attach it to the head. */ - krcp->bhead = bnode; + krcp->bkvhead[idx] = bnode; } /* Finally insert. */ - krcp->bhead->records[krcp->bhead->nr_records++] = - (void *) head - (unsigned long) func; + krcp->bkvhead[idx]->records + [krcp->bkvhead[idx]->nr_records++] = ptr; return true; } /* - * Queue a request for lazy invocation of kfree_bulk()/kfree() after a grace - * period. Please note there are two paths are maintained, one is the main one - * that uses kfree_bulk() interface and second one is emergency one, that is - * used only when the main path can not be maintained temporary, due to memory - * pressure. + * Queue a request for lazy invocation of appropriate free routine after a + * grace period. Please note there are three paths are maintained, two are the + * main ones that use array of pointers interface and third one is emergency + * one, that is used only when the main path can not be maintained temporary, + * due to memory pressure. * * Each kfree_call_rcu() request is added to a batch. The batch will be drained * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will * be free'd in workqueue context. This allows us to: batch requests together to - * reduce the number of grace periods during heavy kfree_rcu() load. + * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load. */ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { @@ -3343,7 +3370,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) * Under high memory pressure GFP_NOWAIT can fail, * in that case the emergency path is maintained. */ - if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func))) { + if (unlikely(!kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr))) { head->func = func; head->next = krcp->head; krcp->head = head; @@ -4324,7 +4351,7 @@ static void __init kfree_rcu_batch_init(void) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - struct kfree_rcu_bulk_data *bnode; + struct kvfree_rcu_bulk_data *bnode; for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); @@ -4332,7 +4359,7 @@ static void __init kfree_rcu_batch_init(void) } for (i = 0; i < rcu_min_cached_objs; i++) { - bnode = (struct kfree_rcu_bulk_data *) + bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_NOWAIT | __GFP_NOWARN); if (bnode) -- cgit v1.2.3 From 64d1d06ccb1b7de245ccf781b91517f328bebd9f Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:54 +0200 Subject: rcu/tiny: support vmalloc in tiny-RCU Replace kfree() with kvfree() in rcu_reclaim_tiny(). This makes it possible to release either SLAB or vmalloc objects after a GP. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tiny.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index dd572ce7c747..4b99f7b88bee 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "rcu.h" @@ -86,7 +87,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head) rcu_lock_acquire(&rcu_callback_map); if (__is_kfree_rcu_offset(offset)) { trace_rcu_invoke_kfree_callback("", head, offset); - kfree((void *)head - offset); + kvfree((void *)head - offset); rcu_lock_release(&rcu_callback_map); return true; } -- cgit v1.2.3 From c408b215f58f7156bb6bafb64c0263ee907033df Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:55 +0200 Subject: rcu: Rename *_kfree_callback/*_kfree_rcu_offset/kfree_call_* The following changes are introduced: 1. Rename rcu_invoke_kfree_callback() to rcu_invoke_kvfree_callback(), as well as the associated trace events, so the rcu_kfree_callback(), becomes rcu_kvfree_callback(). The reason is to be aligned with kvfree() notation. 2. Rename __is_kfree_rcu_offset to __is_kvfree_rcu_offset. All RCU paths use kvfree() now instead of kfree(), thus rename it. 3. Rename kfree_call_rcu() to the kvfree_call_rcu(). The reason is, it is capable of freeing vmalloc() memory now. Do the same with __kfree_rcu() macro, it becomes __kvfree_rcu(), the goal is the same. Reviewed-by: Joel Fernandes (Google) Co-developed-by: Joel Fernandes (Google) Signed-off-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 14 +++++++------- include/linux/rcutiny.h | 2 +- include/linux/rcutree.h | 2 +- include/trace/events/rcu.h | 8 ++++---- kernel/rcu/tiny.c | 4 ++-- kernel/rcu/tree.c | 16 ++++++++-------- 6 files changed, 23 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 659cbfa7581a..b344fc800a9b 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -828,17 +828,17 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) /* * Does the specified offset indicate that the corresponding rcu_head - * structure can be handled by kfree_rcu()? + * structure can be handled by kvfree_rcu()? */ -#define __is_kfree_rcu_offset(offset) ((offset) < 4096) +#define __is_kvfree_rcu_offset(offset) ((offset) < 4096) /* * Helper macro for kfree_rcu() to prevent argument-expansion eyestrain. */ -#define __kfree_rcu(head, offset) \ +#define __kvfree_rcu(head, offset) \ do { \ - BUILD_BUG_ON(!__is_kfree_rcu_offset(offset)); \ - kfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \ + BUILD_BUG_ON(!__is_kvfree_rcu_offset(offset)); \ + kvfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \ } while (0) /** @@ -857,7 +857,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * Because the functions are not allowed in the low-order 4096 bytes of * kernel virtual memory, offsets up to 4095 bytes can be accommodated. * If the offset is larger than 4095 bytes, a compile-time error will - * be generated in __kfree_rcu(). If this error is triggered, you can + * be generated in __kvfree_rcu(). If this error is triggered, you can * either fall back to use of call_rcu() or rearrange the structure to * position the rcu_head structure into the first 4096 bytes. * @@ -872,7 +872,7 @@ do { \ typeof (ptr) ___p = (ptr); \ \ if (___p) \ - __kfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \ + __kvfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \ } while (0) /* diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 8512caeb7682..fb2eb39c484f 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -34,7 +34,7 @@ static inline void synchronize_rcu_expedited(void) synchronize_rcu(); } -static inline void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { call_rcu(head, func); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index d5cc9d675987..d2f4064ebd1d 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -33,7 +33,7 @@ static inline void rcu_virt_note_context_switch(int cpu) } void synchronize_rcu_expedited(void); -void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func); +void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier(void); bool rcu_eqs_special_set(int cpu); diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index f9a7811148e2..0ee93d0b1daa 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -506,13 +506,13 @@ TRACE_EVENT_RCU(rcu_callback, /* * Tracepoint for the registration of a single RCU callback of the special - * kfree() form. The first argument is the RCU type, the second argument + * kvfree() form. The first argument is the RCU type, the second argument * is a pointer to the RCU callback, the third argument is the offset * of the callback within the enclosing RCU-protected data structure, * the fourth argument is the number of lazy callbacks queued, and the * fifth argument is the total number of callbacks queued. */ -TRACE_EVENT_RCU(rcu_kfree_callback, +TRACE_EVENT_RCU(rcu_kvfree_callback, TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset, long qlen), @@ -596,12 +596,12 @@ TRACE_EVENT_RCU(rcu_invoke_callback, /* * Tracepoint for the invocation of a single RCU callback of the special - * kfree() form. The first argument is the RCU flavor, the second + * kvfree() form. The first argument is the RCU flavor, the second * argument is a pointer to the RCU callback, and the third argument * is the offset of the callback within the enclosing RCU-protected * data structure. */ -TRACE_EVENT_RCU(rcu_invoke_kfree_callback, +TRACE_EVENT_RCU(rcu_invoke_kvfree_callback, TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset), diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 4b99f7b88bee..aa897c3f2e92 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -85,8 +85,8 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head) unsigned long offset = (unsigned long)head->func; rcu_lock_acquire(&rcu_callback_map); - if (__is_kfree_rcu_offset(offset)) { - trace_rcu_invoke_kfree_callback("", head, offset); + if (__is_kvfree_rcu_offset(offset)) { + trace_rcu_invoke_kvfree_callback("", head, offset); kvfree((void *)head - offset); rcu_lock_release(&rcu_callback_map); return true; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 67c4b984c499..f22c47e72287 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2905,8 +2905,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) return; // Enqueued onto ->nocb_bypass, so just leave. // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock. rcu_segcblist_enqueue(&rdp->cblist, head); - if (__is_kfree_rcu_offset((unsigned long)func)) - trace_rcu_kfree_callback(rcu_state.name, head, + if (__is_kvfree_rcu_offset((unsigned long)func)) + trace_rcu_kvfree_callback(rcu_state.name, head, (unsigned long)func, rcu_segcblist_n_cbs(&rdp->cblist)); else @@ -3146,7 +3146,7 @@ static void kfree_rcu_work(struct work_struct *work) bkvhead[i]->records); } else { // vmalloc() / vfree(). for (j = 0; j < bkvhead[i]->nr_records; j++) { - trace_rcu_invoke_kfree_callback( + trace_rcu_invoke_kvfree_callback( rcu_state.name, bkvhead[i]->records[j], 0); @@ -3179,9 +3179,9 @@ static void kfree_rcu_work(struct work_struct *work) next = head->next; debug_rcu_head_unqueue((struct rcu_head *)ptr); rcu_lock_acquire(&rcu_callback_map); - trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); + trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset); - if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) + if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) kvfree(ptr); rcu_lock_release(&rcu_callback_map); @@ -3344,12 +3344,12 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) * one, that is used only when the main path can not be maintained temporary, * due to memory pressure. * - * Each kfree_call_rcu() request is added to a batch. The batch will be drained + * Each kvfree_call_rcu() request is added to a batch. The batch will be drained * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will * be free'd in workqueue context. This allows us to: batch requests together to * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load. */ -void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { unsigned long flags; struct kfree_rcu_cpu *krcp; @@ -3388,7 +3388,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) unlock_return: krc_this_cpu_unlock(krcp, flags); } -EXPORT_SYMBOL_GPL(kfree_call_rcu); +EXPORT_SYMBOL_GPL(kvfree_call_rcu); static unsigned long kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) -- cgit v1.2.3 From 3042f83f19bec2e0cd356f72b39e4d816e8cd5ff Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 25 May 2020 23:47:58 +0200 Subject: rcu: Support reclaim for head-less object Update the kvfree_call_rcu() function with head-less support. This allows RCU to reclaim objects without an embedded rcu_head. tree-RCU: We introduce two chains of arrays to store SLAB-backed and vmalloc pointers, each. Storage in either of these arrays does not require embedding an rcu_head within the object. Maintaining the arrays may become impossible due to high memory pressure. For such cases there is an emergency path. Objects with rcu_head inside are just queued on a backup rcu_head list. Later on that list is drained. As for the head-less variant, as the current context can sleep, the following emergency measures are applied: a) Synchronously wait until a grace period has elapsed. b) Call kvfree(). tiny-RCU: For double argument calls, there are no new changes in behavior. For single argument call, kvfree() is directly inlined on the current stack after a synchronize_rcu() call. Note that for tiny-RCU, any call to synchronize_rcu() is actually a quiescent state, therefore it does nothing. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Joel Fernandes (Google) Co-developed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 18 +++++++++++++++++- kernel/rcu/tree.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 60 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index fb2eb39c484f..5cc9637cac16 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -34,9 +34,25 @@ static inline void synchronize_rcu_expedited(void) synchronize_rcu(); } +/* + * Add one more declaration of kvfree() here. It is + * not so straight forward to just include + * where it is defined due to getting many compile + * errors caused by that include. + */ +extern void kvfree(const void *addr); + static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { - call_rcu(head, func); + if (head) { + call_rcu(head, func); + return; + } + + // kvfree_rcu(one_arg) call. + might_sleep(); + synchronize_rcu(); + kvfree((void *) func); } void rcu_qs(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f22c47e72287..01f29e4500ba 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3314,6 +3314,13 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) if (IS_ENABLED(CONFIG_PREEMPT_RT)) return false; + /* + * NOTE: For one argument of kvfree_rcu() we can + * drop the lock and get the page in sleepable + * context. That would allow to maintain an array + * for the CONFIG_PREEMPT_RT as well if no cached + * pages are available. + */ bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_NOWAIT | __GFP_NOWARN); } @@ -3353,16 +3360,33 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { unsigned long flags; struct kfree_rcu_cpu *krcp; + bool success; void *ptr; + if (head) { + ptr = (void *) head - (unsigned long) func; + } else { + /* + * Please note there is a limitation for the head-less + * variant, that is why there is a clear rule for such + * objects: it can be used from might_sleep() context + * only. For other places please embed an rcu_head to + * your data. + */ + might_sleep(); + ptr = (unsigned long *) func; + } + krcp = krc_this_cpu_lock(&flags); - ptr = (void *)head - (unsigned long)func; // Queue the object but don't yet schedule the batch. if (debug_rcu_head_queue(ptr)) { // Probable double kfree_rcu(), just leak. WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", __func__, head); + + // Mark as success and leave. + success = true; goto unlock_return; } @@ -3370,10 +3394,16 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) * Under high memory pressure GFP_NOWAIT can fail, * in that case the emergency path is maintained. */ - if (unlikely(!kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr))) { + success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr); + if (!success) { + if (head == NULL) + // Inline if kvfree_rcu(one_arg) call. + goto unlock_return; + head->func = func; head->next = krcp->head; krcp->head = head; + success = true; } WRITE_ONCE(krcp->count, krcp->count + 1); @@ -3387,6 +3417,17 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) unlock_return: krc_this_cpu_unlock(krcp, flags); + + /* + * Inline kvfree() after synchronize_rcu(). We can do + * it from might_sleep() context only, so the current + * CPU can pass the QS state. + */ + if (!success) { + debug_rcu_head_unqueue((struct rcu_head *) ptr); + synchronize_rcu(); + kvfree(ptr); + } } EXPORT_SYMBOL_GPL(kvfree_call_rcu); -- cgit v1.2.3 From ea6eed9f7d7382c7230202d4c3bf74185f193394 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 7 May 2020 16:47:13 -0700 Subject: rcu-tasks: Convert sleeps to idle priority This commit converts the long-standing schedule_timeout_interruptible() and schedule_timeout_uninterruptible() calls used by the various Tasks RCU's grace-period kthreads to schedule_timeout_idle(). This conversion avoids polluting the load-average with Tasks-RCU-related sleeping. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index ce23f6cc5043..91fee8122acd 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -205,7 +205,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) if (!rtp->cbs_head) { WARN_ON(signal_pending(current)); set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS); - schedule_timeout_interruptible(HZ/10); + schedule_timeout_idle(HZ/10); } continue; } @@ -227,7 +227,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) cond_resched(); } /* Paranoid sleep to keep this from entering a tight loop */ - schedule_timeout_uninterruptible(HZ/10); + schedule_timeout_idle(HZ/10); set_tasks_gp_state(rtp, RTGS_WAIT_CBS); } @@ -336,7 +336,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) /* Slowly back off waiting for holdouts */ set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); - schedule_timeout_interruptible(HZ/fract); + schedule_timeout_idle(HZ/fract); if (fract > 1) fract--; -- cgit v1.2.3 From 04a3c5aa7a8cb2ce97f9beb627ba742bc8b0fe03 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 May 2020 19:27:06 -0700 Subject: rcu-tasks: Make rcu_tasks_postscan() be static The rcu_tasks_postscan() function is not used outside of RCU's tasks.h file, so this commit makes it be static. Reported-by: kbuild test robot Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 91fee8122acd..da200e53d60d 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -402,7 +402,7 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop) } /* Processing between scanning taskslist and draining the holdout list. */ -void rcu_tasks_postscan(struct list_head *hop) +static void rcu_tasks_postscan(struct list_head *hop) { /* * Wait for tasks that are in the process of exiting. This -- cgit v1.2.3 From 5b3cc99bedf5885055fbaf35fe63d205f06b5be5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 May 2020 19:33:47 -0700 Subject: rcu-tasks: Add #include of rcupdate_trace.h to update.c Although this is in some strict sense unnecessary, it is good to allow the compiler to compare the function declaration with its definition. This commit therefore adds a #include of linux/rcupdate_trace.h to kernel/rcu/update.c. Reported-by: kbuild test robot Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 84843adfd939..c0fea809d738 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -42,6 +42,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS -- cgit v1.2.3 From 8344496e8b49c4122c1808d6cd3f8dc71bccb595 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 May 2020 20:03:48 -0700 Subject: rcu-tasks: Conditionally compile show_rcu_tasks_gp_kthreads() The show_rcu_tasks_gp_kthreads() function is not invoked by Tiny RCU, but is nevertheless defined in Tiny RCU builds that enable Tasks Trace RCU. This commit therefore conditionally compiles this function so that it is defined only in builds that actually use it. Reported-by: kbuild test robot Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index da200e53d60d..d5c003c1972c 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -103,6 +103,7 @@ module_param(rcu_task_stall_timeout, int, 0644); #define RTGS_WAIT_READERS 9 #define RTGS_INVOKE_CBS 10 #define RTGS_WAIT_CBS 11 +#ifndef CONFIG_TINY_RCU static const char * const rcu_tasks_gp_state_names[] = { "RTGS_INIT", "RTGS_WAIT_WAIT_CBS", @@ -117,6 +118,7 @@ static const char * const rcu_tasks_gp_state_names[] = { "RTGS_INVOKE_CBS", "RTGS_WAIT_CBS", }; +#endif /* #ifndef CONFIG_TINY_RCU */ //////////////////////////////////////////////////////////////////////// // @@ -129,6 +131,7 @@ static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate) rtp->gp_jiffies = jiffies; } +#ifndef CONFIG_TINY_RCU /* Return state name. */ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) { @@ -139,6 +142,7 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) return "???"; return rcu_tasks_gp_state_names[j]; } +#endif /* #ifndef CONFIG_TINY_RCU */ // Enqueue a callback for the specified flavor of Tasks RCU. static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, @@ -268,6 +272,7 @@ static void __init rcu_tasks_bootup_oddness(void) #endif /* #ifndef CONFIG_TINY_RCU */ +#ifndef CONFIG_TINY_RCU /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { @@ -281,6 +286,7 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) ".C"[!!data_race(rtp->cbs_head)], s); } +#endif /* #ifndef CONFIG_TINY_RCU */ static void exit_tasks_rcu_finish_trace(struct task_struct *t); @@ -557,10 +563,12 @@ static int __init rcu_spawn_tasks_kthread(void) } core_initcall(rcu_spawn_tasks_kthread); +#ifndef CONFIG_TINY_RCU static void show_rcu_tasks_classic_gp_kthread(void) { show_rcu_tasks_generic_gp_kthread(&rcu_tasks, ""); } +#endif /* #ifndef CONFIG_TINY_RCU */ /* Do the srcu_read_lock() for the above synchronize_srcu(). */ void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) @@ -682,10 +690,12 @@ static int __init rcu_spawn_tasks_rude_kthread(void) } core_initcall(rcu_spawn_tasks_rude_kthread); +#ifndef CONFIG_TINY_RCU static void show_rcu_tasks_rude_gp_kthread(void) { show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, ""); } +#endif /* #ifndef CONFIG_TINY_RCU */ #else /* #ifdef CONFIG_TASKS_RUDE_RCU */ static void show_rcu_tasks_rude_gp_kthread(void) {} @@ -1164,6 +1174,7 @@ static int __init rcu_spawn_tasks_trace_kthread(void) } core_initcall(rcu_spawn_tasks_trace_kthread); +#ifndef CONFIG_TINY_RCU static void show_rcu_tasks_trace_gp_kthread(void) { char buf[64]; @@ -1174,18 +1185,21 @@ static void show_rcu_tasks_trace_gp_kthread(void) data_race(n_heavy_reader_attempts)); show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); } +#endif /* #ifndef CONFIG_TINY_RCU */ #else /* #ifdef CONFIG_TASKS_TRACE_RCU */ static void exit_tasks_rcu_finish_trace(struct task_struct *t) { } static inline void show_rcu_tasks_trace_gp_kthread(void) {} #endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ +#ifndef CONFIG_TINY_RCU void show_rcu_tasks_gp_kthreads(void) { show_rcu_tasks_classic_gp_kthread(); show_rcu_tasks_rude_gp_kthread(); show_rcu_tasks_trace_gp_kthread(); } +#endif /* #ifndef CONFIG_TINY_RCU */ #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void rcu_tasks_bootup_oddness(void) {} -- cgit v1.2.3 From 30d8aa5128f12c9d781b67c9694c1abfa4f6ce6a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 9 Jun 2020 09:24:51 -0700 Subject: rcu-tasks: Fix code-style issues This commit declares trc_n_readers_need_end and trc_wait static and replaced a "&" with "&&". The "&" happened to work because the values are bool, but accidents waiting to happen and all that... Reported-by: kbuild test robot Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d5c003c1972c..828f222895f1 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -737,8 +737,8 @@ EXPORT_SYMBOL_GPL(rcu_trace_lock_map); #ifdef CONFIG_TASKS_TRACE_RCU -atomic_t trc_n_readers_need_end; // Number of waited-for readers. -DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks. +static atomic_t trc_n_readers_need_end; // Number of waited-for readers. +static DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks. // Record outstanding IPIs to each CPU. No point in sending two... static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); @@ -845,7 +845,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) bool ofl = cpu_is_offline(cpu); if (task_curr(t)) { - WARN_ON_ONCE(ofl & !is_idle_task(t)); + WARN_ON_ONCE(ofl && !is_idle_task(t)); // If no chance of heavyweight readers, do it the hard way. if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) -- cgit v1.2.3 From 7e866460cc18797b3a59360f5f8c444598a21729 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 25 May 2020 00:36:47 -0400 Subject: rcuperf: Remove useless while loops around wait_event wait_event() already retries if the condition for the wake up is not satisifed after wake up. Remove them from the rcuperf test. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 16dd1e6b7c09..246da8fe199e 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -576,11 +576,8 @@ static int compute_real(int n) static int rcu_perf_shutdown(void *arg) { - do { - wait_event(shutdown_wq, - atomic_read(&n_rcu_perf_writer_finished) >= - nrealwriters); - } while (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters); + wait_event(shutdown_wq, + atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters); smp_mb(); /* Wake before output. */ rcu_perf_cleanup(); kernel_power_off(); @@ -693,11 +690,8 @@ kfree_perf_cleanup(void) static int kfree_perf_shutdown(void *arg) { - do { - wait_event(shutdown_wq, - atomic_read(&n_kfree_perf_thread_ended) >= - kfree_nrealthreads); - } while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads); + wait_event(shutdown_wq, + atomic_read(&n_kfree_perf_thread_ended) >= kfree_nrealthreads); smp_mb(); /* Wake before output. */ -- cgit v1.2.3 From 653ed64b01dc5989f8f579d0038e987476c2c023 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 25 May 2020 00:36:48 -0400 Subject: refperf: Add a test to measure performance of read-side synchronization Add a test for comparing the performance of RCU with various read-side synchronization mechanisms. The test has proved useful for collecting data and performing these comparisons. Currently RCU, SRCU, reader-writer lock, reader-writer semaphore and reference counting can be measured using refperf.perf_type parameter. Each invocation of the test runs measures performance of a specific mechanism. The maximum number of CPUs to concurrently run readers on is chosen by the test itself and is 75% of the total number of CPUs. So if you had 24 CPUs, the test runs with a maximum of 18 parallel readers. A number of experiments are conducted, and in each experiment, the number of readers is increased by 1, upto the 75% of CPUs mark. During each experiment, all readers execute an empty loop with refperf.loops iterations and time the total loop duration. This is then averaged. Example output: Parameters "refperf.perf_type=srcu refperf.loops=2000000" looks like: [ 3.347133] srcu-ref-perf: [ 3.347133] Threads Time(ns) [ 3.347133] 1 36 [ 3.347133] 2 34 [ 3.347133] 3 34 [ 3.347133] 4 34 [ 3.347133] 5 33 [ 3.347133] 6 33 [ 3.347133] 7 33 [ 3.347133] 8 33 [ 3.347133] 9 33 [ 3.347133] 10 33 [ 3.347133] 11 33 [ 3.347133] 12 33 [ 3.347133] 13 33 [ 3.347133] 14 33 [ 3.347133] 15 32 [ 3.347133] 16 33 [ 3.347133] 17 33 [ 3.347133] 18 34 Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 19 ++ kernel/rcu/Makefile | 1 + kernel/rcu/refperf.c | 558 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 578 insertions(+) create mode 100644 kernel/rcu/refperf.c (limited to 'kernel') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 452feae8de20..858765b7f644 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -61,6 +61,25 @@ config RCU_TORTURE_TEST Say M if you want the RCU torture tests to build as a module. Say N if you are unsure. +config RCU_REF_PERF_TEST + tristate "Performance tests for read-side synchronization (RCU and others)" + depends on DEBUG_KERNEL + select TORTURE_TEST + select SRCU + select TASKS_RCU + select TASKS_RUDE_RCU + select TASKS_TRACE_RCU + default n + help + This option provides a kernel module that runs performance tests + useful comparing RCU with various read-side synchronization mechanisms. + The kernel module may be built after the fact on the running kernel to be + tested, if desired. + + Say Y here if you want these performance tests built into the kernel. + Say M if you want to build it as a module instead. + Say N if you are unsure. + config RCU_CPU_STALL_TIMEOUT int "RCU CPU stall timeout in seconds" depends on RCU_STALL_COMMON diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index f91f2c2cf138..ba7d82609cbe 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o +obj-$(CONFIG_RCU_REF_PERF_TEST) += refperf.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_TINY_RCU) += tiny.o obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c new file mode 100644 index 000000000000..61161530acc8 --- /dev/null +++ b/kernel/rcu/refperf.c @@ -0,0 +1,558 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Performance test comparing RCU vs other mechanisms +// for acquiring references on objects. +// +// Copyright (C) Google, 2020. +// +// Author: Joel Fernandes + +#define pr_fmt(fmt) fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rcu.h" + +#define PERF_FLAG "-ref-perf: " + +#define PERFOUT(s, x...) \ + pr_alert("%s" PERF_FLAG s, perf_type, ## x) + +#define VERBOSE_PERFOUT(s, x...) \ + do { if (verbose) pr_alert("%s" PERF_FLAG s, perf_type, ## x); } while (0) + +#define VERBOSE_PERFOUT_ERRSTRING(s, x...) \ + do { if (verbose) pr_alert("%s" PERF_FLAG "!!! " s, perf_type, ## x); } while (0) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Joel Fernandes (Google) "); + +static char *perf_type = "rcu"; +module_param(perf_type, charp, 0444); +MODULE_PARM_DESC(perf_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); + +torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); + +// Number of loops per experiment, all readers execute an operation concurrently +torture_param(long, loops, 10000000, "Number of loops per experiment."); + +#ifdef MODULE +# define REFPERF_SHUTDOWN 0 +#else +# define REFPERF_SHUTDOWN 1 +#endif + +torture_param(bool, shutdown, REFPERF_SHUTDOWN, + "Shutdown at end of performance tests."); + +struct reader_task { + struct task_struct *task; + atomic_t start; + wait_queue_head_t wq; + u64 last_duration_ns; + + // The average latency When 1.. are concurrently + // running an experiment. For example, if this reader_task is + // of index 5 in the reader_tasks array, then result is for + // 6 cores. + u64 result_avg; +}; + +static struct task_struct *shutdown_task; +static wait_queue_head_t shutdown_wq; + +static struct task_struct *main_task; +static wait_queue_head_t main_wq; +static int shutdown_start; + +static struct reader_task *reader_tasks; +static int nreaders; + +// Number of readers that are part of the current experiment. +static atomic_t nreaders_exp; + +// Use to wait for all threads to start. +static atomic_t n_init; + +// Track which experiment is currently running. +static int exp_idx; + +// Operations vector for selecting different types of tests. +struct ref_perf_ops { + void (*init)(void); + void (*cleanup)(void); + int (*readlock)(void); + void (*readunlock)(int idx); + const char *name; +}; + +static struct ref_perf_ops *cur_ops; + +// Definitions for RCU ref perf testing. +static int ref_rcu_read_lock(void) __acquires(RCU) +{ + rcu_read_lock(); + return 0; +} + +static void ref_rcu_read_unlock(int idx) __releases(RCU) +{ + rcu_read_unlock(); +} + +static void rcu_sync_perf_init(void) +{ +} + +static struct ref_perf_ops rcu_ops = { + .init = rcu_sync_perf_init, + .readlock = ref_rcu_read_lock, + .readunlock = ref_rcu_read_unlock, + .name = "rcu" +}; + + +// Definitions for SRCU ref perf testing. +DEFINE_STATIC_SRCU(srcu_refctl_perf); +static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf; + +static int srcu_ref_perf_read_lock(void) __acquires(srcu_ctlp) +{ + return srcu_read_lock(srcu_ctlp); +} + +static void srcu_ref_perf_read_unlock(int idx) __releases(srcu_ctlp) +{ + srcu_read_unlock(srcu_ctlp, idx); +} + +static struct ref_perf_ops srcu_ops = { + .init = rcu_sync_perf_init, + .readlock = srcu_ref_perf_read_lock, + .readunlock = srcu_ref_perf_read_unlock, + .name = "srcu" +}; + +// Definitions for reference count +static atomic_t refcnt; + +static int srcu_ref_perf_refcnt_lock(void) +{ + atomic_inc(&refcnt); + return 0; +} + +static void srcu_ref_perf_refcnt_unlock(int idx) __releases(srcu_ctlp) +{ + atomic_dec(&refcnt); + srcu_read_unlock(srcu_ctlp, idx); +} + +static struct ref_perf_ops refcnt_ops = { + .init = rcu_sync_perf_init, + .readlock = srcu_ref_perf_refcnt_lock, + .readunlock = srcu_ref_perf_refcnt_unlock, + .name = "refcnt" +}; + +// Definitions for rwlock +static rwlock_t test_rwlock; + +static void ref_perf_rwlock_init(void) +{ + rwlock_init(&test_rwlock); +} + +static int ref_perf_rwlock_lock(void) +{ + read_lock(&test_rwlock); + return 0; +} + +static void ref_perf_rwlock_unlock(int idx) +{ + read_unlock(&test_rwlock); +} + +static struct ref_perf_ops rwlock_ops = { + .init = ref_perf_rwlock_init, + .readlock = ref_perf_rwlock_lock, + .readunlock = ref_perf_rwlock_unlock, + .name = "rwlock" +}; + +// Definitions for rwsem +static struct rw_semaphore test_rwsem; + +static void ref_perf_rwsem_init(void) +{ + init_rwsem(&test_rwsem); +} + +static int ref_perf_rwsem_lock(void) +{ + down_read(&test_rwsem); + return 0; +} + +static void ref_perf_rwsem_unlock(int idx) +{ + up_read(&test_rwsem); +} + +static struct ref_perf_ops rwsem_ops = { + .init = ref_perf_rwsem_init, + .readlock = ref_perf_rwsem_lock, + .readunlock = ref_perf_rwsem_unlock, + .name = "rwsem" +}; + +// Reader kthread. Repeatedly does empty RCU read-side +// critical section, minimizing update-side interference. +static int +ref_perf_reader(void *arg) +{ + unsigned long flags; + long me = (long)arg; + struct reader_task *rt = &(reader_tasks[me]); + unsigned long spincnt; + int idx; + u64 start; + s64 duration; + + VERBOSE_PERFOUT("ref_perf_reader %ld: task started", me); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + atomic_inc(&n_init); +repeat: + VERBOSE_PERFOUT("ref_perf_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); + + // Wait for signal that this reader can start. + wait_event(rt->wq, (atomic_read(&nreaders_exp) && atomic_read(&rt->start)) || + torture_must_stop()); + + if (torture_must_stop()) + goto end; + + // Make sure that the CPU is affinitized appropriately during testing. + WARN_ON_ONCE(smp_processor_id() != me); + + atomic_dec(&rt->start); + + // To prevent noise, keep interrupts disabled. This also has the + // effect of preventing entries into slow path for rcu_read_unlock(). + local_irq_save(flags); + start = ktime_get_mono_fast_ns(); + + VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx); + + for (spincnt = 0; spincnt < loops; spincnt++) { + idx = cur_ops->readlock(); + cur_ops->readunlock(idx); + } + + duration = ktime_get_mono_fast_ns() - start; + local_irq_restore(flags); + + rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration; + + atomic_dec(&nreaders_exp); + + VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d ended, (readers remaining=%d)", + me, exp_idx, atomic_read(&nreaders_exp)); + + if (!atomic_read(&nreaders_exp)) + wake_up(&main_wq); + + if (!torture_must_stop()) + goto repeat; +end: + torture_kthread_stopping("ref_perf_reader"); + return 0; +} + +void reset_readers(int n) +{ + int i; + struct reader_task *rt; + + for (i = 0; i < n; i++) { + rt = &(reader_tasks[i]); + + rt->last_duration_ns = 0; + } +} + +// Print the results of each reader and return the sum of all their durations. +u64 process_durations(int n) +{ + int i; + struct reader_task *rt; + char buf1[64]; + char buf[512]; + u64 sum = 0; + + buf[0] = 0; + sprintf(buf, "Experiment #%d (Format: :)", + exp_idx); + + for (i = 0; i <= n && !torture_must_stop(); i++) { + rt = &(reader_tasks[i]); + sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns); + + if (i % 5 == 0) + strcat(buf, "\n"); + strcat(buf, buf1); + + sum += rt->last_duration_ns; + } + strcat(buf, "\n"); + + PERFOUT("%s\n", buf); + + return sum; +} + +// The main_func is the main orchestrator, it performs a bunch of +// experiments. For every experiment, it orders all the readers +// involved to start and waits for them to finish the experiment. It +// then reads their timestamps and starts the next experiment. Each +// experiment progresses from 1 concurrent reader to N of them at which +// point all the timestamps are printed. +static int main_func(void *arg) +{ + int exp, r; + char buf1[64]; + char buf[512]; + + set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + + VERBOSE_PERFOUT("main_func task started"); + atomic_inc(&n_init); + + // Wait for all threads to start. + wait_event(main_wq, atomic_read(&n_init) == (nreaders + 1)); + + // Start exp readers up per experiment + for (exp = 0; exp < nreaders && !torture_must_stop(); exp++) { + if (torture_must_stop()) + goto end; + + reset_readers(exp); + atomic_set(&nreaders_exp, exp + 1); + + exp_idx = exp; + + for (r = 0; r <= exp; r++) { + atomic_set(&reader_tasks[r].start, 1); + wake_up(&reader_tasks[r].wq); + } + + VERBOSE_PERFOUT("main_func: experiment started, waiting for %d readers", + exp); + + wait_event(main_wq, + !atomic_read(&nreaders_exp) || torture_must_stop()); + + VERBOSE_PERFOUT("main_func: experiment ended"); + + if (torture_must_stop()) + goto end; + + reader_tasks[exp].result_avg = process_durations(exp) / ((exp + 1) * loops); + } + + // Print the average of all experiments + PERFOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n"); + + buf[0] = 0; + strcat(buf, "\n"); + strcat(buf, "Threads\tTime(ns)\n"); + + for (exp = 0; exp < nreaders; exp++) { + sprintf(buf1, "%d\t%llu\n", exp + 1, reader_tasks[exp].result_avg); + strcat(buf, buf1); + } + + PERFOUT("%s", buf); + + // This will shutdown everything including us. + if (shutdown) { + shutdown_start = 1; + wake_up(&shutdown_wq); + } + + // Wait for torture to stop us + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); + +end: + torture_kthread_stopping("main_func"); + return 0; +} + +static void +ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag) +{ + pr_alert("%s" PERF_FLAG + "--- %s: verbose=%d shutdown=%d loops=%ld\n", perf_type, tag, + verbose, shutdown, loops); +} + +static void +ref_perf_cleanup(void) +{ + int i; + + if (torture_cleanup_begin()) + return; + + if (!cur_ops) { + torture_cleanup_end(); + return; + } + + if (reader_tasks) { + for (i = 0; i < nreaders; i++) + torture_stop_kthread("ref_perf_reader", + reader_tasks[i].task); + } + kfree(reader_tasks); + + torture_stop_kthread("main_task", main_task); + kfree(main_task); + + // Do perf-type-specific cleanup operations. + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +// Shutdown kthread. Just waits to be awakened, then shuts down system. +static int +ref_perf_shutdown(void *arg) +{ + wait_event(shutdown_wq, shutdown_start); + + smp_mb(); // Wake before output. + ref_perf_cleanup(); + kernel_power_off(); + + return -EINVAL; +} + +static int __init +ref_perf_init(void) +{ + long i; + int firsterr = 0; + static struct ref_perf_ops *perf_ops[] = { + &rcu_ops, &srcu_ops, &refcnt_ops, &rwlock_ops, &rwsem_ops, + }; + + if (!torture_init_begin(perf_type, verbose)) + return -EBUSY; + + for (i = 0; i < ARRAY_SIZE(perf_ops); i++) { + cur_ops = perf_ops[i]; + if (strcmp(perf_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(perf_ops)) { + pr_alert("rcu-perf: invalid perf type: \"%s\"\n", perf_type); + pr_alert("rcu-perf types:"); + for (i = 0; i < ARRAY_SIZE(perf_ops); i++) + pr_cont(" %s", perf_ops[i]->name); + pr_cont("\n"); + WARN_ON(!IS_MODULE(CONFIG_RCU_REF_PERF_TEST)); + firsterr = -EINVAL; + cur_ops = NULL; + goto unwind; + } + if (cur_ops->init) + cur_ops->init(); + + ref_perf_print_module_parms(cur_ops, "Start of test"); + + // Shutdown task + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(ref_perf_shutdown, NULL, + shutdown_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + } + + // Reader tasks (~75% of online CPUs). + nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); + reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), + GFP_KERNEL); + if (!reader_tasks) { + VERBOSE_PERFOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + + VERBOSE_PERFOUT("Starting %d reader threads\n", nreaders); + + for (i = 0; i < nreaders; i++) { + firsterr = torture_create_kthread(ref_perf_reader, (void *)i, + reader_tasks[i].task); + if (firsterr) + goto unwind; + + init_waitqueue_head(&(reader_tasks[i].wq)); + } + + // Main Task + init_waitqueue_head(&main_wq); + firsterr = torture_create_kthread(main_func, NULL, main_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + + + // Wait until all threads start + while (atomic_read(&n_init) < nreaders + 1) + schedule_timeout_uninterruptible(1); + + wake_up(&main_wq); + + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + ref_perf_cleanup(); + return firsterr; +} + +module_init(ref_perf_init); +module_exit(ref_perf_cleanup); -- cgit v1.2.3 From 708cda31652c02e64adaeafafe7b996e4e14c3eb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 May 2020 09:22:24 -0700 Subject: rcuperf: Add comments explaining the high reader overhead This commit adds comments explaining why the readers have otherwise insane levels of measurement overhead, namely that they are intended as a test load for update-side performance measurements, not as a straight-up read-side performance test. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 246da8fe199e..d906ca987936 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -69,6 +69,11 @@ MODULE_AUTHOR("Paul E. McKenney "); * value specified by nr_cpus for a read-only test. * * Various other use cases may of course be specified. + * + * Note that this test's readers are intended only as a test load for + * the writers. The reader performance statistics will be overly + * pessimistic due to the per-critical-section interrupt disabling, + * test-end checks, and the pair of calls through pointers. */ #ifdef MODULE @@ -309,8 +314,10 @@ static void rcu_perf_wait_shutdown(void) } /* - * RCU perf reader kthread. Repeatedly does empty RCU read-side - * critical section, minimizing update-side interference. + * RCU perf reader kthread. Repeatedly does empty RCU read-side critical + * section, minimizing update-side interference. However, the point of + * this test is not to evaluate reader performance, but instead to serve + * as a test load for update-side performance testing. */ static int rcu_perf_reader(void *arg) -- cgit v1.2.3 From 777a54c908ec69fa0eccab54068a49ecda38ffde Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 May 2020 14:16:44 -0700 Subject: refperf: Add holdoff parameter to allow CPUs to come online This commit adds an rcuperf module parameter named "holdoff" that defaults to 10 seconds if refperf is built in and to zero otherwise. The assumption is that all the CPUs are online by the time that the modprobe and insmod commands are going to do anything, and that normal systems will have all the CPUs online within ten seconds. Larger systems may take many tens of seconds or even minutes to get to this point, hence this being a module parameter instead of being a hard-coded constant. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 61161530acc8..4d686fdc3105 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -57,7 +57,10 @@ MODULE_PARM_DESC(perf_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); -// Number of loops per experiment, all readers execute an operation concurrently +// Wait until there are multiple CPUs before starting test. +torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_PERF_TEST) ? 10 : 0, + "Holdoff time before test start (s)"); +// Number of loops per experiment, all readers execute operations concurrently. torture_param(long, loops, 10000000, "Number of loops per experiment."); #ifdef MODULE @@ -248,6 +251,8 @@ ref_perf_reader(void *arg) set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); set_user_nice(current, MAX_NICE); atomic_inc(&n_init); + if (holdoff) + schedule_timeout_interruptible(holdoff * HZ); repeat: VERBOSE_PERFOUT("ref_perf_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); @@ -357,6 +362,8 @@ static int main_func(void *arg) // Wait for all threads to start. wait_event(main_wq, atomic_read(&n_init) == (nreaders + 1)); + if (holdoff) + schedule_timeout_interruptible(holdoff * HZ); // Start exp readers up per experiment for (exp = 0; exp < nreaders && !torture_must_stop(); exp++) { @@ -420,8 +427,8 @@ static void ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag) { pr_alert("%s" PERF_FLAG - "--- %s: verbose=%d shutdown=%d loops=%ld\n", perf_type, tag, - verbose, shutdown, loops); + "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld\n", perf_type, tag, + verbose, shutdown, holdoff, loops); } static void -- cgit v1.2.3 From 75dd8efef56ed5959c398974c785026f84aa0d1a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 May 2020 14:59:06 -0700 Subject: refperf: Hoist function-pointer calls out of the loop Current runs show PREEMPT=n rcu_read_lock()/rcu_read_unlock() pairs consuming between 20 and 30 nanoseconds, when in fact the actual value is zero, give or take the barrier() asm's effect on compiler optimizations. The additional overhead is caused by function calls through pointers (especially in these days of Spectre mitigations) and perhaps also needless argument passing, a non-const loop limit, and an upcounting loop. This commit therefore combines the ->readlock() and ->readunlock() function pointers into a single ->readsection() function pointer that takes the loop count as a const parameter and keeps any data passed from the read-lock to the read-unlock internal to this new function. These changes reduce the measured overhead of the aforementioned PREEMPT=n rcu_read_lock()/rcu_read_unlock() pairs from between 20 and 30 nanoseconds to somewhere south of 500 picoseconds. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 92 ++++++++++++++++++++++------------------------------ 1 file changed, 38 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 4d686fdc3105..57c7b7a40bd2 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -108,23 +108,20 @@ static int exp_idx; struct ref_perf_ops { void (*init)(void); void (*cleanup)(void); - int (*readlock)(void); - void (*readunlock)(int idx); + void (*readsection)(const int nloops); const char *name; }; static struct ref_perf_ops *cur_ops; -// Definitions for RCU ref perf testing. -static int ref_rcu_read_lock(void) __acquires(RCU) +static void ref_rcu_read_section(const int nloops) { - rcu_read_lock(); - return 0; -} + int i; -static void ref_rcu_read_unlock(int idx) __releases(RCU) -{ - rcu_read_unlock(); + for (i = nloops; i >= 0; i--) { + rcu_read_lock(); + rcu_read_unlock(); + } } static void rcu_sync_perf_init(void) @@ -133,8 +130,7 @@ static void rcu_sync_perf_init(void) static struct ref_perf_ops rcu_ops = { .init = rcu_sync_perf_init, - .readlock = ref_rcu_read_lock, - .readunlock = ref_rcu_read_unlock, + .readsection = ref_rcu_read_section, .name = "rcu" }; @@ -143,42 +139,39 @@ static struct ref_perf_ops rcu_ops = { DEFINE_STATIC_SRCU(srcu_refctl_perf); static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf; -static int srcu_ref_perf_read_lock(void) __acquires(srcu_ctlp) +static void srcu_ref_perf_read_section(int nloops) { - return srcu_read_lock(srcu_ctlp); -} + int i; + int idx; -static void srcu_ref_perf_read_unlock(int idx) __releases(srcu_ctlp) -{ - srcu_read_unlock(srcu_ctlp, idx); + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock(srcu_ctlp); + srcu_read_unlock(srcu_ctlp, idx); + } } static struct ref_perf_ops srcu_ops = { .init = rcu_sync_perf_init, - .readlock = srcu_ref_perf_read_lock, - .readunlock = srcu_ref_perf_read_unlock, + .readsection = srcu_ref_perf_read_section, .name = "srcu" }; // Definitions for reference count static atomic_t refcnt; -static int srcu_ref_perf_refcnt_lock(void) +static void ref_perf_refcnt_section(const int nloops) { - atomic_inc(&refcnt); - return 0; -} + int i; -static void srcu_ref_perf_refcnt_unlock(int idx) __releases(srcu_ctlp) -{ - atomic_dec(&refcnt); - srcu_read_unlock(srcu_ctlp, idx); + for (i = nloops; i >= 0; i--) { + atomic_inc(&refcnt); + atomic_dec(&refcnt); + } } static struct ref_perf_ops refcnt_ops = { .init = rcu_sync_perf_init, - .readlock = srcu_ref_perf_refcnt_lock, - .readunlock = srcu_ref_perf_refcnt_unlock, + .readsection = ref_perf_refcnt_section, .name = "refcnt" }; @@ -190,21 +183,19 @@ static void ref_perf_rwlock_init(void) rwlock_init(&test_rwlock); } -static int ref_perf_rwlock_lock(void) +static void ref_perf_rwlock_section(const int nloops) { - read_lock(&test_rwlock); - return 0; -} + int i; -static void ref_perf_rwlock_unlock(int idx) -{ - read_unlock(&test_rwlock); + for (i = nloops; i >= 0; i--) { + read_lock(&test_rwlock); + read_unlock(&test_rwlock); + } } static struct ref_perf_ops rwlock_ops = { .init = ref_perf_rwlock_init, - .readlock = ref_perf_rwlock_lock, - .readunlock = ref_perf_rwlock_unlock, + .readsection = ref_perf_rwlock_section, .name = "rwlock" }; @@ -216,21 +207,19 @@ static void ref_perf_rwsem_init(void) init_rwsem(&test_rwsem); } -static int ref_perf_rwsem_lock(void) +static void ref_perf_rwsem_section(const int nloops) { - down_read(&test_rwsem); - return 0; -} + int i; -static void ref_perf_rwsem_unlock(int idx) -{ - up_read(&test_rwsem); + for (i = nloops; i >= 0; i--) { + down_read(&test_rwsem); + up_read(&test_rwsem); + } } static struct ref_perf_ops rwsem_ops = { .init = ref_perf_rwsem_init, - .readlock = ref_perf_rwsem_lock, - .readunlock = ref_perf_rwsem_unlock, + .readsection = ref_perf_rwsem_section, .name = "rwsem" }; @@ -242,8 +231,6 @@ ref_perf_reader(void *arg) unsigned long flags; long me = (long)arg; struct reader_task *rt = &(reader_tasks[me]); - unsigned long spincnt; - int idx; u64 start; s64 duration; @@ -275,10 +262,7 @@ repeat: VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx); - for (spincnt = 0; spincnt < loops; spincnt++) { - idx = cur_ops->readlock(); - cur_ops->readunlock(idx); - } + cur_ops->readsection(loops); duration = ktime_get_mono_fast_ns() - start; local_irq_restore(flags); -- cgit v1.2.3 From 83b88c86da0e5f97faeac5a9bb19fe32f8c0394b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 May 2020 15:31:07 -0700 Subject: refperf: Allow decimal nanoseconds The CONFIG_PREEMPT=n rcu_read_lock()/rcu_read_unlock() pair's overhead, even including loop overhead, is far less than one nanosecond. Since logscale plots are not all that happy with zero values, provide picoseconds as decimals. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 57c7b7a40bd2..e991d4820f51 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -375,7 +375,7 @@ static int main_func(void *arg) if (torture_must_stop()) goto end; - reader_tasks[exp].result_avg = process_durations(exp) / ((exp + 1) * loops); + reader_tasks[exp].result_avg = 1000 * process_durations(exp) / ((exp + 1) * loops); } // Print the average of all experiments @@ -386,7 +386,7 @@ static int main_func(void *arg) strcat(buf, "Threads\tTime(ns)\n"); for (exp = 0; exp < nreaders; exp++) { - sprintf(buf1, "%d\t%llu\n", exp + 1, reader_tasks[exp].result_avg); + sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, reader_tasks[exp].result_avg / 1000, (int)(reader_tasks[exp].result_avg % 1000)); strcat(buf, buf1); } -- cgit v1.2.3 From 8fc28783a0c3704ea27505a25dbde8333d75380c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 May 2020 15:48:38 -0700 Subject: refperf: Convert nreaders to a module parameter This commit converts nreaders to a module parameter, with the default of -1 specifying the old behavior of using 75% of the readers. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index e991d4820f51..020e55a9a64b 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -62,6 +62,12 @@ torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_PERF_TEST) ? 10 : 0, "Holdoff time before test start (s)"); // Number of loops per experiment, all readers execute operations concurrently. torture_param(long, loops, 10000000, "Number of loops per experiment."); +// Number of readers, with -1 defaulting to about 75% of the CPUs. +torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); +// Number of runs. +torture_param(int, nruns, 30, "Number of experiments to run."); +// Reader delay in nanoseconds, 0 for no delay. +torture_param(int, readdelay, 0, "Read-side delay in nanoseconds."); #ifdef MODULE # define REFPERF_SHUTDOWN 0 @@ -93,7 +99,6 @@ static wait_queue_head_t main_wq; static int shutdown_start; static struct reader_task *reader_tasks; -static int nreaders; // Number of readers that are part of the current experiment. static atomic_t nreaders_exp; @@ -411,8 +416,8 @@ static void ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag) { pr_alert("%s" PERF_FLAG - "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld\n", perf_type, tag, - verbose, shutdown, holdoff, loops); + "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d\n", perf_type, tag, + verbose, shutdown, holdoff, loops, nreaders); } static void @@ -501,8 +506,9 @@ ref_perf_init(void) schedule_timeout_uninterruptible(1); } - // Reader tasks (~75% of online CPUs). - nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); + // Reader tasks (default to ~75% of online CPUs). + if (nreaders < 0) + nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (!reader_tasks) { -- cgit v1.2.3 From dbf28efdae7bb51032eeb0fe1b6bd07d6f0f9b6c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 May 2020 17:22:24 -0700 Subject: refperf: Provide module parameter to specify number of experiments The current code uses the number of threads both to limit the number of threads and to specify the number of experiments, but also varies the number of threads as the experiments progress. This commit takes a different approach by adding an refperf.nruns module parameter that specifies the number of experiments, and furthermore uses the same number of threads for each experiment. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 020e55a9a64b..6324449db404 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -83,12 +83,6 @@ struct reader_task { atomic_t start; wait_queue_head_t wq; u64 last_duration_ns; - - // The average latency When 1.. are concurrently - // running an experiment. For example, if this reader_task is - // of index 5 in the reader_tasks array, then result is for - // 6 cores. - u64 result_avg; }; static struct task_struct *shutdown_task; @@ -289,12 +283,12 @@ end: return 0; } -void reset_readers(int n) +void reset_readers(void) { int i; struct reader_task *rt; - for (i = 0; i < n; i++) { + for (i = 0; i < nreaders; i++) { rt = &(reader_tasks[i]); rt->last_duration_ns = 0; @@ -314,7 +308,7 @@ u64 process_durations(int n) sprintf(buf, "Experiment #%d (Format: :)", exp_idx); - for (i = 0; i <= n && !torture_must_stop(); i++) { + for (i = 0; i < n && !torture_must_stop(); i++) { rt = &(reader_tasks[i]); sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns); @@ -342,11 +336,15 @@ static int main_func(void *arg) int exp, r; char buf1[64]; char buf[512]; + u64 *result_avg; set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids)); set_user_nice(current, MAX_NICE); VERBOSE_PERFOUT("main_func task started"); + result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL); + if (!result_avg) + VERBOSE_PERFOUT_ERRSTRING("out of memory"); atomic_inc(&n_init); // Wait for all threads to start. @@ -355,22 +353,24 @@ static int main_func(void *arg) schedule_timeout_interruptible(holdoff * HZ); // Start exp readers up per experiment - for (exp = 0; exp < nreaders && !torture_must_stop(); exp++) { + for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { + if (!result_avg) + break; if (torture_must_stop()) goto end; - reset_readers(exp); - atomic_set(&nreaders_exp, exp + 1); + reset_readers(); + atomic_set(&nreaders_exp, nreaders); exp_idx = exp; - for (r = 0; r <= exp; r++) { + for (r = 0; r < nreaders; r++) { atomic_set(&reader_tasks[r].start, 1); wake_up(&reader_tasks[r].wq); } VERBOSE_PERFOUT("main_func: experiment started, waiting for %d readers", - exp); + nreaders); wait_event(main_wq, !atomic_read(&nreaders_exp) || torture_must_stop()); @@ -380,7 +380,7 @@ static int main_func(void *arg) if (torture_must_stop()) goto end; - reader_tasks[exp].result_avg = 1000 * process_durations(exp) / ((exp + 1) * loops); + result_avg[exp] = 1000 * process_durations(nreaders) / (nreaders * loops); } // Print the average of all experiments @@ -390,12 +390,15 @@ static int main_func(void *arg) strcat(buf, "\n"); strcat(buf, "Threads\tTime(ns)\n"); - for (exp = 0; exp < nreaders; exp++) { - sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, reader_tasks[exp].result_avg / 1000, (int)(reader_tasks[exp].result_avg % 1000)); + for (exp = 0; exp < nruns; exp++) { + if (!result_avg) + break; + sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, result_avg[exp] / 1000, (int)(result_avg[exp] % 1000)); strcat(buf, buf1); } - PERFOUT("%s", buf); + if (result_avg) + PERFOUT("%s", buf); // This will shutdown everything including us. if (shutdown) { @@ -416,8 +419,8 @@ static void ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag) { pr_alert("%s" PERF_FLAG - "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d\n", perf_type, tag, - verbose, shutdown, holdoff, loops, nreaders); + "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d\n", perf_type, tag, + verbose, shutdown, holdoff, loops, nreaders, nruns); } static void -- cgit v1.2.3 From f518f154ecef347777db33b7c9b0581f245159f0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 May 2020 17:32:56 -0700 Subject: refperf: Dynamically allocate experiment-summary output buffer Currently, the buffer used to accumulate the experiment-summary output is fixed size, which will cause problems if someone decides to run one hundred experiments. This commit therefore dynamically allocates this buffer. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 6324449db404..75b9cceaece1 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -333,9 +333,10 @@ u64 process_durations(int n) // point all the timestamps are printed. static int main_func(void *arg) { + bool errexit = false; int exp, r; char buf1[64]; - char buf[512]; + char *buf; u64 *result_avg; set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids)); @@ -343,8 +344,11 @@ static int main_func(void *arg) VERBOSE_PERFOUT("main_func task started"); result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL); - if (!result_avg) + buf = kzalloc(64 + nruns * 32, GFP_KERNEL); + if (!result_avg || !buf) { VERBOSE_PERFOUT_ERRSTRING("out of memory"); + errexit = true; + } atomic_inc(&n_init); // Wait for all threads to start. @@ -354,7 +358,7 @@ static int main_func(void *arg) // Start exp readers up per experiment for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { - if (!result_avg) + if (errexit) break; if (torture_must_stop()) goto end; @@ -391,13 +395,13 @@ static int main_func(void *arg) strcat(buf, "Threads\tTime(ns)\n"); for (exp = 0; exp < nruns; exp++) { - if (!result_avg) + if (errexit) break; sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, result_avg[exp] / 1000, (int)(result_avg[exp] % 1000)); strcat(buf, buf1); } - if (result_avg) + if (!errexit) PERFOUT("%s", buf); // This will shutdown everything including us. @@ -412,6 +416,8 @@ static int main_func(void *arg) end: torture_kthread_stopping("main_func"); + kfree(result_avg); + kfree(buf); return 0; } -- cgit v1.2.3 From 2e90de76f226f11fe26c871aa321be28152f565a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 May 2020 17:45:03 -0700 Subject: refperf: Dynamically allocate thread-summary output buffer Currently, the buffer used to accumulate the thread-summary output is fixed size, which will cause problems if someone decides to run on a large number of PCUs. This commit therefore dynamically allocates this buffer. [ paulmck: Fix memory allocation as suggested by KASAN. ] Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 75b9cceaece1..fc940e3dba1f 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -301,9 +301,12 @@ u64 process_durations(int n) int i; struct reader_task *rt; char buf1[64]; - char buf[512]; + char *buf; u64 sum = 0; + buf = kmalloc(128 + nreaders * 32, GFP_KERNEL); + if (!buf) + return 0; buf[0] = 0; sprintf(buf, "Experiment #%d (Format: :)", exp_idx); @@ -322,6 +325,7 @@ u64 process_durations(int n) PERFOUT("%s\n", buf); + kfree(buf); return sum; } -- cgit v1.2.3 From 2990750bceb05c3cdeae3a6d2683cbc4ae4de15e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 May 2020 09:32:57 -0700 Subject: refperf: Make functions static Because the reset_readers() and process_durations() functions are used only within kernel/rcu/refperf.c, this commit makes them static. Reported-by: kbuild test robot Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index fc940e3dba1f..0a900f3ae151 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -283,7 +283,7 @@ end: return 0; } -void reset_readers(void) +static void reset_readers(void) { int i; struct reader_task *rt; @@ -296,7 +296,7 @@ void reset_readers(void) } // Print the results of each reader and return the sum of all their durations. -u64 process_durations(int n) +static u64 process_durations(int n) { int i; struct reader_task *rt; -- cgit v1.2.3 From b864f89ff61492f56b4e8c6713a5efec6540a0e2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 May 2020 10:57:34 -0700 Subject: refperf: Tune reader measurement interval This commit moves a printk() out of the measurement interval, converts a atomic_dec()/atomic_read() pair to atomic_dec_and_test(), and adds a smp_mb__before_atomic() to avoid potential wake/wait hangs. These changes have the added benefit of reducing the number of loops required for amortizing loop overhead for CONFIG_PREEMPT=n RCU measurements from 1,000,000 to 10,000. This reduction in turn shortens the test, reducing the probability of interference. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 0a900f3ae151..8815ccfb6f98 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -252,15 +252,16 @@ repeat: // Make sure that the CPU is affinitized appropriately during testing. WARN_ON_ONCE(smp_processor_id() != me); + smp_mb__before_atomic(); atomic_dec(&rt->start); + VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx); + // To prevent noise, keep interrupts disabled. This also has the // effect of preventing entries into slow path for rcu_read_unlock(). local_irq_save(flags); start = ktime_get_mono_fast_ns(); - VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx); - cur_ops->readsection(loops); duration = ktime_get_mono_fast_ns() - start; @@ -268,14 +269,12 @@ repeat: rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration; - atomic_dec(&nreaders_exp); + if (atomic_dec_and_test(&nreaders_exp)) + wake_up(&main_wq); VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d ended, (readers remaining=%d)", me, exp_idx, atomic_read(&nreaders_exp)); - if (!atomic_read(&nreaders_exp)) - wake_up(&main_wq); - if (!torture_must_stop()) goto repeat; end: -- cgit v1.2.3 From af2789db13b8dc38d16e969f8c11b9468be42d46 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 May 2020 11:22:03 -0700 Subject: refperf: Convert reader_task structure's "start" field to int This commit converts the reader_task structure's "start" field to int in order to demote a full barrier to an smp_load_acquire() and also to simplify the code a bit. While in the area, and to enlist the compiler's help in ensuring that nothing was missed, the field's name was changed to start_reader. Also while in the area, change the main_func() store to use smp_store_release() to further fortify against wait/wake races. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 8815ccfb6f98..2fd3ed1a0d0d 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -80,7 +80,7 @@ torture_param(bool, shutdown, REFPERF_SHUTDOWN, struct reader_task { struct task_struct *task; - atomic_t start; + int start_reader; wait_queue_head_t wq; u64 last_duration_ns; }; @@ -243,7 +243,7 @@ repeat: VERBOSE_PERFOUT("ref_perf_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); // Wait for signal that this reader can start. - wait_event(rt->wq, (atomic_read(&nreaders_exp) && atomic_read(&rt->start)) || + wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) || torture_must_stop()); if (torture_must_stop()) @@ -252,8 +252,7 @@ repeat: // Make sure that the CPU is affinitized appropriately during testing. WARN_ON_ONCE(smp_processor_id() != me); - smp_mb__before_atomic(); - atomic_dec(&rt->start); + WRITE_ONCE(rt->start_reader, 0); VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx); @@ -372,7 +371,7 @@ static int main_func(void *arg) exp_idx = exp; for (r = 0; r < nreaders; r++) { - atomic_set(&reader_tasks[r].start, 1); + smp_store_release(&reader_tasks[r].start_reader, 1); wake_up(&reader_tasks[r].wq); } -- cgit v1.2.3 From 86e0da2bb8ed934d3dce5a337895f1118f59c087 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 May 2020 11:40:52 -0700 Subject: refperf: More closely synchronize reader start times Currently, readers are awakened individually. On most systems, this results in significant wakeup delay from one reader to the next, which can result in the first and last reader having sole access to the synchronization primitive in question. If that synchronization primitive involves shared memory, those readers will rack up a huge number of operations in a very short time, causing large perturbations in the results. This commit therefore has the readers busy-wait after being awakened, and uses a new n_started variable to synchronize their start times. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 2fd3ed1a0d0d..234bb0e84a8b 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -99,6 +99,7 @@ static atomic_t nreaders_exp; // Use to wait for all threads to start. static atomic_t n_init; +static atomic_t n_started; // Track which experiment is currently running. static int exp_idx; @@ -253,6 +254,9 @@ repeat: WARN_ON_ONCE(smp_processor_id() != me); WRITE_ONCE(rt->start_reader, 0); + if (!atomic_dec_return(&n_started)) + while (atomic_read_acquire(&n_started)) + cpu_relax(); VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx); @@ -367,6 +371,7 @@ static int main_func(void *arg) reset_readers(); atomic_set(&nreaders_exp, nreaders); + atomic_set(&n_started, nreaders); exp_idx = exp; -- cgit v1.2.3 From 2db0bda38453f472640f4ece1e2a495cbd44f892 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 May 2020 12:34:57 -0700 Subject: refperf: Add warmup and cooldown processing phases This commit causes all the readers to start running unmeasured load until all readers have done at least one such run (thus having warmed up), then run the measured load, and then run unmeasured load until all readers have completed their measured load. This approach avoids any thread running measured load while other readers are idle. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 234bb0e84a8b..445190b97b05 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -100,6 +100,8 @@ static atomic_t nreaders_exp; // Use to wait for all threads to start. static atomic_t n_init; static atomic_t n_started; +static atomic_t n_warmedup; +static atomic_t n_cooleddown; // Track which experiment is currently running. static int exp_idx; @@ -260,8 +262,15 @@ repeat: VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx); - // To prevent noise, keep interrupts disabled. This also has the - // effect of preventing entries into slow path for rcu_read_unlock(). + + // To reduce noise, do an initial cache-warming invocation, check + // in, and then keep warming until everyone has checked in. + cur_ops->readsection(loops); + if (!atomic_dec_return(&n_warmedup)) + while (atomic_read_acquire(&n_warmedup)) + cur_ops->readsection(loops); + // Also keep interrupts disabled. This also has the effect + // of preventing entries into slow path for rcu_read_unlock(). local_irq_save(flags); start = ktime_get_mono_fast_ns(); @@ -271,6 +280,11 @@ repeat: local_irq_restore(flags); rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration; + // To reduce runtime-skew noise, do maintain-load invocations until + // everyone is done. + if (!atomic_dec_return(&n_cooleddown)) + while (atomic_read_acquire(&n_cooleddown)) + cur_ops->readsection(loops); if (atomic_dec_and_test(&nreaders_exp)) wake_up(&main_wq); @@ -372,6 +386,8 @@ static int main_func(void *arg) reset_readers(); atomic_set(&nreaders_exp, nreaders); atomic_set(&n_started, nreaders); + atomic_set(&n_warmedup, nreaders); + atomic_set(&n_cooleddown, nreaders); exp_idx = exp; -- cgit v1.2.3 From 6efb06340846c788336f402e3a472a24fabb431e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 May 2020 14:26:25 -0700 Subject: refperf: Label experiment-number column "Runs" The experiment-number column is currently labeled "Threads", which is misleading at best. This commit therefore relabels it as "Runs", and adjusts the scripts accordingly. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 2 +- tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 445190b97b05..2d2d227d761a 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -415,7 +415,7 @@ static int main_func(void *arg) buf[0] = 0; strcat(buf, "\n"); - strcat(buf, "Threads\tTime(ns)\n"); + strcat(buf, "Runs\tTime(ns)\n"); for (exp = 0; exp < nruns; exp++) { if (errexit) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh index 6fc06cd3538e..0660f3fab215 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh @@ -24,7 +24,7 @@ configfile=`echo $i | sed -e 's/^.*\///'` sed -e 's/^\[[^]]*]//' < $i/console.log | tr -d '\015' | awk -v configfile="$configfile" ' -/^[ ]*Threads Time\(ns\) *$/ { +/^[ ]*Runs Time\(ns\) *$/ { if (dataphase + 0 == 0) { dataphase = 1; # print configfile, $0; -- cgit v1.2.3 From 96af8669591d740a1e2695c4d96e544409dbf896 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 May 2020 16:46:56 -0700 Subject: refperf: Simplify initialization-time wakeup protocol This commit moves the reader-launch wait loop from ref_perf_init() to main_func(), removing one layer of wakeup and allowing slightly faster system boot. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 2d2d227d761a..7839237ffc17 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -369,13 +369,14 @@ static int main_func(void *arg) VERBOSE_PERFOUT_ERRSTRING("out of memory"); errexit = true; } - atomic_inc(&n_init); - - // Wait for all threads to start. - wait_event(main_wq, atomic_read(&n_init) == (nreaders + 1)); if (holdoff) schedule_timeout_interruptible(holdoff * HZ); + // Wait for all threads to start. + atomic_inc(&n_init); + while (atomic_read(&n_init) < nreaders + 1) + schedule_timeout_uninterruptible(1); + // Start exp readers up per experiment for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { if (errexit) @@ -565,14 +566,6 @@ ref_perf_init(void) firsterr = torture_create_kthread(main_func, NULL, main_task); if (firsterr) goto unwind; - schedule_timeout_uninterruptible(1); - - - // Wait until all threads start - while (atomic_read(&n_init) < nreaders + 1) - schedule_timeout_uninterruptible(1); - - wake_up(&main_wq); torture_init_end(); return 0; -- cgit v1.2.3 From b4d1e34f6502a138e32275baabdb6d593d7ea432 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 May 2020 16:37:35 -0700 Subject: refperf: Add read-side delay module parameter This commit adds a refperf.readdelay module parameter that controls the duration of each critical section. This parameter allows gathering data showing how the performance differences between the various primitives vary with critical-section length. Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 108 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 89 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 7839237ffc17..57a750bbcaca 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -66,8 +66,8 @@ torture_param(long, loops, 10000000, "Number of loops per experiment."); torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); // Number of runs. torture_param(int, nruns, 30, "Number of experiments to run."); -// Reader delay in nanoseconds, 0 for no delay. -torture_param(int, readdelay, 0, "Read-side delay in nanoseconds."); +// Reader delay in microseconds, 0 for no delay. +torture_param(int, readdelay, 0, "Read-side delay in microseconds."); #ifdef MODULE # define REFPERF_SHUTDOWN 0 @@ -111,6 +111,7 @@ struct ref_perf_ops { void (*init)(void); void (*cleanup)(void); void (*readsection)(const int nloops); + void (*delaysection)(const int nloops, const int ndelay); const char *name; }; @@ -126,6 +127,17 @@ static void ref_rcu_read_section(const int nloops) } } +static void ref_rcu_delay_section(const int nloops, const int ndelay) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock(); + udelay(ndelay); + rcu_read_unlock(); + } +} + static void rcu_sync_perf_init(void) { } @@ -133,6 +145,7 @@ static void rcu_sync_perf_init(void) static struct ref_perf_ops rcu_ops = { .init = rcu_sync_perf_init, .readsection = ref_rcu_read_section, + .delaysection = ref_rcu_delay_section, .name = "rcu" }; @@ -141,7 +154,7 @@ static struct ref_perf_ops rcu_ops = { DEFINE_STATIC_SRCU(srcu_refctl_perf); static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf; -static void srcu_ref_perf_read_section(int nloops) +static void srcu_ref_perf_read_section(const int nloops) { int i; int idx; @@ -152,16 +165,29 @@ static void srcu_ref_perf_read_section(int nloops) } } +static void srcu_ref_perf_delay_section(const int nloops, const int ndelay) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock(srcu_ctlp); + udelay(ndelay); + srcu_read_unlock(srcu_ctlp, idx); + } +} + static struct ref_perf_ops srcu_ops = { .init = rcu_sync_perf_init, .readsection = srcu_ref_perf_read_section, + .delaysection = srcu_ref_perf_delay_section, .name = "srcu" }; // Definitions for reference count static atomic_t refcnt; -static void ref_perf_refcnt_section(const int nloops) +static void ref_refcnt_section(const int nloops) { int i; @@ -171,45 +197,69 @@ static void ref_perf_refcnt_section(const int nloops) } } +static void ref_refcnt_delay_section(const int nloops, const int ndelay) +{ + int i; + + for (i = nloops; i >= 0; i--) { + atomic_inc(&refcnt); + udelay(ndelay); + atomic_dec(&refcnt); + } +} + static struct ref_perf_ops refcnt_ops = { .init = rcu_sync_perf_init, - .readsection = ref_perf_refcnt_section, + .readsection = ref_refcnt_section, + .delaysection = ref_refcnt_delay_section, .name = "refcnt" }; // Definitions for rwlock static rwlock_t test_rwlock; -static void ref_perf_rwlock_init(void) +static void ref_rwlock_init(void) { rwlock_init(&test_rwlock); } -static void ref_perf_rwlock_section(const int nloops) +static void ref_rwlock_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + read_lock(&test_rwlock); + read_unlock(&test_rwlock); + } +} + +static void ref_rwlock_delay_section(const int nloops, const int ndelay) { int i; for (i = nloops; i >= 0; i--) { read_lock(&test_rwlock); + udelay(ndelay); read_unlock(&test_rwlock); } } static struct ref_perf_ops rwlock_ops = { - .init = ref_perf_rwlock_init, - .readsection = ref_perf_rwlock_section, + .init = ref_rwlock_init, + .readsection = ref_rwlock_section, + .delaysection = ref_rwlock_delay_section, .name = "rwlock" }; // Definitions for rwsem static struct rw_semaphore test_rwsem; -static void ref_perf_rwsem_init(void) +static void ref_rwsem_init(void) { init_rwsem(&test_rwsem); } -static void ref_perf_rwsem_section(const int nloops) +static void ref_rwsem_section(const int nloops) { int i; @@ -219,12 +269,32 @@ static void ref_perf_rwsem_section(const int nloops) } } +static void ref_rwsem_delay_section(const int nloops, const int ndelay) +{ + int i; + + for (i = nloops; i >= 0; i--) { + down_read(&test_rwsem); + udelay(ndelay); + up_read(&test_rwsem); + } +} + static struct ref_perf_ops rwsem_ops = { - .init = ref_perf_rwsem_init, - .readsection = ref_perf_rwsem_section, + .init = ref_rwsem_init, + .readsection = ref_rwsem_section, + .delaysection = ref_rwsem_delay_section, .name = "rwsem" }; +static void rcu_perf_one_reader(void) +{ + if (readdelay <= 0) + cur_ops->readsection(loops); + else + cur_ops->delaysection(loops, readdelay); +} + // Reader kthread. Repeatedly does empty RCU read-side // critical section, minimizing update-side interference. static int @@ -265,16 +335,16 @@ repeat: // To reduce noise, do an initial cache-warming invocation, check // in, and then keep warming until everyone has checked in. - cur_ops->readsection(loops); + rcu_perf_one_reader(); if (!atomic_dec_return(&n_warmedup)) while (atomic_read_acquire(&n_warmedup)) - cur_ops->readsection(loops); + rcu_perf_one_reader(); // Also keep interrupts disabled. This also has the effect // of preventing entries into slow path for rcu_read_unlock(). local_irq_save(flags); start = ktime_get_mono_fast_ns(); - cur_ops->readsection(loops); + rcu_perf_one_reader(); duration = ktime_get_mono_fast_ns() - start; local_irq_restore(flags); @@ -284,7 +354,7 @@ repeat: // everyone is done. if (!atomic_dec_return(&n_cooleddown)) while (atomic_read_acquire(&n_cooleddown)) - cur_ops->readsection(loops); + rcu_perf_one_reader(); if (atomic_dec_and_test(&nreaders_exp)) wake_up(&main_wq); @@ -449,8 +519,8 @@ static void ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag) { pr_alert("%s" PERF_FLAG - "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d\n", perf_type, tag, - verbose, shutdown, holdoff, loops, nreaders, nruns); + "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", perf_type, tag, + verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay); } static void -- cgit v1.2.3 From 4dd72a338a07486823037a6b45334d05192c913a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 29 May 2020 13:11:26 -0700 Subject: refperf: Adjust refperf.loop default value With the various measurement optimizations, 10,000 loops normally suffices. This commit therefore reduces the refperf.loops default value from 10,000,000 to 10,000. Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 57a750bbcaca..063eeb0473a1 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -61,7 +61,7 @@ torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_PERF_TEST) ? 10 : 0, "Holdoff time before test start (s)"); // Number of loops per experiment, all readers execute operations concurrently. -torture_param(long, loops, 10000000, "Number of loops per experiment."); +torture_param(long, loops, 10000, "Number of loops per experiment."); // Number of readers, with -1 defaulting to about 75% of the CPUs. torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); // Number of runs. -- cgit v1.2.3 From 7c944d7c67daee84e3c756bb74ad2f32b28c41cf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 29 May 2020 14:36:26 -0700 Subject: refperf: Work around 64-bit division MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A 64-bit division was introduced in refperf, breaking compilation on all 32-bit architectures: kernel/rcu/refperf.o: in function `main_func': refperf.c:(.text+0x57c): undefined reference to `__aeabi_uldivmod' Fix this by using div_u64 to mark the expensive operation. [ paulmck: Update primitive and format per Nathan Chancellor. ] Fixes: bd5b16d6c88d ("refperf: Allow decimal nanoseconds") Reported-by: kbuild test robot Reported-by: Valdis Klētnieks Acked-by: Randy Dunlap # build-tested Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 063eeb0473a1..80d449060bdf 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -478,7 +478,7 @@ static int main_func(void *arg) if (torture_must_stop()) goto end; - result_avg[exp] = 1000 * process_durations(nreaders) / (nreaders * loops); + result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops); } // Print the average of all experiments @@ -489,9 +489,13 @@ static int main_func(void *arg) strcat(buf, "Runs\tTime(ns)\n"); for (exp = 0; exp < nruns; exp++) { + u64 avg; + u32 rem; + if (errexit) break; - sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, result_avg[exp] / 1000, (int)(result_avg[exp] % 1000)); + avg = div_u64_rem(result_avg[exp], 1000, &rem); + sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem); strcat(buf, buf1); } -- cgit v1.2.3 From 918b351d965560c7902ad482cf87049517843ff2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 31 May 2020 18:14:57 -0700 Subject: refperf: Change readdelay module parameter to nanoseconds The current units of microseconds are too coarse, so this commit changes the units to nanoseconds. However, ndelay is used only for the nanoseconds with udelay being used for whole microseconds. For example, setting refperf.readdelay=1500 results in a udelay(1) followed by an ndelay(500). Suggested-by: Akira Yokosawa [ paulmck: Abstracted delay per Akira feedback and move from 80 to 100 lines. ] [ paulmck: Fix names as suggested by kbuild test robot. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 80d449060bdf..49fffb9bce77 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -66,8 +66,8 @@ torture_param(long, loops, 10000, "Number of loops per experiment."); torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); // Number of runs. torture_param(int, nruns, 30, "Number of experiments to run."); -// Reader delay in microseconds, 0 for no delay. -torture_param(int, readdelay, 0, "Read-side delay in microseconds."); +// Reader delay in nanoseconds, 0 for no delay. +torture_param(int, readdelay, 0, "Read-side delay in nanoseconds."); #ifdef MODULE # define REFPERF_SHUTDOWN 0 @@ -111,12 +111,20 @@ struct ref_perf_ops { void (*init)(void); void (*cleanup)(void); void (*readsection)(const int nloops); - void (*delaysection)(const int nloops, const int ndelay); + void (*delaysection)(const int nloops, const int udl, const int ndl); const char *name; }; static struct ref_perf_ops *cur_ops; +static void un_delay(const int udl, const int ndl) +{ + if (udl) + udelay(udl); + if (ndl) + ndelay(ndl); +} + static void ref_rcu_read_section(const int nloops) { int i; @@ -127,13 +135,13 @@ static void ref_rcu_read_section(const int nloops) } } -static void ref_rcu_delay_section(const int nloops, const int ndelay) +static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl) { int i; for (i = nloops; i >= 0; i--) { rcu_read_lock(); - udelay(ndelay); + un_delay(udl, ndl); rcu_read_unlock(); } } @@ -165,14 +173,14 @@ static void srcu_ref_perf_read_section(const int nloops) } } -static void srcu_ref_perf_delay_section(const int nloops, const int ndelay) +static void srcu_ref_perf_delay_section(const int nloops, const int udl, const int ndl) { int i; int idx; for (i = nloops; i >= 0; i--) { idx = srcu_read_lock(srcu_ctlp); - udelay(ndelay); + un_delay(udl, ndl); srcu_read_unlock(srcu_ctlp, idx); } } @@ -197,13 +205,13 @@ static void ref_refcnt_section(const int nloops) } } -static void ref_refcnt_delay_section(const int nloops, const int ndelay) +static void ref_refcnt_delay_section(const int nloops, const int udl, const int ndl) { int i; for (i = nloops; i >= 0; i--) { atomic_inc(&refcnt); - udelay(ndelay); + un_delay(udl, ndl); atomic_dec(&refcnt); } } @@ -233,13 +241,13 @@ static void ref_rwlock_section(const int nloops) } } -static void ref_rwlock_delay_section(const int nloops, const int ndelay) +static void ref_rwlock_delay_section(const int nloops, const int udl, const int ndl) { int i; for (i = nloops; i >= 0; i--) { read_lock(&test_rwlock); - udelay(ndelay); + un_delay(udl, ndl); read_unlock(&test_rwlock); } } @@ -269,13 +277,13 @@ static void ref_rwsem_section(const int nloops) } } -static void ref_rwsem_delay_section(const int nloops, const int ndelay) +static void ref_rwsem_delay_section(const int nloops, const int udl, const int ndl) { int i; for (i = nloops; i >= 0; i--) { down_read(&test_rwsem); - udelay(ndelay); + un_delay(udl, ndl); up_read(&test_rwsem); } } @@ -292,7 +300,7 @@ static void rcu_perf_one_reader(void) if (readdelay <= 0) cur_ops->readsection(loops); else - cur_ops->delaysection(loops, readdelay); + cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000); } // Reader kthread. Repeatedly does empty RCU read-side -- cgit v1.2.3 From 72bb749e7048d0a8d7663b59ec1a33bd56c51083 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Jun 2020 08:34:41 -0700 Subject: refperf: Add test for RCU Tasks Trace readers. This commit adds testing for RCU Tasks Trace readers to the refperf module. Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 49fffb9bce77..da7de9ac548d 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -157,7 +158,6 @@ static struct ref_perf_ops rcu_ops = { .name = "rcu" }; - // Definitions for SRCU ref perf testing. DEFINE_STATIC_SRCU(srcu_refctl_perf); static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf; @@ -192,6 +192,35 @@ static struct ref_perf_ops srcu_ops = { .name = "srcu" }; +// Definitions for RCU Tasks Trace ref perf testing. +static void rcu_trace_ref_perf_read_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock_trace(); + rcu_read_unlock_trace(); + } +} + +static void rcu_trace_ref_perf_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock_trace(); + un_delay(udl, ndl); + rcu_read_unlock_trace(); + } +} + +static struct ref_perf_ops rcu_trace_ops = { + .init = rcu_sync_perf_init, + .readsection = rcu_trace_ref_perf_read_section, + .delaysection = rcu_trace_ref_perf_delay_section, + .name = "rcu-trace" +}; + // Definitions for reference count static atomic_t refcnt; @@ -584,7 +613,7 @@ ref_perf_init(void) long i; int firsterr = 0; static struct ref_perf_ops *perf_ops[] = { - &rcu_ops, &srcu_ops, &refcnt_ops, &rwlock_ops, &rwsem_ops, + &rcu_ops, &srcu_ops, &rcu_trace_ops, &refcnt_ops, &rwlock_ops, &rwsem_ops, }; if (!torture_init_begin(perf_type, verbose)) -- cgit v1.2.3 From e13ef442fe522fa1f604efec8c899a0e1fc3d426 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Jun 2020 11:56:34 -0700 Subject: refperf: Add test for RCU Tasks readers This commit adds testing for RCU Tasks readers to the refperf module. This also applies to RCU Rude readers, as both flavors have empty (as in non-existent) read-side markers. Signed-off-by: Paul E. McKenney --- kernel/rcu/refperf.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index da7de9ac548d..2bfdcdcb6bd1 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -192,6 +192,31 @@ static struct ref_perf_ops srcu_ops = { .name = "srcu" }; +// Definitions for RCU Tasks ref perf testing: Empty read markers. +// These definitions also work for RCU Rude readers. +static void rcu_tasks_ref_perf_read_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) + continue; +} + +static void rcu_tasks_ref_perf_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) + un_delay(udl, ndl); +} + +static struct ref_perf_ops rcu_tasks_ops = { + .init = rcu_sync_perf_init, + .readsection = rcu_tasks_ref_perf_read_section, + .delaysection = rcu_tasks_ref_perf_delay_section, + .name = "rcu-tasks" +}; + // Definitions for RCU Tasks Trace ref perf testing. static void rcu_trace_ref_perf_read_section(const int nloops) { @@ -613,7 +638,8 @@ ref_perf_init(void) long i; int firsterr = 0; static struct ref_perf_ops *perf_ops[] = { - &rcu_ops, &srcu_ops, &rcu_trace_ops, &refcnt_ops, &rwlock_ops, &rwsem_ops, + &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, + &refcnt_ops, &rwlock_ops, &rwsem_ops, }; if (!torture_init_begin(perf_type, verbose)) -- cgit v1.2.3 From c7dcf8106f7570b133b05ff68fd4100064965d9d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 12 Jun 2020 13:11:29 -0700 Subject: rcu-tasks: Fix synchronize_rcu_tasks_trace() header comment The synchronize_rcu_tasks_trace() header comment incorrectly claims that any number of things delimit RCU Tasks Trace read-side critical sections, when in fact only rcu_read_lock_trace() and rcu_read_unlock_trace() do so. This commit therefore fixes this comment, and, while in the area, fixes a typo in the rcu_read_lock_trace() header comment. Reported-by: Alexei Starovoitov Signed-off-by: Paul E. McKenney --- include/linux/rcupdate_trace.h | 4 ++-- kernel/rcu/tasks.h | 9 ++++----- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h index 4c25a41f8b27..d9015aac78c6 100644 --- a/include/linux/rcupdate_trace.h +++ b/include/linux/rcupdate_trace.h @@ -36,8 +36,8 @@ void rcu_read_unlock_trace_special(struct task_struct *t, int nesting); /** * rcu_read_lock_trace - mark beginning of RCU-trace read-side critical section * - * When synchronize_rcu_trace() is invoked by one task, then that task - * is guaranteed to block until all other tasks exit their read-side + * When synchronize_rcu_tasks_trace() is invoked by one task, then that + * task is guaranteed to block until all other tasks exit their read-side * critical sections. Similarly, if call_rcu_trace() is invoked on one * task while other tasks are within RCU read-side critical sections, * invocation of the corresponding RCU callback is deferred until after diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index ce23f6cc5043..a77298c1d126 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1118,11 +1118,10 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks_trace); * synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period * * Control will return to the caller some time after a trace rcu-tasks - * grace period has elapsed, in other words after all currently - * executing rcu-tasks read-side critical sections have elapsed. These - * read-side critical sections are delimited by calls to schedule(), - * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory, - * anyway) cond_resched(). + * grace period has elapsed, in other words after all currently executing + * rcu-tasks read-side critical sections have elapsed. These read-side + * critical sections are delimited by calls to rcu_read_lock_trace() + * and rcu_read_unlock_trace(). * * This is a very specialized primitive, intended only for a few uses in * tracing and other situations requiring manipulation of function preambles -- cgit v1.2.3 From 8e4ec3d02b549a731c94b4bcddff212bb92cdbaf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Jun 2020 11:33:54 -0700 Subject: refperf: Rename RCU_REF_PERF_TEST to RCU_REF_SCALE_TEST The old Kconfig option name is all too easy to conflate with the unrelated "perf" feature, so this commit renames RCU_REF_PERF_TEST to RCU_REF_SCALE_TEST. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 4 ++-- kernel/rcu/Makefile | 2 +- kernel/rcu/refperf.c | 6 +++--- tools/testing/selftests/rcutorture/configs/refperf/CFcommon | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 858765b7f644..3cf6132a4bb9 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -61,8 +61,8 @@ config RCU_TORTURE_TEST Say M if you want the RCU torture tests to build as a module. Say N if you are unsure. -config RCU_REF_PERF_TEST - tristate "Performance tests for read-side synchronization (RCU and others)" +config RCU_REF_SCALE_TEST + tristate "Scalability tests for read-side synchronization (RCU and others)" depends on DEBUG_KERNEL select TORTURE_TEST select SRCU diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index ba7d82609cbe..45d562de279a 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -12,7 +12,7 @@ obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o -obj-$(CONFIG_RCU_REF_PERF_TEST) += refperf.o +obj-$(CONFIG_RCU_REF_SCALE_TEST) += refperf.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_TINY_RCU) += tiny.o obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c index 2bfdcdcb6bd1..7c980573acbe 100644 --- a/kernel/rcu/refperf.c +++ b/kernel/rcu/refperf.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ // -// Performance test comparing RCU vs other mechanisms +// Scalability test comparing RCU vs other mechanisms // for acquiring references on objects. // // Copyright (C) Google, 2020. @@ -59,7 +59,7 @@ MODULE_PARM_DESC(perf_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); // Wait until there are multiple CPUs before starting test. -torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_PERF_TEST) ? 10 : 0, +torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0, "Holdoff time before test start (s)"); // Number of loops per experiment, all readers execute operations concurrently. torture_param(long, loops, 10000, "Number of loops per experiment."); @@ -656,7 +656,7 @@ ref_perf_init(void) for (i = 0; i < ARRAY_SIZE(perf_ops); i++) pr_cont(" %s", perf_ops[i]->name); pr_cont("\n"); - WARN_ON(!IS_MODULE(CONFIG_RCU_REF_PERF_TEST)); + WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST)); firsterr = -EINVAL; cur_ops = NULL; goto unwind; diff --git a/tools/testing/selftests/rcutorture/configs/refperf/CFcommon b/tools/testing/selftests/rcutorture/configs/refperf/CFcommon index 8ba5ba207503..a98b58b54bb1 100644 --- a/tools/testing/selftests/rcutorture/configs/refperf/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/refperf/CFcommon @@ -1,2 +1,2 @@ -CONFIG_RCU_REF_PERF_TEST=y +CONFIG_RCU_REF_SCALE_TEST=y CONFIG_PRINTK_TIME=y -- cgit v1.2.3 From 1fbeb3a8c4de29433a8d230ee600b13d369b6c0f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Jun 2020 11:53:53 -0700 Subject: refperf: Rename refperf.c to refscale.c and change internal names This commit further avoids conflation of refperf with the kernel's perf feature by renaming kernel/rcu/refperf.c to kernel/rcu/refscale.c, and also by similarly renaming the functions and variables inside this file. This has the side effect of changing the names of the kernel boot parameters, so kernel-parameters.txt and ver_functions.sh are also updated. The rcutorture --torture type remains refperf, and this will be addressed in a separate commit. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 17 +- kernel/rcu/Makefile | 2 +- kernel/rcu/refperf.c | 717 --------------------- kernel/rcu/refscale.c | 717 +++++++++++++++++++++ .../rcutorture/configs/refperf/ver_functions.sh | 4 +- 5 files changed, 730 insertions(+), 727 deletions(-) delete mode 100644 kernel/rcu/refperf.c create mode 100644 kernel/rcu/refscale.c (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 20cd00b78fc4..a4e4e0f6a550 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4407,13 +4407,13 @@ reboot_cpu is s[mp]#### with #### being the processor to be used for rebooting. - refperf.holdoff= [KNL] + refscale.holdoff= [KNL] Set test-start holdoff period. The purpose of this parameter is to delay the start of the test until boot completes in order to avoid interference. - refperf.loops= [KNL] + refscale.loops= [KNL] Set the number of loops over the synchronization primitive under test. Increasing this number reduces noise due to loop start/end overhead, @@ -4421,26 +4421,29 @@ noise to a handful of picoseconds on ca. 2020 x86 laptops. - refperf.nreaders= [KNL] + refscale.nreaders= [KNL] Set number of readers. The default value of -1 selects N, where N is roughly 75% of the number of CPUs. A value of zero is an interesting choice. - refperf.nruns= [KNL] + refscale.nruns= [KNL] Set number of runs, each of which is dumped onto the console log. - refperf.readdelay= [KNL] + refscale.readdelay= [KNL] Set the read-side critical-section duration, measured in microseconds. - refperf.shutdown= [KNL] + refscale.scale_type= [KNL] + Specify the read-protection implementation to test. + + refscale.shutdown= [KNL] Shut down the system at the end of the performance test. This defaults to 1 (shut it down) when rcuperf is built into the kernel and to 0 (leave it running) when rcuperf is built as a module. - refperf.verbose= [KNL] + refscale.verbose= [KNL] Enable additional printk() statements. relax_domain_level= diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 45d562de279a..95f5117ef8da 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -12,7 +12,7 @@ obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o -obj-$(CONFIG_RCU_REF_SCALE_TEST) += refperf.o +obj-$(CONFIG_RCU_REF_SCALE_TEST) += refscale.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_TINY_RCU) += tiny.o obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c deleted file mode 100644 index 7c980573acbe..000000000000 --- a/kernel/rcu/refperf.c +++ /dev/null @@ -1,717 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -// -// Scalability test comparing RCU vs other mechanisms -// for acquiring references on objects. -// -// Copyright (C) Google, 2020. -// -// Author: Joel Fernandes - -#define pr_fmt(fmt) fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "rcu.h" - -#define PERF_FLAG "-ref-perf: " - -#define PERFOUT(s, x...) \ - pr_alert("%s" PERF_FLAG s, perf_type, ## x) - -#define VERBOSE_PERFOUT(s, x...) \ - do { if (verbose) pr_alert("%s" PERF_FLAG s, perf_type, ## x); } while (0) - -#define VERBOSE_PERFOUT_ERRSTRING(s, x...) \ - do { if (verbose) pr_alert("%s" PERF_FLAG "!!! " s, perf_type, ## x); } while (0) - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Joel Fernandes (Google) "); - -static char *perf_type = "rcu"; -module_param(perf_type, charp, 0444); -MODULE_PARM_DESC(perf_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); - -torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); - -// Wait until there are multiple CPUs before starting test. -torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0, - "Holdoff time before test start (s)"); -// Number of loops per experiment, all readers execute operations concurrently. -torture_param(long, loops, 10000, "Number of loops per experiment."); -// Number of readers, with -1 defaulting to about 75% of the CPUs. -torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); -// Number of runs. -torture_param(int, nruns, 30, "Number of experiments to run."); -// Reader delay in nanoseconds, 0 for no delay. -torture_param(int, readdelay, 0, "Read-side delay in nanoseconds."); - -#ifdef MODULE -# define REFPERF_SHUTDOWN 0 -#else -# define REFPERF_SHUTDOWN 1 -#endif - -torture_param(bool, shutdown, REFPERF_SHUTDOWN, - "Shutdown at end of performance tests."); - -struct reader_task { - struct task_struct *task; - int start_reader; - wait_queue_head_t wq; - u64 last_duration_ns; -}; - -static struct task_struct *shutdown_task; -static wait_queue_head_t shutdown_wq; - -static struct task_struct *main_task; -static wait_queue_head_t main_wq; -static int shutdown_start; - -static struct reader_task *reader_tasks; - -// Number of readers that are part of the current experiment. -static atomic_t nreaders_exp; - -// Use to wait for all threads to start. -static atomic_t n_init; -static atomic_t n_started; -static atomic_t n_warmedup; -static atomic_t n_cooleddown; - -// Track which experiment is currently running. -static int exp_idx; - -// Operations vector for selecting different types of tests. -struct ref_perf_ops { - void (*init)(void); - void (*cleanup)(void); - void (*readsection)(const int nloops); - void (*delaysection)(const int nloops, const int udl, const int ndl); - const char *name; -}; - -static struct ref_perf_ops *cur_ops; - -static void un_delay(const int udl, const int ndl) -{ - if (udl) - udelay(udl); - if (ndl) - ndelay(ndl); -} - -static void ref_rcu_read_section(const int nloops) -{ - int i; - - for (i = nloops; i >= 0; i--) { - rcu_read_lock(); - rcu_read_unlock(); - } -} - -static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl) -{ - int i; - - for (i = nloops; i >= 0; i--) { - rcu_read_lock(); - un_delay(udl, ndl); - rcu_read_unlock(); - } -} - -static void rcu_sync_perf_init(void) -{ -} - -static struct ref_perf_ops rcu_ops = { - .init = rcu_sync_perf_init, - .readsection = ref_rcu_read_section, - .delaysection = ref_rcu_delay_section, - .name = "rcu" -}; - -// Definitions for SRCU ref perf testing. -DEFINE_STATIC_SRCU(srcu_refctl_perf); -static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf; - -static void srcu_ref_perf_read_section(const int nloops) -{ - int i; - int idx; - - for (i = nloops; i >= 0; i--) { - idx = srcu_read_lock(srcu_ctlp); - srcu_read_unlock(srcu_ctlp, idx); - } -} - -static void srcu_ref_perf_delay_section(const int nloops, const int udl, const int ndl) -{ - int i; - int idx; - - for (i = nloops; i >= 0; i--) { - idx = srcu_read_lock(srcu_ctlp); - un_delay(udl, ndl); - srcu_read_unlock(srcu_ctlp, idx); - } -} - -static struct ref_perf_ops srcu_ops = { - .init = rcu_sync_perf_init, - .readsection = srcu_ref_perf_read_section, - .delaysection = srcu_ref_perf_delay_section, - .name = "srcu" -}; - -// Definitions for RCU Tasks ref perf testing: Empty read markers. -// These definitions also work for RCU Rude readers. -static void rcu_tasks_ref_perf_read_section(const int nloops) -{ - int i; - - for (i = nloops; i >= 0; i--) - continue; -} - -static void rcu_tasks_ref_perf_delay_section(const int nloops, const int udl, const int ndl) -{ - int i; - - for (i = nloops; i >= 0; i--) - un_delay(udl, ndl); -} - -static struct ref_perf_ops rcu_tasks_ops = { - .init = rcu_sync_perf_init, - .readsection = rcu_tasks_ref_perf_read_section, - .delaysection = rcu_tasks_ref_perf_delay_section, - .name = "rcu-tasks" -}; - -// Definitions for RCU Tasks Trace ref perf testing. -static void rcu_trace_ref_perf_read_section(const int nloops) -{ - int i; - - for (i = nloops; i >= 0; i--) { - rcu_read_lock_trace(); - rcu_read_unlock_trace(); - } -} - -static void rcu_trace_ref_perf_delay_section(const int nloops, const int udl, const int ndl) -{ - int i; - - for (i = nloops; i >= 0; i--) { - rcu_read_lock_trace(); - un_delay(udl, ndl); - rcu_read_unlock_trace(); - } -} - -static struct ref_perf_ops rcu_trace_ops = { - .init = rcu_sync_perf_init, - .readsection = rcu_trace_ref_perf_read_section, - .delaysection = rcu_trace_ref_perf_delay_section, - .name = "rcu-trace" -}; - -// Definitions for reference count -static atomic_t refcnt; - -static void ref_refcnt_section(const int nloops) -{ - int i; - - for (i = nloops; i >= 0; i--) { - atomic_inc(&refcnt); - atomic_dec(&refcnt); - } -} - -static void ref_refcnt_delay_section(const int nloops, const int udl, const int ndl) -{ - int i; - - for (i = nloops; i >= 0; i--) { - atomic_inc(&refcnt); - un_delay(udl, ndl); - atomic_dec(&refcnt); - } -} - -static struct ref_perf_ops refcnt_ops = { - .init = rcu_sync_perf_init, - .readsection = ref_refcnt_section, - .delaysection = ref_refcnt_delay_section, - .name = "refcnt" -}; - -// Definitions for rwlock -static rwlock_t test_rwlock; - -static void ref_rwlock_init(void) -{ - rwlock_init(&test_rwlock); -} - -static void ref_rwlock_section(const int nloops) -{ - int i; - - for (i = nloops; i >= 0; i--) { - read_lock(&test_rwlock); - read_unlock(&test_rwlock); - } -} - -static void ref_rwlock_delay_section(const int nloops, const int udl, const int ndl) -{ - int i; - - for (i = nloops; i >= 0; i--) { - read_lock(&test_rwlock); - un_delay(udl, ndl); - read_unlock(&test_rwlock); - } -} - -static struct ref_perf_ops rwlock_ops = { - .init = ref_rwlock_init, - .readsection = ref_rwlock_section, - .delaysection = ref_rwlock_delay_section, - .name = "rwlock" -}; - -// Definitions for rwsem -static struct rw_semaphore test_rwsem; - -static void ref_rwsem_init(void) -{ - init_rwsem(&test_rwsem); -} - -static void ref_rwsem_section(const int nloops) -{ - int i; - - for (i = nloops; i >= 0; i--) { - down_read(&test_rwsem); - up_read(&test_rwsem); - } -} - -static void ref_rwsem_delay_section(const int nloops, const int udl, const int ndl) -{ - int i; - - for (i = nloops; i >= 0; i--) { - down_read(&test_rwsem); - un_delay(udl, ndl); - up_read(&test_rwsem); - } -} - -static struct ref_perf_ops rwsem_ops = { - .init = ref_rwsem_init, - .readsection = ref_rwsem_section, - .delaysection = ref_rwsem_delay_section, - .name = "rwsem" -}; - -static void rcu_perf_one_reader(void) -{ - if (readdelay <= 0) - cur_ops->readsection(loops); - else - cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000); -} - -// Reader kthread. Repeatedly does empty RCU read-side -// critical section, minimizing update-side interference. -static int -ref_perf_reader(void *arg) -{ - unsigned long flags; - long me = (long)arg; - struct reader_task *rt = &(reader_tasks[me]); - u64 start; - s64 duration; - - VERBOSE_PERFOUT("ref_perf_reader %ld: task started", me); - set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); - set_user_nice(current, MAX_NICE); - atomic_inc(&n_init); - if (holdoff) - schedule_timeout_interruptible(holdoff * HZ); -repeat: - VERBOSE_PERFOUT("ref_perf_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); - - // Wait for signal that this reader can start. - wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) || - torture_must_stop()); - - if (torture_must_stop()) - goto end; - - // Make sure that the CPU is affinitized appropriately during testing. - WARN_ON_ONCE(smp_processor_id() != me); - - WRITE_ONCE(rt->start_reader, 0); - if (!atomic_dec_return(&n_started)) - while (atomic_read_acquire(&n_started)) - cpu_relax(); - - VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx); - - - // To reduce noise, do an initial cache-warming invocation, check - // in, and then keep warming until everyone has checked in. - rcu_perf_one_reader(); - if (!atomic_dec_return(&n_warmedup)) - while (atomic_read_acquire(&n_warmedup)) - rcu_perf_one_reader(); - // Also keep interrupts disabled. This also has the effect - // of preventing entries into slow path for rcu_read_unlock(). - local_irq_save(flags); - start = ktime_get_mono_fast_ns(); - - rcu_perf_one_reader(); - - duration = ktime_get_mono_fast_ns() - start; - local_irq_restore(flags); - - rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration; - // To reduce runtime-skew noise, do maintain-load invocations until - // everyone is done. - if (!atomic_dec_return(&n_cooleddown)) - while (atomic_read_acquire(&n_cooleddown)) - rcu_perf_one_reader(); - - if (atomic_dec_and_test(&nreaders_exp)) - wake_up(&main_wq); - - VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d ended, (readers remaining=%d)", - me, exp_idx, atomic_read(&nreaders_exp)); - - if (!torture_must_stop()) - goto repeat; -end: - torture_kthread_stopping("ref_perf_reader"); - return 0; -} - -static void reset_readers(void) -{ - int i; - struct reader_task *rt; - - for (i = 0; i < nreaders; i++) { - rt = &(reader_tasks[i]); - - rt->last_duration_ns = 0; - } -} - -// Print the results of each reader and return the sum of all their durations. -static u64 process_durations(int n) -{ - int i; - struct reader_task *rt; - char buf1[64]; - char *buf; - u64 sum = 0; - - buf = kmalloc(128 + nreaders * 32, GFP_KERNEL); - if (!buf) - return 0; - buf[0] = 0; - sprintf(buf, "Experiment #%d (Format: :)", - exp_idx); - - for (i = 0; i < n && !torture_must_stop(); i++) { - rt = &(reader_tasks[i]); - sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns); - - if (i % 5 == 0) - strcat(buf, "\n"); - strcat(buf, buf1); - - sum += rt->last_duration_ns; - } - strcat(buf, "\n"); - - PERFOUT("%s\n", buf); - - kfree(buf); - return sum; -} - -// The main_func is the main orchestrator, it performs a bunch of -// experiments. For every experiment, it orders all the readers -// involved to start and waits for them to finish the experiment. It -// then reads their timestamps and starts the next experiment. Each -// experiment progresses from 1 concurrent reader to N of them at which -// point all the timestamps are printed. -static int main_func(void *arg) -{ - bool errexit = false; - int exp, r; - char buf1[64]; - char *buf; - u64 *result_avg; - - set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids)); - set_user_nice(current, MAX_NICE); - - VERBOSE_PERFOUT("main_func task started"); - result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL); - buf = kzalloc(64 + nruns * 32, GFP_KERNEL); - if (!result_avg || !buf) { - VERBOSE_PERFOUT_ERRSTRING("out of memory"); - errexit = true; - } - if (holdoff) - schedule_timeout_interruptible(holdoff * HZ); - - // Wait for all threads to start. - atomic_inc(&n_init); - while (atomic_read(&n_init) < nreaders + 1) - schedule_timeout_uninterruptible(1); - - // Start exp readers up per experiment - for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { - if (errexit) - break; - if (torture_must_stop()) - goto end; - - reset_readers(); - atomic_set(&nreaders_exp, nreaders); - atomic_set(&n_started, nreaders); - atomic_set(&n_warmedup, nreaders); - atomic_set(&n_cooleddown, nreaders); - - exp_idx = exp; - - for (r = 0; r < nreaders; r++) { - smp_store_release(&reader_tasks[r].start_reader, 1); - wake_up(&reader_tasks[r].wq); - } - - VERBOSE_PERFOUT("main_func: experiment started, waiting for %d readers", - nreaders); - - wait_event(main_wq, - !atomic_read(&nreaders_exp) || torture_must_stop()); - - VERBOSE_PERFOUT("main_func: experiment ended"); - - if (torture_must_stop()) - goto end; - - result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops); - } - - // Print the average of all experiments - PERFOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n"); - - buf[0] = 0; - strcat(buf, "\n"); - strcat(buf, "Runs\tTime(ns)\n"); - - for (exp = 0; exp < nruns; exp++) { - u64 avg; - u32 rem; - - if (errexit) - break; - avg = div_u64_rem(result_avg[exp], 1000, &rem); - sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem); - strcat(buf, buf1); - } - - if (!errexit) - PERFOUT("%s", buf); - - // This will shutdown everything including us. - if (shutdown) { - shutdown_start = 1; - wake_up(&shutdown_wq); - } - - // Wait for torture to stop us - while (!torture_must_stop()) - schedule_timeout_uninterruptible(1); - -end: - torture_kthread_stopping("main_func"); - kfree(result_avg); - kfree(buf); - return 0; -} - -static void -ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag) -{ - pr_alert("%s" PERF_FLAG - "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", perf_type, tag, - verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay); -} - -static void -ref_perf_cleanup(void) -{ - int i; - - if (torture_cleanup_begin()) - return; - - if (!cur_ops) { - torture_cleanup_end(); - return; - } - - if (reader_tasks) { - for (i = 0; i < nreaders; i++) - torture_stop_kthread("ref_perf_reader", - reader_tasks[i].task); - } - kfree(reader_tasks); - - torture_stop_kthread("main_task", main_task); - kfree(main_task); - - // Do perf-type-specific cleanup operations. - if (cur_ops->cleanup != NULL) - cur_ops->cleanup(); - - torture_cleanup_end(); -} - -// Shutdown kthread. Just waits to be awakened, then shuts down system. -static int -ref_perf_shutdown(void *arg) -{ - wait_event(shutdown_wq, shutdown_start); - - smp_mb(); // Wake before output. - ref_perf_cleanup(); - kernel_power_off(); - - return -EINVAL; -} - -static int __init -ref_perf_init(void) -{ - long i; - int firsterr = 0; - static struct ref_perf_ops *perf_ops[] = { - &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, - &refcnt_ops, &rwlock_ops, &rwsem_ops, - }; - - if (!torture_init_begin(perf_type, verbose)) - return -EBUSY; - - for (i = 0; i < ARRAY_SIZE(perf_ops); i++) { - cur_ops = perf_ops[i]; - if (strcmp(perf_type, cur_ops->name) == 0) - break; - } - if (i == ARRAY_SIZE(perf_ops)) { - pr_alert("rcu-perf: invalid perf type: \"%s\"\n", perf_type); - pr_alert("rcu-perf types:"); - for (i = 0; i < ARRAY_SIZE(perf_ops); i++) - pr_cont(" %s", perf_ops[i]->name); - pr_cont("\n"); - WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST)); - firsterr = -EINVAL; - cur_ops = NULL; - goto unwind; - } - if (cur_ops->init) - cur_ops->init(); - - ref_perf_print_module_parms(cur_ops, "Start of test"); - - // Shutdown task - if (shutdown) { - init_waitqueue_head(&shutdown_wq); - firsterr = torture_create_kthread(ref_perf_shutdown, NULL, - shutdown_task); - if (firsterr) - goto unwind; - schedule_timeout_uninterruptible(1); - } - - // Reader tasks (default to ~75% of online CPUs). - if (nreaders < 0) - nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); - reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), - GFP_KERNEL); - if (!reader_tasks) { - VERBOSE_PERFOUT_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - - VERBOSE_PERFOUT("Starting %d reader threads\n", nreaders); - - for (i = 0; i < nreaders; i++) { - firsterr = torture_create_kthread(ref_perf_reader, (void *)i, - reader_tasks[i].task); - if (firsterr) - goto unwind; - - init_waitqueue_head(&(reader_tasks[i].wq)); - } - - // Main Task - init_waitqueue_head(&main_wq); - firsterr = torture_create_kthread(main_func, NULL, main_task); - if (firsterr) - goto unwind; - - torture_init_end(); - return 0; - -unwind: - torture_init_end(); - ref_perf_cleanup(); - return firsterr; -} - -module_init(ref_perf_init); -module_exit(ref_perf_cleanup); diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c new file mode 100644 index 000000000000..d9291f883b54 --- /dev/null +++ b/kernel/rcu/refscale.c @@ -0,0 +1,717 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Scalability test comparing RCU vs other mechanisms +// for acquiring references on objects. +// +// Copyright (C) Google, 2020. +// +// Author: Joel Fernandes + +#define pr_fmt(fmt) fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rcu.h" + +#define SCALE_FLAG "-ref-scale: " + +#define SCALEOUT(s, x...) \ + pr_alert("%s" SCALE_FLAG s, scale_type, ## x) + +#define VERBOSE_SCALEOUT(s, x...) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG s, scale_type, ## x); } while (0) + +#define VERBOSE_SCALEOUT_ERRSTRING(s, x...) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! " s, scale_type, ## x); } while (0) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Joel Fernandes (Google) "); + +static char *scale_type = "rcu"; +module_param(scale_type, charp, 0444); +MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); + +torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); + +// Wait until there are multiple CPUs before starting test. +torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0, + "Holdoff time before test start (s)"); +// Number of loops per experiment, all readers execute operations concurrently. +torture_param(long, loops, 10000, "Number of loops per experiment."); +// Number of readers, with -1 defaulting to about 75% of the CPUs. +torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); +// Number of runs. +torture_param(int, nruns, 30, "Number of experiments to run."); +// Reader delay in nanoseconds, 0 for no delay. +torture_param(int, readdelay, 0, "Read-side delay in nanoseconds."); + +#ifdef MODULE +# define REFSCALE_SHUTDOWN 0 +#else +# define REFSCALE_SHUTDOWN 1 +#endif + +torture_param(bool, shutdown, REFSCALE_SHUTDOWN, + "Shutdown at end of scalability tests."); + +struct reader_task { + struct task_struct *task; + int start_reader; + wait_queue_head_t wq; + u64 last_duration_ns; +}; + +static struct task_struct *shutdown_task; +static wait_queue_head_t shutdown_wq; + +static struct task_struct *main_task; +static wait_queue_head_t main_wq; +static int shutdown_start; + +static struct reader_task *reader_tasks; + +// Number of readers that are part of the current experiment. +static atomic_t nreaders_exp; + +// Use to wait for all threads to start. +static atomic_t n_init; +static atomic_t n_started; +static atomic_t n_warmedup; +static atomic_t n_cooleddown; + +// Track which experiment is currently running. +static int exp_idx; + +// Operations vector for selecting different types of tests. +struct ref_scale_ops { + void (*init)(void); + void (*cleanup)(void); + void (*readsection)(const int nloops); + void (*delaysection)(const int nloops, const int udl, const int ndl); + const char *name; +}; + +static struct ref_scale_ops *cur_ops; + +static void un_delay(const int udl, const int ndl) +{ + if (udl) + udelay(udl); + if (ndl) + ndelay(ndl); +} + +static void ref_rcu_read_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock(); + rcu_read_unlock(); + } +} + +static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock(); + un_delay(udl, ndl); + rcu_read_unlock(); + } +} + +static void rcu_sync_scale_init(void) +{ +} + +static struct ref_scale_ops rcu_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_rcu_read_section, + .delaysection = ref_rcu_delay_section, + .name = "rcu" +}; + +// Definitions for SRCU ref scale testing. +DEFINE_STATIC_SRCU(srcu_refctl_scale); +static struct srcu_struct *srcu_ctlp = &srcu_refctl_scale; + +static void srcu_ref_scale_read_section(const int nloops) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock(srcu_ctlp); + srcu_read_unlock(srcu_ctlp, idx); + } +} + +static void srcu_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock(srcu_ctlp); + un_delay(udl, ndl); + srcu_read_unlock(srcu_ctlp, idx); + } +} + +static struct ref_scale_ops srcu_ops = { + .init = rcu_sync_scale_init, + .readsection = srcu_ref_scale_read_section, + .delaysection = srcu_ref_scale_delay_section, + .name = "srcu" +}; + +// Definitions for RCU Tasks ref scale testing: Empty read markers. +// These definitions also work for RCU Rude readers. +static void rcu_tasks_ref_scale_read_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) + continue; +} + +static void rcu_tasks_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) + un_delay(udl, ndl); +} + +static struct ref_scale_ops rcu_tasks_ops = { + .init = rcu_sync_scale_init, + .readsection = rcu_tasks_ref_scale_read_section, + .delaysection = rcu_tasks_ref_scale_delay_section, + .name = "rcu-tasks" +}; + +// Definitions for RCU Tasks Trace ref scale testing. +static void rcu_trace_ref_scale_read_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock_trace(); + rcu_read_unlock_trace(); + } +} + +static void rcu_trace_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock_trace(); + un_delay(udl, ndl); + rcu_read_unlock_trace(); + } +} + +static struct ref_scale_ops rcu_trace_ops = { + .init = rcu_sync_scale_init, + .readsection = rcu_trace_ref_scale_read_section, + .delaysection = rcu_trace_ref_scale_delay_section, + .name = "rcu-trace" +}; + +// Definitions for reference count +static atomic_t refcnt; + +static void ref_refcnt_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + atomic_inc(&refcnt); + atomic_dec(&refcnt); + } +} + +static void ref_refcnt_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + atomic_inc(&refcnt); + un_delay(udl, ndl); + atomic_dec(&refcnt); + } +} + +static struct ref_scale_ops refcnt_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_refcnt_section, + .delaysection = ref_refcnt_delay_section, + .name = "refcnt" +}; + +// Definitions for rwlock +static rwlock_t test_rwlock; + +static void ref_rwlock_init(void) +{ + rwlock_init(&test_rwlock); +} + +static void ref_rwlock_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + read_lock(&test_rwlock); + read_unlock(&test_rwlock); + } +} + +static void ref_rwlock_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + read_lock(&test_rwlock); + un_delay(udl, ndl); + read_unlock(&test_rwlock); + } +} + +static struct ref_scale_ops rwlock_ops = { + .init = ref_rwlock_init, + .readsection = ref_rwlock_section, + .delaysection = ref_rwlock_delay_section, + .name = "rwlock" +}; + +// Definitions for rwsem +static struct rw_semaphore test_rwsem; + +static void ref_rwsem_init(void) +{ + init_rwsem(&test_rwsem); +} + +static void ref_rwsem_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + down_read(&test_rwsem); + up_read(&test_rwsem); + } +} + +static void ref_rwsem_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + down_read(&test_rwsem); + un_delay(udl, ndl); + up_read(&test_rwsem); + } +} + +static struct ref_scale_ops rwsem_ops = { + .init = ref_rwsem_init, + .readsection = ref_rwsem_section, + .delaysection = ref_rwsem_delay_section, + .name = "rwsem" +}; + +static void rcu_scale_one_reader(void) +{ + if (readdelay <= 0) + cur_ops->readsection(loops); + else + cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000); +} + +// Reader kthread. Repeatedly does empty RCU read-side +// critical section, minimizing update-side interference. +static int +ref_scale_reader(void *arg) +{ + unsigned long flags; + long me = (long)arg; + struct reader_task *rt = &(reader_tasks[me]); + u64 start; + s64 duration; + + VERBOSE_SCALEOUT("ref_scale_reader %ld: task started", me); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + atomic_inc(&n_init); + if (holdoff) + schedule_timeout_interruptible(holdoff * HZ); +repeat: + VERBOSE_SCALEOUT("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); + + // Wait for signal that this reader can start. + wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) || + torture_must_stop()); + + if (torture_must_stop()) + goto end; + + // Make sure that the CPU is affinitized appropriately during testing. + WARN_ON_ONCE(smp_processor_id() != me); + + WRITE_ONCE(rt->start_reader, 0); + if (!atomic_dec_return(&n_started)) + while (atomic_read_acquire(&n_started)) + cpu_relax(); + + VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d started", me, exp_idx); + + + // To reduce noise, do an initial cache-warming invocation, check + // in, and then keep warming until everyone has checked in. + rcu_scale_one_reader(); + if (!atomic_dec_return(&n_warmedup)) + while (atomic_read_acquire(&n_warmedup)) + rcu_scale_one_reader(); + // Also keep interrupts disabled. This also has the effect + // of preventing entries into slow path for rcu_read_unlock(). + local_irq_save(flags); + start = ktime_get_mono_fast_ns(); + + rcu_scale_one_reader(); + + duration = ktime_get_mono_fast_ns() - start; + local_irq_restore(flags); + + rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration; + // To reduce runtime-skew noise, do maintain-load invocations until + // everyone is done. + if (!atomic_dec_return(&n_cooleddown)) + while (atomic_read_acquire(&n_cooleddown)) + rcu_scale_one_reader(); + + if (atomic_dec_and_test(&nreaders_exp)) + wake_up(&main_wq); + + VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)", + me, exp_idx, atomic_read(&nreaders_exp)); + + if (!torture_must_stop()) + goto repeat; +end: + torture_kthread_stopping("ref_scale_reader"); + return 0; +} + +static void reset_readers(void) +{ + int i; + struct reader_task *rt; + + for (i = 0; i < nreaders; i++) { + rt = &(reader_tasks[i]); + + rt->last_duration_ns = 0; + } +} + +// Print the results of each reader and return the sum of all their durations. +static u64 process_durations(int n) +{ + int i; + struct reader_task *rt; + char buf1[64]; + char *buf; + u64 sum = 0; + + buf = kmalloc(128 + nreaders * 32, GFP_KERNEL); + if (!buf) + return 0; + buf[0] = 0; + sprintf(buf, "Experiment #%d (Format: :)", + exp_idx); + + for (i = 0; i < n && !torture_must_stop(); i++) { + rt = &(reader_tasks[i]); + sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns); + + if (i % 5 == 0) + strcat(buf, "\n"); + strcat(buf, buf1); + + sum += rt->last_duration_ns; + } + strcat(buf, "\n"); + + SCALEOUT("%s\n", buf); + + kfree(buf); + return sum; +} + +// The main_func is the main orchestrator, it performs a bunch of +// experiments. For every experiment, it orders all the readers +// involved to start and waits for them to finish the experiment. It +// then reads their timestamps and starts the next experiment. Each +// experiment progresses from 1 concurrent reader to N of them at which +// point all the timestamps are printed. +static int main_func(void *arg) +{ + bool errexit = false; + int exp, r; + char buf1[64]; + char *buf; + u64 *result_avg; + + set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + + VERBOSE_SCALEOUT("main_func task started"); + result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL); + buf = kzalloc(64 + nruns * 32, GFP_KERNEL); + if (!result_avg || !buf) { + VERBOSE_SCALEOUT_ERRSTRING("out of memory"); + errexit = true; + } + if (holdoff) + schedule_timeout_interruptible(holdoff * HZ); + + // Wait for all threads to start. + atomic_inc(&n_init); + while (atomic_read(&n_init) < nreaders + 1) + schedule_timeout_uninterruptible(1); + + // Start exp readers up per experiment + for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { + if (errexit) + break; + if (torture_must_stop()) + goto end; + + reset_readers(); + atomic_set(&nreaders_exp, nreaders); + atomic_set(&n_started, nreaders); + atomic_set(&n_warmedup, nreaders); + atomic_set(&n_cooleddown, nreaders); + + exp_idx = exp; + + for (r = 0; r < nreaders; r++) { + smp_store_release(&reader_tasks[r].start_reader, 1); + wake_up(&reader_tasks[r].wq); + } + + VERBOSE_SCALEOUT("main_func: experiment started, waiting for %d readers", + nreaders); + + wait_event(main_wq, + !atomic_read(&nreaders_exp) || torture_must_stop()); + + VERBOSE_SCALEOUT("main_func: experiment ended"); + + if (torture_must_stop()) + goto end; + + result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops); + } + + // Print the average of all experiments + SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n"); + + buf[0] = 0; + strcat(buf, "\n"); + strcat(buf, "Runs\tTime(ns)\n"); + + for (exp = 0; exp < nruns; exp++) { + u64 avg; + u32 rem; + + if (errexit) + break; + avg = div_u64_rem(result_avg[exp], 1000, &rem); + sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem); + strcat(buf, buf1); + } + + if (!errexit) + SCALEOUT("%s", buf); + + // This will shutdown everything including us. + if (shutdown) { + shutdown_start = 1; + wake_up(&shutdown_wq); + } + + // Wait for torture to stop us + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); + +end: + torture_kthread_stopping("main_func"); + kfree(result_avg); + kfree(buf); + return 0; +} + +static void +ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag) +{ + pr_alert("%s" SCALE_FLAG + "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, + verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay); +} + +static void +ref_scale_cleanup(void) +{ + int i; + + if (torture_cleanup_begin()) + return; + + if (!cur_ops) { + torture_cleanup_end(); + return; + } + + if (reader_tasks) { + for (i = 0; i < nreaders; i++) + torture_stop_kthread("ref_scale_reader", + reader_tasks[i].task); + } + kfree(reader_tasks); + + torture_stop_kthread("main_task", main_task); + kfree(main_task); + + // Do scale-type-specific cleanup operations. + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +// Shutdown kthread. Just waits to be awakened, then shuts down system. +static int +ref_scale_shutdown(void *arg) +{ + wait_event(shutdown_wq, shutdown_start); + + smp_mb(); // Wake before output. + ref_scale_cleanup(); + kernel_power_off(); + + return -EINVAL; +} + +static int __init +ref_scale_init(void) +{ + long i; + int firsterr = 0; + static struct ref_scale_ops *scale_ops[] = { + &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, + &refcnt_ops, &rwlock_ops, &rwsem_ops, + }; + + if (!torture_init_begin(scale_type, verbose)) + return -EBUSY; + + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) { + cur_ops = scale_ops[i]; + if (strcmp(scale_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(scale_ops)) { + pr_alert("rcu-scale: invalid scale type: \"%s\"\n", scale_type); + pr_alert("rcu-scale types:"); + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) + pr_cont(" %s", scale_ops[i]->name); + pr_cont("\n"); + WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST)); + firsterr = -EINVAL; + cur_ops = NULL; + goto unwind; + } + if (cur_ops->init) + cur_ops->init(); + + ref_scale_print_module_parms(cur_ops, "Start of test"); + + // Shutdown task + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(ref_scale_shutdown, NULL, + shutdown_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + } + + // Reader tasks (default to ~75% of online CPUs). + if (nreaders < 0) + nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); + reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), + GFP_KERNEL); + if (!reader_tasks) { + VERBOSE_SCALEOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + + VERBOSE_SCALEOUT("Starting %d reader threads\n", nreaders); + + for (i = 0; i < nreaders; i++) { + firsterr = torture_create_kthread(ref_scale_reader, (void *)i, + reader_tasks[i].task); + if (firsterr) + goto unwind; + + init_waitqueue_head(&(reader_tasks[i].wq)); + } + + // Main Task + init_waitqueue_head(&main_wq); + firsterr = torture_create_kthread(main_func, NULL, main_task); + if (firsterr) + goto unwind; + + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + ref_scale_cleanup(); + return firsterr; +} + +module_init(ref_scale_init); +module_exit(ref_scale_cleanup); diff --git a/tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh index 489f05dd929a..321e82641287 100644 --- a/tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh +++ b/tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh @@ -11,6 +11,6 @@ # # Adds per-version torture-module parameters to kernels supporting them. per_version_boot_params () { - echo $1 refperf.shutdown=1 \ - refperf.verbose=1 + echo $1 refscale.shutdown=1 \ + refscale.verbose=1 } -- cgit v1.2.3 From 7fef6cff8f2814bf8eb632e2bb8f0a987ffd9ece Mon Sep 17 00:00:00 2001 From: Ethon Paul Date: Sat, 18 Apr 2020 19:46:47 +0800 Subject: srcu: Fix a typo in comment "amoritized"->"amortized" This commit fixes a typo in a comment. Signed-off-by: Ethon Paul Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 6d3ef700fb0e..8ff71e5d0fe8 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -766,7 +766,7 @@ static void srcu_flip(struct srcu_struct *ssp) * it, if this function was preempted for enough time for the counters * to wrap, it really doesn't matter whether or not we expedite the grace * period. The extra overhead of a needlessly expedited grace period is - * negligible when amoritized over that time period, and the extra latency + * negligible when amortized over that time period, and the extra latency * of a needlessly non-expedited grace period is similarly negligible. */ static bool srcu_might_be_idle(struct srcu_struct *ssp) -- cgit v1.2.3 From bde50d8ff83e4ce9e576f7c5ba1edb48a3610a5b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 26 May 2020 15:41:34 +0200 Subject: srcu: Avoid local_irq_save() before acquiring spinlock_t SRCU disables interrupts to get a stable per-CPU pointer and then acquires the spinlock which is in the per-CPU data structure. The release uses spin_unlock_irqrestore(). While this is correct on a non-RT kernel, this conflicts with the RT semantics because the spinlock is converted to a 'sleeping' spinlock. Sleeping locks can obviously not be acquired with interrupts disabled. Acquire the per-CPU pointer `ssp->sda' without disabling preemption and then acquire the spinlock_t of the per-CPU data structure. The lock will ensure that the data is consistent. The added call to check_init_srcu_struct() is now needed because a statically defined srcu_struct may remain uninitialized until this point and the newly introduced locking operation requires an initialized spinlock_t. This change was tested for four hours with 8*SRCU-N and 8*SRCU-P without causing any warnings. Cc: Lai Jiangshan Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: rcu@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 8ff71e5d0fe8..c100acf332ed 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -777,14 +777,15 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp) unsigned long t; unsigned long tlast; + check_init_srcu_struct(ssp); /* If the local srcu_data structure has callbacks, not idle. */ - local_irq_save(flags); - sdp = this_cpu_ptr(ssp->sda); + sdp = raw_cpu_ptr(ssp->sda); + spin_lock_irqsave_rcu_node(sdp, flags); if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) { - local_irq_restore(flags); + spin_unlock_irqrestore_rcu_node(sdp, flags); return false; /* Callbacks already present, so not idle. */ } - local_irq_restore(flags); + spin_unlock_irqrestore_rcu_node(sdp, flags); /* * No local callbacks, so probabalistically probe global state. @@ -864,9 +865,8 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, } rhp->func = func; idx = srcu_read_lock(ssp); - local_irq_save(flags); - sdp = this_cpu_ptr(ssp->sda); - spin_lock_rcu_node(sdp); + sdp = raw_cpu_ptr(ssp->sda); + spin_lock_irqsave_rcu_node(sdp, flags); rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_gp_seq)); -- cgit v1.2.3 From d02c6b52d12fa30eeabfaf5aefe12078eacb94b2 Mon Sep 17 00:00:00 2001 From: Zou Wei Date: Mon, 13 Apr 2020 20:02:59 +0800 Subject: locktorture: Use true and false to assign to bool variables This commit fixes the following coccicheck warnings: kernel/locking/locktorture.c:689:6-10: WARNING: Assignment of 0/1 to bool variable kernel/locking/locktorture.c:907:2-20: WARNING: Assignment of 0/1 to bool variable kernel/locking/locktorture.c:938:3-20: WARNING: Assignment of 0/1 to bool variable kernel/locking/locktorture.c:668:2-19: WARNING: Assignment of 0/1 to bool variable kernel/locking/locktorture.c:674:2-19: WARNING: Assignment of 0/1 to bool variable kernel/locking/locktorture.c:634:2-20: WARNING: Assignment of 0/1 to bool variable kernel/locking/locktorture.c:640:2-20: WARNING: Assignment of 0/1 to bool variable Reported-by: Hulk Robot Signed-off-by: Zou Wei Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 5efbfc68ce99..8ff6f50e06a0 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -631,13 +631,13 @@ static int lock_torture_writer(void *arg) cxt.cur_ops->writelock(); if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_lock_fail++; - lock_is_write_held = 1; + lock_is_write_held = true; if (WARN_ON_ONCE(lock_is_read_held)) lwsp->n_lock_fail++; /* rare, but... */ lwsp->n_lock_acquired++; cxt.cur_ops->write_delay(&rand); - lock_is_write_held = 0; + lock_is_write_held = false; cxt.cur_ops->writeunlock(); stutter_wait("lock_torture_writer"); @@ -665,13 +665,13 @@ static int lock_torture_reader(void *arg) schedule_timeout_uninterruptible(1); cxt.cur_ops->readlock(); - lock_is_read_held = 1; + lock_is_read_held = true; if (WARN_ON_ONCE(lock_is_write_held)) lrsp->n_lock_fail++; /* rare, but... */ lrsp->n_lock_acquired++; cxt.cur_ops->read_delay(&rand); - lock_is_read_held = 0; + lock_is_read_held = false; cxt.cur_ops->readunlock(); stutter_wait("lock_torture_reader"); @@ -686,7 +686,7 @@ static int lock_torture_reader(void *arg) static void __torture_print_stats(char *page, struct lock_stress_stats *statp, bool write) { - bool fail = 0; + bool fail = false; int i, n_stress; long max = 0, min = statp ? statp[0].n_lock_acquired : 0; long long sum = 0; @@ -904,7 +904,7 @@ static int __init lock_torture_init(void) /* Initialize the statistics so that each run gets its own numbers. */ if (nwriters_stress) { - lock_is_write_held = 0; + lock_is_write_held = false; cxt.lwsa = kmalloc_array(cxt.nrealwriters_stress, sizeof(*cxt.lwsa), GFP_KERNEL); @@ -935,7 +935,7 @@ static int __init lock_torture_init(void) } if (nreaders_stress) { - lock_is_read_held = 0; + lock_is_read_held = false; cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress, sizeof(*cxt.lrsa), GFP_KERNEL); -- cgit v1.2.3 From 4a5f133c15b77c4018e8d7996541868ac94afb4f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 24 Apr 2020 11:21:40 -0700 Subject: rcutorture: Add races with task-exit processing Several variants of Linux-kernel RCU interact with task-exit processing, including preemptible RCU, Tasks RCU, and Tasks Trace RCU. This commit therefore adds testing of this interaction to rcutorture by adding rcutorture.read_exit_burst and rcutorture.read_exit_delay kernel-boot parameters. These kernel parameters control the frequency and spacing of special read-then-exit kthreads that are spawned. [ paulmck: Apply feedback from Dan Carpenter's static checker. ] [ paulmck: Reduce latency to avoid false-positive shutdown hangs. ] Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 14 +++ include/linux/torture.h | 5 ++ kernel/rcu/rcutorture.c | 112 +++++++++++++++++++++++- 3 files changed, 128 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index fb95fad81c79..a0dcc925c8a2 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4258,6 +4258,20 @@ Set time (jiffies) between CPU-hotplug operations, or zero to disable CPU-hotplug testing. + rcutorture.read_exit= [KNL] + Set the number of read-then-exit kthreads used + to test the interaction of RCU updaters and + task-exit processing. + + rcutorture.read_exit_burst= [KNL] + The number of times in a given read-then-exit + episode that a set of read-then-exit kthreads + is spawned. + + rcutorture.read_exit_delay= [KNL] + The delay, in seconds, between successive + read-then-exit testing episodes. + rcutorture.shuffle_interval= [KNL] Set task-shuffle interval (s). Shuffling tasks allows some CPUs to go into dyntick-idle mode diff --git a/include/linux/torture.h b/include/linux/torture.h index 629b66e6c161..7f65bd1dd307 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -55,6 +55,11 @@ struct torture_random_state { #define DEFINE_TORTURE_RANDOM_PERCPU(name) \ DEFINE_PER_CPU(struct torture_random_state, name) unsigned long torture_random(struct torture_random_state *trsp); +static inline void torture_random_init(struct torture_random_state *trsp) +{ + trsp->trs_state = 0; + trsp->trs_count = 0; +} /* Task shuffler, which causes CPUs to occasionally go idle. */ void torture_shuffle_task_register(struct task_struct *tp); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index efb792e13fca..2621a339c8a4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -109,6 +109,10 @@ torture_param(int, object_debug, 0, torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable"); +torture_param(int, read_exit_delay, 13, + "Delay between read-then-exit episodes (s)"); +torture_param(int, read_exit_burst, 16, + "# of read-then-exit bursts per episode, zero to disable"); torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); @@ -146,6 +150,7 @@ static struct task_struct *stall_task; static struct task_struct *fwd_prog_task; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; +static struct task_struct *read_exit_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -177,6 +182,7 @@ static long n_rcu_torture_boosts; static atomic_long_t n_rcu_torture_timers; static long n_barrier_attempts; static long n_barrier_successes; /* did rcu_barrier test succeed? */ +static unsigned long n_read_exits; static struct list_head rcu_torture_removed; static unsigned long shutdown_jiffies; @@ -1539,10 +1545,11 @@ rcu_torture_stats_print(void) n_rcu_torture_boosts, atomic_long_read(&n_rcu_torture_timers)); torture_onoff_stats(); - pr_cont("barrier: %ld/%ld:%ld\n", + pr_cont("barrier: %ld/%ld:%ld ", data_race(n_barrier_successes), data_race(n_barrier_attempts), data_race(n_rcu_torture_barrier_error)); + pr_cont("read-exits: %ld\n", data_race(n_read_exits)); pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) || @@ -1634,7 +1641,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " "stall_cpu_block=%d " "n_barrier_cbs=%d " - "onoff_interval=%d onoff_holdoff=%d\n", + "onoff_interval=%d onoff_holdoff=%d " + "read_exit_delay=%d read_exit_burst=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, @@ -1643,7 +1651,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, stall_cpu_block, n_barrier_cbs, - onoff_interval, onoff_holdoff); + onoff_interval, onoff_holdoff, + read_exit_delay, read_exit_burst); } static int rcutorture_booster_cleanup(unsigned int cpu) @@ -2338,6 +2347,99 @@ static bool rcu_torture_can_boost(void) return true; } +static bool read_exit_child_stop; +static bool read_exit_child_stopped; +static wait_queue_head_t read_exit_wq; + +// Child kthread which just does an rcutorture reader and exits. +static int rcu_torture_read_exit_child(void *trsp_in) +{ + struct torture_random_state *trsp = trsp_in; + + set_user_nice(current, MAX_NICE); + // Minimize time between reading and exiting. + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + (void)rcu_torture_one_read(trsp); + return 0; +} + +// Parent kthread which creates and destroys read-exit child kthreads. +static int rcu_torture_read_exit(void *unused) +{ + int count = 0; + bool errexit = false; + int i; + struct task_struct *tsp; + DEFINE_TORTURE_RANDOM(trs); + + // Allocate and initialize. + set_user_nice(current, MAX_NICE); + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of test"); + + // Each pass through this loop does one read-exit episode. + do { + if (++count > read_exit_burst) { + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode"); + rcu_barrier(); // Wait for task_struct free, avoid OOM. + for (i = 0; i < read_exit_delay; i++) { + schedule_timeout_uninterruptible(HZ); + if (READ_ONCE(read_exit_child_stop)) + break; + } + if (!READ_ONCE(read_exit_child_stop)) + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode"); + count = 0; + } + if (READ_ONCE(read_exit_child_stop)) + break; + // Spawn child. + tsp = kthread_run(rcu_torture_read_exit_child, + &trs, "%s", + "rcu_torture_read_exit_child"); + if (IS_ERR(tsp)) { + VERBOSE_TOROUT_ERRSTRING("out of memory"); + errexit = true; + tsp = NULL; + break; + } + cond_resched(); + kthread_stop(tsp); + n_read_exits ++; + stutter_wait("rcu_torture_read_exit"); + } while (!errexit && !READ_ONCE(read_exit_child_stop)); + + // Clean up and exit. + smp_store_release(&read_exit_child_stopped, true); // After reaping. + smp_mb(); // Store before wakeup. + wake_up(&read_exit_wq); + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); + torture_kthread_stopping("rcu_torture_read_exit"); + return 0; +} + +static int rcu_torture_read_exit_init(void) +{ + if (read_exit_burst <= 0) + return -EINVAL; + init_waitqueue_head(&read_exit_wq); + read_exit_child_stop = false; + read_exit_child_stopped = false; + return torture_create_kthread(rcu_torture_read_exit, NULL, + read_exit_task); +} + +static void rcu_torture_read_exit_cleanup(void) +{ + if (!read_exit_task) + return; + WRITE_ONCE(read_exit_child_stop, true); + smp_mb(); // Above write before wait. + wait_event(read_exit_wq, smp_load_acquire(&read_exit_child_stopped)); + torture_stop_kthread(rcutorture_read_exit, read_exit_task); +} + static enum cpuhp_state rcutor_hp; static void @@ -2359,6 +2461,7 @@ rcu_torture_cleanup(void) } show_rcu_gp_kthreads(); + rcu_torture_read_exit_cleanup(); rcu_torture_barrier_cleanup(); torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); torture_stop_kthread(rcu_torture_stall, stall_task); @@ -2680,6 +2783,9 @@ rcu_torture_init(void) if (firsterr) goto unwind; firsterr = rcu_torture_barrier_init(); + if (firsterr) + goto unwind; + firsterr = rcu_torture_read_exit_init(); if (firsterr) goto unwind; if (object_debug) -- cgit v1.2.3 From cae7cc6ba5bad320c2055ac54f73affd051e76ca Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 26 Apr 2020 19:20:37 -0700 Subject: rcutorture: NULL rcu_torture_current earlier in cleanup code Currently, the rcu_torture_current variable remains non-NULL until after all readers have stopped. During this time, rcu_torture_stats_print() will think that the test is still ongoing, which can result in confusing dmesg output. This commit therefore NULLs rcu_torture_current immediately after the rcu_torture_writer() kthread has decided to stop, thus informing rcu_torture_stats_print() much sooner. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 2621a339c8a4..59112077a6da 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1172,6 +1172,7 @@ rcu_torture_writer(void *arg) WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); } } while (!torture_must_stop()); + rcu_torture_current = NULL; // Let stats task know that we are done. /* Reset expediting back to unexpedited. */ if (expediting > 0) expediting = -expediting; @@ -2473,7 +2474,6 @@ rcu_torture_cleanup(void) reader_tasks[i]); kfree(reader_tasks); } - rcu_torture_current = NULL; if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) { -- cgit v1.2.3 From 8f43d5911b38f00dfa46169dcb1feb1e101dd906 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Mon, 1 Jun 2020 19:45:48 +0100 Subject: rcu/rcutorture: Replace 0 with false Coccinelle reports a warning WARNING: Assignment of 0/1 to bool variable The root cause is that the variable lastphase is a bool, but is initialised with integer 0. This commit therefore replaces the 0 with a false. Signed-off-by: Jules Irenge Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 59112077a6da..37455a12898e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2185,7 +2185,7 @@ static void rcu_torture_barrier1cb(void *rcu_void) static int rcu_torture_barrier_cbs(void *arg) { long myid = (long)arg; - bool lastphase = 0; + bool lastphase = false; bool newphase; struct rcu_head rcu; -- cgit v1.2.3 From 775227511843202e65a7f194cbf64f38de01f004 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Jun 2020 16:43:14 -0700 Subject: rcutorture: Check for unwatched readers RCU is supposed to be watching all non-idle kernel code and also all softirq handlers. This commit adds some teeth to this statement by adding a WARN_ON_ONCE(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 37455a12898e..9c310016585b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1377,6 +1377,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) struct rt_read_seg *rtrsp1; unsigned long long ts; + WARN_ON_ONCE(!rcu_is_watching()); newstate = rcutorture_extend_mask(readstate, trsp); rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++); started = cur_ops->get_gp_seq(); -- cgit v1.2.3 From 2102ad290af06119ccfb56ddc3a0e5011a91537e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 16 Jun 2020 15:38:24 -0700 Subject: torture: Dump ftrace at shutdown only if requested If there is a large number of torture tests running concurrently, all of which are dumping large ftrace buffers at shutdown time, the resulting dumping can take a very long time, particularly on systems with rotating-rust storage. This commit therefore adds a default-off torture.ftrace_dump_at_shutdown module parameter that enables shutdown-time ftrace-buffer dumping. Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 7 +++++++ kernel/torture.c | 6 +++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a0dcc925c8a2..9f11ff80d4ad 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5096,6 +5096,13 @@ Prevent the CPU-hotplug component of torturing until after init has spawned. + torture.ftrace_dump_at_shutdown= [KNL] + Dump the ftrace buffer at torture-test shutdown, + even if there were no errors. This can be a + very costly operation when many torture tests + are running concurrently, especially on systems + with rotating-rust storage. + tp720= [HW,PS2] tpm_suspend_pcr=[HW,TPM] diff --git a/kernel/torture.c b/kernel/torture.c index a1a41484ff6d..1061492f14bd 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -45,6 +45,9 @@ MODULE_AUTHOR("Paul E. McKenney "); static bool disable_onoff_at_boot; module_param(disable_onoff_at_boot, bool, 0444); +static bool ftrace_dump_at_shutdown; +module_param(ftrace_dump_at_shutdown, bool, 0444); + static char *torture_type; static int verbose; @@ -527,7 +530,8 @@ static int torture_shutdown(void *arg) torture_shutdown_hook(); else VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping."); - rcu_ftrace_dump(DUMP_ALL); + if (ftrace_dump_at_shutdown) + rcu_ftrace_dump(DUMP_ALL); kernel_power_off(); /* Shut down the system. */ return 0; } -- cgit v1.2.3 From cda099b37d7165fc73a63961739acf026444cde2 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Wed, 19 Feb 2020 11:00:54 -0800 Subject: fork: Annotate a data race in vm_area_dup() struct vm_area_struct could be accessed concurrently as noticed by KCSAN, write to 0xffff9cf8bba08ad8 of 8 bytes by task 14263 on cpu 35: vma_interval_tree_insert+0x101/0x150: rb_insert_augmented_cached at include/linux/rbtree_augmented.h:58 (inlined by) vma_interval_tree_insert at mm/interval_tree.c:23 __vma_link_file+0x6e/0xe0 __vma_link_file at mm/mmap.c:629 vma_link+0xa2/0x120 mmap_region+0x753/0xb90 do_mmap+0x45c/0x710 vm_mmap_pgoff+0xc0/0x130 ksys_mmap_pgoff+0x1d1/0x300 __x64_sys_mmap+0x33/0x40 do_syscall_64+0x91/0xc44 entry_SYSCALL_64_after_hwframe+0x49/0xbe read to 0xffff9cf8bba08a80 of 200 bytes by task 14262 on cpu 122: vm_area_dup+0x6a/0xe0 vm_area_dup at kernel/fork.c:362 __split_vma+0x72/0x2a0 __split_vma at mm/mmap.c:2661 split_vma+0x5a/0x80 mprotect_fixup+0x368/0x3f0 do_mprotect_pkey+0x263/0x420 __x64_sys_mprotect+0x51/0x70 do_syscall_64+0x91/0xc44 entry_SYSCALL_64_after_hwframe+0x49/0xbe vm_area_dup() blindly copies all fields of original VMA to the new one. This includes coping vm_area_struct::shared.rb which is normally protected by i_mmap_lock. But this is fine because the read value will be overwritten on the following __vma_link_file() under proper protection. Thus, mark it as an intentional data race and insert a few assertions for the fields that should not be modified concurrently. Signed-off-by: Qian Cai Signed-off-by: Paul E. McKenney --- kernel/fork.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 142b23645d82..bba10fbcdce7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -359,7 +359,13 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new) { - *new = *orig; + ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); + ASSERT_EXCLUSIVE_WRITER(orig->vm_file); + /* + * orig->shared.rb may be modified concurrently, but the clone + * will be reinitialized. + */ + *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); new->vm_next = new->vm_prev = NULL; } -- cgit v1.2.3 From 1fe84fd4a4027a17d511a832f89ab14107650ba4 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 5 May 2020 20:28:21 +0200 Subject: kcsan: Add test suite This adds KCSAN test focusing on behaviour of the integrated runtime. Tests various race scenarios, and verifies the reports generated to console. Makes use of KUnit for test organization, and the Torture framework for test thread control. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/Makefile | 3 + kernel/kcsan/kcsan-test.c | 1084 +++++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.kcsan | 23 +- 3 files changed, 1109 insertions(+), 1 deletion(-) create mode 100644 kernel/kcsan/kcsan-test.c (limited to 'kernel') diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index d4999b38d1be..14533cf24bc3 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -12,3 +12,6 @@ CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \ obj-y := core.o debugfs.o report.o obj-$(CONFIG_KCSAN_SELFTEST) += test.o + +CFLAGS_kcsan-test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer +obj-$(CONFIG_KCSAN_TEST) += kcsan-test.o diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c new file mode 100644 index 000000000000..a8c11506dd2a --- /dev/null +++ b/kernel/kcsan/kcsan-test.c @@ -0,0 +1,1084 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KCSAN test with various race scenarious to test runtime behaviour. Since the + * interface with which KCSAN's reports are obtained is via the console, this is + * the output we should verify. For each test case checks the presence (or + * absence) of generated reports. Relies on 'console' tracepoint to capture + * reports as they appear in the kernel log. + * + * Makes use of KUnit for test organization, and the Torture framework for test + * thread control. + * + * Copyright (C) 2020, Google LLC. + * Author: Marco Elver + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Points to current test-case memory access "kernels". */ +static void (*access_kernels[2])(void); + +static struct task_struct **threads; /* Lists of threads. */ +static unsigned long end_time; /* End time of test. */ + +/* Report as observed from console. */ +static struct { + spinlock_t lock; + int nlines; + char lines[3][512]; +} observed = { + .lock = __SPIN_LOCK_UNLOCKED(observed.lock), +}; + +/* Setup test checking loop. */ +static __no_kcsan_or_inline void +begin_test_checks(void (*func1)(void), void (*func2)(void)) +{ + kcsan_disable_current(); + + /* + * Require at least as long as KCSAN_REPORT_ONCE_IN_MS, to ensure at + * least one race is reported. + */ + end_time = jiffies + msecs_to_jiffies(CONFIG_KCSAN_REPORT_ONCE_IN_MS + 500); + + /* Signal start; release potential initialization of shared data. */ + smp_store_release(&access_kernels[0], func1); + smp_store_release(&access_kernels[1], func2); +} + +/* End test checking loop. */ +static __no_kcsan_or_inline bool +end_test_checks(bool stop) +{ + if (!stop && time_before(jiffies, end_time)) { + /* Continue checking */ + might_sleep(); + return false; + } + + kcsan_enable_current(); + return true; +} + +/* + * Probe for console output: checks if a race was reported, and obtains observed + * lines of interest. + */ +__no_kcsan +static void probe_console(void *ignore, const char *buf, size_t len) +{ + unsigned long flags; + int nlines; + + /* + * Note that KCSAN reports under a global lock, so we do not risk the + * possibility of having multiple reports interleaved. If that were the + * case, we'd expect tests to fail. + */ + + spin_lock_irqsave(&observed.lock, flags); + nlines = observed.nlines; + + if (strnstr(buf, "BUG: KCSAN: ", len) && strnstr(buf, "test_", len)) { + /* + * KCSAN report and related to the test. + * + * The provided @buf is not NUL-terminated; copy no more than + * @len bytes and let strscpy() add the missing NUL-terminator. + */ + strscpy(observed.lines[0], buf, min(len + 1, sizeof(observed.lines[0]))); + nlines = 1; + } else if ((nlines == 1 || nlines == 2) && strnstr(buf, "bytes by", len)) { + strscpy(observed.lines[nlines++], buf, min(len + 1, sizeof(observed.lines[0]))); + + if (strnstr(buf, "race at unknown origin", len)) { + if (WARN_ON(nlines != 2)) + goto out; + + /* No second line of interest. */ + strcpy(observed.lines[nlines++], ""); + } + } + +out: + WRITE_ONCE(observed.nlines, nlines); /* Publish new nlines. */ + spin_unlock_irqrestore(&observed.lock, flags); +} + +/* Check if a report related to the test exists. */ +__no_kcsan +static bool report_available(void) +{ + return READ_ONCE(observed.nlines) == ARRAY_SIZE(observed.lines); +} + +/* Report information we expect in a report. */ +struct expect_report { + /* Access information of both accesses. */ + struct { + void *fn; /* Function pointer to expected function of top frame. */ + void *addr; /* Address of access; unchecked if NULL. */ + size_t size; /* Size of access; unchecked if @addr is NULL. */ + int type; /* Access type, see KCSAN_ACCESS definitions. */ + } access[2]; +}; + +/* Check observed report matches information in @r. */ +__no_kcsan +static bool report_matches(const struct expect_report *r) +{ + const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT; + bool ret = false; + unsigned long flags; + typeof(observed.lines) expect; + const char *end; + char *cur; + int i; + + /* Doubled-checked locking. */ + if (!report_available()) + return false; + + /* Generate expected report contents. */ + + /* Title */ + cur = expect[0]; + end = &expect[0][sizeof(expect[0]) - 1]; + cur += scnprintf(cur, end - cur, "BUG: KCSAN: %s in ", + is_assert ? "assert: race" : "data-race"); + if (r->access[1].fn) { + char tmp[2][64]; + int cmp; + + /* Expect lexographically sorted function names in title. */ + scnprintf(tmp[0], sizeof(tmp[0]), "%pS", r->access[0].fn); + scnprintf(tmp[1], sizeof(tmp[1]), "%pS", r->access[1].fn); + cmp = strcmp(tmp[0], tmp[1]); + cur += scnprintf(cur, end - cur, "%ps / %ps", + cmp < 0 ? r->access[0].fn : r->access[1].fn, + cmp < 0 ? r->access[1].fn : r->access[0].fn); + } else { + scnprintf(cur, end - cur, "%pS", r->access[0].fn); + /* The exact offset won't match, remove it. */ + cur = strchr(expect[0], '+'); + if (cur) + *cur = '\0'; + } + + /* Access 1 */ + cur = expect[1]; + end = &expect[1][sizeof(expect[1]) - 1]; + if (!r->access[1].fn) + cur += scnprintf(cur, end - cur, "race at unknown origin, with "); + + /* Access 1 & 2 */ + for (i = 0; i < 2; ++i) { + const char *const access_type = + (r->access[i].type & KCSAN_ACCESS_ASSERT) ? + ((r->access[i].type & KCSAN_ACCESS_WRITE) ? + "assert no accesses" : + "assert no writes") : + ((r->access[i].type & KCSAN_ACCESS_WRITE) ? + "write" : + "read"); + const char *const access_type_aux = + (r->access[i].type & KCSAN_ACCESS_ATOMIC) ? + " (marked)" : + ((r->access[i].type & KCSAN_ACCESS_SCOPED) ? + " (scoped)" : + ""); + + if (i == 1) { + /* Access 2 */ + cur = expect[2]; + end = &expect[2][sizeof(expect[2]) - 1]; + + if (!r->access[1].fn) { + /* Dummy string if no second access is available. */ + strcpy(cur, ""); + break; + } + } + + cur += scnprintf(cur, end - cur, "%s%s to ", access_type, + access_type_aux); + + if (r->access[i].addr) /* Address is optional. */ + cur += scnprintf(cur, end - cur, "0x%px of %zu bytes", + r->access[i].addr, r->access[i].size); + } + + spin_lock_irqsave(&observed.lock, flags); + if (!report_available()) + goto out; /* A new report is being captured. */ + + /* Finally match expected output to what we actually observed. */ + ret = strstr(observed.lines[0], expect[0]) && + /* Access info may appear in any order. */ + ((strstr(observed.lines[1], expect[1]) && + strstr(observed.lines[2], expect[2])) || + (strstr(observed.lines[1], expect[2]) && + strstr(observed.lines[2], expect[1]))); +out: + spin_unlock_irqrestore(&observed.lock, flags); + return ret; +} + +/* ===== Test kernels ===== */ + +static long test_sink; +static long test_var; +/* @test_array should be large enough to fall into multiple watchpoint slots. */ +static long test_array[3 * PAGE_SIZE / sizeof(long)]; +static struct { + long val[8]; +} test_struct; +static DEFINE_SEQLOCK(test_seqlock); + +/* + * Helper to avoid compiler optimizing out reads, and to generate source values + * for writes. + */ +__no_kcsan +static noinline void sink_value(long v) { WRITE_ONCE(test_sink, v); } + +static noinline void test_kernel_read(void) { sink_value(test_var); } + +static noinline void test_kernel_write(void) +{ + test_var = READ_ONCE_NOCHECK(test_sink) + 1; +} + +static noinline void test_kernel_write_nochange(void) { test_var = 42; } + +/* Suffixed by value-change exception filter. */ +static noinline void test_kernel_write_nochange_rcu(void) { test_var = 42; } + +static noinline void test_kernel_read_atomic(void) +{ + sink_value(READ_ONCE(test_var)); +} + +static noinline void test_kernel_write_atomic(void) +{ + WRITE_ONCE(test_var, READ_ONCE_NOCHECK(test_sink) + 1); +} + +__no_kcsan +static noinline void test_kernel_write_uninstrumented(void) { test_var++; } + +static noinline void test_kernel_data_race(void) { data_race(test_var++); } + +static noinline void test_kernel_assert_writer(void) +{ + ASSERT_EXCLUSIVE_WRITER(test_var); +} + +static noinline void test_kernel_assert_access(void) +{ + ASSERT_EXCLUSIVE_ACCESS(test_var); +} + +#define TEST_CHANGE_BITS 0xff00ff00 + +static noinline void test_kernel_change_bits(void) +{ + if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { + /* + * Avoid race of unknown origin for this test, just pretend they + * are atomic. + */ + kcsan_nestable_atomic_begin(); + test_var ^= TEST_CHANGE_BITS; + kcsan_nestable_atomic_end(); + } else + WRITE_ONCE(test_var, READ_ONCE(test_var) ^ TEST_CHANGE_BITS); +} + +static noinline void test_kernel_assert_bits_change(void) +{ + ASSERT_EXCLUSIVE_BITS(test_var, TEST_CHANGE_BITS); +} + +static noinline void test_kernel_assert_bits_nochange(void) +{ + ASSERT_EXCLUSIVE_BITS(test_var, ~TEST_CHANGE_BITS); +} + +/* To check that scoped assertions do trigger anywhere in scope. */ +static noinline void test_enter_scope(void) +{ + int x = 0; + + /* Unrelated accesses to scoped assert. */ + READ_ONCE(test_sink); + kcsan_check_read(&x, sizeof(x)); +} + +static noinline void test_kernel_assert_writer_scoped(void) +{ + ASSERT_EXCLUSIVE_WRITER_SCOPED(test_var); + test_enter_scope(); +} + +static noinline void test_kernel_assert_access_scoped(void) +{ + ASSERT_EXCLUSIVE_ACCESS_SCOPED(test_var); + test_enter_scope(); +} + +static noinline void test_kernel_rmw_array(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(test_array); ++i) + test_array[i]++; +} + +static noinline void test_kernel_write_struct(void) +{ + kcsan_check_write(&test_struct, sizeof(test_struct)); + kcsan_disable_current(); + test_struct.val[3]++; /* induce value change */ + kcsan_enable_current(); +} + +static noinline void test_kernel_write_struct_part(void) +{ + test_struct.val[3] = 42; +} + +static noinline void test_kernel_read_struct_zero_size(void) +{ + kcsan_check_read(&test_struct.val[3], 0); +} + +static noinline void test_kernel_seqlock_reader(void) +{ + unsigned int seq; + + do { + seq = read_seqbegin(&test_seqlock); + sink_value(test_var); + } while (read_seqretry(&test_seqlock, seq)); +} + +static noinline void test_kernel_seqlock_writer(void) +{ + unsigned long flags; + + write_seqlock_irqsave(&test_seqlock, flags); + test_var++; + write_sequnlock_irqrestore(&test_seqlock, flags); +} + +/* ===== Test cases ===== */ + +/* Simple test with normal data race. */ +__no_kcsan +static void test_basic(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + static const struct expect_report never = { + .access = { + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect = false; + bool match_never = false; + + begin_test_checks(test_kernel_write, test_kernel_read); + do { + match_expect |= report_matches(&expect); + match_never = report_matches(&never); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_TRUE(test, match_expect); + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* + * Stress KCSAN with lots of concurrent races on different addresses until + * timeout. + */ +__no_kcsan +static void test_concurrent_races(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + /* NULL will match any address. */ + { test_kernel_rmw_array, NULL, 0, KCSAN_ACCESS_WRITE }, + { test_kernel_rmw_array, NULL, 0, 0 }, + }, + }; + static const struct expect_report never = { + .access = { + { test_kernel_rmw_array, NULL, 0, 0 }, + { test_kernel_rmw_array, NULL, 0, 0 }, + }, + }; + bool match_expect = false; + bool match_never = false; + + begin_test_checks(test_kernel_rmw_array, test_kernel_rmw_array); + do { + match_expect |= report_matches(&expect); + match_never |= report_matches(&never); + } while (!end_test_checks(false)); + KUNIT_EXPECT_TRUE(test, match_expect); /* Sanity check matches exist. */ + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* Test the KCSAN_REPORT_VALUE_CHANGE_ONLY option. */ +__no_kcsan +static void test_novalue_change(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_nochange, test_kernel_read); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY)) + KUNIT_EXPECT_FALSE(test, match_expect); + else + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* + * Test that the rules where the KCSAN_REPORT_VALUE_CHANGE_ONLY option should + * never apply work. + */ +__no_kcsan +static void test_novalue_change_exception(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_nochange_rcu, test_kernel_read); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* Test that data races of unknown origin are reported. */ +__no_kcsan +static void test_unknown_origin(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + { NULL }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_uninstrumented, test_kernel_read); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN)) + KUNIT_EXPECT_TRUE(test, match_expect); + else + KUNIT_EXPECT_FALSE(test, match_expect); +} + +/* Test KCSAN_ASSUME_PLAIN_WRITES_ATOMIC if it is selected. */ +__no_kcsan +static void test_write_write_assume_atomic(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write, test_kernel_write); + do { + sink_value(READ_ONCE(test_var)); /* induce value-change */ + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC)) + KUNIT_EXPECT_FALSE(test, match_expect); + else + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* + * Test that data races with writes larger than word-size are always reported, + * even if KCSAN_ASSUME_PLAIN_WRITES_ATOMIC is selected. + */ +__no_kcsan +static void test_write_write_struct(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_struct, test_kernel_write_struct); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* + * Test that data races where only one write is larger than word-size are always + * reported, even if KCSAN_ASSUME_PLAIN_WRITES_ATOMIC is selected. + */ +__no_kcsan +static void test_write_write_struct_part(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + { test_kernel_write_struct_part, &test_struct.val[3], sizeof(test_struct.val[3]), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_write_struct, test_kernel_write_struct_part); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* Test that races with atomic accesses never result in reports. */ +__no_kcsan +static void test_read_atomic_write_atomic(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_read_atomic, test_kernel_write_atomic); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* Test that a race with an atomic and plain access result in reports. */ +__no_kcsan +static void test_read_plain_atomic_write(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + { test_kernel_write_atomic, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC }, + }, + }; + bool match_expect = false; + + if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) + return; + + begin_test_checks(test_kernel_read, test_kernel_write_atomic); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +/* Zero-sized accesses should never cause data race reports. */ +__no_kcsan +static void test_zero_size_access(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + }, + }; + const struct expect_report never = { + .access = { + { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, + { test_kernel_read_struct_zero_size, &test_struct.val[3], 0, 0 }, + }, + }; + bool match_expect = false; + bool match_never = false; + + begin_test_checks(test_kernel_write_struct, test_kernel_read_struct_zero_size); + do { + match_expect |= report_matches(&expect); + match_never = report_matches(&never); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_TRUE(test, match_expect); /* Sanity check. */ + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* Test the data_race() macro. */ +__no_kcsan +static void test_data_race(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_data_race, test_kernel_data_race); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +__no_kcsan +static void test_assert_exclusive_writer(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_assert_writer, test_kernel_write_nochange); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +__no_kcsan +static void test_assert_exclusive_access(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_assert_access, test_kernel_read); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +__no_kcsan +static void test_assert_exclusive_access_writer(struct kunit *test) +{ + const struct expect_report expect_access_writer = { + .access = { + { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, + { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + }, + }; + const struct expect_report expect_access_access = { + .access = { + { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, + { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, + }, + }; + const struct expect_report never = { + .access = { + { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + }, + }; + bool match_expect_access_writer = false; + bool match_expect_access_access = false; + bool match_never = false; + + begin_test_checks(test_kernel_assert_access, test_kernel_assert_writer); + do { + match_expect_access_writer |= report_matches(&expect_access_writer); + match_expect_access_access |= report_matches(&expect_access_access); + match_never |= report_matches(&never); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_TRUE(test, match_expect_access_writer); + KUNIT_EXPECT_TRUE(test, match_expect_access_access); + KUNIT_EXPECT_FALSE(test, match_never); +} + +__no_kcsan +static void test_assert_exclusive_bits_change(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_assert_bits_change, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, + { test_kernel_change_bits, &test_var, sizeof(test_var), + KCSAN_ACCESS_WRITE | (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) ? 0 : KCSAN_ACCESS_ATOMIC) }, + }, + }; + bool match_expect = false; + + begin_test_checks(test_kernel_assert_bits_change, test_kernel_change_bits); + do { + match_expect = report_matches(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_TRUE(test, match_expect); +} + +__no_kcsan +static void test_assert_exclusive_bits_nochange(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_assert_bits_nochange, test_kernel_change_bits); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +__no_kcsan +static void test_assert_exclusive_writer_scoped(struct kunit *test) +{ + const struct expect_report expect_start = { + .access = { + { test_kernel_assert_writer_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED }, + { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + }, + }; + const struct expect_report expect_anywhere = { + .access = { + { test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED }, + { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, + }, + }; + bool match_expect_start = false; + bool match_expect_anywhere = false; + + begin_test_checks(test_kernel_assert_writer_scoped, test_kernel_write_nochange); + do { + match_expect_start |= report_matches(&expect_start); + match_expect_anywhere |= report_matches(&expect_anywhere); + } while (!end_test_checks(match_expect_start && match_expect_anywhere)); + KUNIT_EXPECT_TRUE(test, match_expect_start); + KUNIT_EXPECT_TRUE(test, match_expect_anywhere); +} + +__no_kcsan +static void test_assert_exclusive_access_scoped(struct kunit *test) +{ + const struct expect_report expect_start1 = { + .access = { + { test_kernel_assert_access_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + const struct expect_report expect_start2 = { + .access = { expect_start1.access[0], expect_start1.access[0] }, + }; + const struct expect_report expect_inscope = { + .access = { + { test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED }, + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + }, + }; + bool match_expect_start = false; + bool match_expect_inscope = false; + + begin_test_checks(test_kernel_assert_access_scoped, test_kernel_read); + end_time += msecs_to_jiffies(1000); /* This test requires a bit more time. */ + do { + match_expect_start |= report_matches(&expect_start1) || report_matches(&expect_start2); + match_expect_inscope |= report_matches(&expect_inscope); + } while (!end_test_checks(match_expect_start && match_expect_inscope)); + KUNIT_EXPECT_TRUE(test, match_expect_start); + KUNIT_EXPECT_TRUE(test, match_expect_inscope); +} + +/* Test that racing accesses in seqlock critical sections are not reported. */ +__no_kcsan +static void test_seqlock_noreport(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_seqlock_reader, test_kernel_seqlock_writer); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + +/* + * Each test case is run with different numbers of threads. Until KUnit supports + * passing arguments for each test case, we encode #threads in the test case + * name (read by get_num_threads()). [The '-' was chosen as a stylistic + * preference to separate test name and #threads.] + * + * The thread counts are chosen to cover potentially interesting boundaries and + * corner cases (range 2-5), and then stress the system with larger counts. + */ +#define KCSAN_KUNIT_CASE(test_name) \ + { .run_case = test_name, .name = #test_name "-02" }, \ + { .run_case = test_name, .name = #test_name "-03" }, \ + { .run_case = test_name, .name = #test_name "-04" }, \ + { .run_case = test_name, .name = #test_name "-05" }, \ + { .run_case = test_name, .name = #test_name "-08" }, \ + { .run_case = test_name, .name = #test_name "-16" } + +static struct kunit_case kcsan_test_cases[] = { + KCSAN_KUNIT_CASE(test_basic), + KCSAN_KUNIT_CASE(test_concurrent_races), + KCSAN_KUNIT_CASE(test_novalue_change), + KCSAN_KUNIT_CASE(test_novalue_change_exception), + KCSAN_KUNIT_CASE(test_unknown_origin), + KCSAN_KUNIT_CASE(test_write_write_assume_atomic), + KCSAN_KUNIT_CASE(test_write_write_struct), + KCSAN_KUNIT_CASE(test_write_write_struct_part), + KCSAN_KUNIT_CASE(test_read_atomic_write_atomic), + KCSAN_KUNIT_CASE(test_read_plain_atomic_write), + KCSAN_KUNIT_CASE(test_zero_size_access), + KCSAN_KUNIT_CASE(test_data_race), + KCSAN_KUNIT_CASE(test_assert_exclusive_writer), + KCSAN_KUNIT_CASE(test_assert_exclusive_access), + KCSAN_KUNIT_CASE(test_assert_exclusive_access_writer), + KCSAN_KUNIT_CASE(test_assert_exclusive_bits_change), + KCSAN_KUNIT_CASE(test_assert_exclusive_bits_nochange), + KCSAN_KUNIT_CASE(test_assert_exclusive_writer_scoped), + KCSAN_KUNIT_CASE(test_assert_exclusive_access_scoped), + KCSAN_KUNIT_CASE(test_seqlock_noreport), + {}, +}; + +/* ===== End test cases ===== */ + +/* Get number of threads encoded in test name. */ +static bool __no_kcsan +get_num_threads(const char *test, int *nthreads) +{ + int len = strlen(test); + + if (WARN_ON(len < 3)) + return false; + + *nthreads = test[len - 1] - '0'; + *nthreads += (test[len - 2] - '0') * 10; + + if (WARN_ON(*nthreads < 0)) + return false; + + return true; +} + +/* Concurrent accesses from interrupts. */ +__no_kcsan +static void access_thread_timer(struct timer_list *timer) +{ + static atomic_t cnt = ATOMIC_INIT(0); + unsigned int idx; + void (*func)(void); + + idx = (unsigned int)atomic_inc_return(&cnt) % ARRAY_SIZE(access_kernels); + /* Acquire potential initialization. */ + func = smp_load_acquire(&access_kernels[idx]); + if (func) + func(); +} + +/* The main loop for each thread. */ +__no_kcsan +static int access_thread(void *arg) +{ + struct timer_list timer; + unsigned int cnt = 0; + unsigned int idx; + void (*func)(void); + + timer_setup_on_stack(&timer, access_thread_timer, 0); + do { + might_sleep(); + + if (!timer_pending(&timer)) + mod_timer(&timer, jiffies + 1); + else { + /* Iterate through all kernels. */ + idx = cnt++ % ARRAY_SIZE(access_kernels); + /* Acquire potential initialization. */ + func = smp_load_acquire(&access_kernels[idx]); + if (func) + func(); + } + } while (!torture_must_stop()); + del_timer_sync(&timer); + destroy_timer_on_stack(&timer); + + torture_kthread_stopping("access_thread"); + return 0; +} + +__no_kcsan +static int test_init(struct kunit *test) +{ + unsigned long flags; + int nthreads; + int i; + + spin_lock_irqsave(&observed.lock, flags); + for (i = 0; i < ARRAY_SIZE(observed.lines); ++i) + observed.lines[i][0] = '\0'; + observed.nlines = 0; + spin_unlock_irqrestore(&observed.lock, flags); + + if (!torture_init_begin((char *)test->name, 1)) + return -EBUSY; + + if (!get_num_threads(test->name, &nthreads)) + goto err; + + if (WARN_ON(threads)) + goto err; + + for (i = 0; i < ARRAY_SIZE(access_kernels); ++i) { + if (WARN_ON(access_kernels[i])) + goto err; + } + + if (!IS_ENABLED(CONFIG_PREEMPT) || !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) { + /* + * Without any preemption, keep 2 CPUs free for other tasks, one + * of which is the main test case function checking for + * completion or failure. + */ + const int min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 2 : 0; + const int min_required_cpus = 2 + min_unused_cpus; + + if (num_online_cpus() < min_required_cpus) { + pr_err("%s: too few online CPUs (%u < %d) for test", + test->name, num_online_cpus(), min_required_cpus); + goto err; + } else if (nthreads > num_online_cpus() - min_unused_cpus) { + nthreads = num_online_cpus() - min_unused_cpus; + pr_warn("%s: limiting number of threads to %d\n", + test->name, nthreads); + } + } + + if (nthreads) { + threads = kcalloc(nthreads + 1, sizeof(struct task_struct *), + GFP_KERNEL); + if (WARN_ON(!threads)) + goto err; + + threads[nthreads] = NULL; + for (i = 0; i < nthreads; ++i) { + if (torture_create_kthread(access_thread, NULL, + threads[i])) + goto err; + } + } + + torture_init_end(); + + return 0; + +err: + kfree(threads); + threads = NULL; + torture_init_end(); + return -EINVAL; +} + +__no_kcsan +static void test_exit(struct kunit *test) +{ + struct task_struct **stop_thread; + int i; + + if (torture_cleanup_begin()) + return; + + for (i = 0; i < ARRAY_SIZE(access_kernels); ++i) + WRITE_ONCE(access_kernels[i], NULL); + + if (threads) { + for (stop_thread = threads; *stop_thread; stop_thread++) + torture_stop_kthread(reader_thread, *stop_thread); + + kfree(threads); + threads = NULL; + } + + torture_cleanup_end(); +} + +static struct kunit_suite kcsan_test_suite = { + .name = "kcsan-test", + .test_cases = kcsan_test_cases, + .init = test_init, + .exit = test_exit, +}; +static struct kunit_suite *kcsan_test_suites[] = { &kcsan_test_suite, NULL }; + +__no_kcsan +static void register_tracepoints(struct tracepoint *tp, void *ignore) +{ + check_trace_callback_type_console(probe_console); + if (!strcmp(tp->name, "console")) + WARN_ON(tracepoint_probe_register(tp, probe_console, NULL)); +} + +__no_kcsan +static void unregister_tracepoints(struct tracepoint *tp, void *ignore) +{ + if (!strcmp(tp->name, "console")) + tracepoint_probe_unregister(tp, probe_console, NULL); +} + +/* + * We only want to do tracepoints setup and teardown once, therefore we have to + * customize the init and exit functions and cannot rely on kunit_test_suite(). + */ +static int __init kcsan_test_init(void) +{ + /* + * Because we want to be able to build the test as a module, we need to + * iterate through all known tracepoints, since the static registration + * won't work here. + */ + for_each_kernel_tracepoint(register_tracepoints, NULL); + return __kunit_test_suites_init(kcsan_test_suites); +} + +static void kcsan_test_exit(void) +{ + __kunit_test_suites_exit(kcsan_test_suites); + for_each_kernel_tracepoint(unregister_tracepoints, NULL); + tracepoint_synchronize_unregister(); +} + +late_initcall(kcsan_test_init); +module_exit(kcsan_test_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Marco Elver "); diff --git a/lib/Kconfig.kcsan b/lib/Kconfig.kcsan index 5ee88e5119c2..3f3b5bca7a8f 100644 --- a/lib/Kconfig.kcsan +++ b/lib/Kconfig.kcsan @@ -59,7 +59,28 @@ config KCSAN_SELFTEST bool "Perform short selftests on boot" default y help - Run KCSAN selftests on boot. On test failure, causes the kernel to panic. + Run KCSAN selftests on boot. On test failure, causes the kernel to + panic. Recommended to be enabled, ensuring critical functionality + works as intended. + +config KCSAN_TEST + tristate "KCSAN test for integrated runtime behaviour" + depends on TRACEPOINTS && KUNIT + select TORTURE_TEST + help + KCSAN test focusing on behaviour of the integrated runtime. Tests + various race scenarios, and verifies the reports generated to + console. Makes use of KUnit for test organization, and the Torture + framework for test thread control. + + Each test case may run at least up to KCSAN_REPORT_ONCE_IN_MS + milliseconds. Test run duration may be optimized by building the + kernel and KCSAN test with KCSAN_REPORT_ONCE_IN_MS set to a lower + than default value. + + Say Y here if you want the test to be built into the kernel and run + during boot; say M if you want the test to build as a module; say N + if you are unsure. config KCSAN_EARLY_ENABLE bool "Early enable during boot" -- cgit v1.2.3 From 33190b675ce2eacbeb4e75168c05b41110b506ec Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 11 Feb 2020 08:54:15 -0500 Subject: locking/osq_lock: Annotate a data race in osq_lock The prev->next pointer can be accessed concurrently as noticed by KCSAN: write (marked) to 0xffff9d3370dbbe40 of 8 bytes by task 3294 on cpu 107: osq_lock+0x25f/0x350 osq_wait_next at kernel/locking/osq_lock.c:79 (inlined by) osq_lock at kernel/locking/osq_lock.c:185 rwsem_optimistic_spin read to 0xffff9d3370dbbe40 of 8 bytes by task 3398 on cpu 100: osq_lock+0x196/0x350 osq_lock at kernel/locking/osq_lock.c:157 rwsem_optimistic_spin Since the write only stores NULL to prev->next and the read tests if prev->next equals to this_cpu_ptr(&osq_node). Even if the value is shattered, the code is still working correctly. Thus, mark it as an intentional data race using the data_race() macro. Signed-off-by: Qian Cai Signed-off-by: Paul E. McKenney --- kernel/locking/osq_lock.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 1f7734949ac8..1de006ed3aa8 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -154,7 +154,11 @@ bool osq_lock(struct optimistic_spin_queue *lock) */ for (;;) { - if (prev->next == node && + /* + * cpu_relax() below implies a compiler barrier which would + * prevent this comparison being optimized away. + */ + if (data_race(prev->next) == node && cmpxchg(&prev->next, node, NULL) == node) break; -- cgit v1.2.3 From 2888557f68db334a3839dcc262264a4c436f576b Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 2 Jun 2020 16:36:33 +0200 Subject: kcsan: Prefer '__no_kcsan inline' in test Instead of __no_kcsan_or_inline, prefer '__no_kcsan inline' in test -- this is in case we decide to remove __no_kcsan_or_inline. Suggested-by: Peter Zijlstra Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/kcsan-test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c index a8c11506dd2a..3af420ad6ee7 100644 --- a/kernel/kcsan/kcsan-test.c +++ b/kernel/kcsan/kcsan-test.c @@ -43,7 +43,7 @@ static struct { }; /* Setup test checking loop. */ -static __no_kcsan_or_inline void +static __no_kcsan inline void begin_test_checks(void (*func1)(void), void (*func2)(void)) { kcsan_disable_current(); @@ -60,7 +60,7 @@ begin_test_checks(void (*func1)(void), void (*func2)(void)) } /* End test checking loop. */ -static __no_kcsan_or_inline bool +static __no_kcsan inline bool end_test_checks(bool stop) { if (!stop && time_before(jiffies, end_time)) { -- cgit v1.2.3 From 9dd979bae4cf76558ff816abe83283308fb1ae8c Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 16 Jun 2020 14:36:22 +0200 Subject: kcsan: Silence -Wmissing-prototypes warning with W=1 The functions here should not be forward declared for explicit use elsewhere in the kernel, as they should only be emitted by the compiler due to sanitizer instrumentation. Add forward declarations a line above their definition to shut up warnings in W=1 builds. Link: https://lkml.kernel.org/r/202006060103.jSCpnV1g%lkp@intel.com Reported-by: kernel test robot Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 15f67949d11e..1866bafda4fd 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -754,6 +754,7 @@ EXPORT_SYMBOL(__kcsan_check_access); */ #define DEFINE_TSAN_READ_WRITE(size) \ + void __tsan_read##size(void *ptr); \ void __tsan_read##size(void *ptr) \ { \ check_access(ptr, size, 0); \ @@ -762,6 +763,7 @@ EXPORT_SYMBOL(__kcsan_check_access); void __tsan_unaligned_read##size(void *ptr) \ __alias(__tsan_read##size); \ EXPORT_SYMBOL(__tsan_unaligned_read##size); \ + void __tsan_write##size(void *ptr); \ void __tsan_write##size(void *ptr) \ { \ check_access(ptr, size, KCSAN_ACCESS_WRITE); \ @@ -777,12 +779,14 @@ DEFINE_TSAN_READ_WRITE(4); DEFINE_TSAN_READ_WRITE(8); DEFINE_TSAN_READ_WRITE(16); +void __tsan_read_range(void *ptr, size_t size); void __tsan_read_range(void *ptr, size_t size) { check_access(ptr, size, 0); } EXPORT_SYMBOL(__tsan_read_range); +void __tsan_write_range(void *ptr, size_t size); void __tsan_write_range(void *ptr, size_t size) { check_access(ptr, size, KCSAN_ACCESS_WRITE); @@ -799,6 +803,7 @@ EXPORT_SYMBOL(__tsan_write_range); * the size-check of compiletime_assert_rwonce_type(). */ #define DEFINE_TSAN_VOLATILE_READ_WRITE(size) \ + void __tsan_volatile_read##size(void *ptr); \ void __tsan_volatile_read##size(void *ptr) \ { \ const bool is_atomic = size <= sizeof(long long) && \ @@ -811,6 +816,7 @@ EXPORT_SYMBOL(__tsan_write_range); void __tsan_unaligned_volatile_read##size(void *ptr) \ __alias(__tsan_volatile_read##size); \ EXPORT_SYMBOL(__tsan_unaligned_volatile_read##size); \ + void __tsan_volatile_write##size(void *ptr); \ void __tsan_volatile_write##size(void *ptr) \ { \ const bool is_atomic = size <= sizeof(long long) && \ @@ -836,14 +842,17 @@ DEFINE_TSAN_VOLATILE_READ_WRITE(16); * The below are not required by KCSAN, but can still be emitted by the * compiler. */ +void __tsan_func_entry(void *call_pc); void __tsan_func_entry(void *call_pc) { } EXPORT_SYMBOL(__tsan_func_entry); +void __tsan_func_exit(void); void __tsan_func_exit(void) { } EXPORT_SYMBOL(__tsan_func_exit); +void __tsan_init(void); void __tsan_init(void) { } -- cgit v1.2.3 From acfa087ccf2d2eff46186477f53e4c3ffbdb033d Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 16 Jun 2020 14:36:23 +0200 Subject: kcsan: Rename test.c to selftest.c Rename 'test.c' to 'selftest.c' to better reflect its purpose (Kconfig variable and code inside already match this). This is to avoid confusion with the test suite module in 'kcsan-test.c'. No functional change. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/Makefile | 2 +- kernel/kcsan/selftest.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/kcsan/test.c | 131 ------------------------------------------------ 3 files changed, 132 insertions(+), 132 deletions(-) create mode 100644 kernel/kcsan/selftest.c delete mode 100644 kernel/kcsan/test.c (limited to 'kernel') diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index 14533cf24bc3..092ce58d2e56 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -11,7 +11,7 @@ CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \ $(call cc-option,-fno-stack-protector,) obj-y := core.o debugfs.o report.o -obj-$(CONFIG_KCSAN_SELFTEST) += test.o +obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o CFLAGS_kcsan-test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer obj-$(CONFIG_KCSAN_TEST) += kcsan-test.o diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c new file mode 100644 index 000000000000..d26a052d3383 --- /dev/null +++ b/kernel/kcsan/selftest.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include + +#include "encoding.h" + +#define ITERS_PER_TEST 2000 + +/* Test requirements. */ +static bool test_requires(void) +{ + /* random should be initialized for the below tests */ + return prandom_u32() + prandom_u32() != 0; +} + +/* + * Test watchpoint encode and decode: check that encoding some access's info, + * and then subsequent decode preserves the access's info. + */ +static bool test_encode_decode(void) +{ + int i; + + for (i = 0; i < ITERS_PER_TEST; ++i) { + size_t size = prandom_u32_max(MAX_ENCODABLE_SIZE) + 1; + bool is_write = !!prandom_u32_max(2); + unsigned long addr; + + prandom_bytes(&addr, sizeof(addr)); + if (WARN_ON(!check_encodable(addr, size))) + return false; + + /* Encode and decode */ + { + const long encoded_watchpoint = + encode_watchpoint(addr, size, is_write); + unsigned long verif_masked_addr; + size_t verif_size; + bool verif_is_write; + + /* Check special watchpoints */ + if (WARN_ON(decode_watchpoint( + INVALID_WATCHPOINT, &verif_masked_addr, + &verif_size, &verif_is_write))) + return false; + if (WARN_ON(decode_watchpoint( + CONSUMED_WATCHPOINT, &verif_masked_addr, + &verif_size, &verif_is_write))) + return false; + + /* Check decoding watchpoint returns same data */ + if (WARN_ON(!decode_watchpoint( + encoded_watchpoint, &verif_masked_addr, + &verif_size, &verif_is_write))) + return false; + if (WARN_ON(verif_masked_addr != + (addr & WATCHPOINT_ADDR_MASK))) + goto fail; + if (WARN_ON(verif_size != size)) + goto fail; + if (WARN_ON(is_write != verif_is_write)) + goto fail; + + continue; +fail: + pr_err("%s fail: %s %zu bytes @ %lx -> encoded: %lx -> %s %zu bytes @ %lx\n", + __func__, is_write ? "write" : "read", size, + addr, encoded_watchpoint, + verif_is_write ? "write" : "read", verif_size, + verif_masked_addr); + return false; + } + } + + return true; +} + +/* Test access matching function. */ +static bool test_matching_access(void) +{ + if (WARN_ON(!matching_access(10, 1, 10, 1))) + return false; + if (WARN_ON(!matching_access(10, 2, 11, 1))) + return false; + if (WARN_ON(!matching_access(10, 1, 9, 2))) + return false; + if (WARN_ON(matching_access(10, 1, 11, 1))) + return false; + if (WARN_ON(matching_access(9, 1, 10, 1))) + return false; + + /* + * An access of size 0 could match another access, as demonstrated here. + * Rather than add more comparisons to 'matching_access()', which would + * end up in the fast-path for *all* checks, check_access() simply + * returns for all accesses of size 0. + */ + if (WARN_ON(!matching_access(8, 8, 12, 0))) + return false; + + return true; +} + +static int __init kcsan_selftest(void) +{ + int passed = 0; + int total = 0; + +#define RUN_TEST(do_test) \ + do { \ + ++total; \ + if (do_test()) \ + ++passed; \ + else \ + pr_err("KCSAN selftest: " #do_test " failed"); \ + } while (0) + + RUN_TEST(test_requires); + RUN_TEST(test_encode_decode); + RUN_TEST(test_matching_access); + + pr_info("KCSAN selftest: %d/%d tests passed\n", passed, total); + if (passed != total) + panic("KCSAN selftests failed"); + return 0; +} +postcore_initcall(kcsan_selftest); diff --git a/kernel/kcsan/test.c b/kernel/kcsan/test.c deleted file mode 100644 index d26a052d3383..000000000000 --- a/kernel/kcsan/test.c +++ /dev/null @@ -1,131 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include -#include - -#include "encoding.h" - -#define ITERS_PER_TEST 2000 - -/* Test requirements. */ -static bool test_requires(void) -{ - /* random should be initialized for the below tests */ - return prandom_u32() + prandom_u32() != 0; -} - -/* - * Test watchpoint encode and decode: check that encoding some access's info, - * and then subsequent decode preserves the access's info. - */ -static bool test_encode_decode(void) -{ - int i; - - for (i = 0; i < ITERS_PER_TEST; ++i) { - size_t size = prandom_u32_max(MAX_ENCODABLE_SIZE) + 1; - bool is_write = !!prandom_u32_max(2); - unsigned long addr; - - prandom_bytes(&addr, sizeof(addr)); - if (WARN_ON(!check_encodable(addr, size))) - return false; - - /* Encode and decode */ - { - const long encoded_watchpoint = - encode_watchpoint(addr, size, is_write); - unsigned long verif_masked_addr; - size_t verif_size; - bool verif_is_write; - - /* Check special watchpoints */ - if (WARN_ON(decode_watchpoint( - INVALID_WATCHPOINT, &verif_masked_addr, - &verif_size, &verif_is_write))) - return false; - if (WARN_ON(decode_watchpoint( - CONSUMED_WATCHPOINT, &verif_masked_addr, - &verif_size, &verif_is_write))) - return false; - - /* Check decoding watchpoint returns same data */ - if (WARN_ON(!decode_watchpoint( - encoded_watchpoint, &verif_masked_addr, - &verif_size, &verif_is_write))) - return false; - if (WARN_ON(verif_masked_addr != - (addr & WATCHPOINT_ADDR_MASK))) - goto fail; - if (WARN_ON(verif_size != size)) - goto fail; - if (WARN_ON(is_write != verif_is_write)) - goto fail; - - continue; -fail: - pr_err("%s fail: %s %zu bytes @ %lx -> encoded: %lx -> %s %zu bytes @ %lx\n", - __func__, is_write ? "write" : "read", size, - addr, encoded_watchpoint, - verif_is_write ? "write" : "read", verif_size, - verif_masked_addr); - return false; - } - } - - return true; -} - -/* Test access matching function. */ -static bool test_matching_access(void) -{ - if (WARN_ON(!matching_access(10, 1, 10, 1))) - return false; - if (WARN_ON(!matching_access(10, 2, 11, 1))) - return false; - if (WARN_ON(!matching_access(10, 1, 9, 2))) - return false; - if (WARN_ON(matching_access(10, 1, 11, 1))) - return false; - if (WARN_ON(matching_access(9, 1, 10, 1))) - return false; - - /* - * An access of size 0 could match another access, as demonstrated here. - * Rather than add more comparisons to 'matching_access()', which would - * end up in the fast-path for *all* checks, check_access() simply - * returns for all accesses of size 0. - */ - if (WARN_ON(!matching_access(8, 8, 12, 0))) - return false; - - return true; -} - -static int __init kcsan_selftest(void) -{ - int passed = 0; - int total = 0; - -#define RUN_TEST(do_test) \ - do { \ - ++total; \ - if (do_test()) \ - ++passed; \ - else \ - pr_err("KCSAN selftest: " #do_test " failed"); \ - } while (0) - - RUN_TEST(test_requires); - RUN_TEST(test_encode_decode); - RUN_TEST(test_matching_access); - - pr_info("KCSAN selftest: %d/%d tests passed\n", passed, total); - if (passed != total) - panic("KCSAN selftests failed"); - return 0; -} -postcore_initcall(kcsan_selftest); -- cgit v1.2.3 From 7e766560e6e2c1cf2782f00e63c31564e4c9f0fe Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 16 Jun 2020 14:36:24 +0200 Subject: kcsan: Remove existing special atomic rules Remove existing special atomic rules from kcsan_is_atomic_special() because they are no longer needed. Since we rely on the compiler emitting instrumentation distinguishing volatile accesses, the rules have become redundant. Let's keep kcsan_is_atomic_special() around, so that we have an obvious place to add special rules should the need arise in future. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/atomic.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h index be9e625227f3..75fe701f4127 100644 --- a/kernel/kcsan/atomic.h +++ b/kernel/kcsan/atomic.h @@ -3,8 +3,7 @@ #ifndef _KERNEL_KCSAN_ATOMIC_H #define _KERNEL_KCSAN_ATOMIC_H -#include -#include +#include /* * Special rules for certain memory where concurrent conflicting accesses are @@ -13,8 +12,7 @@ */ static bool kcsan_is_atomic_special(const volatile void *ptr) { - /* volatile globals that have been observed in data races. */ - return ptr == &jiffies || ptr == ¤t->state; + return false; } #endif /* _KERNEL_KCSAN_ATOMIC_H */ -- cgit v1.2.3 From 56b031f0abf55254d47a329010574733fa9a27b8 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 16 Jun 2020 14:36:25 +0200 Subject: kcsan: Add jiffies test to test suite Add a test that KCSAN nor the compiler gets confused about accesses to jiffies on different architectures. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/kcsan-test.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c index 3af420ad6ee7..fed6fcb5768c 100644 --- a/kernel/kcsan/kcsan-test.c +++ b/kernel/kcsan/kcsan-test.c @@ -366,6 +366,11 @@ static noinline void test_kernel_read_struct_zero_size(void) kcsan_check_read(&test_struct.val[3], 0); } +static noinline void test_kernel_jiffies_reader(void) +{ + sink_value((long)jiffies); +} + static noinline void test_kernel_seqlock_reader(void) { unsigned int seq; @@ -817,6 +822,23 @@ static void test_assert_exclusive_access_scoped(struct kunit *test) KUNIT_EXPECT_TRUE(test, match_expect_inscope); } +/* + * jiffies is special (declared to be volatile) and its accesses are typically + * not marked; this test ensures that the compiler nor KCSAN gets confused about + * jiffies's declaration on different architectures. + */ +__no_kcsan +static void test_jiffies_noreport(struct kunit *test) +{ + bool match_never = false; + + begin_test_checks(test_kernel_jiffies_reader, test_kernel_jiffies_reader); + do { + match_never = report_available(); + } while (!end_test_checks(match_never)); + KUNIT_EXPECT_FALSE(test, match_never); +} + /* Test that racing accesses in seqlock critical sections are not reported. */ __no_kcsan static void test_seqlock_noreport(struct kunit *test) @@ -867,6 +889,7 @@ static struct kunit_case kcsan_test_cases[] = { KCSAN_KUNIT_CASE(test_assert_exclusive_bits_nochange), KCSAN_KUNIT_CASE(test_assert_exclusive_writer_scoped), KCSAN_KUNIT_CASE(test_assert_exclusive_access_scoped), + KCSAN_KUNIT_CASE(test_jiffies_noreport), KCSAN_KUNIT_CASE(test_seqlock_noreport), {}, }; -- cgit v1.2.3 From 2839a232071f588d334543fb86f5689b43353842 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 18 Jun 2020 11:31:17 +0200 Subject: kcsan: Simplify compiler flags Simplify the set of compiler flags for the runtime by removing cc-option from -fno-stack-protector, because all supported compilers support it. This saves us one compiler invocation during build. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index 092ce58d2e56..fea064afc4f7 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -7,8 +7,8 @@ CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE) -CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \ - $(call cc-option,-fno-stack-protector,) +CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \ + -fno-stack-protector obj-y := core.o debugfs.o report.o obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o -- cgit v1.2.3 From 61d56d7aa5eca3b909bce51ba8125b0fa44d7e17 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 18 Jun 2020 11:31:18 +0200 Subject: kcsan: Disable branch tracing in core runtime Disable branch tracing in core KCSAN runtime if branches are being traced (TRACE_BRANCH_PROFILING). This it to avoid its performance impact, but also avoid recursion in case KCSAN is enabled for the branch tracing runtime. The latter had already been a problem for KASAN: https://lore.kernel.org/lkml/CANpmjNOeXmD5E3O50Z3MjkiuCYaYOPyi+1rq=GZvEKwBvLR0Ug@mail.gmail.com/ Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index fea064afc4f7..65ca5539c470 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -8,7 +8,7 @@ CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE) CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \ - -fno-stack-protector + -fno-stack-protector -DDISABLE_BRANCH_PROFILING obj-y := core.o debugfs.o report.o obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o -- cgit v1.2.3 From 142240398e50e5fe3171bcf2459856603be13a39 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Sat, 27 Jun 2020 23:24:19 -0400 Subject: audit: add gfp parameter to audit_log_nfcfg Fixed an inconsistent use of GFP flags in nft_obj_notify() that used GFP_KERNEL when a GFP flag was passed in to that function. Given this allocated memory was then used in audit_log_nfcfg() it led to an audit of all other GFP allocations in net/netfilter/nf_tables_api.c and a modification of audit_log_nfcfg() to accept a GFP parameter. Reported-by: Dan Carptenter Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- include/linux/audit.h | 8 ++++---- kernel/auditsc.c | 4 ++-- net/bridge/netfilter/ebtables.c | 6 +++--- net/netfilter/nf_tables_api.c | 33 +++++++++++++++++++++------------ net/netfilter/x_tables.c | 5 +++-- 5 files changed, 33 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 604ede630580..d93739f7a35a 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -404,7 +404,7 @@ extern void __audit_fanotify(unsigned int response); extern void __audit_tk_injoffset(struct timespec64 offset); extern void __audit_ntp_log(const struct audit_ntp_data *ad); extern void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, - enum audit_nfcfgop op); + enum audit_nfcfgop op, gfp_t gfp); static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) { @@ -542,10 +542,10 @@ static inline void audit_ntp_log(const struct audit_ntp_data *ad) static inline void audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, - enum audit_nfcfgop op) + enum audit_nfcfgop op, gfp_t gfp) { if (audit_enabled) - __audit_log_nfcfg(name, af, nentries, op); + __audit_log_nfcfg(name, af, nentries, op, gfp); } extern int audit_n_rules; @@ -683,7 +683,7 @@ static inline void audit_ptrace(struct task_struct *t) static inline void audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, - enum audit_nfcfgop op) + enum audit_nfcfgop op, gfp_t gfp) { } #define audit_n_rules 0 diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3a9100e95fda..eae1a599ffe3 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2572,12 +2572,12 @@ void __audit_ntp_log(const struct audit_ntp_data *ad) } void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, - enum audit_nfcfgop op) + enum audit_nfcfgop op, gfp_t gfp) { struct audit_buffer *ab; char comm[sizeof(current->comm)]; - ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_NETFILTER_CFG); + ab = audit_log_start(audit_context(), gfp, AUDIT_NETFILTER_CFG); if (!ab) return; audit_log_format(ab, "table=%s family=%u entries=%u op=%s", diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index c83ffe912163..b13b49b9f75c 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1047,7 +1047,7 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, vfree(counterstmp); audit_log_nfcfg(repl->name, AF_BRIDGE, repl->nentries, - AUDIT_XT_OP_REPLACE); + AUDIT_XT_OP_REPLACE, GFP_KERNEL); return ret; free_unlock: @@ -1123,7 +1123,7 @@ static void __ebt_unregister_table(struct net *net, struct ebt_table *table) list_del(&table->list); mutex_unlock(&ebt_mutex); audit_log_nfcfg(table->name, AF_BRIDGE, table->private->nentries, - AUDIT_XT_OP_UNREGISTER); + AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); EBT_ENTRY_ITERATE(table->private->entries, table->private->entries_size, ebt_cleanup_entry, net, NULL); if (table->private->nentries) @@ -1218,7 +1218,7 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table, } audit_log_nfcfg(repl->name, AF_BRIDGE, repl->nentries, - AUDIT_XT_OP_REGISTER); + AUDIT_XT_OP_REGISTER, GFP_KERNEL); return ret; free_unlock: mutex_unlock(&ebt_mutex); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 164700273947..f7ff91479647 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -702,7 +702,8 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) ctx->table->use, event == NFT_MSG_NEWTABLE ? AUDIT_NFT_OP_TABLE_REGISTER : - AUDIT_NFT_OP_TABLE_UNREGISTER); + AUDIT_NFT_OP_TABLE_UNREGISTER, + GFP_KERNEL); kfree(buf); if (!ctx->report && @@ -1448,7 +1449,8 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) ctx->chain->use, event == NFT_MSG_NEWCHAIN ? AUDIT_NFT_OP_CHAIN_REGISTER : - AUDIT_NFT_OP_CHAIN_UNREGISTER); + AUDIT_NFT_OP_CHAIN_UNREGISTER, + GFP_KERNEL); kfree(buf); if (!ctx->report && @@ -2724,7 +2726,8 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx, rule->handle, event == NFT_MSG_NEWRULE ? AUDIT_NFT_OP_RULE_REGISTER : - AUDIT_NFT_OP_RULE_UNREGISTER); + AUDIT_NFT_OP_RULE_UNREGISTER, + GFP_KERNEL); kfree(buf); if (!ctx->report && @@ -3737,7 +3740,8 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx, set->field_count, event == NFT_MSG_NEWSET ? AUDIT_NFT_OP_SET_REGISTER : - AUDIT_NFT_OP_SET_UNREGISTER); + AUDIT_NFT_OP_SET_UNREGISTER, + gfp_flags); kfree(buf); if (!ctx->report && @@ -4864,7 +4868,8 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx, set->handle, event == NFT_MSG_NEWSETELEM ? AUDIT_NFT_OP_SETELEM_REGISTER : - AUDIT_NFT_OP_SETELEM_UNREGISTER); + AUDIT_NFT_OP_SETELEM_UNREGISTER, + GFP_KERNEL); kfree(buf); if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) @@ -5956,7 +5961,8 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) audit_log_nfcfg(buf, family, obj->handle, - AUDIT_NFT_OP_OBJ_RESET); + AUDIT_NFT_OP_OBJ_RESET, + GFP_KERNEL); kfree(buf); } @@ -6071,13 +6077,14 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, reset = true; if (reset) { - char *buf = kasprintf(GFP_KERNEL, "%s:%llu;?:0", + char *buf = kasprintf(GFP_ATOMIC, "%s:%llu;?:0", table->name, table->handle); audit_log_nfcfg(buf, family, obj->handle, - AUDIT_NFT_OP_OBJ_RESET); + AUDIT_NFT_OP_OBJ_RESET, + GFP_KERNEL); kfree(buf); } @@ -6156,7 +6163,7 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, { struct sk_buff *skb; int err; - char *buf = kasprintf(GFP_KERNEL, "%s:%llu;?:0", + char *buf = kasprintf(gfp, "%s:%llu;?:0", table->name, table->handle); audit_log_nfcfg(buf, @@ -6164,7 +6171,8 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, obj->handle, event == NFT_MSG_NEWOBJ ? AUDIT_NFT_OP_OBJ_REGISTER : - AUDIT_NFT_OP_OBJ_UNREGISTER); + AUDIT_NFT_OP_OBJ_UNREGISTER, + GFP_KERNEL); kfree(buf); if (!report && @@ -6954,7 +6962,8 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, flowtable->hooknum, event == NFT_MSG_NEWFLOWTABLE ? AUDIT_NFT_OP_FLOWTABLE_REGISTER : - AUDIT_NFT_OP_FLOWTABLE_UNREGISTER); + AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, + GFP_KERNEL); kfree(buf); if (ctx->report && @@ -7078,7 +7087,7 @@ static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int err; audit_log_nfcfg("?:0;?:0", 0, net->nft.base_seq, - AUDIT_NFT_OP_GEN_REGISTER); + AUDIT_NFT_OP_GEN_REGISTER, GFP_KERNEL); if (nlmsg_report(nlh) && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 99a468be4a59..9ad8f3ff66f5 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1410,7 +1410,8 @@ xt_replace_table(struct xt_table *table, audit_log_nfcfg(table->name, table->af, private->number, !private->number ? AUDIT_XT_OP_REGISTER : - AUDIT_XT_OP_REPLACE); + AUDIT_XT_OP_REPLACE, + GFP_KERNEL); return private; } EXPORT_SYMBOL_GPL(xt_replace_table); @@ -1473,7 +1474,7 @@ void *xt_unregister_table(struct xt_table *table) list_del(&table->list); mutex_unlock(&xt[table->af].mutex); audit_log_nfcfg(table->name, table->af, private->number, - AUDIT_XT_OP_UNREGISTER); + AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); kfree(table); return private; -- cgit v1.2.3 From 7ef282e05132d56b6f6b71e3873f317664bea78b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 29 Jun 2020 23:45:56 -0400 Subject: tracing: Move pipe reference to trace array instead of current_tracer If a process has the trace_pipe open on a trace_array, the current tracer for that trace array should not be changed. This was original enforced by a global lock, but when instances were introduced, it was moved to the current_trace. But this structure is shared by all instances, and a trace_pipe is for a single instance. There's no reason that a process that has trace_pipe open on one instance should prevent another instance from changing its current tracer. Move the reference counter to the trace_array instead. This is marked as "Fixes" but is more of a clean up than a true fix. Backport if you want, but its not critical. Fixes: cf6ab6d9143b1 ("tracing: Add ref count to tracer for when they are being read by pipe") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 12 ++++++------ kernel/trace/trace.h | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8241d1448d70..64c5b8146cca 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5891,7 +5891,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) } /* If trace pipe files are being read, we can't change the tracer */ - if (tr->current_trace->ref) { + if (tr->trace_ref) { ret = -EBUSY; goto out; } @@ -6107,7 +6107,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) nonseekable_open(inode, filp); - tr->current_trace->ref++; + tr->trace_ref++; out: mutex_unlock(&trace_types_lock); return ret; @@ -6126,7 +6126,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) mutex_lock(&trace_types_lock); - tr->current_trace->ref--; + tr->trace_ref--; if (iter->trace->pipe_close) iter->trace->pipe_close(iter); @@ -7428,7 +7428,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) filp->private_data = info; - tr->current_trace->ref++; + tr->trace_ref++; mutex_unlock(&trace_types_lock); @@ -7529,7 +7529,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) mutex_lock(&trace_types_lock); - iter->tr->current_trace->ref--; + iter->tr->trace_ref--; __trace_array_put(iter->tr); @@ -8737,7 +8737,7 @@ static int __remove_instance(struct trace_array *tr) int i; /* Reference counter for a newly created trace array = 1. */ - if (tr->ref > 1 || (tr->current_trace && tr->current_trace->ref)) + if (tr->ref > 1 || (tr->current_trace && tr->trace_ref)) return -EBUSY; list_del(&tr->list); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 13db4000af3f..f21607f87189 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -356,6 +356,7 @@ struct trace_array { struct trace_event_file *trace_marker_file; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ int ref; + int trace_ref; #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops *ops; struct trace_pid_list __rcu *function_pids; @@ -547,7 +548,6 @@ struct tracer { struct tracer *next; struct tracer_flags *flags; int enabled; - int ref; bool print_max; bool allow_instances; #ifdef CONFIG_TRACER_MAX_TRACE -- cgit v1.2.3 From a389d86f7fd0902e4ce4136a5601988dbd371eb1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 28 Jun 2020 22:52:25 -0400 Subject: ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 276 +++++++++++++++++++++++++++++---------------- 1 file changed, 181 insertions(+), 95 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 00867ff82412..026238c55b0c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -27,6 +27,7 @@ #include #include +#include static void update_pages_handler(struct work_struct *work); @@ -418,6 +419,17 @@ struct rb_event_info { int add_timestamp; }; +/* + * Used for the add_timestamp + * NONE + * NORMAL - may be for either time extend or absolute + * FORCE - force a full time stamp. + */ +enum { + RB_ADD_STAMP_NONE, + RB_ADD_STAMP_NORMAL, + RB_ADD_STAMP_FORCE +}; /* * Used for which event context the event is in. * NMI = 0 @@ -470,7 +482,8 @@ struct ring_buffer_per_cpu { size_t shortest_full; unsigned long read; unsigned long read_bytes; - u64 write_stamp; + local64_t write_stamp; + local64_t before_stamp; u64 read_stamp; /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; @@ -2416,16 +2429,13 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, unsigned length = info->length; u64 delta = info->delta; - /* Only a commit updates the timestamp */ - if (unlikely(!rb_event_is_commit(cpu_buffer, event))) - delta = 0; - /* * If we need to add a timestamp, then we * add it to the start of the reserved space. */ if (unlikely(info->add_timestamp)) { - bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer); + bool abs = info->add_timestamp == RB_ADD_STAMP_FORCE || + ring_buffer_time_stamp_abs(cpu_buffer->buffer); event = rb_add_time_stamp(event, abs ? info->delta : delta, abs); length -= RB_LEN_TIME_EXTEND; @@ -2480,6 +2490,39 @@ static inline bool sched_clock_stable(void) } #endif +static __always_inline bool +rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + unsigned long index; + + index = rb_event_index(event); + addr &= PAGE_MASK; + + return cpu_buffer->commit_page->page == (void *)addr && + rb_commit_index(cpu_buffer) == index; +} + +static u64 rb_time_delta(struct ring_buffer_event *event) +{ + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + return 0; + + case RINGBUF_TYPE_TIME_EXTEND: + return ring_buffer_event_time_stamp(event); + + case RINGBUF_TYPE_TIME_STAMP: + return 0; + + case RINGBUF_TYPE_DATA: + return event->time_delta; + default: + return 0; + } +} + static inline int rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) @@ -2488,6 +2531,8 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage; unsigned long index; unsigned long addr; + u64 write_stamp; + u64 delta; new_index = rb_event_index(event); old_index = new_index + rb_event_ts_length(event); @@ -2496,10 +2541,32 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, bpage = READ_ONCE(cpu_buffer->tail_page); + delta = rb_time_delta(event); + + write_stamp = local64_read(&cpu_buffer->write_stamp); + + /* Make sure the write stamp is read before testing the location */ + barrier(); + if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { unsigned long write_mask = local_read(&bpage->write) & ~RB_WRITE_MASK; unsigned long event_length = rb_event_length(event); + u64 ret; + + ret = local64_cmpxchg(&cpu_buffer->write_stamp, write_stamp, write_stamp - delta); + /* Something came in, can't discard */ + if (ret != write_stamp) + return 0; + + /* + * If an event were to come in now, it would see that the + * write_stamp and the before_stamp are different, and assume + * that this event just added itself before updating + * the write stamp. The interrupting event will fix the + * write stamp for us, and use the before stamp as its delta. + */ + /* * This is on the tail page. It is possible that * a write could come in and move the tail page @@ -2551,10 +2618,6 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->commit_page->page->commit, rb_page_write(cpu_buffer->commit_page)); rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - /* Only update the write stamp if the page has an event */ - if (rb_page_write(cpu_buffer->commit_page)) - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; /* add barrier to keep gcc from optimizing too much */ barrier(); } @@ -2626,54 +2689,10 @@ static inline void rb_event_discard(struct ring_buffer_event *event) event->time_delta = 1; } -static __always_inline bool -rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long addr = (unsigned long)event; - unsigned long index; - - index = rb_event_index(event); - addr &= PAGE_MASK; - - return cpu_buffer->commit_page->page == (void *)addr && - rb_commit_index(cpu_buffer) == index; -} - -static __always_inline void -rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - u64 delta; - - /* - * The event first in the commit queue updates the - * time stamp. - */ - if (rb_event_is_commit(cpu_buffer, event)) { - /* - * A commit event that is first on a page - * updates the write timestamp with the page stamp - */ - if (!rb_event_index(event)) - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { - delta = ring_buffer_event_time_stamp(event); - cpu_buffer->write_stamp += delta; - } else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) { - delta = ring_buffer_event_time_stamp(event); - cpu_buffer->write_stamp = delta; - } else - cpu_buffer->write_stamp += event->time_delta; - } -} - static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { local_inc(&cpu_buffer->entries); - rb_update_write_stamp(cpu_buffer, event); rb_end_commit(cpu_buffer); } @@ -2872,13 +2891,13 @@ rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", (unsigned long long)info->delta, (unsigned long long)info->ts, - (unsigned long long)cpu_buffer->write_stamp, + (unsigned long long)local64_read(&cpu_buffer->write_stamp), sched_clock_stable() ? "" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" " echo global > /sys/kernel/debug/tracing/trace_clock\n" "or add trace_clock=global to the kernel command line\n"); - info->add_timestamp = 1; + info->add_timestamp = RB_ADD_STAMP_NORMAL; } static struct ring_buffer_event * @@ -2887,8 +2906,36 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, { struct ring_buffer_event *event; struct buffer_page *tail_page; - unsigned long tail, write; + unsigned long tail, write, w; + u64 before, after; + bool abs = false; + + /* Don't let the compiler play games with cpu_buffer->tail_page */ + tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); + + /*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK; + barrier(); + before = local64_read(&cpu_buffer->before_stamp); + after = local64_read(&cpu_buffer->write_stamp); + barrier(); + info->ts = rb_time_stamp(cpu_buffer->buffer); + + if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) { + info->delta = info->ts; + abs = true; + } else { + info->delta = info->ts - after; + } + + if (unlikely(test_time_stamp(info->delta))) + rb_handle_timestamp(cpu_buffer, info); + /* + * If interrupting an event time update, we may need an absolute timestamp. + * Don't bother if this is the start of a new page (w == 0). + */ + if (unlikely(before != after && w)) + info->add_timestamp = RB_ADD_STAMP_FORCE; /* * If the time delta since the last event is too big to * hold in the time field of the event, then we append a @@ -2897,25 +2944,88 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(info->add_timestamp)) info->length += RB_LEN_TIME_EXTEND; - /* Don't let the compiler play games with cpu_buffer->tail_page */ - tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); - write = local_add_return(info->length, &tail_page->write); + /*B*/ local64_set(&cpu_buffer->before_stamp, info->ts); + + /*C*/ write = local_add_return(info->length, &tail_page->write); /* set write to only the index of the write */ write &= RB_WRITE_MASK; + tail = write - info->length; + /* See if we shot pass the end of this buffer page */ + if (unlikely(write > BUF_PAGE_SIZE)) { + if (tail != w) { + /* before and after may now different, fix it up*/ + before = local64_read(&cpu_buffer->before_stamp); + after = local64_read(&cpu_buffer->write_stamp); + if (before != after) + (void)local64_cmpxchg(&cpu_buffer->before_stamp, before, after); + } + return rb_move_tail(cpu_buffer, tail, info); + } + + if (likely(tail == w)) { + u64 save_before; + + /* Nothing interrupted us between A and C */ + /*D*/ local64_set(&cpu_buffer->write_stamp, info->ts); + barrier(); + /*E*/ save_before = local64_read(&cpu_buffer->before_stamp); + if (likely(info->add_timestamp != RB_ADD_STAMP_FORCE)) + /* This did not interrupt any time update */ + info->delta = info->ts - after; + else + /* Just use full timestamp for inerrupting event */ + info->delta = info->ts; + barrier(); + if (unlikely(info->ts != save_before)) { + /* SLOW PATH - Interrupted between C and E */ + + after = local64_read(&cpu_buffer->write_stamp); + /* Write stamp must only go forward */ + if (save_before > after) { + /* + * We do not care about the result, only that + * it gets updated atomically. + */ + (void)local64_cmpxchg(&cpu_buffer->write_stamp, after, save_before); + } + } + } else { + u64 ts; + /* SLOW PATH - Interrupted between A and C */ + after = local64_read(&cpu_buffer->write_stamp); + ts = rb_time_stamp(cpu_buffer->buffer); + barrier(); + /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && + after < ts) { + /* Nothing came after this event between C and E */ + info->delta = ts - after; + (void)local64_cmpxchg(&cpu_buffer->write_stamp, after, info->ts); + info->ts = ts; + } else { + /* + * Interrupted beween C and E: + * Lost the previous events time stamp. Just set the + * delta to zero, and this will be the same time as + * the event this event interrupted. And the events that + * came after this will still be correct (as they would + * have built their delta on the previous event. + */ + info->delta = 0; + } + if (info->add_timestamp == RB_ADD_STAMP_FORCE) + info->add_timestamp = RB_ADD_STAMP_NORMAL; + } + /* * If this is the first commit on the page, then it has the same * timestamp as the page itself. */ - if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer)) + if (unlikely(!tail && info->add_timestamp != RB_ADD_STAMP_FORCE && !abs)) info->delta = 0; - /* See if we shot pass the end of this buffer page */ - if (unlikely(write > BUF_PAGE_SIZE)) - return rb_move_tail(cpu_buffer, tail, info); - /* We reserved something on the buffer */ event = __rb_page_index(tail_page, tail); @@ -2944,9 +3054,9 @@ rb_reserve_next_event(struct trace_buffer *buffer, struct ring_buffer_event *event; struct rb_event_info info; int nr_loops = 0; - u64 diff; rb_start_commit(cpu_buffer); + /* The commit page can not change after this */ #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /* @@ -2965,7 +3075,7 @@ rb_reserve_next_event(struct trace_buffer *buffer, info.length = rb_calculate_event_length(length); again: - info.add_timestamp = 0; + info.add_timestamp = RB_ADD_STAMP_NONE; info.delta = 0; /* @@ -2980,22 +3090,6 @@ rb_reserve_next_event(struct trace_buffer *buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; - info.ts = rb_time_stamp(cpu_buffer->buffer); - diff = info.ts - cpu_buffer->write_stamp; - - /* make sure this diff is calculated here */ - barrier(); - - if (ring_buffer_time_stamp_abs(buffer)) { - info.delta = info.ts; - rb_handle_timestamp(cpu_buffer, &info); - } else /* Did the write stamp get updated already? */ - if (likely(info.ts >= cpu_buffer->write_stamp)) { - info.delta = diff; - if (unlikely(test_time_stamp(info.delta))) - rb_handle_timestamp(cpu_buffer, &info); - } - event = __rb_reserve_next(cpu_buffer, &info); if (unlikely(PTR_ERR(event) == -EAGAIN)) { @@ -3004,11 +3098,8 @@ rb_reserve_next_event(struct trace_buffer *buffer, goto again; } - if (!event) - goto out_fail; - - return event; - + if (likely(event)) + return event; out_fail: rb_end_commit(cpu_buffer); return NULL; @@ -3154,11 +3245,6 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer, if (rb_try_to_discard(cpu_buffer, event)) goto out; - /* - * The commit is still visible by the reader, so we - * must still update the timestamp. - */ - rb_update_write_stamp(cpu_buffer, event); out: rb_end_commit(cpu_buffer); @@ -4475,8 +4561,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->read = 0; cpu_buffer->read_bytes = 0; - cpu_buffer->write_stamp = 0; - cpu_buffer->read_stamp = 0; + local64_set(&cpu_buffer->write_stamp, 0); + local64_set(&cpu_buffer->before_stamp, 0); cpu_buffer->lost_events = 0; cpu_buffer->last_overrun = 0; -- cgit v1.2.3 From 7c4b4a5164fbedc11c23e3671bd90ba0d23a5efd Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 28 Jun 2020 22:52:26 -0400 Subject: ring-buffer: Incorporate absolute timestamp into add_timestamp logic Instead of calling out the absolute test for each time to check if the ring buffer wants absolute time stamps for all its recording, incorporate it with the add_timestamp field and turn it into flags for faster processing between wanting a absolute tag and needing to force one. Link: http://lkml.kernel.org/r/20200629025259.154892368@goodmis.org Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 026238c55b0c..7ee6619951ea 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -422,13 +422,15 @@ struct rb_event_info { /* * Used for the add_timestamp * NONE - * NORMAL - may be for either time extend or absolute + * EXTEND - wants a time extend + * ABSOLUTE - the buffer requests all events to have absolute time stamps * FORCE - force a full time stamp. */ enum { - RB_ADD_STAMP_NONE, - RB_ADD_STAMP_NORMAL, - RB_ADD_STAMP_FORCE + RB_ADD_STAMP_NONE = 0, + RB_ADD_STAMP_EXTEND = BIT(1), + RB_ADD_STAMP_ABSOLUTE = BIT(2), + RB_ADD_STAMP_FORCE = BIT(3) }; /* * Used for which event context the event is in. @@ -2434,8 +2436,8 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, * add it to the start of the reserved space. */ if (unlikely(info->add_timestamp)) { - bool abs = info->add_timestamp == RB_ADD_STAMP_FORCE || - ring_buffer_time_stamp_abs(cpu_buffer->buffer); + bool abs = info->add_timestamp & + (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE); event = rb_add_time_stamp(event, abs ? info->delta : delta, abs); length -= RB_LEN_TIME_EXTEND; @@ -2884,8 +2886,8 @@ int ring_buffer_unlock_commit(struct trace_buffer *buffer, EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); static noinline void -rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, - struct rb_event_info *info) +rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info) { WARN_ONCE(info->delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", @@ -2897,7 +2899,6 @@ rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, "please switch to the trace global clock:\n" " echo global > /sys/kernel/debug/tracing/trace_clock\n" "or add trace_clock=global to the kernel command line\n"); - info->add_timestamp = RB_ADD_STAMP_NORMAL; } static struct ring_buffer_event * @@ -2908,7 +2909,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *tail_page; unsigned long tail, write, w; u64 before, after; - bool abs = false; /* Don't let the compiler play games with cpu_buffer->tail_page */ tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); @@ -2922,20 +2922,23 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) { info->delta = info->ts; - abs = true; + info->add_timestamp = RB_ADD_STAMP_ABSOLUTE; } else { info->delta = info->ts - after; } - if (unlikely(test_time_stamp(info->delta))) - rb_handle_timestamp(cpu_buffer, info); + if (unlikely(test_time_stamp(info->delta))) { + rb_check_timestamp(cpu_buffer, info); + info->add_timestamp |= RB_ADD_STAMP_EXTEND; + } /* * If interrupting an event time update, we may need an absolute timestamp. * Don't bother if this is the start of a new page (w == 0). */ if (unlikely(before != after && w)) - info->add_timestamp = RB_ADD_STAMP_FORCE; + info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND; + /* * If the time delta since the last event is too big to * hold in the time field of the event, then we append a @@ -2972,7 +2975,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /*D*/ local64_set(&cpu_buffer->write_stamp, info->ts); barrier(); /*E*/ save_before = local64_read(&cpu_buffer->before_stamp); - if (likely(info->add_timestamp != RB_ADD_STAMP_FORCE)) + if (likely(!(info->add_timestamp & + (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)))) /* This did not interrupt any time update */ info->delta = info->ts - after; else @@ -3015,15 +3019,15 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, */ info->delta = 0; } - if (info->add_timestamp == RB_ADD_STAMP_FORCE) - info->add_timestamp = RB_ADD_STAMP_NORMAL; + info->add_timestamp &= ~RB_ADD_STAMP_FORCE; } /* * If this is the first commit on the page, then it has the same * timestamp as the page itself. */ - if (unlikely(!tail && info->add_timestamp != RB_ADD_STAMP_FORCE && !abs)) + if (unlikely(!tail && !(info->add_timestamp & + (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)))) info->delta = 0; /* We reserved something on the buffer */ -- cgit v1.2.3 From 10464b4aa605ef93c937452f442e74cc0a4a6ceb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 28 Jun 2020 22:52:27 -0400 Subject: ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit After a discussion with the new time algorithm to have nested events still have proper time keeping but required using local64_t atomic operations. Mathieu was concerned about the performance this would have on 32 bit machines, as in most cases, atomic 64 bit operations on them can be expensive. As the ring buffer's timing needs do not require full features of local64_t, a wrapper is made to implement a new rb_time_t operation that uses two longs on 32 bit machines but still uses the local64_t operations on 64 bit machines. There's a switch that can be made in the file to force 64 bit to use the 32 bit version just for testing purposes. All reads do not need to succeed if a read happened while the stamp being read is in the process of being updated. The requirement is that all reads must succed that were done by an interrupting event (where this event was interrupted by another event that did the write). Or if the event itself did the write first. That is: rb_time_set(t, x) followed by rb_time_read(t) will always succeed (even if it gets interrupted by another event that writes to t. The result of the read will be either the previous set, or a set performed by an interrupting event. If the read is done by an event that interrupted another event that was in the process of setting the time stamp, and no other event came along to write to that time stamp, it will fail and the rb_time_read() will return that it failed (the value to read will be undefined). A set will always write to the time stamp and return with a valid time stamp, such that any read after it will be valid. A cmpxchg may fail if it interrupted an event that was in the process of updating the time stamp just like the reads do. Other than that, it will act like a normal cmpxchg. The way this works is that the rb_time_t is made of of three fields. A cnt, that gets updated atomically everyting a modification is made. A top that represents the most significant 30 bits of the time, and a bottom to represent the least significant 30 bits of the time. Notice, that the time values is only 60 bits long (where the ring buffer only uses 59 bits, which gives us 18 years of nanoseconds!). The top two bits of both the top and bottom is a 2 bit counter that gets set by the value of the least two significant bits of the cnt. A read of the top and the bottom where both the top and bottom have the same most significant top 2 bits, are considered a match and a valid 60 bit number can be created from it. If they do not match, then the number is considered invalid, and this must only happen if an event interrupted another event in the midst of updating the time stamp. This is only used for 32 bits machines as 64 bit machines can get better performance out of the local64_t. This has been tested heavily by forcing 64 bit to use this logic. Link: https://lore.kernel.org/r/20200625225345.18cf5881@oasis.local.home Link: http://lkml.kernel.org/r/20200629025259.309232719@goodmis.org Inspired-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 270 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 243 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7ee6619951ea..802bb38d9c81 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -27,7 +27,6 @@ #include #include -#include static void update_pages_handler(struct work_struct *work); @@ -449,6 +448,28 @@ enum { RB_CTX_MAX }; +#if BITS_PER_LONG == 32 +#define RB_TIME_32 +#endif + +/* To test on 64 bit machines */ +//#define RB_TIME_32 + +#ifdef RB_TIME_32 + +struct rb_time_struct { + local_t cnt; + local_t top; + local_t bottom; +}; +#else +#include +struct rb_time_struct { + local64_t time; +}; +#endif +typedef struct rb_time_struct rb_time_t; + /* * head_page == tail_page && head == tail then buffer is empty. */ @@ -484,8 +505,8 @@ struct ring_buffer_per_cpu { size_t shortest_full; unsigned long read; unsigned long read_bytes; - local64_t write_stamp; - local64_t before_stamp; + rb_time_t write_stamp; + rb_time_t before_stamp; u64 read_stamp; /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; @@ -528,6 +549,189 @@ struct ring_buffer_iter { int missed_events; }; +#ifdef RB_TIME_32 + +/* + * On 32 bit machines, local64_t is very expensive. As the ring + * buffer doesn't need all the features of a true 64 bit atomic, + * on 32 bit, it uses these functions (64 still uses local64_t). + * + * For the ring buffer, 64 bit required operations for the time is + * the following: + * + * - Only need 59 bits (uses 60 to make it even). + * - Reads may fail if it interrupted a modification of the time stamp. + * It will succeed if it did not interrupt another write even if + * the read itself is interrupted by a write. + * It returns whether it was successful or not. + * + * - Writes always succeed and will overwrite other writes and writes + * that were done by events interrupting the current write. + * + * - A write followed by a read of the same time stamp will always succeed, + * but may not contain the same value. + * + * - A cmpxchg will fail if it interrupted another write or cmpxchg. + * Other than that, it acts like a normal cmpxchg. + * + * The 60 bit time stamp is broken up by 30 bits in a top and bottom half + * (bottom being the least significant 30 bits of the 60 bit time stamp). + * + * The two most significant bits of each half holds a 2 bit counter (0-3). + * Each update will increment this counter by one. + * When reading the top and bottom, if the two counter bits match then the + * top and bottom together make a valid 60 bit number. + */ +#define RB_TIME_SHIFT 30 +#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1) + +static inline int rb_time_cnt(unsigned long val) +{ + return (val >> RB_TIME_SHIFT) & 3; +} + +static inline u64 rb_time_val(unsigned long top, unsigned long bottom) +{ + u64 val; + + val = top & RB_TIME_VAL_MASK; + val <<= RB_TIME_SHIFT; + val |= bottom & RB_TIME_VAL_MASK; + + return val; +} + +static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) +{ + unsigned long top, bottom; + unsigned long c; + + /* + * If the read is interrupted by a write, then the cnt will + * be different. Loop until both top and bottom have been read + * without interruption. + */ + do { + c = local_read(&t->cnt); + top = local_read(&t->top); + bottom = local_read(&t->bottom); + } while (c != local_read(&t->cnt)); + + *cnt = rb_time_cnt(top); + + /* If top and bottom counts don't match, this interrupted a write */ + if (*cnt != rb_time_cnt(bottom)) + return false; + + *ret = rb_time_val(top, bottom); + return true; +} + +static bool rb_time_read(rb_time_t *t, u64 *ret) +{ + unsigned long cnt; + + return __rb_time_read(t, ret, &cnt); +} + +static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt) +{ + return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT); +} + +static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom) +{ + *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK); + *bottom = (unsigned long)(val & RB_TIME_VAL_MASK); +} + +static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt) +{ + val = rb_time_val_cnt(val, cnt); + local_set(t, val); +} + +static void rb_time_set(rb_time_t *t, u64 val) +{ + unsigned long cnt, top, bottom; + + rb_time_split(val, &top, &bottom); + + /* Writes always succeed with a valid number even if it gets interrupted. */ + do { + cnt = local_inc_return(&t->cnt); + rb_time_val_set(&t->top, top, cnt); + rb_time_val_set(&t->bottom, bottom, cnt); + } while (cnt != local_read(&t->cnt)); +} + +static inline bool +rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set) +{ + unsigned long ret; + + ret = local_cmpxchg(l, expect, set); + return ret == expect; +} + +static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) +{ + unsigned long cnt, top, bottom; + unsigned long cnt2, top2, bottom2; + u64 val; + + /* The cmpxchg always fails if it interrupted an update */ + if (!__rb_time_read(t, &val, &cnt2)) + return false; + + if (val != expect) + return false; + + cnt = local_read(&t->cnt); + if ((cnt & 3) != cnt2) + return false; + + cnt2 = cnt + 1; + + rb_time_split(val, &top, &bottom); + top = rb_time_val_cnt(top, cnt); + bottom = rb_time_val_cnt(bottom, cnt); + + rb_time_split(set, &top2, &bottom2); + top2 = rb_time_val_cnt(top2, cnt2); + bottom2 = rb_time_val_cnt(bottom2, cnt2); + + if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2)) + return false; + if (!rb_time_read_cmpxchg(&t->top, top, top2)) + return false; + if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2)) + return false; + return true; +} + +#else /* 64 bits */ + +/* local64_t always succeeds */ + +static inline bool rb_time_read(rb_time_t *t, u64 *ret) +{ + *ret = local64_read(&t->time); + return true; +} +static void rb_time_set(rb_time_t *t, u64 val) +{ + local64_set(&t->time, val); +} + +static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) +{ + u64 val; + val = local64_cmpxchg(&t->time, expect, set); + return val == expect; +} +#endif + /** * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer * @buffer: The ring_buffer to get the number of pages from @@ -2545,7 +2749,8 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, delta = rb_time_delta(event); - write_stamp = local64_read(&cpu_buffer->write_stamp); + if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp)) + return 0; /* Make sure the write stamp is read before testing the location */ barrier(); @@ -2554,11 +2759,10 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, unsigned long write_mask = local_read(&bpage->write) & ~RB_WRITE_MASK; unsigned long event_length = rb_event_length(event); - u64 ret; - ret = local64_cmpxchg(&cpu_buffer->write_stamp, write_stamp, write_stamp - delta); /* Something came in, can't discard */ - if (ret != write_stamp) + if (!rb_time_cmpxchg(&cpu_buffer->write_stamp, + write_stamp, write_stamp - delta)) return 0; /* @@ -2889,11 +3093,13 @@ static noinline void rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) { + u64 write_stamp; + WARN_ONCE(info->delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", (unsigned long long)info->delta, (unsigned long long)info->ts, - (unsigned long long)local64_read(&cpu_buffer->write_stamp), + (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0), sched_clock_stable() ? "" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" @@ -2909,14 +3115,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *tail_page; unsigned long tail, write, w; u64 before, after; + bool a_ok; + bool b_ok; /* Don't let the compiler play games with cpu_buffer->tail_page */ tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); /*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK; barrier(); - before = local64_read(&cpu_buffer->before_stamp); - after = local64_read(&cpu_buffer->write_stamp); + b_ok = rb_time_read(&cpu_buffer->before_stamp, &before); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &after); barrier(); info->ts = rb_time_stamp(cpu_buffer->buffer); @@ -2927,16 +3135,18 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, info->delta = info->ts - after; } - if (unlikely(test_time_stamp(info->delta))) { - rb_check_timestamp(cpu_buffer, info); - info->add_timestamp |= RB_ADD_STAMP_EXTEND; + if (likely(a_ok && b_ok)) { + if (unlikely(test_time_stamp(info->delta))) { + rb_check_timestamp(cpu_buffer, info); + info->add_timestamp |= RB_ADD_STAMP_EXTEND; + } } /* * If interrupting an event time update, we may need an absolute timestamp. * Don't bother if this is the start of a new page (w == 0). */ - if (unlikely(before != after && w)) + if (unlikely(!a_ok || !b_ok || (before != after && w))) info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND; /* @@ -2947,7 +3157,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(info->add_timestamp)) info->length += RB_LEN_TIME_EXTEND; - /*B*/ local64_set(&cpu_buffer->before_stamp, info->ts); + /*B*/ rb_time_set(&cpu_buffer->before_stamp, info->ts); /*C*/ write = local_add_return(info->length, &tail_page->write); @@ -2960,21 +3170,23 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(write > BUF_PAGE_SIZE)) { if (tail != w) { /* before and after may now different, fix it up*/ - before = local64_read(&cpu_buffer->before_stamp); - after = local64_read(&cpu_buffer->write_stamp); - if (before != after) - (void)local64_cmpxchg(&cpu_buffer->before_stamp, before, after); + b_ok = rb_time_read(&cpu_buffer->before_stamp, &before); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &after); + if (a_ok && b_ok && before != after) + (void)rb_time_cmpxchg(&cpu_buffer->before_stamp, before, after); } return rb_move_tail(cpu_buffer, tail, info); } if (likely(tail == w)) { u64 save_before; + bool s_ok; /* Nothing interrupted us between A and C */ - /*D*/ local64_set(&cpu_buffer->write_stamp, info->ts); + /*D*/ rb_time_set(&cpu_buffer->write_stamp, info->ts); barrier(); - /*E*/ save_before = local64_read(&cpu_buffer->before_stamp); + /*E*/ s_ok = rb_time_read(&cpu_buffer->before_stamp, &save_before); + RB_WARN_ON(cpu_buffer, !s_ok); if (likely(!(info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)))) /* This did not interrupt any time update */ @@ -2986,27 +3198,31 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(info->ts != save_before)) { /* SLOW PATH - Interrupted between C and E */ - after = local64_read(&cpu_buffer->write_stamp); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &after); + RB_WARN_ON(cpu_buffer, !a_ok); + /* Write stamp must only go forward */ if (save_before > after) { /* * We do not care about the result, only that * it gets updated atomically. */ - (void)local64_cmpxchg(&cpu_buffer->write_stamp, after, save_before); + (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, after, save_before); } } } else { u64 ts; /* SLOW PATH - Interrupted between A and C */ - after = local64_read(&cpu_buffer->write_stamp); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &after); + /* Was interrupted before here, write_stamp must be valid */ + RB_WARN_ON(cpu_buffer, !a_ok); ts = rb_time_stamp(cpu_buffer->buffer); barrier(); /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && after < ts) { /* Nothing came after this event between C and E */ info->delta = ts - after; - (void)local64_cmpxchg(&cpu_buffer->write_stamp, after, info->ts); + (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, after, info->ts); info->ts = ts; } else { /* @@ -4565,8 +4781,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->read = 0; cpu_buffer->read_bytes = 0; - local64_set(&cpu_buffer->write_stamp, 0); - local64_set(&cpu_buffer->before_stamp, 0); + rb_time_set(&cpu_buffer->write_stamp, 0); + rb_time_set(&cpu_buffer->before_stamp, 0); cpu_buffer->lost_events = 0; cpu_buffer->last_overrun = 0; -- cgit v1.2.3 From b23d7a5f4a07af02343cdd28fe1f7488bac3afda Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 25 Jun 2020 15:34:03 +1000 Subject: ring-buffer: speed up buffer resets by avoiding synchronize_rcu for each CPU On a 144 thread system, `perf ftrace` takes about 20 seconds to start up, due to calling synchronize_rcu() for each CPU. cat /proc/108560/stack 0xc0003e7eb336f470 __switch_to+0x2e0/0x480 __wait_rcu_gp+0x20c/0x220 synchronize_rcu+0x9c/0xc0 ring_buffer_reset_cpu+0x88/0x2e0 tracing_reset_online_cpus+0x84/0xe0 tracing_open+0x1d4/0x1f0 On a system with 10x more threads, it starts to become an annoyance. Batch these up so we disable all the per-cpu buffers first, then synchronize_rcu() once, then reset each of the buffers. This brings the time down to about 0.5s. Link: https://lkml.kernel.org/r/20200625053403.2386972-1-npiggin@gmail.com Tested-by: Anton Blanchard Acked-by: Paul E. McKenney Signed-off-by: Nicholas Piggin Signed-off-by: Steven Rostedt (VMware) --- include/linux/ring_buffer.h | 1 + kernel/trace/ring_buffer.c | 85 +++++++++++++++++++++++++++++++++++++-------- kernel/trace/trace.c | 4 +-- 3 files changed, 73 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index c76b2f3b3ac4..136ea0997e6d 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -143,6 +143,7 @@ bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter); unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu); void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu); +void ring_buffer_reset_online_cpus(struct trace_buffer *buffer); void ring_buffer_reset(struct trace_buffer *buffer); #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 802bb38d9c81..ed1941304f69 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -270,6 +270,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define for_each_buffer_cpu(buffer, cpu) \ for_each_cpu(cpu, buffer->cpumask) +#define for_each_online_buffer_cpu(buffer, cpu) \ + for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask) + #define TS_SHIFT 27 #define TS_MASK ((1ULL << TS_SHIFT) - 1) #define TS_DELTA_TEST (~TS_MASK) @@ -4790,6 +4793,26 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) rb_head_page_activate(cpu_buffer); } +/* Must have disabled the cpu buffer then done a synchronize_rcu */ +static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) + goto out; + + arch_spin_lock(&cpu_buffer->lock); + + rb_reset_cpu(cpu_buffer); + + arch_spin_unlock(&cpu_buffer->lock); + + out: + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); +} + /** * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer * @buffer: The ring buffer to reset a per cpu buffer of @@ -4798,7 +4821,6 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; - unsigned long flags; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; @@ -4809,24 +4831,42 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) /* Make sure all commits have finished */ synchronize_rcu(); - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + reset_disabled_cpu_buffer(cpu_buffer); - if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) - goto out; + atomic_dec(&cpu_buffer->record_disabled); + atomic_dec(&cpu_buffer->resize_disabled); +} +EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); - arch_spin_lock(&cpu_buffer->lock); +/** + * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer + * @buffer: The ring buffer to reset a per cpu buffer of + * @cpu: The CPU buffer to be reset + */ +void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu; - rb_reset_cpu(cpu_buffer); + for_each_online_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; - arch_spin_unlock(&cpu_buffer->lock); + atomic_inc(&cpu_buffer->resize_disabled); + atomic_inc(&cpu_buffer->record_disabled); + } - out: - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + /* Make sure all commits have finished */ + synchronize_rcu(); - atomic_dec(&cpu_buffer->record_disabled); - atomic_dec(&cpu_buffer->resize_disabled); + for_each_online_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + reset_disabled_cpu_buffer(cpu_buffer); + + atomic_dec(&cpu_buffer->record_disabled); + atomic_dec(&cpu_buffer->resize_disabled); + } } -EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); /** * ring_buffer_reset - reset a ring buffer @@ -4834,10 +4874,27 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); */ void ring_buffer_reset(struct trace_buffer *buffer) { + struct ring_buffer_per_cpu *cpu_buffer; int cpu; - for_each_buffer_cpu(buffer, cpu) - ring_buffer_reset_cpu(buffer, cpu); + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + atomic_inc(&cpu_buffer->resize_disabled); + atomic_inc(&cpu_buffer->record_disabled); + } + + /* Make sure all commits have finished */ + synchronize_rcu(); + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + reset_disabled_cpu_buffer(cpu_buffer); + + atomic_dec(&cpu_buffer->record_disabled); + atomic_dec(&cpu_buffer->resize_disabled); + } } EXPORT_SYMBOL_GPL(ring_buffer_reset); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 64c5b8146cca..4aab712f9567 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2003,7 +2003,6 @@ static void tracing_reset_cpu(struct array_buffer *buf, int cpu) void tracing_reset_online_cpus(struct array_buffer *buf) { struct trace_buffer *buffer = buf->buffer; - int cpu; if (!buffer) return; @@ -2015,8 +2014,7 @@ void tracing_reset_online_cpus(struct array_buffer *buf) buf->time_start = buffer_ftrace_now(buf, buf->cpu); - for_each_online_cpu(cpu) - ring_buffer_reset_cpu(buffer, cpu); + ring_buffer_reset_online_cpus(buffer); ring_buffer_record_enable(buffer); } -- cgit v1.2.3 From 75b21c6dfa2d816bcabac24a530d9cffc12092b8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 30 Jun 2020 09:04:35 -0400 Subject: ring-buffer: Mark the !tail (crossing a page) as unlikely It is the uncommon case where an event crosses a sub buffer boundary (page) mark that check at the end of reserving an event as unlikely. Suggested-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ed1941304f69..888bc9177937 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3260,7 +3260,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * If this is the first commit on the page, then update * its timestamp. */ - if (!tail) + if (unlikely(!tail)) tail_page->page->time_stamp = info->ts; /* account for these added bytes */ -- cgit v1.2.3 From fb37409a01b011a664347702f44dbf13fa7c7486 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 10 Jun 2020 09:45:20 +0300 Subject: arch: remove unicore32 port The unicore32 port do not seem maintained for a long time now, there is no upstream toolchain that can create unicore32 binaries and all the links to prebuilt toolchains for unicore32 are dead. Even compilers that were available are not supported by the kernel anymore. Guenter Roeck says: I have stopped building unicore32 images since v4.19 since there is no available compiler that is still supported by the kernel. I am surprised that support for it has not been removed from the kernel. Remove unicore32 port. Signed-off-by: Mike Rapoport Acked-by: Arnd Bergmann Acked-by: Guenter Roeck --- .../features/core/cBPF-JIT/arch-support.txt | 1 - .../features/core/eBPF-JIT/arch-support.txt | 1 - .../core/generic-idle-thread/arch-support.txt | 1 - .../features/core/jump-labels/arch-support.txt | 1 - .../features/core/tracehook/arch-support.txt | 1 - .../features/debug/KASAN/arch-support.txt | 1 - .../debug/debug-vm-pgtable/arch-support.txt | 1 - .../debug/gcov-profile-all/arch-support.txt | 1 - Documentation/features/debug/kgdb/arch-support.txt | 1 - .../debug/kprobes-on-ftrace/arch-support.txt | 1 - .../features/debug/kprobes/arch-support.txt | 1 - .../features/debug/kretprobes/arch-support.txt | 1 - .../features/debug/optprobes/arch-support.txt | 1 - .../features/debug/stackprotector/arch-support.txt | 1 - .../features/debug/uprobes/arch-support.txt | 1 - .../debug/user-ret-profiler/arch-support.txt | 1 - .../features/io/dma-contiguous/arch-support.txt | 1 - .../locking/cmpxchg-local/arch-support.txt | 1 - .../features/locking/lockdep/arch-support.txt | 1 - .../locking/queued-rwlocks/arch-support.txt | 1 - .../locking/queued-spinlocks/arch-support.txt | 1 - .../features/perf/kprobes-event/arch-support.txt | 1 - .../features/perf/perf-regs/arch-support.txt | 1 - .../features/perf/perf-stackdump/arch-support.txt | 1 - .../sched/membarrier-sync-core/arch-support.txt | 1 - .../features/sched/numa-balancing/arch-support.txt | 1 - .../seccomp/seccomp-filter/arch-support.txt | 1 - .../time/arch-tick-broadcast/arch-support.txt | 1 - .../features/time/clockevents/arch-support.txt | 1 - .../time/context-tracking/arch-support.txt | 1 - .../features/time/irq-time-acct/arch-support.txt | 1 - .../time/modern-timekeeping/arch-support.txt | 1 - .../features/time/virt-cpuacct/arch-support.txt | 1 - .../features/vm/ELF-ASLR/arch-support.txt | 1 - .../features/vm/PG_uncached/arch-support.txt | 1 - Documentation/features/vm/THP/arch-support.txt | 1 - Documentation/features/vm/TLB/arch-support.txt | 1 - .../features/vm/huge-vmap/arch-support.txt | 1 - .../features/vm/ioremap_prot/arch-support.txt | 1 - .../features/vm/pte_special/arch-support.txt | 1 - MAINTAINERS | 7 - arch/unicore32/.gitignore | 22 - arch/unicore32/Kconfig | 200 ----- arch/unicore32/Kconfig.debug | 29 - arch/unicore32/Makefile | 59 -- arch/unicore32/boot/Makefile | 39 - arch/unicore32/boot/compressed/Makefile | 64 -- arch/unicore32/boot/compressed/head.S | 201 ------ arch/unicore32/boot/compressed/misc.c | 123 ---- arch/unicore32/boot/compressed/piggy.S.in | 6 - arch/unicore32/boot/compressed/vmlinux.lds.S | 58 -- arch/unicore32/configs/defconfig | 214 ------ arch/unicore32/include/asm/Kbuild | 7 - arch/unicore32/include/asm/assembler.h | 128 ---- arch/unicore32/include/asm/barrier.h | 16 - arch/unicore32/include/asm/bitops.h | 46 -- arch/unicore32/include/asm/bug.h | 20 - arch/unicore32/include/asm/cache.h | 24 - arch/unicore32/include/asm/cacheflush.h | 186 ----- arch/unicore32/include/asm/checksum.h | 38 - arch/unicore32/include/asm/cmpxchg.h | 58 -- arch/unicore32/include/asm/cpu-single.h | 42 -- arch/unicore32/include/asm/cputype.h | 30 - arch/unicore32/include/asm/delay.h | 49 -- arch/unicore32/include/asm/dma.h | 20 - arch/unicore32/include/asm/elf.h | 90 --- arch/unicore32/include/asm/fpstate.h | 23 - arch/unicore32/include/asm/fpu-ucf64.h | 50 -- arch/unicore32/include/asm/gpio.h | 101 --- arch/unicore32/include/asm/hwcap.h | 29 - arch/unicore32/include/asm/hwdef-copro.h | 45 -- arch/unicore32/include/asm/io.h | 69 -- arch/unicore32/include/asm/irq.h | 102 --- arch/unicore32/include/asm/irqflags.h | 50 -- arch/unicore32/include/asm/linkage.h | 19 - arch/unicore32/include/asm/memblock.h | 43 -- arch/unicore32/include/asm/memory.h | 102 --- arch/unicore32/include/asm/mmu.h | 14 - arch/unicore32/include/asm/mmu_context.h | 98 --- arch/unicore32/include/asm/page.h | 74 -- arch/unicore32/include/asm/pci.h | 20 - arch/unicore32/include/asm/pgalloc.h | 87 --- arch/unicore32/include/asm/pgtable-hwdef.h | 51 -- arch/unicore32/include/asm/pgtable.h | 267 ------- arch/unicore32/include/asm/processor.h | 74 -- arch/unicore32/include/asm/ptrace.h | 58 -- arch/unicore32/include/asm/stacktrace.h | 28 - arch/unicore32/include/asm/string.h | 35 - arch/unicore32/include/asm/suspend.h | 26 - arch/unicore32/include/asm/switch_to.h | 27 - arch/unicore32/include/asm/syscall.h | 12 - arch/unicore32/include/asm/thread_info.h | 133 ---- arch/unicore32/include/asm/timex.h | 31 - arch/unicore32/include/asm/tlb.h | 24 - arch/unicore32/include/asm/tlbflush.h | 192 ----- arch/unicore32/include/asm/traps.h | 18 - arch/unicore32/include/asm/uaccess.h | 38 - arch/unicore32/include/asm/vmalloc.h | 4 - arch/unicore32/include/mach/PKUnity.h | 95 --- arch/unicore32/include/mach/bitfield.h | 21 - arch/unicore32/include/mach/dma.h | 45 -- arch/unicore32/include/mach/hardware.h | 30 - arch/unicore32/include/mach/map.h | 17 - arch/unicore32/include/mach/memory.h | 54 -- arch/unicore32/include/mach/ocd.h | 33 - arch/unicore32/include/mach/pm.h | 37 - arch/unicore32/include/mach/regs-ac97.h | 33 - arch/unicore32/include/mach/regs-dmac.h | 82 --- arch/unicore32/include/mach/regs-gpio.h | 71 -- arch/unicore32/include/mach/regs-i2c.h | 64 -- arch/unicore32/include/mach/regs-intc.h | 29 - arch/unicore32/include/mach/regs-nand.h | 80 -- arch/unicore32/include/mach/regs-ost.h | 91 --- arch/unicore32/include/mach/regs-pci.h | 95 --- arch/unicore32/include/mach/regs-pm.h | 127 ---- arch/unicore32/include/mach/regs-ps2.h | 21 - arch/unicore32/include/mach/regs-resetc.h | 35 - arch/unicore32/include/mach/regs-rtc.h | 38 - arch/unicore32/include/mach/regs-sdc.h | 157 ---- arch/unicore32/include/mach/regs-spi.h | 99 --- arch/unicore32/include/mach/regs-uart.h | 3 - arch/unicore32/include/mach/regs-umal.h | 230 ------ arch/unicore32/include/mach/regs-unigfx.h | 201 ------ arch/unicore32/include/mach/uncompress.h | 31 - arch/unicore32/include/uapi/asm/Kbuild | 2 - arch/unicore32/include/uapi/asm/byteorder.h | 25 - arch/unicore32/include/uapi/asm/ptrace.h | 91 --- arch/unicore32/include/uapi/asm/sigcontext.h | 30 - arch/unicore32/include/uapi/asm/unistd.h | 21 - arch/unicore32/kernel/Makefile | 31 - arch/unicore32/kernel/asm-offsets.c | 108 --- arch/unicore32/kernel/clock.c | 387 ---------- arch/unicore32/kernel/debug-macro.S | 86 --- arch/unicore32/kernel/debug.S | 82 --- arch/unicore32/kernel/dma.c | 179 ----- arch/unicore32/kernel/early_printk.c | 46 -- arch/unicore32/kernel/elf.c | 35 - arch/unicore32/kernel/entry.S | 802 --------------------- arch/unicore32/kernel/fpu-ucf64.c | 117 --- arch/unicore32/kernel/gpio.c | 121 ---- arch/unicore32/kernel/head.S | 249 ------- arch/unicore32/kernel/hibernate.c | 159 ---- arch/unicore32/kernel/hibernate_asm.S | 114 --- arch/unicore32/kernel/irq.c | 371 ---------- arch/unicore32/kernel/ksyms.c | 57 -- arch/unicore32/kernel/ksyms.h | 14 - arch/unicore32/kernel/module.c | 105 --- arch/unicore32/kernel/pci.c | 371 ---------- arch/unicore32/kernel/pm.c | 121 ---- arch/unicore32/kernel/process.c | 319 -------- arch/unicore32/kernel/ptrace.c | 147 ---- arch/unicore32/kernel/puv3-core.c | 276 ------- arch/unicore32/kernel/puv3-nb0916.c | 147 ---- arch/unicore32/kernel/setup.c | 352 --------- arch/unicore32/kernel/setup.h | 36 - arch/unicore32/kernel/signal.c | 424 ----------- arch/unicore32/kernel/sleep.S | 199 ----- arch/unicore32/kernel/stacktrace.c | 127 ---- arch/unicore32/kernel/sys.c | 37 - arch/unicore32/kernel/time.c | 128 ---- arch/unicore32/kernel/traps.c | 322 --------- arch/unicore32/kernel/vmlinux.lds.S | 59 -- arch/unicore32/lib/Makefile | 28 - arch/unicore32/lib/backtrace.S | 168 ----- arch/unicore32/lib/clear_user.S | 54 -- arch/unicore32/lib/copy_from_user.S | 101 --- arch/unicore32/lib/copy_page.S | 36 - arch/unicore32/lib/copy_template.S | 211 ------ arch/unicore32/lib/copy_to_user.S | 93 --- arch/unicore32/lib/delay.S | 48 -- arch/unicore32/lib/findbit.S | 97 --- arch/unicore32/lib/strncpy_from_user.S | 42 -- arch/unicore32/lib/strnlen_user.S | 39 - arch/unicore32/mm/Kconfig | 41 -- arch/unicore32/mm/Makefile | 14 - arch/unicore32/mm/alignment.c | 524 -------------- arch/unicore32/mm/cache-ucv2.S | 209 ------ arch/unicore32/mm/extable.c | 21 - arch/unicore32/mm/fault.c | 481 ------------ arch/unicore32/mm/flush.c | 94 --- arch/unicore32/mm/init.c | 261 ------- arch/unicore32/mm/ioremap.c | 242 ------- arch/unicore32/mm/mm.h | 31 - arch/unicore32/mm/mmu.c | 513 ------------- arch/unicore32/mm/pgd.c | 102 --- arch/unicore32/mm/proc-macros.S | 142 ---- arch/unicore32/mm/proc-syms.c | 19 - arch/unicore32/mm/proc-ucv2.S | 131 ---- arch/unicore32/mm/tlb-ucv2.S | 86 --- kernel/reboot.c | 2 +- 190 files changed, 1 insertion(+), 15705 deletions(-) delete mode 100644 arch/unicore32/.gitignore delete mode 100644 arch/unicore32/Kconfig delete mode 100644 arch/unicore32/Kconfig.debug delete mode 100644 arch/unicore32/Makefile delete mode 100644 arch/unicore32/boot/Makefile delete mode 100644 arch/unicore32/boot/compressed/Makefile delete mode 100644 arch/unicore32/boot/compressed/head.S delete mode 100644 arch/unicore32/boot/compressed/misc.c delete mode 100644 arch/unicore32/boot/compressed/piggy.S.in delete mode 100644 arch/unicore32/boot/compressed/vmlinux.lds.S delete mode 100644 arch/unicore32/configs/defconfig delete mode 100644 arch/unicore32/include/asm/Kbuild delete mode 100644 arch/unicore32/include/asm/assembler.h delete mode 100644 arch/unicore32/include/asm/barrier.h delete mode 100644 arch/unicore32/include/asm/bitops.h delete mode 100644 arch/unicore32/include/asm/bug.h delete mode 100644 arch/unicore32/include/asm/cache.h delete mode 100644 arch/unicore32/include/asm/cacheflush.h delete mode 100644 arch/unicore32/include/asm/checksum.h delete mode 100644 arch/unicore32/include/asm/cmpxchg.h delete mode 100644 arch/unicore32/include/asm/cpu-single.h delete mode 100644 arch/unicore32/include/asm/cputype.h delete mode 100644 arch/unicore32/include/asm/delay.h delete mode 100644 arch/unicore32/include/asm/dma.h delete mode 100644 arch/unicore32/include/asm/elf.h delete mode 100644 arch/unicore32/include/asm/fpstate.h delete mode 100644 arch/unicore32/include/asm/fpu-ucf64.h delete mode 100644 arch/unicore32/include/asm/gpio.h delete mode 100644 arch/unicore32/include/asm/hwcap.h delete mode 100644 arch/unicore32/include/asm/hwdef-copro.h delete mode 100644 arch/unicore32/include/asm/io.h delete mode 100644 arch/unicore32/include/asm/irq.h delete mode 100644 arch/unicore32/include/asm/irqflags.h delete mode 100644 arch/unicore32/include/asm/linkage.h delete mode 100644 arch/unicore32/include/asm/memblock.h delete mode 100644 arch/unicore32/include/asm/memory.h delete mode 100644 arch/unicore32/include/asm/mmu.h delete mode 100644 arch/unicore32/include/asm/mmu_context.h delete mode 100644 arch/unicore32/include/asm/page.h delete mode 100644 arch/unicore32/include/asm/pci.h delete mode 100644 arch/unicore32/include/asm/pgalloc.h delete mode 100644 arch/unicore32/include/asm/pgtable-hwdef.h delete mode 100644 arch/unicore32/include/asm/pgtable.h delete mode 100644 arch/unicore32/include/asm/processor.h delete mode 100644 arch/unicore32/include/asm/ptrace.h delete mode 100644 arch/unicore32/include/asm/stacktrace.h delete mode 100644 arch/unicore32/include/asm/string.h delete mode 100644 arch/unicore32/include/asm/suspend.h delete mode 100644 arch/unicore32/include/asm/switch_to.h delete mode 100644 arch/unicore32/include/asm/syscall.h delete mode 100644 arch/unicore32/include/asm/thread_info.h delete mode 100644 arch/unicore32/include/asm/timex.h delete mode 100644 arch/unicore32/include/asm/tlb.h delete mode 100644 arch/unicore32/include/asm/tlbflush.h delete mode 100644 arch/unicore32/include/asm/traps.h delete mode 100644 arch/unicore32/include/asm/uaccess.h delete mode 100644 arch/unicore32/include/asm/vmalloc.h delete mode 100644 arch/unicore32/include/mach/PKUnity.h delete mode 100644 arch/unicore32/include/mach/bitfield.h delete mode 100644 arch/unicore32/include/mach/dma.h delete mode 100644 arch/unicore32/include/mach/hardware.h delete mode 100644 arch/unicore32/include/mach/map.h delete mode 100644 arch/unicore32/include/mach/memory.h delete mode 100644 arch/unicore32/include/mach/ocd.h delete mode 100644 arch/unicore32/include/mach/pm.h delete mode 100644 arch/unicore32/include/mach/regs-ac97.h delete mode 100644 arch/unicore32/include/mach/regs-dmac.h delete mode 100644 arch/unicore32/include/mach/regs-gpio.h delete mode 100644 arch/unicore32/include/mach/regs-i2c.h delete mode 100644 arch/unicore32/include/mach/regs-intc.h delete mode 100644 arch/unicore32/include/mach/regs-nand.h delete mode 100644 arch/unicore32/include/mach/regs-ost.h delete mode 100644 arch/unicore32/include/mach/regs-pci.h delete mode 100644 arch/unicore32/include/mach/regs-pm.h delete mode 100644 arch/unicore32/include/mach/regs-ps2.h delete mode 100644 arch/unicore32/include/mach/regs-resetc.h delete mode 100644 arch/unicore32/include/mach/regs-rtc.h delete mode 100644 arch/unicore32/include/mach/regs-sdc.h delete mode 100644 arch/unicore32/include/mach/regs-spi.h delete mode 100644 arch/unicore32/include/mach/regs-uart.h delete mode 100644 arch/unicore32/include/mach/regs-umal.h delete mode 100644 arch/unicore32/include/mach/regs-unigfx.h delete mode 100644 arch/unicore32/include/mach/uncompress.h delete mode 100644 arch/unicore32/include/uapi/asm/Kbuild delete mode 100644 arch/unicore32/include/uapi/asm/byteorder.h delete mode 100644 arch/unicore32/include/uapi/asm/ptrace.h delete mode 100644 arch/unicore32/include/uapi/asm/sigcontext.h delete mode 100644 arch/unicore32/include/uapi/asm/unistd.h delete mode 100644 arch/unicore32/kernel/Makefile delete mode 100644 arch/unicore32/kernel/asm-offsets.c delete mode 100644 arch/unicore32/kernel/clock.c delete mode 100644 arch/unicore32/kernel/debug-macro.S delete mode 100644 arch/unicore32/kernel/debug.S delete mode 100644 arch/unicore32/kernel/dma.c delete mode 100644 arch/unicore32/kernel/early_printk.c delete mode 100644 arch/unicore32/kernel/elf.c delete mode 100644 arch/unicore32/kernel/entry.S delete mode 100644 arch/unicore32/kernel/fpu-ucf64.c delete mode 100644 arch/unicore32/kernel/gpio.c delete mode 100644 arch/unicore32/kernel/head.S delete mode 100644 arch/unicore32/kernel/hibernate.c delete mode 100644 arch/unicore32/kernel/hibernate_asm.S delete mode 100644 arch/unicore32/kernel/irq.c delete mode 100644 arch/unicore32/kernel/ksyms.c delete mode 100644 arch/unicore32/kernel/ksyms.h delete mode 100644 arch/unicore32/kernel/module.c delete mode 100644 arch/unicore32/kernel/pci.c delete mode 100644 arch/unicore32/kernel/pm.c delete mode 100644 arch/unicore32/kernel/process.c delete mode 100644 arch/unicore32/kernel/ptrace.c delete mode 100644 arch/unicore32/kernel/puv3-core.c delete mode 100644 arch/unicore32/kernel/puv3-nb0916.c delete mode 100644 arch/unicore32/kernel/setup.c delete mode 100644 arch/unicore32/kernel/setup.h delete mode 100644 arch/unicore32/kernel/signal.c delete mode 100644 arch/unicore32/kernel/sleep.S delete mode 100644 arch/unicore32/kernel/stacktrace.c delete mode 100644 arch/unicore32/kernel/sys.c delete mode 100644 arch/unicore32/kernel/time.c delete mode 100644 arch/unicore32/kernel/traps.c delete mode 100644 arch/unicore32/kernel/vmlinux.lds.S delete mode 100644 arch/unicore32/lib/Makefile delete mode 100644 arch/unicore32/lib/backtrace.S delete mode 100644 arch/unicore32/lib/clear_user.S delete mode 100644 arch/unicore32/lib/copy_from_user.S delete mode 100644 arch/unicore32/lib/copy_page.S delete mode 100644 arch/unicore32/lib/copy_template.S delete mode 100644 arch/unicore32/lib/copy_to_user.S delete mode 100644 arch/unicore32/lib/delay.S delete mode 100644 arch/unicore32/lib/findbit.S delete mode 100644 arch/unicore32/lib/strncpy_from_user.S delete mode 100644 arch/unicore32/lib/strnlen_user.S delete mode 100644 arch/unicore32/mm/Kconfig delete mode 100644 arch/unicore32/mm/Makefile delete mode 100644 arch/unicore32/mm/alignment.c delete mode 100644 arch/unicore32/mm/cache-ucv2.S delete mode 100644 arch/unicore32/mm/extable.c delete mode 100644 arch/unicore32/mm/fault.c delete mode 100644 arch/unicore32/mm/flush.c delete mode 100644 arch/unicore32/mm/init.c delete mode 100644 arch/unicore32/mm/ioremap.c delete mode 100644 arch/unicore32/mm/mm.h delete mode 100644 arch/unicore32/mm/mmu.c delete mode 100644 arch/unicore32/mm/pgd.c delete mode 100644 arch/unicore32/mm/proc-macros.S delete mode 100644 arch/unicore32/mm/proc-syms.c delete mode 100644 arch/unicore32/mm/proc-ucv2.S delete mode 100644 arch/unicore32/mm/tlb-ucv2.S (limited to 'kernel') diff --git a/Documentation/features/core/cBPF-JIT/arch-support.txt b/Documentation/features/core/cBPF-JIT/arch-support.txt index 8620c38d4db0..399935616813 100644 --- a/Documentation/features/core/cBPF-JIT/arch-support.txt +++ b/Documentation/features/core/cBPF-JIT/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | TODO | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/core/eBPF-JIT/arch-support.txt b/Documentation/features/core/eBPF-JIT/arch-support.txt index 9ed964f65224..79409bfe0263 100644 --- a/Documentation/features/core/eBPF-JIT/arch-support.txt +++ b/Documentation/features/core/eBPF-JIT/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/core/generic-idle-thread/arch-support.txt b/Documentation/features/core/generic-idle-thread/arch-support.txt index 365df2c2ff0b..9ea60e416efd 100644 --- a/Documentation/features/core/generic-idle-thread/arch-support.txt +++ b/Documentation/features/core/generic-idle-thread/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | ok | ----------------------- diff --git a/Documentation/features/core/jump-labels/arch-support.txt b/Documentation/features/core/jump-labels/arch-support.txt index 632a1c7aefa2..f8ec5c13cde4 100644 --- a/Documentation/features/core/jump-labels/arch-support.txt +++ b/Documentation/features/core/jump-labels/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | ok | ----------------------- diff --git a/Documentation/features/core/tracehook/arch-support.txt b/Documentation/features/core/tracehook/arch-support.txt index 964667052eda..cd3510e2eedb 100644 --- a/Documentation/features/core/tracehook/arch-support.txt +++ b/Documentation/features/core/tracehook/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | ok | ----------------------- diff --git a/Documentation/features/debug/KASAN/arch-support.txt b/Documentation/features/debug/KASAN/arch-support.txt index 6ff38548923e..c3fe9b266e7b 100644 --- a/Documentation/features/debug/KASAN/arch-support.txt +++ b/Documentation/features/debug/KASAN/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | ok | ----------------------- diff --git a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt index c527d05c0459..ca6bacb1e99e 100644 --- a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt +++ b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/debug/gcov-profile-all/arch-support.txt b/Documentation/features/debug/gcov-profile-all/arch-support.txt index 210256f6a4cf..7563a494ddb8 100644 --- a/Documentation/features/debug/gcov-profile-all/arch-support.txt +++ b/Documentation/features/debug/gcov-profile-all/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/debug/kgdb/arch-support.txt b/Documentation/features/debug/kgdb/arch-support.txt index 38c40cfa0578..4b0a1d0d6ba4 100644 --- a/Documentation/features/debug/kgdb/arch-support.txt +++ b/Documentation/features/debug/kgdb/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt index 97cd7aa74905..6225cfe0c5bf 100644 --- a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt +++ b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/debug/kprobes/arch-support.txt b/Documentation/features/debug/kprobes/arch-support.txt index 8b316c6e03d4..371f0ac488f5 100644 --- a/Documentation/features/debug/kprobes/arch-support.txt +++ b/Documentation/features/debug/kprobes/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/debug/kretprobes/arch-support.txt b/Documentation/features/debug/kretprobes/arch-support.txt index b805aada395e..38e95251deed 100644 --- a/Documentation/features/debug/kretprobes/arch-support.txt +++ b/Documentation/features/debug/kretprobes/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/debug/optprobes/arch-support.txt b/Documentation/features/debug/optprobes/arch-support.txt index fb297a88f62c..7f4a20e6a12b 100644 --- a/Documentation/features/debug/optprobes/arch-support.txt +++ b/Documentation/features/debug/optprobes/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/debug/stackprotector/arch-support.txt b/Documentation/features/debug/stackprotector/arch-support.txt index 12410f606edc..3db4763aa3f5 100644 --- a/Documentation/features/debug/stackprotector/arch-support.txt +++ b/Documentation/features/debug/stackprotector/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | ok | ----------------------- diff --git a/Documentation/features/debug/uprobes/arch-support.txt b/Documentation/features/debug/uprobes/arch-support.txt index be8acbb95b54..43cac6ee0c68 100644 --- a/Documentation/features/debug/uprobes/arch-support.txt +++ b/Documentation/features/debug/uprobes/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/debug/user-ret-profiler/arch-support.txt b/Documentation/features/debug/user-ret-profiler/arch-support.txt index 6bfa36b0e017..d636ed0e679f 100644 --- a/Documentation/features/debug/user-ret-profiler/arch-support.txt +++ b/Documentation/features/debug/user-ret-profiler/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/io/dma-contiguous/arch-support.txt b/Documentation/features/io/dma-contiguous/arch-support.txt index 895c3b0f6492..dfc93d074e3d 100644 --- a/Documentation/features/io/dma-contiguous/arch-support.txt +++ b/Documentation/features/io/dma-contiguous/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | ok | ----------------------- diff --git a/Documentation/features/locking/cmpxchg-local/arch-support.txt b/Documentation/features/locking/cmpxchg-local/arch-support.txt index 242ff5a6586e..1815c7fed06d 100644 --- a/Documentation/features/locking/cmpxchg-local/arch-support.txt +++ b/Documentation/features/locking/cmpxchg-local/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/locking/lockdep/arch-support.txt b/Documentation/features/locking/lockdep/arch-support.txt index 98cb9d85c55d..4f844ecd0680 100644 --- a/Documentation/features/locking/lockdep/arch-support.txt +++ b/Documentation/features/locking/lockdep/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | ok | | um: | ok | - | unicore32: | ok | | x86: | ok | | xtensa: | ok | ----------------------- diff --git a/Documentation/features/locking/queued-rwlocks/arch-support.txt b/Documentation/features/locking/queued-rwlocks/arch-support.txt index ee922746a64c..5c6bcfcf8e1f 100644 --- a/Documentation/features/locking/queued-rwlocks/arch-support.txt +++ b/Documentation/features/locking/queued-rwlocks/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | ok | ----------------------- diff --git a/Documentation/features/locking/queued-spinlocks/arch-support.txt b/Documentation/features/locking/queued-spinlocks/arch-support.txt index c52116c1a049..b55e420a34ea 100644 --- a/Documentation/features/locking/queued-spinlocks/arch-support.txt +++ b/Documentation/features/locking/queued-spinlocks/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | ok | ----------------------- diff --git a/Documentation/features/perf/kprobes-event/arch-support.txt b/Documentation/features/perf/kprobes-event/arch-support.txt index 518f352fc727..04c17c2106a4 100644 --- a/Documentation/features/perf/kprobes-event/arch-support.txt +++ b/Documentation/features/perf/kprobes-event/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/perf/perf-regs/arch-support.txt b/Documentation/features/perf/perf-regs/arch-support.txt index c22cd6f8aa5e..e7450fbb8253 100644 --- a/Documentation/features/perf/perf-regs/arch-support.txt +++ b/Documentation/features/perf/perf-regs/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/perf/perf-stackdump/arch-support.txt b/Documentation/features/perf/perf-stackdump/arch-support.txt index 527fe4d0b074..98e79d128d9b 100644 --- a/Documentation/features/perf/perf-stackdump/arch-support.txt +++ b/Documentation/features/perf/perf-stackdump/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/sched/membarrier-sync-core/arch-support.txt b/Documentation/features/sched/membarrier-sync-core/arch-support.txt index 8a521a622966..68658a6f8c5b 100644 --- a/Documentation/features/sched/membarrier-sync-core/arch-support.txt +++ b/Documentation/features/sched/membarrier-sync-core/arch-support.txt @@ -51,7 +51,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/sched/numa-balancing/arch-support.txt b/Documentation/features/sched/numa-balancing/arch-support.txt index 350823692f28..964457ad26c1 100644 --- a/Documentation/features/sched/numa-balancing/arch-support.txt +++ b/Documentation/features/sched/numa-balancing/arch-support.txt @@ -28,7 +28,6 @@ | sh: | .. | | sparc: | TODO | | um: | .. | - | unicore32: | .. | | x86: | ok | | xtensa: | .. | ----------------------- diff --git a/Documentation/features/seccomp/seccomp-filter/arch-support.txt b/Documentation/features/seccomp/seccomp-filter/arch-support.txt index c7b837f735b1..f54ddfc06a12 100644 --- a/Documentation/features/seccomp/seccomp-filter/arch-support.txt +++ b/Documentation/features/seccomp/seccomp-filter/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | ok | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/time/arch-tick-broadcast/arch-support.txt b/Documentation/features/time/arch-tick-broadcast/arch-support.txt index 593536f7925b..4d11cbb3c09b 100644 --- a/Documentation/features/time/arch-tick-broadcast/arch-support.txt +++ b/Documentation/features/time/arch-tick-broadcast/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | TODO | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/time/clockevents/arch-support.txt b/Documentation/features/time/clockevents/arch-support.txt index 7a27157da408..8287b6aa522e 100644 --- a/Documentation/features/time/clockevents/arch-support.txt +++ b/Documentation/features/time/clockevents/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | ok | | um: | ok | - | unicore32: | ok | | x86: | ok | | xtensa: | ok | ----------------------- diff --git a/Documentation/features/time/context-tracking/arch-support.txt b/Documentation/features/time/context-tracking/arch-support.txt index 048bfb6d3872..a71f3a945285 100644 --- a/Documentation/features/time/context-tracking/arch-support.txt +++ b/Documentation/features/time/context-tracking/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/time/irq-time-acct/arch-support.txt b/Documentation/features/time/irq-time-acct/arch-support.txt index a14bbad8e948..d9082b91f10e 100644 --- a/Documentation/features/time/irq-time-acct/arch-support.txt +++ b/Documentation/features/time/irq-time-acct/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | .. | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | ok | ----------------------- diff --git a/Documentation/features/time/modern-timekeeping/arch-support.txt b/Documentation/features/time/modern-timekeeping/arch-support.txt index 1d46da165b75..a84c3b9d9a94 100644 --- a/Documentation/features/time/modern-timekeeping/arch-support.txt +++ b/Documentation/features/time/modern-timekeeping/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | ok | | um: | ok | - | unicore32: | ok | | x86: | ok | | xtensa: | ok | ----------------------- diff --git a/Documentation/features/time/virt-cpuacct/arch-support.txt b/Documentation/features/time/virt-cpuacct/arch-support.txt index fb0d0cab9cab..56b372da6b01 100644 --- a/Documentation/features/time/virt-cpuacct/arch-support.txt +++ b/Documentation/features/time/virt-cpuacct/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/vm/ELF-ASLR/arch-support.txt b/Documentation/features/vm/ELF-ASLR/arch-support.txt index adc25878d217..eccda0732474 100644 --- a/Documentation/features/vm/ELF-ASLR/arch-support.txt +++ b/Documentation/features/vm/ELF-ASLR/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/vm/PG_uncached/arch-support.txt b/Documentation/features/vm/PG_uncached/arch-support.txt index f05588f9e4b4..c74e3f8040e1 100644 --- a/Documentation/features/vm/PG_uncached/arch-support.txt +++ b/Documentation/features/vm/PG_uncached/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/vm/THP/arch-support.txt b/Documentation/features/vm/THP/arch-support.txt index cdfe8925f881..1c0b95f2b40d 100644 --- a/Documentation/features/vm/THP/arch-support.txt +++ b/Documentation/features/vm/THP/arch-support.txt @@ -28,7 +28,6 @@ | sh: | .. | | sparc: | ok | | um: | .. | - | unicore32: | .. | | x86: | ok | | xtensa: | .. | ----------------------- diff --git a/Documentation/features/vm/TLB/arch-support.txt b/Documentation/features/vm/TLB/arch-support.txt index 2bdd3b6cee3c..30f75a79ce01 100644 --- a/Documentation/features/vm/TLB/arch-support.txt +++ b/Documentation/features/vm/TLB/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | .. | - | unicore32: | .. | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/vm/huge-vmap/arch-support.txt b/Documentation/features/vm/huge-vmap/arch-support.txt index 8525f1981f19..c5ff3a427722 100644 --- a/Documentation/features/vm/huge-vmap/arch-support.txt +++ b/Documentation/features/vm/huge-vmap/arch-support.txt @@ -28,7 +28,6 @@ | sh: | TODO | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/vm/ioremap_prot/arch-support.txt b/Documentation/features/vm/ioremap_prot/arch-support.txt index 3a6b87de6a19..1cb7406cd858 100644 --- a/Documentation/features/vm/ioremap_prot/arch-support.txt +++ b/Documentation/features/vm/ioremap_prot/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | TODO | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/vm/pte_special/arch-support.txt b/Documentation/features/vm/pte_special/arch-support.txt index 2e017387e228..13d0e1e17001 100644 --- a/Documentation/features/vm/pte_special/arch-support.txt +++ b/Documentation/features/vm/pte_special/arch-support.txt @@ -28,7 +28,6 @@ | sh: | ok | | sparc: | ok | | um: | TODO | - | unicore32: | TODO | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/MAINTAINERS b/MAINTAINERS index 496fd4eafb68..1de95aa44bbb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17532,13 +17532,6 @@ L: linux-fsdevel@vger.kernel.org S: Supported F: fs/unicode/ -UNICORE32 ARCHITECTURE -M: Guan Xuetao -S: Maintained -W: http://mprc.pku.edu.cn/~guanxuetao/linux -T: git git://github.com/gxt/linux.git -F: arch/unicore32/ - UNIFDEF M: Tony Finch S: Maintained diff --git a/arch/unicore32/.gitignore b/arch/unicore32/.gitignore deleted file mode 100644 index e82f3fb57ba0..000000000000 --- a/arch/unicore32/.gitignore +++ /dev/null @@ -1,22 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# Generated include files -# -include/generated -# -# Generated ld script file -# -kernel/vmlinux.lds -# -# Generated images in boot -# -boot/Image -boot/zImage -boot/uImage -# -# Generated files in boot/compressed -# -boot/compressed/piggy.S -boot/compressed/piggy.gzip -boot/compressed/vmlinux -boot/compressed/vmlinux.lds diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig deleted file mode 100644 index 11ba1839d198..000000000000 --- a/arch/unicore32/Kconfig +++ /dev/null @@ -1,200 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -config UNICORE32 - def_bool y - select ARCH_32BIT_OFF_T - select ARCH_HAS_DEVMEM_IS_ALLOWED - select ARCH_HAS_KEEPINITRD - select ARCH_MIGHT_HAVE_PC_PARPORT - select ARCH_MIGHT_HAVE_PC_SERIO - select HAVE_KERNEL_GZIP - select HAVE_KERNEL_BZIP2 - select GENERIC_ATOMIC64 - select HAVE_KERNEL_LZO - select HAVE_KERNEL_LZMA - select HAVE_PCI - select VIRT_TO_BUS - select ARCH_HAVE_CUSTOM_GPIO_H - select GENERIC_FIND_FIRST_BIT - select GENERIC_IRQ_PROBE - select GENERIC_IRQ_SHOW - select ARCH_WANT_FRAME_POINTERS - select GENERIC_IOMAP - select MODULES_USE_ELF_REL - select NEED_DMA_MAP_STATE - select MMU_GATHER_NO_RANGE if MMU - help - UniCore-32 is 32-bit Instruction Set Architecture, - including a series of low-power-consumption RISC chip - designs licensed by PKUnity Ltd. - Please see web page at . - -config GENERIC_CSUM - def_bool y - -config NO_IOPORT_MAP - bool - -config STACKTRACE_SUPPORT - def_bool y - -config LOCKDEP_SUPPORT - def_bool y - -config ARCH_HAS_ILOG2_U32 - bool - -config ARCH_HAS_ILOG2_U64 - bool - -config GENERIC_HWEIGHT - def_bool y - -config GENERIC_CALIBRATE_DELAY - def_bool y - -config ARCH_MAY_HAVE_PC_FDC - bool - -config ZONE_DMA - def_bool y - -menu "System Type" - -config MMU - def_bool y - -config ARCH_FPGA - bool - -config ARCH_PUV3 - def_bool y - select CPU_UCV2 - select GENERIC_CLOCKEVENTS - select HAVE_LEGACY_CLK - select GPIOLIB - -# CONFIGs for ARCH_PUV3 - -if ARCH_PUV3 - -choice - prompt "Board Selection" - default PUV3_DB0913 - -config PUV3_FPGA_DLX200 - select ARCH_FPGA - bool "FPGA board" - -config PUV3_DB0913 - bool "DEBUG board (0913)" - -config PUV3_NB0916 - bool "NetBook board (0916)" - select PWM - select PWM_PUV3 - -config PUV3_SMW0919 - bool "Security Mini-Workstation board (0919)" - -endchoice - -config PUV3_PM - def_bool y if !ARCH_FPGA - -endif - -source "arch/unicore32/mm/Kconfig" - -comment "Floating point support" - -config UNICORE_FPU_F64 - def_bool y if !ARCH_FPGA - -endmenu - -menu "Kernel Features" - -source "kernel/Kconfig.hz" - -config LEDS - def_bool y - depends on GPIOLIB - -config ALIGNMENT_TRAP - def_bool y - help - Unicore processors can not fetch/store information which is not - naturally aligned on the bus, i.e., a 4 byte fetch must start at an - address divisible by 4. On 32-bit Unicore processors, these non-aligned - fetch/store instructions will be emulated in software if you say - here, which has a severe performance impact. This is necessary for - correct operation of some network protocols. With an IP-only - configuration it is safe to say N, otherwise say Y. - -endmenu - -menu "Boot options" - -config CMDLINE - string "Default kernel command string" - default "" - -config CMDLINE_FORCE - bool "Always use the default kernel command string" - depends on CMDLINE != "" - help - Always use the default kernel command string, even if the boot - loader passes other arguments to the kernel. - This is useful if you cannot or don't want to change the - command-line options your boot loader passes to the kernel. - - If unsure, say N. - -endmenu - -menu "Power management options" - -source "kernel/power/Kconfig" - -source "drivers/cpufreq/Kconfig" - -config ARCH_SUSPEND_POSSIBLE - def_bool y if !ARCH_FPGA - -config ARCH_HIBERNATION_POSSIBLE - def_bool y if !ARCH_FPGA - -endmenu - -if ARCH_PUV3 - -config PUV3_GPIO - bool - depends on !ARCH_FPGA - select GPIO_SYSFS - default y - -if PUV3_NB0916 - -menu "PKUnity NetBook-0916 Features" - -config I2C_BATTERY_BQ27200 - tristate "I2C Battery BQ27200 Support" - select I2C_PUV3 - select POWER_SUPPLY - select BATTERY_BQ27XXX - -config I2C_EEPROM_AT24 - tristate "I2C EEPROMs AT24 support" - select I2C_PUV3 - select EEPROM_AT24 - -config LCD_BACKLIGHT - tristate "LCD Backlight support" - select BACKLIGHT_PWM - -endmenu - -endif - -endif diff --git a/arch/unicore32/Kconfig.debug b/arch/unicore32/Kconfig.debug deleted file mode 100644 index ca0ff97657ef..000000000000 --- a/arch/unicore32/Kconfig.debug +++ /dev/null @@ -1,29 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 - -config EARLY_PRINTK - def_bool DEBUG_OCD - help - Write kernel log output directly into the ocd or to a serial port. - - This is useful for kernel debugging when your machine crashes very - early before the console code is initialized. For normal operation - it is not recommended because it looks ugly and doesn't cooperate - with klogd/syslogd or the X server. You should normally N here, - unless you want to debug such a crash. - -# These options are only for real kernel hackers who want to get their hands dirty. -config DEBUG_LL - bool "Kernel low-level debugging functions" - depends on DEBUG_KERNEL - help - Say Y here to include definitions of printascii, printch, printhex - in the kernel. This is helpful if you are debugging code that - executes before the console is initialized. - -config DEBUG_OCD - bool "Kernel low-level debugging via On-Chip-Debugger" - depends on DEBUG_LL - default y - help - Say Y here if you want the debug print routines to direct their - output to the UniCore On-Chip-Debugger channel using CP #1. diff --git a/arch/unicore32/Makefile b/arch/unicore32/Makefile deleted file mode 100644 index 390819947c37..000000000000 --- a/arch/unicore32/Makefile +++ /dev/null @@ -1,59 +0,0 @@ -# -# arch/unicore32/Makefile -# -# This file is included by the global makefile so that you can add your own -# architecture-specific flags and dependencies. -# -# This file is subject to the terms and conditions of the GNU General Public -# License. See the file "COPYING" in the main directory of this archive -# for more details. -# -# Copyright (C) 2002~2010 by Guan Xue-tao -# -ifneq ($(SUBARCH),$(ARCH)) - ifeq ($(CROSS_COMPILE),) - CROSS_COMPILE := $(call cc-cross-prefix, unicore32-linux-) - endif -endif - -LDFLAGS_vmlinux := -p --no-undefined -X - -OBJCOPYFLAGS := -O binary -R .note -R .note.gnu.build-id -R .comment -S - -# Never generate .eh_frame -KBUILD_CFLAGS += $(call cc-option,-fno-dwarf2-cfi-asm) - -# Never use hard float in kernel -KBUILD_CFLAGS += -msoft-float - -ifeq ($(CONFIG_FRAME_POINTER),y) -KBUILD_CFLAGS += -mno-sched-prolog -endif - -CHECKFLAGS += -D__unicore32__ - -head-y := arch/unicore32/kernel/head.o - -core-y += arch/unicore32/kernel/ -core-y += arch/unicore32/mm/ - -libs-y += arch/unicore32/lib/ - -boot := arch/unicore32/boot - -# Default target when executing plain make -KBUILD_IMAGE := $(boot)/zImage - -all: zImage - -zImage Image uImage: vmlinux - $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ - -archclean: - $(Q)$(MAKE) $(clean)=$(boot) - -define archhelp - echo '* zImage - Compressed kernel image (arch/$(ARCH)/boot/zImage)' - echo ' Image - Uncompressed kernel image (arch/$(ARCH)/boot/Image)' - echo ' uImage - U-Boot wrapped zImage' -endef diff --git a/arch/unicore32/boot/Makefile b/arch/unicore32/boot/Makefile deleted file mode 100644 index 828855007b29..000000000000 --- a/arch/unicore32/boot/Makefile +++ /dev/null @@ -1,39 +0,0 @@ -# -# arch/unicore32/boot/Makefile -# -# This file is included by the global makefile so that you can add your own -# architecture-specific flags and dependencies. -# -# This file is subject to the terms and conditions of the GNU General Public -# License. See the file "COPYING" in the main directory of this archive -# for more details. -# -# Copyright (C) 2001~2010 GUAN Xue-tao -# - -targets := Image zImage uImage - -$(obj)/Image: vmlinux FORCE - $(call if_changed,objcopy) - @echo ' Kernel: $@ is ready' - -$(obj)/compressed/vmlinux: $(obj)/Image FORCE - $(Q)$(MAKE) $(build)=$(obj)/compressed $@ - -$(obj)/zImage: $(obj)/compressed/vmlinux FORCE - $(call if_changed,objcopy) - @echo ' Kernel: $@ is ready' - -UIMAGE_ARCH = unicore -UIMAGE_LOADADDR = 0x0 - -$(obj)/uImage: $(obj)/zImage FORCE - $(call if_changed,uimage) - @echo ' Image $@ is ready' - -PHONY += initrd -initrd: - @test "$(INITRD)" != "" || \ - (echo You must specify INITRD; exit -1) - -subdir- := compressed diff --git a/arch/unicore32/boot/compressed/Makefile b/arch/unicore32/boot/compressed/Makefile deleted file mode 100644 index 150fafc32fb0..000000000000 --- a/arch/unicore32/boot/compressed/Makefile +++ /dev/null @@ -1,64 +0,0 @@ -# -# linux/arch/unicore32/boot/compressed/Makefile -# -# create a compressed vmlinuz image from the original vmlinux -# -# This file is subject to the terms and conditions of the GNU General Public -# License. See the file "COPYING" in the main directory of this archive -# for more details. -# -# Copyright (C) 2001~2010 GUAN Xue-tao -# - -ccflags-y := -fpic -fno-builtin -asflags-y := -Wa,-march=all - -OBJS := misc.o - -# font.c and font.o -CFLAGS_font.o := -Dstatic= -$(obj)/font.c: $(srctree)/lib/fonts/font_8x8.c - $(call cmd,shipped) - -# piggy.S and piggy.o -suffix_$(CONFIG_KERNEL_GZIP) := gzip -suffix_$(CONFIG_KERNEL_BZIP2) := bz2 -suffix_$(CONFIG_KERNEL_LZO) := lzo -suffix_$(CONFIG_KERNEL_LZMA) := lzma - -$(obj)/piggy.$(suffix_y): $(obj)/../Image FORCE - $(call if_changed,$(suffix_y)) - -SEDFLAGS_piggy = s/DECOMP_SUFFIX/$(suffix_y)/ -$(obj)/piggy.S: $(obj)/piggy.S.in - @sed "$(SEDFLAGS_piggy)" < $< > $@ - -$(obj)/piggy.o: $(obj)/piggy.$(suffix_y) $(obj)/piggy.S FORCE - -targets := vmlinux vmlinux.lds font.o font.c head.o misc.o \ - piggy.$(suffix_y) piggy.o piggy.S \ - -# Make sure files are removed during clean -extra-y += piggy.gzip piggy.bz2 piggy.lzo piggy.lzma - -# ? -LDFLAGS_vmlinux += -p -# Report unresolved symbol references -LDFLAGS_vmlinux += --no-undefined -# Delete all temporary local symbols -LDFLAGS_vmlinux += -X -# Next argument is a linker script -LDFLAGS_vmlinux += -T - -# For uidivmod -$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head.o $(obj)/piggy.o \ - $(obj)/misc.o FORCE - $(call if_changed,ld) - -# We now have a PIC decompressor implementation. Decompressors running -# from RAM should not define ZTEXTADDR. Decompressors running directly -# from ROM or Flash must define ZTEXTADDR (preferably via the config) -ZTEXTADDR := 0x03000000 -ZBSSADDR := ALIGN(4) - -CPPFLAGS_vmlinux.lds = -DTEXT_START="$(ZTEXTADDR)" -DBSS_START="$(ZBSSADDR)" diff --git a/arch/unicore32/boot/compressed/head.S b/arch/unicore32/boot/compressed/head.S deleted file mode 100644 index 5f72662cd294..000000000000 --- a/arch/unicore32/boot/compressed/head.S +++ /dev/null @@ -1,201 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/boot/compressed/head.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include - -#define csub cmpsub -#define cand cmpand -#define nop8 nop; nop; nop; nop; nop; nop; nop; nop - - .section ".start", #alloc, #execinstr - .text -start: - .type start,#function - - /* Initialize ASR, PRIV mode and INTR off */ - mov r0, #0xD3 - mov.a asr, r0 - - adr r0, LC0 - ldm (r1, r2, r3, r5, r6, r7, r8), [r0]+ - ldw sp, [r0+], #28 - sub.a r0, r0, r1 @ calculate the delta offset - - /* - * if delta is zero, we are running at the address - * we were linked at. - */ - beq not_relocated - - /* - * We're running at a different address. We need to fix - * up various pointers: - * r5 - zImage base address (_start) - * r7 - GOT start - * r8 - GOT end - */ - add r5, r5, r0 - add r7, r7, r0 - add r8, r8, r0 - - /* - * we need to fix up pointers into the BSS region. - * r2 - BSS start - * r3 - BSS end - * sp - stack pointer - */ - add r2, r2, r0 - add r3, r3, r0 - add sp, sp, r0 - - /* - * Relocate all entries in the GOT table. - * This fixes up the C references. - * r7 - GOT start - * r8 - GOT end - */ -1001: ldw r1, [r7+], #0 - add r1, r1, r0 - stw.w r1, [r7]+, #4 - csub.a r7, r8 - bub 1001b - -not_relocated: - /* - * Clear BSS region. - * r2 - BSS start - * r3 - BSS end - */ - mov r0, #0 -1002: stw.w r0, [r2]+, #4 - csub.a r2, r3 - bub 1002b - - /* - * Turn on the cache. - */ - mov r0, #0 - movc p0.c5, r0, #28 @ cache invalidate all - nop8 - movc p0.c6, r0, #6 @ tlb invalidate all - nop8 - - mov r0, #0x1c @ en icache and wb dcache - movc p0.c1, r0, #0 - nop8 - - /* - * Set up some pointers, for starting decompressing. - */ - - mov r1, sp @ malloc space above stack - add r2, sp, #0x10000 @ 64k max - - /* - * Check to see if we will overwrite ourselves. - * r4 = final kernel address - * r5 = start of this image - * r6 = size of decompressed image - * r2 = end of malloc space (and therefore this image) - * We basically want: - * r4 >= r2 -> OK - * r4 + image length <= r5 -> OK - */ - ldw r4, =KERNEL_IMAGE_START - csub.a r4, r2 - bea wont_overwrite - add r0, r4, r6 - csub.a r0, r5 - beb wont_overwrite - - /* - * If overwrite, just print error message - */ - b __error_overwrite - - /* - * We're not in danger of overwriting ourselves. - * Do this the simple way. - */ -wont_overwrite: - /* - * decompress_kernel: - * r0: output_start - * r1: free_mem_ptr_p - * r2: free_mem_ptr_end_p - */ - mov r0, r4 - b.l decompress_kernel @ C functions - - /* - * Clean and flush the cache to maintain consistency. - */ - mov r0, #0 - movc p0.c5, r0, #14 @ flush dcache - nop8 - movc p0.c5, r0, #20 @ icache invalidate all - nop8 - - /* - * Turn off the Cache and MMU. - */ - mov r0, #0 @ disable i/d cache and MMU - movc p0.c1, r0, #0 - nop8 - - mov r0, #0 @ must be zero - ldw r4, =KERNEL_IMAGE_START - mov pc, r4 @ call kernel - - - .align 2 - .type LC0, #object -LC0: .word LC0 @ r1 - .word __bss_start @ r2 - .word _end @ r3 - .word _start @ r5 - .word _image_size @ r6 - .word _got_start @ r7 - .word _got_end @ r8 - .word decompress_stack_end @ sp - .size LC0, . - LC0 - -print_string: -#ifdef CONFIG_DEBUG_OCD -2001: ldb.w r1, [r0]+, #1 - csub.a r1, #0 - bne 2002f - mov pc, lr -2002: - movc r2, p1.c0, #0 - cand.a r2, #2 - bne 2002b - movc p1.c1, r1, #1 - csub.a r1, #'\n' - cmoveq r1, #'\r' - beq 2002b - b 2001b -#else - mov pc, lr -#endif - -__error_overwrite: - adr r0, str_error - b.l print_string -2001: nop8 - b 2001b -str_error: .asciz "\nError: Kernel address OVERWRITE\n" - .align - - .ltorg - - .align 4 - .section ".stack", "aw", %nobits -decompress_stack: .space 4096 -decompress_stack_end: diff --git a/arch/unicore32/boot/compressed/misc.c b/arch/unicore32/boot/compressed/misc.c deleted file mode 100644 index 450d3355de20..000000000000 --- a/arch/unicore32/boot/compressed/misc.c +++ /dev/null @@ -1,123 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/boot/compressed/misc.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#include -#include - -/* - * gzip delarations - */ -unsigned char *output_data; -unsigned long output_ptr; - -unsigned int free_mem_ptr; -unsigned int free_mem_end_ptr; - -#define STATIC static -#define STATIC_RW_DATA /* non-static please */ - -/* - * arch-dependent implementations - */ -#ifndef ARCH_HAVE_DECOMP_ERROR -#define arch_decomp_error(x) -#endif - -#ifndef ARCH_HAVE_DECOMP_SETUP -#define arch_decomp_setup() -#endif - -#ifndef ARCH_HAVE_DECOMP_PUTS -#define arch_decomp_puts(p) -#endif - -void *memcpy(void *dest, const void *src, size_t n) -{ - int i = 0; - unsigned char *d = (unsigned char *)dest, *s = (unsigned char *)src; - - for (i = n >> 3; i > 0; i--) { - *d++ = *s++; - *d++ = *s++; - *d++ = *s++; - *d++ = *s++; - *d++ = *s++; - *d++ = *s++; - *d++ = *s++; - *d++ = *s++; - } - - if (n & 1 << 2) { - *d++ = *s++; - *d++ = *s++; - *d++ = *s++; - *d++ = *s++; - } - - if (n & 1 << 1) { - *d++ = *s++; - *d++ = *s++; - } - - if (n & 1) - *d++ = *s++; - - return dest; -} - -void error(char *x) -{ - arch_decomp_puts("\n\n"); - arch_decomp_puts(x); - arch_decomp_puts("\n\n -- System halted"); - - arch_decomp_error(x); - - for (;;) - ; /* Halt */ -} - -/* Heap size should be adjusted for different decompress method */ -#ifdef CONFIG_KERNEL_GZIP -#include "../../../../lib/decompress_inflate.c" -#endif - -#ifdef CONFIG_KERNEL_BZIP2 -#include "../../../../lib/decompress_bunzip2.c" -#endif - -#ifdef CONFIG_KERNEL_LZO -#include "../../../../lib/decompress_unlzo.c" -#endif - -#ifdef CONFIG_KERNEL_LZMA -#include "../../../../lib/decompress_unlzma.c" -#endif - -unsigned long decompress_kernel(unsigned long output_start, - unsigned long free_mem_ptr_p, - unsigned long free_mem_ptr_end_p) -{ - unsigned char *tmp; - - output_data = (unsigned char *)output_start; - free_mem_ptr = free_mem_ptr_p; - free_mem_end_ptr = free_mem_ptr_end_p; - - arch_decomp_setup(); - - tmp = (unsigned char *) (((unsigned long)input_data_end) - 4); - output_ptr = get_unaligned_le32(tmp); - - arch_decomp_puts("Uncompressing Linux..."); - __decompress(input_data, input_data_end - input_data, NULL, NULL, - output_data, 0, NULL, error); - arch_decomp_puts(" done, booting the kernel.\n"); - return output_ptr; -} diff --git a/arch/unicore32/boot/compressed/piggy.S.in b/arch/unicore32/boot/compressed/piggy.S.in deleted file mode 100644 index b79704d58026..000000000000 --- a/arch/unicore32/boot/compressed/piggy.S.in +++ /dev/null @@ -1,6 +0,0 @@ - .section .piggydata,#alloc - .globl input_data -input_data: - .incbin "arch/unicore32/boot/compressed/piggy.DECOMP_SUFFIX" - .globl input_data_end -input_data_end: diff --git a/arch/unicore32/boot/compressed/vmlinux.lds.S b/arch/unicore32/boot/compressed/vmlinux.lds.S deleted file mode 100644 index edda4ddfa357..000000000000 --- a/arch/unicore32/boot/compressed/vmlinux.lds.S +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore/boot/compressed/vmlinux.lds.in - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -OUTPUT_ARCH(unicore32) -ENTRY(_start) -SECTIONS -{ - /DISCARD/ : { - /* - * Discard any r/w data - this produces a link error if we have any, - * which is required for PIC decompression. Local data generates - * GOTOFF relocations, which prevents it being relocated independently - * of the text/got segments. - */ - *(.data) - } - - . = TEXT_START; - _text = .; - - .text : { - _start = .; - *(.start) - *(.text) - *(.text.*) - *(.fixup) - *(.gnu.warning) - *(.rodata) - *(.rodata.*) - *(.piggydata) - . = ALIGN(4); - } - - _etext = .; - - /* Assume size of decompressed image is 4x the compressed image */ - _image_size = (_etext - _text) * 4; - - _got_start = .; - .got : { *(.got) } - _got_end = .; - .got.plt : { *(.got.plt) } - _edata = .; - - . = BSS_START; - __bss_start = .; - .bss : { *(.bss) } - _end = .; - - .stack : { *(.stack) } - .comment 0 : { *(.comment) } -} - diff --git a/arch/unicore32/configs/defconfig b/arch/unicore32/configs/defconfig deleted file mode 100644 index 360cc9abcdb0..000000000000 --- a/arch/unicore32/configs/defconfig +++ /dev/null @@ -1,214 +0,0 @@ -### General setup -CONFIG_EXPERIMENTAL=y -CONFIG_LOCALVERSION="-unicore32" -CONFIG_SWAP=y -CONFIG_SYSVIPC=y -CONFIG_POSIX_MQUEUE=y -CONFIG_HOTPLUG=y -# Initial RAM filesystem and RAM disk (initramfs/initrd) support -#CONFIG_BLK_DEV_INITRD=y -#CONFIG_INITRAMFS_SOURCE="arch/unicore/ramfs/ramfs_config" - -### Enable loadable module support -CONFIG_MODULES=n -CONFIG_MODULE_UNLOAD=y - -### System Type -CONFIG_ARCH_PUV3=y -# Board Selection -CONFIG_PUV3_NB0916=y -# Processor Features -CONFIG_CPU_DCACHE_LINE_DISABLE=y -CONFIG_CPU_TLB_SINGLE_ENTRY_DISABLE=n - -### Bus support -CONFIG_PCI=y -CONFIG_PCI_LEGACY=n - -### Boot options -# for debug, adding: earlyprintk=ocd,keep initcall_debug -# others support: test_suspend=mem root=/dev/sda -# hibernate support: resume=/dev/sda3 -CONFIG_CMDLINE="earlyprintk=ocd,keep ignore_loglevel" -# TODO: mem=512M video=unifb:1024x600-16@75 -# for nfs: root=/dev/nfs rw nfsroot=192.168.10.88:/home/udb/nfs/,rsize=1024,wsize=1024 -# ip=192.168.10.83:192.168.10.88:192.168.10.1:255.255.255.0::eth0:off -CONFIG_CMDLINE_FORCE=y - -### Power management options -CONFIG_PM=y -CONFIG_HIBERNATION=y -CONFIG_PM_STD_PARTITION="/dev/sda3" -CONFIG_CPU_FREQ=n -CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y - -### Networking support -CONFIG_NET=y -# Networking options -CONFIG_PACKET=m -CONFIG_UNIX=m -# TCP/IP networking -CONFIG_INET=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_PNP=y -CONFIG_IPV6=n -# Wireless -CONFIG_WIRELESS=y -CONFIG_WIRELESS_EXT=y -CONFIG_MAC80211=m - -### PKUnity SoC Features -CONFIG_USB_WLAN_HED_AQ3=n -CONFIG_USB_CMMB_INNOFIDEI=n -CONFIG_I2C_BATTERY_BQ27200=n -CONFIG_I2C_EEPROM_AT24=n -CONFIG_LCD_BACKLIGHT=n - -CONFIG_PUV3_UMAL=y -CONFIG_PUV3_MUSB=n -CONFIG_PUV3_AC97=n -CONFIG_PUV3_NAND=n -CONFIG_PUV3_MMC=n -CONFIG_PUV3_UART=n - -### Device Drivers -# Memory Technology Device (MTD) support -CONFIG_MTD=m -CONFIG_MTD_UBI=m -CONFIG_MTD_PARTITIONS=y -CONFIG_MTD_CHAR=m -CONFIG_MTD_BLKDEVS=m -# RAM/ROM/Flash chip drivers -CONFIG_MTD_CFI=m -CONFIG_MTD_JEDECPROBE=m -CONFIG_MTD_CFI_AMDSTD=m -# Mapping drivers for chip access -CONFIG_MTD_PHYSMAP=m - -# Block devices -CONFIG_BLK_DEV_LOOP=m - -# SCSI device support -CONFIG_SCSI=y -CONFIG_BLK_DEV_SD=y -CONFIG_BLK_DEV_SR=m -CONFIG_CHR_DEV_SG=m - -# Serial ATA (prod) and Parallel ATA (experimental) drivers -CONFIG_ATA=y -CONFIG_SATA_VIA=y - -# Network device support -CONFIG_NETDEVICES=y -CONFIG_NET_ETHERNET=y -CONFIG_NETDEV_1000=y -# Wireless LAN -CONFIG_WLAN_80211=n -CONFIG_RT2X00=n -CONFIG_RT73USB=n - -# Input device support -CONFIG_INPUT_EVDEV=m -# Keyboards -CONFIG_KEYBOARD_GPIO=m - -# I2C support -CONFIG_I2C=y -CONFIG_I2C_PUV3=y - -# Hardware Monitoring support -#CONFIG_SENSORS_LM75=m -# Generic Thermal sysfs driver -#CONFIG_THERMAL=y -#CONFIG_THERMAL_HWMON=y - -# Multimedia support -CONFIG_MEDIA_SUPPORT=n -CONFIG_VIDEO_DEV=n -CONFIG_USB_VIDEO_CLASS=n - -# Graphics support -CONFIG_FB=y -CONFIG_FB_PUV3_UNIGFX=y -# Console display driver support -CONFIG_VGA_CONSOLE=n -CONFIG_FRAMEBUFFER_CONSOLE=y -CONFIG_FONTS=y -CONFIG_FONT_8x8=y -CONFIG_FONT_8x16=y -# Bootup logo -CONFIG_LOGO=n - -# Sound card support -CONFIG_SOUND=m -# Advanced Linux Sound Architecture -CONFIG_SND=m -CONFIG_SND_MIXER_OSS=m -CONFIG_SND_PCM_OSS=m - -# USB support -CONFIG_USB_ARCH_HAS_HCD=n -CONFIG_USB=n -CONFIG_USB_PRINTER=n -CONFIG_USB_STORAGE=n -# Inventra Highspeed Dual Role Controller -CONFIG_USB_MUSB_HDRC=n - -# LED Support -CONFIG_NEW_LEDS=y -CONFIG_LEDS_CLASS=y -CONFIG_LEDS_GPIO=y -# LED Triggers -CONFIG_LEDS_TRIGGERS=y -CONFIG_LEDS_TRIGGER_TIMER=y -CONFIG_LEDS_TRIGGER_DISK=y -CONFIG_LEDS_TRIGGER_HEARTBEAT=y - -# Real Time Clock -CONFIG_RTC_LIB=y -CONFIG_RTC_CLASS=y -CONFIG_RTC_DRV_PUV3=y - -### File systems -CONFIG_EXT2_FS=m -CONFIG_EXT3_FS=y -CONFIG_EXT4_FS=y -CONFIG_FUSE_FS=m -# CD-ROM/DVD Filesystems -CONFIG_ISO9660_FS=m -CONFIG_JOLIET=y -CONFIG_UDF_FS=m -# DOS/FAT/NT Filesystems -CONFIG_VFAT_FS=m -# Pseudo filesystems -CONFIG_PROC_FS=y -CONFIG_SYSFS=y -CONFIG_TMPFS=y -# Miscellaneous filesystems -CONFIG_MISC_FILESYSTEMS=y -CONFIG_JFFS2_FS=m -CONFIG_UBIFS_FS=m -# Network File Systems -CONFIG_NETWORK_FILESYSTEMS=y -CONFIG_NFS_FS=y -CONFIG_NFS_V3=y -CONFIG_ROOT_NFS=y -# Partition Types -CONFIG_PARTITION_ADVANCED=y -CONFIG_MSDOS_PARTITION=y -# Native language support -CONFIG_NLS=y -CONFIG_NLS_CODEPAGE_437=m -CONFIG_NLS_CODEPAGE_936=m -CONFIG_NLS_ISO8859_1=m -CONFIG_NLS_UTF8=m - -### Kernel hacking -CONFIG_FRAME_WARN=8096 -CONFIG_MAGIC_SYSRQ=y -CONFIG_DEBUG_KERNEL=y -CONFIG_PROVE_LOCKING=n -CONFIG_DEBUG_BUGVERBOSE=y -CONFIG_FRAME_POINTER=y -CONFIG_DEBUG_LL=y - diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild deleted file mode 100644 index 55026e8240d8..000000000000 --- a/arch/unicore32/include/asm/Kbuild +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -generic-y += extable.h -generic-y += kvm_para.h -generic-y += mcs_spinlock.h -generic-y += parport.h -generic-y += syscalls.h -generic-y += user.h diff --git a/arch/unicore32/include/asm/assembler.h b/arch/unicore32/include/asm/assembler.h deleted file mode 100644 index 3de843d92850..000000000000 --- a/arch/unicore32/include/asm/assembler.h +++ /dev/null @@ -1,128 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/assembler.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * Do not include any C declarations in this file - it is included by - * assembler source. - */ -#ifndef __ASSEMBLY__ -#error "Only include this from assembly code" -#endif - -#include - -/* - * Little Endian independent macros for shifting bytes within registers. - */ -#define pull >> -#define push << -#define get_byte_0 << #0 -#define get_byte_1 >> #8 -#define get_byte_2 >> #16 -#define get_byte_3 >> #24 -#define put_byte_0 << #0 -#define put_byte_1 << #8 -#define put_byte_2 << #16 -#define put_byte_3 << #24 - -#define cadd cmpadd -#define cand cmpand -#define csub cmpsub -#define cxor cmpxor - -/* - * Enable and disable interrupts - */ - .macro disable_irq, temp - mov \temp, asr - andn \temp, \temp, #0xFF - or \temp, \temp, #PSR_I_BIT | PRIV_MODE - mov.a asr, \temp - .endm - - .macro enable_irq, temp - mov \temp, asr - andn \temp, \temp, #0xFF - or \temp, \temp, #PRIV_MODE - mov.a asr, \temp - .endm - -#define USER(x...) \ -9999: x; \ - .pushsection __ex_table, "a"; \ - .align 3; \ - .long 9999b, 9001f; \ - .popsection - - .macro notcond, cond, nexti = .+8 - .ifc \cond, eq - bne \nexti - .else; .ifc \cond, ne - beq \nexti - .else; .ifc \cond, ea - bub \nexti - .else; .ifc \cond, ub - bea \nexti - .else; .ifc \cond, fs - bns \nexti - .else; .ifc \cond, ns - bfs \nexti - .else; .ifc \cond, fv - bnv \nexti - .else; .ifc \cond, nv - bfv \nexti - .else; .ifc \cond, ua - beb \nexti - .else; .ifc \cond, eb - bua \nexti - .else; .ifc \cond, eg - bsl \nexti - .else; .ifc \cond, sl - beg \nexti - .else; .ifc \cond, sg - bel \nexti - .else; .ifc \cond, el - bsg \nexti - .else; .ifnc \cond, al - .error "Unknown cond in notcond macro argument" - .endif; .endif; .endif; .endif; .endif; .endif; .endif - .endif; .endif; .endif; .endif; .endif; .endif; .endif - .endif - .endm - - .macro usracc, instr, reg, ptr, inc, cond, rept, abort - .rept \rept - notcond \cond, .+8 -9999 : - .if \inc == 1 - \instr\()b.u \reg, [\ptr], #\inc - .elseif \inc == 4 - \instr\()w.u \reg, [\ptr], #\inc - .else - .error "Unsupported inc macro argument" - .endif - - .pushsection __ex_table, "a" - .align 3 - .long 9999b, \abort - .popsection - .endr - .endm - - .macro strusr, reg, ptr, inc, cond = al, rept = 1, abort = 9001f - usracc st, \reg, \ptr, \inc, \cond, \rept, \abort - .endm - - .macro ldrusr, reg, ptr, inc, cond = al, rept = 1, abort = 9001f - usracc ld, \reg, \ptr, \inc, \cond, \rept, \abort - .endm - - .macro nop8 - .rept 8 - nop - .endr - .endm diff --git a/arch/unicore32/include/asm/barrier.h b/arch/unicore32/include/asm/barrier.h deleted file mode 100644 index efb81de87507..000000000000 --- a/arch/unicore32/include/asm/barrier.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Memory barrier implementations for PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2012 GUAN Xue-tao - */ -#ifndef __UNICORE_BARRIER_H__ -#define __UNICORE_BARRIER_H__ - -#define isb() __asm__ __volatile__ ("" : : : "memory") -#define dsb() __asm__ __volatile__ ("" : : : "memory") -#define dmb() __asm__ __volatile__ ("" : : : "memory") - -#include - -#endif /* __UNICORE_BARRIER_H__ */ diff --git a/arch/unicore32/include/asm/bitops.h b/arch/unicore32/include/asm/bitops.h deleted file mode 100644 index deeb2163f35e..000000000000 --- a/arch/unicore32/include/asm/bitops.h +++ /dev/null @@ -1,46 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/bitops.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#ifndef __UNICORE_BITOPS_H__ -#define __UNICORE_BITOPS_H__ - -#define _ASM_GENERIC_BITOPS_FLS_H_ -#define _ASM_GENERIC_BITOPS___FLS_H_ -#define _ASM_GENERIC_BITOPS_FFS_H_ -#define _ASM_GENERIC_BITOPS___FFS_H_ -/* - * On UNICORE, those functions can be implemented around - * the cntlz instruction for much better code efficiency. - */ - -static inline int fls(unsigned int x) -{ - int ret; - - asm("cntlz\t%0, %1" : "=r" (ret) : "r" (x) : "cc"); - ret = 32 - ret; - - return ret; -} - -#define __fls(x) (fls(x) - 1) -#define ffs(x) ({ unsigned long __t = (x); fls(__t & -__t); }) -#define __ffs(x) (ffs(x) - 1) - -#include - -/* following definitions: to avoid using codes in lib/find_*.c */ -#define find_next_bit find_next_bit -#define find_next_zero_bit find_next_zero_bit -#define find_first_bit find_first_bit -#define find_first_zero_bit find_first_zero_bit - -#include - -#endif /* __UNICORE_BITOPS_H__ */ diff --git a/arch/unicore32/include/asm/bug.h b/arch/unicore32/include/asm/bug.h deleted file mode 100644 index 99acea84a865..000000000000 --- a/arch/unicore32/include/asm/bug.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Bug handling for PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2012 GUAN Xue-tao - */ -#ifndef __UNICORE_BUG_H__ -#define __UNICORE_BUG_H__ - -#include - -struct pt_regs; -struct siginfo; - -extern void die(const char *msg, struct pt_regs *regs, int err); -extern void uc32_notify_die(const char *str, struct pt_regs *regs, - int sig, int code, void __user *addr, - unsigned long err, unsigned long trap); - -#endif /* __UNICORE_BUG_H__ */ diff --git a/arch/unicore32/include/asm/cache.h b/arch/unicore32/include/asm/cache.h deleted file mode 100644 index 44ecd1f300fe..000000000000 --- a/arch/unicore32/include/asm/cache.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/cache.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_CACHE_H__ -#define __UNICORE_CACHE_H__ - -#define L1_CACHE_SHIFT (5) -#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) - -/* - * Memory returned by kmalloc() may be used for DMA, so we must make - * sure that all such allocations are cache aligned. Otherwise, - * unrelated code may cause parts of the buffer to be read into the - * cache before the transfer is done, causing old data to be seen by - * the CPU. - */ -#define ARCH_DMA_MINALIGN L1_CACHE_BYTES - -#endif diff --git a/arch/unicore32/include/asm/cacheflush.h b/arch/unicore32/include/asm/cacheflush.h deleted file mode 100644 index ff0be92ebc32..000000000000 --- a/arch/unicore32/include/asm/cacheflush.h +++ /dev/null @@ -1,186 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/cacheflush.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_CACHEFLUSH_H__ -#define __UNICORE_CACHEFLUSH_H__ - -#include - -#include - -#define CACHE_COLOUR(vaddr) ((vaddr & (SHMLBA - 1)) >> PAGE_SHIFT) - -/* - * This flag is used to indicate that the page pointed to by a pte is clean - * and does not require cleaning before returning it to the user. - */ -#define PG_dcache_clean PG_arch_1 - -/* - * MM Cache Management - * =================== - * - * The arch/unicore32/mm/cache.S files implement these methods. - * - * Start addresses are inclusive and end addresses are exclusive; - * start addresses should be rounded down, end addresses up. - * - * See Documentation/core-api/cachetlb.rst for more information. - * Please note that the implementation of these, and the required - * effects are cache-type (VIVT/VIPT/PIPT) specific. - * - * flush_icache_all() - * - * Unconditionally clean and invalidate the entire icache. - * Currently only needed for cache-v6.S and cache-v7.S, see - * __flush_icache_all for the generic implementation. - * - * flush_kern_all() - * - * Unconditionally clean and invalidate the entire cache. - * - * flush_user_all() - * - * Clean and invalidate all user space cache entries - * before a change of page tables. - * - * flush_user_range(start, end, flags) - * - * Clean and invalidate a range of cache entries in the - * specified address space before a change of page tables. - * - start - user start address (inclusive, page aligned) - * - end - user end address (exclusive, page aligned) - * - flags - vma->vm_flags field - * - * coherent_kern_range(start, end) - * - * Ensure coherency between the Icache and the Dcache in the - * region described by start, end. If you have non-snooping - * Harvard caches, you need to implement this function. - * - start - virtual start address - * - end - virtual end address - * - * coherent_user_range(start, end) - * - * Ensure coherency between the Icache and the Dcache in the - * region described by start, end. If you have non-snooping - * Harvard caches, you need to implement this function. - * - start - virtual start address - * - end - virtual end address - * - * flush_kern_dcache_area(kaddr, size) - * - * Ensure that the data held in page is written back. - * - kaddr - page address - * - size - region size - * - * DMA Cache Coherency - * =================== - * - * dma_flush_range(start, end) - * - * Clean and invalidate the specified virtual address range. - * - start - virtual start address - * - end - virtual end address - */ - -extern void __cpuc_flush_icache_all(void); -extern void __cpuc_flush_kern_all(void); -extern void __cpuc_flush_user_all(void); -extern void __cpuc_flush_user_range(unsigned long, unsigned long, unsigned int); -extern void __cpuc_coherent_kern_range(unsigned long, unsigned long); -extern void __cpuc_coherent_user_range(unsigned long, unsigned long); -extern void __cpuc_flush_dcache_area(void *, size_t); -extern void __cpuc_flush_kern_dcache_area(void *addr, size_t size); - -/* - * Copy user data from/to a page which is mapped into a different - * processes address space. Really, we want to allow our "user - * space" model to handle this. - */ -extern void copy_to_user_page(struct vm_area_struct *, struct page *, - unsigned long, void *, const void *, unsigned long); -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - do { \ - memcpy(dst, src, len); \ - } while (0) - -/* - * Convert calls to our calling convention. - */ -/* Invalidate I-cache */ -static inline void __flush_icache_all(void) -{ - asm("movc p0.c5, %0, #20;\n" - "nop; nop; nop; nop; nop; nop; nop; nop\n" - : - : "r" (0)); -} - -#define flush_cache_all() __cpuc_flush_kern_all() - -extern void flush_cache_mm(struct mm_struct *mm); -extern void flush_cache_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end); -extern void flush_cache_page(struct vm_area_struct *vma, - unsigned long user_addr, unsigned long pfn); - -#define flush_cache_dup_mm(mm) flush_cache_mm(mm) - -/* - * Perform necessary cache operations to ensure that data previously - * stored within this range of addresses can be executed by the CPU. - */ -#define flush_icache_range(s, e) __cpuc_coherent_kern_range(s, e) - -/* - * Perform necessary cache operations to ensure that the TLB will - * see data written in the specified area. - */ -#define clean_dcache_area(start, size) cpu_dcache_clean_area(start, size) - -/* - * flush_dcache_page is used when the kernel has written to the page - * cache page at virtual address page->virtual. - * - * If this page isn't mapped (ie, page_mapping == NULL), or it might - * have userspace mappings, then we _must_ always clean + invalidate - * the dcache entries associated with the kernel mapping. - * - * Otherwise we can defer the operation, and clean the cache when we are - * about to change to user space. This is the same method as used on SPARC64. - * See update_mmu_cache for the user space part. - */ -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -extern void flush_dcache_page(struct page *); - -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) - -/* - * We don't appear to need to do anything here. In fact, if we did, we'd - * duplicate cache flushing elsewhere performed by flush_dcache_page(). - */ -#define flush_icache_page(vma, page) do { } while (0) - -/* - * flush_cache_vmap() is used when creating mappings (eg, via vmap, - * vmalloc, ioremap etc) in kernel space for pages. On non-VIPT - * caches, since the direct-mappings of these pages may contain cached - * data, we need to do a full cache flush to ensure that writebacks - * don't corrupt data placed into these pages via the new mappings. - */ -static inline void flush_cache_vmap(unsigned long start, unsigned long end) -{ -} - -static inline void flush_cache_vunmap(unsigned long start, unsigned long end) -{ -} - -#endif diff --git a/arch/unicore32/include/asm/checksum.h b/arch/unicore32/include/asm/checksum.h deleted file mode 100644 index e774ca268c15..000000000000 --- a/arch/unicore32/include/asm/checksum.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/checksum.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * IP checksum routines - */ -#ifndef __UNICORE_CHECKSUM_H__ -#define __UNICORE_CHECKSUM_H__ - -/* - * computes the checksum of the TCP/UDP pseudo-header - * returns a 16-bit checksum, already complemented - */ - -static inline __wsum -csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, - __u8 proto, __wsum sum) -{ - __asm__( - "add.a %0, %1, %2\n" - "addc.a %0, %0, %3\n" - "addc.a %0, %0, %4 << #8\n" - "addc.a %0, %0, %5\n" - "addc %0, %0, #0\n" - : "=&r"(sum) - : "r" (sum), "r" (daddr), "r" (saddr), "r" (len), "Ir" (htons(proto)) - : "cc"); - return sum; -} -#define csum_tcpudp_nofold csum_tcpudp_nofold - -#include - -#endif diff --git a/arch/unicore32/include/asm/cmpxchg.h b/arch/unicore32/include/asm/cmpxchg.h deleted file mode 100644 index 87f960a2e4f0..000000000000 --- a/arch/unicore32/include/asm/cmpxchg.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Atomics xchg/cmpxchg for PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2012 GUAN Xue-tao - */ -#ifndef __UNICORE_CMPXCHG_H__ -#define __UNICORE_CMPXCHG_H__ - -/* - * Generate a link failure on undefined symbol if the pointer points to a value - * of unsupported size. - */ -extern void __xchg_bad_pointer(void); - -static inline unsigned long __xchg(unsigned long x, volatile void *ptr, - int size) -{ - unsigned long ret; - - switch (size) { - case 1: - asm volatile("swapb %0, %1, [%2]" - : "=&r" (ret) - : "r" (x), "r" (ptr) - : "memory", "cc"); - break; - case 4: - asm volatile("swapw %0, %1, [%2]" - : "=&r" (ret) - : "r" (x), "r" (ptr) - : "memory", "cc"); - break; - default: - __xchg_bad_pointer(); - } - - return ret; -} - -#define xchg(ptr, x) \ - ((__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr), sizeof(*(ptr)))) - -#include - -/* - * cmpxchg_local and cmpxchg64_local are atomic wrt current CPU. Always make - * them available. - */ -#define cmpxchg_local(ptr, o, n) \ - ((__typeof__(*(ptr)))__cmpxchg_local_generic((ptr), \ - (unsigned long)(o), (unsigned long)(n), sizeof(*(ptr)))) -#define cmpxchg64_local(ptr, o, n) \ - __cmpxchg64_local_generic((ptr), (o), (n)) - -#include - -#endif /* __UNICORE_CMPXCHG_H__ */ diff --git a/arch/unicore32/include/asm/cpu-single.h b/arch/unicore32/include/asm/cpu-single.h deleted file mode 100644 index 1b419d697fd1..000000000000 --- a/arch/unicore32/include/asm/cpu-single.h +++ /dev/null @@ -1,42 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/cpu-single.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_CPU_SINGLE_H__ -#define __UNICORE_CPU_SINGLE_H__ - -#include -#include - -#ifdef __KERNEL__ -#ifndef __ASSEMBLY__ - -#define cpu_switch_mm(pgd, mm) cpu_do_switch_mm(virt_to_phys(pgd), mm) - -#define cpu_get_pgd() \ - ({ \ - unsigned long pg; \ - __asm__("movc %0, p0.c2, #0" \ - : "=r" (pg) : : "cc"); \ - pg &= ~0x0fff; \ - (pgd_t *)phys_to_virt(pg); \ - }) - -struct mm_struct; - -/* declare all the functions as extern */ -extern void cpu_proc_fin(void); -extern int cpu_do_idle(void); -extern void cpu_dcache_clean_area(void *, int); -extern void cpu_do_switch_mm(unsigned long pgd_phys, struct mm_struct *mm); -extern void cpu_set_pte(pte_t *ptep, pte_t pte); -extern void cpu_reset(unsigned long addr) __attribute__((noreturn)); - -#endif /* __ASSEMBLY__ */ -#endif /* __KERNEL__ */ - -#endif /* __UNICORE_CPU_SINGLE_H__ */ diff --git a/arch/unicore32/include/asm/cputype.h b/arch/unicore32/include/asm/cputype.h deleted file mode 100644 index 08a47e3bdbcc..000000000000 --- a/arch/unicore32/include/asm/cputype.h +++ /dev/null @@ -1,30 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/cputype.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_CPUTYPE_H__ -#define __UNICORE_CPUTYPE_H__ - -#include - -#define CPUID_CPUID 0 -#define CPUID_CACHETYPE 1 - -#define read_cpuid(reg) \ - ({ \ - unsigned int __val; \ - asm("movc %0, p0.c0, #" __stringify(reg) \ - : "=r" (__val) \ - : \ - : "cc"); \ - __val; \ - }) - -#define uc32_cpuid read_cpuid(CPUID_CPUID) -#define uc32_cachetype read_cpuid(CPUID_CACHETYPE) - -#endif diff --git a/arch/unicore32/include/asm/delay.h b/arch/unicore32/include/asm/delay.h deleted file mode 100644 index 934193edfa66..000000000000 --- a/arch/unicore32/include/asm/delay.h +++ /dev/null @@ -1,49 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/delay.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * Delay routines, using a pre-computed "loops_per_second" value. - */ -#ifndef __UNICORE_DELAY_H__ -#define __UNICORE_DELAY_H__ - -#include /* HZ */ - -extern void __delay(int loops); - -/* - * This function intentionally does not exist; if you see references to - * it, it means that you're calling udelay() with an out of range value. - * - * With currently imposed limits, this means that we support a max delay - * of 2000us. Further limits: HZ<=1000 and bogomips<=3355 - */ -extern void __bad_udelay(void); - -/* - * division by multiplication: you don't have to worry about - * loss of precision. - * - * Use only for very small delays ( < 1 msec). Should probably use a - * lookup table, really, as the multiplications take much too long with - * short delays. This is a "reasonable" implementation, though (and the - * first constant multiplications gets optimized away if the delay is - * a constant) - */ -extern void __udelay(unsigned long usecs); -extern void __const_udelay(unsigned long); - -#define MAX_UDELAY_MS 2 - -#define udelay(n) \ - (__builtin_constant_p(n) ? \ - ((n) > (MAX_UDELAY_MS * 1000) ? __bad_udelay() : \ - __const_udelay((n) * ((2199023U*HZ)>>11))) : \ - __udelay(n)) - -#endif /* __UNICORE_DELAY_H__ */ - diff --git a/arch/unicore32/include/asm/dma.h b/arch/unicore32/include/asm/dma.h deleted file mode 100644 index 1326310b21e6..000000000000 --- a/arch/unicore32/include/asm/dma.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/dma.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#ifndef __UNICORE_DMA_H__ -#define __UNICORE_DMA_H__ - -#include -#include - -#ifdef CONFIG_PCI -extern int isa_dma_bridge_buggy; -#endif - -#endif /* __UNICORE_DMA_H__ */ diff --git a/arch/unicore32/include/asm/elf.h b/arch/unicore32/include/asm/elf.h deleted file mode 100644 index a464ed5f05d4..000000000000 --- a/arch/unicore32/include/asm/elf.h +++ /dev/null @@ -1,90 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/elf.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#ifndef __UNICORE_ELF_H__ -#define __UNICORE_ELF_H__ - -#include - -/* - * ELF register definitions.. - */ -#include -#include - -typedef unsigned long elf_greg_t; -typedef unsigned long elf_freg_t[3]; - -#define ELF_NGREG (sizeof(struct pt_regs) / sizeof(elf_greg_t)) -typedef elf_greg_t elf_gregset_t[ELF_NGREG]; - -typedef struct fp_state elf_fpregset_t; - -#define R_UNICORE_NONE 0 -#define R_UNICORE_PC24 1 -#define R_UNICORE_ABS32 2 -#define R_UNICORE_CALL 28 -#define R_UNICORE_JUMP24 29 - -/* - * These are used to set parameters in the core dumps. - */ -#define ELF_CLASS ELFCLASS32 -#define ELF_DATA ELFDATA2LSB -#define ELF_ARCH EM_UNICORE - -/* - * This yields a string that ld.so will use to load implementation - * specific libraries for optimization. This is more specific in - * intent than poking at uname or /proc/cpuinfo. - * - */ -#define ELF_PLATFORM_SIZE 8 -#define ELF_PLATFORM (elf_platform) - -extern char elf_platform[]; - -struct elf32_hdr; - -/* - * This is used to ensure we don't load something for the wrong architecture. - */ -extern int elf_check_arch(const struct elf32_hdr *); -#define elf_check_arch elf_check_arch - -struct task_struct; -int dump_task_regs(struct task_struct *t, elf_gregset_t *elfregs); -#define ELF_CORE_COPY_TASK_REGS dump_task_regs - -#define ELF_EXEC_PAGESIZE 4096 - -/* This is the location that an ET_DYN program is loaded if exec'ed. Typical - use of this is to invoke "./ld.so someprog" to test out a new version of - the loader. We need to make sure that it is out of the way of the program - that it will "exec", and that there is sufficient room for the brk. */ - -#define ELF_ET_DYN_BASE (2 * TASK_SIZE / 3) - -/* When the program starts, a1 contains a pointer to a function to be - registered with atexit, as per the SVR4 ABI. A value of 0 means we - have no such handler. */ -#define ELF_PLAT_INIT(_r, load_addr) {(_r)->UCreg_00 = 0; } - -extern void elf_set_personality(const struct elf32_hdr *); -#define SET_PERSONALITY(ex) elf_set_personality(&(ex)) - -struct mm_struct; -extern unsigned long arch_randomize_brk(struct mm_struct *mm); -#define arch_randomize_brk arch_randomize_brk - -extern int vectors_user_mapping(void); -#define arch_setup_additional_pages(bprm, uses_interp) vectors_user_mapping() -#define ARCH_HAS_SETUP_ADDITIONAL_PAGES - -#endif diff --git a/arch/unicore32/include/asm/fpstate.h b/arch/unicore32/include/asm/fpstate.h deleted file mode 100644 index 5811293e7a7e..000000000000 --- a/arch/unicore32/include/asm/fpstate.h +++ /dev/null @@ -1,23 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/fpstate.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#ifndef __UNICORE_FPSTATE_H__ -#define __UNICORE_FPSTATE_H__ - -#ifndef __ASSEMBLY__ - -#define FP_REGS_NUMBER 33 - -struct fp_state { - unsigned int regs[FP_REGS_NUMBER]; -} __attribute__((aligned(8))); - -#endif - -#endif diff --git a/arch/unicore32/include/asm/fpu-ucf64.h b/arch/unicore32/include/asm/fpu-ucf64.h deleted file mode 100644 index 7a0c8a9e05d4..000000000000 --- a/arch/unicore32/include/asm/fpu-ucf64.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/fpu-ucf64.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ -#define FPSCR s31 - -/* FPSCR bits */ -#define FPSCR_DEFAULT_NAN (1<<25) - -#define FPSCR_CMPINSTR_BIT (1<<31) - -#define FPSCR_CON (1<<29) -#define FPSCR_TRAP (1<<27) - -/* RND mode */ -#define FPSCR_ROUND_NEAREST (0<<0) -#define FPSCR_ROUND_PLUSINF (2<<0) -#define FPSCR_ROUND_MINUSINF (3<<0) -#define FPSCR_ROUND_TOZERO (1<<0) -#define FPSCR_RMODE_BIT (0) -#define FPSCR_RMODE_MASK (7 << FPSCR_RMODE_BIT) - -/* trap enable */ -#define FPSCR_IOE (1<<16) -#define FPSCR_OFE (1<<14) -#define FPSCR_UFE (1<<13) -#define FPSCR_IXE (1<<12) -#define FPSCR_HIE (1<<11) -#define FPSCR_NDE (1<<10) /* non denomal */ - -/* flags */ -#define FPSCR_IDC (1<<24) -#define FPSCR_HIC (1<<23) -#define FPSCR_IXC (1<<22) -#define FPSCR_OFC (1<<21) -#define FPSCR_UFC (1<<20) -#define FPSCR_IOC (1<<19) - -/* stick bits */ -#define FPSCR_IOS (1<<9) -#define FPSCR_OFS (1<<7) -#define FPSCR_UFS (1<<6) -#define FPSCR_IXS (1<<5) -#define FPSCR_HIS (1<<4) -#define FPSCR_NDS (1<<3) /*non denomal */ diff --git a/arch/unicore32/include/asm/gpio.h b/arch/unicore32/include/asm/gpio.h deleted file mode 100644 index dfad04ca0a65..000000000000 --- a/arch/unicore32/include/asm/gpio.h +++ /dev/null @@ -1,101 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/gpio.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#ifndef __UNICORE_GPIO_H__ -#define __UNICORE_GPIO_H__ - -#include -#include -#include -#include - -#define GPI_OTP_INT 0 -#define GPI_PCI_INTA 1 -#define GPI_PCI_INTB 2 -#define GPI_PCI_INTC 3 -#define GPI_PCI_INTD 4 -#define GPI_BAT_DET 5 -#define GPI_SD_CD 6 -#define GPI_SOFF_REQ 7 -#define GPI_SD_WP 8 -#define GPI_LCD_CASE_OFF 9 -#define GPO_WIFI_EN 10 -#define GPO_HDD_LED 11 -#define GPO_VGA_EN 12 -#define GPO_LCD_EN 13 -#define GPO_LED_DATA 14 -#define GPO_LED_CLK 15 -#define GPO_CAM_PWR_EN 16 -#define GPO_LCD_VCC_EN 17 -#define GPO_SOFT_OFF 18 -#define GPO_BT_EN 19 -#define GPO_FAN_ON 20 -#define GPO_SPKR 21 -#define GPO_SET_V1 23 -#define GPO_SET_V2 24 -#define GPO_CPU_HEALTH 25 -#define GPO_LAN_SEL 26 - -#ifdef CONFIG_PUV3_NB0916 -#define GPI_BTN_TOUCH 14 -#define GPIO_IN 0x000043ff /* 1 for input */ -#define GPIO_OUT 0x0fffbc00 /* 1 for output */ -#endif /* CONFIG_PUV3_NB0916 */ - -#ifdef CONFIG_PUV3_SMW0919 -#define GPIO_IN 0x000003ff /* 1 for input */ -#define GPIO_OUT 0x0ffffc00 /* 1 for output */ -#endif /* CONFIG_PUV3_SMW0919 */ - -#ifdef CONFIG_PUV3_DB0913 -#define GPIO_IN 0x000001df /* 1 for input */ -#define GPIO_OUT 0x03fee800 /* 1 for output */ -#endif /* CONFIG_PUV3_DB0913 */ - -#define GPIO_DIR (~((GPIO_IN) | 0xf0000000)) - /* 0 input, 1 output */ - -static inline int gpio_get_value(unsigned gpio) -{ - if (__builtin_constant_p(gpio) && (gpio <= GPIO_MAX)) - return readl(GPIO_GPLR) & GPIO_GPIO(gpio); - else - return __gpio_get_value(gpio); -} - -static inline void gpio_set_value(unsigned gpio, int value) -{ - if (__builtin_constant_p(gpio) && (gpio <= GPIO_MAX)) - if (value) - writel(GPIO_GPIO(gpio), GPIO_GPSR); - else - writel(GPIO_GPIO(gpio), GPIO_GPCR); - else - __gpio_set_value(gpio, value); -} - -#define gpio_cansleep __gpio_cansleep - -static inline unsigned gpio_to_irq(unsigned gpio) -{ - if ((gpio < IRQ_GPIOHIGH) && (FIELD(1, 1, gpio) & readl(GPIO_GPIR))) - return IRQ_GPIOLOW0 + gpio; - else - return IRQ_GPIO0 + gpio; -} - -static inline unsigned irq_to_gpio(unsigned irq) -{ - if (irq < IRQ_GPIOHIGH) - return irq - IRQ_GPIOLOW0; - else - return irq - IRQ_GPIO0; -} - -#endif /* __UNICORE_GPIO_H__ */ diff --git a/arch/unicore32/include/asm/hwcap.h b/arch/unicore32/include/asm/hwcap.h deleted file mode 100644 index 2e15ffbe8391..000000000000 --- a/arch/unicore32/include/asm/hwcap.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/hwcap.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_HWCAP_H__ -#define __UNICORE_HWCAP_H__ - -/* - * HWCAP flags - */ -#define HWCAP_MSP 1 -#define HWCAP_UNICORE16 2 -#define HWCAP_CMOV 4 -#define HWCAP_UNICORE_F64 8 -#define HWCAP_TLS 0x80 - -#if defined(__KERNEL__) && !defined(__ASSEMBLY__) -/* - * This yields a mask that user programs can use to figure out what - * instruction set this cpu supports. - */ -#define ELF_HWCAP (HWCAP_CMOV | HWCAP_UNICORE_F64) -#endif - -#endif diff --git a/arch/unicore32/include/asm/hwdef-copro.h b/arch/unicore32/include/asm/hwdef-copro.h deleted file mode 100644 index 2db8cf864e43..000000000000 --- a/arch/unicore32/include/asm/hwdef-copro.h +++ /dev/null @@ -1,45 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Co-processor register definitions for PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2012 GUAN Xue-tao - */ -#ifndef __UNICORE_HWDEF_COPRO_H__ -#define __UNICORE_HWDEF_COPRO_H__ - -/* - * Control Register bits (CP#0 CR1) - */ -#define CR_M (1 << 0) /* MMU enable */ -#define CR_A (1 << 1) /* Alignment abort enable */ -#define CR_D (1 << 2) /* Dcache enable */ -#define CR_I (1 << 3) /* Icache enable */ -#define CR_B (1 << 4) /* Dcache write mechanism: write back */ -#define CR_T (1 << 5) /* Burst enable */ -#define CR_V (1 << 13) /* Vectors relocated to 0xffff0000 */ - -#ifndef __ASSEMBLY__ - -#define vectors_high() (cr_alignment & CR_V) - -extern unsigned long cr_no_alignment; /* defined in entry.S */ -extern unsigned long cr_alignment; /* defined in entry.S */ - -static inline unsigned int get_cr(void) -{ - unsigned int val; - asm("movc %0, p0.c1, #0" : "=r" (val) : : "cc"); - return val; -} - -static inline void set_cr(unsigned int val) -{ - asm volatile("movc p0.c1, %0, #0" : : "r" (val) : "cc"); - isb(); -} - -extern void adjust_cr(unsigned long mask, unsigned long set); - -#endif /* __ASSEMBLY__ */ - -#endif /* __UNICORE_HWDEF_COPRO_H__ */ diff --git a/arch/unicore32/include/asm/io.h b/arch/unicore32/include/asm/io.h deleted file mode 100644 index bd4e7c332f85..000000000000 --- a/arch/unicore32/include/asm/io.h +++ /dev/null @@ -1,69 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/io.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_IO_H__ -#define __UNICORE_IO_H__ - -#ifdef __KERNEL__ - -#include -#include - -#define PCI_IOBASE PKUNITY_PCILIO_BASE -#include - -/* - * __uc32_ioremap takes CPU physical address. - */ -extern void __iomem *__uc32_ioremap(unsigned long, size_t); -extern void __uc32_iounmap(volatile void __iomem *addr); - -/* - * ioremap and friends. - * - * ioremap takes a PCI memory address, as specified in - * Documentation/driver-api/io-mapping.rst. - * - */ -#define ioremap(cookie, size) __uc32_ioremap(cookie, size) -#define iounmap(cookie) __uc32_iounmap(cookie) - -#define readb_relaxed readb -#define readw_relaxed readw -#define readl_relaxed readl - -#define HAVE_ARCH_PIO_SIZE -#define PIO_OFFSET (unsigned int)(PCI_IOBASE) -#define PIO_MASK (unsigned int)(IO_SPACE_LIMIT) -#define PIO_RESERVED (PIO_OFFSET + PIO_MASK + 1) - -#ifdef CONFIG_STRICT_DEVMEM - -#include -#include - -/* - * devmem_is_allowed() checks to see if /dev/mem access to a certain - * address is valid. The argument is a physical page number. - * We mimic x86 here by disallowing access to system RAM as well as - * device-exclusive MMIO regions. This effectively disable read()/write() - * on /dev/mem. - */ -static inline int devmem_is_allowed(unsigned long pfn) -{ - if (iomem_is_exclusive(pfn << PAGE_SHIFT)) - return 0; - if (!page_is_ram(pfn)) - return 1; - return 0; -} - -#endif /* CONFIG_STRICT_DEVMEM */ - -#endif /* __KERNEL__ */ -#endif /* __UNICORE_IO_H__ */ diff --git a/arch/unicore32/include/asm/irq.h b/arch/unicore32/include/asm/irq.h deleted file mode 100644 index 3f7f07c0338c..000000000000 --- a/arch/unicore32/include/asm/irq.h +++ /dev/null @@ -1,102 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/irq.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_IRQ_H__ -#define __UNICORE_IRQ_H__ - -#include - -#define IRQ_GPIOLOW0 0x00 -#define IRQ_GPIOLOW1 0x01 -#define IRQ_GPIOLOW2 0x02 -#define IRQ_GPIOLOW3 0x03 -#define IRQ_GPIOLOW4 0x04 -#define IRQ_GPIOLOW5 0x05 -#define IRQ_GPIOLOW6 0x06 -#define IRQ_GPIOLOW7 0x07 -#define IRQ_GPIOHIGH 0x08 -#define IRQ_USB 0x09 -#define IRQ_SDC 0x0a -#define IRQ_AC97 0x0b -#define IRQ_SATA 0x0c -#define IRQ_MME 0x0d -#define IRQ_PCI_BRIDGE 0x0e -#define IRQ_DDR 0x0f -#define IRQ_SPI 0x10 -#define IRQ_UNIGFX 0x11 -#define IRQ_I2C 0x11 -#define IRQ_UART1 0x12 -#define IRQ_UART0 0x13 -#define IRQ_UMAL 0x14 -#define IRQ_NAND 0x15 -#define IRQ_PS2_KBD 0x16 -#define IRQ_PS2_AUX 0x17 -#define IRQ_DMA 0x18 -#define IRQ_DMAERR 0x19 -#define IRQ_TIMER0 0x1a -#define IRQ_TIMER1 0x1b -#define IRQ_TIMER2 0x1c -#define IRQ_TIMER3 0x1d -#define IRQ_RTC 0x1e -#define IRQ_RTCAlarm 0x1f - -#define IRQ_GPIO0 0x20 -#define IRQ_GPIO1 0x21 -#define IRQ_GPIO2 0x22 -#define IRQ_GPIO3 0x23 -#define IRQ_GPIO4 0x24 -#define IRQ_GPIO5 0x25 -#define IRQ_GPIO6 0x26 -#define IRQ_GPIO7 0x27 -#define IRQ_GPIO8 0x28 -#define IRQ_GPIO9 0x29 -#define IRQ_GPIO10 0x2a -#define IRQ_GPIO11 0x2b -#define IRQ_GPIO12 0x2c -#define IRQ_GPIO13 0x2d -#define IRQ_GPIO14 0x2e -#define IRQ_GPIO15 0x2f -#define IRQ_GPIO16 0x30 -#define IRQ_GPIO17 0x31 -#define IRQ_GPIO18 0x32 -#define IRQ_GPIO19 0x33 -#define IRQ_GPIO20 0x34 -#define IRQ_GPIO21 0x35 -#define IRQ_GPIO22 0x36 -#define IRQ_GPIO23 0x37 -#define IRQ_GPIO24 0x38 -#define IRQ_GPIO25 0x39 -#define IRQ_GPIO26 0x3a -#define IRQ_GPIO27 0x3b - -#ifdef CONFIG_ARCH_FPGA -#define IRQ_PCIINTA IRQ_GPIOLOW2 -#define IRQ_PCIINTB IRQ_GPIOLOW1 -#define IRQ_PCIINTC IRQ_GPIOLOW0 -#define IRQ_PCIINTD IRQ_GPIOLOW6 -#endif - -#if defined(CONFIG_PUV3_DB0913) || defined(CONFIG_PUV3_NB0916) \ - || defined(CONFIG_PUV3_SMW0919) -#define IRQ_PCIINTA IRQ_GPIOLOW1 -#define IRQ_PCIINTB IRQ_GPIOLOW2 -#define IRQ_PCIINTC IRQ_GPIOLOW3 -#define IRQ_PCIINTD IRQ_GPIOLOW4 -#endif - -#define IRQ_SD_CD IRQ_GPIO6 /* falling or rising trigger */ - -#ifndef __ASSEMBLY__ -struct pt_regs; - -extern void asm_do_IRQ(unsigned int, struct pt_regs *); - -#endif - -#endif - diff --git a/arch/unicore32/include/asm/irqflags.h b/arch/unicore32/include/asm/irqflags.h deleted file mode 100644 index f64c82e3eae6..000000000000 --- a/arch/unicore32/include/asm/irqflags.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/irqflags.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_IRQFLAGS_H__ -#define __UNICORE_IRQFLAGS_H__ - -#ifdef __KERNEL__ - -#include - -#define ARCH_IRQ_DISABLED (PRIV_MODE | PSR_I_BIT) -#define ARCH_IRQ_ENABLED (PRIV_MODE) - -/* - * Save the current interrupt enable state. - */ -static inline unsigned long arch_local_save_flags(void) -{ - unsigned long temp; - - asm volatile("mov %0, asr" : "=r" (temp) : : "memory", "cc"); - - return temp & PSR_c; -} - -/* - * restore saved IRQ state - */ -static inline void arch_local_irq_restore(unsigned long flags) -{ - unsigned long temp; - - asm volatile( - "mov %0, asr\n" - "mov.a asr, %1\n" - "mov.f asr, %0" - : "=&r" (temp) - : "r" (flags) - : "memory", "cc"); -} - -#include - -#endif -#endif diff --git a/arch/unicore32/include/asm/linkage.h b/arch/unicore32/include/asm/linkage.h deleted file mode 100644 index 8e341ba7bc4a..000000000000 --- a/arch/unicore32/include/asm/linkage.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/linkage.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_LINKAGE_H__ -#define __UNICORE_LINKAGE_H__ - -#define __ALIGN .align 0 -#define __ALIGN_STR ".align 0" - -#define ENDPROC(name) \ - .type name, %function; \ - END(name) - -#endif diff --git a/arch/unicore32/include/asm/memblock.h b/arch/unicore32/include/asm/memblock.h deleted file mode 100644 index eb56a6ddce83..000000000000 --- a/arch/unicore32/include/asm/memblock.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/memblock.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#ifndef __UNICORE_MEMBLOCK_H__ -#define __UNICORE_MEMBLOCK_H__ - -/* - * Memory map description - */ -# define NR_BANKS 8 - -struct membank { - unsigned long start; - unsigned long size; - unsigned int highmem; -}; - -struct meminfo { - int nr_banks; - struct membank bank[NR_BANKS]; -}; - -extern struct meminfo meminfo; - -#define for_each_bank(iter, mi) \ - for (iter = 0; iter < (mi)->nr_banks; iter++) - -#define bank_pfn_start(bank) __phys_to_pfn((bank)->start) -#define bank_pfn_end(bank) __phys_to_pfn((bank)->start + (bank)->size) -#define bank_pfn_size(bank) ((bank)->size >> PAGE_SHIFT) -#define bank_phys_start(bank) ((bank)->start) -#define bank_phys_end(bank) ((bank)->start + (bank)->size) -#define bank_phys_size(bank) ((bank)->size) - -extern void uc32_memblock_init(struct meminfo *); - -#endif diff --git a/arch/unicore32/include/asm/memory.h b/arch/unicore32/include/asm/memory.h deleted file mode 100644 index 66285178dd9b..000000000000 --- a/arch/unicore32/include/asm/memory.h +++ /dev/null @@ -1,102 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/memory.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * Note: this file should not be included by non-asm/.h files - */ -#ifndef __UNICORE_MEMORY_H__ -#define __UNICORE_MEMORY_H__ - -#include -#include -#include -#include - -/* - * PAGE_OFFSET - the virtual address of the start of the kernel image - * TASK_SIZE - the maximum size of a user space task. - * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area - */ -#define PAGE_OFFSET UL(0xC0000000) -#define TASK_SIZE (PAGE_OFFSET - UL(0x41000000)) -#define TASK_UNMAPPED_BASE (PAGE_OFFSET / 3) - -/* - * The module space lives between the addresses given by TASK_SIZE - * and PAGE_OFFSET - it must be within 32MB of the kernel text. - */ -#define MODULES_VADDR (PAGE_OFFSET - 16*1024*1024) -#if TASK_SIZE > MODULES_VADDR -#error Top of user space clashes with start of module space -#endif - -#define MODULES_END (PAGE_OFFSET) - -/* - * Allow 16MB-aligned ioremap pages - */ -#define IOREMAP_MAX_ORDER 24 - -/* - * Physical vs virtual RAM address space conversion. These are - * private definitions which should NOT be used outside memory.h - * files. Use virt_to_phys/phys_to_virt/__pa/__va instead. - */ -#ifndef __virt_to_phys -#define __virt_to_phys(x) ((x) - PAGE_OFFSET + PHYS_OFFSET) -#define __phys_to_virt(x) ((x) - PHYS_OFFSET + PAGE_OFFSET) -#endif - -/* - * Convert a page to/from a physical address - */ -#define page_to_phys(page) (__pfn_to_phys(page_to_pfn(page))) -#define phys_to_page(phys) (pfn_to_page(__phys_to_pfn(phys))) - -#ifndef __ASSEMBLY__ - -#ifndef arch_adjust_zones -#define arch_adjust_zones(max_zone_pfn) do { } while (0) -#endif - -/* - * PFNs are used to describe any physical page; this means - * PFN 0 == physical address 0. - * - * This is the PFN of the first RAM page in the kernel - * direct-mapped view. We assume this is the first page - * of RAM in the mem_map as well. - */ -#define PHYS_PFN_OFFSET (PHYS_OFFSET >> PAGE_SHIFT) - -/* - * Drivers should NOT use these either. - */ -#define __pa(x) __virt_to_phys((unsigned long)(x)) -#define __va(x) ((void *)__phys_to_virt((unsigned long)(x))) -#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) - -/* - * Conversion between a struct page and a physical address. - * - * page_to_pfn(page) convert a struct page * to a PFN number - * pfn_to_page(pfn) convert a _valid_ PFN number to struct page * - * - * virt_to_page(k) convert a _valid_ virtual address to struct page * - * virt_addr_valid(k) indicates whether a virtual address is valid - */ -#define ARCH_PFN_OFFSET PHYS_PFN_OFFSET - -#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#define virt_addr_valid(kaddr) ((unsigned long)(kaddr) >= PAGE_OFFSET && \ - (unsigned long)(kaddr) < (unsigned long)high_memory) - -#endif - -#include - -#endif diff --git a/arch/unicore32/include/asm/mmu.h b/arch/unicore32/include/asm/mmu.h deleted file mode 100644 index 8ad4e7eae17b..000000000000 --- a/arch/unicore32/include/asm/mmu.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/mmu.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_MMU_H__ -#define __UNICORE_MMU_H__ - -typedef unsigned long mm_context_t; - -#endif diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h deleted file mode 100644 index 388c0c811c68..000000000000 --- a/arch/unicore32/include/asm/mmu_context.h +++ /dev/null @@ -1,98 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/mmu_context.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_MMU_CONTEXT_H__ -#define __UNICORE_MMU_CONTEXT_H__ - -#include -#include -#include -#include -#include - -#include -#include - -#define init_new_context(tsk, mm) 0 - -#define destroy_context(mm) do { } while (0) - -/* - * This is called when "tsk" is about to enter lazy TLB mode. - * - * mm: describes the currently active mm context - * tsk: task which is entering lazy tlb - * cpu: cpu number which is entering lazy tlb - * - * tsk->mm will be NULL - */ -static inline void -enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) -{ -} - -/* - * This is the actual mm switch as far as the scheduler - * is concerned. No registers are touched. We avoid - * calling the CPU specific function when the mm hasn't - * actually changed. - */ -static inline void -switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) -{ - unsigned int cpu = smp_processor_id(); - - if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next)) || prev != next) - cpu_switch_mm(next->pgd, next); -} - -#define deactivate_mm(tsk, mm) do { } while (0) -#define activate_mm(prev, next) switch_mm(prev, next, NULL) - -/* - * We are inserting a "fake" vma for the user-accessible vector page so - * gdb and friends can get to it through ptrace and /proc//mem. - * But we also want to remove it before the generic code gets to see it - * during process exit or the unmapping of it would cause total havoc. - * (the macro is used as remove_vma() is static to mm/mmap.c) - */ -#define arch_exit_mmap(mm) \ -do { \ - struct vm_area_struct *high_vma = find_vma(mm, 0xffff0000); \ - if (high_vma) { \ - BUG_ON(high_vma->vm_next); /* it should be last */ \ - if (high_vma->vm_prev) \ - high_vma->vm_prev->vm_next = NULL; \ - else \ - mm->mmap = NULL; \ - rb_erase(&high_vma->vm_rb, &mm->mm_rb); \ - vmacache_invalidate(mm); \ - mm->map_count--; \ - remove_vma(high_vma); \ - } \ -} while (0) - -static inline int arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) -{ - return 0; -} - -static inline void arch_unmap(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ -} - -static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, - bool write, bool execute, bool foreign) -{ - /* by default, allow everything */ - return true; -} -#endif diff --git a/arch/unicore32/include/asm/page.h b/arch/unicore32/include/asm/page.h deleted file mode 100644 index 96d6bdf180bd..000000000000 --- a/arch/unicore32/include/asm/page.h +++ /dev/null @@ -1,74 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/page.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_PAGE_H__ -#define __UNICORE_PAGE_H__ - -/* PAGE_SHIFT determines the page size */ -#define PAGE_SHIFT 12 -#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE-1)) - -#ifndef __ASSEMBLY__ - -struct page; -struct vm_area_struct; - -#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE) -extern void copy_page(void *to, const void *from); - -#define clear_user_page(page, vaddr, pg) clear_page(page) -#define copy_user_page(to, from, vaddr, pg) copy_page(to, from) - -#undef STRICT_MM_TYPECHECKS - -#ifdef STRICT_MM_TYPECHECKS -/* - * These are used to make use of C type-checking.. - */ -typedef struct { unsigned long pte; } pte_t; -typedef struct { unsigned long pgd; } pgd_t; -typedef struct { unsigned long pgprot; } pgprot_t; - -#define pte_val(x) ((x).pte) -#define pgd_val(x) ((x).pgd) -#define pgprot_val(x) ((x).pgprot) - -#define __pte(x) ((pte_t) { (x) }) -#define __pgd(x) ((pgd_t) { (x) }) -#define __pgprot(x) ((pgprot_t) { (x) }) - -#else -/* - * .. while these make it easier on the compiler - */ -typedef unsigned long pte_t; -typedef unsigned long pgd_t; -typedef unsigned long pgprot_t; - -#define pte_val(x) (x) -#define pgd_val(x) (x) -#define pgprot_val(x) (x) - -#define __pte(x) (x) -#define __pgd(x) (x) -#define __pgprot(x) (x) - -#endif /* STRICT_MM_TYPECHECKS */ - -typedef struct page *pgtable_t; - -extern int pfn_valid(unsigned long); - -#include - -#endif /* !__ASSEMBLY__ */ - -#include - -#endif diff --git a/arch/unicore32/include/asm/pci.h b/arch/unicore32/include/asm/pci.h deleted file mode 100644 index 3efa8ee1afce..000000000000 --- a/arch/unicore32/include/asm/pci.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/pci.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_PCI_H__ -#define __UNICORE_PCI_H__ - -#ifdef __KERNEL__ -#include -#include /* for PCIBIOS_MIN_* */ - -#define HAVE_PCI_MMAP -#define ARCH_GENERIC_PCI_MMAP_RESOURCE - -#endif /* __KERNEL__ */ -#endif diff --git a/arch/unicore32/include/asm/pgalloc.h b/arch/unicore32/include/asm/pgalloc.h deleted file mode 100644 index ba1c9a79993b..000000000000 --- a/arch/unicore32/include/asm/pgalloc.h +++ /dev/null @@ -1,87 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/pgalloc.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_PGALLOC_H__ -#define __UNICORE_PGALLOC_H__ - -#include -#include -#include -#include - -#define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL -#define __HAVE_ARCH_PTE_ALLOC_ONE -#include - -#define _PAGE_USER_TABLE (PMD_TYPE_TABLE | PMD_PRESENT) -#define _PAGE_KERNEL_TABLE (PMD_TYPE_TABLE | PMD_PRESENT) - -extern pgd_t *get_pgd_slow(struct mm_struct *mm); -extern void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd); - -#define pgd_alloc(mm) get_pgd_slow(mm) -#define pgd_free(mm, pgd) free_pgd_slow(mm, pgd) - -/* - * Allocate one PTE table. - */ -static inline pte_t * -pte_alloc_one_kernel(struct mm_struct *mm) -{ - pte_t *pte = __pte_alloc_one_kernel(mm); - - if (pte) - clean_dcache_area(pte, PTRS_PER_PTE * sizeof(pte_t)); - - return pte; -} - -static inline pgtable_t -pte_alloc_one(struct mm_struct *mm) -{ - struct page *pte; - - pte = __pte_alloc_one(mm, GFP_PGTABLE_USER); - if (!pte) - return NULL; - if (!PageHighMem(pte)) - clean_pte_table(page_address(pte)); - return pte; -} - -static inline void __pmd_populate(pmd_t *pmdp, unsigned long pmdval) -{ - set_pmd(pmdp, __pmd(pmdval)); - flush_pmd_entry(pmdp); -} - -/* - * Populate the pmdp entry with a pointer to the pte. This pmd is part - * of the mm address space. - */ -static inline void -pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep) -{ - unsigned long pte_ptr = (unsigned long)ptep; - - /* - * The pmd must be loaded with the physical - * address of the PTE table - */ - __pmd_populate(pmdp, __pa(pte_ptr) | _PAGE_KERNEL_TABLE); -} - -static inline void -pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep) -{ - __pmd_populate(pmdp, - page_to_pfn(ptep) << PAGE_SHIFT | _PAGE_USER_TABLE); -} -#define pmd_pgtable(pmd) pmd_page(pmd) - -#endif diff --git a/arch/unicore32/include/asm/pgtable-hwdef.h b/arch/unicore32/include/asm/pgtable-hwdef.h deleted file mode 100644 index f28b58c61db9..000000000000 --- a/arch/unicore32/include/asm/pgtable-hwdef.h +++ /dev/null @@ -1,51 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/pgtable-hwdef.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_PGTABLE_HWDEF_H__ -#define __UNICORE_PGTABLE_HWDEF_H__ - -/* - * Hardware page table definitions. - * - * + Level 1 descriptor (PMD) - * - common - */ -#define PMD_TYPE_MASK (3 << 0) -#define PMD_TYPE_TABLE (0 << 0) -/*#define PMD_TYPE_LARGE (1 << 0) */ -#define PMD_TYPE_INVALID (2 << 0) -#define PMD_TYPE_SECT (3 << 0) - -#define PMD_PRESENT (1 << 2) -#define PMD_YOUNG (1 << 3) - -/*#define PMD_SECT_DIRTY (1 << 4) */ -#define PMD_SECT_CACHEABLE (1 << 5) -#define PMD_SECT_EXEC (1 << 6) -#define PMD_SECT_WRITE (1 << 7) -#define PMD_SECT_READ (1 << 8) - -/* - * + Level 2 descriptor (PTE) - * - common - */ -#define PTE_TYPE_MASK (3 << 0) -#define PTE_TYPE_SMALL (0 << 0) -#define PTE_TYPE_MIDDLE (1 << 0) -#define PTE_TYPE_LARGE (2 << 0) -#define PTE_TYPE_INVALID (3 << 0) - -#define PTE_PRESENT (1 << 2) -#define PTE_YOUNG (1 << 3) -#define PTE_DIRTY (1 << 4) -#define PTE_CACHEABLE (1 << 5) -#define PTE_EXEC (1 << 6) -#define PTE_WRITE (1 << 7) -#define PTE_READ (1 << 8) - -#endif diff --git a/arch/unicore32/include/asm/pgtable.h b/arch/unicore32/include/asm/pgtable.h deleted file mode 100644 index 97f564c8ecba..000000000000 --- a/arch/unicore32/include/asm/pgtable.h +++ /dev/null @@ -1,267 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/pgtable.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_PGTABLE_H__ -#define __UNICORE_PGTABLE_H__ - -#include -#include - -#include -#include - -/* - * Just any arbitrary offset to the start of the vmalloc VM area: the - * current 8MB value just means that there will be a 8MB "hole" after the - * physical memory until the kernel virtual memory starts. That means that - * any out-of-bounds memory accesses will hopefully be caught. - * The vmalloc() routines leaves a hole of 4kB between each vmalloced - * area for the same reason. ;) - * - * Note that platforms may override VMALLOC_START, but they must provide - * VMALLOC_END. VMALLOC_END defines the (exclusive) limit of this space, - * which may not overlap IO space. - */ -#ifndef VMALLOC_START -#define VMALLOC_OFFSET SZ_8M -#define VMALLOC_START (((unsigned long)high_memory + VMALLOC_OFFSET) \ - & ~(VMALLOC_OFFSET-1)) -#define VMALLOC_END (0xff000000UL) -#endif - -#define PTRS_PER_PTE 1024 -#define PTRS_PER_PGD 1024 - -/* - * PGDIR_SHIFT determines what a third-level page table entry can map - */ -#define PGDIR_SHIFT 22 - -#ifndef __ASSEMBLY__ -extern void __pte_error(const char *file, int line, unsigned long val); -extern void __pgd_error(const char *file, int line, unsigned long val); - -#define pte_ERROR(pte) __pte_error(__FILE__, __LINE__, pte_val(pte)) -#define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) -#endif /* !__ASSEMBLY__ */ - -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE-1)) - -/* - * This is the lowest virtual address we can permit any user space - * mapping to be mapped at. This is particularly important for - * non-high vector CPUs. - */ -#define FIRST_USER_ADDRESS PAGE_SIZE - -#define FIRST_USER_PGD_NR 1 -#define USER_PTRS_PER_PGD ((TASK_SIZE/PGDIR_SIZE) - FIRST_USER_PGD_NR) - -/* - * section address mask and size definitions. - */ -#define SECTION_SHIFT 22 -#define SECTION_SIZE (1UL << SECTION_SHIFT) -#define SECTION_MASK (~(SECTION_SIZE-1)) - -#ifndef __ASSEMBLY__ - -/* - * The pgprot_* and protection_map entries will be fixed up in runtime - * to include the cachable bits based on memory policy, as well as any - * architecture dependent bits. - */ -#define _PTE_DEFAULT (PTE_PRESENT | PTE_YOUNG | PTE_CACHEABLE) - -extern pgprot_t pgprot_user; -extern pgprot_t pgprot_kernel; - -#define PAGE_NONE pgprot_user -#define PAGE_SHARED __pgprot(pgprot_val(pgprot_user | PTE_READ \ - | PTE_WRITE)) -#define PAGE_SHARED_EXEC __pgprot(pgprot_val(pgprot_user | PTE_READ \ - | PTE_WRITE \ - | PTE_EXEC)) -#define PAGE_COPY __pgprot(pgprot_val(pgprot_user | PTE_READ) -#define PAGE_COPY_EXEC __pgprot(pgprot_val(pgprot_user | PTE_READ \ - | PTE_EXEC)) -#define PAGE_READONLY __pgprot(pgprot_val(pgprot_user | PTE_READ)) -#define PAGE_READONLY_EXEC __pgprot(pgprot_val(pgprot_user | PTE_READ \ - | PTE_EXEC)) -#define PAGE_KERNEL pgprot_kernel -#define PAGE_KERNEL_EXEC __pgprot(pgprot_val(pgprot_kernel | PTE_EXEC)) - -#define __PAGE_NONE __pgprot(_PTE_DEFAULT) -#define __PAGE_SHARED __pgprot(_PTE_DEFAULT | PTE_READ \ - | PTE_WRITE) -#define __PAGE_SHARED_EXEC __pgprot(_PTE_DEFAULT | PTE_READ \ - | PTE_WRITE \ - | PTE_EXEC) -#define __PAGE_COPY __pgprot(_PTE_DEFAULT | PTE_READ) -#define __PAGE_COPY_EXEC __pgprot(_PTE_DEFAULT | PTE_READ \ - | PTE_EXEC) -#define __PAGE_READONLY __pgprot(_PTE_DEFAULT | PTE_READ) -#define __PAGE_READONLY_EXEC __pgprot(_PTE_DEFAULT | PTE_READ \ - | PTE_EXEC) - -#endif /* __ASSEMBLY__ */ - -/* - * The table below defines the page protection levels that we insert into our - * Linux page table version. These get translated into the best that the - * architecture can perform. Note that on UniCore hardware: - * 1) We cannot do execute protection - * 2) If we could do execute protection, then read is implied - * 3) write implies read permissions - */ -#define __P000 __PAGE_NONE -#define __P001 __PAGE_READONLY -#define __P010 __PAGE_COPY -#define __P011 __PAGE_COPY -#define __P100 __PAGE_READONLY_EXEC -#define __P101 __PAGE_READONLY_EXEC -#define __P110 __PAGE_COPY_EXEC -#define __P111 __PAGE_COPY_EXEC - -#define __S000 __PAGE_NONE -#define __S001 __PAGE_READONLY -#define __S010 __PAGE_SHARED -#define __S011 __PAGE_SHARED -#define __S100 __PAGE_READONLY_EXEC -#define __S101 __PAGE_READONLY_EXEC -#define __S110 __PAGE_SHARED_EXEC -#define __S111 __PAGE_SHARED_EXEC - -#ifndef __ASSEMBLY__ -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. - */ -extern struct page *empty_zero_page; -#define ZERO_PAGE(vaddr) (empty_zero_page) - -#define pte_pfn(pte) (pte_val(pte) >> PAGE_SHIFT) -#define pfn_pte(pfn, prot) (__pte(((pfn) << PAGE_SHIFT) \ - | pgprot_val(prot))) - -#define pte_none(pte) (!pte_val(pte)) -#define pte_clear(mm, addr, ptep) set_pte(ptep, __pte(0)) -#define pte_page(pte) (pfn_to_page(pte_pfn(pte))) - -#define set_pte(ptep, pte) cpu_set_pte(ptep, pte) - -#define set_pte_at(mm, addr, ptep, pteval) \ - do { \ - set_pte(ptep, pteval); \ - } while (0) - -/* - * The following only work if pte_present() is true. - * Undefined behaviour if not.. - */ -#define pte_present(pte) (pte_val(pte) & PTE_PRESENT) -#define pte_write(pte) (pte_val(pte) & PTE_WRITE) -#define pte_dirty(pte) (pte_val(pte) & PTE_DIRTY) -#define pte_young(pte) (pte_val(pte) & PTE_YOUNG) -#define pte_exec(pte) (pte_val(pte) & PTE_EXEC) - -#define PTE_BIT_FUNC(fn, op) \ -static inline pte_t pte_##fn(pte_t pte) { pte_val(pte) op; return pte; } - -PTE_BIT_FUNC(wrprotect, &= ~PTE_WRITE); -PTE_BIT_FUNC(mkwrite, |= PTE_WRITE); -PTE_BIT_FUNC(mkclean, &= ~PTE_DIRTY); -PTE_BIT_FUNC(mkdirty, |= PTE_DIRTY); -PTE_BIT_FUNC(mkold, &= ~PTE_YOUNG); -PTE_BIT_FUNC(mkyoung, |= PTE_YOUNG); - -/* - * Mark the prot value as uncacheable. - */ -#define pgprot_noncached(prot) \ - __pgprot(pgprot_val(prot) & ~PTE_CACHEABLE) -#define pgprot_writecombine(prot) \ - __pgprot(pgprot_val(prot) & ~PTE_CACHEABLE) - -#define pmd_none(pmd) (!pmd_val(pmd)) -#define pmd_present(pmd) (pmd_val(pmd) & PMD_PRESENT) -#define pmd_bad(pmd) (((pmd_val(pmd) & \ - (PMD_PRESENT | PMD_TYPE_MASK)) \ - != (PMD_PRESENT | PMD_TYPE_TABLE))) - -#define set_pmd(pmdpd, pmdval) \ - do { \ - *(pmdpd) = pmdval; \ - } while (0) - -#define pmd_clear(pmdp) \ - do { \ - set_pmd(pmdp, __pmd(0));\ - clean_pmd_entry(pmdp); \ - } while (0) - -#define pmd_page_vaddr(pmd) ((pte_t *)__va(pmd_val(pmd) & PAGE_MASK)) -#define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd))) - -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. - */ -#define mk_pte(page, prot) pfn_pte(page_to_pfn(page), prot) - -static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) -{ - const unsigned long mask = PTE_EXEC | PTE_WRITE | PTE_READ; - pte_val(pte) = (pte_val(pte) & ~mask) | (pgprot_val(newprot) & mask); - return pte; -} - -extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; - -/* - * Encode and decode a swap entry. Swap entries are stored in the Linux - * page tables as follows: - * - * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 - * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 - * <--------------- offset --------------> <--- type --> 0 0 0 0 0 - * - * This gives us up to 127 swap files and 32GB per swap file. Note that - * the offset field is always non-zero. - */ -#define __SWP_TYPE_SHIFT 5 -#define __SWP_TYPE_BITS 7 -#define __SWP_TYPE_MASK ((1 << __SWP_TYPE_BITS) - 1) -#define __SWP_OFFSET_SHIFT (__SWP_TYPE_BITS + __SWP_TYPE_SHIFT) - -#define __swp_type(x) (((x).val >> __SWP_TYPE_SHIFT) \ - & __SWP_TYPE_MASK) -#define __swp_offset(x) ((x).val >> __SWP_OFFSET_SHIFT) -#define __swp_entry(type, offset) ((swp_entry_t) { \ - ((type) << __SWP_TYPE_SHIFT) | \ - ((offset) << __SWP_OFFSET_SHIFT) }) - -#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) -#define __swp_entry_to_pte(swp) ((pte_t) { (swp).val }) - -/* - * It is an error for the kernel to have more swap files than we can - * encode in the PTEs. This ensures that we know when MAX_SWAPFILES - * is increased beyond what we presently support. - */ -#define MAX_SWAPFILES_CHECK() \ - BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS) - -/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ -/* FIXME: this is not correct */ -#define kern_addr_valid(addr) (1) - -#endif /* !__ASSEMBLY__ */ - -#endif /* __UNICORE_PGTABLE_H__ */ diff --git a/arch/unicore32/include/asm/processor.h b/arch/unicore32/include/asm/processor.h deleted file mode 100644 index 6f01620da3d1..000000000000 --- a/arch/unicore32/include/asm/processor.h +++ /dev/null @@ -1,74 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/processor.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#ifndef __UNICORE_PROCESSOR_H__ -#define __UNICORE_PROCESSOR_H__ - -#ifdef __KERNEL__ - -#include -#include - -#ifdef __KERNEL__ -#define STACK_TOP TASK_SIZE -#define STACK_TOP_MAX TASK_SIZE -#endif - -struct debug_entry { - u32 address; - u32 insn; -}; - -struct debug_info { - int nsaved; - struct debug_entry bp[2]; -}; - -struct thread_struct { - /* fault info */ - unsigned long address; - unsigned long trap_no; - unsigned long error_code; - /* debugging */ - struct debug_info debug; -}; - -#define INIT_THREAD { } - -#define start_thread(regs, pc, sp) \ -({ \ - unsigned long *stack = (unsigned long *)sp; \ - memset(regs->uregs, 0, sizeof(regs->uregs)); \ - regs->UCreg_asr = USER_MODE; \ - regs->UCreg_pc = pc & ~1; /* pc */ \ - regs->UCreg_sp = sp; /* sp */ \ - regs->UCreg_02 = stack[2]; /* r2 (envp) */ \ - regs->UCreg_01 = stack[1]; /* r1 (argv) */ \ - regs->UCreg_00 = stack[0]; /* r0 (argc) */ \ -}) - -/* Forward declaration, a strange C thing */ -struct task_struct; - -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); - -unsigned long get_wchan(struct task_struct *p); - -#define cpu_relax() barrier() - -#define task_pt_regs(p) \ - ((struct pt_regs *)(THREAD_START_SP + task_stack_page(p)) - 1) - -#define KSTK_EIP(tsk) (task_pt_regs(tsk)->UCreg_pc) -#define KSTK_ESP(tsk) (task_pt_regs(tsk)->UCreg_sp) - -#endif - -#endif /* __UNICORE_PROCESSOR_H__ */ diff --git a/arch/unicore32/include/asm/ptrace.h b/arch/unicore32/include/asm/ptrace.h deleted file mode 100644 index bb4cbc42c321..000000000000 --- a/arch/unicore32/include/asm/ptrace.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/ptrace.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_PTRACE_H__ -#define __UNICORE_PTRACE_H__ - -#include - -#ifndef __ASSEMBLY__ - -#define user_mode(regs) \ - (processor_mode(regs) == USER_MODE) - -#define processor_mode(regs) \ - ((regs)->UCreg_asr & MODE_MASK) - -#define interrupts_enabled(regs) \ - (!((regs)->UCreg_asr & PSR_I_BIT)) - -#define fast_interrupts_enabled(regs) \ - (!((regs)->UCreg_asr & PSR_R_BIT)) - -/* Are the current registers suitable for user mode? - * (used to maintain security in signal handlers) - */ -static inline int valid_user_regs(struct pt_regs *regs) -{ - unsigned long mode = regs->UCreg_asr & MODE_MASK; - - /* - * Always clear the R (REAL) bits - */ - regs->UCreg_asr &= ~(PSR_R_BIT); - - if ((regs->UCreg_asr & PSR_I_BIT) == 0) { - if (mode == USER_MODE) - return 1; - } - - /* - * Force ASR to something logical... - */ - regs->UCreg_asr &= PSR_f | USER_MODE; - - return 0; -} - -#define instruction_pointer(regs) ((regs)->UCreg_pc) -#define user_stack_pointer(regs) ((regs)->UCreg_sp) -#define profile_pc(regs) instruction_pointer(regs) - -#endif /* __ASSEMBLY__ */ -#endif diff --git a/arch/unicore32/include/asm/stacktrace.h b/arch/unicore32/include/asm/stacktrace.h deleted file mode 100644 index 3e59f9d2faed..000000000000 --- a/arch/unicore32/include/asm/stacktrace.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/stacktrace.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#ifndef __UNICORE_STACKTRACE_H__ -#define __UNICORE_STACKTRACE_H__ - -struct stackframe { - unsigned long fp; - unsigned long sp; - unsigned long lr; - unsigned long pc; -}; - -#ifdef CONFIG_FRAME_POINTER -extern int unwind_frame(struct stackframe *frame); -#else -#define unwind_frame(f) (-EINVAL) -#endif -extern void walk_stackframe(struct stackframe *frame, - int (*fn)(struct stackframe *, void *), void *data); - -#endif /* __UNICORE_STACKTRACE_H__ */ diff --git a/arch/unicore32/include/asm/string.h b/arch/unicore32/include/asm/string.h deleted file mode 100644 index 1649b0e4271b..000000000000 --- a/arch/unicore32/include/asm/string.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/string.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_STRING_H__ -#define __UNICORE_STRING_H__ - -/* - * We don't do inline string functions, since the - * optimised inline asm versions are not small. - */ - -#define __HAVE_ARCH_STRRCHR -extern char *strrchr(const char *s, int c); - -#define __HAVE_ARCH_STRCHR -extern char *strchr(const char *s, int c); - -#define __HAVE_ARCH_MEMCPY -extern void *memcpy(void *, const void *, __kernel_size_t); - -#define __HAVE_ARCH_MEMMOVE -extern void *memmove(void *, const void *, __kernel_size_t); - -#define __HAVE_ARCH_MEMCHR -extern void *memchr(const void *, int, __kernel_size_t); - -#define __HAVE_ARCH_MEMSET -extern void *memset(void *, int, __kernel_size_t); - -#endif diff --git a/arch/unicore32/include/asm/suspend.h b/arch/unicore32/include/asm/suspend.h deleted file mode 100644 index 72bd89c44d10..000000000000 --- a/arch/unicore32/include/asm/suspend.h +++ /dev/null @@ -1,26 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/suspend.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#ifndef __UNICORE_SUSPEND_H__ -#define __UNICORE_SUSPEND_H__ - -#ifndef __ASSEMBLY__ - -#include - -struct swsusp_arch_regs { - struct cpu_context_save cpu_context; /* cpu context */ -#ifdef CONFIG_UNICORE_FPU_F64 - struct fp_state fpstate __attribute__((aligned(8))); -#endif -}; -#endif - -#endif /* __UNICORE_SUSPEND_H__ */ - diff --git a/arch/unicore32/include/asm/switch_to.h b/arch/unicore32/include/asm/switch_to.h deleted file mode 100644 index 12e534b3bfa5..000000000000 --- a/arch/unicore32/include/asm/switch_to.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Task switching for PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2012 GUAN Xue-tao - */ -#ifndef __UNICORE_SWITCH_TO_H__ -#define __UNICORE_SWITCH_TO_H__ - -struct task_struct; -struct thread_info; - -/* - * switch_to(prev, next) should switch from task `prev' to `next' - * `prev' will never be the same as `next'. schedule() itself - * contains the memory barrier to tell GCC not to cache `current'. - */ -extern struct task_struct *__switch_to(struct task_struct *, - struct thread_info *, struct thread_info *); - -#define switch_to(prev, next, last) \ - do { \ - last = __switch_to(prev, task_thread_info(prev), \ - task_thread_info(next)); \ - } while (0) - -#endif /* __UNICORE_SWITCH_TO_H__ */ diff --git a/arch/unicore32/include/asm/syscall.h b/arch/unicore32/include/asm/syscall.h deleted file mode 100644 index 607961797fff..000000000000 --- a/arch/unicore32/include/asm/syscall.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_UNICORE_SYSCALL_H -#define _ASM_UNICORE_SYSCALL_H - -#include - -static inline int syscall_get_arch(struct task_struct *task) -{ - return AUDIT_ARCH_UNICORE; -} - -#endif /* _ASM_UNICORE_SYSCALL_H */ diff --git a/arch/unicore32/include/asm/thread_info.h b/arch/unicore32/include/asm/thread_info.h deleted file mode 100644 index d8a6d6b7a403..000000000000 --- a/arch/unicore32/include/asm/thread_info.h +++ /dev/null @@ -1,133 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/thread_info.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_THREAD_INFO_H__ -#define __UNICORE_THREAD_INFO_H__ - -#ifdef __KERNEL__ - -#include -#include - -#define THREAD_SIZE_ORDER 1 -#define THREAD_SIZE 8192 -#define THREAD_START_SP (THREAD_SIZE - 8) - -#ifndef __ASSEMBLY__ - -struct task_struct; - -#include - -typedef struct { - unsigned long seg; -} mm_segment_t; - -struct cpu_context_save { - __u32 r4; - __u32 r5; - __u32 r6; - __u32 r7; - __u32 r8; - __u32 r9; - __u32 r10; - __u32 r11; - __u32 r12; - __u32 r13; - __u32 r14; - __u32 r15; - __u32 r16; - __u32 r17; - __u32 r18; - __u32 r19; - __u32 r20; - __u32 r21; - __u32 r22; - __u32 r23; - __u32 r24; - __u32 r25; - __u32 r26; - __u32 fp; - __u32 sp; - __u32 pc; -}; - -/* - * low level task data that entry.S needs immediate access to. - * __switch_to() assumes cpu_context follows immediately after cpu_domain. - */ -struct thread_info { - unsigned long flags; /* low level flags */ - int preempt_count; /* 0 => preemptable */ - /* <0 => bug */ - mm_segment_t addr_limit; /* address limit */ - struct task_struct *task; /* main task structure */ - __u32 cpu; /* cpu */ - struct cpu_context_save cpu_context; /* cpu context */ - __u32 syscall; /* syscall number */ - __u8 used_cp[16]; /* thread used copro */ -#ifdef CONFIG_UNICORE_FPU_F64 - struct fp_state fpstate __attribute__((aligned(8))); -#endif -}; - -#define INIT_THREAD_INFO(tsk) \ -{ \ - .task = &tsk, \ - .flags = 0, \ - .preempt_count = INIT_PREEMPT_COUNT, \ - .addr_limit = KERNEL_DS, \ -} - -/* - * how to get the thread information struct from C - */ -static inline struct thread_info *current_thread_info(void) __attribute_const__; - -static inline struct thread_info *current_thread_info(void) -{ - register unsigned long sp asm ("sp"); - return (struct thread_info *)(sp & ~(THREAD_SIZE - 1)); -} - -#define thread_saved_pc(tsk) \ - ((unsigned long)(task_thread_info(tsk)->cpu_context.pc)) -#define thread_saved_sp(tsk) \ - ((unsigned long)(task_thread_info(tsk)->cpu_context.sp)) -#define thread_saved_fp(tsk) \ - ((unsigned long)(task_thread_info(tsk)->cpu_context.fp)) - -#endif - -/* - * thread information flags: - * TIF_SYSCALL_TRACE - syscall trace active - * TIF_SIGPENDING - signal pending - * TIF_NEED_RESCHED - rescheduling necessary - * TIF_NOTIFY_RESUME - callback before returning to user - */ -#define TIF_SIGPENDING 0 -#define TIF_NEED_RESCHED 1 -#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ -#define TIF_SYSCALL_TRACE 8 -#define TIF_MEMDIE 18 -#define TIF_RESTORE_SIGMASK 20 - -#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) -#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) -#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) -#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) - -/* - * Change these and you break ASM code in entry-common.S - */ -#define _TIF_WORK_MASK \ - (_TIF_SIGPENDING | _TIF_NEED_RESCHED | _TIF_NOTIFY_RESUME) - -#endif /* __KERNEL__ */ -#endif /* __UNICORE_THREAD_INFO_H__ */ diff --git a/arch/unicore32/include/asm/timex.h b/arch/unicore32/include/asm/timex.h deleted file mode 100644 index d714af3dbce1..000000000000 --- a/arch/unicore32/include/asm/timex.h +++ /dev/null @@ -1,31 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/timex.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#ifndef __UNICORE_TIMEX_H__ -#define __UNICORE_TIMEX_H__ - -#ifdef CONFIG_ARCH_FPGA - -/* in FPGA, APB clock is 33M, and OST clock is 32K, */ -/* so, 1M is selected for timer interrupt correctly */ -#define CLOCK_TICK_RATE (32*1024) - -#endif - -#if defined(CONFIG_PUV3_DB0913) \ - || defined(CONFIG_PUV3_NB0916) \ - || defined(CONFIG_PUV3_SMW0919) - -#define CLOCK_TICK_RATE (14318000) - -#endif - -#include - -#endif diff --git a/arch/unicore32/include/asm/tlb.h b/arch/unicore32/include/asm/tlb.h deleted file mode 100644 index 4663d8cc80ef..000000000000 --- a/arch/unicore32/include/asm/tlb.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/tlb.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_TLB_H__ -#define __UNICORE_TLB_H__ - -/* - * unicore32 lacks an efficient flush_tlb_range(), use flush_tlb_mm(). - */ - -#define __pte_free_tlb(tlb, pte, addr) \ - do { \ - pgtable_pte_page_dtor(pte); \ - tlb_remove_page((tlb), (pte)); \ - } while (0) - -#include - -#endif diff --git a/arch/unicore32/include/asm/tlbflush.h b/arch/unicore32/include/asm/tlbflush.h deleted file mode 100644 index 1cf18ef55515..000000000000 --- a/arch/unicore32/include/asm/tlbflush.h +++ /dev/null @@ -1,192 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/tlbflush.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_TLBFLUSH_H__ -#define __UNICORE_TLBFLUSH_H__ - -#ifndef __ASSEMBLY__ - -#include - -extern void __cpu_flush_user_tlb_range(unsigned long, unsigned long, - struct vm_area_struct *); -extern void __cpu_flush_kern_tlb_range(unsigned long, unsigned long); - -/* - * TLB Management - * ============== - * - * The arch/unicore/mm/tlb-*.S files implement these methods. - * - * The TLB specific code is expected to perform whatever tests it - * needs to determine if it should invalidate the TLB for each - * call. Start addresses are inclusive and end addresses are - * exclusive; it is safe to round these addresses down. - * - * flush_tlb_all() - * - * Invalidate the entire TLB. - * - * flush_tlb_mm(mm) - * - * Invalidate all TLB entries in a particular address - * space. - * - mm - mm_struct describing address space - * - * flush_tlb_range(mm,start,end) - * - * Invalidate a range of TLB entries in the specified - * address space. - * - mm - mm_struct describing address space - * - start - start address (may not be aligned) - * - end - end address (exclusive, may not be aligned) - * - * flush_tlb_page(vaddr,vma) - * - * Invalidate the specified page in the specified address range. - * - vaddr - virtual address (may not be aligned) - * - vma - vma_struct describing address range - * - * flush_kern_tlb_page(kaddr) - * - * Invalidate the TLB entry for the specified page. The address - * will be in the kernels virtual memory space. Current uses - * only require the D-TLB to be invalidated. - * - kaddr - Kernel virtual memory address - */ - -static inline void local_flush_tlb_all(void) -{ - const int zero = 0; - - /* TLB invalidate all */ - asm("movc p0.c6, %0, #6; nop; nop; nop; nop; nop; nop; nop; nop" - : : "r" (zero) : "cc"); -} - -static inline void local_flush_tlb_mm(struct mm_struct *mm) -{ - const int zero = 0; - - if (cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) { - /* TLB invalidate all */ - asm("movc p0.c6, %0, #6; nop; nop; nop; nop; nop; nop; nop; nop" - : : "r" (zero) : "cc"); - } - put_cpu(); -} - -static inline void -local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) -{ - if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) { -#ifndef CONFIG_CPU_TLB_SINGLE_ENTRY_DISABLE - /* iTLB invalidate page */ - asm("movc p0.c6, %0, #5; nop; nop; nop; nop; nop; nop; nop; nop" - : : "r" (uaddr & PAGE_MASK) : "cc"); - /* dTLB invalidate page */ - asm("movc p0.c6, %0, #3; nop; nop; nop; nop; nop; nop; nop; nop" - : : "r" (uaddr & PAGE_MASK) : "cc"); -#else - /* TLB invalidate all */ - asm("movc p0.c6, %0, #6; nop; nop; nop; nop; nop; nop; nop; nop" - : : "r" (uaddr & PAGE_MASK) : "cc"); -#endif - } -} - -static inline void local_flush_tlb_kernel_page(unsigned long kaddr) -{ -#ifndef CONFIG_CPU_TLB_SINGLE_ENTRY_DISABLE - /* iTLB invalidate page */ - asm("movc p0.c6, %0, #5; nop; nop; nop; nop; nop; nop; nop; nop" - : : "r" (kaddr & PAGE_MASK) : "cc"); - /* dTLB invalidate page */ - asm("movc p0.c6, %0, #3; nop; nop; nop; nop; nop; nop; nop; nop" - : : "r" (kaddr & PAGE_MASK) : "cc"); -#else - /* TLB invalidate all */ - asm("movc p0.c6, %0, #6; nop; nop; nop; nop; nop; nop; nop; nop" - : : "r" (kaddr & PAGE_MASK) : "cc"); -#endif -} - -/* - * flush_pmd_entry - * - * Flush a PMD entry (word aligned, or double-word aligned) to - * RAM if the TLB for the CPU we are running on requires this. - * This is typically used when we are creating PMD entries. - * - * clean_pmd_entry - * - * Clean (but don't drain the write buffer) if the CPU requires - * these operations. This is typically used when we are removing - * PMD entries. - */ -static inline void flush_pmd_entry(pmd_t *pmd) -{ -#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE - /* flush dcache line, see dcacheline_flush in proc-macros.S */ - asm("mov r1, %0 << #20\n" - "ldw r2, =_stext\n" - "add r2, r2, r1 >> #20\n" - "ldw r1, [r2+], #0x0000\n" - "ldw r1, [r2+], #0x1000\n" - "ldw r1, [r2+], #0x2000\n" - "ldw r1, [r2+], #0x3000\n" - : : "r" (pmd) : "r1", "r2"); -#else - /* flush dcache all */ - asm("movc p0.c5, %0, #14; nop; nop; nop; nop; nop; nop; nop; nop" - : : "r" (pmd) : "cc"); -#endif -} - -static inline void clean_pmd_entry(pmd_t *pmd) -{ -#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE - /* clean dcache line */ - asm("movc p0.c5, %0, #11; nop; nop; nop; nop; nop; nop; nop; nop" - : : "r" (__pa(pmd) & ~(L1_CACHE_BYTES - 1)) : "cc"); -#else - /* clean dcache all */ - asm("movc p0.c5, %0, #10; nop; nop; nop; nop; nop; nop; nop; nop" - : : "r" (pmd) : "cc"); -#endif -} - -/* - * Convert calls to our calling convention. - */ -#define local_flush_tlb_range(vma, start, end) \ - __cpu_flush_user_tlb_range(start, end, vma) -#define local_flush_tlb_kernel_range(s, e) \ - __cpu_flush_kern_tlb_range(s, e) - -#define flush_tlb_all local_flush_tlb_all -#define flush_tlb_mm local_flush_tlb_mm -#define flush_tlb_page local_flush_tlb_page -#define flush_tlb_kernel_page local_flush_tlb_kernel_page -#define flush_tlb_range local_flush_tlb_range -#define flush_tlb_kernel_range local_flush_tlb_kernel_range - -/* - * if PG_dcache_clean is not set for the page, we need to ensure that any - * cache entries for the kernels virtual memory range are written - * back to the page. - */ -extern void update_mmu_cache(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep); - -extern void do_bad_area(unsigned long addr, unsigned int fsr, - struct pt_regs *regs); - -#endif - -#endif diff --git a/arch/unicore32/include/asm/traps.h b/arch/unicore32/include/asm/traps.h deleted file mode 100644 index ad1508a9a903..000000000000 --- a/arch/unicore32/include/asm/traps.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/traps.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_TRAP_H__ -#define __UNICORE_TRAP_H__ - -extern void __init early_trap_init(void); -extern void dump_backtrace_entry(unsigned long where, - unsigned long from, unsigned long frame); - -extern void do_DataAbort(unsigned long addr, unsigned int fsr, - struct pt_regs *regs); -#endif diff --git a/arch/unicore32/include/asm/uaccess.h b/arch/unicore32/include/asm/uaccess.h deleted file mode 100644 index 33c24f430511..000000000000 --- a/arch/unicore32/include/asm/uaccess.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/asm/uaccess.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_UACCESS_H__ -#define __UNICORE_UACCESS_H__ - -#include - -#define __strncpy_from_user __strncpy_from_user -#define __strnlen_user __strnlen_user -#define __clear_user __clear_user - -#define __kernel_ok (uaccess_kernel()) -#define __user_ok(addr, size) (((size) <= TASK_SIZE) \ - && ((addr) <= TASK_SIZE - (size))) -#define __access_ok(addr, size) (__kernel_ok || __user_ok((addr), (size))) - -extern unsigned long __must_check -raw_copy_from_user(void *to, const void __user *from, unsigned long n); -extern unsigned long __must_check -raw_copy_to_user(void __user *to, const void *from, unsigned long n); -extern unsigned long __must_check -__clear_user(void __user *addr, unsigned long n); -extern unsigned long __must_check -__strncpy_from_user(char *to, const char __user *from, unsigned long count); -extern unsigned long -__strnlen_user(const char __user *s, long n); -#define INLINE_COPY_FROM_USER -#define INLINE_COPY_TO_USER - -#include - -#endif /* __UNICORE_UACCESS_H__ */ diff --git a/arch/unicore32/include/asm/vmalloc.h b/arch/unicore32/include/asm/vmalloc.h deleted file mode 100644 index 054435818a14..000000000000 --- a/arch/unicore32/include/asm/vmalloc.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef _ASM_UNICORE32_VMALLOC_H -#define _ASM_UNICORE32_VMALLOC_H - -#endif /* _ASM_UNICORE32_VMALLOC_H */ diff --git a/arch/unicore32/include/mach/PKUnity.h b/arch/unicore32/include/mach/PKUnity.h deleted file mode 100644 index 78f77517c1c7..000000000000 --- a/arch/unicore32/include/mach/PKUnity.h +++ /dev/null @@ -1,95 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/mach/PKUnity.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -/* Be sure that virtual mapping is defined right */ -#ifndef __MACH_PUV3_HARDWARE_H__ -#error You must include hardware.h not PKUnity.h -#endif - -#include - -/* - * Memory Definitions - */ -#define PKUNITY_SDRAM_BASE 0x00000000 /* 0x00000000 - 0x7FFFFFFF 2GB */ -#define PKUNITY_MMIO_BASE 0x80000000 /* 0x80000000 - 0xFFFFFFFF 2GB */ - -/* - * PKUNITY System Bus Addresses (PCI): 0x80000000 - 0xBFFFFFFF (1GB) - * 0x80000000 - 0x8000000B 12B PCI Configuration regs - * 0x80010000 - 0x80010250 592B PCI Bridge Base - * 0x80030000 - 0x8003FFFF 64KB PCI Legacy IO - * 0x90000000 - 0x97FFFFFF 128MB PCI AHB-PCI MEM-mapping - * 0x98000000 - 0x9FFFFFFF 128MB PCI PCI-AHB MEM-mapping - */ -#define PKUNITY_PCI_BASE io_p2v(0x80000000) /* 0x80000000 - 0xBFFFFFFF 1GB */ -#include - -#define PKUNITY_PCICFG_BASE (PKUNITY_PCI_BASE + 0x0) -#define PKUNITY_PCIBRI_BASE (PKUNITY_PCI_BASE + 0x00010000) -#define PKUNITY_PCILIO_BASE (PKUNITY_PCI_BASE + 0x00030000) -#define PKUNITY_PCIMEM_BASE (PKUNITY_PCI_BASE + 0x10000000) -#define PKUNITY_PCIAHB_BASE (PKUNITY_PCI_BASE + 0x18000000) - -/* - * PKUNITY System Bus Addresses (AHB): 0xC0000000 - 0xEDFFFFFF (640MB) - */ -#define PKUNITY_AHB_BASE io_p2v(0xC0000000) - -/* AHB-0 is DDR2 SDRAM */ -/* AHB-1 is PCI Space */ -#define PKUNITY_ARBITER_BASE (PKUNITY_AHB_BASE + 0x000000) /* AHB-2 */ -#define PKUNITY_DDR2CTRL_BASE (PKUNITY_AHB_BASE + 0x100000) /* AHB-3 */ -#define PKUNITY_DMAC_BASE (PKUNITY_AHB_BASE + 0x200000) /* AHB-4 */ -#include -#define PKUNITY_UMAL_BASE (PKUNITY_AHB_BASE + 0x300000) /* AHB-5 */ -#include -#define PKUNITY_USB_BASE (PKUNITY_AHB_BASE + 0x400000) /* AHB-6 */ -#define PKUNITY_SATA_BASE (PKUNITY_AHB_BASE + 0x500000) /* AHB-7 */ -#define PKUNITY_SMC_BASE (PKUNITY_AHB_BASE + 0x600000) /* AHB-8 */ -/* AHB-9 is for APB bridge */ -#define PKUNITY_MME_BASE (PKUNITY_AHB_BASE + 0x700000) /* AHB-10 */ -#define PKUNITY_UNIGFX_BASE (PKUNITY_AHB_BASE + 0x800000) /* AHB-11 */ -#include -#define PKUNITY_NAND_BASE (PKUNITY_AHB_BASE + 0x900000) /* AHB-12 */ -#include -#define PKUNITY_H264D_BASE (PKUNITY_AHB_BASE + 0xA00000) /* AHB-13 */ -#define PKUNITY_H264E_BASE (PKUNITY_AHB_BASE + 0xB00000) /* AHB-14 */ - -/* - * PKUNITY Peripheral Bus Addresses (APB): 0xEE000000 - 0xEFFFFFFF (128MB) - */ -#define PKUNITY_APB_BASE io_p2v(0xEE000000) - -#define PKUNITY_UART0_BASE (PKUNITY_APB_BASE + 0x000000) /* APB-0 */ -#define PKUNITY_UART1_BASE (PKUNITY_APB_BASE + 0x100000) /* APB-1 */ -#include -#define PKUNITY_I2C_BASE (PKUNITY_APB_BASE + 0x200000) /* APB-2 */ -#include -#define PKUNITY_SPI_BASE (PKUNITY_APB_BASE + 0x300000) /* APB-3 */ -#include -#define PKUNITY_AC97_BASE (PKUNITY_APB_BASE + 0x400000) /* APB-4 */ -#include -#define PKUNITY_GPIO_BASE (PKUNITY_APB_BASE + 0x500000) /* APB-5 */ -#include -#define PKUNITY_INTC_BASE (PKUNITY_APB_BASE + 0x600000) /* APB-6 */ -#include -#define PKUNITY_RTC_BASE (PKUNITY_APB_BASE + 0x700000) /* APB-7 */ -#include -#define PKUNITY_OST_BASE (PKUNITY_APB_BASE + 0x800000) /* APB-8 */ -#include -#define PKUNITY_RESETC_BASE (PKUNITY_APB_BASE + 0x900000) /* APB-9 */ -#include -#define PKUNITY_PM_BASE (PKUNITY_APB_BASE + 0xA00000) /* APB-10 */ -#include -#define PKUNITY_PS2_BASE (PKUNITY_APB_BASE + 0xB00000) /* APB-11 */ -#include -#define PKUNITY_SDC_BASE (PKUNITY_APB_BASE + 0xC00000) /* APB-12 */ -#include - diff --git a/arch/unicore32/include/mach/bitfield.h b/arch/unicore32/include/mach/bitfield.h deleted file mode 100644 index 766b7f01f1cd..000000000000 --- a/arch/unicore32/include/mach/bitfield.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/mach/bitfield.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __MACH_PUV3_BITFIELD_H__ -#define __MACH_PUV3_BITFIELD_H__ - -#ifndef __ASSEMBLY__ -#define UData(Data) ((unsigned long) (Data)) -#else -#define UData(Data) (Data) -#endif - -#define FIELD(val, vmask, vshift) (((val) & ((UData(1) << (vmask)) - 1)) << (vshift)) -#define FMASK(vmask, vshift) (((UData(1) << (vmask)) - 1) << (vshift)) - -#endif /* __MACH_PUV3_BITFIELD_H__ */ diff --git a/arch/unicore32/include/mach/dma.h b/arch/unicore32/include/mach/dma.h deleted file mode 100644 index 271001cd13c4..000000000000 --- a/arch/unicore32/include/mach/dma.h +++ /dev/null @@ -1,45 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/mach/dma.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __MACH_PUV3_DMA_H__ -#define __MACH_PUV3_DMA_H__ - -/* - * The PKUnity has six internal DMA channels. - */ -#define MAX_DMA_CHANNELS 6 - -typedef enum { - DMA_PRIO_HIGH = 0, - DMA_PRIO_MEDIUM = 1, - DMA_PRIO_LOW = 2 -} puv3_dma_prio; - -/* - * DMA registration - */ - -extern int puv3_request_dma(char *name, - puv3_dma_prio prio, - void (*irq_handler)(int, void *), - void (*err_handler)(int, void *), - void *data); - -extern void puv3_free_dma(int dma_ch); - -static inline void puv3_stop_dma(int ch) -{ - writel(readl(DMAC_CONFIG(ch)) & ~DMAC_CONFIG_EN, DMAC_CONFIG(ch)); -} - -static inline void puv3_resume_dma(int ch) -{ - writel(readl(DMAC_CONFIG(ch)) | DMAC_CONFIG_EN, DMAC_CONFIG(ch)); -} - -#endif /* __MACH_PUV3_DMA_H__ */ diff --git a/arch/unicore32/include/mach/hardware.h b/arch/unicore32/include/mach/hardware.h deleted file mode 100644 index 2d7571cbd1d0..000000000000 --- a/arch/unicore32/include/mach/hardware.h +++ /dev/null @@ -1,30 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/mach/hardware.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * This file contains the hardware definitions for PKUnity architecture - */ - -#ifndef __MACH_PUV3_HARDWARE_H__ -#define __MACH_PUV3_HARDWARE_H__ - -#include - -#ifndef __ASSEMBLY__ -#define io_p2v(x) (void __iomem *)((x) - PKUNITY_MMIO_BASE) -#define io_v2p(x) (phys_addr_t)((x) + PKUNITY_MMIO_BASE) -#else -#define io_p2v(x) ((x) - PKUNITY_MMIO_BASE) -#define io_v2p(x) ((x) + PKUNITY_MMIO_BASE) -#endif - -#define PCIBIOS_MIN_IO 0x4000 /* should lower than 64KB */ -#define PCIBIOS_MIN_MEM io_v2p(PKUNITY_PCIMEM_BASE) - -#define pcibios_assign_all_busses() 1 - -#endif /* __MACH_PUV3_HARDWARE_H__ */ diff --git a/arch/unicore32/include/mach/map.h b/arch/unicore32/include/mach/map.h deleted file mode 100644 index 7a83eeeb1287..000000000000 --- a/arch/unicore32/include/mach/map.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/mach/map.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * Page table mapping constructs and function prototypes - */ -#define MT_DEVICE 0 -#define MT_DEVICE_CACHED 2 -#define MT_KUSER 7 -#define MT_HIGH_VECTORS 8 -#define MT_MEMORY 9 -#define MT_ROM 10 - diff --git a/arch/unicore32/include/mach/memory.h b/arch/unicore32/include/mach/memory.h deleted file mode 100644 index b4e6035cb9a3..000000000000 --- a/arch/unicore32/include/mach/memory.h +++ /dev/null @@ -1,54 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/mach/memory.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __MACH_PUV3_MEMORY_H__ -#define __MACH_PUV3_MEMORY_H__ - -#include - -/* Physical DRAM offset. */ -#define PHYS_OFFSET UL(0x00000000) -/* The base address of exception vectors. */ -#define VECTORS_BASE UL(0xffff0000) -/* The base address of kuser area. */ -#define KUSER_BASE UL(0x80000000) - -#ifdef __ASSEMBLY__ -/* The byte offset of the kernel image in RAM from the start of RAM. */ -#define KERNEL_IMAGE_START 0x00408000 -#endif - -#if !defined(__ASSEMBLY__) && defined(CONFIG_PCI) - -void puv3_pci_adjust_zones(unsigned long *max_zone_pfn); - -#define arch_adjust_zones(max_zone_pfn) \ - puv3_pci_adjust_zones(max_zone_pfn) - -#endif - -/* - * PCI controller in PKUnity-3 masks highest 5-bit for upstream channel, - * so we must limit the DMA allocation within 128M physical memory for - * supporting PCI devices. - */ -#define PCI_DMA_THRESHOLD (PHYS_OFFSET + SZ_128M - 1) - -#define is_pcibus_device(dev) (dev && \ - (strncmp(dev->bus->name, "pci", 3) == 0)) - -#define __virt_to_pcibus(x) (__virt_to_phys((x) + PKUNITY_PCIAHB_BASE)) -#define __pcibus_to_virt(x) (__phys_to_virt(x) - PKUNITY_PCIAHB_BASE) - -/* kuser area */ -#define KUSER_VECPAGE_BASE (KUSER_BASE + UL(0x3fff0000)) -/* kuser_vecpage (0xbfff0000) is ro, and vectors page (0xffff0000) is rw */ -#define kuser_vecpage_to_vectors(x) ((x) - (KUSER_VECPAGE_BASE) \ - + (VECTORS_BASE)) - -#endif diff --git a/arch/unicore32/include/mach/ocd.h b/arch/unicore32/include/mach/ocd.h deleted file mode 100644 index 2a814929e389..000000000000 --- a/arch/unicore32/include/mach/ocd.h +++ /dev/null @@ -1,33 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/mach/ocd.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#ifndef __MACH_PUV3_OCD_H__ -#define __MACH_PUV3_OCD_H__ - -#if defined(CONFIG_DEBUG_OCD) -static inline void ocd_putc(unsigned int c) -{ - int status, i = 0x2000000; - - do { - if (--i < 0) - return; - - asm volatile ("movc %0, p1.c0, #0" : "=r" (status)); - } while (status & 2); - - asm("movc p1.c1, %0, #1" : : "r" (c)); -} - -#define putc(ch) ocd_putc(ch) -#else -#define putc(ch) -#endif - -#endif diff --git a/arch/unicore32/include/mach/pm.h b/arch/unicore32/include/mach/pm.h deleted file mode 100644 index cb40b8490a57..000000000000 --- a/arch/unicore32/include/mach/pm.h +++ /dev/null @@ -1,37 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore/include/mach/pm.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __PUV3_PM_H__ -#define __PUV3_PM_H__ - -#include - -struct puv3_cpu_pm_fns { - int save_count; - void (*save)(unsigned long *); - void (*restore)(unsigned long *); - int (*valid)(suspend_state_t state); - void (*enter)(suspend_state_t state); - int (*prepare)(void); - void (*finish)(void); -}; - -extern struct puv3_cpu_pm_fns *puv3_cpu_pm_fns; - -/* sleep.S */ -extern void puv3_cpu_suspend(unsigned int); - -extern void puv3_cpu_resume(void); - -extern int puv3_pm_enter(suspend_state_t state); - -/* Defined in hibernate_asm.S */ -extern int restore_image(pgd_t *resume_pg_dir, struct pbe *restore_pblist); - -extern struct pbe *restore_pblist; -#endif diff --git a/arch/unicore32/include/mach/regs-ac97.h b/arch/unicore32/include/mach/regs-ac97.h deleted file mode 100644 index 85c601898d02..000000000000 --- a/arch/unicore32/include/mach/regs-ac97.h +++ /dev/null @@ -1,33 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity AC97 Registers - */ - -#define PKUNITY_AC97_CONR (PKUNITY_AC97_BASE + 0x0000) -#define PKUNITY_AC97_OCR (PKUNITY_AC97_BASE + 0x0004) -#define PKUNITY_AC97_ICR (PKUNITY_AC97_BASE + 0x0008) -#define PKUNITY_AC97_CRAC (PKUNITY_AC97_BASE + 0x000C) -#define PKUNITY_AC97_INTR (PKUNITY_AC97_BASE + 0x0010) -#define PKUNITY_AC97_INTRSTAT (PKUNITY_AC97_BASE + 0x0014) -#define PKUNITY_AC97_INTRCLEAR (PKUNITY_AC97_BASE + 0x0018) -#define PKUNITY_AC97_ENABLE (PKUNITY_AC97_BASE + 0x001C) -#define PKUNITY_AC97_OUT_FIFO (PKUNITY_AC97_BASE + 0x0020) -#define PKUNITY_AC97_IN_FIFO (PKUNITY_AC97_BASE + 0x0030) - -#define AC97_CODEC_REG(v) FIELD((v), 7, 16) -#define AC97_CODEC_VAL(v) FIELD((v), 16, 0) -#define AC97_CODEC_WRITECOMPLETE FIELD(1, 1, 2) - -/* - * VAR PLAY SAMPLE RATE - */ -#define AC97_CMD_VPSAMPLE (FIELD(3, 2, 16) | FIELD(3, 2, 0)) - -/* - * FIX CAPTURE SAMPLE RATE - */ -#define AC97_CMD_FCSAMPLE FIELD(7, 3, 0) - -#define AC97_CMD_RESET FIELD(1, 1, 0) -#define AC97_CMD_ENABLE FIELD(1, 1, 0) -#define AC97_CMD_DISABLE FIELD(0, 1, 0) diff --git a/arch/unicore32/include/mach/regs-dmac.h b/arch/unicore32/include/mach/regs-dmac.h deleted file mode 100644 index bbdc52d06a98..000000000000 --- a/arch/unicore32/include/mach/regs-dmac.h +++ /dev/null @@ -1,82 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity Direct Memory Access Controller (DMAC) - */ - -/* - * Interrupt Status Reg DMAC_ISR. - */ -#define DMAC_ISR (PKUNITY_DMAC_BASE + 0x0020) -/* - * Interrupt Transfer Complete Status Reg DMAC_ITCSR. - */ -#define DMAC_ITCSR (PKUNITY_DMAC_BASE + 0x0050) -/* - * Interrupt Transfer Complete Clear Reg DMAC_ITCCR. - */ -#define DMAC_ITCCR (PKUNITY_DMAC_BASE + 0x0060) -/* - * Interrupt Error Status Reg DMAC_IESR. - */ -#define DMAC_IESR (PKUNITY_DMAC_BASE + 0x0080) -/* - * Interrupt Error Clear Reg DMAC_IECR. - */ -#define DMAC_IECR (PKUNITY_DMAC_BASE + 0x0090) -/* - * Enable Channels Reg DMAC_ENCH. - */ -#define DMAC_ENCH (PKUNITY_DMAC_BASE + 0x00B0) - -/* - * DMA control reg. Space [byte] - */ -#define DMASp 0x00000100 - -/* - * Source Addr DMAC_SRCADDR(ch). - */ -#define DMAC_SRCADDR(ch) (PKUNITY_DMAC_BASE + (ch)*DMASp + 0x00) -/* - * Destination Addr DMAC_DESTADDR(ch). - */ -#define DMAC_DESTADDR(ch) (PKUNITY_DMAC_BASE + (ch)*DMASp + 0x04) -/* - * Control Reg DMAC_CONTROL(ch). - */ -#define DMAC_CONTROL(ch) (PKUNITY_DMAC_BASE + (ch)*DMASp + 0x0C) -/* - * Configuration Reg DMAC_CONFIG(ch). - */ -#define DMAC_CONFIG(ch) (PKUNITY_DMAC_BASE + (ch)*DMASp + 0x10) - -#define DMAC_IR_MASK FMASK(6, 0) -/* - * select channel (ch) - */ -#define DMAC_CHANNEL(ch) FIELD(1, 1, (ch)) - -#define DMAC_CONTROL_SIZE_BYTE(v) (FIELD((v), 12, 14) | \ - FIELD(0, 3, 9) | FIELD(0, 3, 6)) -#define DMAC_CONTROL_SIZE_HWORD(v) (FIELD((v) >> 1, 12, 14) | \ - FIELD(1, 3, 9) | FIELD(1, 3, 6)) -#define DMAC_CONTROL_SIZE_WORD(v) (FIELD((v) >> 2, 12, 14) | \ - FIELD(2, 3, 9) | FIELD(2, 3, 6)) -#define DMAC_CONTROL_DI FIELD(1, 1, 13) -#define DMAC_CONTROL_SI FIELD(1, 1, 12) -#define DMAC_CONTROL_BURST_1BYTE (FIELD(0, 3, 3) | FIELD(0, 3, 0)) -#define DMAC_CONTROL_BURST_4BYTE (FIELD(3, 3, 3) | FIELD(3, 3, 0)) -#define DMAC_CONTROL_BURST_8BYTE (FIELD(5, 3, 3) | FIELD(5, 3, 0)) -#define DMAC_CONTROL_BURST_16BYTE (FIELD(7, 3, 3) | FIELD(7, 3, 0)) - -#define DMAC_CONFIG_UART0_WR (FIELD(2, 4, 11) | FIELD(1, 2, 1)) -#define DMAC_CONFIG_UART0_RD (FIELD(2, 4, 7) | FIELD(2, 2, 1)) -#define DMAC_CONFIG_UART1_WR (FIELD(3, 4, 11) | FIELD(1, 2, 1)) -#define DMAC_CONFIG_UART1RD (FIELD(3, 4, 7) | FIELD(2, 2, 1)) -#define DMAC_CONFIG_AC97WR (FIELD(4, 4, 11) | FIELD(1, 2, 1)) -#define DMAC_CONFIG_AC97RD (FIELD(4, 4, 7) | FIELD(2, 2, 1)) -#define DMAC_CONFIG_MMCWR (FIELD(7, 4, 11) | FIELD(1, 2, 1)) -#define DMAC_CONFIG_MMCRD (FIELD(7, 4, 7) | FIELD(2, 2, 1)) -#define DMAC_CONFIG_MASKITC FIELD(1, 1, 4) -#define DMAC_CONFIG_MASKIE FIELD(1, 1, 3) -#define DMAC_CONFIG_EN FIELD(1, 1, 0) diff --git a/arch/unicore32/include/mach/regs-gpio.h b/arch/unicore32/include/mach/regs-gpio.h deleted file mode 100644 index 5fc701ee33e3..000000000000 --- a/arch/unicore32/include/mach/regs-gpio.h +++ /dev/null @@ -1,71 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity General-Purpose Input/Output (GPIO) Registers - */ - -/* - * Voltage Status Reg GPIO_GPLR. - */ -#define GPIO_GPLR (PKUNITY_GPIO_BASE + 0x0000) -/* - * Pin Direction Reg GPIO_GPDR. - */ -#define GPIO_GPDR (PKUNITY_GPIO_BASE + 0x0004) -/* - * Output Pin Set Reg GPIO_GPSR. - */ -#define GPIO_GPSR (PKUNITY_GPIO_BASE + 0x0008) -/* - * Output Pin Clear Reg GPIO_GPCR. - */ -#define GPIO_GPCR (PKUNITY_GPIO_BASE + 0x000C) -/* - * Raise Edge Detect Reg GPIO_GRER. - */ -#define GPIO_GRER (PKUNITY_GPIO_BASE + 0x0010) -/* - * Fall Edge Detect Reg GPIO_GFER. - */ -#define GPIO_GFER (PKUNITY_GPIO_BASE + 0x0014) -/* - * Edge Status Reg GPIO_GEDR. - */ -#define GPIO_GEDR (PKUNITY_GPIO_BASE + 0x0018) -/* - * Special Voltage Detect Reg GPIO_GPIR. - */ -#define GPIO_GPIR (PKUNITY_GPIO_BASE + 0x0020) - -#define GPIO_MIN (0) -#define GPIO_MAX (27) - -#define GPIO_GPIO(Nb) (0x00000001 << (Nb)) /* GPIO [0..27] */ -#define GPIO_GPIO0 GPIO_GPIO(0) /* GPIO [0] */ -#define GPIO_GPIO1 GPIO_GPIO(1) /* GPIO [1] */ -#define GPIO_GPIO2 GPIO_GPIO(2) /* GPIO [2] */ -#define GPIO_GPIO3 GPIO_GPIO(3) /* GPIO [3] */ -#define GPIO_GPIO4 GPIO_GPIO(4) /* GPIO [4] */ -#define GPIO_GPIO5 GPIO_GPIO(5) /* GPIO [5] */ -#define GPIO_GPIO6 GPIO_GPIO(6) /* GPIO [6] */ -#define GPIO_GPIO7 GPIO_GPIO(7) /* GPIO [7] */ -#define GPIO_GPIO8 GPIO_GPIO(8) /* GPIO [8] */ -#define GPIO_GPIO9 GPIO_GPIO(9) /* GPIO [9] */ -#define GPIO_GPIO10 GPIO_GPIO(10) /* GPIO [10] */ -#define GPIO_GPIO11 GPIO_GPIO(11) /* GPIO [11] */ -#define GPIO_GPIO12 GPIO_GPIO(12) /* GPIO [12] */ -#define GPIO_GPIO13 GPIO_GPIO(13) /* GPIO [13] */ -#define GPIO_GPIO14 GPIO_GPIO(14) /* GPIO [14] */ -#define GPIO_GPIO15 GPIO_GPIO(15) /* GPIO [15] */ -#define GPIO_GPIO16 GPIO_GPIO(16) /* GPIO [16] */ -#define GPIO_GPIO17 GPIO_GPIO(17) /* GPIO [17] */ -#define GPIO_GPIO18 GPIO_GPIO(18) /* GPIO [18] */ -#define GPIO_GPIO19 GPIO_GPIO(19) /* GPIO [19] */ -#define GPIO_GPIO20 GPIO_GPIO(20) /* GPIO [20] */ -#define GPIO_GPIO21 GPIO_GPIO(21) /* GPIO [21] */ -#define GPIO_GPIO22 GPIO_GPIO(22) /* GPIO [22] */ -#define GPIO_GPIO23 GPIO_GPIO(23) /* GPIO [23] */ -#define GPIO_GPIO24 GPIO_GPIO(24) /* GPIO [24] */ -#define GPIO_GPIO25 GPIO_GPIO(25) /* GPIO [25] */ -#define GPIO_GPIO26 GPIO_GPIO(26) /* GPIO [26] */ -#define GPIO_GPIO27 GPIO_GPIO(27) /* GPIO [27] */ - diff --git a/arch/unicore32/include/mach/regs-i2c.h b/arch/unicore32/include/mach/regs-i2c.h deleted file mode 100644 index b41aa7c92430..000000000000 --- a/arch/unicore32/include/mach/regs-i2c.h +++ /dev/null @@ -1,64 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity Inter-integrated Circuit (I2C) Registers - */ - -/* - * Control Reg I2C_CON. - */ -#define I2C_CON (PKUNITY_I2C_BASE + 0x0000) -/* - * Target Address Reg I2C_TAR. - */ -#define I2C_TAR (PKUNITY_I2C_BASE + 0x0004) -/* - * Data buffer and command Reg I2C_DATACMD. - */ -#define I2C_DATACMD (PKUNITY_I2C_BASE + 0x0010) -/* - * Enable Reg I2C_ENABLE. - */ -#define I2C_ENABLE (PKUNITY_I2C_BASE + 0x006C) -/* - * Status Reg I2C_STATUS. - */ -#define I2C_STATUS (PKUNITY_I2C_BASE + 0x0070) -/* - * Tx FIFO Length Reg I2C_TXFLR. - */ -#define I2C_TXFLR (PKUNITY_I2C_BASE + 0x0074) -/* - * Rx FIFO Length Reg I2C_RXFLR. - */ -#define I2C_RXFLR (PKUNITY_I2C_BASE + 0x0078) -/* - * Enable Status Reg I2C_ENSTATUS. - */ -#define I2C_ENSTATUS (PKUNITY_I2C_BASE + 0x009C) - -#define I2C_CON_MASTER FIELD(1, 1, 0) -#define I2C_CON_SPEED_STD FIELD(1, 2, 1) -#define I2C_CON_SPEED_FAST FIELD(2, 2, 1) -#define I2C_CON_RESTART FIELD(1, 1, 5) -#define I2C_CON_SLAVEDISABLE FIELD(1, 1, 6) - -#define I2C_DATACMD_READ FIELD(1, 1, 8) -#define I2C_DATACMD_WRITE FIELD(0, 1, 8) -#define I2C_DATACMD_DAT_MASK FMASK(8, 0) -#define I2C_DATACMD_DAT(v) FIELD((v), 8, 0) - -#define I2C_ENABLE_ENABLE FIELD(1, 1, 0) -#define I2C_ENABLE_DISABLE FIELD(0, 1, 0) - -#define I2C_STATUS_RFF FIELD(1, 1, 4) -#define I2C_STATUS_RFNE FIELD(1, 1, 3) -#define I2C_STATUS_TFE FIELD(1, 1, 2) -#define I2C_STATUS_TFNF FIELD(1, 1, 1) -#define I2C_STATUS_ACTIVITY FIELD(1, 1, 0) - -#define I2C_ENSTATUS_ENABLE FIELD(1, 1, 0) - -#define I2C_TAR_THERMAL 0x4f -#define I2C_TAR_SPD 0x50 -#define I2C_TAR_PWIC 0x55 -#define I2C_TAR_EEPROM 0x57 diff --git a/arch/unicore32/include/mach/regs-intc.h b/arch/unicore32/include/mach/regs-intc.h deleted file mode 100644 index 4eb1b5b571bb..000000000000 --- a/arch/unicore32/include/mach/regs-intc.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUNITY Interrupt Controller (INTC) Registers - */ -/* - * INTC Level Reg INTC_ICLR. - */ -#define INTC_ICLR (PKUNITY_INTC_BASE + 0x0000) -/* - * INTC Mask Reg INTC_ICMR. - */ -#define INTC_ICMR (PKUNITY_INTC_BASE + 0x0004) -/* - * INTC Pending Reg INTC_ICPR. - */ -#define INTC_ICPR (PKUNITY_INTC_BASE + 0x0008) -/* - * INTC IRQ Pending Reg INTC_ICIP. - */ -#define INTC_ICIP (PKUNITY_INTC_BASE + 0x000C) -/* - * INTC REAL Pending Reg INTC_ICFP. - */ -#define INTC_ICFP (PKUNITY_INTC_BASE + 0x0010) -/* - * INTC Control Reg INTC_ICCR. - */ -#define INTC_ICCR (PKUNITY_INTC_BASE + 0x0014) - diff --git a/arch/unicore32/include/mach/regs-nand.h b/arch/unicore32/include/mach/regs-nand.h deleted file mode 100644 index 7f29939251ef..000000000000 --- a/arch/unicore32/include/mach/regs-nand.h +++ /dev/null @@ -1,80 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity NAND Controller Registers - */ -/* - * ID Reg. 0 NAND_IDR0 - */ -#define NAND_IDR0 (PKUNITY_NAND_BASE + 0x0000) -/* - * ID Reg. 1 NAND_IDR1 - */ -#define NAND_IDR1 (PKUNITY_NAND_BASE + 0x0004) -/* - * ID Reg. 2 NAND_IDR2 - */ -#define NAND_IDR2 (PKUNITY_NAND_BASE + 0x0008) -/* - * ID Reg. 3 NAND_IDR3 - */ -#define NAND_IDR3 (PKUNITY_NAND_BASE + 0x000C) -/* - * Page Address Reg 0 NAND_PAR0 - */ -#define NAND_PAR0 (PKUNITY_NAND_BASE + 0x0010) -/* - * Page Address Reg 1 NAND_PAR1 - */ -#define NAND_PAR1 (PKUNITY_NAND_BASE + 0x0014) -/* - * Page Address Reg 2 NAND_PAR2 - */ -#define NAND_PAR2 (PKUNITY_NAND_BASE + 0x0018) -/* - * ECC Enable Reg NAND_ECCEN - */ -#define NAND_ECCEN (PKUNITY_NAND_BASE + 0x001C) -/* - * Buffer Reg NAND_BUF - */ -#define NAND_BUF (PKUNITY_NAND_BASE + 0x0020) -/* - * ECC Status Reg NAND_ECCSR - */ -#define NAND_ECCSR (PKUNITY_NAND_BASE + 0x0024) -/* - * Command Reg NAND_CMD - */ -#define NAND_CMD (PKUNITY_NAND_BASE + 0x0028) -/* - * DMA Configure Reg NAND_DMACR - */ -#define NAND_DMACR (PKUNITY_NAND_BASE + 0x002C) -/* - * Interrupt Reg NAND_IR - */ -#define NAND_IR (PKUNITY_NAND_BASE + 0x0030) -/* - * Interrupt Mask Reg NAND_IMR - */ -#define NAND_IMR (PKUNITY_NAND_BASE + 0x0034) -/* - * Chip Enable Reg NAND_CHIPEN - */ -#define NAND_CHIPEN (PKUNITY_NAND_BASE + 0x0038) -/* - * Address Reg NAND_ADDR - */ -#define NAND_ADDR (PKUNITY_NAND_BASE + 0x003C) - -/* - * Command bits NAND_CMD_CMD_MASK - */ -#define NAND_CMD_CMD_MASK FMASK(4, 4) -#define NAND_CMD_CMD_READPAGE FIELD(0x0, 4, 4) -#define NAND_CMD_CMD_ERASEBLOCK FIELD(0x6, 4, 4) -#define NAND_CMD_CMD_READSTATUS FIELD(0x7, 4, 4) -#define NAND_CMD_CMD_WRITEPAGE FIELD(0x8, 4, 4) -#define NAND_CMD_CMD_READID FIELD(0x9, 4, 4) -#define NAND_CMD_CMD_RESET FIELD(0xf, 4, 4) - diff --git a/arch/unicore32/include/mach/regs-ost.h b/arch/unicore32/include/mach/regs-ost.h deleted file mode 100644 index 6c63e7b7569e..000000000000 --- a/arch/unicore32/include/mach/regs-ost.h +++ /dev/null @@ -1,91 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity Operating System Timer (OST) Registers - */ -/* - * Match Reg 0 OST_OSMR0 - */ -#define OST_OSMR0 (PKUNITY_OST_BASE + 0x0000) -/* - * Match Reg 1 OST_OSMR1 - */ -#define OST_OSMR1 (PKUNITY_OST_BASE + 0x0004) -/* - * Match Reg 2 OST_OSMR2 - */ -#define OST_OSMR2 (PKUNITY_OST_BASE + 0x0008) -/* - * Match Reg 3 OST_OSMR3 - */ -#define OST_OSMR3 (PKUNITY_OST_BASE + 0x000C) -/* - * Counter Reg OST_OSCR - */ -#define OST_OSCR (PKUNITY_OST_BASE + 0x0010) -/* - * Status Reg OST_OSSR - */ -#define OST_OSSR (PKUNITY_OST_BASE + 0x0014) -/* - * Watchdog Enable Reg OST_OWER - */ -#define OST_OWER (PKUNITY_OST_BASE + 0x0018) -/* - * Interrupt Enable Reg OST_OIER - */ -#define OST_OIER (PKUNITY_OST_BASE + 0x001C) - -/* - * PWM Registers: IO base address: PKUNITY_OST_BASE + 0x80 - * PWCR: Pulse Width Control Reg - * DCCR: Duty Cycle Control Reg - * PCR: Period Control Reg - */ -#define OST_PWM_PWCR (0x00) -#define OST_PWM_DCCR (0x04) -#define OST_PWM_PCR (0x08) - -/* - * Match detected 0 OST_OSSR_M0 - */ -#define OST_OSSR_M0 FIELD(1, 1, 0) -/* - * Match detected 1 OST_OSSR_M1 - */ -#define OST_OSSR_M1 FIELD(1, 1, 1) -/* - * Match detected 2 OST_OSSR_M2 - */ -#define OST_OSSR_M2 FIELD(1, 1, 2) -/* - * Match detected 3 OST_OSSR_M3 - */ -#define OST_OSSR_M3 FIELD(1, 1, 3) - -/* - * Interrupt enable 0 OST_OIER_E0 - */ -#define OST_OIER_E0 FIELD(1, 1, 0) -/* - * Interrupt enable 1 OST_OIER_E1 - */ -#define OST_OIER_E1 FIELD(1, 1, 1) -/* - * Interrupt enable 2 OST_OIER_E2 - */ -#define OST_OIER_E2 FIELD(1, 1, 2) -/* - * Interrupt enable 3 OST_OIER_E3 - */ -#define OST_OIER_E3 FIELD(1, 1, 3) - -/* - * Watchdog Match Enable OST_OWER_WME - */ -#define OST_OWER_WME FIELD(1, 1, 0) - -/* - * PWM Full Duty Cycle OST_PWMDCCR_FDCYCLE - */ -#define OST_PWMDCCR_FDCYCLE FIELD(1, 1, 10) - diff --git a/arch/unicore32/include/mach/regs-pci.h b/arch/unicore32/include/mach/regs-pci.h deleted file mode 100644 index 25bb307b87c3..000000000000 --- a/arch/unicore32/include/mach/regs-pci.h +++ /dev/null @@ -1,95 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity AHB-PCI Bridge Registers - */ - -/* - * AHB/PCI fixed physical address for pci addess configuration - */ -/* - * PCICFG Bridge Base Reg. - */ -#define PCICFG_BRIBASE (PKUNITY_PCICFG_BASE + 0x0000) -/* - * PCICFG Address Reg. - */ -#define PCICFG_ADDR (PKUNITY_PCICFG_BASE + 0x0004) -/* - * PCICFG Address Reg. - */ -#define PCICFG_DATA (PKUNITY_PCICFG_BASE + 0x0008) - -/* - * PCI Bridge configuration space - */ -#define PCIBRI_ID (PKUNITY_PCIBRI_BASE + 0x0000) -#define PCIBRI_CMD (PKUNITY_PCIBRI_BASE + 0x0004) -#define PCIBRI_CLASS (PKUNITY_PCIBRI_BASE + 0x0008) -#define PCIBRI_LTR (PKUNITY_PCIBRI_BASE + 0x000C) -#define PCIBRI_BAR0 (PKUNITY_PCIBRI_BASE + 0x0010) -#define PCIBRI_BAR1 (PKUNITY_PCIBRI_BASE + 0x0014) -#define PCIBRI_BAR2 (PKUNITY_PCIBRI_BASE + 0x0018) -#define PCIBRI_BAR3 (PKUNITY_PCIBRI_BASE + 0x001C) -#define PCIBRI_BAR4 (PKUNITY_PCIBRI_BASE + 0x0020) -#define PCIBRI_BAR5 (PKUNITY_PCIBRI_BASE + 0x0024) - -#define PCIBRI_PCICTL0 (PKUNITY_PCIBRI_BASE + 0x0100) -#define PCIBRI_PCIBAR0 (PKUNITY_PCIBRI_BASE + 0x0104) -#define PCIBRI_PCIAMR0 (PKUNITY_PCIBRI_BASE + 0x0108) -#define PCIBRI_PCITAR0 (PKUNITY_PCIBRI_BASE + 0x010C) -#define PCIBRI_PCICTL1 (PKUNITY_PCIBRI_BASE + 0x0110) -#define PCIBRI_PCIBAR1 (PKUNITY_PCIBRI_BASE + 0x0114) -#define PCIBRI_PCIAMR1 (PKUNITY_PCIBRI_BASE + 0x0118) -#define PCIBRI_PCITAR1 (PKUNITY_PCIBRI_BASE + 0x011C) -#define PCIBRI_PCICTL2 (PKUNITY_PCIBRI_BASE + 0x0120) -#define PCIBRI_PCIBAR2 (PKUNITY_PCIBRI_BASE + 0x0124) -#define PCIBRI_PCIAMR2 (PKUNITY_PCIBRI_BASE + 0x0128) -#define PCIBRI_PCITAR2 (PKUNITY_PCIBRI_BASE + 0x012C) -#define PCIBRI_PCICTL3 (PKUNITY_PCIBRI_BASE + 0x0130) -#define PCIBRI_PCIBAR3 (PKUNITY_PCIBRI_BASE + 0x0134) -#define PCIBRI_PCIAMR3 (PKUNITY_PCIBRI_BASE + 0x0138) -#define PCIBRI_PCITAR3 (PKUNITY_PCIBRI_BASE + 0x013C) -#define PCIBRI_PCICTL4 (PKUNITY_PCIBRI_BASE + 0x0140) -#define PCIBRI_PCIBAR4 (PKUNITY_PCIBRI_BASE + 0x0144) -#define PCIBRI_PCIAMR4 (PKUNITY_PCIBRI_BASE + 0x0148) -#define PCIBRI_PCITAR4 (PKUNITY_PCIBRI_BASE + 0x014C) -#define PCIBRI_PCICTL5 (PKUNITY_PCIBRI_BASE + 0x0150) -#define PCIBRI_PCIBAR5 (PKUNITY_PCIBRI_BASE + 0x0154) -#define PCIBRI_PCIAMR5 (PKUNITY_PCIBRI_BASE + 0x0158) -#define PCIBRI_PCITAR5 (PKUNITY_PCIBRI_BASE + 0x015C) - -#define PCIBRI_AHBCTL0 (PKUNITY_PCIBRI_BASE + 0x0180) -#define PCIBRI_AHBBAR0 (PKUNITY_PCIBRI_BASE + 0x0184) -#define PCIBRI_AHBAMR0 (PKUNITY_PCIBRI_BASE + 0x0188) -#define PCIBRI_AHBTAR0 (PKUNITY_PCIBRI_BASE + 0x018C) -#define PCIBRI_AHBCTL1 (PKUNITY_PCIBRI_BASE + 0x0190) -#define PCIBRI_AHBBAR1 (PKUNITY_PCIBRI_BASE + 0x0194) -#define PCIBRI_AHBAMR1 (PKUNITY_PCIBRI_BASE + 0x0198) -#define PCIBRI_AHBTAR1 (PKUNITY_PCIBRI_BASE + 0x019C) -#define PCIBRI_AHBCTL2 (PKUNITY_PCIBRI_BASE + 0x01A0) -#define PCIBRI_AHBBAR2 (PKUNITY_PCIBRI_BASE + 0x01A4) -#define PCIBRI_AHBAMR2 (PKUNITY_PCIBRI_BASE + 0x01A8) -#define PCIBRI_AHBTAR2 (PKUNITY_PCIBRI_BASE + 0x01AC) -#define PCIBRI_AHBCTL3 (PKUNITY_PCIBRI_BASE + 0x01B0) -#define PCIBRI_AHBBAR3 (PKUNITY_PCIBRI_BASE + 0x01B4) -#define PCIBRI_AHBAMR3 (PKUNITY_PCIBRI_BASE + 0x01B8) -#define PCIBRI_AHBTAR3 (PKUNITY_PCIBRI_BASE + 0x01BC) -#define PCIBRI_AHBCTL4 (PKUNITY_PCIBRI_BASE + 0x01C0) -#define PCIBRI_AHBBAR4 (PKUNITY_PCIBRI_BASE + 0x01C4) -#define PCIBRI_AHBAMR4 (PKUNITY_PCIBRI_BASE + 0x01C8) -#define PCIBRI_AHBTAR4 (PKUNITY_PCIBRI_BASE + 0x01CC) -#define PCIBRI_AHBCTL5 (PKUNITY_PCIBRI_BASE + 0x01D0) -#define PCIBRI_AHBBAR5 (PKUNITY_PCIBRI_BASE + 0x01D4) -#define PCIBRI_AHBAMR5 (PKUNITY_PCIBRI_BASE + 0x01D8) -#define PCIBRI_AHBTAR5 (PKUNITY_PCIBRI_BASE + 0x01DC) - -#define PCIBRI_CTLx_AT FIELD(1, 1, 2) -#define PCIBRI_CTLx_PREF FIELD(1, 1, 1) -#define PCIBRI_CTLx_MRL FIELD(1, 1, 0) - -#define PCIBRI_BARx_ADDR FIELD(0xFFFFFFFC, 30, 2) -#define PCIBRI_BARx_IO FIELD(1, 1, 0) -#define PCIBRI_BARx_MEM FIELD(0, 1, 0) - -#define PCIBRI_CMD_IO FIELD(1, 1, 0) -#define PCIBRI_CMD_MEM FIELD(1, 1, 1) diff --git a/arch/unicore32/include/mach/regs-pm.h b/arch/unicore32/include/mach/regs-pm.h deleted file mode 100644 index 777b1ace39b9..000000000000 --- a/arch/unicore32/include/mach/regs-pm.h +++ /dev/null @@ -1,127 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUNITY Power Manager (PM) Registers - */ -/* - * PM Control Reg PM_PMCR - */ -#define PM_PMCR (PKUNITY_PM_BASE + 0x0000) -/* - * PM General Conf. Reg PM_PGCR - */ -#define PM_PGCR (PKUNITY_PM_BASE + 0x0004) -/* - * PM PLL Conf. Reg PM_PPCR - */ -#define PM_PPCR (PKUNITY_PM_BASE + 0x0008) -/* - * PM Wakeup Enable Reg PM_PWER - */ -#define PM_PWER (PKUNITY_PM_BASE + 0x000C) -/* - * PM GPIO Sleep Status Reg PM_PGSR - */ -#define PM_PGSR (PKUNITY_PM_BASE + 0x0010) -/* - * PM Clock Gate Reg PM_PCGR - */ -#define PM_PCGR (PKUNITY_PM_BASE + 0x0014) -/* - * PM SYS PLL Conf. Reg PM_PLLSYSCFG - */ -#define PM_PLLSYSCFG (PKUNITY_PM_BASE + 0x0018) -/* - * PM DDR PLL Conf. Reg PM_PLLDDRCFG - */ -#define PM_PLLDDRCFG (PKUNITY_PM_BASE + 0x001C) -/* - * PM VGA PLL Conf. Reg PM_PLLVGACFG - */ -#define PM_PLLVGACFG (PKUNITY_PM_BASE + 0x0020) -/* - * PM Div Conf. Reg PM_DIVCFG - */ -#define PM_DIVCFG (PKUNITY_PM_BASE + 0x0024) -/* - * PM SYS PLL Status Reg PM_PLLSYSSTATUS - */ -#define PM_PLLSYSSTATUS (PKUNITY_PM_BASE + 0x0028) -/* - * PM DDR PLL Status Reg PM_PLLDDRSTATUS - */ -#define PM_PLLDDRSTATUS (PKUNITY_PM_BASE + 0x002C) -/* - * PM VGA PLL Status Reg PM_PLLVGASTATUS - */ -#define PM_PLLVGASTATUS (PKUNITY_PM_BASE + 0x0030) -/* - * PM Div Status Reg PM_DIVSTATUS - */ -#define PM_DIVSTATUS (PKUNITY_PM_BASE + 0x0034) -/* - * PM Software Reset Reg PM_SWRESET - */ -#define PM_SWRESET (PKUNITY_PM_BASE + 0x0038) -/* - * PM DDR2 PAD Start Reg PM_DDR2START - */ -#define PM_DDR2START (PKUNITY_PM_BASE + 0x003C) -/* - * PM DDR2 PAD Status Reg PM_DDR2CAL0 - */ -#define PM_DDR2CAL0 (PKUNITY_PM_BASE + 0x0040) -/* - * PM PLL DFC Done Reg PM_PLLDFCDONE - */ -#define PM_PLLDFCDONE (PKUNITY_PM_BASE + 0x0044) - -#define PM_PMCR_SFB FIELD(1, 1, 0) -#define PM_PMCR_IFB FIELD(1, 1, 1) -#define PM_PMCR_CFBSYS FIELD(1, 1, 2) -#define PM_PMCR_CFBDDR FIELD(1, 1, 3) -#define PM_PMCR_CFBVGA FIELD(1, 1, 4) -#define PM_PMCR_CFBDIVBCLK FIELD(1, 1, 5) - -/* - * GPIO 8~27 wake-up enable PM_PWER_GPIOHIGH - */ -#define PM_PWER_GPIOHIGH FIELD(1, 1, 8) -/* - * RTC alarm wake-up enable PM_PWER_RTC - */ -#define PM_PWER_RTC FIELD(1, 1, 31) - -#define PM_PCGR_BCLK64DDR FIELD(1, 1, 0) -#define PM_PCGR_BCLK64VGA FIELD(1, 1, 1) -#define PM_PCGR_BCLKDDR FIELD(1, 1, 2) -#define PM_PCGR_BCLKPCI FIELD(1, 1, 4) -#define PM_PCGR_BCLKDMAC FIELD(1, 1, 5) -#define PM_PCGR_BCLKUMAL FIELD(1, 1, 6) -#define PM_PCGR_BCLKUSB FIELD(1, 1, 7) -#define PM_PCGR_BCLKMME FIELD(1, 1, 10) -#define PM_PCGR_BCLKNAND FIELD(1, 1, 11) -#define PM_PCGR_BCLKH264E FIELD(1, 1, 12) -#define PM_PCGR_BCLKVGA FIELD(1, 1, 13) -#define PM_PCGR_BCLKH264D FIELD(1, 1, 14) -#define PM_PCGR_VECLK FIELD(1, 1, 15) -#define PM_PCGR_HECLK FIELD(1, 1, 16) -#define PM_PCGR_HDCLK FIELD(1, 1, 17) -#define PM_PCGR_NANDCLK FIELD(1, 1, 18) -#define PM_PCGR_GECLK FIELD(1, 1, 19) -#define PM_PCGR_VGACLK FIELD(1, 1, 20) -#define PM_PCGR_PCICLK FIELD(1, 1, 21) -#define PM_PCGR_SATACLK FIELD(1, 1, 25) - -/* - * [23:20]PM_DIVCFG_VGACLK(v) - */ -#define PM_DIVCFG_VGACLK_MASK FMASK(4, 20) -#define PM_DIVCFG_VGACLK(v) FIELD((v), 4, 20) - -#define PM_SWRESET_USB FIELD(1, 1, 6) -#define PM_SWRESET_VGADIV FIELD(1, 1, 26) -#define PM_SWRESET_GEDIV FIELD(1, 1, 27) - -#define PM_PLLDFCDONE_SYSDFC FIELD(1, 1, 0) -#define PM_PLLDFCDONE_DDRDFC FIELD(1, 1, 1) -#define PM_PLLDFCDONE_VGADFC FIELD(1, 1, 2) diff --git a/arch/unicore32/include/mach/regs-ps2.h b/arch/unicore32/include/mach/regs-ps2.h deleted file mode 100644 index d539d7482462..000000000000 --- a/arch/unicore32/include/mach/regs-ps2.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity PS2 Controller Registers - */ -/* - * the same as I8042_DATA_REG PS2_DATA - */ -#define PS2_DATA (PKUNITY_PS2_BASE + 0x0060) -/* - * the same as I8042_COMMAND_REG PS2_COMMAND - */ -#define PS2_COMMAND (PKUNITY_PS2_BASE + 0x0064) -/* - * the same as I8042_STATUS_REG PS2_STATUS - */ -#define PS2_STATUS (PKUNITY_PS2_BASE + 0x0064) -/* - * counter reg PS2_CNT - */ -#define PS2_CNT (PKUNITY_PS2_BASE + 0x0068) - diff --git a/arch/unicore32/include/mach/regs-resetc.h b/arch/unicore32/include/mach/regs-resetc.h deleted file mode 100644 index 5f2b9d77a9ec..000000000000 --- a/arch/unicore32/include/mach/regs-resetc.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity Reset Controller (RC) Registers - */ -/* - * Software Reset Register - */ -#define RESETC_SWRR (PKUNITY_RESETC_BASE + 0x0000) -/* - * Reset Status Register - */ -#define RESETC_RSSR (PKUNITY_RESETC_BASE + 0x0004) - -/* - * Software Reset Bit - */ -#define RESETC_SWRR_SRB FIELD(1, 1, 0) - -/* - * Hardware Reset - */ -#define RESETC_RSSR_HWR FIELD(1, 1, 0) -/* - * Software Reset - */ -#define RESETC_RSSR_SWR FIELD(1, 1, 1) -/* - * Watchdog Reset - */ -#define RESETC_RSSR_WDR FIELD(1, 1, 2) -/* - * Sleep Mode Reset - */ -#define RESETC_RSSR_SMR FIELD(1, 1, 3) - diff --git a/arch/unicore32/include/mach/regs-rtc.h b/arch/unicore32/include/mach/regs-rtc.h deleted file mode 100644 index f2f7f47eb65e..000000000000 --- a/arch/unicore32/include/mach/regs-rtc.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity Real-Time Clock (RTC) control registers - */ -/* - * RTC Alarm Reg RTC_RTAR - */ -#define RTC_RTAR (PKUNITY_RTC_BASE + 0x0000) -/* - * RTC Count Reg RTC_RCNR - */ -#define RTC_RCNR (PKUNITY_RTC_BASE + 0x0004) -/* - * RTC Trim Reg RTC_RTTR - */ -#define RTC_RTTR (PKUNITY_RTC_BASE + 0x0008) -/* - * RTC Status Reg RTC_RTSR - */ -#define RTC_RTSR (PKUNITY_RTC_BASE + 0x0010) - -/* - * ALarm detected RTC_RTSR_AL - */ -#define RTC_RTSR_AL FIELD(1, 1, 0) -/* - * 1 Hz clock detected RTC_RTSR_HZ - */ -#define RTC_RTSR_HZ FIELD(1, 1, 1) -/* - * ALarm interrupt Enable RTC_RTSR_ALE - */ -#define RTC_RTSR_ALE FIELD(1, 1, 2) -/* - * 1 Hz clock interrupt Enable RTC_RTSR_HZE - */ -#define RTC_RTSR_HZE FIELD(1, 1, 3) - diff --git a/arch/unicore32/include/mach/regs-sdc.h b/arch/unicore32/include/mach/regs-sdc.h deleted file mode 100644 index 658bfaf4cb3c..000000000000 --- a/arch/unicore32/include/mach/regs-sdc.h +++ /dev/null @@ -1,157 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity Multi-Media Card and Security Digital Card (MMC/SD) Registers - */ -/* - * Clock Control Reg SDC_CCR - */ -#define SDC_CCR (PKUNITY_SDC_BASE + 0x0000) -/* - * Software Reset Reg SDC_SRR - */ -#define SDC_SRR (PKUNITY_SDC_BASE + 0x0004) -/* - * Argument Reg SDC_ARGUMENT - */ -#define SDC_ARGUMENT (PKUNITY_SDC_BASE + 0x0008) -/* - * Command Reg SDC_COMMAND - */ -#define SDC_COMMAND (PKUNITY_SDC_BASE + 0x000C) -/* - * Block Size Reg SDC_BLOCKSIZE - */ -#define SDC_BLOCKSIZE (PKUNITY_SDC_BASE + 0x0010) -/* - * Block Cound Reg SDC_BLOCKCOUNT - */ -#define SDC_BLOCKCOUNT (PKUNITY_SDC_BASE + 0x0014) -/* - * Transfer Mode Reg SDC_TMR - */ -#define SDC_TMR (PKUNITY_SDC_BASE + 0x0018) -/* - * Response Reg. 0 SDC_RES0 - */ -#define SDC_RES0 (PKUNITY_SDC_BASE + 0x001C) -/* - * Response Reg. 1 SDC_RES1 - */ -#define SDC_RES1 (PKUNITY_SDC_BASE + 0x0020) -/* - * Response Reg. 2 SDC_RES2 - */ -#define SDC_RES2 (PKUNITY_SDC_BASE + 0x0024) -/* - * Response Reg. 3 SDC_RES3 - */ -#define SDC_RES3 (PKUNITY_SDC_BASE + 0x0028) -/* - * Read Timeout Control Reg SDC_RTCR - */ -#define SDC_RTCR (PKUNITY_SDC_BASE + 0x002C) -/* - * Interrupt Status Reg SDC_ISR - */ -#define SDC_ISR (PKUNITY_SDC_BASE + 0x0030) -/* - * Interrupt Status Mask Reg SDC_ISMR - */ -#define SDC_ISMR (PKUNITY_SDC_BASE + 0x0034) -/* - * RX FIFO SDC_RXFIFO - */ -#define SDC_RXFIFO (PKUNITY_SDC_BASE + 0x0038) -/* - * TX FIFO SDC_TXFIFO - */ -#define SDC_TXFIFO (PKUNITY_SDC_BASE + 0x003C) - -/* - * SD Clock Enable SDC_CCR_CLKEN - */ -#define SDC_CCR_CLKEN FIELD(1, 1, 2) -/* - * [15:8] SDC_CCR_PDIV(v) - */ -#define SDC_CCR_PDIV(v) FIELD((v), 8, 8) - -/* - * Software reset enable SDC_SRR_ENABLE - */ -#define SDC_SRR_ENABLE FIELD(0, 1, 0) -/* - * Software reset disable SDC_SRR_DISABLE - */ -#define SDC_SRR_DISABLE FIELD(1, 1, 0) - -/* - * Response type SDC_COMMAND_RESTYPE_MASK - */ -#define SDC_COMMAND_RESTYPE_MASK FMASK(2, 0) -/* - * No response SDC_COMMAND_RESTYPE_NONE - */ -#define SDC_COMMAND_RESTYPE_NONE FIELD(0, 2, 0) -/* - * 136-bit long response SDC_COMMAND_RESTYPE_LONG - */ -#define SDC_COMMAND_RESTYPE_LONG FIELD(1, 2, 0) -/* - * 48-bit short response SDC_COMMAND_RESTYPE_SHORT - */ -#define SDC_COMMAND_RESTYPE_SHORT FIELD(2, 2, 0) -/* - * 48-bit short and test if busy response SDC_COMMAND_RESTYPE_SHORTBUSY - */ -#define SDC_COMMAND_RESTYPE_SHORTBUSY FIELD(3, 2, 0) -/* - * data ready SDC_COMMAND_DATAREADY - */ -#define SDC_COMMAND_DATAREADY FIELD(1, 1, 2) -#define SDC_COMMAND_CMDEN FIELD(1, 1, 3) -/* - * [10:5] SDC_COMMAND_CMDINDEX(v) - */ -#define SDC_COMMAND_CMDINDEX(v) FIELD((v), 6, 5) - -/* - * [10:0] SDC_BLOCKSIZE_BSMASK(v) - */ -#define SDC_BLOCKSIZE_BSMASK(v) FIELD((v), 11, 0) -/* - * [11:0] SDC_BLOCKCOUNT_BCMASK(v) - */ -#define SDC_BLOCKCOUNT_BCMASK(v) FIELD((v), 12, 0) - -/* - * Data Width 1bit SDC_TMR_WTH_1BIT - */ -#define SDC_TMR_WTH_1BIT FIELD(0, 1, 0) -/* - * Data Width 4bit SDC_TMR_WTH_4BIT - */ -#define SDC_TMR_WTH_4BIT FIELD(1, 1, 0) -/* - * Read SDC_TMR_DIR_READ - */ -#define SDC_TMR_DIR_READ FIELD(0, 1, 1) -/* - * Write SDC_TMR_DIR_WRITE - */ -#define SDC_TMR_DIR_WRITE FIELD(1, 1, 1) - -#define SDC_IR_MASK FMASK(13, 0) -#define SDC_IR_RESTIMEOUT FIELD(1, 1, 0) -#define SDC_IR_WRITECRC FIELD(1, 1, 1) -#define SDC_IR_READCRC FIELD(1, 1, 2) -#define SDC_IR_TXFIFOREAD FIELD(1, 1, 3) -#define SDC_IR_RXFIFOWRITE FIELD(1, 1, 4) -#define SDC_IR_READTIMEOUT FIELD(1, 1, 5) -#define SDC_IR_DATACOMPLETE FIELD(1, 1, 6) -#define SDC_IR_CMDCOMPLETE FIELD(1, 1, 7) -#define SDC_IR_RXFIFOFULL FIELD(1, 1, 8) -#define SDC_IR_RXFIFOEMPTY FIELD(1, 1, 9) -#define SDC_IR_TXFIFOFULL FIELD(1, 1, 10) -#define SDC_IR_TXFIFOEMPTY FIELD(1, 1, 11) -#define SDC_IR_ENDCMDWITHRES FIELD(1, 1, 12) diff --git a/arch/unicore32/include/mach/regs-spi.h b/arch/unicore32/include/mach/regs-spi.h deleted file mode 100644 index 3460647a9c2a..000000000000 --- a/arch/unicore32/include/mach/regs-spi.h +++ /dev/null @@ -1,99 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity Serial Peripheral Interface (SPI) Registers - */ -/* - * Control reg. 0 SPI_CR0 - */ -#define SPI_CR0 (PKUNITY_SPI_BASE + 0x0000) -/* - * Control reg. 1 SPI_CR1 - */ -#define SPI_CR1 (PKUNITY_SPI_BASE + 0x0004) -/* - * Enable reg SPI_SSIENR - */ -#define SPI_SSIENR (PKUNITY_SPI_BASE + 0x0008) -/* - * Status reg SPI_SR - */ -#define SPI_SR (PKUNITY_SPI_BASE + 0x0028) -/* - * Interrupt Mask reg SPI_IMR - */ -#define SPI_IMR (PKUNITY_SPI_BASE + 0x002C) -/* - * Interrupt Status reg SPI_ISR - */ -#define SPI_ISR (PKUNITY_SPI_BASE + 0x0030) - -/* - * Enable SPI Controller SPI_SSIENR_EN - */ -#define SPI_SSIENR_EN FIELD(1, 1, 0) - -/* - * SPI Busy SPI_SR_BUSY - */ -#define SPI_SR_BUSY FIELD(1, 1, 0) -/* - * Transmit FIFO Not Full SPI_SR_TFNF - */ -#define SPI_SR_TFNF FIELD(1, 1, 1) -/* - * Transmit FIFO Empty SPI_SR_TFE - */ -#define SPI_SR_TFE FIELD(1, 1, 2) -/* - * Receive FIFO Not Empty SPI_SR_RFNE - */ -#define SPI_SR_RFNE FIELD(1, 1, 3) -/* - * Receive FIFO Full SPI_SR_RFF - */ -#define SPI_SR_RFF FIELD(1, 1, 4) - -/* - * Trans. FIFO Empty Interrupt Status SPI_ISR_TXEIS - */ -#define SPI_ISR_TXEIS FIELD(1, 1, 0) -/* - * Trans. FIFO Overflow Interrupt Status SPI_ISR_TXOIS - */ -#define SPI_ISR_TXOIS FIELD(1, 1, 1) -/* - * Receiv. FIFO Underflow Interrupt Status SPI_ISR_RXUIS - */ -#define SPI_ISR_RXUIS FIELD(1, 1, 2) -/* - * Receiv. FIFO Overflow Interrupt Status SPI_ISR_RXOIS - */ -#define SPI_ISR_RXOIS FIELD(1, 1, 3) -/* - * Receiv. FIFO Full Interrupt Status SPI_ISR_RXFIS - */ -#define SPI_ISR_RXFIS FIELD(1, 1, 4) -#define SPI_ISR_MSTIS FIELD(1, 1, 5) - -/* - * Trans. FIFO Empty Interrupt Mask SPI_IMR_TXEIM - */ -#define SPI_IMR_TXEIM FIELD(1, 1, 0) -/* - * Trans. FIFO Overflow Interrupt Mask SPI_IMR_TXOIM - */ -#define SPI_IMR_TXOIM FIELD(1, 1, 1) -/* - * Receiv. FIFO Underflow Interrupt Mask SPI_IMR_RXUIM - */ -#define SPI_IMR_RXUIM FIELD(1, 1, 2) -/* - * Receiv. FIFO Overflow Interrupt Mask SPI_IMR_RXOIM - */ -#define SPI_IMR_RXOIM FIELD(1, 1, 3) -/* - * Receiv. FIFO Full Interrupt Mask SPI_IMR_RXFIM - */ -#define SPI_IMR_RXFIM FIELD(1, 1, 4) -#define SPI_IMR_MSTIM FIELD(1, 1, 5) - diff --git a/arch/unicore32/include/mach/regs-uart.h b/arch/unicore32/include/mach/regs-uart.h deleted file mode 100644 index 9fa6b1938b77..000000000000 --- a/arch/unicore32/include/mach/regs-uart.h +++ /dev/null @@ -1,3 +0,0 @@ -/* - * PKUnity Universal Asynchronous Receiver/Transmitter (UART) Registers - */ diff --git a/arch/unicore32/include/mach/regs-umal.h b/arch/unicore32/include/mach/regs-umal.h deleted file mode 100644 index 7023089c61c6..000000000000 --- a/arch/unicore32/include/mach/regs-umal.h +++ /dev/null @@ -1,230 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity Ultra Media Access Layer (UMAL) Ethernet MAC Registers - */ - -/* MAC module of UMAL */ -/* UMAL's MAC module includes G/MII interface, several additional PHY - * interfaces, and MAC control sub-layer, which provides support for control - * frames (e.g. PAUSE frames). - */ -/* - * TX/RX reset and control UMAL_CFG1 - */ -#define UMAL_CFG1 (PKUNITY_UMAL_BASE + 0x0000) -/* - * MAC interface mode control UMAL_CFG2 - */ -#define UMAL_CFG2 (PKUNITY_UMAL_BASE + 0x0004) -/* - * Inter Packet/Frame Gap UMAL_IPGIFG - */ -#define UMAL_IPGIFG (PKUNITY_UMAL_BASE + 0x0008) -/* - * Collision retry or backoff UMAL_HALFDUPLEX - */ -#define UMAL_HALFDUPLEX (PKUNITY_UMAL_BASE + 0x000c) -/* - * Maximum Frame Length UMAL_MAXFRAME - */ -#define UMAL_MAXFRAME (PKUNITY_UMAL_BASE + 0x0010) -/* - * Test Regsiter UMAL_TESTREG - */ -#define UMAL_TESTREG (PKUNITY_UMAL_BASE + 0x001c) -/* - * MII Management Configure UMAL_MIICFG - */ -#define UMAL_MIICFG (PKUNITY_UMAL_BASE + 0x0020) -/* - * MII Management Command UMAL_MIICMD - */ -#define UMAL_MIICMD (PKUNITY_UMAL_BASE + 0x0024) -/* - * MII Management Address UMAL_MIIADDR - */ -#define UMAL_MIIADDR (PKUNITY_UMAL_BASE + 0x0028) -/* - * MII Management Control UMAL_MIICTRL - */ -#define UMAL_MIICTRL (PKUNITY_UMAL_BASE + 0x002c) -/* - * MII Management Status UMAL_MIISTATUS - */ -#define UMAL_MIISTATUS (PKUNITY_UMAL_BASE + 0x0030) -/* - * MII Management Indicator UMAL_MIIIDCT - */ -#define UMAL_MIIIDCT (PKUNITY_UMAL_BASE + 0x0034) -/* - * Interface Control UMAL_IFCTRL - */ -#define UMAL_IFCTRL (PKUNITY_UMAL_BASE + 0x0038) -/* - * Interface Status UMAL_IFSTATUS - */ -#define UMAL_IFSTATUS (PKUNITY_UMAL_BASE + 0x003c) -/* - * MAC address (high 4 bytes) UMAL_STADDR1 - */ -#define UMAL_STADDR1 (PKUNITY_UMAL_BASE + 0x0040) -/* - * MAC address (low 2 bytes) UMAL_STADDR2 - */ -#define UMAL_STADDR2 (PKUNITY_UMAL_BASE + 0x0044) - -/* FIFO MODULE OF UMAL */ -/* UMAL's FIFO module provides data queuing for increased system level - * throughput - */ -#define UMAL_FIFOCFG0 (PKUNITY_UMAL_BASE + 0x0048) -#define UMAL_FIFOCFG1 (PKUNITY_UMAL_BASE + 0x004c) -#define UMAL_FIFOCFG2 (PKUNITY_UMAL_BASE + 0x0050) -#define UMAL_FIFOCFG3 (PKUNITY_UMAL_BASE + 0x0054) -#define UMAL_FIFOCFG4 (PKUNITY_UMAL_BASE + 0x0058) -#define UMAL_FIFOCFG5 (PKUNITY_UMAL_BASE + 0x005c) -#define UMAL_FIFORAM0 (PKUNITY_UMAL_BASE + 0x0060) -#define UMAL_FIFORAM1 (PKUNITY_UMAL_BASE + 0x0064) -#define UMAL_FIFORAM2 (PKUNITY_UMAL_BASE + 0x0068) -#define UMAL_FIFORAM3 (PKUNITY_UMAL_BASE + 0x006c) -#define UMAL_FIFORAM4 (PKUNITY_UMAL_BASE + 0x0070) -#define UMAL_FIFORAM5 (PKUNITY_UMAL_BASE + 0x0074) -#define UMAL_FIFORAM6 (PKUNITY_UMAL_BASE + 0x0078) -#define UMAL_FIFORAM7 (PKUNITY_UMAL_BASE + 0x007c) - -/* MAHBE MODULE OF UMAL */ -/* UMAL's MAHBE module interfaces to the host system through 32-bit AHB Master - * and Slave ports.Registers within the M-AHBE provide Control and Status - * information concerning these transfers. - */ -/* - * Transmit Control UMAL_DMATxCtrl - */ -#define UMAL_DMATxCtrl (PKUNITY_UMAL_BASE + 0x0180) -/* - * Pointer to TX Descripter UMAL_DMATxDescriptor - */ -#define UMAL_DMATxDescriptor (PKUNITY_UMAL_BASE + 0x0184) -/* - * Status of Tx Packet Transfers UMAL_DMATxStatus - */ -#define UMAL_DMATxStatus (PKUNITY_UMAL_BASE + 0x0188) -/* - * Receive Control UMAL_DMARxCtrl - */ -#define UMAL_DMARxCtrl (PKUNITY_UMAL_BASE + 0x018c) -/* - * Pointer to Rx Descriptor UMAL_DMARxDescriptor - */ -#define UMAL_DMARxDescriptor (PKUNITY_UMAL_BASE + 0x0190) -/* - * Status of Rx Packet Transfers UMAL_DMARxStatus - */ -#define UMAL_DMARxStatus (PKUNITY_UMAL_BASE + 0x0194) -/* - * Interrupt Mask UMAL_DMAIntrMask - */ -#define UMAL_DMAIntrMask (PKUNITY_UMAL_BASE + 0x0198) -/* - * Interrupts, read only UMAL_DMAInterrupt - */ -#define UMAL_DMAInterrupt (PKUNITY_UMAL_BASE + 0x019c) - -/* - * Commands for UMAL_CFG1 register - */ -#define UMAL_CFG1_TXENABLE FIELD(1, 1, 0) -#define UMAL_CFG1_RXENABLE FIELD(1, 1, 2) -#define UMAL_CFG1_TXFLOWCTL FIELD(1, 1, 4) -#define UMAL_CFG1_RXFLOWCTL FIELD(1, 1, 5) -#define UMAL_CFG1_CONFLPBK FIELD(1, 1, 8) -#define UMAL_CFG1_RESET FIELD(1, 1, 31) -#define UMAL_CFG1_CONFFLCTL (MAC_TX_FLOW_CTL | MAC_RX_FLOW_CTL) - -/* - * Commands for UMAL_CFG2 register - */ -#define UMAL_CFG2_FULLDUPLEX FIELD(1, 1, 0) -#define UMAL_CFG2_CRCENABLE FIELD(1, 1, 1) -#define UMAL_CFG2_PADCRC FIELD(1, 1, 2) -#define UMAL_CFG2_LENGTHCHECK FIELD(1, 1, 4) -#define UMAL_CFG2_MODEMASK FMASK(2, 8) -#define UMAL_CFG2_NIBBLEMODE FIELD(1, 2, 8) -#define UMAL_CFG2_BYTEMODE FIELD(2, 2, 8) -#define UMAL_CFG2_PREAMBLENMASK FMASK(4, 12) -#define UMAL_CFG2_DEFPREAMBLEN FIELD(7, 4, 12) -#define UMAL_CFG2_FD100 (UMAL_CFG2_DEFPREAMBLEN | UMAL_CFG2_NIBBLEMODE \ - | UMAL_CFG2_LENGTHCHECK | UMAL_CFG2_PADCRC \ - | UMAL_CFG2_CRCENABLE | UMAL_CFG2_FULLDUPLEX) -#define UMAL_CFG2_FD1000 (UMAL_CFG2_DEFPREAMBLEN | UMAL_CFG2_BYTEMODE \ - | UMAL_CFG2_LENGTHCHECK | UMAL_CFG2_PADCRC \ - | UMAL_CFG2_CRCENABLE | UMAL_CFG2_FULLDUPLEX) -#define UMAL_CFG2_HD100 (UMAL_CFG2_DEFPREAMBLEN | UMAL_CFG2_NIBBLEMODE \ - | UMAL_CFG2_LENGTHCHECK | UMAL_CFG2_PADCRC \ - | UMAL_CFG2_CRCENABLE) - -/* - * Command for UMAL_IFCTRL register - */ -#define UMAL_IFCTRL_RESET FIELD(1, 1, 31) - -/* - * Command for UMAL_MIICFG register - */ -#define UMAL_MIICFG_RESET FIELD(1, 1, 31) - -/* - * Command for UMAL_MIICMD register - */ -#define UMAL_MIICMD_READ FIELD(1, 1, 0) - -/* - * Command for UMAL_MIIIDCT register - */ -#define UMAL_MIIIDCT_BUSY FIELD(1, 1, 0) -#define UMAL_MIIIDCT_NOTVALID FIELD(1, 1, 2) - -/* - * Commands for DMATxCtrl regesters - */ -#define UMAL_DMA_Enable FIELD(1, 1, 0) - -/* - * Commands for DMARxCtrl regesters - */ -#define UMAL_DMAIntrMask_ENABLEHALFWORD FIELD(1, 1, 16) - -/* - * Command for DMARxStatus - */ -#define CLR_RX_BUS_ERR FIELD(1, 1, 3) -#define CLR_RX_OVERFLOW FIELD(1, 1, 2) -#define CLR_RX_PKT FIELD(1, 1, 0) - -/* - * Command for DMATxStatus - */ -#define CLR_TX_BUS_ERR FIELD(1, 1, 3) -#define CLR_TX_UNDERRUN FIELD(1, 1, 1) -#define CLR_TX_PKT FIELD(1, 1, 0) - -/* - * Commands for DMAIntrMask and DMAInterrupt register - */ -#define INT_RX_MASK FIELD(0xd, 4, 4) -#define INT_TX_MASK FIELD(0xb, 4, 0) - -#define INT_RX_BUS_ERR FIELD(1, 1, 7) -#define INT_RX_OVERFLOW FIELD(1, 1, 6) -#define INT_RX_PKT FIELD(1, 1, 4) -#define INT_TX_BUS_ERR FIELD(1, 1, 3) -#define INT_TX_UNDERRUN FIELD(1, 1, 1) -#define INT_TX_PKT FIELD(1, 1, 0) - -/* - * MARCOS of UMAL's descriptors - */ -#define UMAL_DESC_PACKETSIZE_EMPTY FIELD(1, 1, 31) -#define UMAL_DESC_PACKETSIZE_NONEMPTY FIELD(0, 1, 31) -#define UMAL_DESC_PACKETSIZE_SIZEMASK FMASK(12, 0) - diff --git a/arch/unicore32/include/mach/regs-unigfx.h b/arch/unicore32/include/mach/regs-unigfx.h deleted file mode 100644 index 553d1157c6b2..000000000000 --- a/arch/unicore32/include/mach/regs-unigfx.h +++ /dev/null @@ -1,201 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PKUnity UNIGFX Registers - */ - -#define UDE_BASE (PKUNITY_UNIGFX_BASE + 0x1400) -#define UGE_BASE (PKUNITY_UNIGFX_BASE + 0x0000) - -/* - * command reg for UNIGFX DE - */ -/* - * control reg UDE_CFG - */ -#define UDE_CFG (UDE_BASE + 0x0000) -/* - * framebuffer start address reg UDE_FSA - */ -#define UDE_FSA (UDE_BASE + 0x0004) -/* - * line size reg UDE_LS - */ -#define UDE_LS (UDE_BASE + 0x0008) -/* - * pitch size reg UDE_PS - */ -#define UDE_PS (UDE_BASE + 0x000C) -/* - * horizontal active time reg UDE_HAT - */ -#define UDE_HAT (UDE_BASE + 0x0010) -/* - * horizontal blank time reg UDE_HBT - */ -#define UDE_HBT (UDE_BASE + 0x0014) -/* - * horizontal sync time reg UDE_HST - */ -#define UDE_HST (UDE_BASE + 0x0018) -/* - * vertival active time reg UDE_VAT - */ -#define UDE_VAT (UDE_BASE + 0x001C) -/* - * vertival blank time reg UDE_VBT - */ -#define UDE_VBT (UDE_BASE + 0x0020) -/* - * vertival sync time reg UDE_VST - */ -#define UDE_VST (UDE_BASE + 0x0024) -/* - * cursor position UDE_CXY - */ -#define UDE_CXY (UDE_BASE + 0x0028) -/* - * cursor front color UDE_CC0 - */ -#define UDE_CC0 (UDE_BASE + 0x002C) -/* - * cursor background color UDE_CC1 - */ -#define UDE_CC1 (UDE_BASE + 0x0030) -/* - * video position UDE_VXY - */ -#define UDE_VXY (UDE_BASE + 0x0034) -/* - * video start address reg UDE_VSA - */ -#define UDE_VSA (UDE_BASE + 0x0040) -/* - * video size reg UDE_VS - */ -#define UDE_VS (UDE_BASE + 0x004C) - -/* - * command reg for UNIGFX GE - */ -/* - * src xy reg UGE_SRCXY - */ -#define UGE_SRCXY (UGE_BASE + 0x0000) -/* - * dst xy reg UGE_DSTXY - */ -#define UGE_DSTXY (UGE_BASE + 0x0004) -/* - * pitch reg UGE_PITCH - */ -#define UGE_PITCH (UGE_BASE + 0x0008) -/* - * src start reg UGE_SRCSTART - */ -#define UGE_SRCSTART (UGE_BASE + 0x000C) -/* - * dst start reg UGE_DSTSTART - */ -#define UGE_DSTSTART (UGE_BASE + 0x0010) -/* - * width height reg UGE_WIDHEIGHT - */ -#define UGE_WIDHEIGHT (UGE_BASE + 0x0014) -/* - * rop alpah reg UGE_ROPALPHA - */ -#define UGE_ROPALPHA (UGE_BASE + 0x0018) -/* - * front color UGE_FCOLOR - */ -#define UGE_FCOLOR (UGE_BASE + 0x001C) -/* - * background color UGE_BCOLOR - */ -#define UGE_BCOLOR (UGE_BASE + 0x0020) -/* - * src color key for high value UGE_SCH - */ -#define UGE_SCH (UGE_BASE + 0x0024) -/* - * dst color key for high value UGE_DCH - */ -#define UGE_DCH (UGE_BASE + 0x0028) -/* - * src color key for low value UGE_SCL - */ -#define UGE_SCL (UGE_BASE + 0x002C) -/* - * dst color key for low value UGE_DCL - */ -#define UGE_DCL (UGE_BASE + 0x0030) -/* - * clip 0 reg UGE_CLIP0 - */ -#define UGE_CLIP0 (UGE_BASE + 0x0034) -/* - * clip 1 reg UGE_CLIP1 - */ -#define UGE_CLIP1 (UGE_BASE + 0x0038) -/* - * command reg UGE_COMMAND - */ -#define UGE_COMMAND (UGE_BASE + 0x003C) -/* - * pattern 0 UGE_P0 - */ -#define UGE_P0 (UGE_BASE + 0x0040) -#define UGE_P1 (UGE_BASE + 0x0044) -#define UGE_P2 (UGE_BASE + 0x0048) -#define UGE_P3 (UGE_BASE + 0x004C) -#define UGE_P4 (UGE_BASE + 0x0050) -#define UGE_P5 (UGE_BASE + 0x0054) -#define UGE_P6 (UGE_BASE + 0x0058) -#define UGE_P7 (UGE_BASE + 0x005C) -#define UGE_P8 (UGE_BASE + 0x0060) -#define UGE_P9 (UGE_BASE + 0x0064) -#define UGE_P10 (UGE_BASE + 0x0068) -#define UGE_P11 (UGE_BASE + 0x006C) -#define UGE_P12 (UGE_BASE + 0x0070) -#define UGE_P13 (UGE_BASE + 0x0074) -#define UGE_P14 (UGE_BASE + 0x0078) -#define UGE_P15 (UGE_BASE + 0x007C) -#define UGE_P16 (UGE_BASE + 0x0080) -#define UGE_P17 (UGE_BASE + 0x0084) -#define UGE_P18 (UGE_BASE + 0x0088) -#define UGE_P19 (UGE_BASE + 0x008C) -#define UGE_P20 (UGE_BASE + 0x0090) -#define UGE_P21 (UGE_BASE + 0x0094) -#define UGE_P22 (UGE_BASE + 0x0098) -#define UGE_P23 (UGE_BASE + 0x009C) -#define UGE_P24 (UGE_BASE + 0x00A0) -#define UGE_P25 (UGE_BASE + 0x00A4) -#define UGE_P26 (UGE_BASE + 0x00A8) -#define UGE_P27 (UGE_BASE + 0x00AC) -#define UGE_P28 (UGE_BASE + 0x00B0) -#define UGE_P29 (UGE_BASE + 0x00B4) -#define UGE_P30 (UGE_BASE + 0x00B8) -#define UGE_P31 (UGE_BASE + 0x00BC) - -#define UDE_CFG_DST_MASK FMASK(2, 8) -#define UDE_CFG_DST8 FIELD(0x0, 2, 8) -#define UDE_CFG_DST16 FIELD(0x1, 2, 8) -#define UDE_CFG_DST24 FIELD(0x2, 2, 8) -#define UDE_CFG_DST32 FIELD(0x3, 2, 8) - -/* - * GDEN enable UDE_CFG_GDEN_ENABLE - */ -#define UDE_CFG_GDEN_ENABLE FIELD(1, 1, 3) -/* - * VDEN enable UDE_CFG_VDEN_ENABLE - */ -#define UDE_CFG_VDEN_ENABLE FIELD(1, 1, 4) -/* - * CDEN enable UDE_CFG_CDEN_ENABLE - */ -#define UDE_CFG_CDEN_ENABLE FIELD(1, 1, 5) -/* - * TIMEUP enable UDE_CFG_TIMEUP_ENABLE - */ -#define UDE_CFG_TIMEUP_ENABLE FIELD(1, 1, 6) diff --git a/arch/unicore32/include/mach/uncompress.h b/arch/unicore32/include/mach/uncompress.h deleted file mode 100644 index 0c1a56a1913f..000000000000 --- a/arch/unicore32/include/mach/uncompress.h +++ /dev/null @@ -1,31 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/include/mach/uncompress.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#ifndef __MACH_PUV3_UNCOMPRESS_H__ -#define __MACH_PUV3_UNCOMPRESS_H__ - -#include -#include - -extern char input_data[]; -extern char input_data_end[]; - -static void arch_decomp_puts(const char *ptr) -{ - char c; - - while ((c = *ptr++) != '\0') { - if (c == '\n') - putc('\r'); - putc(c); - } -} -#define ARCH_HAVE_DECOMP_PUTS - -#endif /* __MACH_PUV3_UNCOMPRESS_H__ */ diff --git a/arch/unicore32/include/uapi/asm/Kbuild b/arch/unicore32/include/uapi/asm/Kbuild deleted file mode 100644 index e78470141932..000000000000 --- a/arch/unicore32/include/uapi/asm/Kbuild +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -generic-y += ucontext.h diff --git a/arch/unicore32/include/uapi/asm/byteorder.h b/arch/unicore32/include/uapi/asm/byteorder.h deleted file mode 100644 index 864fe4814cf4..000000000000 --- a/arch/unicore32/include/uapi/asm/byteorder.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * linux/arch/unicore32/include/asm/byteorder.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * UniCore ONLY support Little Endian mode, the data bus is connected such - * that byte accesses appear as: - * 0 = d0...d7, 1 = d8...d15, 2 = d16...d23, 3 = d24...d31 - * and word accesses (data or instruction) appear as: - * d0...d31 - */ -#ifndef __UNICORE_BYTEORDER_H__ -#define __UNICORE_BYTEORDER_H__ - -#include - -#endif - diff --git a/arch/unicore32/include/uapi/asm/ptrace.h b/arch/unicore32/include/uapi/asm/ptrace.h deleted file mode 100644 index 2820de83e37d..000000000000 --- a/arch/unicore32/include/uapi/asm/ptrace.h +++ /dev/null @@ -1,91 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * linux/arch/unicore32/include/asm/ptrace.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#ifndef _UAPI__UNICORE_PTRACE_H__ -#define _UAPI__UNICORE_PTRACE_H__ - -#define PTRACE_GET_THREAD_AREA 22 - -/* - * PSR bits - */ -#define USER_MODE 0x00000010 -#define REAL_MODE 0x00000011 -#define INTR_MODE 0x00000012 -#define PRIV_MODE 0x00000013 -#define ABRT_MODE 0x00000017 -#define EXTN_MODE 0x0000001b -#define SUSR_MODE 0x0000001f -#define MODE_MASK 0x0000001f -#define PSR_R_BIT 0x00000040 -#define PSR_I_BIT 0x00000080 -#define PSR_V_BIT 0x10000000 -#define PSR_C_BIT 0x20000000 -#define PSR_Z_BIT 0x40000000 -#define PSR_S_BIT 0x80000000 - -/* - * Groups of PSR bits - */ -#define PSR_f 0xff000000 /* Flags */ -#define PSR_c 0x000000ff /* Control */ - -#ifndef __ASSEMBLY__ - -/* - * This struct defines the way the registers are stored on the - * stack during a system call. Note that sizeof(struct pt_regs) - * has to be a multiple of 8. - */ -struct pt_regs { - unsigned long uregs[34]; -}; - -#define UCreg_asr uregs[32] -#define UCreg_pc uregs[31] -#define UCreg_lr uregs[30] -#define UCreg_sp uregs[29] -#define UCreg_ip uregs[28] -#define UCreg_fp uregs[27] -#define UCreg_26 uregs[26] -#define UCreg_25 uregs[25] -#define UCreg_24 uregs[24] -#define UCreg_23 uregs[23] -#define UCreg_22 uregs[22] -#define UCreg_21 uregs[21] -#define UCreg_20 uregs[20] -#define UCreg_19 uregs[19] -#define UCreg_18 uregs[18] -#define UCreg_17 uregs[17] -#define UCreg_16 uregs[16] -#define UCreg_15 uregs[15] -#define UCreg_14 uregs[14] -#define UCreg_13 uregs[13] -#define UCreg_12 uregs[12] -#define UCreg_11 uregs[11] -#define UCreg_10 uregs[10] -#define UCreg_09 uregs[9] -#define UCreg_08 uregs[8] -#define UCreg_07 uregs[7] -#define UCreg_06 uregs[6] -#define UCreg_05 uregs[5] -#define UCreg_04 uregs[4] -#define UCreg_03 uregs[3] -#define UCreg_02 uregs[2] -#define UCreg_01 uregs[1] -#define UCreg_00 uregs[0] -#define UCreg_ORIG_00 uregs[33] - - -#endif /* __ASSEMBLY__ */ - -#endif /* _UAPI__UNICORE_PTRACE_H__ */ diff --git a/arch/unicore32/include/uapi/asm/sigcontext.h b/arch/unicore32/include/uapi/asm/sigcontext.h deleted file mode 100644 index 79e56f28e4b5..000000000000 --- a/arch/unicore32/include/uapi/asm/sigcontext.h +++ /dev/null @@ -1,30 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * linux/arch/unicore32/include/asm/sigcontext.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#ifndef __UNICORE_SIGCONTEXT_H__ -#define __UNICORE_SIGCONTEXT_H__ - -#include -/* - * Signal context structure - contains all info to do with the state - * before the signal handler was invoked. Note: only add new entries - * to the end of the structure. - */ -struct sigcontext { - unsigned long trap_no; - unsigned long error_code; - unsigned long oldmask; - unsigned long fault_address; - struct pt_regs regs; -}; - -#endif diff --git a/arch/unicore32/include/uapi/asm/unistd.h b/arch/unicore32/include/uapi/asm/unistd.h deleted file mode 100644 index 54a7378a70b1..000000000000 --- a/arch/unicore32/include/uapi/asm/unistd.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * linux/arch/unicore32/include/asm/unistd.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#define __ARCH_WANT_RENAMEAT -#define __ARCH_WANT_SET_GET_RLIMIT -#define __ARCH_WANT_STAT64 -#define __ARCH_WANT_TIME32_SYSCALLS - -/* Use the standard ABI for syscalls. */ -#include -#define __ARCH_WANT_SYS_CLONE diff --git a/arch/unicore32/kernel/Makefile b/arch/unicore32/kernel/Makefile deleted file mode 100644 index 2f79aa56735b..000000000000 --- a/arch/unicore32/kernel/Makefile +++ /dev/null @@ -1,31 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile for the linux kernel. -# - -# Object file lists. -obj-y := dma.o elf.o entry.o process.o ptrace.o -obj-y += setup.o signal.o sys.o stacktrace.o traps.o - -obj-$(CONFIG_MODULES) += ksyms.o module.o -obj-$(CONFIG_EARLY_PRINTK) += early_printk.o - -obj-$(CONFIG_UNICORE_FPU_F64) += fpu-ucf64.o - -# obj-y for architecture PKUnity v3 -obj-$(CONFIG_ARCH_PUV3) += clock.o irq.o time.o - -obj-$(CONFIG_PUV3_GPIO) += gpio.o -obj-$(CONFIG_PUV3_PM) += pm.o sleep.o -obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate_asm.o - -obj-$(CONFIG_PCI) += pci.o - -# obj-y for specific machines -obj-$(CONFIG_ARCH_PUV3) += puv3-core.o -obj-$(CONFIG_PUV3_NB0916) += puv3-nb0916.o - -head-y := head.o -obj-$(CONFIG_DEBUG_LL) += debug.o - -extra-y := $(head-y) vmlinux.lds diff --git a/arch/unicore32/kernel/asm-offsets.c b/arch/unicore32/kernel/asm-offsets.c deleted file mode 100644 index f7d672267549..000000000000 --- a/arch/unicore32/kernel/asm-offsets.c +++ /dev/null @@ -1,108 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/asm-offsets.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * Generate definitions needed by assembly language modules. - * This code generates raw asm output which is post-processed to extract - * and format the required data. - */ -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * GCC 3.0, 3.1: general bad code generation. - * GCC 3.2.0: incorrect function argument offset calculation. - * GCC 3.2.x: miscompiles NEW_AUX_ENT in fs/binfmt_elf.c - * (http://gcc.gnu.org/PR8896) and incorrect structure - * initialisation in fs/jffs2/erase.c - */ -#if (__GNUC__ < 4) -#error Your compiler should upgrade to uc4 -#error Known good compilers: 4.2.2 -#endif - -int main(void) -{ - DEFINE(TSK_ACTIVE_MM, offsetof(struct task_struct, active_mm)); - BLANK(); - DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); - DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); - DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); - DEFINE(TI_TASK, offsetof(struct thread_info, task)); - DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); - DEFINE(TI_CPU_SAVE, offsetof(struct thread_info, cpu_context)); - DEFINE(TI_USED_CP, offsetof(struct thread_info, used_cp)); -#ifdef CONFIG_UNICORE_FPU_F64 - DEFINE(TI_FPSTATE, offsetof(struct thread_info, fpstate)); -#endif - BLANK(); - DEFINE(S_R0, offsetof(struct pt_regs, UCreg_00)); - DEFINE(S_R1, offsetof(struct pt_regs, UCreg_01)); - DEFINE(S_R2, offsetof(struct pt_regs, UCreg_02)); - DEFINE(S_R3, offsetof(struct pt_regs, UCreg_03)); - DEFINE(S_R4, offsetof(struct pt_regs, UCreg_04)); - DEFINE(S_R5, offsetof(struct pt_regs, UCreg_05)); - DEFINE(S_R6, offsetof(struct pt_regs, UCreg_06)); - DEFINE(S_R7, offsetof(struct pt_regs, UCreg_07)); - DEFINE(S_R8, offsetof(struct pt_regs, UCreg_08)); - DEFINE(S_R9, offsetof(struct pt_regs, UCreg_09)); - DEFINE(S_R10, offsetof(struct pt_regs, UCreg_10)); - DEFINE(S_R11, offsetof(struct pt_regs, UCreg_11)); - DEFINE(S_R12, offsetof(struct pt_regs, UCreg_12)); - DEFINE(S_R13, offsetof(struct pt_regs, UCreg_13)); - DEFINE(S_R14, offsetof(struct pt_regs, UCreg_14)); - DEFINE(S_R15, offsetof(struct pt_regs, UCreg_15)); - DEFINE(S_R16, offsetof(struct pt_regs, UCreg_16)); - DEFINE(S_R17, offsetof(struct pt_regs, UCreg_17)); - DEFINE(S_R18, offsetof(struct pt_regs, UCreg_18)); - DEFINE(S_R19, offsetof(struct pt_regs, UCreg_19)); - DEFINE(S_R20, offsetof(struct pt_regs, UCreg_20)); - DEFINE(S_R21, offsetof(struct pt_regs, UCreg_21)); - DEFINE(S_R22, offsetof(struct pt_regs, UCreg_22)); - DEFINE(S_R23, offsetof(struct pt_regs, UCreg_23)); - DEFINE(S_R24, offsetof(struct pt_regs, UCreg_24)); - DEFINE(S_R25, offsetof(struct pt_regs, UCreg_25)); - DEFINE(S_R26, offsetof(struct pt_regs, UCreg_26)); - DEFINE(S_FP, offsetof(struct pt_regs, UCreg_fp)); - DEFINE(S_IP, offsetof(struct pt_regs, UCreg_ip)); - DEFINE(S_SP, offsetof(struct pt_regs, UCreg_sp)); - DEFINE(S_LR, offsetof(struct pt_regs, UCreg_lr)); - DEFINE(S_PC, offsetof(struct pt_regs, UCreg_pc)); - DEFINE(S_PSR, offsetof(struct pt_regs, UCreg_asr)); - DEFINE(S_OLD_R0, offsetof(struct pt_regs, UCreg_ORIG_00)); - DEFINE(S_FRAME_SIZE, sizeof(struct pt_regs)); - BLANK(); - DEFINE(VMA_VM_MM, offsetof(struct vm_area_struct, vm_mm)); - DEFINE(VMA_VM_FLAGS, offsetof(struct vm_area_struct, vm_flags)); - BLANK(); - DEFINE(VM_EXEC, VM_EXEC); - BLANK(); - DEFINE(PAGE_SZ, PAGE_SIZE); - BLANK(); - DEFINE(SYS_ERROR0, 0x9f0000); - BLANK(); - DEFINE(PBE_ADDRESS, offsetof(struct pbe, address)); - DEFINE(PBE_ORIN_ADDRESS, offsetof(struct pbe, orig_address)); - DEFINE(PBE_NEXT, offsetof(struct pbe, next)); - DEFINE(SWSUSP_CPU, offsetof(struct swsusp_arch_regs, \ - cpu_context)); -#ifdef CONFIG_UNICORE_FPU_F64 - DEFINE(SWSUSP_FPSTATE, offsetof(struct swsusp_arch_regs, \ - fpstate)); -#endif - BLANK(); - DEFINE(DMA_BIDIRECTIONAL, DMA_BIDIRECTIONAL); - DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE); - DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE); - return 0; -} diff --git a/arch/unicore32/kernel/clock.c b/arch/unicore32/kernel/clock.c deleted file mode 100644 index 41df6be0a3b2..000000000000 --- a/arch/unicore32/kernel/clock.c +++ /dev/null @@ -1,387 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/clock.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* - * Very simple clock implementation - */ -struct clk { - struct list_head node; - unsigned long rate; - const char *name; -}; - -static struct clk clk_ost_clk = { - .name = "OST_CLK", - .rate = CLOCK_TICK_RATE, -}; - -static struct clk clk_mclk_clk = { - .name = "MAIN_CLK", -}; - -static struct clk clk_bclk32_clk = { - .name = "BUS32_CLK", -}; - -static struct clk clk_ddr_clk = { - .name = "DDR_CLK", -}; - -static struct clk clk_vga_clk = { - .name = "VGA_CLK", -}; - -static LIST_HEAD(clocks); -static DEFINE_MUTEX(clocks_mutex); - -struct clk *clk_get(struct device *dev, const char *id) -{ - struct clk *p, *clk = ERR_PTR(-ENOENT); - - mutex_lock(&clocks_mutex); - list_for_each_entry(p, &clocks, node) { - if (strcmp(id, p->name) == 0) { - clk = p; - break; - } - } - mutex_unlock(&clocks_mutex); - - return clk; -} -EXPORT_SYMBOL(clk_get); - -void clk_put(struct clk *clk) -{ -} -EXPORT_SYMBOL(clk_put); - -int clk_enable(struct clk *clk) -{ - return 0; -} -EXPORT_SYMBOL(clk_enable); - -void clk_disable(struct clk *clk) -{ -} -EXPORT_SYMBOL(clk_disable); - -unsigned long clk_get_rate(struct clk *clk) -{ - return clk->rate; -} -EXPORT_SYMBOL(clk_get_rate); - -struct { - unsigned long rate; - unsigned long cfg; - unsigned long div; -} vga_clk_table[] = { - {.rate = 25175000, .cfg = 0x00002001, .div = 0x9}, - {.rate = 31500000, .cfg = 0x00002001, .div = 0x7}, - {.rate = 40000000, .cfg = 0x00003801, .div = 0x9}, - {.rate = 49500000, .cfg = 0x00003801, .div = 0x7}, - {.rate = 65000000, .cfg = 0x00002c01, .div = 0x4}, - {.rate = 78750000, .cfg = 0x00002400, .div = 0x7}, - {.rate = 108000000, .cfg = 0x00002c01, .div = 0x2}, - {.rate = 106500000, .cfg = 0x00003c01, .div = 0x3}, - {.rate = 50650000, .cfg = 0x00106400, .div = 0x9}, - {.rate = 61500000, .cfg = 0x00106400, .div = 0xa}, - {.rate = 85500000, .cfg = 0x00002800, .div = 0x6}, -}; - -struct { - unsigned long mrate; - unsigned long prate; -} mclk_clk_table[] = { - {.mrate = 500000000, .prate = 0x00109801}, - {.mrate = 525000000, .prate = 0x00104C00}, - {.mrate = 550000000, .prate = 0x00105000}, - {.mrate = 575000000, .prate = 0x00105400}, - {.mrate = 600000000, .prate = 0x00105800}, - {.mrate = 625000000, .prate = 0x00105C00}, - {.mrate = 650000000, .prate = 0x00106000}, - {.mrate = 675000000, .prate = 0x00106400}, - {.mrate = 700000000, .prate = 0x00106800}, - {.mrate = 725000000, .prate = 0x00106C00}, - {.mrate = 750000000, .prate = 0x00107000}, - {.mrate = 775000000, .prate = 0x00107400}, - {.mrate = 800000000, .prate = 0x00107800}, -}; - -int clk_set_rate(struct clk *clk, unsigned long rate) -{ - if (clk == &clk_vga_clk) { - unsigned long pll_vgacfg, pll_vgadiv; - int ret, i; - - /* lookup vga_clk_table */ - ret = -EINVAL; - for (i = 0; i < ARRAY_SIZE(vga_clk_table); i++) { - if (rate == vga_clk_table[i].rate) { - pll_vgacfg = vga_clk_table[i].cfg; - pll_vgadiv = vga_clk_table[i].div; - ret = 0; - break; - } - } - - if (ret) - return ret; - - if (readl(PM_PLLVGACFG) == pll_vgacfg) - return 0; - - /* set pll vga cfg reg. */ - writel(pll_vgacfg, PM_PLLVGACFG); - - writel(PM_PMCR_CFBVGA, PM_PMCR); - while ((readl(PM_PLLDFCDONE) & PM_PLLDFCDONE_VGADFC) - != PM_PLLDFCDONE_VGADFC) - udelay(100); /* about 1ms */ - - /* set div cfg reg. */ - writel(readl(PM_PCGR) | PM_PCGR_VGACLK, PM_PCGR); - - writel((readl(PM_DIVCFG) & ~PM_DIVCFG_VGACLK_MASK) - | PM_DIVCFG_VGACLK(pll_vgadiv), PM_DIVCFG); - - writel(readl(PM_SWRESET) | PM_SWRESET_VGADIV, PM_SWRESET); - while ((readl(PM_SWRESET) & PM_SWRESET_VGADIV) - == PM_SWRESET_VGADIV) - udelay(100); /* 65536 bclk32, about 320us */ - - writel(readl(PM_PCGR) & ~PM_PCGR_VGACLK, PM_PCGR); - } -#ifdef CONFIG_CPU_FREQ - if (clk == &clk_mclk_clk) { - u32 pll_rate, divstatus = readl(PM_DIVSTATUS); - int ret, i; - - /* lookup mclk_clk_table */ - ret = -EINVAL; - for (i = 0; i < ARRAY_SIZE(mclk_clk_table); i++) { - if (rate == mclk_clk_table[i].mrate) { - pll_rate = mclk_clk_table[i].prate; - clk_mclk_clk.rate = mclk_clk_table[i].mrate; - ret = 0; - break; - } - } - - if (ret) - return ret; - - if (clk_mclk_clk.rate) - clk_bclk32_clk.rate = clk_mclk_clk.rate - / (((divstatus & 0x0000f000) >> 12) + 1); - - /* set pll sys cfg reg. */ - writel(pll_rate, PM_PLLSYSCFG); - - writel(PM_PMCR_CFBSYS, PM_PMCR); - while ((readl(PM_PLLDFCDONE) & PM_PLLDFCDONE_SYSDFC) - != PM_PLLDFCDONE_SYSDFC) - udelay(100); - /* about 1ms */ - } -#endif - return 0; -} -EXPORT_SYMBOL(clk_set_rate); - -int clk_register(struct clk *clk) -{ - mutex_lock(&clocks_mutex); - list_add(&clk->node, &clocks); - mutex_unlock(&clocks_mutex); - printk(KERN_DEFAULT "PKUnity PM: %s %lu.%02luM\n", clk->name, - (clk->rate)/1000000, (clk->rate)/10000 % 100); - return 0; -} -EXPORT_SYMBOL(clk_register); - -void clk_unregister(struct clk *clk) -{ - mutex_lock(&clocks_mutex); - list_del(&clk->node); - mutex_unlock(&clocks_mutex); -} -EXPORT_SYMBOL(clk_unregister); - -struct { - unsigned long prate; - unsigned long rate; -} pllrate_table[] = { - {.prate = 0x00002001, .rate = 250000000}, - {.prate = 0x00104801, .rate = 250000000}, - {.prate = 0x00104C01, .rate = 262500000}, - {.prate = 0x00002401, .rate = 275000000}, - {.prate = 0x00105001, .rate = 275000000}, - {.prate = 0x00105401, .rate = 287500000}, - {.prate = 0x00002801, .rate = 300000000}, - {.prate = 0x00105801, .rate = 300000000}, - {.prate = 0x00105C01, .rate = 312500000}, - {.prate = 0x00002C01, .rate = 325000000}, - {.prate = 0x00106001, .rate = 325000000}, - {.prate = 0x00106401, .rate = 337500000}, - {.prate = 0x00003001, .rate = 350000000}, - {.prate = 0x00106801, .rate = 350000000}, - {.prate = 0x00106C01, .rate = 362500000}, - {.prate = 0x00003401, .rate = 375000000}, - {.prate = 0x00107001, .rate = 375000000}, - {.prate = 0x00107401, .rate = 387500000}, - {.prate = 0x00003801, .rate = 400000000}, - {.prate = 0x00107801, .rate = 400000000}, - {.prate = 0x00107C01, .rate = 412500000}, - {.prate = 0x00003C01, .rate = 425000000}, - {.prate = 0x00108001, .rate = 425000000}, - {.prate = 0x00108401, .rate = 437500000}, - {.prate = 0x00004001, .rate = 450000000}, - {.prate = 0x00108801, .rate = 450000000}, - {.prate = 0x00108C01, .rate = 462500000}, - {.prate = 0x00004401, .rate = 475000000}, - {.prate = 0x00109001, .rate = 475000000}, - {.prate = 0x00109401, .rate = 487500000}, - {.prate = 0x00004801, .rate = 500000000}, - {.prate = 0x00109801, .rate = 500000000}, - {.prate = 0x00104C00, .rate = 525000000}, - {.prate = 0x00002400, .rate = 550000000}, - {.prate = 0x00105000, .rate = 550000000}, - {.prate = 0x00105400, .rate = 575000000}, - {.prate = 0x00002800, .rate = 600000000}, - {.prate = 0x00105800, .rate = 600000000}, - {.prate = 0x00105C00, .rate = 625000000}, - {.prate = 0x00002C00, .rate = 650000000}, - {.prate = 0x00106000, .rate = 650000000}, - {.prate = 0x00106400, .rate = 675000000}, - {.prate = 0x00003000, .rate = 700000000}, - {.prate = 0x00106800, .rate = 700000000}, - {.prate = 0x00106C00, .rate = 725000000}, - {.prate = 0x00003400, .rate = 750000000}, - {.prate = 0x00107000, .rate = 750000000}, - {.prate = 0x00107400, .rate = 775000000}, - {.prate = 0x00003800, .rate = 800000000}, - {.prate = 0x00107800, .rate = 800000000}, - {.prate = 0x00107C00, .rate = 825000000}, - {.prate = 0x00003C00, .rate = 850000000}, - {.prate = 0x00108000, .rate = 850000000}, - {.prate = 0x00108400, .rate = 875000000}, - {.prate = 0x00004000, .rate = 900000000}, - {.prate = 0x00108800, .rate = 900000000}, - {.prate = 0x00108C00, .rate = 925000000}, - {.prate = 0x00004400, .rate = 950000000}, - {.prate = 0x00109000, .rate = 950000000}, - {.prate = 0x00109400, .rate = 975000000}, - {.prate = 0x00004800, .rate = 1000000000}, - {.prate = 0x00109800, .rate = 1000000000}, -}; - -struct { - unsigned long prate; - unsigned long drate; -} pddr_table[] = { - {.prate = 0x00100800, .drate = 44236800}, - {.prate = 0x00100C00, .drate = 66355200}, - {.prate = 0x00101000, .drate = 88473600}, - {.prate = 0x00101400, .drate = 110592000}, - {.prate = 0x00101800, .drate = 132710400}, - {.prate = 0x00101C01, .drate = 154828800}, - {.prate = 0x00102001, .drate = 176947200}, - {.prate = 0x00102401, .drate = 199065600}, - {.prate = 0x00102801, .drate = 221184000}, - {.prate = 0x00102C01, .drate = 243302400}, - {.prate = 0x00103001, .drate = 265420800}, - {.prate = 0x00103401, .drate = 287539200}, - {.prate = 0x00103801, .drate = 309657600}, - {.prate = 0x00103C01, .drate = 331776000}, - {.prate = 0x00104001, .drate = 353894400}, -}; - -static int __init clk_init(void) -{ -#ifdef CONFIG_PUV3_PM - u32 pllrate, divstatus = readl(PM_DIVSTATUS); - u32 pcgr_val = readl(PM_PCGR); - int i; - - pcgr_val |= PM_PCGR_BCLKMME | PM_PCGR_BCLKH264E | PM_PCGR_BCLKH264D - | PM_PCGR_HECLK | PM_PCGR_HDCLK; - writel(pcgr_val, PM_PCGR); - - pllrate = readl(PM_PLLSYSSTATUS); - - /* lookup pmclk_table */ - clk_mclk_clk.rate = 0; - for (i = 0; i < ARRAY_SIZE(pllrate_table); i++) { - if (pllrate == pllrate_table[i].prate) { - clk_mclk_clk.rate = pllrate_table[i].rate; - break; - } - } - - if (clk_mclk_clk.rate) - clk_bclk32_clk.rate = clk_mclk_clk.rate / - (((divstatus & 0x0000f000) >> 12) + 1); - - pllrate = readl(PM_PLLDDRSTATUS); - - /* lookup pddr_table */ - clk_ddr_clk.rate = 0; - for (i = 0; i < ARRAY_SIZE(pddr_table); i++) { - if (pllrate == pddr_table[i].prate) { - clk_ddr_clk.rate = pddr_table[i].drate; - break; - } - } - - pllrate = readl(PM_PLLVGASTATUS); - - /* lookup pvga_table */ - clk_vga_clk.rate = 0; - for (i = 0; i < ARRAY_SIZE(pllrate_table); i++) { - if (pllrate == pllrate_table[i].prate) { - clk_vga_clk.rate = pllrate_table[i].rate; - break; - } - } - - if (clk_vga_clk.rate) - clk_vga_clk.rate = clk_vga_clk.rate / - (((divstatus & 0x00f00000) >> 20) + 1); - - clk_register(&clk_vga_clk); -#endif -#ifdef CONFIG_ARCH_FPGA - clk_ddr_clk.rate = 33000000; - clk_mclk_clk.rate = 33000000; - clk_bclk32_clk.rate = 33000000; -#endif - clk_register(&clk_ddr_clk); - clk_register(&clk_mclk_clk); - clk_register(&clk_bclk32_clk); - clk_register(&clk_ost_clk); - return 0; -} -core_initcall(clk_init); diff --git a/arch/unicore32/kernel/debug-macro.S b/arch/unicore32/kernel/debug-macro.S deleted file mode 100644 index 7e2da0de4f71..000000000000 --- a/arch/unicore32/kernel/debug-macro.S +++ /dev/null @@ -1,86 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/kernel/debug-macro.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * Debugging macro include header - */ -#include -#include - - .macro put_word_ocd, rd, rx=r16 -1001: movc \rx, p1.c0, #0 - cand.a \rx, #2 - bne 1001b - movc p1.c1, \rd, #1 - .endm - -#ifdef CONFIG_DEBUG_OCD - /* debug using UniCore On-Chip-Debugger */ - .macro addruart, rx - .endm - - .macro senduart, rd, rx - put_word_ocd \rd, \rx - .endm - - .macro busyuart, rd, rx - .endm - - .macro waituart, rd, rx - .endm -#else -#define UART_CLK_DEFAULT 3686400 * 20 - /* Uartclk = MCLK/ 2, The MCLK on my board is 3686400 * 40 */ -#define BAUD_RATE_DEFAULT 115200 - /* The baud rate of the serial port */ - -#define UART_DIVISOR_DEFAULT (UART_CLK_DEFAULT \ - / (16 * BAUD_RATE_DEFAULT) - 1) - - .macro addruart,rx - mrc p0, #0, \rx, c1, c0 - tst \rx, #1 @ MMU enabled? - moveq \rx, #0xee000000 @ physical base address - movne \rx, #0x6e000000 @ virtual address - - @ We probe for the active serial port here - @ However, now we assume UART0 is active: epip4d - @ We assume r1 and r2 can be clobbered. - - movl r2, #UART_DIVISOR_DEFAULT - mov r1, #0x80 - str r1, [\rx, #UART_LCR_OFFSET] - and r1, r2, #0xff00 - mov r1, r1, lsr #8 - str r1, [\rx, #UART_DLH_OFFSET] - and r1, r2, #0xff - str r1, [\rx, #UART_DLL_OFFSET] - mov r1, #0x7 - str r1, [\rx, #UART_FCR_OFFSET] - mov r1, #0x3 - str r1, [\rx, #UART_LCR_OFFSET] - mov r1, #0x0 - str r1, [\rx, #UART_IER_OFFSET] - .endm - - .macro senduart,rd,rx - str \rd, [\rx, #UART_THR_OFFSET] - .endm - - .macro waituart,rd,rx -1001: ldr \rd, [\rx, #UART_LSR_OFFSET] - tst \rd, #UART_LSR_THRE - beq 1001b - .endm - - .macro busyuart,rd,rx -1001: ldr \rd, [\rx, #UART_LSR_OFFSET] - tst \rd, #UART_LSR_TEMT - bne 1001b - .endm -#endif - diff --git a/arch/unicore32/kernel/debug.S b/arch/unicore32/kernel/debug.S deleted file mode 100644 index 13bc8c8550e4..000000000000 --- a/arch/unicore32/kernel/debug.S +++ /dev/null @@ -1,82 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/kernel/debug.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * 32-bit debugging code - */ -#include -#include - - .text - -/* - * Some debugging routines (useful if you've got MM problems and - * printk isn't working). For DEBUGGING ONLY!!! Do not leave - * references to these in a production kernel! - */ -#include "debug-macro.S" - -/* - * Useful debugging routines - */ -ENTRY(printhex8) - mov r1, #8 - b printhex -ENDPROC(printhex8) - -ENTRY(printhex4) - mov r1, #4 - b printhex -ENDPROC(printhex4) - -ENTRY(printhex2) - mov r1, #2 -printhex: adr r2, hexbuf - add r3, r2, r1 - mov r1, #0 - stb r1, [r3] -1: and r1, r0, #15 - mov r0, r0 >> #4 - csub.a r1, #10 - beg 2f - add r1, r1, #'0' - 'a' + 10 -2: add r1, r1, #'a' - 10 - stb.w r1, [r3+], #-1 - cxor.a r3, r2 - bne 1b - mov r0, r2 - b printascii -ENDPROC(printhex2) - - .ltorg - -ENTRY(printascii) - addruart r3 - b 2f -1: waituart r2, r3 - senduart r1, r3 - busyuart r2, r3 - cxor.a r1, #'\n' - cmoveq r1, #'\r' - beq 1b -2: cxor.a r0, #0 - beq 3f - ldb.w r1, [r0]+, #1 - cxor.a r1, #0 - bne 1b -3: mov pc, lr -ENDPROC(printascii) - -ENTRY(printch) - addruart r3 - mov r1, r0 - mov r0, #0 - b 1b -ENDPROC(printch) - -hexbuf: .space 16 - diff --git a/arch/unicore32/kernel/dma.c b/arch/unicore32/kernel/dma.c deleted file mode 100644 index 7a0e2d4d6077..000000000000 --- a/arch/unicore32/kernel/dma.c +++ /dev/null @@ -1,179 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/dma.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -struct dma_channel { - char *name; - puv3_dma_prio prio; - void (*irq_handler)(int, void *); - void (*err_handler)(int, void *); - void *data; -}; - -static struct dma_channel dma_channels[MAX_DMA_CHANNELS]; - -int puv3_request_dma(char *name, puv3_dma_prio prio, - void (*irq_handler)(int, void *), - void (*err_handler)(int, void *), - void *data) -{ - unsigned long flags; - int i, found = 0; - - /* basic sanity checks */ - if (!name) - return -EINVAL; - - local_irq_save(flags); - - do { - /* try grabbing a DMA channel with the requested priority */ - for (i = 0; i < MAX_DMA_CHANNELS; i++) { - if ((dma_channels[i].prio == prio) && - !dma_channels[i].name) { - found = 1; - break; - } - } - /* if requested prio group is full, try a hier priority */ - } while (!found && prio--); - - if (found) { - dma_channels[i].name = name; - dma_channels[i].irq_handler = irq_handler; - dma_channels[i].err_handler = err_handler; - dma_channels[i].data = data; - } else { - printk(KERN_WARNING "No more available DMA channels for %s\n", - name); - i = -ENODEV; - } - - local_irq_restore(flags); - return i; -} -EXPORT_SYMBOL(puv3_request_dma); - -void puv3_free_dma(int dma_ch) -{ - unsigned long flags; - - if (!dma_channels[dma_ch].name) { - printk(KERN_CRIT - "%s: trying to free channel %d which is already freed\n", - __func__, dma_ch); - return; - } - - local_irq_save(flags); - dma_channels[dma_ch].name = NULL; - dma_channels[dma_ch].err_handler = NULL; - local_irq_restore(flags); -} -EXPORT_SYMBOL(puv3_free_dma); - -static irqreturn_t dma_irq_handler(int irq, void *dev_id) -{ - int i, dint; - - dint = readl(DMAC_ITCSR); - for (i = 0; i < MAX_DMA_CHANNELS; i++) { - if (dint & DMAC_CHANNEL(i)) { - struct dma_channel *channel = &dma_channels[i]; - - /* Clear TC interrupt of channel i */ - writel(DMAC_CHANNEL(i), DMAC_ITCCR); - writel(0, DMAC_ITCCR); - - if (channel->name && channel->irq_handler) { - channel->irq_handler(i, channel->data); - } else { - /* - * IRQ for an unregistered DMA channel: - * let's clear the interrupts and disable it. - */ - printk(KERN_WARNING "spurious IRQ for" - " DMA channel %d\n", i); - } - } - } - return IRQ_HANDLED; -} - -static irqreturn_t dma_err_handler(int irq, void *dev_id) -{ - int i, dint; - - dint = readl(DMAC_IESR); - for (i = 0; i < MAX_DMA_CHANNELS; i++) { - if (dint & DMAC_CHANNEL(i)) { - struct dma_channel *channel = &dma_channels[i]; - - /* Clear Err interrupt of channel i */ - writel(DMAC_CHANNEL(i), DMAC_IECR); - writel(0, DMAC_IECR); - - if (channel->name && channel->err_handler) { - channel->err_handler(i, channel->data); - } else { - /* - * IRQ for an unregistered DMA channel: - * let's clear the interrupts and disable it. - */ - printk(KERN_WARNING "spurious IRQ for" - " DMA channel %d\n", i); - } - } - } - return IRQ_HANDLED; -} - -int __init puv3_init_dma(void) -{ - int i, ret; - - /* dma channel priorities on v8 processors: - * ch 0 - 1 <--> (0) DMA_PRIO_HIGH - * ch 2 - 3 <--> (1) DMA_PRIO_MEDIUM - * ch 4 - 5 <--> (2) DMA_PRIO_LOW - */ - for (i = 0; i < MAX_DMA_CHANNELS; i++) { - puv3_stop_dma(i); - dma_channels[i].name = NULL; - dma_channels[i].prio = min((i & 0x7) >> 1, DMA_PRIO_LOW); - } - - ret = request_irq(IRQ_DMA, dma_irq_handler, 0, "DMA", NULL); - if (ret) { - printk(KERN_CRIT "Can't register IRQ for DMA\n"); - return ret; - } - - ret = request_irq(IRQ_DMAERR, dma_err_handler, 0, "DMAERR", NULL); - if (ret) { - printk(KERN_CRIT "Can't register IRQ for DMAERR\n"); - free_irq(IRQ_DMA, "DMA"); - return ret; - } - - return 0; -} - -postcore_initcall(puv3_init_dma); diff --git a/arch/unicore32/kernel/early_printk.c b/arch/unicore32/kernel/early_printk.c deleted file mode 100644 index c00b6712b8f7..000000000000 --- a/arch/unicore32/kernel/early_printk.c +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/early_printk.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include - -/* On-Chip-Debugger functions */ - -static void early_ocd_write(struct console *con, const char *s, unsigned n) -{ - while (*s && n-- > 0) { - if (*s == '\n') - ocd_putc((int)'\r'); - ocd_putc((int)*s); - s++; - } -} - -static struct console early_ocd_console = { - .name = "earlyocd", - .write = early_ocd_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - -static int __init setup_early_printk(char *buf) -{ - if (!buf || early_console) - return 0; - - early_console = &early_ocd_console; - if (strstr(buf, "keep")) - early_console->flags &= ~CON_BOOT; - else - early_console->flags |= CON_BOOT; - register_console(early_console); - return 0; -} -early_param("earlyprintk", setup_early_printk); diff --git a/arch/unicore32/kernel/elf.c b/arch/unicore32/kernel/elf.c deleted file mode 100644 index 22adc65a03e9..000000000000 --- a/arch/unicore32/kernel/elf.c +++ /dev/null @@ -1,35 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/elf.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include - -int elf_check_arch(const struct elf32_hdr *x) -{ - /* Make sure it's an UniCore executable */ - if (x->e_machine != EM_UNICORE) - return 0; - - /* Make sure the entry address is reasonable */ - if (x->e_entry & 3) - return 0; - - return 1; -} -EXPORT_SYMBOL(elf_check_arch); - -void elf_set_personality(const struct elf32_hdr *x) -{ - unsigned int personality = PER_LINUX; - - set_personality(personality); -} -EXPORT_SYMBOL(elf_set_personality); diff --git a/arch/unicore32/kernel/entry.S b/arch/unicore32/kernel/entry.S deleted file mode 100644 index b35dc83069cb..000000000000 --- a/arch/unicore32/kernel/entry.S +++ /dev/null @@ -1,802 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/kernel/entry.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * Low-level vector interface routines - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include "debug-macro.S" - -@ -@ Most of the stack format comes from struct pt_regs, but with -@ the addition of 8 bytes for storing syscall args 5 and 6. -@ -#define S_OFF 8 - -/* - * The SWI code relies on the fact that R0 is at the bottom of the stack - * (due to slow/fast restore user regs). - */ -#if S_R0 != 0 -#error "Please fix" -#endif - - .macro zero_fp -#ifdef CONFIG_FRAME_POINTER - mov fp, #0 -#endif - .endm - - .macro alignment_trap, rtemp -#ifdef CONFIG_ALIGNMENT_TRAP - ldw \rtemp, .LCcralign - ldw \rtemp, [\rtemp] - movc p0.c1, \rtemp, #0 -#endif - .endm - - .macro load_user_sp_lr, rd, rtemp, offset = 0 - mov \rtemp, asr - xor \rtemp, \rtemp, #(PRIV_MODE ^ SUSR_MODE) - mov.a asr, \rtemp @ switch to the SUSR mode - - ldw sp, [\rd+], #\offset @ load sp_user - ldw lr, [\rd+], #\offset + 4 @ load lr_user - - xor \rtemp, \rtemp, #(PRIV_MODE ^ SUSR_MODE) - mov.a asr, \rtemp @ switch back to the PRIV mode - .endm - - .macro priv_exit, rpsr - mov.a bsr, \rpsr - ldm.w (r0 - r15), [sp]+ - ldm.b (r16 - pc), [sp]+ @ load r0 - pc, asr - .endm - - .macro restore_user_regs, fast = 0, offset = 0 - ldw r1, [sp+], #\offset + S_PSR @ get calling asr - ldw lr, [sp+], #\offset + S_PC @ get pc - mov.a bsr, r1 @ save in bsr_priv - .if \fast - add sp, sp, #\offset + S_R1 @ r0 is syscall return value - ldm.w (r1 - r15), [sp]+ @ get calling r1 - r15 - ldur (r16 - lr), [sp]+ @ get calling r16 - lr - .else - ldm.w (r0 - r15), [sp]+ @ get calling r0 - r15 - ldur (r16 - lr), [sp]+ @ get calling r16 - lr - .endif - nop - add sp, sp, #S_FRAME_SIZE - S_R16 - mov.a pc, lr @ return - @ and move bsr_priv into asr - .endm - - .macro get_thread_info, rd - mov \rd, sp >> #13 - mov \rd, \rd << #13 - .endm - - .macro get_irqnr_and_base, irqnr, irqstat, base, tmp - ldw \base, =(PKUNITY_INTC_BASE) - ldw \irqstat, [\base+], #0xC @ INTC_ICIP - ldw \tmp, [\base+], #0x4 @ INTC_ICMR - and.a \irqstat, \irqstat, \tmp - beq 1001f - cntlz \irqnr, \irqstat - rsub \irqnr, \irqnr, #31 -1001: /* EQ will be set if no irqs pending */ - .endm - -#ifdef CONFIG_DEBUG_LL - .macro printreg, reg, temp - adr \temp, 901f - stm (r0-r3), [\temp]+ - stw lr, [\temp+], #0x10 - mov r0, \reg - b.l printhex8 - mov r0, #':' - b.l printch - mov r0, pc - b.l printhex8 - adr r0, 902f - b.l printascii - adr \temp, 901f - ldm (r0-r3), [\temp]+ - ldw lr, [\temp+], #0x10 - b 903f -901: .word 0, 0, 0, 0, 0 @ r0-r3, lr -902: .asciz ": epip4d\n" - .align -903: - .endm -#endif - -/* - * These are the registers used in the syscall handler, and allow us to - * have in theory up to 7 arguments to a function - r0 to r6. - * - * Note that tbl == why is intentional. - * - * We must set at least "tsk" and "why" when calling ret_with_reschedule. - */ -scno .req r21 @ syscall number -tbl .req r22 @ syscall table pointer -why .req r22 @ Linux syscall (!= 0) -tsk .req r23 @ current thread_info - -/* - * Interrupt handling. Preserves r17, r18, r19 - */ - .macro intr_handler -1: get_irqnr_and_base r0, r6, r5, lr - beq 2f - mov r1, sp - @ - @ routine called with r0 = irq number, r1 = struct pt_regs * - @ - adr lr, 1b - b asm_do_IRQ -2: - .endm - -/* - * PRIV mode handlers - */ - .macro priv_entry - sub sp, sp, #(S_FRAME_SIZE - 4) - stm (r1 - r15), [sp]+ - add r5, sp, #S_R15 - stm (r16 - r28), [r5]+ - - ldm (r1 - r3), [r0]+ - add r5, sp, #S_SP - 4 @ here for interlock avoidance - mov r4, #-1 @ "" "" "" "" - add r0, sp, #(S_FRAME_SIZE - 4) - stw.w r1, [sp+], #-4 @ save the "real" r0 copied - @ from the exception stack - - mov r1, lr - - @ - @ We are now ready to fill in the remaining blanks on the stack: - @ - @ r0 - sp_priv - @ r1 - lr_priv - @ r2 - lr_, already fixed up for correct return/restart - @ r3 - bsr_ - @ r4 - orig_r0 (see pt_regs definition in ptrace.h) - @ - stm (r0 - r4), [r5]+ - .endm - -/* - * User mode handlers - * - */ - .macro user_entry - sub sp, sp, #S_FRAME_SIZE - stm (r1 - r15), [sp+] - add r4, sp, #S_R16 - stm (r16 - r28), [r4]+ - - ldm (r1 - r3), [r0]+ - add r0, sp, #S_PC @ here for interlock avoidance - mov r4, #-1 @ "" "" "" "" - - stw r1, [sp] @ save the "real" r0 copied - @ from the exception stack - - @ - @ We are now ready to fill in the remaining blanks on the stack: - @ - @ r2 - lr_, already fixed up for correct return/restart - @ r3 - bsr_ - @ r4 - orig_r0 (see pt_regs definition in ptrace.h) - @ - @ Also, separately save sp_user and lr_user - @ - stm (r2 - r4), [r0]+ - stur (sp, lr), [r0-] - - @ - @ Enable the alignment trap while in kernel mode - @ - alignment_trap r0 - - @ - @ Clear FP to mark the first stack frame - @ - zero_fp - .endm - - .text - -@ -@ __invalid - generic code for failed exception -@ (re-entrant version of handlers) -@ -__invalid: - sub sp, sp, #S_FRAME_SIZE - stm (r1 - r15), [sp+] - add r1, sp, #S_R16 - stm (r16 - r28, sp, lr), [r1]+ - - zero_fp - - ldm (r4 - r6), [r0]+ - add r0, sp, #S_PC @ here for interlock avoidance - mov r7, #-1 @ "" "" "" "" - stw r4, [sp] @ save preserved r0 - stm (r5 - r7), [r0]+ @ lr_, - @ asr_, "old_r0" - - mov r0, sp - mov r1, asr - b bad_mode -ENDPROC(__invalid) - - .align 5 -__dabt_priv: - priv_entry - - @ - @ get ready to re-enable interrupts if appropriate - @ - mov r17, asr - cand.a r3, #PSR_I_BIT - bne 1f - andn r17, r17, #PSR_I_BIT -1: - - @ - @ Call the processor-specific abort handler: - @ - @ r2 - aborted context pc - @ r3 - aborted context asr - @ - @ The abort handler must return the aborted address in r0, and - @ the fault status register in r1. - @ - movc r1, p0.c3, #0 @ get FSR - movc r0, p0.c4, #0 @ get FAR - - @ - @ set desired INTR state, then call main handler - @ - mov.a asr, r17 - mov r2, sp - b.l do_DataAbort - - @ - @ INTRs off again before pulling preserved data off the stack - @ - disable_irq r0 - - @ - @ restore BSR and restart the instruction - @ - ldw r2, [sp+], #S_PSR - priv_exit r2 @ return from exception -ENDPROC(__dabt_priv) - - .align 5 -__intr_priv: - priv_entry - - intr_handler - - mov r0, #0 @ epip4d - movc p0.c5, r0, #14 - nop; nop; nop; nop; nop; nop; nop; nop - - ldw r4, [sp+], #S_PSR @ irqs are already disabled - - priv_exit r4 @ return from exception -ENDPROC(__intr_priv) - - .ltorg - - .align 5 -__extn_priv: - priv_entry - - mov r0, sp @ struct pt_regs *regs - mov r1, asr - b bad_mode @ not supported -ENDPROC(__extn_priv) - - .align 5 -__pabt_priv: - priv_entry - - @ - @ re-enable interrupts if appropriate - @ - mov r17, asr - cand.a r3, #PSR_I_BIT - bne 1f - andn r17, r17, #PSR_I_BIT -1: - - @ - @ set args, then call main handler - @ - @ r0 - address of faulting instruction - @ r1 - pointer to registers on stack - @ - mov r0, r2 @ pass address of aborted instruction - mov r1, #5 - mov.a asr, r17 - mov r2, sp @ regs - b.l do_PrefetchAbort @ call abort handler - - @ - @ INTRs off again before pulling preserved data off the stack - @ - disable_irq r0 - - @ - @ restore BSR and restart the instruction - @ - ldw r2, [sp+], #S_PSR - priv_exit r2 @ return from exception -ENDPROC(__pabt_priv) - - .align 5 -.LCcralign: - .word cr_alignment - - .align 5 -__dabt_user: - user_entry - -#ifdef CONFIG_UNICORE_FPU_F64 - cff ip, s31 - cand.a ip, #0x08000000 @ FPU execption traps? - beq 209f - - ldw ip, [sp+], #S_PC - add ip, ip, #4 - stw ip, [sp+], #S_PC - @ - @ fall through to the emulation code, which returns using r19 if - @ it has emulated the instruction, or the more conventional lr - @ if we are to treat this as a real extended instruction - @ - @ r0 - instruction - @ -1: ldw.u r0, [r2] - adr r19, ret_from_exception - adr lr, 209f - @ - @ fallthrough to call do_uc_f64 - @ -/* - * Check whether the instruction is a co-processor instruction. - * If yes, we need to call the relevant co-processor handler. - * - * Note that we don't do a full check here for the co-processor - * instructions; all instructions with bit 27 set are well - * defined. The only instructions that should fault are the - * co-processor instructions. - * - * Emulators may wish to make use of the following registers: - * r0 = instruction opcode. - * r2 = PC - * r19 = normal "successful" return address - * r20 = this threads thread_info structure. - * lr = unrecognised instruction return address - */ - get_thread_info r20 @ get current thread - and r8, r0, #0x00003c00 @ mask out CP number - mov r7, #1 - stb r7, [r20+], #TI_USED_CP + 2 @ set appropriate used_cp[] - - @ F64 hardware support entry point. - @ r0 = faulted instruction - @ r19 = return address - @ r20 = fp_state - enable_irq r4 - add r20, r20, #TI_FPSTATE @ r20 = workspace - cff r1, s31 @ get fpu FPSCR - andn r2, r1, #0x08000000 - ctf r2, s31 @ clear 27 bit - mov r2, sp @ nothing stacked - regdump is at TOS - mov lr, r19 @ setup for a return to the user code - - @ Now call the C code to package up the bounce to the support code - @ r0 holds the trigger instruction - @ r1 holds the FPSCR value - @ r2 pointer to register dump - b ucf64_exchandler -209: -#endif - @ - @ Call the processor-specific abort handler: - @ - @ r2 - aborted context pc - @ r3 - aborted context asr - @ - @ The abort handler must return the aborted address in r0, and - @ the fault status register in r1. - @ - movc r1, p0.c3, #0 @ get FSR - movc r0, p0.c4, #0 @ get FAR - - @ - @ INTRs on, then call the main handler - @ - enable_irq r2 - mov r2, sp - adr lr, ret_from_exception - b do_DataAbort -ENDPROC(__dabt_user) - - .align 5 -__intr_user: - user_entry - - get_thread_info tsk - - intr_handler - - mov why, #0 - b ret_to_user -ENDPROC(__intr_user) - - .ltorg - - .align 5 -__extn_user: - user_entry - - mov r0, sp - mov r1, asr - b bad_mode -ENDPROC(__extn_user) - - .align 5 -__pabt_user: - user_entry - - mov r0, r2 @ pass address of aborted instruction. - mov r1, #5 - enable_irq r1 @ Enable interrupts - mov r2, sp @ regs - b.l do_PrefetchAbort @ call abort handler - /* fall through */ -/* - * This is the return code to user mode for abort handlers - */ -ENTRY(ret_from_exception) - get_thread_info tsk - mov why, #0 - b ret_to_user -ENDPROC(__pabt_user) -ENDPROC(ret_from_exception) - -/* - * Register switch for UniCore V2 processors - * r0 = previous task_struct, r1 = previous thread_info, r2 = next thread_info - * previous and next are guaranteed not to be the same. - */ -ENTRY(__switch_to) - add ip, r1, #TI_CPU_SAVE - stm.w (r4 - r15), [ip]+ - stm.w (r16 - r27, sp, lr), [ip]+ - -#ifdef CONFIG_UNICORE_FPU_F64 - add ip, r1, #TI_FPSTATE - sfm.w (f0 - f7 ), [ip]+ - sfm.w (f8 - f15), [ip]+ - sfm.w (f16 - f23), [ip]+ - sfm.w (f24 - f31), [ip]+ - cff r4, s31 - stw r4, [ip] - - add ip, r2, #TI_FPSTATE - lfm.w (f0 - f7 ), [ip]+ - lfm.w (f8 - f15), [ip]+ - lfm.w (f16 - f23), [ip]+ - lfm.w (f24 - f31), [ip]+ - ldw r4, [ip] - ctf r4, s31 -#endif - add ip, r2, #TI_CPU_SAVE - ldm.w (r4 - r15), [ip]+ - ldm (r16 - r27, sp, pc), [ip]+ @ Load all regs saved previously -ENDPROC(__switch_to) - - .align 5 -/* - * This is the fast syscall return path. We do as little as - * possible here, and this includes saving r0 back into the PRIV - * stack. - */ -ret_fast_syscall: - disable_irq r1 @ disable interrupts - ldw r1, [tsk+], #TI_FLAGS - cand.a r1, #_TIF_WORK_MASK - bne fast_work_pending - - @ fast_restore_user_regs - restore_user_regs fast = 1, offset = S_OFF - -/* - * Ok, we need to do extra processing, enter the slow path. - */ -fast_work_pending: - stw.w r0, [sp+], #S_R0+S_OFF @ returned r0 -work_pending: - cand.a r1, #_TIF_NEED_RESCHED - bne work_resched - mov r0, sp @ 'regs' - mov r2, why @ 'syscall' - cand.a r1, #_TIF_SIGPENDING @ delivering a signal? - cmovne why, #0 @ prevent further restarts - b.l do_notify_resume - b ret_slow_syscall @ Check work again - -work_resched: - b.l schedule -/* - * "slow" syscall return path. "why" tells us if this was a real syscall. - */ -ENTRY(ret_to_user) -ret_slow_syscall: - disable_irq r1 @ disable interrupts - get_thread_info tsk @ epip4d, one path error?! - ldw r1, [tsk+], #TI_FLAGS - cand.a r1, #_TIF_WORK_MASK - bne work_pending -no_work_pending: - @ slow_restore_user_regs - restore_user_regs fast = 0, offset = 0 -ENDPROC(ret_to_user) - -/* - * This is how we return from a fork. - */ -ENTRY(ret_from_fork) - b.l schedule_tail - b ret_slow_syscall -ENDPROC(ret_from_fork) - -ENTRY(ret_from_kernel_thread) - b.l schedule_tail - mov r0, r5 - adr lr, ret_slow_syscall - mov pc, r4 -ENDPROC(ret_from_kernel_thread) - -/*============================================================================= - * SWI handler - *----------------------------------------------------------------------------- - */ - .align 5 -ENTRY(vector_swi) - sub sp, sp, #S_FRAME_SIZE - stm (r0 - r15), [sp]+ @ Calling r0 - r15 - add r8, sp, #S_R16 - stm (r16 - r28), [r8]+ @ Calling r16 - r28 - add r8, sp, #S_PC - stur (sp, lr), [r8-] @ Calling sp, lr - mov r8, bsr @ called from non-REAL mode - stw lr, [sp+], #S_PC @ Save calling PC - stw r8, [sp+], #S_PSR @ Save ASR - stw r0, [sp+], #S_OLD_R0 @ Save OLD_R0 - zero_fp - - /* - * Get the system call number. - */ - sub ip, lr, #4 - ldw.u scno, [ip] @ get SWI instruction - -#ifdef CONFIG_ALIGNMENT_TRAP - ldw ip, __cr_alignment - ldw ip, [ip] - movc p0.c1, ip, #0 @ update control register -#endif - enable_irq ip - - get_thread_info tsk - ldw tbl, =sys_call_table @ load syscall table pointer - - andn scno, scno, #0xff000000 @ mask off SWI op-code - andn scno, scno, #0x00ff0000 @ mask off SWI op-code - - stm.w (r4, r5), [sp-] @ push fifth and sixth args - ldw ip, [tsk+], #TI_FLAGS @ check for syscall tracing - cand.a ip, #_TIF_SYSCALL_TRACE @ are we tracing syscalls? - bne __sys_trace - - csub.a scno, #__NR_syscalls @ check upper syscall limit - adr lr, ret_fast_syscall @ return address - bea 1f - ldw pc, [tbl+], scno << #2 @ call sys_* routine -1: - add r1, sp, #S_OFF -2: mov why, #0 @ no longer a real syscall - b sys_ni_syscall @ not private func - - /* - * This is the really slow path. We're going to be doing - * context switches, and waiting for our parent to respond. - */ -__sys_trace: - mov r2, scno - add r1, sp, #S_OFF - mov r0, #0 @ trace entry [IP = 0] - b.l syscall_trace - - adr lr, __sys_trace_return @ return address - mov scno, r0 @ syscall number (possibly new) - add r1, sp, #S_R0 + S_OFF @ pointer to regs - csub.a scno, #__NR_syscalls @ check upper syscall limit - bea 2b - ldm (r0 - r3), [r1]+ @ have to reload r0 - r3 - ldw pc, [tbl+], scno << #2 @ call sys_* routine - -__sys_trace_return: - stw.w r0, [sp+], #S_R0 + S_OFF @ save returned r0 - mov r2, scno - mov r1, sp - mov r0, #1 @ trace exit [IP = 1] - b.l syscall_trace - b ret_slow_syscall - - .align 5 -#ifdef CONFIG_ALIGNMENT_TRAP - .type __cr_alignment, #object -__cr_alignment: - .word cr_alignment -#endif - .ltorg - -ENTRY(sys_rt_sigreturn) - add r0, sp, #S_OFF - mov why, #0 @ prevent syscall restart handling - b __sys_rt_sigreturn -ENDPROC(sys_rt_sigreturn) - - __INIT - -/* - * Vector stubs. - * - * This code is copied to 0xffff0200 so we can use branches in the - * vectors, rather than ldr's. Note that this code must not - * exceed 0x300 bytes. - * - * Common stub entry macro: - * Enter in INTR mode, bsr = PRIV/USER ASR, lr = PRIV/USER PC - * - * SP points to a minimal amount of processor-private memory, the address - * of which is copied into r0 for the mode specific abort handler. - */ - .macro vector_stub, name, mode - .align 5 - -vector_\name: - @ - @ Save r0, lr_ (parent PC) and bsr_ - @ (parent ASR) - @ - stw r0, [sp] - stw lr, [sp+], #4 @ save r0, lr - mov lr, bsr - stw lr, [sp+], #8 @ save bsr - - @ - @ Prepare for PRIV mode. INTRs remain disabled. - @ - mov r0, asr - xor r0, r0, #(\mode ^ PRIV_MODE) - mov.a bsr, r0 - - @ - @ the branch table must immediately follow this code - @ - and lr, lr, #0x03 - add lr, lr, #1 - mov r0, sp - ldw lr, [pc+], lr << #2 - mov.a pc, lr @ branch to handler in PRIV mode -ENDPROC(vector_\name) - .align 2 - @ handler addresses follow this label - .endm - - .globl __stubs_start -__stubs_start: -/* - * Interrupt dispatcher - */ - vector_stub intr, INTR_MODE - - .long __intr_user @ 0 (USER) - .long __invalid @ 1 - .long __invalid @ 2 - .long __intr_priv @ 3 (PRIV) - -/* - * Data abort dispatcher - * Enter in ABT mode, bsr = USER ASR, lr = USER PC - */ - vector_stub dabt, ABRT_MODE - - .long __dabt_user @ 0 (USER) - .long __invalid @ 1 - .long __invalid @ 2 (INTR) - .long __dabt_priv @ 3 (PRIV) - -/* - * Prefetch abort dispatcher - * Enter in ABT mode, bsr = USER ASR, lr = USER PC - */ - vector_stub pabt, ABRT_MODE - - .long __pabt_user @ 0 (USER) - .long __invalid @ 1 - .long __invalid @ 2 (INTR) - .long __pabt_priv @ 3 (PRIV) - -/* - * Undef instr entry dispatcher - * Enter in EXTN mode, bsr = PRIV/USER ASR, lr = PRIV/USER PC - */ - vector_stub extn, EXTN_MODE - - .long __extn_user @ 0 (USER) - .long __invalid @ 1 - .long __invalid @ 2 (INTR) - .long __extn_priv @ 3 (PRIV) - -/* - * We group all the following data together to optimise - * for CPUs with separate I & D caches. - */ - .align 5 - -.LCvswi: - .word vector_swi - - .globl __stubs_end -__stubs_end: - - .equ stubs_offset, __vectors_start + 0x200 - __stubs_start - - .globl __vectors_start -__vectors_start: - jepriv SYS_ERROR0 - b vector_extn + stubs_offset - ldw pc, .LCvswi + stubs_offset - b vector_pabt + stubs_offset - b vector_dabt + stubs_offset - jepriv SYS_ERROR0 - b vector_intr + stubs_offset - jepriv SYS_ERROR0 - - .globl __vectors_end -__vectors_end: - - .data - - .globl cr_alignment - .globl cr_no_alignment -cr_alignment: - .space 4 -cr_no_alignment: - .space 4 diff --git a/arch/unicore32/kernel/fpu-ucf64.c b/arch/unicore32/kernel/fpu-ucf64.c deleted file mode 100644 index 85f0af29d29b..000000000000 --- a/arch/unicore32/kernel/fpu-ucf64.c +++ /dev/null @@ -1,117 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/fpu-ucf64.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include -#include - -#include - -/* - * A special flag to tell the normalisation code not to normalise. - */ -#define F64_NAN_FLAG 0x100 - -/* - * A bit pattern used to indicate the initial (unset) value of the - * exception mask, in case nothing handles an instruction. This - * doesn't include the NAN flag, which get masked out before - * we check for an error. - */ -#define F64_EXCEPTION_ERROR ((u32)-1 & ~F64_NAN_FLAG) - -/* - * Since we aren't building with -mfpu=f64, we need to code - * these instructions using their MRC/MCR equivalents. - */ -#define f64reg(_f64_) #_f64_ - -#define cff(_f64_) ({ \ - u32 __v; \ - asm("cff %0, " f64reg(_f64_) "@ fmrx %0, " #_f64_ \ - : "=r" (__v) : : "cc"); \ - __v; \ - }) - -#define ctf(_f64_, _var_) \ - asm("ctf %0, " f64reg(_f64_) "@ fmxr " #_f64_ ", %0" \ - : : "r" (_var_) : "cc") - -/* - * Raise a SIGFPE for the current process. - * sicode describes the signal being raised. - */ -void ucf64_raise_sigfpe(struct pt_regs *regs) -{ - /* - * This is the same as NWFPE, because it's not clear what - * this is used for - */ - current->thread.error_code = 0; - current->thread.trap_no = 6; - - send_sig_fault(SIGFPE, FPE_FLTUNK, - (void __user *)(instruction_pointer(regs) - 4), - current); -} - -/* - * Handle exceptions of UniCore-F64. - */ -void ucf64_exchandler(u32 inst, u32 fpexc, struct pt_regs *regs) -{ - u32 tmp = fpexc; - u32 exc = F64_EXCEPTION_ERROR & fpexc; - - pr_debug("UniCore-F64: instruction %08x fpscr %08x\n", - inst, fpexc); - - if (exc & FPSCR_CMPINSTR_BIT) { - if (exc & FPSCR_CON) - tmp |= FPSCR_CON; - else - tmp &= ~(FPSCR_CON); - exc &= ~(FPSCR_CMPINSTR_BIT | FPSCR_CON); - } else { - pr_debug("UniCore-F64 Error: unhandled exceptions\n"); - pr_debug("UniCore-F64 FPSCR 0x%08x INST 0x%08x\n", - cff(FPSCR), inst); - - ucf64_raise_sigfpe(regs); - return; - } - - /* - * Update the FPSCR with the additional exception flags. - * Comparison instructions always return at least one of - * these flags set. - */ - tmp &= ~(FPSCR_TRAP | FPSCR_IOS | FPSCR_OFS | FPSCR_UFS | - FPSCR_IXS | FPSCR_HIS | FPSCR_IOC | FPSCR_OFC | - FPSCR_UFC | FPSCR_IXC | FPSCR_HIC); - - tmp |= exc; - ctf(FPSCR, tmp); -} - -/* - * F64 support code initialisation. - */ -static int __init ucf64_init(void) -{ - ctf(FPSCR, 0x0); /* FPSCR_UFE | FPSCR_NDE perhaps better */ - - printk(KERN_INFO "Enable UniCore-F64 support.\n"); - - return 0; -} - -late_initcall(ucf64_init); diff --git a/arch/unicore32/kernel/gpio.c b/arch/unicore32/kernel/gpio.c deleted file mode 100644 index 36d395b54b7c..000000000000 --- a/arch/unicore32/kernel/gpio.c +++ /dev/null @@ -1,121 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/gpio.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ -/* in FPGA, no GPIO support */ - -#include -#include -#include -/* FIXME: needed for gpio_set_value() - convert to use descriptors or hogs */ -#include -#include - -#ifdef CONFIG_LEDS -#include -#include - -static const struct gpio_led puv3_gpio_leds[] = { - { .name = "cpuhealth", .gpio = GPO_CPU_HEALTH, .active_low = 0, - .default_trigger = "heartbeat", }, - { .name = "hdd_led", .gpio = GPO_HDD_LED, .active_low = 1, - .default_trigger = "disk-activity", }, -}; - -static const struct gpio_led_platform_data puv3_gpio_led_data = { - .num_leds = ARRAY_SIZE(puv3_gpio_leds), - .leds = (void *) puv3_gpio_leds, -}; - -static struct platform_device puv3_gpio_gpio_leds = { - .name = "leds-gpio", - .id = -1, - .dev = { - .platform_data = (void *) &puv3_gpio_led_data, - } -}; - -static int __init puv3_gpio_leds_init(void) -{ - platform_device_register(&puv3_gpio_gpio_leds); - return 0; -} - -device_initcall(puv3_gpio_leds_init); -#endif - -static int puv3_gpio_get(struct gpio_chip *chip, unsigned offset) -{ - return !!(readl(GPIO_GPLR) & GPIO_GPIO(offset)); -} - -static void puv3_gpio_set(struct gpio_chip *chip, unsigned offset, int value) -{ - if (value) - writel(GPIO_GPIO(offset), GPIO_GPSR); - else - writel(GPIO_GPIO(offset), GPIO_GPCR); -} - -static int puv3_direction_input(struct gpio_chip *chip, unsigned offset) -{ - unsigned long flags; - - local_irq_save(flags); - writel(readl(GPIO_GPDR) & ~GPIO_GPIO(offset), GPIO_GPDR); - local_irq_restore(flags); - return 0; -} - -static int puv3_direction_output(struct gpio_chip *chip, unsigned offset, - int value) -{ - unsigned long flags; - - local_irq_save(flags); - puv3_gpio_set(chip, offset, value); - writel(readl(GPIO_GPDR) | GPIO_GPIO(offset), GPIO_GPDR); - local_irq_restore(flags); - return 0; -} - -static struct gpio_chip puv3_gpio_chip = { - .label = "gpio", - .direction_input = puv3_direction_input, - .direction_output = puv3_direction_output, - .set = puv3_gpio_set, - .get = puv3_gpio_get, - .base = 0, - .ngpio = GPIO_MAX + 1, -}; - -void __init puv3_init_gpio(void) -{ - writel(GPIO_DIR, GPIO_GPDR); -#if defined(CONFIG_PUV3_NB0916) || defined(CONFIG_PUV3_SMW0919) \ - || defined(CONFIG_PUV3_DB0913) - gpio_set_value(GPO_WIFI_EN, 1); - gpio_set_value(GPO_HDD_LED, 1); - gpio_set_value(GPO_VGA_EN, 1); - gpio_set_value(GPO_LCD_EN, 1); - gpio_set_value(GPO_CAM_PWR_EN, 0); - gpio_set_value(GPO_LCD_VCC_EN, 1); - gpio_set_value(GPO_SOFT_OFF, 1); - gpio_set_value(GPO_BT_EN, 1); - gpio_set_value(GPO_FAN_ON, 0); - gpio_set_value(GPO_SPKR, 0); - gpio_set_value(GPO_CPU_HEALTH, 1); - gpio_set_value(GPO_LAN_SEL, 1); -/* - * DO NOT modify the GPO_SET_V1 and GPO_SET_V2 in kernel - * gpio_set_value(GPO_SET_V1, 1); - * gpio_set_value(GPO_SET_V2, 1); - */ -#endif - gpiochip_add_data(&puv3_gpio_chip, NULL); -} diff --git a/arch/unicore32/kernel/head.S b/arch/unicore32/kernel/head.S deleted file mode 100644 index 9bbb8668f9f7..000000000000 --- a/arch/unicore32/kernel/head.S +++ /dev/null @@ -1,249 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/kernel/head.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#if (PHYS_OFFSET & 0x003fffff) -#error "PHYS_OFFSET must be at an even 4MiB boundary!" -#endif - -#define KERNEL_RAM_VADDR (PAGE_OFFSET + KERNEL_IMAGE_START) -#define KERNEL_RAM_PADDR (PHYS_OFFSET + KERNEL_IMAGE_START) - -#define KERNEL_PGD_PADDR (KERNEL_RAM_PADDR - 0x1000) -#define KERNEL_PGD_VADDR (KERNEL_RAM_VADDR - 0x1000) - -#define KERNEL_START KERNEL_RAM_VADDR -#define KERNEL_END _end - -/* - * swapper_pg_dir is the virtual address of the initial page table. - * We place the page tables 4K below KERNEL_RAM_VADDR. Therefore, we must - * make sure that KERNEL_RAM_VADDR is correctly set. Currently, we expect - * the least significant 16 bits to be 0x8000, but we could probably - * relax this restriction to KERNEL_RAM_VADDR >= PAGE_OFFSET + 0x1000. - */ -#if (KERNEL_RAM_VADDR & 0xffff) != 0x8000 -#error KERNEL_RAM_VADDR must start at 0xXXXX8000 -#endif - - .globl swapper_pg_dir - .equ swapper_pg_dir, KERNEL_RAM_VADDR - 0x1000 - -/* - * Kernel startup entry point. - * --------------------------- - * - * This is normally called from the decompressor code. The requirements - * are: MMU = off, D-cache = off, I-cache = dont care - * - * This code is mostly position independent, so if you link the kernel at - * 0xc0008000, you call this at __pa(0xc0008000). - */ - __HEAD -ENTRY(stext) - @ set asr - mov r0, #PRIV_MODE @ ensure priv mode - or r0, #PSR_R_BIT | PSR_I_BIT @ disable irqs - mov.a asr, r0 - - @ process identify - movc r0, p0.c0, #0 @ cpuid - movl r1, 0xff00ffff @ mask - movl r2, 0x4d000863 @ value - and r0, r1, r0 - cxor.a r0, r2 - bne __error_p @ invalid processor id - - /* - * Clear the 4K level 1 swapper page table - */ - movl r0, #KERNEL_PGD_PADDR @ page table address - mov r1, #0 - add r2, r0, #0x1000 -101: stw.w r1, [r0]+, #4 - stw.w r1, [r0]+, #4 - stw.w r1, [r0]+, #4 - stw.w r1, [r0]+, #4 - cxor.a r0, r2 - bne 101b - - movl r4, #KERNEL_PGD_PADDR @ page table address - mov r7, #PMD_TYPE_SECT | PMD_PRESENT @ page size: section - or r7, r7, #PMD_SECT_CACHEABLE @ cacheable - or r7, r7, #PMD_SECT_READ | PMD_SECT_WRITE | PMD_SECT_EXEC - - /* - * Create identity mapping for first 4MB of kernel to - * cater for the MMU enable. This identity mapping - * will be removed by paging_init(). We use our current program - * counter to determine corresponding section base address. - */ - mov r6, pc - mov r6, r6 >> #22 @ start of kernel section - or r1, r7, r6 << #22 @ flags + kernel base - stw r1, [r4+], r6 << #2 @ identity mapping - - /* - * Now setup the pagetables for our kernel direct - * mapped region. - */ - add r0, r4, #(KERNEL_START & 0xff000000) >> 20 - stw.w r1, [r0+], #(KERNEL_START & 0x00c00000) >> 20 - movl r6, #(KERNEL_END - 1) - add r0, r0, #4 - add r6, r4, r6 >> #20 -102: csub.a r0, r6 - add r1, r1, #1 << 22 - bua 103f - stw.w r1, [r0]+, #4 - b 102b -103: - /* - * Then map first 4MB of ram in case it contains our boot params. - */ - add r0, r4, #PAGE_OFFSET >> 20 - or r6, r7, #(PHYS_OFFSET & 0xffc00000) - stw r6, [r0] - - ldw r15, __switch_data @ address to jump to after - - /* - * Initialise TLB, Caches, and MMU state ready to switch the MMU - * on. - */ - mov r0, #0 - movc p0.c5, r0, #28 @ cache invalidate all - nop8 - movc p0.c6, r0, #6 @ TLB invalidate all - nop8 - - /* - * ..V. .... ..TB IDAM - * ..1. .... ..01 1111 - */ - movl r0, #0x201f @ control register setting - - /* - * Setup common bits before finally enabling the MMU. Essentially - * this is just loading the page table pointer and domain access - * registers. - */ - #ifndef CONFIG_ALIGNMENT_TRAP - andn r0, r0, #CR_A - #endif - #ifdef CONFIG_CPU_DCACHE_DISABLE - andn r0, r0, #CR_D - #endif - #ifdef CONFIG_CPU_DCACHE_WRITETHROUGH - andn r0, r0, #CR_B - #endif - #ifdef CONFIG_CPU_ICACHE_DISABLE - andn r0, r0, #CR_I - #endif - - movc p0.c2, r4, #0 @ set pgd - b __turn_mmu_on -ENDPROC(stext) - -/* - * Enable the MMU. This completely changes the structure of the visible - * memory space. You will not be able to trace execution through this. - * - * r0 = cp#0 control register - * r15 = *virtual* address to jump to upon completion - */ - .align 5 -__turn_mmu_on: - mov r0, r0 - movc p0.c1, r0, #0 @ write control reg - nop @ fetch inst by phys addr - mov pc, r15 - nop8 @ fetch inst by phys addr -ENDPROC(__turn_mmu_on) - -/* - * Setup the initial page tables. We only setup the barest - * amount which are required to get the kernel running, which - * generally means mapping in the kernel code. - * - * r9 = cpuid - * r10 = procinfo - * - * Returns: - * r0, r3, r6, r7 corrupted - * r4 = physical page table address - */ - .ltorg - - .align 2 - .type __switch_data, %object -__switch_data: - .long __mmap_switched - .long __bss_start @ r6 - .long _end @ r7 - .long cr_alignment @ r8 - .long init_thread_union + THREAD_START_SP @ sp - -/* - * The following fragment of code is executed with the MMU on in MMU mode, - * and uses absolute addresses; this is not position independent. - * - * r0 = cp#0 control register - */ -__mmap_switched: - adr r3, __switch_data + 4 - - ldm.w (r6, r7, r8), [r3]+ - ldw sp, [r3] - - mov fp, #0 @ Clear BSS (and zero fp) -203: csub.a r6, r7 - bea 204f - stw.w fp, [r6]+,#4 - b 203b -204: - andn r1, r0, #CR_A @ Clear 'A' bit - stm (r0, r1), [r8]+ @ Save control register values - b start_kernel -ENDPROC(__mmap_switched) - -/* - * Exception handling. Something went wrong and we can't proceed. We - * ought to tell the user, but since we don't have any guarantee that - * we're even running on the right architecture, we do virtually nothing. - * - * If CONFIG_DEBUG_LL is set we try to print out something about the error - * and hope for the best (useful if bootloader fails to pass a proper - * machine ID for example). - */ -__error_p: -#ifdef CONFIG_DEBUG_LL - adr r0, str_p1 - b.l printascii - mov r0, r9 - b.l printhex8 - adr r0, str_p2 - b.l printascii -901: nop8 - b 901b -str_p1: .asciz "\nError: unrecognized processor variant (0x" -str_p2: .asciz ").\n" - .align -#endif -ENDPROC(__error_p) - diff --git a/arch/unicore32/kernel/hibernate.c b/arch/unicore32/kernel/hibernate.c deleted file mode 100644 index 4cdf3c846a2d..000000000000 --- a/arch/unicore32/kernel/hibernate.c +++ /dev/null @@ -1,159 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/hibernate.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "mach/pm.h" - -/* Pointer to the temporary resume page tables */ -pgd_t *resume_pg_dir; - -struct swsusp_arch_regs swsusp_arch_regs_cpu0; - -/* - * Create a middle page table on a resume-safe page and put a pointer to it in - * the given global directory entry. This only returns the gd entry - * in non-PAE compilation mode, since the middle layer is folded. - */ -static pmd_t *resume_one_md_table_init(pgd_t *pgd) -{ - pud_t *pud; - p4d_t *p4d; - pmd_t *pmd_table; - - p4d = p4d_offset(pgd, 0); - pud = pud_offset(p4d, 0); - pmd_table = pmd_offset(pud, 0); - - return pmd_table; -} - -/* - * Create a page table on a resume-safe page and place a pointer to it in - * a middle page directory entry. - */ -static pte_t *resume_one_page_table_init(pmd_t *pmd) -{ - if (pmd_none(*pmd)) { - pte_t *page_table = (pte_t *)get_safe_page(GFP_ATOMIC); - if (!page_table) - return NULL; - - set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_KERNEL_TABLE)); - - BUG_ON(page_table != pte_offset_kernel(pmd, 0)); - - return page_table; - } - - return pte_offset_kernel(pmd, 0); -} - -/* - * This maps the physical memory to kernel virtual address space, a total - * of max_low_pfn pages, by creating page tables starting from address - * PAGE_OFFSET. The page tables are allocated out of resume-safe pages. - */ -static int resume_physical_mapping_init(pgd_t *pgd_base) -{ - unsigned long pfn; - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte; - int pgd_idx, pmd_idx; - - pgd_idx = pgd_index(PAGE_OFFSET); - pgd = pgd_base + pgd_idx; - pfn = 0; - - for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { - pmd = resume_one_md_table_init(pgd); - if (!pmd) - return -ENOMEM; - - if (pfn >= max_low_pfn) - continue; - - for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD; pmd++, pmd_idx++) { - pte_t *max_pte; - - if (pfn >= max_low_pfn) - break; - - /* Map with normal page tables. - * NOTE: We can mark everything as executable here - */ - pte = resume_one_page_table_init(pmd); - if (!pte) - return -ENOMEM; - - max_pte = pte + PTRS_PER_PTE; - for (; pte < max_pte; pte++, pfn++) { - if (pfn >= max_low_pfn) - break; - - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); - } - } - } - - return 0; -} - -static inline void resume_init_first_level_page_table(pgd_t *pg_dir) -{ -} - -int swsusp_arch_resume(void) -{ - int error; - - resume_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC); - if (!resume_pg_dir) - return -ENOMEM; - - resume_init_first_level_page_table(resume_pg_dir); - error = resume_physical_mapping_init(resume_pg_dir); - if (error) - return error; - - /* We have got enough memory and from now on we cannot recover */ - restore_image(resume_pg_dir, restore_pblist); - return 0; -} - -/* - * pfn_is_nosave - check if given pfn is in the 'nosave' section - */ - -int pfn_is_nosave(unsigned long pfn) -{ - unsigned long begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; - unsigned long end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; - - return (pfn >= begin_pfn) && (pfn < end_pfn); -} - -void save_processor_state(void) -{ -} - -void restore_processor_state(void) -{ - local_flush_tlb_all(); -} diff --git a/arch/unicore32/kernel/hibernate_asm.S b/arch/unicore32/kernel/hibernate_asm.S deleted file mode 100644 index a589bc189e24..000000000000 --- a/arch/unicore32/kernel/hibernate_asm.S +++ /dev/null @@ -1,114 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/kernel/hibernate_asm.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ - -#include -#include -#include -#include -#include -#include -#include - -@ restore_image(pgd_t *resume_pg_dir, struct pbe *restore_pblist) -@ r0: resume_pg_dir -@ r1: restore_pblist -@ copy restore_pblist pages -@ restore registers from swsusp_arch_regs_cpu0 -@ -ENTRY(restore_image) - sub r0, r0, #PAGE_OFFSET - mov r5, #0 - movc p0.c6, r5, #6 @invalidate ITLB & DTLB - movc p0.c2, r0, #0 - nop - nop - nop - nop - nop - nop - nop - - .p2align 4,,7 -101: - csub.a r1, #0 - beq 109f - - ldw r6, [r1+], #PBE_ADDRESS - ldw r7, [r1+], #PBE_ORIN_ADDRESS - - movl ip, #128 -102: ldm.w (r8 - r15), [r6]+ - stm.w (r8 - r15), [r7]+ - sub.a ip, ip, #1 - bne 102b - - ldw r1, [r1+], #PBE_NEXT - b 101b - - .p2align 4,,7 -109: - /* go back to the original page tables */ - ldw r0, =swapper_pg_dir - sub r0, r0, #PAGE_OFFSET - mov r5, #0 - movc p0.c6, r5, #6 - movc p0.c2, r0, #0 - nop - nop - nop - nop - nop - nop - nop - -#ifdef CONFIG_UNICORE_FPU_F64 - ldw ip, 1f - add ip, ip, #SWSUSP_FPSTATE - lfm.w (f0 - f7 ), [ip]+ - lfm.w (f8 - f15), [ip]+ - lfm.w (f16 - f23), [ip]+ - lfm.w (f24 - f31), [ip]+ - ldw r4, [ip] - ctf r4, s31 -#endif - mov r0, #0x0 - ldw ip, 1f - add ip, ip, #SWSUSP_CPU - ldm.w (r4 - r15), [ip]+ - ldm (r16 - r27, sp, pc), [ip]+ @ Load all regs saved previously - - .align 2 -1: .long swsusp_arch_regs_cpu0 - - -@ swsusp_arch_suspend() -@ - prepare pc for resume, return from function without swsusp_save on resume -@ - save registers in swsusp_arch_regs_cpu0 -@ - call swsusp_save write suspend image - -ENTRY(swsusp_arch_suspend) - ldw ip, 1f - add ip, ip, #SWSUSP_CPU - stm.w (r4 - r15), [ip]+ - stm.w (r16 - r27, sp, lr), [ip]+ - -#ifdef CONFIG_UNICORE_FPU_F64 - ldw ip, 1f - add ip, ip, #SWSUSP_FPSTATE - sfm.w (f0 - f7 ), [ip]+ - sfm.w (f8 - f15), [ip]+ - sfm.w (f16 - f23), [ip]+ - sfm.w (f24 - f31), [ip]+ - cff r4, s31 - stw r4, [ip] -#endif - b swsusp_save @ no return - -1: .long swsusp_arch_regs_cpu0 diff --git a/arch/unicore32/kernel/irq.c b/arch/unicore32/kernel/irq.c deleted file mode 100644 index c014ae3c3e48..000000000000 --- a/arch/unicore32/kernel/irq.c +++ /dev/null @@ -1,371 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/irq.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "setup.h" - -/* - * PKUnity GPIO edge detection for IRQs: - * IRQs are generated on Falling-Edge, Rising-Edge, or both. - * Use this instead of directly setting GRER/GFER. - */ -static int GPIO_IRQ_rising_edge; -static int GPIO_IRQ_falling_edge; -static int GPIO_IRQ_mask = 0; - -#define GPIO_MASK(irq) (1 << (irq - IRQ_GPIO0)) - -static int puv3_gpio_type(struct irq_data *d, unsigned int type) -{ - unsigned int mask; - - if (d->irq < IRQ_GPIOHIGH) - mask = 1 << d->irq; - else - mask = GPIO_MASK(d->irq); - - if (type == IRQ_TYPE_PROBE) { - if ((GPIO_IRQ_rising_edge | GPIO_IRQ_falling_edge) & mask) - return 0; - type = IRQ_TYPE_EDGE_RISING | IRQ_TYPE_EDGE_FALLING; - } - - if (type & IRQ_TYPE_EDGE_RISING) - GPIO_IRQ_rising_edge |= mask; - else - GPIO_IRQ_rising_edge &= ~mask; - if (type & IRQ_TYPE_EDGE_FALLING) - GPIO_IRQ_falling_edge |= mask; - else - GPIO_IRQ_falling_edge &= ~mask; - - writel(GPIO_IRQ_rising_edge & GPIO_IRQ_mask, GPIO_GRER); - writel(GPIO_IRQ_falling_edge & GPIO_IRQ_mask, GPIO_GFER); - - return 0; -} - -/* - * GPIO IRQs must be acknowledged. This is for IRQs from 0 to 7. - */ -static void puv3_low_gpio_ack(struct irq_data *d) -{ - writel((1 << d->irq), GPIO_GEDR); -} - -static void puv3_low_gpio_mask(struct irq_data *d) -{ - writel(readl(INTC_ICMR) & ~(1 << d->irq), INTC_ICMR); -} - -static void puv3_low_gpio_unmask(struct irq_data *d) -{ - writel(readl(INTC_ICMR) | (1 << d->irq), INTC_ICMR); -} - -static int puv3_low_gpio_wake(struct irq_data *d, unsigned int on) -{ - if (on) - writel(readl(PM_PWER) | (1 << d->irq), PM_PWER); - else - writel(readl(PM_PWER) & ~(1 << d->irq), PM_PWER); - return 0; -} - -static struct irq_chip puv3_low_gpio_chip = { - .name = "GPIO-low", - .irq_ack = puv3_low_gpio_ack, - .irq_mask = puv3_low_gpio_mask, - .irq_unmask = puv3_low_gpio_unmask, - .irq_set_type = puv3_gpio_type, - .irq_set_wake = puv3_low_gpio_wake, -}; - -/* - * IRQ8 (GPIO0 through 27) handler. We enter here with the - * irq_controller_lock held, and IRQs disabled. Decode the IRQ - * and call the handler. - */ -static void puv3_gpio_handler(struct irq_desc *desc) -{ - unsigned int mask, irq; - - mask = readl(GPIO_GEDR); - do { - /* - * clear down all currently active IRQ sources. - * We will be processing them all. - */ - writel(mask, GPIO_GEDR); - - irq = IRQ_GPIO0; - do { - if (mask & 1) - generic_handle_irq(irq); - mask >>= 1; - irq++; - } while (mask); - mask = readl(GPIO_GEDR); - } while (mask); -} - -/* - * GPIO0-27 edge IRQs need to be handled specially. - * In addition, the IRQs are all collected up into one bit in the - * interrupt controller registers. - */ -static void puv3_high_gpio_ack(struct irq_data *d) -{ - unsigned int mask = GPIO_MASK(d->irq); - - writel(mask, GPIO_GEDR); -} - -static void puv3_high_gpio_mask(struct irq_data *d) -{ - unsigned int mask = GPIO_MASK(d->irq); - - GPIO_IRQ_mask &= ~mask; - - writel(readl(GPIO_GRER) & ~mask, GPIO_GRER); - writel(readl(GPIO_GFER) & ~mask, GPIO_GFER); -} - -static void puv3_high_gpio_unmask(struct irq_data *d) -{ - unsigned int mask = GPIO_MASK(d->irq); - - GPIO_IRQ_mask |= mask; - - writel(GPIO_IRQ_rising_edge & GPIO_IRQ_mask, GPIO_GRER); - writel(GPIO_IRQ_falling_edge & GPIO_IRQ_mask, GPIO_GFER); -} - -static int puv3_high_gpio_wake(struct irq_data *d, unsigned int on) -{ - if (on) - writel(readl(PM_PWER) | PM_PWER_GPIOHIGH, PM_PWER); - else - writel(readl(PM_PWER) & ~PM_PWER_GPIOHIGH, PM_PWER); - return 0; -} - -static struct irq_chip puv3_high_gpio_chip = { - .name = "GPIO-high", - .irq_ack = puv3_high_gpio_ack, - .irq_mask = puv3_high_gpio_mask, - .irq_unmask = puv3_high_gpio_unmask, - .irq_set_type = puv3_gpio_type, - .irq_set_wake = puv3_high_gpio_wake, -}; - -/* - * We don't need to ACK IRQs on the PKUnity unless they're GPIOs - * this is for internal IRQs i.e. from 8 to 31. - */ -static void puv3_mask_irq(struct irq_data *d) -{ - writel(readl(INTC_ICMR) & ~(1 << d->irq), INTC_ICMR); -} - -static void puv3_unmask_irq(struct irq_data *d) -{ - writel(readl(INTC_ICMR) | (1 << d->irq), INTC_ICMR); -} - -/* - * Apart form GPIOs, only the RTC alarm can be a wakeup event. - */ -static int puv3_set_wake(struct irq_data *d, unsigned int on) -{ - if (d->irq == IRQ_RTCAlarm) { - if (on) - writel(readl(PM_PWER) | PM_PWER_RTC, PM_PWER); - else - writel(readl(PM_PWER) & ~PM_PWER_RTC, PM_PWER); - return 0; - } - return -EINVAL; -} - -static struct irq_chip puv3_normal_chip = { - .name = "PKUnity-v3", - .irq_ack = puv3_mask_irq, - .irq_mask = puv3_mask_irq, - .irq_unmask = puv3_unmask_irq, - .irq_set_wake = puv3_set_wake, -}; - -static struct resource irq_resource = { - .name = "irqs", - .start = io_v2p(PKUNITY_INTC_BASE), - .end = io_v2p(PKUNITY_INTC_BASE) + 0xFFFFF, -}; - -static struct puv3_irq_state { - unsigned int saved; - unsigned int icmr; - unsigned int iclr; - unsigned int iccr; -} puv3_irq_state; - -static int puv3_irq_suspend(void) -{ - struct puv3_irq_state *st = &puv3_irq_state; - - st->saved = 1; - st->icmr = readl(INTC_ICMR); - st->iclr = readl(INTC_ICLR); - st->iccr = readl(INTC_ICCR); - - /* - * Disable all GPIO-based interrupts. - */ - writel(readl(INTC_ICMR) & ~(0x1ff), INTC_ICMR); - - /* - * Set the appropriate edges for wakeup. - */ - writel(readl(PM_PWER) & GPIO_IRQ_rising_edge, GPIO_GRER); - writel(readl(PM_PWER) & GPIO_IRQ_falling_edge, GPIO_GFER); - - /* - * Clear any pending GPIO interrupts. - */ - writel(readl(GPIO_GEDR), GPIO_GEDR); - - return 0; -} - -static void puv3_irq_resume(void) -{ - struct puv3_irq_state *st = &puv3_irq_state; - - if (st->saved) { - writel(st->iccr, INTC_ICCR); - writel(st->iclr, INTC_ICLR); - - writel(GPIO_IRQ_rising_edge & GPIO_IRQ_mask, GPIO_GRER); - writel(GPIO_IRQ_falling_edge & GPIO_IRQ_mask, GPIO_GFER); - - writel(st->icmr, INTC_ICMR); - } -} - -static struct syscore_ops puv3_irq_syscore_ops = { - .suspend = puv3_irq_suspend, - .resume = puv3_irq_resume, -}; - -static int __init puv3_irq_init_syscore(void) -{ - register_syscore_ops(&puv3_irq_syscore_ops); - return 0; -} - -device_initcall(puv3_irq_init_syscore); - -void __init init_IRQ(void) -{ - unsigned int irq; - - request_resource(&iomem_resource, &irq_resource); - - /* disable all IRQs */ - writel(0, INTC_ICMR); - - /* all IRQs are IRQ, not REAL */ - writel(0, INTC_ICLR); - - /* clear all GPIO edge detects */ - writel(FMASK(8, 0) & ~FIELD(1, 1, GPI_SOFF_REQ), GPIO_GPIR); - writel(0, GPIO_GFER); - writel(0, GPIO_GRER); - writel(0x0FFFFFFF, GPIO_GEDR); - - writel(1, INTC_ICCR); - - for (irq = 0; irq < IRQ_GPIOHIGH; irq++) { - irq_set_chip(irq, &puv3_low_gpio_chip); - irq_set_handler(irq, handle_edge_irq); - irq_modify_status(irq, - IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN, - 0); - } - - for (irq = IRQ_GPIOHIGH + 1; irq < IRQ_GPIO0; irq++) { - irq_set_chip(irq, &puv3_normal_chip); - irq_set_handler(irq, handle_level_irq); - irq_modify_status(irq, - IRQ_NOREQUEST | IRQ_NOAUTOEN, - IRQ_NOPROBE); - } - - for (irq = IRQ_GPIO0; irq <= IRQ_GPIO27; irq++) { - irq_set_chip(irq, &puv3_high_gpio_chip); - irq_set_handler(irq, handle_edge_irq); - irq_modify_status(irq, - IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN, - 0); - } - - /* - * Install handler for GPIO 0-27 edge detect interrupts - */ - irq_set_chip(IRQ_GPIOHIGH, &puv3_normal_chip); - irq_set_chained_handler(IRQ_GPIOHIGH, puv3_gpio_handler); - -#ifdef CONFIG_PUV3_GPIO - puv3_init_gpio(); -#endif -} - -/* - * do_IRQ handles all hardware IRQ's. Decoded IRQs should not - * come via this function. Instead, they should provide their - * own 'handler' - */ -asmlinkage void asm_do_IRQ(unsigned int irq, struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - irq_enter(); - - /* - * Some hardware gives randomly wrong interrupts. Rather - * than crashing, do something sensible. - */ - if (unlikely(irq >= nr_irqs)) { - if (printk_ratelimit()) - printk(KERN_WARNING "Bad IRQ%u\n", irq); - ack_bad_irq(irq); - } else { - generic_handle_irq(irq); - } - - irq_exit(); - set_irq_regs(old_regs); -} - diff --git a/arch/unicore32/kernel/ksyms.c b/arch/unicore32/kernel/ksyms.c deleted file mode 100644 index 731445008932..000000000000 --- a/arch/unicore32/kernel/ksyms.c +++ /dev/null @@ -1,57 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/ksyms.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "ksyms.h" - -EXPORT_SYMBOL(find_first_bit); -EXPORT_SYMBOL(find_first_zero_bit); -EXPORT_SYMBOL(find_next_zero_bit); -EXPORT_SYMBOL(find_next_bit); - - /* platform dependent support */ -EXPORT_SYMBOL(__udelay); -EXPORT_SYMBOL(__const_udelay); - - /* string / mem functions */ -EXPORT_SYMBOL(strchr); -EXPORT_SYMBOL(strrchr); -EXPORT_SYMBOL(memset); -EXPORT_SYMBOL(memcpy); -EXPORT_SYMBOL(memmove); -EXPORT_SYMBOL(memchr); - - /* user mem (segment) */ -EXPORT_SYMBOL(__strnlen_user); -EXPORT_SYMBOL(__strncpy_from_user); - -EXPORT_SYMBOL(copy_page); - -EXPORT_SYMBOL(raw_copy_from_user); -EXPORT_SYMBOL(raw_copy_to_user); -EXPORT_SYMBOL(__clear_user); - -EXPORT_SYMBOL(__ashldi3); -EXPORT_SYMBOL(__ashrdi3); -EXPORT_SYMBOL(__divsi3); -EXPORT_SYMBOL(__lshrdi3); -EXPORT_SYMBOL(__modsi3); -EXPORT_SYMBOL(__ucmpdi2); -EXPORT_SYMBOL(__udivsi3); -EXPORT_SYMBOL(__umodsi3); - diff --git a/arch/unicore32/kernel/ksyms.h b/arch/unicore32/kernel/ksyms.h deleted file mode 100644 index 5d2d5ba324ac..000000000000 --- a/arch/unicore32/kernel/ksyms.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * libgcc functions - functions that are used internally by the - * compiler... (prototypes are not correct though, but that - * doesn't really matter since they're not versioned). - */ -extern void __ashldi3(void); -extern void __ashrdi3(void); -extern void __divsi3(void); -extern void __lshrdi3(void); -extern void __modsi3(void); -extern void __ucmpdi2(void); -extern void __udivsi3(void); -extern void __umodsi3(void); diff --git a/arch/unicore32/kernel/module.c b/arch/unicore32/kernel/module.c deleted file mode 100644 index 67c89ef2d6ee..000000000000 --- a/arch/unicore32/kernel/module.c +++ /dev/null @@ -1,105 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/module.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -void *module_alloc(unsigned long size) -{ - return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, - __builtin_return_address(0)); -} - -int -apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, - unsigned int relindex, struct module *module) -{ - Elf32_Shdr *symsec = sechdrs + symindex; - Elf32_Shdr *relsec = sechdrs + relindex; - Elf32_Shdr *dstsec = sechdrs + relsec->sh_info; - Elf32_Rel *rel = (void *)relsec->sh_addr; - unsigned int i; - - for (i = 0; i < relsec->sh_size / sizeof(Elf32_Rel); i++, rel++) { - unsigned long loc; - Elf32_Sym *sym; - s32 offset; - - offset = ELF32_R_SYM(rel->r_info); - if (offset < 0 || offset > - (symsec->sh_size / sizeof(Elf32_Sym))) { - printk(KERN_ERR "%s: bad relocation, " - "section %d reloc %d\n", - module->name, relindex, i); - return -ENOEXEC; - } - - sym = ((Elf32_Sym *)symsec->sh_addr) + offset; - - if (rel->r_offset < 0 || rel->r_offset > - dstsec->sh_size - sizeof(u32)) { - printk(KERN_ERR "%s: out of bounds relocation, " - "section %d reloc %d offset %d size %d\n", - module->name, relindex, i, rel->r_offset, - dstsec->sh_size); - return -ENOEXEC; - } - - loc = dstsec->sh_addr + rel->r_offset; - - switch (ELF32_R_TYPE(rel->r_info)) { - case R_UNICORE_NONE: - /* ignore */ - break; - - case R_UNICORE_ABS32: - *(u32 *)loc += sym->st_value; - break; - - case R_UNICORE_PC24: - case R_UNICORE_CALL: - case R_UNICORE_JUMP24: - offset = (*(u32 *)loc & 0x00ffffff) << 2; - if (offset & 0x02000000) - offset -= 0x04000000; - - offset += sym->st_value - loc; - if (offset & 3 || - offset <= (s32)0xfe000000 || - offset >= (s32)0x02000000) { - printk(KERN_ERR - "%s: relocation out of range, section " - "%d reloc %d sym '%s'\n", module->name, - relindex, i, strtab + sym->st_name); - return -ENOEXEC; - } - - offset >>= 2; - - *(u32 *)loc &= 0xff000000; - *(u32 *)loc |= offset & 0x00ffffff; - break; - - default: - printk(KERN_ERR "%s: unknown relocation: %u\n", - module->name, ELF32_R_TYPE(rel->r_info)); - return -ENOEXEC; - } - } - return 0; -} diff --git a/arch/unicore32/kernel/pci.c b/arch/unicore32/kernel/pci.c deleted file mode 100644 index 0d098aa05b47..000000000000 --- a/arch/unicore32/kernel/pci.c +++ /dev/null @@ -1,371 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/pci.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * PCI bios-type initialisation for PCI machines - */ -#include -#include -#include -#include -#include -#include -#include - -static int debug_pci; - -#define CONFIG_CMD(bus, devfn, where) \ - (0x80000000 | (bus->number << 16) | (devfn << 8) | (where & ~3)) - -static int -puv3_read_config(struct pci_bus *bus, unsigned int devfn, int where, - int size, u32 *value) -{ - writel(CONFIG_CMD(bus, devfn, where), PCICFG_ADDR); - switch (size) { - case 1: - *value = (readl(PCICFG_DATA) >> ((where & 3) * 8)) & 0xFF; - break; - case 2: - *value = (readl(PCICFG_DATA) >> ((where & 2) * 8)) & 0xFFFF; - break; - case 4: - *value = readl(PCICFG_DATA); - break; - } - return PCIBIOS_SUCCESSFUL; -} - -static int -puv3_write_config(struct pci_bus *bus, unsigned int devfn, int where, - int size, u32 value) -{ - writel(CONFIG_CMD(bus, devfn, where), PCICFG_ADDR); - switch (size) { - case 1: - writel((readl(PCICFG_DATA) & ~FMASK(8, (where&3)*8)) - | FIELD(value, 8, (where&3)*8), PCICFG_DATA); - break; - case 2: - writel((readl(PCICFG_DATA) & ~FMASK(16, (where&2)*8)) - | FIELD(value, 16, (where&2)*8), PCICFG_DATA); - break; - case 4: - writel(value, PCICFG_DATA); - break; - } - return PCIBIOS_SUCCESSFUL; -} - -struct pci_ops pci_puv3_ops = { - .read = puv3_read_config, - .write = puv3_write_config, -}; - -void pci_puv3_preinit(void) -{ - printk(KERN_DEBUG "PCI: PKUnity PCI Controller Initializing ...\n"); - /* config PCI bridge base */ - writel(io_v2p(PKUNITY_PCIBRI_BASE), PCICFG_BRIBASE); - - writel(0, PCIBRI_AHBCTL0); - writel(io_v2p(PKUNITY_PCIBRI_BASE) | PCIBRI_BARx_MEM, PCIBRI_AHBBAR0); - writel(0xFFFF0000, PCIBRI_AHBAMR0); - writel(0, PCIBRI_AHBTAR0); - - writel(PCIBRI_CTLx_AT, PCIBRI_AHBCTL1); - writel(io_v2p(PKUNITY_PCILIO_BASE) | PCIBRI_BARx_IO, PCIBRI_AHBBAR1); - writel(0xFFFF0000, PCIBRI_AHBAMR1); - writel(0x00000000, PCIBRI_AHBTAR1); - - writel(PCIBRI_CTLx_PREF, PCIBRI_AHBCTL2); - writel(io_v2p(PKUNITY_PCIMEM_BASE) | PCIBRI_BARx_MEM, PCIBRI_AHBBAR2); - writel(0xF8000000, PCIBRI_AHBAMR2); - writel(0, PCIBRI_AHBTAR2); - - writel(io_v2p(PKUNITY_PCIAHB_BASE) | PCIBRI_BARx_MEM, PCIBRI_BAR1); - - writel(PCIBRI_CTLx_AT | PCIBRI_CTLx_PREF, PCIBRI_PCICTL0); - writel(io_v2p(PKUNITY_PCIAHB_BASE) | PCIBRI_BARx_MEM, PCIBRI_PCIBAR0); - writel(0xF8000000, PCIBRI_PCIAMR0); - writel(PKUNITY_SDRAM_BASE, PCIBRI_PCITAR0); - - writel(readl(PCIBRI_CMD) | PCIBRI_CMD_IO | PCIBRI_CMD_MEM, PCIBRI_CMD); -} - -static int pci_puv3_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) -{ - if (dev->bus->number == 0) { -#ifdef CONFIG_ARCH_FPGA /* 4 pci slots */ - if (dev->devfn == 0x00) - return IRQ_PCIINTA; - else if (dev->devfn == 0x08) - return IRQ_PCIINTB; - else if (dev->devfn == 0x10) - return IRQ_PCIINTC; - else if (dev->devfn == 0x18) - return IRQ_PCIINTD; -#endif -#ifdef CONFIG_PUV3_DB0913 /* 3 pci slots */ - if (dev->devfn == 0x30) - return IRQ_PCIINTB; - else if (dev->devfn == 0x60) - return IRQ_PCIINTC; - else if (dev->devfn == 0x58) - return IRQ_PCIINTD; -#endif -#if defined(CONFIG_PUV3_NB0916) || defined(CONFIG_PUV3_SMW0919) - /* only support 2 pci devices */ - if (dev->devfn == 0x00) - return IRQ_PCIINTC; /* sata */ -#endif - } - return -1; -} - -/* - * Only first 128MB of memory can be accessed via PCI. - * We use GFP_DMA to allocate safe buffers to do map/unmap. - * This is really ugly and we need a better way of specifying - * DMA-capable regions of memory. - */ -void __init puv3_pci_adjust_zones(unsigned long max_zone_pfn) -{ - unsigned int sz = SZ_128M >> PAGE_SHIFT; - - max_zone_pfn[ZONE_DMA] = sz; -} - -/* - * If the bus contains any of these devices, then we must not turn on - * parity checking of any kind. - */ -static inline int pdev_bad_for_parity(struct pci_dev *dev) -{ - return 0; -} - -/* - * pcibios_fixup_bus - Called after each bus is probed, - * but before its children are examined. - */ -void pcibios_fixup_bus(struct pci_bus *bus) -{ - struct pci_dev *dev; - u16 features = PCI_COMMAND_SERR - | PCI_COMMAND_PARITY - | PCI_COMMAND_FAST_BACK; - - bus->resource[0] = &ioport_resource; - bus->resource[1] = &iomem_resource; - - /* - * Walk the devices on this bus, working out what we can - * and can't support. - */ - list_for_each_entry(dev, &bus->devices, bus_list) { - u16 status; - - pci_read_config_word(dev, PCI_STATUS, &status); - - /* - * If any device on this bus does not support fast back - * to back transfers, then the bus as a whole is not able - * to support them. Having fast back to back transfers - * on saves us one PCI cycle per transaction. - */ - if (!(status & PCI_STATUS_FAST_BACK)) - features &= ~PCI_COMMAND_FAST_BACK; - - if (pdev_bad_for_parity(dev)) - features &= ~(PCI_COMMAND_SERR - | PCI_COMMAND_PARITY); - - switch (dev->class >> 8) { - case PCI_CLASS_BRIDGE_PCI: - pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &status); - status |= PCI_BRIDGE_CTL_PARITY - | PCI_BRIDGE_CTL_MASTER_ABORT; - status &= ~(PCI_BRIDGE_CTL_BUS_RESET - | PCI_BRIDGE_CTL_FAST_BACK); - pci_write_config_word(dev, PCI_BRIDGE_CONTROL, status); - break; - - case PCI_CLASS_BRIDGE_CARDBUS: - pci_read_config_word(dev, PCI_CB_BRIDGE_CONTROL, - &status); - status |= PCI_CB_BRIDGE_CTL_PARITY - | PCI_CB_BRIDGE_CTL_MASTER_ABORT; - pci_write_config_word(dev, PCI_CB_BRIDGE_CONTROL, - status); - break; - } - } - - /* - * Now walk the devices again, this time setting them up. - */ - list_for_each_entry(dev, &bus->devices, bus_list) { - u16 cmd; - - pci_read_config_word(dev, PCI_COMMAND, &cmd); - cmd |= features; - pci_write_config_word(dev, PCI_COMMAND, cmd); - - pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE, - L1_CACHE_BYTES >> 2); - } - - /* - * Propagate the flags to the PCI bridge. - */ - if (bus->self && bus->self->hdr_type == PCI_HEADER_TYPE_BRIDGE) { - if (features & PCI_COMMAND_FAST_BACK) - bus->bridge_ctl |= PCI_BRIDGE_CTL_FAST_BACK; - if (features & PCI_COMMAND_PARITY) - bus->bridge_ctl |= PCI_BRIDGE_CTL_PARITY; - } - - /* - * Report what we did for this bus - */ - printk(KERN_INFO "PCI: bus%d: Fast back to back transfers %sabled\n", - bus->number, (features & PCI_COMMAND_FAST_BACK) ? "en" : "dis"); -} -EXPORT_SYMBOL(pcibios_fixup_bus); - -static struct resource busn_resource = { - .name = "PCI busn", - .start = 0, - .end = 255, - .flags = IORESOURCE_BUS, -}; - -static int __init pci_common_init(void) -{ - struct pci_bus *puv3_bus; - struct pci_host_bridge *bridge; - int ret; - - bridge = pci_alloc_host_bridge(0); - if (!bridge) - return -ENOMEM; - - pci_puv3_preinit(); - - pci_add_resource(&bridge->windows, &ioport_resource); - pci_add_resource(&bridge->windows, &iomem_resource); - pci_add_resource(&bridge->windows, &busn_resource); - bridge->sysdata = NULL; - bridge->busnr = 0; - bridge->ops = &pci_puv3_ops; - bridge->swizzle_irq = pci_common_swizzle; - bridge->map_irq = pci_puv3_map_irq; - - /* Scan our single hose. */ - ret = pci_scan_root_bus_bridge(bridge); - if (ret) { - pci_free_host_bridge(bridge); - return; - } - - puv3_bus = bridge->bus; - - if (!puv3_bus) - panic("PCI: unable to scan bus!"); - - pci_bus_size_bridges(puv3_bus); - pci_bus_assign_resources(puv3_bus); - pci_bus_add_devices(puv3_bus); - return 0; -} -subsys_initcall(pci_common_init); - -char * __init pcibios_setup(char *str) -{ - if (!strcmp(str, "debug")) { - debug_pci = 1; - return NULL; - } - return str; -} - -void pcibios_set_master(struct pci_dev *dev) -{ - /* No special bus mastering setup handling */ -} - -/* - * From arch/i386/kernel/pci-i386.c: - * - * We need to avoid collisions with `mirrored' VGA ports - * and other strange ISA hardware, so we always want the - * addresses to be allocated in the 0x000-0x0ff region - * modulo 0x400. - * - * Why? Because some silly external IO cards only decode - * the low 10 bits of the IO address. The 0x00-0xff region - * is reserved for motherboard devices that decode all 16 - * bits, so it's ok to allocate at, say, 0x2800-0x28ff, - * but we want to try to avoid allocating at 0x2900-0x2bff - * which might be mirrored at 0x0100-0x03ff.. - */ -resource_size_t pcibios_align_resource(void *data, const struct resource *res, - resource_size_t size, resource_size_t align) -{ - resource_size_t start = res->start; - - if (res->flags & IORESOURCE_IO && start & 0x300) - start = (start + 0x3ff) & ~0x3ff; - - start = (start + align - 1) & ~(align - 1); - - return start; -} - -/** - * pcibios_enable_device - Enable I/O and memory. - * @dev: PCI device to be enabled - */ -int pcibios_enable_device(struct pci_dev *dev, int mask) -{ - u16 cmd, old_cmd; - int idx; - struct resource *r; - - pci_read_config_word(dev, PCI_COMMAND, &cmd); - old_cmd = cmd; - for (idx = 0; idx < 6; idx++) { - /* Only set up the requested stuff */ - if (!(mask & (1 << idx))) - continue; - - r = dev->resource + idx; - if (!r->start && r->end) { - printk(KERN_ERR "PCI: Device %s not available because" - " of resource collisions\n", pci_name(dev)); - return -EINVAL; - } - if (r->flags & IORESOURCE_IO) - cmd |= PCI_COMMAND_IO; - if (r->flags & IORESOURCE_MEM) - cmd |= PCI_COMMAND_MEMORY; - } - - /* - * Bridges (eg, cardbus bridges) need to be fully enabled - */ - if ((dev->class >> 16) == PCI_BASE_CLASS_BRIDGE) - cmd |= PCI_COMMAND_IO | PCI_COMMAND_MEMORY; - - if (cmd != old_cmd) { - printk("PCI: enabling device %s (%04x -> %04x)\n", - pci_name(dev), old_cmd, cmd); - pci_write_config_word(dev, PCI_COMMAND, cmd); - } - return 0; -} diff --git a/arch/unicore32/kernel/pm.c b/arch/unicore32/kernel/pm.c deleted file mode 100644 index 94b7f9df6c1a..000000000000 --- a/arch/unicore32/kernel/pm.c +++ /dev/null @@ -1,121 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/pm.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "setup.h" - -struct puv3_cpu_pm_fns *puv3_cpu_pm_fns; -static unsigned long *sleep_save; - -int puv3_pm_enter(suspend_state_t state) -{ - unsigned long sleep_save_checksum = 0, checksum = 0; - int i; - - /* skip registers saving for standby */ - if (state != PM_SUSPEND_STANDBY) { - puv3_cpu_pm_fns->save(sleep_save); - /* before sleeping, calculate and save a checksum */ - for (i = 0; i < puv3_cpu_pm_fns->save_count - 1; i++) - sleep_save_checksum += sleep_save[i]; - } - - /* *** go zzz *** */ - puv3_cpu_pm_fns->enter(state); - cpu_init(); -#ifdef CONFIG_INPUT_KEYBOARD - puv3_ps2_init(); -#endif -#ifdef CONFIG_PCI - pci_puv3_preinit(); -#endif - if (state != PM_SUSPEND_STANDBY) { - /* after sleeping, validate the checksum */ - for (i = 0; i < puv3_cpu_pm_fns->save_count - 1; i++) - checksum += sleep_save[i]; - - /* if invalid, display message and wait for a hardware reset */ - if (checksum != sleep_save_checksum) { - while (1) - puv3_cpu_pm_fns->enter(state); - } - puv3_cpu_pm_fns->restore(sleep_save); - } - - pr_debug("*** made it back from resume\n"); - - return 0; -} -EXPORT_SYMBOL_GPL(puv3_pm_enter); - -unsigned long sleep_phys_sp(void *sp) -{ - return virt_to_phys(sp); -} - -static int puv3_pm_valid(suspend_state_t state) -{ - if (puv3_cpu_pm_fns) - return puv3_cpu_pm_fns->valid(state); - - return -EINVAL; -} - -static int puv3_pm_prepare(void) -{ - int ret = 0; - - if (puv3_cpu_pm_fns && puv3_cpu_pm_fns->prepare) - ret = puv3_cpu_pm_fns->prepare(); - - return ret; -} - -static void puv3_pm_finish(void) -{ - if (puv3_cpu_pm_fns && puv3_cpu_pm_fns->finish) - puv3_cpu_pm_fns->finish(); -} - -static struct platform_suspend_ops puv3_pm_ops = { - .valid = puv3_pm_valid, - .enter = puv3_pm_enter, - .prepare = puv3_pm_prepare, - .finish = puv3_pm_finish, -}; - -static int __init puv3_pm_init(void) -{ - if (!puv3_cpu_pm_fns) { - printk(KERN_ERR "no valid puv3_cpu_pm_fns defined\n"); - return -EINVAL; - } - - sleep_save = kmalloc_array(puv3_cpu_pm_fns->save_count, - sizeof(unsigned long), - GFP_KERNEL); - if (!sleep_save) { - printk(KERN_ERR "failed to alloc memory for pm save\n"); - return -ENOMEM; - } - - suspend_set_ops(&puv3_pm_ops); - return 0; -} - -device_initcall(puv3_pm_init); diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c deleted file mode 100644 index b4fd3a604a18..000000000000 --- a/arch/unicore32/kernel/process.c +++ /dev/null @@ -1,319 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/process.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "setup.h" - -static const char * const processor_modes[] = { - "UK00", "UK01", "UK02", "UK03", "UK04", "UK05", "UK06", "UK07", - "UK08", "UK09", "UK0A", "UK0B", "UK0C", "UK0D", "UK0E", "UK0F", - "USER", "REAL", "INTR", "PRIV", "UK14", "UK15", "UK16", "ABRT", - "UK18", "UK19", "UK1A", "EXTN", "UK1C", "UK1D", "UK1E", "SUSR" -}; - -void arch_cpu_idle(void) -{ - cpu_do_idle(); - local_irq_enable(); -} - -void machine_halt(void) -{ - gpio_set_value(GPO_SOFT_OFF, 0); -} - -/* - * Function pointers to optional machine specific functions - */ -void (*pm_power_off)(void) = NULL; -EXPORT_SYMBOL(pm_power_off); - -void machine_power_off(void) -{ - if (pm_power_off) - pm_power_off(); - machine_halt(); -} - -void machine_restart(char *cmd) -{ - /* Disable interrupts first */ - local_irq_disable(); - - /* - * Tell the mm system that we are going to reboot - - * we may need it to insert some 1:1 mappings so that - * soft boot works. - */ - setup_mm_for_reboot(); - - /* Clean and invalidate caches */ - flush_cache_all(); - - /* Turn off caching */ - cpu_proc_fin(); - - /* Push out any further dirty data, and ensure cache is empty */ - flush_cache_all(); - - /* - * Now handle reboot code. - */ - if (reboot_mode == REBOOT_SOFT) { - /* Jump into ROM at address 0xffff0000 */ - cpu_reset(VECTORS_BASE); - } else { - writel(0x00002001, PM_PLLSYSCFG); /* cpu clk = 250M */ - writel(0x00100800, PM_PLLDDRCFG); /* ddr clk = 44M */ - writel(0x00002001, PM_PLLVGACFG); /* vga clk = 250M */ - - /* Use on-chip reset capability */ - /* following instructions must be in one icache line */ - __asm__ __volatile__( - " .align 5\n\t" - " stw %1, [%0]\n\t" - "201: ldw r0, [%0]\n\t" - " cmpsub.a r0, #0\n\t" - " bne 201b\n\t" - " stw %3, [%2]\n\t" - " nop; nop; nop\n\t" - /* prefetch 3 instructions at most */ - : - : "r" (PM_PMCR), - "r" (PM_PMCR_CFBSYS | PM_PMCR_CFBDDR - | PM_PMCR_CFBVGA), - "r" (RESETC_SWRR), - "r" (RESETC_SWRR_SRB) - : "r0", "memory"); - } - - /* - * Whoops - the architecture was unable to reboot. - * Tell the user! - */ - mdelay(1000); - printk(KERN_EMERG "Reboot failed -- System halted\n"); - do { } while (1); -} - -void __show_regs(struct pt_regs *regs) -{ - unsigned long flags; - char buf[64]; - - show_regs_print_info(KERN_DEFAULT); - printk("PC is at %pS\n", (void *)instruction_pointer(regs)); - printk("LR is at %pS\n", (void *)regs->UCreg_lr); - printk(KERN_DEFAULT "pc : [<%08lx>] lr : [<%08lx>] psr: %08lx\n" - "sp : %08lx ip : %08lx fp : %08lx\n", - regs->UCreg_pc, regs->UCreg_lr, regs->UCreg_asr, - regs->UCreg_sp, regs->UCreg_ip, regs->UCreg_fp); - printk(KERN_DEFAULT "r26: %08lx r25: %08lx r24: %08lx\n", - regs->UCreg_26, regs->UCreg_25, - regs->UCreg_24); - printk(KERN_DEFAULT "r23: %08lx r22: %08lx r21: %08lx r20: %08lx\n", - regs->UCreg_23, regs->UCreg_22, - regs->UCreg_21, regs->UCreg_20); - printk(KERN_DEFAULT "r19: %08lx r18: %08lx r17: %08lx r16: %08lx\n", - regs->UCreg_19, regs->UCreg_18, - regs->UCreg_17, regs->UCreg_16); - printk(KERN_DEFAULT "r15: %08lx r14: %08lx r13: %08lx r12: %08lx\n", - regs->UCreg_15, regs->UCreg_14, - regs->UCreg_13, regs->UCreg_12); - printk(KERN_DEFAULT "r11: %08lx r10: %08lx r9 : %08lx r8 : %08lx\n", - regs->UCreg_11, regs->UCreg_10, - regs->UCreg_09, regs->UCreg_08); - printk(KERN_DEFAULT "r7 : %08lx r6 : %08lx r5 : %08lx r4 : %08lx\n", - regs->UCreg_07, regs->UCreg_06, - regs->UCreg_05, regs->UCreg_04); - printk(KERN_DEFAULT "r3 : %08lx r2 : %08lx r1 : %08lx r0 : %08lx\n", - regs->UCreg_03, regs->UCreg_02, - regs->UCreg_01, regs->UCreg_00); - - flags = regs->UCreg_asr; - buf[0] = flags & PSR_S_BIT ? 'S' : 's'; - buf[1] = flags & PSR_Z_BIT ? 'Z' : 'z'; - buf[2] = flags & PSR_C_BIT ? 'C' : 'c'; - buf[3] = flags & PSR_V_BIT ? 'V' : 'v'; - buf[4] = '\0'; - - printk(KERN_DEFAULT "Flags: %s INTR o%s REAL o%s Mode %s Segment %s\n", - buf, interrupts_enabled(regs) ? "n" : "ff", - fast_interrupts_enabled(regs) ? "n" : "ff", - processor_modes[processor_mode(regs)], - uaccess_kernel() ? "kernel" : "user"); - { - unsigned int ctrl; - - buf[0] = '\0'; - { - unsigned int transbase; - asm("movc %0, p0.c2, #0\n" - : "=r" (transbase)); - snprintf(buf, sizeof(buf), " Table: %08x", transbase); - } - asm("movc %0, p0.c1, #0\n" : "=r" (ctrl)); - - printk(KERN_DEFAULT "Control: %08x%s\n", ctrl, buf); - } -} - -void show_regs(struct pt_regs *regs) -{ - printk(KERN_DEFAULT "\n"); - printk(KERN_DEFAULT "Pid: %d, comm: %20s\n", - task_pid_nr(current), current->comm); - __show_regs(regs); - __backtrace(); -} - -void flush_thread(void) -{ - struct thread_info *thread = current_thread_info(); - struct task_struct *tsk = current; - - memset(thread->used_cp, 0, sizeof(thread->used_cp)); - memset(&tsk->thread.debug, 0, sizeof(struct debug_info)); -#ifdef CONFIG_UNICORE_FPU_F64 - memset(&thread->fpstate, 0, sizeof(struct fp_state)); -#endif -} - -void release_thread(struct task_struct *dead_task) -{ -} - -asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread"); - -int -copy_thread(unsigned long clone_flags, unsigned long stack_start, - unsigned long stk_sz, struct task_struct *p) -{ - struct thread_info *thread = task_thread_info(p); - struct pt_regs *childregs = task_pt_regs(p); - - memset(&thread->cpu_context, 0, sizeof(struct cpu_context_save)); - thread->cpu_context.sp = (unsigned long)childregs; - if (unlikely(p->flags & PF_KTHREAD)) { - thread->cpu_context.pc = (unsigned long)ret_from_kernel_thread; - thread->cpu_context.r4 = stack_start; - thread->cpu_context.r5 = stk_sz; - memset(childregs, 0, sizeof(struct pt_regs)); - } else { - thread->cpu_context.pc = (unsigned long)ret_from_fork; - *childregs = *current_pt_regs(); - childregs->UCreg_00 = 0; - if (stack_start) - childregs->UCreg_sp = stack_start; - - if (clone_flags & CLONE_SETTLS) - childregs->UCreg_16 = childregs->UCreg_03; - } - return 0; -} - -/* - * Fill in the task's elfregs structure for a core dump. - */ -int dump_task_regs(struct task_struct *t, elf_gregset_t *elfregs) -{ - elf_core_copy_regs(elfregs, task_pt_regs(t)); - return 1; -} - -/* - * fill in the fpe structure for a core dump... - */ -int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fp) -{ - struct thread_info *thread = current_thread_info(); - int used_math = thread->used_cp[1] | thread->used_cp[2]; - -#ifdef CONFIG_UNICORE_FPU_F64 - if (used_math) - memcpy(fp, &thread->fpstate, sizeof(*fp)); -#endif - return used_math != 0; -} -EXPORT_SYMBOL(dump_fpu); - -unsigned long get_wchan(struct task_struct *p) -{ - struct stackframe frame; - int count = 0; - if (!p || p == current || p->state == TASK_RUNNING) - return 0; - - frame.fp = thread_saved_fp(p); - frame.sp = thread_saved_sp(p); - frame.lr = 0; /* recovered from the stack */ - frame.pc = thread_saved_pc(p); - do { - int ret = unwind_frame(&frame); - if (ret < 0) - return 0; - if (!in_sched_functions(frame.pc)) - return frame.pc; - } while ((count++) < 16); - return 0; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - return randomize_page(mm->brk, 0x02000000); -} - -/* - * The vectors page is always readable from user space for the - * atomic helpers and the signal restart code. Let's declare a mapping - * for it so it is visible through ptrace and /proc//mem. - */ - -int vectors_user_mapping(void) -{ - struct mm_struct *mm = current->mm; - return install_special_mapping(mm, 0xffff0000, PAGE_SIZE, - VM_READ | VM_EXEC | - VM_MAYREAD | VM_MAYEXEC | - VM_DONTEXPAND | VM_DONTDUMP, - NULL); -} - -const char *arch_vma_name(struct vm_area_struct *vma) -{ - return (vma->vm_start == 0xffff0000) ? "[vectors]" : NULL; -} diff --git a/arch/unicore32/kernel/ptrace.c b/arch/unicore32/kernel/ptrace.c deleted file mode 100644 index 0f216567b90a..000000000000 --- a/arch/unicore32/kernel/ptrace.c +++ /dev/null @@ -1,147 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/ptrace.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * By Ross Biro 1/23/92 - */ -#include -#include -#include -#include -#include - -/* - * this routine will get a word off of the processes privileged stack. - * the offset is how far from the base addr as stored in the THREAD. - * this routine assumes that all the privileged stacks are in our - * data space. - */ -static inline long get_user_reg(struct task_struct *task, int offset) -{ - return task_pt_regs(task)->uregs[offset]; -} - -/* - * this routine will put a word on the processes privileged stack. - * the offset is how far from the base addr as stored in the THREAD. - * this routine assumes that all the privileged stacks are in our - * data space. - */ -static inline int -put_user_reg(struct task_struct *task, int offset, long data) -{ - struct pt_regs newregs, *regs = task_pt_regs(task); - int ret = -EINVAL; - - newregs = *regs; - newregs.uregs[offset] = data; - - if (valid_user_regs(&newregs)) { - regs->uregs[offset] = data; - ret = 0; - } - - return ret; -} - -/* - * Called by kernel/ptrace.c when detaching.. - */ -void ptrace_disable(struct task_struct *child) -{ -} - -/* - * We actually access the pt_regs stored on the kernel stack. - */ -static int ptrace_read_user(struct task_struct *tsk, unsigned long off, - unsigned long __user *ret) -{ - unsigned long tmp; - - tmp = 0; - if (off < sizeof(struct pt_regs)) - tmp = get_user_reg(tsk, off >> 2); - - return put_user(tmp, ret); -} - -/* - * We actually access the pt_regs stored on the kernel stack. - */ -static int ptrace_write_user(struct task_struct *tsk, unsigned long off, - unsigned long val) -{ - if (off >= sizeof(struct pt_regs)) - return 0; - - return put_user_reg(tsk, off >> 2, val); -} - -long arch_ptrace(struct task_struct *child, long request, - unsigned long addr, unsigned long data) -{ - int ret; - unsigned long __user *datap = (unsigned long __user *) data; - - switch (request) { - case PTRACE_PEEKUSR: - ret = ptrace_read_user(child, addr, datap); - break; - - case PTRACE_POKEUSR: - ret = ptrace_write_user(child, addr, data); - break; - - case PTRACE_GET_THREAD_AREA: - ret = put_user(task_pt_regs(child)->UCreg_16, - datap); - break; - - default: - ret = ptrace_request(child, request, addr, data); - break; - } - - return ret; -} - -asmlinkage int syscall_trace(int why, struct pt_regs *regs, int scno) -{ - unsigned long ip; - - if (!test_thread_flag(TIF_SYSCALL_TRACE)) - return scno; - if (!(current->ptrace & PT_PTRACED)) - return scno; - - /* - * Save IP. IP is used to denote syscall entry/exit: - * IP = 0 -> entry, = 1 -> exit - */ - ip = regs->UCreg_ip; - regs->UCreg_ip = why; - - current_thread_info()->syscall = scno; - - /* the 0x80 provides a way for the tracing parent to distinguish - between a syscall stop and SIGTRAP delivery */ - ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) - ? 0x80 : 0)); - /* - * this isn't the same as continuing with a signal, but it will do - * for normal use. strace only continues with a signal if the - * stopping signal is not SIGTRAP. -brl - */ - if (current->exit_code) { - send_sig(current->exit_code, current, 1); - current->exit_code = 0; - } - regs->UCreg_ip = ip; - - return current_thread_info()->syscall; -} diff --git a/arch/unicore32/kernel/puv3-core.c b/arch/unicore32/kernel/puv3-core.c deleted file mode 100644 index 78f12e627365..000000000000 --- a/arch/unicore32/kernel/puv3-core.c +++ /dev/null @@ -1,276 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/puv3-core.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -/* - * This is the PKUnity sched_clock implementation. This has - * a resolution of 271ns, and a maximum value of 32025597s (370 days). - * - * The return value is guaranteed to be monotonic in that range as - * long as there is always less than 582 seconds between successive - * calls to this function. - * - * ( * 1E9 / CLOCK_TICK_RATE ) -> about 2235/32 - */ -unsigned long long sched_clock(void) -{ - unsigned long long v = cnt32_to_63(readl(OST_OSCR)); - - /* original conservative method, but overflow frequently - * v *= NSEC_PER_SEC >> 12; - * do_div(v, CLOCK_TICK_RATE >> 12); - */ - v = ((v & 0x7fffffffffffffffULL) * 2235) >> 5; - - return v; -} - -static struct resource puv3_usb_resources[] = { - /* order is significant! */ - { - .start = io_v2p(PKUNITY_USB_BASE), - .end = io_v2p(PKUNITY_USB_BASE) + 0x3ff, - .flags = IORESOURCE_MEM, - }, { - .start = IRQ_USB, - .flags = IORESOURCE_IRQ, - }, { - .start = IRQ_USB, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct musb_hdrc_config puv3_usb_config[] = { - { - .num_eps = 16, - .multipoint = 1, -#ifdef CONFIG_USB_INVENTRA_DMA - .dma = 1, - .dma_channels = 8, -#endif - }, -}; - -static struct musb_hdrc_platform_data puv3_usb_plat = { - .mode = MUSB_HOST, - .min_power = 100, - .clock = 0, - .config = puv3_usb_config, -}; - -static struct resource puv3_mmc_resources[] = { - [0] = { - .start = io_v2p(PKUNITY_SDC_BASE), - .end = io_v2p(PKUNITY_SDC_BASE) + 0xfff, - .flags = IORESOURCE_MEM, - }, - [1] = { - .start = IRQ_SDC, - .end = IRQ_SDC, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct resource puv3_unigfx_resources[] = { - [0] = { - .start = io_v2p(PKUNITY_UNIGFX_BASE), - .end = io_v2p(PKUNITY_UNIGFX_BASE) + 0xfff, - .flags = IORESOURCE_MEM, - }, -}; - -static struct resource puv3_rtc_resources[] = { - [0] = { - .start = io_v2p(PKUNITY_RTC_BASE), - .end = io_v2p(PKUNITY_RTC_BASE) + 0xff, - .flags = IORESOURCE_MEM, - }, - [1] = { - .start = IRQ_RTCAlarm, - .end = IRQ_RTCAlarm, - .flags = IORESOURCE_IRQ, - }, - [2] = { - .start = IRQ_RTC, - .end = IRQ_RTC, - .flags = IORESOURCE_IRQ - } -}; - -static struct resource puv3_pwm_resources[] = { - [0] = { - .start = io_v2p(PKUNITY_OST_BASE) + 0x80, - .end = io_v2p(PKUNITY_OST_BASE) + 0xff, - .flags = IORESOURCE_MEM, - }, -}; - -static struct resource puv3_uart0_resources[] = { - [0] = { - .start = io_v2p(PKUNITY_UART0_BASE), - .end = io_v2p(PKUNITY_UART0_BASE) + 0xff, - .flags = IORESOURCE_MEM, - }, - [1] = { - .start = IRQ_UART0, - .end = IRQ_UART0, - .flags = IORESOURCE_IRQ - } -}; - -static struct resource puv3_uart1_resources[] = { - [0] = { - .start = io_v2p(PKUNITY_UART1_BASE), - .end = io_v2p(PKUNITY_UART1_BASE) + 0xff, - .flags = IORESOURCE_MEM, - }, - [1] = { - .start = IRQ_UART1, - .end = IRQ_UART1, - .flags = IORESOURCE_IRQ - } -}; - -static struct resource puv3_umal_resources[] = { - [0] = { - .start = io_v2p(PKUNITY_UMAL_BASE), - .end = io_v2p(PKUNITY_UMAL_BASE) + 0x1fff, - .flags = IORESOURCE_MEM, - }, - [1] = { - .start = IRQ_UMAL, - .end = IRQ_UMAL, - .flags = IORESOURCE_IRQ - } -}; - -#ifdef CONFIG_PUV3_PM - -#define SAVE(x) sleep_save[SLEEP_SAVE_##x] = x -#define RESTORE(x) x = sleep_save[SLEEP_SAVE_##x] - -/* - * List of global PXA peripheral registers to preserve. - * More ones like CP and general purpose register values are preserved - * with the stack pointer in sleep.S. - */ -enum { - SLEEP_SAVE_PM_PLLDDRCFG, - SLEEP_SAVE_COUNT -}; - - -static void puv3_cpu_pm_save(unsigned long *sleep_save) -{ -/* SAVE(PM_PLLDDRCFG); */ -} - -static void puv3_cpu_pm_restore(unsigned long *sleep_save) -{ -/* RESTORE(PM_PLLDDRCFG); */ -} - -static int puv3_cpu_pm_prepare(void) -{ - /* set resume return address */ - writel(virt_to_phys(puv3_cpu_resume), PM_DIVCFG); - return 0; -} - -static void puv3_cpu_pm_enter(suspend_state_t state) -{ - /* Clear reset status */ - writel(RESETC_RSSR_HWR | RESETC_RSSR_WDR - | RESETC_RSSR_SMR | RESETC_RSSR_SWR, RESETC_RSSR); - - switch (state) { -/* case PM_SUSPEND_ON: - puv3_cpu_idle(); - break; */ - case PM_SUSPEND_MEM: - puv3_cpu_pm_prepare(); - puv3_cpu_suspend(PM_PMCR_SFB); - break; - } -} - -static int puv3_cpu_pm_valid(suspend_state_t state) -{ - return state == PM_SUSPEND_MEM; -} - -static void puv3_cpu_pm_finish(void) -{ - /* ensure not to come back here if it wasn't intended */ - /* PSPR = 0; */ -} - -static struct puv3_cpu_pm_fns puv3_cpu_pm_fnss = { - .save_count = SLEEP_SAVE_COUNT, - .valid = puv3_cpu_pm_valid, - .save = puv3_cpu_pm_save, - .restore = puv3_cpu_pm_restore, - .enter = puv3_cpu_pm_enter, - .prepare = puv3_cpu_pm_prepare, - .finish = puv3_cpu_pm_finish, -}; - -static void __init puv3_init_pm(void) -{ - puv3_cpu_pm_fns = &puv3_cpu_pm_fnss; -} -#else -static inline void puv3_init_pm(void) {} -#endif - -void puv3_ps2_init(void) -{ - struct clk *bclk32; - - bclk32 = clk_get(NULL, "BUS32_CLK"); - writel(clk_get_rate(bclk32) / 200000, PS2_CNT); /* should > 5us */ -} - -void __init puv3_core_init(void) -{ - puv3_init_pm(); - puv3_ps2_init(); - - platform_device_register_simple("PKUnity-v3-RTC", -1, - puv3_rtc_resources, ARRAY_SIZE(puv3_rtc_resources)); - platform_device_register_simple("PKUnity-v3-UMAL", -1, - puv3_umal_resources, ARRAY_SIZE(puv3_umal_resources)); - platform_device_register_simple("PKUnity-v3-MMC", -1, - puv3_mmc_resources, ARRAY_SIZE(puv3_mmc_resources)); - platform_device_register_simple("PKUnity-v3-UNIGFX", -1, - puv3_unigfx_resources, ARRAY_SIZE(puv3_unigfx_resources)); - platform_device_register_simple("PKUnity-v3-PWM", -1, - puv3_pwm_resources, ARRAY_SIZE(puv3_pwm_resources)); - platform_device_register_simple("PKUnity-v3-UART", 0, - puv3_uart0_resources, ARRAY_SIZE(puv3_uart0_resources)); - platform_device_register_simple("PKUnity-v3-UART", 1, - puv3_uart1_resources, ARRAY_SIZE(puv3_uart1_resources)); - platform_device_register_simple("PKUnity-v3-AC97", -1, NULL, 0); - platform_device_register_resndata(NULL, "musb_hdrc", -1, - puv3_usb_resources, ARRAY_SIZE(puv3_usb_resources), - &puv3_usb_plat, sizeof(puv3_usb_plat)); -} - diff --git a/arch/unicore32/kernel/puv3-nb0916.c b/arch/unicore32/kernel/puv3-nb0916.c deleted file mode 100644 index e251f5028396..000000000000 --- a/arch/unicore32/kernel/puv3-nb0916.c +++ /dev/null @@ -1,147 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/puv3-nb0916.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -static struct physmap_flash_data physmap_flash_data = { - .width = 1, -}; - -static struct resource physmap_flash_resource = { - .start = 0xFFF80000, - .end = 0xFFFFFFFF, - .flags = IORESOURCE_MEM, -}; - -static struct resource puv3_i2c_resources[] = { - [0] = { - .start = io_v2p(PKUNITY_I2C_BASE), - .end = io_v2p(PKUNITY_I2C_BASE) + 0xff, - .flags = IORESOURCE_MEM, - }, - [1] = { - .start = IRQ_I2C, - .end = IRQ_I2C, - .flags = IORESOURCE_IRQ, - } -}; - -static struct pwm_lookup nb0916_pwm_lookup[] = { - PWM_LOOKUP("PKUnity-v3-PWM", 0, "pwm-backlight", NULL, 70 * 1024, - PWM_POLARITY_NORMAL), -}; - -static struct platform_pwm_backlight_data nb0916_backlight_data = { - .max_brightness = 100, - .dft_brightness = 100, -}; - -static struct gpio_keys_button nb0916_gpio_keys[] = { - { - .type = EV_KEY, - .code = KEY_POWER, - .gpio = GPI_SOFF_REQ, - .desc = "Power Button", - .wakeup = 1, - .active_low = 1, - }, - { - .type = EV_KEY, - .code = BTN_TOUCH, - .gpio = GPI_BTN_TOUCH, - .desc = "Touchpad Button", - .wakeup = 1, - .active_low = 1, - }, -}; - -static struct gpio_keys_platform_data nb0916_gpio_button_data = { - .buttons = nb0916_gpio_keys, - .nbuttons = ARRAY_SIZE(nb0916_gpio_keys), -}; - -static irqreturn_t nb0916_lcdcaseoff_handler(int irq, void *dev_id) -{ - if (gpio_get_value(GPI_LCD_CASE_OFF)) - gpio_set_value(GPO_LCD_EN, 1); - else - gpio_set_value(GPO_LCD_EN, 0); - - return IRQ_HANDLED; -} - -static irqreturn_t nb0916_overheat_handler(int irq, void *dev_id) -{ - machine_halt(); - /* SYSTEM HALT, NO RETURN */ - return IRQ_HANDLED; -} - -static struct i2c_board_info __initdata puv3_i2c_devices[] = { - { I2C_BOARD_INFO("lm75", I2C_TAR_THERMAL), }, - { I2C_BOARD_INFO("bq27200", I2C_TAR_PWIC), }, - { I2C_BOARD_INFO("24c02", I2C_TAR_EEPROM), }, -}; - -int __init mach_nb0916_init(void) -{ - i2c_register_board_info(0, puv3_i2c_devices, - ARRAY_SIZE(puv3_i2c_devices)); - - platform_device_register_simple("PKUnity-v3-I2C", -1, - puv3_i2c_resources, ARRAY_SIZE(puv3_i2c_resources)); - - pwm_add_table(nb0916_pwm_lookup, ARRAY_SIZE(nb0916_pwm_lookup)); - - platform_device_register_data(NULL, "pwm-backlight", -1, - &nb0916_backlight_data, sizeof(nb0916_backlight_data)); - - platform_device_register_data(NULL, "gpio-keys", -1, - &nb0916_gpio_button_data, sizeof(nb0916_gpio_button_data)); - - platform_device_register_resndata(NULL, "physmap-flash", -1, - &physmap_flash_resource, 1, - &physmap_flash_data, sizeof(physmap_flash_data)); - - if (request_irq(gpio_to_irq(GPI_LCD_CASE_OFF), - &nb0916_lcdcaseoff_handler, - IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, - "NB0916 lcd case off", NULL) < 0) { - - printk(KERN_DEBUG "LCD-Case-OFF IRQ %d not available\n", - gpio_to_irq(GPI_LCD_CASE_OFF)); - } - - if (request_irq(gpio_to_irq(GPI_OTP_INT), &nb0916_overheat_handler, - IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, - "NB0916 overheating protection", NULL) < 0) { - - printk(KERN_DEBUG "Overheating Protection IRQ %d not available\n", - gpio_to_irq(GPI_OTP_INT)); - } - - return 0; -} - -subsys_initcall_sync(mach_nb0916_init); diff --git a/arch/unicore32/kernel/setup.c b/arch/unicore32/kernel/setup.c deleted file mode 100644 index 0c4242a5ee1d..000000000000 --- a/arch/unicore32/kernel/setup.c +++ /dev/null @@ -1,352 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/setup.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "setup.h" - -#ifndef MEM_SIZE -#define MEM_SIZE (16*1024*1024) -#endif - -struct stack { - u32 irq[3]; - u32 abt[3]; - u32 und[3]; -} ____cacheline_aligned; - -static struct stack stacks[NR_CPUS]; - -#ifdef CONFIG_VGA_CONSOLE -struct screen_info screen_info; -#endif - -char elf_platform[ELF_PLATFORM_SIZE]; -EXPORT_SYMBOL(elf_platform); - -static char __initdata cmd_line[COMMAND_LINE_SIZE]; - -static char default_command_line[COMMAND_LINE_SIZE] __initdata = CONFIG_CMDLINE; - -/* - * Standard memory resources - */ -static struct resource mem_res[] = { - { - .name = "Kernel code", - .start = 0, - .end = 0, - .flags = IORESOURCE_SYSTEM_RAM - }, - { - .name = "Kernel data", - .start = 0, - .end = 0, - .flags = IORESOURCE_SYSTEM_RAM - } -}; - -#define kernel_code mem_res[0] -#define kernel_data mem_res[1] - -/* - * These functions re-use the assembly code in head.S, which - * already provide the required functionality. - */ -static void __init setup_processor(void) -{ - printk(KERN_DEFAULT "CPU: UniCore-II [%08x] revision %d, cr=%08lx\n", - uc32_cpuid, (int)(uc32_cpuid >> 16) & 15, cr_alignment); - - sprintf(init_utsname()->machine, "puv3"); - sprintf(elf_platform, "ucv2"); -} - -/* - * cpu_init - initialise one CPU. - * - * cpu_init sets up the per-CPU stacks. - */ -void cpu_init(void) -{ - unsigned int cpu = smp_processor_id(); - struct stack *stk = &stacks[cpu]; - - /* - * setup stacks for re-entrant exception handlers - */ - __asm__ ( - "mov.a asr, %1\n\t" - "add sp, %0, %2\n\t" - "mov.a asr, %3\n\t" - "add sp, %0, %4\n\t" - "mov.a asr, %5\n\t" - "add sp, %0, %6\n\t" - "mov.a asr, %7" - : - : "r" (stk), - "r" (PSR_R_BIT | PSR_I_BIT | INTR_MODE), - "I" (offsetof(struct stack, irq[0])), - "r" (PSR_R_BIT | PSR_I_BIT | ABRT_MODE), - "I" (offsetof(struct stack, abt[0])), - "r" (PSR_R_BIT | PSR_I_BIT | EXTN_MODE), - "I" (offsetof(struct stack, und[0])), - "r" (PSR_R_BIT | PSR_I_BIT | PRIV_MODE) - : "r30", "cc"); -} - -static int __init uc32_add_memory(unsigned long start, unsigned long size) -{ - struct membank *bank = &meminfo.bank[meminfo.nr_banks]; - - if (meminfo.nr_banks >= NR_BANKS) { - printk(KERN_CRIT "NR_BANKS too low, " - "ignoring memory at %#lx\n", start); - return -EINVAL; - } - - /* - * Ensure that start/size are aligned to a page boundary. - * Size is appropriately rounded down, start is rounded up. - */ - size -= start & ~PAGE_MASK; - - bank->start = PAGE_ALIGN(start); - bank->size = size & PAGE_MASK; - - /* - * Check whether this memory region has non-zero size or - * invalid node number. - */ - if (bank->size == 0) - return -EINVAL; - - meminfo.nr_banks++; - return 0; -} - -/* - * Pick out the memory size. We look for mem=size@start, - * where start and size are "size[KkMm]" - */ -static int __init early_mem(char *p) -{ - static int usermem __initdata = 1; - unsigned long size, start; - char *endp; - - /* - * If the user specifies memory size, we - * blow away any automatically generated - * size. - */ - if (usermem) { - usermem = 0; - meminfo.nr_banks = 0; - } - - start = PHYS_OFFSET; - size = memparse(p, &endp); - if (*endp == '@') - start = memparse(endp + 1, NULL); - - uc32_add_memory(start, size); - - return 0; -} -early_param("mem", early_mem); - -static void __init -request_standard_resources(struct meminfo *mi) -{ - struct resource *res; - int i; - - kernel_code.start = virt_to_phys(_stext); - kernel_code.end = virt_to_phys(_etext - 1); - kernel_data.start = virt_to_phys(_sdata); - kernel_data.end = virt_to_phys(_end - 1); - - for (i = 0; i < mi->nr_banks; i++) { - if (mi->bank[i].size == 0) - continue; - - res = memblock_alloc_low(sizeof(*res), SMP_CACHE_BYTES); - if (!res) - panic("%s: Failed to allocate %zu bytes align=%x\n", - __func__, sizeof(*res), SMP_CACHE_BYTES); - - res->name = "System RAM"; - res->start = mi->bank[i].start; - res->end = mi->bank[i].start + mi->bank[i].size - 1; - res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - - request_resource(&iomem_resource, res); - - if (kernel_code.start >= res->start && - kernel_code.end <= res->end) - request_resource(res, &kernel_code); - if (kernel_data.start >= res->start && - kernel_data.end <= res->end) - request_resource(res, &kernel_data); - } -} - -static void (*init_machine)(void) __initdata; - -static int __init customize_machine(void) -{ - /* customizes platform devices, or adds new ones */ - if (init_machine) - init_machine(); - return 0; -} -arch_initcall(customize_machine); - -void __init setup_arch(char **cmdline_p) -{ - char *from = default_command_line; - - setup_processor(); - - init_mm.start_code = (unsigned long) _stext; - init_mm.end_code = (unsigned long) _etext; - init_mm.end_data = (unsigned long) _edata; - init_mm.brk = (unsigned long) _end; - - /* parse_early_param needs a boot_command_line */ - strlcpy(boot_command_line, from, COMMAND_LINE_SIZE); - - /* populate cmd_line too for later use, preserving boot_command_line */ - strlcpy(cmd_line, boot_command_line, COMMAND_LINE_SIZE); - *cmdline_p = cmd_line; - - parse_early_param(); - - uc32_memblock_init(&meminfo); - - paging_init(); - request_standard_resources(&meminfo); - - cpu_init(); - - /* - * Set up various architecture-specific pointers - */ - init_machine = puv3_core_init; - -#ifdef CONFIG_VT -#if defined(CONFIG_VGA_CONSOLE) - conswitchp = &vga_con; -#endif -#endif - early_trap_init(); -} - -static struct cpu cpuinfo_unicore; - -static int __init topology_init(void) -{ - int i; - - for_each_possible_cpu(i) - register_cpu(&cpuinfo_unicore, i); - - return 0; -} -subsys_initcall(topology_init); - -#ifdef CONFIG_HAVE_PROC_CPU -static int __init proc_cpu_init(void) -{ - struct proc_dir_entry *res; - - res = proc_mkdir("cpu", NULL); - if (!res) - return -ENOMEM; - return 0; -} -fs_initcall(proc_cpu_init); -#endif - -static int c_show(struct seq_file *m, void *v) -{ - seq_printf(m, "Processor\t: UniCore-II rev %d (%s)\n", - (int)(uc32_cpuid >> 16) & 15, elf_platform); - - seq_printf(m, "BogoMIPS\t: %lu.%02lu\n", - loops_per_jiffy / (500000/HZ), - (loops_per_jiffy / (5000/HZ)) % 100); - - /* dump out the processor features */ - seq_puts(m, "Features\t: CMOV UC-F64"); - - seq_printf(m, "\nCPU implementer\t: 0x%02x\n", uc32_cpuid >> 24); - seq_printf(m, "CPU architecture: 2\n"); - seq_printf(m, "CPU revision\t: %d\n", (uc32_cpuid >> 16) & 15); - - seq_printf(m, "Cache type\t: write-back\n" - "Cache clean\t: cp0 c5 ops\n" - "Cache lockdown\t: not support\n" - "Cache format\t: Harvard\n"); - - seq_puts(m, "\n"); - - seq_printf(m, "Hardware\t: PKUnity v3\n"); - - return 0; -} - -static void *c_start(struct seq_file *m, loff_t *pos) -{ - return *pos < 1 ? (void *)1 : NULL; -} - -static void *c_next(struct seq_file *m, void *v, loff_t *pos) -{ - ++*pos; - return NULL; -} - -static void c_stop(struct seq_file *m, void *v) -{ -} - -const struct seq_operations cpuinfo_op = { - .start = c_start, - .next = c_next, - .stop = c_stop, - .show = c_show -}; diff --git a/arch/unicore32/kernel/setup.h b/arch/unicore32/kernel/setup.h deleted file mode 100644 index 967352323185..000000000000 --- a/arch/unicore32/kernel/setup.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/kernel/setup.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#ifndef __UNICORE_KERNEL_SETUP_H__ -#define __UNICORE_KERNEL_SETUP_H__ - -#include - -extern void paging_init(void); -extern void puv3_core_init(void); -extern void cpu_init(void); - -extern void puv3_ps2_init(void); -extern void pci_puv3_preinit(void); -extern void __init puv3_init_gpio(void); - -extern void setup_mm_for_reboot(void); - -extern char __stubs_start[], __stubs_end[]; -extern char __vectors_start[], __vectors_end[]; - -extern void kernel_thread_helper(void); - -extern void __init early_signal_init(void); - -extern asmlinkage void __backtrace(void); -extern asmlinkage void c_backtrace(unsigned long fp, const char *loglvl); - -extern void __show_regs(struct pt_regs *); - -#endif diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c deleted file mode 100644 index 3946182a835d..000000000000 --- a/arch/unicore32/kernel/signal.c +++ /dev/null @@ -1,424 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/signal.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -/* - * For UniCore syscalls, we encode the syscall number into the instruction. - */ -#define SWI_SYS_SIGRETURN (0xff000000) /* error number for new abi */ -#define SWI_SYS_RT_SIGRETURN (0xff000000 | (__NR_rt_sigreturn)) -#define SWI_SYS_RESTART (0xff000000 | (__NR_restart_syscall)) - -#define KERN_SIGRETURN_CODE (KUSER_VECPAGE_BASE + 0x00000500) -#define KERN_RESTART_CODE (KERN_SIGRETURN_CODE + sizeof(sigreturn_codes)) - -const unsigned long sigreturn_codes[3] = { - SWI_SYS_SIGRETURN, SWI_SYS_RT_SIGRETURN, -}; - -const unsigned long syscall_restart_code[2] = { - SWI_SYS_RESTART, /* swi __NR_restart_syscall */ - 0x69efc004, /* ldr pc, [sp], #4 */ -}; - -/* - * Do a signal return; undo the signal stack. These are aligned to 64-bit. - */ -struct sigframe { - struct ucontext uc; - unsigned long retcode[2]; -}; - -struct rt_sigframe { - struct siginfo info; - struct sigframe sig; -}; - -static int restore_sigframe(struct pt_regs *regs, struct sigframe __user *sf) -{ - sigset_t set; - int err; - - err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set)); - if (err == 0) - set_current_blocked(&set); - - err |= __get_user(regs->UCreg_00, &sf->uc.uc_mcontext.regs.UCreg_00); - err |= __get_user(regs->UCreg_01, &sf->uc.uc_mcontext.regs.UCreg_01); - err |= __get_user(regs->UCreg_02, &sf->uc.uc_mcontext.regs.UCreg_02); - err |= __get_user(regs->UCreg_03, &sf->uc.uc_mcontext.regs.UCreg_03); - err |= __get_user(regs->UCreg_04, &sf->uc.uc_mcontext.regs.UCreg_04); - err |= __get_user(regs->UCreg_05, &sf->uc.uc_mcontext.regs.UCreg_05); - err |= __get_user(regs->UCreg_06, &sf->uc.uc_mcontext.regs.UCreg_06); - err |= __get_user(regs->UCreg_07, &sf->uc.uc_mcontext.regs.UCreg_07); - err |= __get_user(regs->UCreg_08, &sf->uc.uc_mcontext.regs.UCreg_08); - err |= __get_user(regs->UCreg_09, &sf->uc.uc_mcontext.regs.UCreg_09); - err |= __get_user(regs->UCreg_10, &sf->uc.uc_mcontext.regs.UCreg_10); - err |= __get_user(regs->UCreg_11, &sf->uc.uc_mcontext.regs.UCreg_11); - err |= __get_user(regs->UCreg_12, &sf->uc.uc_mcontext.regs.UCreg_12); - err |= __get_user(regs->UCreg_13, &sf->uc.uc_mcontext.regs.UCreg_13); - err |= __get_user(regs->UCreg_14, &sf->uc.uc_mcontext.regs.UCreg_14); - err |= __get_user(regs->UCreg_15, &sf->uc.uc_mcontext.regs.UCreg_15); - err |= __get_user(regs->UCreg_16, &sf->uc.uc_mcontext.regs.UCreg_16); - err |= __get_user(regs->UCreg_17, &sf->uc.uc_mcontext.regs.UCreg_17); - err |= __get_user(regs->UCreg_18, &sf->uc.uc_mcontext.regs.UCreg_18); - err |= __get_user(regs->UCreg_19, &sf->uc.uc_mcontext.regs.UCreg_19); - err |= __get_user(regs->UCreg_20, &sf->uc.uc_mcontext.regs.UCreg_20); - err |= __get_user(regs->UCreg_21, &sf->uc.uc_mcontext.regs.UCreg_21); - err |= __get_user(regs->UCreg_22, &sf->uc.uc_mcontext.regs.UCreg_22); - err |= __get_user(regs->UCreg_23, &sf->uc.uc_mcontext.regs.UCreg_23); - err |= __get_user(regs->UCreg_24, &sf->uc.uc_mcontext.regs.UCreg_24); - err |= __get_user(regs->UCreg_25, &sf->uc.uc_mcontext.regs.UCreg_25); - err |= __get_user(regs->UCreg_26, &sf->uc.uc_mcontext.regs.UCreg_26); - err |= __get_user(regs->UCreg_fp, &sf->uc.uc_mcontext.regs.UCreg_fp); - err |= __get_user(regs->UCreg_ip, &sf->uc.uc_mcontext.regs.UCreg_ip); - err |= __get_user(regs->UCreg_sp, &sf->uc.uc_mcontext.regs.UCreg_sp); - err |= __get_user(regs->UCreg_lr, &sf->uc.uc_mcontext.regs.UCreg_lr); - err |= __get_user(regs->UCreg_pc, &sf->uc.uc_mcontext.regs.UCreg_pc); - err |= __get_user(regs->UCreg_asr, &sf->uc.uc_mcontext.regs.UCreg_asr); - - err |= !valid_user_regs(regs); - - return err; -} - -asmlinkage int __sys_rt_sigreturn(struct pt_regs *regs) -{ - struct rt_sigframe __user *frame; - - /* Always make any pending restarted system calls return -EINTR */ - current->restart_block.fn = do_no_restart_syscall; - - /* - * Since we stacked the signal on a 64-bit boundary, - * then 'sp' should be word aligned here. If it's - * not, then the user is trying to mess with us. - */ - if (regs->UCreg_sp & 7) - goto badframe; - - frame = (struct rt_sigframe __user *)regs->UCreg_sp; - - if (!access_ok(frame, sizeof(*frame))) - goto badframe; - - if (restore_sigframe(regs, &frame->sig)) - goto badframe; - - if (restore_altstack(&frame->sig.uc.uc_stack)) - goto badframe; - - return regs->UCreg_00; - -badframe: - force_sig(SIGSEGV); - return 0; -} - -static int setup_sigframe(struct sigframe __user *sf, struct pt_regs *regs, - sigset_t *set) -{ - int err = 0; - - err |= __put_user(regs->UCreg_00, &sf->uc.uc_mcontext.regs.UCreg_00); - err |= __put_user(regs->UCreg_01, &sf->uc.uc_mcontext.regs.UCreg_01); - err |= __put_user(regs->UCreg_02, &sf->uc.uc_mcontext.regs.UCreg_02); - err |= __put_user(regs->UCreg_03, &sf->uc.uc_mcontext.regs.UCreg_03); - err |= __put_user(regs->UCreg_04, &sf->uc.uc_mcontext.regs.UCreg_04); - err |= __put_user(regs->UCreg_05, &sf->uc.uc_mcontext.regs.UCreg_05); - err |= __put_user(regs->UCreg_06, &sf->uc.uc_mcontext.regs.UCreg_06); - err |= __put_user(regs->UCreg_07, &sf->uc.uc_mcontext.regs.UCreg_07); - err |= __put_user(regs->UCreg_08, &sf->uc.uc_mcontext.regs.UCreg_08); - err |= __put_user(regs->UCreg_09, &sf->uc.uc_mcontext.regs.UCreg_09); - err |= __put_user(regs->UCreg_10, &sf->uc.uc_mcontext.regs.UCreg_10); - err |= __put_user(regs->UCreg_11, &sf->uc.uc_mcontext.regs.UCreg_11); - err |= __put_user(regs->UCreg_12, &sf->uc.uc_mcontext.regs.UCreg_12); - err |= __put_user(regs->UCreg_13, &sf->uc.uc_mcontext.regs.UCreg_13); - err |= __put_user(regs->UCreg_14, &sf->uc.uc_mcontext.regs.UCreg_14); - err |= __put_user(regs->UCreg_15, &sf->uc.uc_mcontext.regs.UCreg_15); - err |= __put_user(regs->UCreg_16, &sf->uc.uc_mcontext.regs.UCreg_16); - err |= __put_user(regs->UCreg_17, &sf->uc.uc_mcontext.regs.UCreg_17); - err |= __put_user(regs->UCreg_18, &sf->uc.uc_mcontext.regs.UCreg_18); - err |= __put_user(regs->UCreg_19, &sf->uc.uc_mcontext.regs.UCreg_19); - err |= __put_user(regs->UCreg_20, &sf->uc.uc_mcontext.regs.UCreg_20); - err |= __put_user(regs->UCreg_21, &sf->uc.uc_mcontext.regs.UCreg_21); - err |= __put_user(regs->UCreg_22, &sf->uc.uc_mcontext.regs.UCreg_22); - err |= __put_user(regs->UCreg_23, &sf->uc.uc_mcontext.regs.UCreg_23); - err |= __put_user(regs->UCreg_24, &sf->uc.uc_mcontext.regs.UCreg_24); - err |= __put_user(regs->UCreg_25, &sf->uc.uc_mcontext.regs.UCreg_25); - err |= __put_user(regs->UCreg_26, &sf->uc.uc_mcontext.regs.UCreg_26); - err |= __put_user(regs->UCreg_fp, &sf->uc.uc_mcontext.regs.UCreg_fp); - err |= __put_user(regs->UCreg_ip, &sf->uc.uc_mcontext.regs.UCreg_ip); - err |= __put_user(regs->UCreg_sp, &sf->uc.uc_mcontext.regs.UCreg_sp); - err |= __put_user(regs->UCreg_lr, &sf->uc.uc_mcontext.regs.UCreg_lr); - err |= __put_user(regs->UCreg_pc, &sf->uc.uc_mcontext.regs.UCreg_pc); - err |= __put_user(regs->UCreg_asr, &sf->uc.uc_mcontext.regs.UCreg_asr); - - err |= __put_user(current->thread.trap_no, - &sf->uc.uc_mcontext.trap_no); - err |= __put_user(current->thread.error_code, - &sf->uc.uc_mcontext.error_code); - err |= __put_user(current->thread.address, - &sf->uc.uc_mcontext.fault_address); - err |= __put_user(set->sig[0], &sf->uc.uc_mcontext.oldmask); - - err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set)); - - return err; -} - -static inline void __user *get_sigframe(struct k_sigaction *ka, - struct pt_regs *regs, int framesize) -{ - unsigned long sp = regs->UCreg_sp; - void __user *frame; - - /* - * This is the X/Open sanctioned signal stack switching. - */ - if ((ka->sa.sa_flags & SA_ONSTACK) && !sas_ss_flags(sp)) - sp = current->sas_ss_sp + current->sas_ss_size; - - /* - * ATPCS B01 mandates 8-byte alignment - */ - frame = (void __user *)((sp - framesize) & ~7); - - /* - * Check that we can actually write to the signal frame. - */ - if (!access_ok(frame, framesize)) - frame = NULL; - - return frame; -} - -static int setup_return(struct pt_regs *regs, struct k_sigaction *ka, - unsigned long __user *rc, void __user *frame, int usig) -{ - unsigned long handler = (unsigned long)ka->sa.sa_handler; - unsigned long retcode; - unsigned long asr = regs->UCreg_asr & ~PSR_f; - - unsigned int idx = 0; - - if (ka->sa.sa_flags & SA_SIGINFO) - idx += 1; - - if (__put_user(sigreturn_codes[idx], rc) || - __put_user(sigreturn_codes[idx+1], rc+1)) - return 1; - - retcode = KERN_SIGRETURN_CODE + (idx << 2); - - regs->UCreg_00 = usig; - regs->UCreg_sp = (unsigned long)frame; - regs->UCreg_lr = retcode; - regs->UCreg_pc = handler; - regs->UCreg_asr = asr; - - return 0; -} - -static int setup_frame(struct ksignal *ksig, sigset_t *set, - struct pt_regs *regs) -{ - struct sigframe __user *frame = get_sigframe(&ksig->ka, regs, sizeof(*frame)); - int err = 0; - - if (!frame) - return 1; - - /* - * Set uc.uc_flags to a value which sc.trap_no would never have. - */ - err |= __put_user(0x5ac3c35a, &frame->uc.uc_flags); - - err |= setup_sigframe(frame, regs, set); - if (err == 0) - err |= setup_return(regs, &ksig->ka, frame->retcode, frame, - ksig->sig); - - return err; -} - -static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, - struct pt_regs *regs) -{ - struct rt_sigframe __user *frame = - get_sigframe(&ksig->ka, regs, sizeof(*frame)); - int err = 0; - - if (!frame) - return 1; - - err |= copy_siginfo_to_user(&frame->info, &ksig->info); - - err |= __put_user(0, &frame->sig.uc.uc_flags); - err |= __put_user(NULL, &frame->sig.uc.uc_link); - err |= __save_altstack(&frame->sig.uc.uc_stack, regs->UCreg_sp); - err |= setup_sigframe(&frame->sig, regs, set); - if (err == 0) - err |= setup_return(regs, &ksig->ka, frame->sig.retcode, frame, - ksig->sig); - - if (err == 0) { - /* - * For realtime signals we must also set the second and third - * arguments for the signal handler. - */ - regs->UCreg_01 = (unsigned long)&frame->info; - regs->UCreg_02 = (unsigned long)&frame->sig.uc; - } - - return err; -} - -static inline void setup_syscall_restart(struct pt_regs *regs) -{ - regs->UCreg_00 = regs->UCreg_ORIG_00; - regs->UCreg_pc -= 4; -} - -/* - * OK, we're invoking a handler - */ -static void handle_signal(struct ksignal *ksig, struct pt_regs *regs, - int syscall) -{ - struct thread_info *thread = current_thread_info(); - sigset_t *oldset = sigmask_to_save(); - int usig = ksig->sig; - int ret; - - /* - * If we were from a system call, check for system call restarting... - */ - if (syscall) { - switch (regs->UCreg_00) { - case -ERESTART_RESTARTBLOCK: - case -ERESTARTNOHAND: - regs->UCreg_00 = -EINTR; - break; - case -ERESTARTSYS: - if (!(ksig->ka.sa.sa_flags & SA_RESTART)) { - regs->UCreg_00 = -EINTR; - break; - } - /* fallthrough */ - case -ERESTARTNOINTR: - setup_syscall_restart(regs); - } - } - - /* - * Set up the stack frame - */ - if (ksig->ka.sa.sa_flags & SA_SIGINFO) - ret = setup_rt_frame(ksig, oldset, regs); - else - ret = setup_frame(ksig, oldset, regs); - - /* - * Check that the resulting registers are actually sane. - */ - ret |= !valid_user_regs(regs); - - signal_setup_done(ret, ksig, 0); -} - -/* - * Note that 'init' is a special process: it doesn't get signals it doesn't - * want to handle. Thus you cannot kill init even with a SIGKILL even by - * mistake. - * - * Note that we go through the signals twice: once to check the signals that - * the kernel can handle, and then we build all the user-level signal handling - * stack-frames in one go after that. - */ -static void do_signal(struct pt_regs *regs, int syscall) -{ - struct ksignal ksig; - - /* - * We want the common case to go fast, which - * is why we may in certain cases get here from - * kernel mode. Just return without doing anything - * if so. - */ - if (!user_mode(regs)) - return; - - if (get_signal(&ksig)) { - handle_signal(&ksig, regs, syscall); - return; - } - - /* - * No signal to deliver to the process - restart the syscall. - */ - if (syscall) { - if (regs->UCreg_00 == -ERESTART_RESTARTBLOCK) { - u32 __user *usp; - - regs->UCreg_sp -= 4; - usp = (u32 __user *)regs->UCreg_sp; - - if (put_user(regs->UCreg_pc, usp) == 0) { - regs->UCreg_pc = KERN_RESTART_CODE; - } else { - regs->UCreg_sp += 4; - force_sigsegv(0); - } - } - if (regs->UCreg_00 == -ERESTARTNOHAND || - regs->UCreg_00 == -ERESTARTSYS || - regs->UCreg_00 == -ERESTARTNOINTR) { - setup_syscall_restart(regs); - } - } - /* If there's no signal to deliver, we just put the saved - * sigmask back. - */ - restore_saved_sigmask(); -} - -asmlinkage void do_notify_resume(struct pt_regs *regs, - unsigned int thread_flags, int syscall) -{ - if (thread_flags & _TIF_SIGPENDING) - do_signal(regs, syscall); - - if (thread_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); - tracehook_notify_resume(regs); - } -} - -/* - * Copy signal return handlers into the vector page, and - * set sigreturn to be a pointer to these. - */ -void __init early_signal_init(void) -{ - memcpy((void *)kuser_vecpage_to_vectors(KERN_SIGRETURN_CODE), - sigreturn_codes, sizeof(sigreturn_codes)); - memcpy((void *)kuser_vecpage_to_vectors(KERN_RESTART_CODE), - syscall_restart_code, sizeof(syscall_restart_code)); - /* Need not to flush icache, since early_trap_init will do it last. */ -} diff --git a/arch/unicore32/kernel/sleep.S b/arch/unicore32/kernel/sleep.S deleted file mode 100644 index 23151abe53c6..000000000000 --- a/arch/unicore32/kernel/sleep.S +++ /dev/null @@ -1,199 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/kernel/sleep.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ - -#include -#include -#include - - .text - -pkunity_cpu_save_cp: - - @ get coprocessor registers - - movc r3, p0.c7, #0 @ PID - movc r4, p0.c2, #0 @ translation table base addr - movc r5, p0.c1, #0 @ control reg - - - @ store them plus current virtual stack ptr on stack - mov r6, sp - stm.w (r3 - r6), [sp-] - - mov pc, lr - -pkunity_cpu_save_sp: - @ preserve phys address of stack - mov r0, sp - stw.w lr, [sp+], #-4 - b.l sleep_phys_sp - ldw r1, =sleep_save_sp - stw r0, [r1] - ldw.w pc, [sp]+, #4 - -/* - * puv3_cpu_suspend() - * - * Forces CPU into sleep state. - * - * r0 = value for PWRMODE M field for desired sleep state - */ - -ENTRY(puv3_cpu_suspend) - stm.w (r16 - r27, lr), [sp-] @ save registers on stack - stm.w (r4 - r15), [sp-] @ save registers on stack - -#ifdef CONFIG_UNICORE_FPU_F64 - sfm.w (f0 - f7 ), [sp-] - sfm.w (f8 - f15), [sp-] - sfm.w (f16 - f23), [sp-] - sfm.w (f24 - f31), [sp-] - cff r4, s31 - stm.w (r4), [sp-] -#endif - b.l pkunity_cpu_save_cp - - b.l pkunity_cpu_save_sp - - @ clean data cache - mov r1, #0 - movc p0.c5, r1, #14 - nop - nop - nop - nop - - - - @ DDR2 BaseAddr - ldw r0, =(PKUNITY_DDR2CTRL_BASE) - - @ PM BaseAddr - ldw r1, =(PKUNITY_PM_BASE) - - @ set PLL_SYS_CFG reg, 275 - movl r6, #0x00002401 - stw r6, [r1+], #0x18 - @ set PLL_DDR_CFG reg, 66MHz - movl r6, #0x00100c00 - stw r6, [r1+], #0x1c - - @ set wake up source - movl r8, #0x800001ff @ epip4d - stw r8, [r1+], #0xc - - @ set PGSR - movl r5, #0x40000 - stw r5, [r1+], #0x10 - - @ prepare DDR2 refresh settings - ldw r5, [r0+], #0x24 - or r5, r5, #0x00000001 - - @ prepare PMCR for PLL changing - movl r6, #0xc - - @ prepare for closing PLL - movl r7, #0x1 - - @ prepare sleep mode - mov r8, #0x1 - -@ movl r0, 0x11111111 -@ put_word_ocd r0 - b pkunity_cpu_do_suspend - - .ltorg - .align 5 -pkunity_cpu_do_suspend: - b 101f - @ put DDR2 into self-refresh -100: stw r5, [r0+], #0x24 - @ change PLL - stw r6, [r1] - b 1f - - .ltorg - .align 5 -101: b 102f - @ wait for PLL changing complete -1: ldw r6, [r1+], #0x44 - csub.a r6, #0x1 - bne 1b - b 2f - - .ltorg - .align 5 -102: b 100b - @ close PLL -2: stw r7, [r1+], #0x4 - @ enter sleep mode - stw r8, [r1] -3: b 3b - - - - -/* - * puv3_cpu_resume() - * - * entry point from bootloader into kernel during resume - * - * Note: Yes, part of the following code is located into the .data section. - * This is to allow sleep_save_sp to be accessed with a relative load - * while we can't rely on any MMU translation. We could have put - * sleep_save_sp in the .text section as well, but some setups might - * insist on it to be truly read-only. - */ - - .data - .align 5 -ENTRY(puv3_cpu_resume) -@ movl r0, 0x20202020 -@ put_word_ocd r0 - - ldw r0, sleep_save_sp @ stack phys addr - ldw r2, =resume_after_mmu @ its absolute virtual address - ldm (r3 - r6), [r0]+ @ CP regs + virt stack ptr - mov sp, r6 @ CP regs + virt stack ptr - - mov r1, #0 - movc p0.c6, r1, #6 @ invalidate I & D TLBs - movc p0.c5, r1, #28 @ invalidate I & D caches, BTB - - movc p0.c7, r3, #0 @ PID - movc p0.c2, r4, #0 @ translation table base addr - movc p0.c1, r5, #0 @ control reg, turn on mmu - nop - jump r2 - nop - nop - nop - nop - nop - -sleep_save_sp: - .word 0 @ preserve stack phys ptr here - - .text -resume_after_mmu: -@ movl r0, 0x30303030 -@ put_word_ocd r0 - -#ifdef CONFIG_UNICORE_FPU_F64 - lfm.w (f0 - f7 ), [sp]+ - lfm.w (f8 - f15), [sp]+ - lfm.w (f16 - f23), [sp]+ - lfm.w (f24 - f31), [sp]+ - ldm.w (r4), [sp]+ - ctf r4, s31 -#endif - ldm.w (r4 - r15), [sp]+ @ restore registers from stack - ldm.w (r16 - r27, pc), [sp]+ @ return to caller diff --git a/arch/unicore32/kernel/stacktrace.c b/arch/unicore32/kernel/stacktrace.c deleted file mode 100644 index c9d8650e9d78..000000000000 --- a/arch/unicore32/kernel/stacktrace.c +++ /dev/null @@ -1,127 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/stacktrace.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include - -#include - -#if defined(CONFIG_FRAME_POINTER) -/* - * Unwind the current stack frame and store the new register values in the - * structure passed as argument. Unwinding is equivalent to a function return, - * hence the new PC value rather than LR should be used for backtrace. - * - * With framepointer enabled, a simple function prologue looks like this: - * mov ip, sp - * stmdb sp!, {fp, ip, lr, pc} - * sub fp, ip, #4 - * - * A simple function epilogue looks like this: - * ldm sp, {fp, sp, pc} - * - * Note that with framepointer enabled, even the leaf functions have the same - * prologue and epilogue, therefore we can ignore the LR value in this case. - */ -int notrace unwind_frame(struct stackframe *frame) -{ - unsigned long high, low; - unsigned long fp = frame->fp; - - /* only go to a higher address on the stack */ - low = frame->sp; - high = ALIGN(low, THREAD_SIZE); - - /* check current frame pointer is within bounds */ - if (fp < (low + 12) || fp + 4 >= high) - return -EINVAL; - - /* restore the registers from the stack frame */ - frame->fp = *(unsigned long *)(fp - 12); - frame->sp = *(unsigned long *)(fp - 8); - frame->pc = *(unsigned long *)(fp - 4); - - return 0; -} -#endif - -void notrace walk_stackframe(struct stackframe *frame, - int (*fn)(struct stackframe *, void *), void *data) -{ - while (1) { - int ret; - - if (fn(frame, data)) - break; - ret = unwind_frame(frame); - if (ret < 0) - break; - } -} -EXPORT_SYMBOL(walk_stackframe); - -#ifdef CONFIG_STACKTRACE -struct stack_trace_data { - struct stack_trace *trace; - unsigned int no_sched_functions; - unsigned int skip; -}; - -static int save_trace(struct stackframe *frame, void *d) -{ - struct stack_trace_data *data = d; - struct stack_trace *trace = data->trace; - unsigned long addr = frame->pc; - - if (data->no_sched_functions && in_sched_functions(addr)) - return 0; - if (data->skip) { - data->skip--; - return 0; - } - - trace->entries[trace->nr_entries++] = addr; - - return trace->nr_entries >= trace->max_entries; -} - -void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) -{ - struct stack_trace_data data; - struct stackframe frame; - - data.trace = trace; - data.skip = trace->skip; - - if (tsk != current) { - data.no_sched_functions = 1; - frame.fp = thread_saved_fp(tsk); - frame.sp = thread_saved_sp(tsk); - frame.lr = 0; /* recovered from the stack */ - frame.pc = thread_saved_pc(tsk); - } else { - register unsigned long current_sp asm("sp"); - - data.no_sched_functions = 0; - frame.fp = (unsigned long)__builtin_frame_address(0); - frame.sp = current_sp; - frame.lr = (unsigned long)__builtin_return_address(0); - frame.pc = (unsigned long)save_stack_trace_tsk; - } - - walk_stackframe(&frame, save_trace, &data); -} - -void save_stack_trace(struct stack_trace *trace) -{ - save_stack_trace_tsk(current, trace); -} -EXPORT_SYMBOL_GPL(save_stack_trace); -#endif diff --git a/arch/unicore32/kernel/sys.c b/arch/unicore32/kernel/sys.c deleted file mode 100644 index 256fb4082296..000000000000 --- a/arch/unicore32/kernel/sys.c +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/sys.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -/* Provide the actual syscall number to call mapping. */ -#undef __SYSCALL -#define __SYSCALL(nr, call) [nr] = (call), - -#define sys_mmap2 sys_mmap_pgoff -/* Note that we don't include but */ -void *sys_call_table[__NR_syscalls] = { - [0 ... __NR_syscalls-1] = sys_ni_syscall, -#include -}; diff --git a/arch/unicore32/kernel/time.c b/arch/unicore32/kernel/time.c deleted file mode 100644 index c3a37edf4d40..000000000000 --- a/arch/unicore32/kernel/time.c +++ /dev/null @@ -1,128 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/time.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Maintained by GUAN Xue-tao - * Copyright (C) 2001-2010 Guan Xuetao - */ -#include -#include -#include -#include -#include -#include - -#include - -#define MIN_OSCR_DELTA 2 - -static irqreturn_t puv3_ost0_interrupt(int irq, void *dev_id) -{ - struct clock_event_device *c = dev_id; - - /* Disarm the compare/match, signal the event. */ - writel(readl(OST_OIER) & ~OST_OIER_E0, OST_OIER); - writel(readl(OST_OSSR) & ~OST_OSSR_M0, OST_OSSR); - c->event_handler(c); - - return IRQ_HANDLED; -} - -static int -puv3_osmr0_set_next_event(unsigned long delta, struct clock_event_device *c) -{ - unsigned long next, oscr; - - writel(readl(OST_OIER) | OST_OIER_E0, OST_OIER); - next = readl(OST_OSCR) + delta; - writel(next, OST_OSMR0); - oscr = readl(OST_OSCR); - - return (signed)(next - oscr) <= MIN_OSCR_DELTA ? -ETIME : 0; -} - -static int puv3_osmr0_shutdown(struct clock_event_device *evt) -{ - writel(readl(OST_OIER) & ~OST_OIER_E0, OST_OIER); - writel(readl(OST_OSSR) & ~OST_OSSR_M0, OST_OSSR); - return 0; -} - -static struct clock_event_device ckevt_puv3_osmr0 = { - .name = "osmr0", - .features = CLOCK_EVT_FEAT_ONESHOT, - .rating = 200, - .set_next_event = puv3_osmr0_set_next_event, - .set_state_shutdown = puv3_osmr0_shutdown, - .set_state_oneshot = puv3_osmr0_shutdown, -}; - -static u64 puv3_read_oscr(struct clocksource *cs) -{ - return readl(OST_OSCR); -} - -static struct clocksource cksrc_puv3_oscr = { - .name = "oscr", - .rating = 200, - .read = puv3_read_oscr, - .mask = CLOCKSOURCE_MASK(32), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - -void __init time_init(void) -{ - writel(0, OST_OIER); /* disable any timer interrupts */ - writel(0, OST_OSSR); /* clear status on all timers */ - - clockevents_calc_mult_shift(&ckevt_puv3_osmr0, CLOCK_TICK_RATE, 5); - - ckevt_puv3_osmr0.max_delta_ns = - clockevent_delta2ns(0x7fffffff, &ckevt_puv3_osmr0); - ckevt_puv3_osmr0.max_delta_ticks = 0x7fffffff; - ckevt_puv3_osmr0.min_delta_ns = - clockevent_delta2ns(MIN_OSCR_DELTA * 2, &ckevt_puv3_osmr0) + 1; - ckevt_puv3_osmr0.min_delta_ticks = MIN_OSCR_DELTA * 2; - ckevt_puv3_osmr0.cpumask = cpumask_of(0); - - if (request_irq(IRQ_TIMER0, puv3_ost0_interrupt, - IRQF_TIMER | IRQF_IRQPOLL, "ost0", &ckevt_puv3_osmr0)) - pr_err("Failed to register ost0 interrupt\n"); - - clocksource_register_hz(&cksrc_puv3_oscr, CLOCK_TICK_RATE); - clockevents_register_device(&ckevt_puv3_osmr0); -} - -#ifdef CONFIG_PM -unsigned long osmr[4], oier; - -void puv3_timer_suspend(void) -{ - osmr[0] = readl(OST_OSMR0); - osmr[1] = readl(OST_OSMR1); - osmr[2] = readl(OST_OSMR2); - osmr[3] = readl(OST_OSMR3); - oier = readl(OST_OIER); -} - -void puv3_timer_resume(void) -{ - writel(0, OST_OSSR); - writel(osmr[0], OST_OSMR0); - writel(osmr[1], OST_OSMR1); - writel(osmr[2], OST_OSMR2); - writel(osmr[3], OST_OSMR3); - writel(oier, OST_OIER); - - /* - * OSMR0 is the system timer: make sure OSCR is sufficiently behind - */ - writel(readl(OST_OSMR0) - LATCH, OST_OSCR); -} -#else -void puv3_timer_suspend(void) { }; -void puv3_timer_resume(void) { }; -#endif - diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c deleted file mode 100644 index a3ac01df1a2e..000000000000 --- a/arch/unicore32/kernel/traps.c +++ /dev/null @@ -1,322 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/kernel/traps.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * 'traps.c' handles hardware exceptions after we have saved some state. - * Mostly a debugging aid, but will probably kill the offending process. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "setup.h" - -static void dump_mem(const char *, const char *, unsigned long, unsigned long); - -void dump_backtrace_entry(unsigned long where, - unsigned long from, unsigned long frame) -{ -#ifdef CONFIG_KALLSYMS - printk(KERN_DEFAULT "[<%08lx>] (%pS) from [<%08lx>] (%pS)\n", - where, (void *)where, from, (void *)from); -#else - printk(KERN_DEFAULT "Function entered at [<%08lx>] from [<%08lx>]\n", - where, from); -#endif -} - -/* - * Stack pointers should always be within the kernels view of - * physical memory. If it is not there, then we can't dump - * out any information relating to the stack. - */ -static int verify_stack(unsigned long sp) -{ - if (sp < PAGE_OFFSET || - (sp > (unsigned long)high_memory && high_memory != NULL)) - return -EFAULT; - - return 0; -} - -/* - * Dump out the contents of some memory nicely... - */ -static void dump_mem(const char *lvl, const char *str, unsigned long bottom, - unsigned long top) -{ - unsigned long first; - mm_segment_t fs; - int i; - - /* - * We need to switch to kernel mode so that we can use __get_user - * to safely read from kernel space. Note that we now dump the - * code first, just in case the backtrace kills us. - */ - fs = get_fs(); - set_fs(KERNEL_DS); - - printk(KERN_DEFAULT "%s%s(0x%08lx to 0x%08lx)\n", - lvl, str, bottom, top); - - for (first = bottom & ~31; first < top; first += 32) { - unsigned long p; - char str[sizeof(" 12345678") * 8 + 1]; - - memset(str, ' ', sizeof(str)); - str[sizeof(str) - 1] = '\0'; - - for (p = first, i = 0; i < 8 && p < top; i++, p += 4) { - if (p >= bottom && p < top) { - unsigned long val; - if (__get_user(val, (unsigned long *)p) == 0) - sprintf(str + i * 9, " %08lx", val); - else - sprintf(str + i * 9, " ????????"); - } - } - printk(KERN_DEFAULT "%s%04lx:%s\n", lvl, first & 0xffff, str); - } - - set_fs(fs); -} - -static void dump_instr(const char *lvl, struct pt_regs *regs) -{ - unsigned long addr = instruction_pointer(regs); - const int width = 8; - mm_segment_t fs; - char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str; - int i; - - /* - * We need to switch to kernel mode so that we can use __get_user - * to safely read from kernel space. Note that we now dump the - * code first, just in case the backtrace kills us. - */ - fs = get_fs(); - set_fs(KERNEL_DS); - - for (i = -4; i < 1; i++) { - unsigned int val, bad; - - bad = __get_user(val, &((u32 *)addr)[i]); - - if (!bad) - p += sprintf(p, i == 0 ? "(%0*x) " : "%0*x ", - width, val); - else { - p += sprintf(p, "bad PC value"); - break; - } - } - printk(KERN_DEFAULT "%sCode: %s\n", lvl, str); - - set_fs(fs); -} - -static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, - const char *loglvl) -{ - unsigned int fp; - int ok = 1; - - printk("%sBacktrace: ", loglvl); - - if (!tsk) - tsk = current; - - if (regs) - fp = regs->UCreg_fp; - else if (tsk != current) - fp = thread_saved_fp(tsk); - else - asm("mov %0, fp" : "=r" (fp) : : "cc"); - - if (!fp) { - printk("%sno frame pointer", loglvl); - ok = 0; - } else if (verify_stack(fp)) { - printk("%sinvalid frame pointer 0x%08x", loglvl, fp); - ok = 0; - } else if (fp < (unsigned long)end_of_stack(tsk)) - printk("%sframe pointer underflow", loglvl); - printk("%s\n", loglvl); - - if (ok) - c_backtrace(fp, loglvl); -} - -void show_stack(struct task_struct *tsk, unsigned long *sp, - const char *loglvl) -{ - dump_backtrace(NULL, tsk, loglvl); - barrier(); -} - -static int __die(const char *str, int err, struct thread_info *thread, - struct pt_regs *regs) -{ - struct task_struct *tsk = thread->task; - static int die_counter; - int ret; - - printk(KERN_EMERG "Internal error: %s: %x [#%d]\n", - str, err, ++die_counter); - - /* trap and error numbers are mostly meaningless on UniCore */ - ret = notify_die(DIE_OOPS, str, regs, err, tsk->thread.trap_no, \ - SIGSEGV); - if (ret == NOTIFY_STOP) - return ret; - - print_modules(); - __show_regs(regs); - printk(KERN_EMERG "Process %.*s (pid: %d, stack limit = 0x%p)\n", - TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), thread + 1); - - if (!user_mode(regs) || in_interrupt()) { - dump_mem(KERN_EMERG, "Stack: ", regs->UCreg_sp, - THREAD_SIZE + (unsigned long)task_stack_page(tsk)); - dump_backtrace(regs, tsk, KERN_EMERG); - dump_instr(KERN_EMERG, regs); - } - - return ret; -} - -DEFINE_SPINLOCK(die_lock); - -/* - * This function is protected against re-entrancy. - */ -void die(const char *str, struct pt_regs *regs, int err) -{ - struct thread_info *thread = current_thread_info(); - int ret; - - oops_enter(); - - spin_lock_irq(&die_lock); - console_verbose(); - bust_spinlocks(1); - ret = __die(str, err, thread, regs); - - bust_spinlocks(0); - add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); - spin_unlock_irq(&die_lock); - oops_exit(); - - if (in_interrupt()) - panic("Fatal exception in interrupt"); - if (panic_on_oops) - panic("Fatal exception"); - if (ret != NOTIFY_STOP) - do_exit(SIGSEGV); -} - -void uc32_notify_die(const char *str, struct pt_regs *regs, - int sig, int code, void __user *addr, - unsigned long err, unsigned long trap) -{ - if (user_mode(regs)) { - current->thread.error_code = err; - current->thread.trap_no = trap; - - force_sig_fault(sig, code, addr); - } else - die(str, regs, err); -} - -/* - * bad_mode handles the impossible case in the vectors. If you see one of - * these, then it's extremely serious, and could mean you have buggy hardware. - * It never returns, and never tries to sync. We hope that we can at least - * dump out some state information... - */ -asmlinkage void bad_mode(struct pt_regs *regs, unsigned int reason) -{ - console_verbose(); - - printk(KERN_CRIT "Bad mode detected with reason 0x%x\n", reason); - - die("Oops - bad mode", regs, 0); - local_irq_disable(); - panic("bad mode"); -} - -void __pte_error(const char *file, int line, unsigned long val) -{ - printk(KERN_DEFAULT "%s:%d: bad pte %08lx.\n", file, line, val); -} - -void __pmd_error(const char *file, int line, unsigned long val) -{ - printk(KERN_DEFAULT "%s:%d: bad pmd %08lx.\n", file, line, val); -} - -void __pgd_error(const char *file, int line, unsigned long val) -{ - printk(KERN_DEFAULT "%s:%d: bad pgd %08lx.\n", file, line, val); -} - -asmlinkage void __div0(void) -{ - printk(KERN_DEFAULT "Division by zero in kernel.\n"); - dump_stack(); -} -EXPORT_SYMBOL(__div0); - -void abort(void) -{ - BUG(); - - /* if that doesn't kill us, halt */ - panic("Oops failed to kill thread"); -} - -void __init trap_init(void) -{ - return; -} - -void __init early_trap_init(void) -{ - unsigned long vectors = VECTORS_BASE; - - /* - * Copy the vectors, stubs (in entry-unicore.S) - * into the vector page, mapped at 0xffff0000, and ensure these - * are visible to the instruction stream. - */ - memcpy((void *)vectors, - __vectors_start, - __vectors_end - __vectors_start); - memcpy((void *)vectors + 0x200, - __stubs_start, - __stubs_end - __stubs_start); - - early_signal_init(); - - flush_icache_range(vectors, vectors + PAGE_SIZE); -} diff --git a/arch/unicore32/kernel/vmlinux.lds.S b/arch/unicore32/kernel/vmlinux.lds.S deleted file mode 100644 index 6fb320b337ef..000000000000 --- a/arch/unicore32/kernel/vmlinux.lds.S +++ /dev/null @@ -1,59 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/kernel/vmlinux.lds.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#include -#include -#include -#include -#include - -OUTPUT_ARCH(unicore32) -ENTRY(stext) - -jiffies = jiffies_64; - -SECTIONS -{ - . = PAGE_OFFSET + KERNEL_IMAGE_START; - - _text = .; - __init_begin = .; - HEAD_TEXT_SECTION - INIT_TEXT_SECTION(PAGE_SIZE) - INIT_DATA_SECTION(16) - PERCPU_SECTION(L1_CACHE_BYTES) - __init_end = .; - - _stext = .; - .text : { /* Real text segment */ - TEXT_TEXT - SCHED_TEXT - CPUIDLE_TEXT - LOCK_TEXT - - *(.fixup) - *(.gnu.warning) - } - _etext = .; - - _sdata = .; - RO_DATA(PAGE_SIZE) - RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) - _edata = .; - - EXCEPTION_TABLE(L1_CACHE_BYTES) - - BSS_SECTION(0, 0, 0) - _end = .; - - STABS_DEBUG - DWARF_DEBUG - - DISCARDS /* Exit code and data */ -} diff --git a/arch/unicore32/lib/Makefile b/arch/unicore32/lib/Makefile deleted file mode 100644 index 5af06645b8f0..000000000000 --- a/arch/unicore32/lib/Makefile +++ /dev/null @@ -1,28 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# linux/arch/unicore32/lib/Makefile -# -# Copyright (C) 2001-2010 GUAN Xue-tao -# - -lib-y := backtrace.o delay.o findbit.o -lib-y += strncpy_from_user.o strnlen_user.o -lib-y += clear_user.o copy_page.o -lib-y += copy_from_user.o copy_to_user.o - -GNU_LIBC_A = $(shell $(CC) $(KBUILD_CFLAGS) -print-file-name=libc.a) -GNU_LIBC_A_OBJS := memchr.o memcpy.o memmove.o memset.o -GNU_LIBC_A_OBJS += strchr.o strrchr.o -GNU_LIBC_A_OBJS += rawmemchr.o # needed by strrchr.o - -GNU_LIBGCC_A = $(shell $(CC) $(KBUILD_CFLAGS) -print-file-name=libgcc.a) -GNU_LIBGCC_A_OBJS := _ashldi3.o _ashrdi3.o _lshrdi3.o -GNU_LIBGCC_A_OBJS += _divsi3.o _modsi3.o _ucmpdi2.o _umodsi3.o _udivsi3.o - -lib-y += $(GNU_LIBC_A_OBJS) $(GNU_LIBGCC_A_OBJS) - -$(addprefix $(obj)/, $(GNU_LIBC_A_OBJS)): - $(Q)$(AR) p $(GNU_LIBC_A) $(notdir $@) > $@ - -$(addprefix $(obj)/, $(GNU_LIBGCC_A_OBJS)): - $(Q)$(AR) p $(GNU_LIBGCC_A) $(notdir $@) > $@ diff --git a/arch/unicore32/lib/backtrace.S b/arch/unicore32/lib/backtrace.S deleted file mode 100644 index 6221944b81f3..000000000000 --- a/arch/unicore32/lib/backtrace.S +++ /dev/null @@ -1,168 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/lib/backtrace.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include - .text - -@ fp is 0 or stack frame - -#define frame v4 -#define sv_fp v5 -#define sv_pc v6 -#define offset v8 -#define loglvl v9 - -ENTRY(__backtrace) - mov r0, fp - -ENTRY(c_backtrace) - -#if !defined(CONFIG_FRAME_POINTER) || !defined(CONFIG_PRINTK) - mov pc, lr -ENDPROC(__backtrace) -ENDPROC(c_backtrace) -#else - stm.w (v4 - v10, lr), [sp-] @ Save an extra register - @ so we have a location... - mov.a frame, r0 @ if frame pointer is zero - beq no_frame @ we have no stack frames - mov loglvl, r1 - -1: stm.w (pc), [sp-] @ calculate offset of PC stored - ldw.w r0, [sp]+, #4 @ by stmfd for this CPU - adr r1, 1b - sub offset, r0, r1 - -/* - * Stack frame layout: - * optionally saved caller registers (r4 - r10) - * saved fp - * saved sp - * saved lr - * frame => saved pc - * optionally saved arguments (r0 - r3) - * saved sp => - * - * Functions start with the following code sequence: - * mov ip, sp - * stm.w (r0 - r3), [sp-] (optional) - * corrected pc => stm.w sp, (..., fp, ip, lr, pc) - */ -for_each_frame: - -1001: ldw sv_pc, [frame+], #0 @ get saved pc -1002: ldw sv_fp, [frame+], #-12 @ get saved fp - - sub sv_pc, sv_pc, offset @ Correct PC for prefetching - -1003: ldw r2, [sv_pc+], #-4 @ if stmfd sp, {args} exists, - ldw r3, .Ldsi+4 @ adjust saved 'pc' back one - cxor.a r3, r2 >> #14 @ instruction - beq 201f - sub r0, sv_pc, #4 @ allow for mov - b 202f -201: - sub r0, sv_pc, #8 @ allow for mov + stmia -202: - ldw r1, [frame+], #-4 @ get saved lr - mov r2, frame - b.l dump_backtrace_entry - - ldw r1, [sv_pc+], #-4 @ if stmfd sp, {args} exists, - ldw r3, .Ldsi+4 - cxor.a r3, r1 >> #14 - bne 1004f - ldw r0, [frame+], #-8 @ get sp - sub r0, r0, #4 @ point at the last arg - b.l .Ldumpstm @ dump saved registers - -1004: ldw r1, [sv_pc+], #0 @ if stmfd {, fp, ip, lr, pc} - ldw r3, .Ldsi @ instruction exists, - cxor.a r3, r1 >> #14 - bne 201f - sub r0, frame, #16 - b.l .Ldumpstm @ dump saved registers -201: - cxor.a sv_fp, #0 @ zero saved fp means - beq no_frame @ no further frames - - csub.a sv_fp, frame @ next frame must be - mov frame, sv_fp @ above the current frame - bua for_each_frame - -1006: adr r0, .Lbad - mov r1, loglvl - mov r2, frame - b.l printk -no_frame: ldm.w (v4 - v10, pc), [sp]+ -ENDPROC(__backtrace) -ENDPROC(c_backtrace) - - .pushsection __ex_table,"a" - .align 3 - .long 1001b, 1006b - .long 1002b, 1006b - .long 1003b, 1006b - .long 1004b, 1006b - .popsection - -#define instr v4 -#define reg v5 -#define stack v6 - -.Ldumpstm: stm.w (instr, reg, stack, v7, lr), [sp-] - mov stack, r0 - mov instr, r1 - mov reg, #14 - mov v7, #0 -1: mov r3, #1 - csub.a reg, #8 - bne 201f - sub reg, reg, #3 -201: - cand.a instr, r3 << reg - beq 2f - add v7, v7, #1 - cxor.a v7, #6 - cmoveq v7, #1 - bne 201f - adr r0, .Lcr - mov r1, loglvl - b.l printk -201: - ldw.w r3, [stack]+, #-4 - mov r2, reg - csub.a r2, #8 - bsl 201f - sub r2, r2, #3 -201: - cand.a instr, #0x40 @ if H is 1, high 16 regs - beq 201f - add r2, r2, #0x10 @ so r2 need add 16 -201: - adr r0, .Lfp - mov r1, loglvl - b.l printk -2: sub.a reg, reg, #1 - bns 1b - cxor.a v7, #0 - beq 201f - adr r0, .Lcr - mov r1, loglvl - b.l printk -201: ldm.w (instr, reg, stack, v7, pc), [sp]+ - -.Lfp: .asciz "%sr%d:%08x " -.Lcr: .asciz "%s\n" -.Lbad: .asciz "%sBacktrace aborted due to bad frame pointer <%p>\n" - .align -.Ldsi: .word 0x92eec000 >> 14 @ stm.w sp, (... fp, ip, lr, pc) - .word 0x92e10000 >> 14 @ stm.w sp, () - -#endif diff --git a/arch/unicore32/lib/clear_user.S b/arch/unicore32/lib/clear_user.S deleted file mode 100644 index c6ca431b1090..000000000000 --- a/arch/unicore32/lib/clear_user.S +++ /dev/null @@ -1,54 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/lib/clear_user.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include - - .text - -/* Prototype: int __clear_user(void *addr, size_t sz) - * Purpose : clear some user memory - * Params : addr - user memory address to clear - * : sz - number of bytes to clear - * Returns : number of bytes NOT cleared - */ -WEAK(__clear_user) - stm.w (lr), [sp-] - stm.w (r1), [sp-] - mov r2, #0 - csub.a r1, #4 - bsl 2f - and.a ip, r0, #3 - beq 1f - csub.a ip, #2 - strusr r2, r0, 1 - strusr r2, r0, 1, el - strusr r2, r0, 1, sl - rsub ip, ip, #4 - sub r1, r1, ip @ 7 6 5 4 3 2 1 -1: sub.a r1, r1, #8 @ -1 -2 -3 -4 -5 -6 -7 - strusr r2, r0, 4, ns, rept=2 - bns 1b - add.a r1, r1, #4 @ 3 2 1 0 -1 -2 -3 - strusr r2, r0, 4, ns -2: cand.a r1, #2 @ 1x 1x 0x 0x 1x 1x 0x - strusr r2, r0, 1, ne, rept=2 - cand.a r1, #1 @ x1 x0 x1 x0 x1 x0 x1 - beq 3f -USER( stb.u r2, [r0]) -3: mov r0, #0 - ldm.w (r1), [sp]+ - ldm.w (pc), [sp]+ -ENDPROC(__clear_user) - - .pushsection .fixup,"ax" - .align 0 -9001: ldm.w (r0), [sp]+ - ldm.w (pc), [sp]+ - .popsection - diff --git a/arch/unicore32/lib/copy_from_user.S b/arch/unicore32/lib/copy_from_user.S deleted file mode 100644 index affb43920ac0..000000000000 --- a/arch/unicore32/lib/copy_from_user.S +++ /dev/null @@ -1,101 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/lib/copy_from_user.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#include -#include - -/* - * Prototype: - * - * size_t raw_copy_from_user(void *to, const void *from, size_t n) - * - * Purpose: - * - * copy a block to kernel memory from user memory - * - * Params: - * - * to = kernel memory - * from = user memory - * n = number of bytes to copy - * - * Return value: - * - * Number of bytes NOT copied. - */ - - .macro ldr1w ptr reg abort - ldrusr \reg, \ptr, 4, abort=\abort - .endm - - .macro ldr4w ptr reg1 reg2 reg3 reg4 abort -100: ldm.w (\reg1, \reg2, \reg3, \reg4), [\ptr]+ - .pushsection __ex_table, "a" - .align 3 - .long 100b, \abort - .popsection - .endm - - .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort -100: ldm.w (\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8), [\ptr]+ - .pushsection __ex_table, "a" - .align 3 - .long 100b, \abort - .popsection - .endm - - .macro ldr1b ptr reg cond=al abort - ldrusr \reg, \ptr, 1, \cond, abort=\abort - .endm - - .macro str1w ptr reg abort - stw.w \reg, [\ptr]+, #4 - .endm - - .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort - stm.w (\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8), [\ptr]+ - .endm - - .macro str1b ptr reg cond=al abort - .ifnc \cond, al - b\cond 201f - b 202f - .endif -201: stb.w \reg, [\ptr]+, #1 -202: - .endm - - .macro enter - mov r3, #0 - stm.w (r0, r2, r3), [sp-] - .endm - - .macro exit - add sp, sp, #8 - ldm.w (r0), [sp]+ - mov pc, lr - .endm - - .text - -ENTRY(raw_copy_from_user) - -#include "copy_template.S" - -ENDPROC(raw_copy_from_user) - - .pushsection .fixup,"ax" - .align 0 - copy_abort_preamble - ldm.w (r1, r2, r3), [sp]+ - sub r0, r0, r1 - rsub r0, r0, r2 - copy_abort_end - .popsection - diff --git a/arch/unicore32/lib/copy_page.S b/arch/unicore32/lib/copy_page.S deleted file mode 100644 index dc163f2d1af0..000000000000 --- a/arch/unicore32/lib/copy_page.S +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/lib/copy_page.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * ASM optimised string functions - */ -#include -#include -#include -#include - -#define COPY_COUNT (PAGE_SZ/256) - - .text - .align 5 -/* - * UniCore optimised copy_page routine - */ -ENTRY(copy_page) - stm.w (r17 - r19, lr), [sp-] - mov r17, r0 - mov r18, r1 - mov r19, #COPY_COUNT -1: - .rept 4 - ldm.w (r0 - r15), [r18]+ - stm.w (r0 - r15), [r17]+ - .endr - sub.a r19, r19, #1 - bne 1b - ldm.w (r17 - r19, pc), [sp]+ -ENDPROC(copy_page) diff --git a/arch/unicore32/lib/copy_template.S b/arch/unicore32/lib/copy_template.S deleted file mode 100644 index 02a7aef83fbf..000000000000 --- a/arch/unicore32/lib/copy_template.S +++ /dev/null @@ -1,211 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/lib/copy_template.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -/* - * Theory of operation - * ------------------- - * - * This file provides the core code for a forward memory copy used in - * the implementation of memcopy(), copy_to_user() and copy_from_user(). - * - * The including file must define the following accessor macros - * according to the need of the given function: - * - * ldr1w ptr reg abort - * - * This loads one word from 'ptr', stores it in 'reg' and increments - * 'ptr' to the next word. The 'abort' argument is used for fixup tables. - * - * ldr4w ptr reg1 reg2 reg3 reg4 abort - * ldr8w ptr, reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort - * - * This loads four or eight words starting from 'ptr', stores them - * in provided registers and increments 'ptr' past those words. - * The'abort' argument is used for fixup tables. - * - * ldr1b ptr reg cond abort - * - * Similar to ldr1w, but it loads a byte and increments 'ptr' one byte. - * It also must apply the condition code if provided, otherwise the - * "al" condition is assumed by default. - * - * str1w ptr reg abort - * str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort - * str1b ptr reg cond abort - * - * Same as their ldr* counterparts, but data is stored to 'ptr' location - * rather than being loaded. - * - * enter - * - * Preserve the provided registers on the stack plus any additional - * data as needed by the implementation including this code. Called - * upon code entry. - * - * exit - * - * Restore registers with the values previously saved with the - * 'preserv' macro. Called upon code termination. - */ - - - enter - - sub.a r2, r2, #4 - bsl 8f - and.a ip, r0, #3 - bne 9f - and.a ip, r1, #3 - bne 10f - -1: sub.a r2, r2, #(28) - stm.w (r5 - r8), [sp-] - bsl 5f - -3: -4: ldr8w r1, r3, r4, r5, r6, r7, r8, r10, r11, abort=20f - sub.a r2, r2, #32 - str8w r0, r3, r4, r5, r6, r7, r8, r10, r11, abort=20f - beg 3b - -5: and.a ip, r2, #28 - rsub ip, ip, #32 - beq 7f - add pc, pc, ip @ C is always clear here - nop - - ldr1w r1, r3, abort=20f - ldr1w r1, r4, abort=20f - ldr1w r1, r5, abort=20f - ldr1w r1, r6, abort=20f - ldr1w r1, r7, abort=20f - ldr1w r1, r8, abort=20f - ldr1w r1, r11, abort=20f - - add pc, pc, ip - nop - - str1w r0, r3, abort=20f - str1w r0, r4, abort=20f - str1w r0, r5, abort=20f - str1w r0, r6, abort=20f - str1w r0, r7, abort=20f - str1w r0, r8, abort=20f - str1w r0, r11, abort=20f - -7: ldm.w (r5 - r8), [sp]+ - -8: mov.a r2, r2 << #31 - ldr1b r1, r3, ne, abort=21f - ldr1b r1, r4, ea, abort=21f - ldr1b r1, r10, ea, abort=21f - str1b r0, r3, ne, abort=21f - str1b r0, r4, ea, abort=21f - str1b r0, r10, ea, abort=21f - - exit - -9: rsub ip, ip, #4 - csub.a ip, #2 - ldr1b r1, r3, sg, abort=21f - ldr1b r1, r4, eg, abort=21f - ldr1b r1, r11, abort=21f - str1b r0, r3, sg, abort=21f - str1b r0, r4, eg, abort=21f - sub.a r2, r2, ip - str1b r0, r11, abort=21f - bsl 8b - and.a ip, r1, #3 - beq 1b - -10: andn r1, r1, #3 - csub.a ip, #2 - ldr1w r1, r11, abort=21f - beq 17f - bsg 18f - - - .macro forward_copy_shift a b - - sub.a r2, r2, #28 - bsl 14f - -11: stm.w (r5 - r9), [sp-] - -12: - ldr4w r1, r4, r5, r6, r7, abort=19f - mov r3, r11 pull #\a - sub.a r2, r2, #32 - ldr4w r1, r8, r9, r10, r11, abort=19f - or r3, r3, r4 push #\b - mov r4, r4 pull #\a - or r4, r4, r5 push #\b - mov r5, r5 pull #\a - or r5, r5, r6 push #\b - mov r6, r6 pull #\a - or r6, r6, r7 push #\b - mov r7, r7 pull #\a - or r7, r7, r8 push #\b - mov r8, r8 pull #\a - or r8, r8, r9 push #\b - mov r9, r9 pull #\a - or r9, r9, r10 push #\b - mov r10, r10 pull #\a - or r10, r10, r11 push #\b - str8w r0, r3, r4, r5, r6, r7, r8, r9, r10, , abort=19f - beg 12b - - ldm.w (r5 - r9), [sp]+ - -14: and.a ip, r2, #28 - beq 16f - -15: mov r3, r11 pull #\a - ldr1w r1, r11, abort=21f - sub.a ip, ip, #4 - or r3, r3, r11 push #\b - str1w r0, r3, abort=21f - bsg 15b - -16: sub r1, r1, #(\b / 8) - b 8b - - .endm - - - forward_copy_shift a=8 b=24 - -17: forward_copy_shift a=16 b=16 - -18: forward_copy_shift a=24 b=8 - - -/* - * Abort preamble and completion macros. - * If a fixup handler is required then those macros must surround it. - * It is assumed that the fixup code will handle the private part of - * the exit macro. - */ - - .macro copy_abort_preamble -19: ldm.w (r5 - r9), [sp]+ - b 21f -299: .word 0 @ store lr - @ to avoid function call in fixup -20: ldm.w (r5 - r8), [sp]+ -21: - adr r1, 299b - stw lr, [r1] - .endm - - .macro copy_abort_end - adr lr, 299b - ldw pc, [lr] - .endm - diff --git a/arch/unicore32/lib/copy_to_user.S b/arch/unicore32/lib/copy_to_user.S deleted file mode 100644 index c867f08f89ce..000000000000 --- a/arch/unicore32/lib/copy_to_user.S +++ /dev/null @@ -1,93 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/lib/copy_to_user.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ - -#include -#include - -/* - * Prototype: - * - * size_t raw_copy_to_user(void *to, const void *from, size_t n) - * - * Purpose: - * - * copy a block to user memory from kernel memory - * - * Params: - * - * to = user memory - * from = kernel memory - * n = number of bytes to copy - * - * Return value: - * - * Number of bytes NOT copied. - */ - - .macro ldr1w ptr reg abort - ldw.w \reg, [\ptr]+, #4 - .endm - - .macro ldr4w ptr reg1 reg2 reg3 reg4 abort - ldm.w (\reg1, \reg2, \reg3, \reg4), [\ptr]+ - .endm - - .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort - ldm.w (\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8), [\ptr]+ - .endm - - .macro ldr1b ptr reg cond=al abort - notcond \cond, .+8 - ldb.w \reg, [\ptr]+, #1 - .endm - - .macro str1w ptr reg abort - strusr \reg, \ptr, 4, abort=\abort - .endm - - .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort -100: stm.w (\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8), [\ptr]+ - - .pushsection __ex_table, "a" - .long 100b, \abort - .popsection - .endm - - .macro str1b ptr reg cond=al abort - strusr \reg, \ptr, 1, \cond, abort=\abort - .endm - - .macro enter - mov r3, #0 - stm.w (r0, r2, r3), [sp-] - .endm - - .macro exit - add sp, sp, #8 - ldm.w (r0), [sp]+ - mov pc, lr - .endm - - .text - -WEAK(raw_copy_to_user) - -#include "copy_template.S" - -ENDPROC(raw_copy_to_user) - - .pushsection .fixup,"ax" - .align 0 - copy_abort_preamble - ldm.w (r1, r2, r3), [sp]+ - sub r0, r0, r1 - rsub r0, r0, r2 - copy_abort_end - .popsection - diff --git a/arch/unicore32/lib/delay.S b/arch/unicore32/lib/delay.S deleted file mode 100644 index 6a359dd034e5..000000000000 --- a/arch/unicore32/lib/delay.S +++ /dev/null @@ -1,48 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/lib/delay.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include - .text - -.LC0: .word loops_per_jiffy -.LC1: .word (2199023*HZ)>>11 - -/* - * r0 <= 2000 - * lpj <= 0x01ffffff (max. 3355 bogomips) - * HZ <= 1000 - */ - -ENTRY(__udelay) - ldw r2, .LC1 - mul r0, r2, r0 -ENTRY(__const_udelay) @ 0 <= r0 <= 0x7fffff06 - ldw r2, .LC0 - ldw r2, [r2] @ max = 0x01ffffff - mov r0, r0 >> #14 @ max = 0x0001ffff - mov r2, r2 >> #10 @ max = 0x00007fff - mul r0, r2, r0 @ max = 2^32-1 - mov.a r0, r0 >> #6 - cmoveq pc, lr - -/* - * loops = r0 * HZ * loops_per_jiffy / 1000000 - * - * Oh, if only we had a cycle counter... - */ - -@ Delay routine -ENTRY(__delay) - sub.a r0, r0, #2 - bua __delay - mov pc, lr -ENDPROC(__udelay) -ENDPROC(__const_udelay) -ENDPROC(__delay) diff --git a/arch/unicore32/lib/findbit.S b/arch/unicore32/lib/findbit.S deleted file mode 100644 index 42f1282670d2..000000000000 --- a/arch/unicore32/lib/findbit.S +++ /dev/null @@ -1,97 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/lib/findbit.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include - .text - -/* - * Purpose : Find a 'zero' bit - * Prototype: int find_first_zero_bit(void *addr, unsigned int maxbit); - */ -ENTRY(find_first_zero_bit) - cxor.a r1, #0 - beq 3f - mov r2, #0 -1: ldb r3, [r0+], r2 >> #3 - xor.a r3, r3, #0xff @ invert bits - bne .L_found @ any now set - found zero bit - add r2, r2, #8 @ next bit pointer -2: csub.a r2, r1 @ any more? - bub 1b -3: mov r0, r1 @ no free bits - mov pc, lr -ENDPROC(find_first_zero_bit) - -/* - * Purpose : Find next 'zero' bit - * Prototype: int find_next_zero_bit - * (void *addr, unsigned int maxbit, int offset) - */ -ENTRY(find_next_zero_bit) - cxor.a r1, #0 - beq 3b - and.a ip, r2, #7 - beq 1b @ If new byte, goto old routine - ldb r3, [r0+], r2 >> #3 - xor r3, r3, #0xff @ now looking for a 1 bit - mov.a r3, r3 >> ip @ shift off unused bits - bne .L_found - or r2, r2, #7 @ if zero, then no bits here - add r2, r2, #1 @ align bit pointer - b 2b @ loop for next bit -ENDPROC(find_next_zero_bit) - -/* - * Purpose : Find a 'one' bit - * Prototype: int find_first_bit - * (const unsigned long *addr, unsigned int maxbit); - */ -ENTRY(find_first_bit) - cxor.a r1, #0 - beq 3f - mov r2, #0 -1: ldb r3, [r0+], r2 >> #3 - mov.a r3, r3 - bne .L_found @ any now set - found zero bit - add r2, r2, #8 @ next bit pointer -2: csub.a r2, r1 @ any more? - bub 1b -3: mov r0, r1 @ no free bits - mov pc, lr -ENDPROC(find_first_bit) - -/* - * Purpose : Find next 'one' bit - * Prototype: int find_next_zero_bit - * (void *addr, unsigned int maxbit, int offset) - */ -ENTRY(find_next_bit) - cxor.a r1, #0 - beq 3b - and.a ip, r2, #7 - beq 1b @ If new byte, goto old routine - ldb r3, [r0+], r2 >> #3 - mov.a r3, r3 >> ip @ shift off unused bits - bne .L_found - or r2, r2, #7 @ if zero, then no bits here - add r2, r2, #1 @ align bit pointer - b 2b @ loop for next bit -ENDPROC(find_next_bit) - -/* - * One or more bits in the LSB of r3 are assumed to be set. - */ -.L_found: - rsub r1, r3, #0 - and r3, r3, r1 - cntlz r3, r3 - rsub r3, r3, #31 - add r0, r2, r3 - mov pc, lr - diff --git a/arch/unicore32/lib/strncpy_from_user.S b/arch/unicore32/lib/strncpy_from_user.S deleted file mode 100644 index f227b8227a4c..000000000000 --- a/arch/unicore32/lib/strncpy_from_user.S +++ /dev/null @@ -1,42 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/lib/strncpy_from_user.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include - - .text - .align 5 - -/* - * Copy a string from user space to kernel space. - * r0 = dst, r1 = src, r2 = byte length - * returns the number of characters copied (strlen of copied string), - * -EFAULT on exception, or "len" if we fill the whole buffer - */ -ENTRY(__strncpy_from_user) - mov ip, r1 -1: sub.a r2, r2, #1 - ldrusr r3, r1, 1, ns - bfs 2f - stb.w r3, [r0]+, #1 - cxor.a r3, #0 - bne 1b - sub r1, r1, #1 @ take NUL character out of count -2: sub r0, r1, ip - mov pc, lr -ENDPROC(__strncpy_from_user) - - .pushsection .fixup,"ax" - .align 0 -9001: mov r3, #0 - stb r3, [r0+], #0 @ null terminate - mov r0, #-EFAULT - mov pc, lr - .popsection - diff --git a/arch/unicore32/lib/strnlen_user.S b/arch/unicore32/lib/strnlen_user.S deleted file mode 100644 index c836b12776fe..000000000000 --- a/arch/unicore32/lib/strnlen_user.S +++ /dev/null @@ -1,39 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/lib/strnlen_user.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include - - .text - .align 5 - -/* Prototype: unsigned long __strnlen_user(const char *str, long n) - * Purpose : get length of a string in user memory - * Params : str - address of string in user memory - * Returns : length of string *including terminator* - * or zero on exception, or n + 1 if too long - */ -ENTRY(__strnlen_user) - mov r2, r0 -1: - ldrusr r3, r0, 1 - cxor.a r3, #0 - beq 2f - sub.a r1, r1, #1 - bne 1b - add r0, r0, #1 -2: sub r0, r0, r2 - mov pc, lr -ENDPROC(__strnlen_user) - - .pushsection .fixup,"ax" - .align 0 -9001: mov r0, #0 - mov pc, lr - .popsection diff --git a/arch/unicore32/mm/Kconfig b/arch/unicore32/mm/Kconfig deleted file mode 100644 index 82759b6aba67..000000000000 --- a/arch/unicore32/mm/Kconfig +++ /dev/null @@ -1,41 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -comment "Processor Type" - -# Select CPU types depending on the architecture selected. This selects -# which CPUs we support in the kernel image, and the compiler instruction -# optimiser behaviour. - -config CPU_UCV2 - def_bool y - -comment "Processor Features" - -config CPU_ICACHE_DISABLE - bool "Disable I-Cache (I-bit)" - help - Say Y here to disable the processor instruction cache. Unless - you have a reason not to or are unsure, say N. - -config CPU_DCACHE_DISABLE - bool "Disable D-Cache (D-bit)" - help - Say Y here to disable the processor data cache. Unless - you have a reason not to or are unsure, say N. - -config CPU_DCACHE_WRITETHROUGH - bool "Force write through D-cache" - help - Say Y here to use the data cache in writethrough mode. Unless you - specifically require this or are unsure, say N. - -config CPU_DCACHE_LINE_DISABLE - bool "Disable D-cache line ops" - default y - help - Say Y here to disable the data cache line operations. - -config CPU_TLB_SINGLE_ENTRY_DISABLE - bool "Disable TLB single entry ops" - default y - help - Say Y here to disable the TLB single entry operations. diff --git a/arch/unicore32/mm/Makefile b/arch/unicore32/mm/Makefile deleted file mode 100644 index 8106260583ab..000000000000 --- a/arch/unicore32/mm/Makefile +++ /dev/null @@ -1,14 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile for the linux unicore-specific parts of the memory manager. -# - -obj-y := extable.o fault.o init.o pgd.o mmu.o -obj-y += flush.o ioremap.o - -obj-$(CONFIG_MODULES) += proc-syms.o - -obj-$(CONFIG_ALIGNMENT_TRAP) += alignment.o - -obj-$(CONFIG_CPU_UCV2) += cache-ucv2.o tlb-ucv2.o proc-ucv2.o - diff --git a/arch/unicore32/mm/alignment.c b/arch/unicore32/mm/alignment.c deleted file mode 100644 index 2ea98f7a4156..000000000000 --- a/arch/unicore32/mm/alignment.c +++ /dev/null @@ -1,524 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/mm/alignment.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -/* - * TODO: - * FPU ldm/stm not handling - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "mm.h" - -#define CODING_BITS(i) (i & 0xe0000120) - -#define LDST_P_BIT(i) (i & (1 << 28)) /* Preindex */ -#define LDST_U_BIT(i) (i & (1 << 27)) /* Add offset */ -#define LDST_W_BIT(i) (i & (1 << 25)) /* Writeback */ -#define LDST_L_BIT(i) (i & (1 << 24)) /* Load */ - -#define LDST_P_EQ_U(i) ((((i) ^ ((i) >> 1)) & (1 << 27)) == 0) - -#define LDSTH_I_BIT(i) (i & (1 << 26)) /* half-word immed */ -#define LDM_S_BIT(i) (i & (1 << 26)) /* write ASR from BSR */ -#define LDM_H_BIT(i) (i & (1 << 6)) /* select r0-r15 or r16-r31 */ - -#define RN_BITS(i) ((i >> 19) & 31) /* Rn */ -#define RD_BITS(i) ((i >> 14) & 31) /* Rd */ -#define RM_BITS(i) (i & 31) /* Rm */ - -#define REGMASK_BITS(i) (((i & 0x7fe00) >> 3) | (i & 0x3f)) -#define OFFSET_BITS(i) (i & 0x03fff) - -#define SHIFT_BITS(i) ((i >> 9) & 0x1f) -#define SHIFT_TYPE(i) (i & 0xc0) -#define SHIFT_LSL 0x00 -#define SHIFT_LSR 0x40 -#define SHIFT_ASR 0x80 -#define SHIFT_RORRRX 0xc0 - -union offset_union { - unsigned long un; - signed long sn; -}; - -#define TYPE_ERROR 0 -#define TYPE_FAULT 1 -#define TYPE_LDST 2 -#define TYPE_DONE 3 -#define TYPE_SWAP 4 -#define TYPE_COLS 5 /* Coprocessor load/store */ - -#define get8_unaligned_check(val, addr, err) \ - __asm__( \ - "1: ldb.u %1, [%2], #1\n" \ - "2:\n" \ - " .pushsection .fixup,\"ax\"\n" \ - " .align 2\n" \ - "3: mov %0, #1\n" \ - " b 2b\n" \ - " .popsection\n" \ - " .pushsection __ex_table,\"a\"\n" \ - " .align 3\n" \ - " .long 1b, 3b\n" \ - " .popsection\n" \ - : "=r" (err), "=&r" (val), "=r" (addr) \ - : "0" (err), "2" (addr)) - -#define get8t_unaligned_check(val, addr, err) \ - __asm__( \ - "1: ldb.u %1, [%2], #1\n" \ - "2:\n" \ - " .pushsection .fixup,\"ax\"\n" \ - " .align 2\n" \ - "3: mov %0, #1\n" \ - " b 2b\n" \ - " .popsection\n" \ - " .pushsection __ex_table,\"a\"\n" \ - " .align 3\n" \ - " .long 1b, 3b\n" \ - " .popsection\n" \ - : "=r" (err), "=&r" (val), "=r" (addr) \ - : "0" (err), "2" (addr)) - -#define get16_unaligned_check(val, addr) \ - do { \ - unsigned int err = 0, v, a = addr; \ - get8_unaligned_check(val, a, err); \ - get8_unaligned_check(v, a, err); \ - val |= v << 8; \ - if (err) \ - goto fault; \ - } while (0) - -#define put16_unaligned_check(val, addr) \ - do { \ - unsigned int err = 0, v = val, a = addr; \ - __asm__( \ - "1: stb.u %1, [%2], #1\n" \ - " mov %1, %1 >> #8\n" \ - "2: stb.u %1, [%2]\n" \ - "3:\n" \ - " .pushsection .fixup,\"ax\"\n" \ - " .align 2\n" \ - "4: mov %0, #1\n" \ - " b 3b\n" \ - " .popsection\n" \ - " .pushsection __ex_table,\"a\"\n" \ - " .align 3\n" \ - " .long 1b, 4b\n" \ - " .long 2b, 4b\n" \ - " .popsection\n" \ - : "=r" (err), "=&r" (v), "=&r" (a) \ - : "0" (err), "1" (v), "2" (a)); \ - if (err) \ - goto fault; \ - } while (0) - -#define __put32_unaligned_check(ins, val, addr) \ - do { \ - unsigned int err = 0, v = val, a = addr; \ - __asm__( \ - "1: "ins" %1, [%2], #1\n" \ - " mov %1, %1 >> #8\n" \ - "2: "ins" %1, [%2], #1\n" \ - " mov %1, %1 >> #8\n" \ - "3: "ins" %1, [%2], #1\n" \ - " mov %1, %1 >> #8\n" \ - "4: "ins" %1, [%2]\n" \ - "5:\n" \ - " .pushsection .fixup,\"ax\"\n" \ - " .align 2\n" \ - "6: mov %0, #1\n" \ - " b 5b\n" \ - " .popsection\n" \ - " .pushsection __ex_table,\"a\"\n" \ - " .align 3\n" \ - " .long 1b, 6b\n" \ - " .long 2b, 6b\n" \ - " .long 3b, 6b\n" \ - " .long 4b, 6b\n" \ - " .popsection\n" \ - : "=r" (err), "=&r" (v), "=&r" (a) \ - : "0" (err), "1" (v), "2" (a)); \ - if (err) \ - goto fault; \ - } while (0) - -#define get32_unaligned_check(val, addr) \ - do { \ - unsigned int err = 0, v, a = addr; \ - get8_unaligned_check(val, a, err); \ - get8_unaligned_check(v, a, err); \ - val |= v << 8; \ - get8_unaligned_check(v, a, err); \ - val |= v << 16; \ - get8_unaligned_check(v, a, err); \ - val |= v << 24; \ - if (err) \ - goto fault; \ - } while (0) - -#define put32_unaligned_check(val, addr) \ - __put32_unaligned_check("stb.u", val, addr) - -#define get32t_unaligned_check(val, addr) \ - do { \ - unsigned int err = 0, v, a = addr; \ - get8t_unaligned_check(val, a, err); \ - get8t_unaligned_check(v, a, err); \ - val |= v << 8; \ - get8t_unaligned_check(v, a, err); \ - val |= v << 16; \ - get8t_unaligned_check(v, a, err); \ - val |= v << 24; \ - if (err) \ - goto fault; \ - } while (0) - -#define put32t_unaligned_check(val, addr) \ - __put32_unaligned_check("stb.u", val, addr) - -static void -do_alignment_finish_ldst(unsigned long addr, unsigned long instr, - struct pt_regs *regs, union offset_union offset) -{ - if (!LDST_U_BIT(instr)) - offset.un = -offset.un; - - if (!LDST_P_BIT(instr)) - addr += offset.un; - - if (!LDST_P_BIT(instr) || LDST_W_BIT(instr)) - regs->uregs[RN_BITS(instr)] = addr; -} - -static int -do_alignment_ldrhstrh(unsigned long addr, unsigned long instr, - struct pt_regs *regs) -{ - unsigned int rd = RD_BITS(instr); - - /* old value 0x40002120, can't judge swap instr correctly */ - if ((instr & 0x4b003fe0) == 0x40000120) - goto swp; - - if (LDST_L_BIT(instr)) { - unsigned long val; - get16_unaligned_check(val, addr); - - /* signed half-word? */ - if (instr & 0x80) - val = (signed long)((signed short)val); - - regs->uregs[rd] = val; - } else - put16_unaligned_check(regs->uregs[rd], addr); - - return TYPE_LDST; - -swp: - /* only handle swap word - * for swap byte should not active this alignment exception */ - get32_unaligned_check(regs->uregs[RD_BITS(instr)], addr); - put32_unaligned_check(regs->uregs[RM_BITS(instr)], addr); - return TYPE_SWAP; - -fault: - return TYPE_FAULT; -} - -static int -do_alignment_ldrstr(unsigned long addr, unsigned long instr, - struct pt_regs *regs) -{ - unsigned int rd = RD_BITS(instr); - - if (!LDST_P_BIT(instr) && LDST_W_BIT(instr)) - goto trans; - - if (LDST_L_BIT(instr)) - get32_unaligned_check(regs->uregs[rd], addr); - else - put32_unaligned_check(regs->uregs[rd], addr); - return TYPE_LDST; - -trans: - if (LDST_L_BIT(instr)) - get32t_unaligned_check(regs->uregs[rd], addr); - else - put32t_unaligned_check(regs->uregs[rd], addr); - return TYPE_LDST; - -fault: - return TYPE_FAULT; -} - -/* - * LDM/STM alignment handler. - * - * There are 4 variants of this instruction: - * - * B = rn pointer before instruction, A = rn pointer after instruction - * ------ increasing address -----> - * | | r0 | r1 | ... | rx | | - * PU = 01 B A - * PU = 11 B A - * PU = 00 A B - * PU = 10 A B - */ -static int -do_alignment_ldmstm(unsigned long addr, unsigned long instr, - struct pt_regs *regs) -{ - unsigned int rd, rn, pc_correction, reg_correction, nr_regs, regbits; - unsigned long eaddr, newaddr; - - if (LDM_S_BIT(instr)) - goto bad; - - pc_correction = 4; /* processor implementation defined */ - - /* count the number of registers in the mask to be transferred */ - nr_regs = hweight16(REGMASK_BITS(instr)) * 4; - - rn = RN_BITS(instr); - newaddr = eaddr = regs->uregs[rn]; - - if (!LDST_U_BIT(instr)) - nr_regs = -nr_regs; - newaddr += nr_regs; - if (!LDST_U_BIT(instr)) - eaddr = newaddr; - - if (LDST_P_EQ_U(instr)) /* U = P */ - eaddr += 4; - - /* - * This is a "hint" - we already have eaddr worked out by the - * processor for us. - */ - if (addr != eaddr) { - printk(KERN_ERR "LDMSTM: PC = %08lx, instr = %08lx, " - "addr = %08lx, eaddr = %08lx\n", - instruction_pointer(regs), instr, addr, eaddr); - show_regs(regs); - } - - if (LDM_H_BIT(instr)) - reg_correction = 0x10; - else - reg_correction = 0x00; - - for (regbits = REGMASK_BITS(instr), rd = 0; regbits; - regbits >>= 1, rd += 1) - if (regbits & 1) { - if (LDST_L_BIT(instr)) - get32_unaligned_check(regs-> - uregs[rd + reg_correction], eaddr); - else - put32_unaligned_check(regs-> - uregs[rd + reg_correction], eaddr); - eaddr += 4; - } - - if (LDST_W_BIT(instr)) - regs->uregs[rn] = newaddr; - return TYPE_DONE; - -fault: - regs->UCreg_pc -= pc_correction; - return TYPE_FAULT; - -bad: - printk(KERN_ERR "Alignment trap: not handling ldm with s-bit set\n"); - return TYPE_ERROR; -} - -static int -do_alignment(unsigned long addr, unsigned int error_code, struct pt_regs *regs) -{ - union offset_union offset; - unsigned long instr, instrptr; - int (*handler) (unsigned long addr, unsigned long instr, - struct pt_regs *regs); - unsigned int type; - - instrptr = instruction_pointer(regs); - if (instrptr >= PAGE_OFFSET) - instr = *(unsigned long *)instrptr; - else { - __asm__ __volatile__( - "ldw.u %0, [%1]\n" - : "=&r"(instr) - : "r"(instrptr)); - } - - regs->UCreg_pc += 4; - - switch (CODING_BITS(instr)) { - case 0x40000120: /* ldrh or strh */ - if (LDSTH_I_BIT(instr)) - offset.un = (instr & 0x3e00) >> 4 | (instr & 31); - else - offset.un = regs->uregs[RM_BITS(instr)]; - handler = do_alignment_ldrhstrh; - break; - - case 0x60000000: /* ldr or str immediate */ - case 0x60000100: /* ldr or str immediate */ - case 0x60000020: /* ldr or str immediate */ - case 0x60000120: /* ldr or str immediate */ - offset.un = OFFSET_BITS(instr); - handler = do_alignment_ldrstr; - break; - - case 0x40000000: /* ldr or str register */ - offset.un = regs->uregs[RM_BITS(instr)]; - { - unsigned int shiftval = SHIFT_BITS(instr); - - switch (SHIFT_TYPE(instr)) { - case SHIFT_LSL: - offset.un <<= shiftval; - break; - - case SHIFT_LSR: - offset.un >>= shiftval; - break; - - case SHIFT_ASR: - offset.sn >>= shiftval; - break; - - case SHIFT_RORRRX: - if (shiftval == 0) { - offset.un >>= 1; - if (regs->UCreg_asr & PSR_C_BIT) - offset.un |= 1 << 31; - } else - offset.un = offset.un >> shiftval | - offset.un << (32 - shiftval); - break; - } - } - handler = do_alignment_ldrstr; - break; - - case 0x80000000: /* ldm or stm */ - case 0x80000020: /* ldm or stm */ - handler = do_alignment_ldmstm; - break; - - default: - goto bad; - } - - type = handler(addr, instr, regs); - - if (type == TYPE_ERROR || type == TYPE_FAULT) - goto bad_or_fault; - - if (type == TYPE_LDST) - do_alignment_finish_ldst(addr, instr, regs, offset); - - return 0; - -bad_or_fault: - if (type == TYPE_ERROR) - goto bad; - regs->UCreg_pc -= 4; - /* - * We got a fault - fix it up, or die. - */ - do_bad_area(addr, error_code, regs); - return 0; - -bad: - /* - * Oops, we didn't handle the instruction. - * However, we must handle fpu instr firstly. - */ -#ifdef CONFIG_UNICORE_FPU_F64 - /* handle co.load/store */ -#define CODING_COLS 0xc0000000 -#define COLS_OFFSET_BITS(i) (i & 0x1FF) -#define COLS_L_BITS(i) (i & (1<<24)) -#define COLS_FN_BITS(i) ((i>>14) & 31) - if ((instr & 0xe0000000) == CODING_COLS) { - unsigned int fn = COLS_FN_BITS(instr); - unsigned long val = 0; - if (COLS_L_BITS(instr)) { - get32t_unaligned_check(val, addr); - switch (fn) { -#define ASM_MTF(n) case n: \ - __asm__ __volatile__("MTF %0, F" __stringify(n) \ - : : "r"(val)); \ - break; - ASM_MTF(0); ASM_MTF(1); ASM_MTF(2); ASM_MTF(3); - ASM_MTF(4); ASM_MTF(5); ASM_MTF(6); ASM_MTF(7); - ASM_MTF(8); ASM_MTF(9); ASM_MTF(10); ASM_MTF(11); - ASM_MTF(12); ASM_MTF(13); ASM_MTF(14); ASM_MTF(15); - ASM_MTF(16); ASM_MTF(17); ASM_MTF(18); ASM_MTF(19); - ASM_MTF(20); ASM_MTF(21); ASM_MTF(22); ASM_MTF(23); - ASM_MTF(24); ASM_MTF(25); ASM_MTF(26); ASM_MTF(27); - ASM_MTF(28); ASM_MTF(29); ASM_MTF(30); ASM_MTF(31); -#undef ASM_MTF - } - } else { - switch (fn) { -#define ASM_MFF(n) case n: \ - __asm__ __volatile__("MFF %0, F" __stringify(n) \ - : : "r"(val)); \ - break; - ASM_MFF(0); ASM_MFF(1); ASM_MFF(2); ASM_MFF(3); - ASM_MFF(4); ASM_MFF(5); ASM_MFF(6); ASM_MFF(7); - ASM_MFF(8); ASM_MFF(9); ASM_MFF(10); ASM_MFF(11); - ASM_MFF(12); ASM_MFF(13); ASM_MFF(14); ASM_MFF(15); - ASM_MFF(16); ASM_MFF(17); ASM_MFF(18); ASM_MFF(19); - ASM_MFF(20); ASM_MFF(21); ASM_MFF(22); ASM_MFF(23); - ASM_MFF(24); ASM_MFF(25); ASM_MFF(26); ASM_MFF(27); - ASM_MFF(28); ASM_MFF(29); ASM_MFF(30); ASM_MFF(31); -#undef ASM_MFF - } - put32t_unaligned_check(val, addr); - } - return TYPE_COLS; - } -fault: - return TYPE_FAULT; -#endif - printk(KERN_ERR "Alignment trap: not handling instruction " - "%08lx at [<%08lx>]\n", instr, instrptr); - return 1; -} - -/* - * This needs to be done after sysctl_init, otherwise sys/ will be - * overwritten. Actually, this shouldn't be in sys/ at all since - * it isn't a sysctl, and it doesn't contain sysctl information. - */ -static int __init alignment_init(void) -{ - hook_fault_code(1, do_alignment, SIGBUS, BUS_ADRALN, - "alignment exception"); - - return 0; -} - -fs_initcall(alignment_init); diff --git a/arch/unicore32/mm/cache-ucv2.S b/arch/unicore32/mm/cache-ucv2.S deleted file mode 100644 index 2108837d6f4f..000000000000 --- a/arch/unicore32/mm/cache-ucv2.S +++ /dev/null @@ -1,209 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/mm/cache-ucv2.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * This is the "shell" of the UniCore-v2 processor support. - */ -#include -#include -#include -#include - -#include "proc-macros.S" - -/* - * __cpuc_flush_icache_all() - * __cpuc_flush_kern_all() - * __cpuc_flush_user_all() - * - * Flush the entire cache. - */ -ENTRY(__cpuc_flush_icache_all) - /*FALLTHROUGH*/ -ENTRY(__cpuc_flush_kern_all) - /*FALLTHROUGH*/ -ENTRY(__cpuc_flush_user_all) - mov r0, #0 - movc p0.c5, r0, #14 @ Dcache flush all - nop8 - - mov r0, #0 - movc p0.c5, r0, #20 @ Icache invalidate all - nop8 - - mov pc, lr - -/* - * __cpuc_flush_user_range(start, end, flags) - * - * Flush a range of TLB entries in the specified address space. - * - * - start - start address (may not be aligned) - * - end - end address (exclusive, may not be aligned) - * - flags - vm_area_struct flags describing address space - */ -ENTRY(__cpuc_flush_user_range) - cxor.a r2, #0 - beq __cpuc_dma_flush_range - -#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE - andn r0, r0, #CACHE_LINESIZE - 1 @ Safety check - sub r1, r1, r0 - csub.a r1, #MAX_AREA_SIZE - bsg 2f - - andn r1, r1, #CACHE_LINESIZE - 1 - add r1, r1, #CACHE_LINESIZE - -101: dcacheline_flush r0, r11, r12 - - add r0, r0, #CACHE_LINESIZE - sub.a r1, r1, #CACHE_LINESIZE - bns 101b - b 3f -#endif -2: mov ip, #0 - movc p0.c5, ip, #14 @ Dcache flush all - nop8 - -3: mov ip, #0 - movc p0.c5, ip, #20 @ Icache invalidate all - nop8 - - mov pc, lr - -/* - * __cpuc_coherent_kern_range(start,end) - * __cpuc_coherent_user_range(start,end) - * - * Ensure that the I and D caches are coherent within specified - * region. This is typically used when code has been written to - * a memory region, and will be executed. - * - * - start - virtual start address of region - * - end - virtual end address of region - */ -ENTRY(__cpuc_coherent_kern_range) - /* FALLTHROUGH */ -ENTRY(__cpuc_coherent_user_range) -#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE - andn r0, r0, #CACHE_LINESIZE - 1 @ Safety check - sub r1, r1, r0 - csub.a r1, #MAX_AREA_SIZE - bsg 2f - - andn r1, r1, #CACHE_LINESIZE - 1 - add r1, r1, #CACHE_LINESIZE - - @ r0 va2pa r10 - mov r9, #PAGE_SZ - sub r9, r9, #1 @ PAGE_MASK -101: va2pa r0, r10, r11, r12, r13, 2f @ r10 is PA - b 103f -102: cand.a r0, r9 - beq 101b - -103: movc p0.c5, r10, #11 @ Dcache clean line of R10 - nop8 - - add r0, r0, #CACHE_LINESIZE - add r10, r10, #CACHE_LINESIZE - sub.a r1, r1, #CACHE_LINESIZE - bns 102b - b 3f -#endif -2: mov ip, #0 - movc p0.c5, ip, #10 @ Dcache clean all - nop8 - -3: mov ip, #0 - movc p0.c5, ip, #20 @ Icache invalidate all - nop8 - - mov pc, lr - -/* - * __cpuc_flush_kern_dcache_area(void *addr, size_t size) - * - * - addr - kernel address - * - size - region size - */ -ENTRY(__cpuc_flush_kern_dcache_area) - mov ip, #0 - movc p0.c5, ip, #14 @ Dcache flush all - nop8 - mov pc, lr - -/* - * __cpuc_dma_clean_range(start,end) - * - start - virtual start address of region - * - end - virtual end address of region - */ -ENTRY(__cpuc_dma_clean_range) -#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE - andn r0, r0, #CACHE_LINESIZE - 1 - sub r1, r1, r0 - andn r1, r1, #CACHE_LINESIZE - 1 - add r1, r1, #CACHE_LINESIZE - - csub.a r1, #MAX_AREA_SIZE - bsg 2f - - @ r0 va2pa r10 - mov r9, #PAGE_SZ - sub r9, r9, #1 @ PAGE_MASK -101: va2pa r0, r10, r11, r12, r13, 2f @ r10 is PA - b 1f -102: cand.a r0, r9 - beq 101b - -1: movc p0.c5, r10, #11 @ Dcache clean line of R10 - nop8 - add r0, r0, #CACHE_LINESIZE - add r10, r10, #CACHE_LINESIZE - sub.a r1, r1, #CACHE_LINESIZE - bns 102b - mov pc, lr -#endif -2: mov ip, #0 - movc p0.c5, ip, #10 @ Dcache clean all - nop8 - - mov pc, lr - -/* - * __cpuc_dma_inv_range(start,end) - * __cpuc_dma_flush_range(start,end) - * - start - virtual start address of region - * - end - virtual end address of region - */ -__cpuc_dma_inv_range: - /* FALLTHROUGH */ -ENTRY(__cpuc_dma_flush_range) -#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE - andn r0, r0, #CACHE_LINESIZE - 1 - sub r1, r1, r0 - andn r1, r1, #CACHE_LINESIZE - 1 - add r1, r1, #CACHE_LINESIZE - - csub.a r1, #MAX_AREA_SIZE - bsg 2f - - @ r0 va2pa r10 -101: dcacheline_flush r0, r11, r12 - - add r0, r0, #CACHE_LINESIZE - sub.a r1, r1, #CACHE_LINESIZE - bns 101b - mov pc, lr -#endif -2: mov ip, #0 - movc p0.c5, ip, #14 @ Dcache flush all - nop8 - - mov pc, lr - diff --git a/arch/unicore32/mm/extable.c b/arch/unicore32/mm/extable.c deleted file mode 100644 index e53352b41c4a..000000000000 --- a/arch/unicore32/mm/extable.c +++ /dev/null @@ -1,21 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/mm/extable.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include - -int fixup_exception(struct pt_regs *regs) -{ - const struct exception_table_entry *fixup; - - fixup = search_exception_tables(instruction_pointer(regs)); - if (fixup) - regs->UCreg_pc = fixup->fixup; - - return fixup != NULL; -} diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c deleted file mode 100644 index 7654bddde133..000000000000 --- a/arch/unicore32/mm/fault.c +++ /dev/null @@ -1,481 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/mm/fault.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* - * Fault status register encodings. We steal bit 31 for our own purposes. - */ -#define FSR_LNX_PF (1 << 31) - -static inline int fsr_fs(unsigned int fsr) -{ - /* xyabcde will be abcde+xy */ - return (fsr & 31) + ((fsr & (3 << 5)) >> 5); -} - -/* - * This is useful to dump out the page tables associated with - * 'addr' in mm 'mm'. - */ -void show_pte(struct mm_struct *mm, unsigned long addr) -{ - pgd_t *pgd; - - if (!mm) - mm = &init_mm; - - printk(KERN_ALERT "pgd = %p\n", mm->pgd); - pgd = pgd_offset(mm, addr); - printk(KERN_ALERT "[%08lx] *pgd=%08lx", addr, pgd_val(*pgd)); - - do { - pmd_t *pmd; - pte_t *pte; - - if (pgd_none(*pgd)) - break; - - if (pgd_bad(*pgd)) { - printk("(bad)"); - break; - } - - pmd = pmd_offset((pud_t *) pgd, addr); - if (PTRS_PER_PMD != 1) - printk(", *pmd=%08lx", pmd_val(*pmd)); - - if (pmd_none(*pmd)) - break; - - if (pmd_bad(*pmd)) { - printk("(bad)"); - break; - } - - /* We must not map this if we have highmem enabled */ - if (PageHighMem(pfn_to_page(pmd_val(*pmd) >> PAGE_SHIFT))) - break; - - pte = pte_offset_map(pmd, addr); - printk(", *pte=%08lx", pte_val(*pte)); - pte_unmap(pte); - } while (0); - - printk("\n"); -} - -/* - * Oops. The kernel tried to access some page that wasn't present. - */ -static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr, - unsigned int fsr, struct pt_regs *regs) -{ - /* - * Are we prepared to handle this kernel fault? - */ - if (fixup_exception(regs)) - return; - - /* - * No handler, we'll have to terminate things with extreme prejudice. - */ - bust_spinlocks(1); - printk(KERN_ALERT - "Unable to handle kernel %s at virtual address %08lx\n", - (addr < PAGE_SIZE) ? "NULL pointer dereference" : - "paging request", addr); - - show_pte(mm, addr); - die("Oops", regs, fsr); - bust_spinlocks(0); - do_exit(SIGKILL); -} - -/* - * Something tried to access memory that isn't in our memory map.. - * User mode accesses just cause a SIGSEGV - */ -static void __do_user_fault(unsigned long addr, unsigned int fsr, - unsigned int sig, int code, struct pt_regs *regs) -{ - struct task_struct *tsk = current; - - tsk->thread.address = addr; - tsk->thread.error_code = fsr; - tsk->thread.trap_no = 14; - force_sig_fault(sig, code, (void __user *)addr); -} - -void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs) -{ - struct task_struct *tsk = current; - struct mm_struct *mm = tsk->active_mm; - - /* - * If we are in kernel mode at this point, we - * have no context to handle this fault with. - */ - if (user_mode(regs)) - __do_user_fault(addr, fsr, SIGSEGV, SEGV_MAPERR, regs); - else - __do_kernel_fault(mm, addr, fsr, regs); -} - -#define VM_FAULT_BADMAP 0x010000 -#define VM_FAULT_BADACCESS 0x020000 - -/* - * Check that the permissions on the VMA allow for the fault which occurred. - * If we encountered a write fault, we must have write permission, otherwise - * we allow any permission. - */ -static inline bool access_error(unsigned int fsr, struct vm_area_struct *vma) -{ - unsigned int mask = VM_ACCESS_FLAGS; - - if (!(fsr ^ 0x12)) /* write? */ - mask = VM_WRITE; - if (fsr & FSR_LNX_PF) - mask = VM_EXEC; - - return vma->vm_flags & mask ? false : true; -} - -static vm_fault_t __do_pf(struct mm_struct *mm, unsigned long addr, - unsigned int fsr, unsigned int flags, struct task_struct *tsk) -{ - struct vm_area_struct *vma; - vm_fault_t fault; - - vma = find_vma(mm, addr); - fault = VM_FAULT_BADMAP; - if (unlikely(!vma)) - goto out; - if (unlikely(vma->vm_start > addr)) - goto check_stack; - - /* - * Ok, we have a good vm_area for this - * memory access, so we can handle it. - */ -good_area: - if (access_error(fsr, vma)) { - fault = VM_FAULT_BADACCESS; - goto out; - } - - /* - * If for any reason at all we couldn't handle the fault, make - * sure we exit gracefully rather than endlessly redo the fault. - */ - fault = handle_mm_fault(vma, addr & PAGE_MASK, flags); - return fault; - -check_stack: - if (vma->vm_flags & VM_GROWSDOWN && !expand_stack(vma, addr)) - goto good_area; -out: - return fault; -} - -static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs) -{ - struct task_struct *tsk; - struct mm_struct *mm; - int sig, code; - vm_fault_t fault; - unsigned int flags = FAULT_FLAG_DEFAULT; - - tsk = current; - mm = tsk->mm; - - /* - * If we're in an interrupt or have no user - * context, we must not take the fault.. - */ - if (faulthandler_disabled() || !mm) - goto no_context; - - if (user_mode(regs)) - flags |= FAULT_FLAG_USER; - if (!(fsr ^ 0x12)) - flags |= FAULT_FLAG_WRITE; - - /* - * As per x86, we may deadlock here. However, since the kernel only - * validly references user space from well defined areas of the code, - * we can bug out early if this is from code which shouldn't. - */ - if (!mmap_read_trylock(mm)) { - if (!user_mode(regs) - && !search_exception_tables(regs->UCreg_pc)) - goto no_context; -retry: - mmap_read_lock(mm); - } else { - /* - * The above down_read_trylock() might have succeeded in - * which case, we'll have missed the might_sleep() from - * down_read() - */ - might_sleep(); -#ifdef CONFIG_DEBUG_VM - if (!user_mode(regs) && - !search_exception_tables(regs->UCreg_pc)) - goto no_context; -#endif - } - - fault = __do_pf(mm, addr, fsr, flags, tsk); - - /* If we need to retry but a fatal signal is pending, handle the - * signal first. We do not need to release the mmap_lock because - * it would already be released in __lock_page_or_retry in - * mm/filemap.c. */ - if (fault_signal_pending(fault, regs)) - return 0; - - if (!(fault & VM_FAULT_ERROR) && (flags & FAULT_FLAG_ALLOW_RETRY)) { - if (fault & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - if (fault & VM_FAULT_RETRY) { - flags |= FAULT_FLAG_TRIED; - goto retry; - } - } - - mmap_read_unlock(mm); - - /* - * Handle the "normal" case first - VM_FAULT_MAJOR - */ - if (likely(!(fault & - (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) - return 0; - - /* - * If we are in kernel mode at this point, we - * have no context to handle this fault with. - */ - if (!user_mode(regs)) - goto no_context; - - if (fault & VM_FAULT_OOM) { - /* - * We ran out of memory, call the OOM killer, and return to - * userspace (which will retry the fault, or kill us if we - * got oom-killed) - */ - pagefault_out_of_memory(); - return 0; - } - - if (fault & VM_FAULT_SIGBUS) { - /* - * We had some memory, but were unable to - * successfully fix up this page fault. - */ - sig = SIGBUS; - code = BUS_ADRERR; - } else { - /* - * Something tried to access memory that - * isn't in our memory map.. - */ - sig = SIGSEGV; - code = fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR; - } - - __do_user_fault(addr, fsr, sig, code, regs); - return 0; - -no_context: - __do_kernel_fault(mm, addr, fsr, regs); - return 0; -} - -/* - * First Level Translation Fault Handler - * - * We enter here because the first level page table doesn't contain - * a valid entry for the address. - * - * If the address is in kernel space (>= TASK_SIZE), then we are - * probably faulting in the vmalloc() area. - * - * If the init_task's first level page tables contains the relevant - * entry, we copy the it to this task. If not, we send the process - * a signal, fixup the exception, or oops the kernel. - * - * NOTE! We MUST NOT take any locks for this case. We may be in an - * interrupt or a critical region, and should only copy the information - * from the master page table, nothing more. - */ -static int do_ifault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) -{ - unsigned int index; - pgd_t *pgd, *pgd_k; - pmd_t *pmd, *pmd_k; - - if (addr < TASK_SIZE) - return do_pf(addr, fsr, regs); - - if (user_mode(regs)) - goto bad_area; - - index = pgd_index(addr); - - pgd = cpu_get_pgd() + index; - pgd_k = init_mm.pgd + index; - - if (pgd_none(*pgd_k)) - goto bad_area; - - pmd_k = pmd_offset((pud_t *) pgd_k, addr); - pmd = pmd_offset((pud_t *) pgd, addr); - - if (pmd_none(*pmd_k)) - goto bad_area; - - set_pmd(pmd, *pmd_k); - flush_pmd_entry(pmd); - return 0; - -bad_area: - do_bad_area(addr, fsr, regs); - return 0; -} - -/* - * This abort handler always returns "fault". - */ -static int do_bad(unsigned long addr, unsigned int fsr, struct pt_regs *regs) -{ - return 1; -} - -static int do_good(unsigned long addr, unsigned int fsr, struct pt_regs *regs) -{ - unsigned int res1, res2; - - printk("dabt exception but no error!\n"); - - __asm__ __volatile__( - "mff %0,f0\n" - "mff %1,f1\n" - : "=r"(res1), "=r"(res2) - : - : "memory"); - - printk(KERN_EMERG "r0 :%08x r1 :%08x\n", res1, res2); - panic("shut up\n"); - return 0; -} - -static struct fsr_info { - int (*fn) (unsigned long addr, unsigned int fsr, struct pt_regs *regs); - int sig; - int code; - const char *name; -} fsr_info[] = { - /* - * The following are the standard Unicore-I and UniCore-II aborts. - */ - { do_good, SIGBUS, 0, "no error" }, - { do_bad, SIGBUS, BUS_ADRALN, "alignment exception" }, - { do_bad, SIGBUS, BUS_OBJERR, "external exception" }, - { do_bad, SIGBUS, 0, "burst operation" }, - { do_bad, SIGBUS, 0, "unknown 00100" }, - { do_ifault, SIGSEGV, SEGV_MAPERR, "2nd level pt non-exist"}, - { do_bad, SIGBUS, 0, "2nd lvl large pt non-exist" }, - { do_bad, SIGBUS, 0, "invalid pte" }, - { do_pf, SIGSEGV, SEGV_MAPERR, "page miss" }, - { do_bad, SIGBUS, 0, "middle page miss" }, - { do_bad, SIGBUS, 0, "large page miss" }, - { do_pf, SIGSEGV, SEGV_MAPERR, "super page (section) miss" }, - { do_bad, SIGBUS, 0, "unknown 01100" }, - { do_bad, SIGBUS, 0, "unknown 01101" }, - { do_bad, SIGBUS, 0, "unknown 01110" }, - { do_bad, SIGBUS, 0, "unknown 01111" }, - { do_bad, SIGBUS, 0, "addr: up 3G or IO" }, - { do_pf, SIGSEGV, SEGV_ACCERR, "read unreadable addr" }, - { do_pf, SIGSEGV, SEGV_ACCERR, "write unwriteable addr"}, - { do_pf, SIGSEGV, SEGV_ACCERR, "exec unexecutable addr"}, - { do_bad, SIGBUS, 0, "unknown 10100" }, - { do_bad, SIGBUS, 0, "unknown 10101" }, - { do_bad, SIGBUS, 0, "unknown 10110" }, - { do_bad, SIGBUS, 0, "unknown 10111" }, - { do_bad, SIGBUS, 0, "unknown 11000" }, - { do_bad, SIGBUS, 0, "unknown 11001" }, - { do_bad, SIGBUS, 0, "unknown 11010" }, - { do_bad, SIGBUS, 0, "unknown 11011" }, - { do_bad, SIGBUS, 0, "unknown 11100" }, - { do_bad, SIGBUS, 0, "unknown 11101" }, - { do_bad, SIGBUS, 0, "unknown 11110" }, - { do_bad, SIGBUS, 0, "unknown 11111" } -}; - -void __init hook_fault_code(int nr, - int (*fn) (unsigned long, unsigned int, struct pt_regs *), - int sig, int code, const char *name) -{ - if (nr < 0 || nr >= ARRAY_SIZE(fsr_info)) - BUG(); - - fsr_info[nr].fn = fn; - fsr_info[nr].sig = sig; - fsr_info[nr].code = code; - fsr_info[nr].name = name; -} - -/* - * Dispatch a data abort to the relevant handler. - */ -asmlinkage void do_DataAbort(unsigned long addr, unsigned int fsr, - struct pt_regs *regs) -{ - const struct fsr_info *inf = fsr_info + fsr_fs(fsr); - - if (!inf->fn(addr, fsr & ~FSR_LNX_PF, regs)) - return; - - printk(KERN_ALERT "Unhandled fault: %s (0x%03x) at 0x%08lx\n", - inf->name, fsr, addr); - - uc32_notify_die("", regs, inf->sig, inf->code, (void __user *)addr, - fsr, 0); -} - -asmlinkage void do_PrefetchAbort(unsigned long addr, - unsigned int ifsr, struct pt_regs *regs) -{ - const struct fsr_info *inf = fsr_info + fsr_fs(ifsr); - - if (!inf->fn(addr, ifsr | FSR_LNX_PF, regs)) - return; - - printk(KERN_ALERT "Unhandled prefetch abort: %s (0x%03x) at 0x%08lx\n", - inf->name, ifsr, addr); - - uc32_notify_die("", regs, inf->sig, inf->code, (void __user *)addr, - ifsr, 0); -} diff --git a/arch/unicore32/mm/flush.c b/arch/unicore32/mm/flush.c deleted file mode 100644 index 65954f8d89a2..000000000000 --- a/arch/unicore32/mm/flush.c +++ /dev/null @@ -1,94 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/mm/flush.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include - -#include -#include - -void flush_cache_mm(struct mm_struct *mm) -{ -} - -void flush_cache_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) -{ - if (vma->vm_flags & VM_EXEC) - __flush_icache_all(); -} - -void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr, - unsigned long pfn) -{ -} - -static void flush_ptrace_access(struct vm_area_struct *vma, struct page *page, - unsigned long uaddr, void *kaddr, unsigned long len) -{ - /* VIPT non-aliasing D-cache */ - if (vma->vm_flags & VM_EXEC) { - unsigned long addr = (unsigned long)kaddr; - - __cpuc_coherent_kern_range(addr, addr + len); - } -} - -/* - * Copy user data from/to a page which is mapped into a different - * processes address space. Really, we want to allow our "user - * space" model to handle this. - * - * Note that this code needs to run on the current CPU. - */ -void copy_to_user_page(struct vm_area_struct *vma, struct page *page, - unsigned long uaddr, void *dst, const void *src, - unsigned long len) -{ - memcpy(dst, src, len); - flush_ptrace_access(vma, page, uaddr, dst, len); -} - -void __flush_dcache_page(struct address_space *mapping, struct page *page) -{ - /* - * Writeback any data associated with the kernel mapping of this - * page. This ensures that data in the physical page is mutually - * coherent with the kernels mapping. - */ - __cpuc_flush_kern_dcache_area(page_address(page), PAGE_SIZE); -} - -/* - * Ensure cache coherency between kernel mapping and userspace mapping - * of this page. - */ -void flush_dcache_page(struct page *page) -{ - struct address_space *mapping; - - /* - * The zero page is never written to, so never has any dirty - * cache lines, and therefore never needs to be flushed. - */ - if (page == ZERO_PAGE(0)) - return; - - mapping = page_mapping_file(page); - - if (mapping && !mapping_mapped(mapping)) - clear_bit(PG_dcache_clean, &page->flags); - else { - __flush_dcache_page(mapping, page); - if (mapping) - __flush_icache_all(); - set_bit(PG_dcache_clean, &page->flags); - } -} -EXPORT_SYMBOL(flush_dcache_page); diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c deleted file mode 100644 index 52425d383cea..000000000000 --- a/arch/unicore32/mm/init.c +++ /dev/null @@ -1,261 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/mm/init.c - * - * Copyright (C) 2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "mm.h" - -/* - * This keeps memory configuration data used by a couple memory - * initialization functions, as well as show_mem() for the skipping - * of holes in the memory map. It is populated by uc32_add_memory(). - */ -struct meminfo meminfo; - -static void __init find_limits(unsigned long *min, unsigned long *max_low, - unsigned long *max_high) -{ - struct meminfo *mi = &meminfo; - int i; - - *min = -1UL; - *max_low = *max_high = 0; - - for_each_bank(i, mi) { - struct membank *bank = &mi->bank[i]; - unsigned long start, end; - - start = bank_pfn_start(bank); - end = bank_pfn_end(bank); - - if (*min > start) - *min = start; - if (*max_high < end) - *max_high = end; - if (bank->highmem) - continue; - if (*max_low < end) - *max_low = end; - } -} - -static void __init uc32_bootmem_free(unsigned long max_low) -{ - unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - - max_zone_pfn[ZONE_DMA] = max_low; - max_zone_pfn[ZONE_NORMAL] = max_low; - - /* - * Adjust the sizes according to any special requirements for - * this machine type. - * This might lower ZONE_DMA limit. - */ - arch_adjust_zones(max_zone_pfn); - - free_area_init(max_zone_pfn); -} - -int pfn_valid(unsigned long pfn) -{ - return memblock_is_memory(pfn << PAGE_SHIFT); -} -EXPORT_SYMBOL(pfn_valid); - -static void uc32_memory_present(void) -{ -} - -static int __init meminfo_cmp(const void *_a, const void *_b) -{ - const struct membank *a = _a, *b = _b; - long cmp = bank_pfn_start(a) - bank_pfn_start(b); - return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; -} - -void __init uc32_memblock_init(struct meminfo *mi) -{ - int i; - - sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), - meminfo_cmp, NULL); - - for (i = 0; i < mi->nr_banks; i++) - memblock_add(mi->bank[i].start, mi->bank[i].size); - - /* Register the kernel text, kernel data and initrd with memblock. */ - memblock_reserve(__pa(_text), _end - _text); - -#ifdef CONFIG_BLK_DEV_INITRD - if (!phys_initrd_size) { - phys_initrd_start = 0x01000000; - phys_initrd_size = SZ_8M; - } - - if (phys_initrd_size) { - memblock_reserve(phys_initrd_start, phys_initrd_size); - - /* Now convert initrd to virtual addresses */ - initrd_start = __phys_to_virt(phys_initrd_start); - initrd_end = initrd_start + phys_initrd_size; - } -#endif - - uc32_mm_memblock_reserve(); - - memblock_allow_resize(); - memblock_dump_all(); -} - -void __init bootmem_init(void) -{ - unsigned long min, max_low, max_high; - - max_low = max_high = 0; - - find_limits(&min, &max_low, &max_high); - - node_set_online(0); - - /* - * Sparsemem tries to allocate bootmem in memory_present(), - * so must be done after the fixed reservations - */ - uc32_memory_present(); - - /* - * sparse_init() needs the bootmem allocator up and running. - */ - sparse_init(); - - /* - * Now free the memory - free_area_init needs - * the sparse mem_map arrays initialized by sparse_init() - * for memmap_init_zone(), otherwise all PFNs are invalid. - */ - uc32_bootmem_free(max_low); - - high_memory = __va((max_low << PAGE_SHIFT) - 1) + 1; - - /* - * This doesn't seem to be used by the Linux memory manager any - * more, but is used by ll_rw_block. If we can get rid of it, we - * also get rid of some of the stuff above as well. - * - * Note: max_low_pfn and max_pfn reflect the number of _pages_ in - * the system, not the maximum PFN. - */ - max_low_pfn = max_low - PHYS_PFN_OFFSET; - max_pfn = max_high - PHYS_PFN_OFFSET; -} - -static inline void -free_memmap(unsigned long start_pfn, unsigned long end_pfn) -{ - struct page *start_pg, *end_pg; - unsigned long pg, pgend; - - /* - * Convert start_pfn/end_pfn to a struct page pointer. - */ - start_pg = pfn_to_page(start_pfn - 1) + 1; - end_pg = pfn_to_page(end_pfn); - - /* - * Convert to physical addresses, and - * round start upwards and end downwards. - */ - pg = PAGE_ALIGN(__pa(start_pg)); - pgend = __pa(end_pg) & PAGE_MASK; - - /* - * If there are free pages between these, - * free the section of the memmap array. - */ - if (pg < pgend) - memblock_free(pg, pgend - pg); -} - -/* - * The mem_map array can get very big. Free the unused area of the memory map. - */ -static void __init free_unused_memmap(struct meminfo *mi) -{ - unsigned long bank_start, prev_bank_end = 0; - unsigned int i; - - /* - * This relies on each bank being in address order. - * The banks are sorted previously in bootmem_init(). - */ - for_each_bank(i, mi) { - struct membank *bank = &mi->bank[i]; - - bank_start = bank_pfn_start(bank); - - /* - * If we had a previous bank, and there is a space - * between the current bank and the previous, free it. - */ - if (prev_bank_end && prev_bank_end < bank_start) - free_memmap(prev_bank_end, bank_start); - - /* - * Align up here since the VM subsystem insists that the - * memmap entries are valid from the bank end aligned to - * MAX_ORDER_NR_PAGES. - */ - prev_bank_end = ALIGN(bank_pfn_end(bank), MAX_ORDER_NR_PAGES); - } -} - -/* - * mem_init() marks the free areas in the mem_map and tells us how much - * memory is free. This is done after various parts of the system have - * claimed their memory after the kernel image. - */ -void __init mem_init(void) -{ - max_mapnr = pfn_to_page(max_pfn + PHYS_PFN_OFFSET) - mem_map; - - free_unused_memmap(&meminfo); - - /* this will put all unused low memory onto the freelists */ - memblock_free_all(); - - mem_init_print_info(NULL); - - BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR); - BUG_ON(TASK_SIZE > MODULES_VADDR); - - if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) { - /* - * On a machine this small we won't get - * anywhere without overcommit, so turn - * it on by default. - */ - sysctl_overcommit_memory = OVERCOMMIT_ALWAYS; - } -} diff --git a/arch/unicore32/mm/ioremap.c b/arch/unicore32/mm/ioremap.c deleted file mode 100644 index 46a64bd6156a..000000000000 --- a/arch/unicore32/mm/ioremap.c +++ /dev/null @@ -1,242 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/mm/ioremap.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * Re-map IO memory to kernel address space so that we can access it. - * - * This allows a driver to remap an arbitrary region of bus memory into - * virtual space. One should *only* use readl, writel, memcpy_toio and - * so on with such remapped areas. - * - * Because UniCore only has a 32-bit address space we can't address the - * whole of the (physical) PCI space at once. PCI huge-mode addressing - * allows us to circumvent this restriction by splitting PCI space into - * two 2GB chunks and mapping only one at a time into processor memory. - * We use MMU protection domains to trap any attempt to access the bank - * that is not currently mapped. (This isn't fully implemented yet.) - */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include "mm.h" - -/* - * Used by ioremap() and iounmap() code to mark (super)section-mapped - * I/O regions in vm_struct->flags field. - */ -#define VM_UNICORE_SECTION_MAPPING 0x80000000 - -int ioremap_page(unsigned long virt, unsigned long phys, - const struct mem_type *mtype) -{ - return ioremap_page_range(virt, virt + PAGE_SIZE, phys, - __pgprot(mtype->prot_pte)); -} -EXPORT_SYMBOL(ioremap_page); - -/* - * Section support is unsafe on SMP - If you iounmap and ioremap a region, - * the other CPUs will not see this change until their next context switch. - * Meanwhile, (eg) if an interrupt comes in on one of those other CPUs - * which requires the new ioremap'd region to be referenced, the CPU will - * reference the _old_ region. - * - * Note that get_vm_area_caller() allocates a guard 4K page, so we need to - * mask the size back to 4MB aligned or we will overflow in the loop below. - */ -static void unmap_area_sections(unsigned long virt, unsigned long size) -{ - unsigned long addr = virt, end = virt + (size & ~(SZ_4M - 1)); - pgd_t *pgd; - - flush_cache_vunmap(addr, end); - pgd = pgd_offset_k(addr); - do { - pmd_t pmd, *pmdp = pmd_offset((pud_t *)pgd, addr); - - pmd = *pmdp; - if (!pmd_none(pmd)) { - /* - * Clear the PMD from the page table, and - * increment the kvm sequence so others - * notice this change. - * - * Note: this is still racy on SMP machines. - */ - pmd_clear(pmdp); - - /* - * Free the page table, if there was one. - */ - if ((pmd_val(pmd) & PMD_TYPE_MASK) == PMD_TYPE_TABLE) - pte_free_kernel(&init_mm, pmd_page_vaddr(pmd)); - } - - addr += PGDIR_SIZE; - pgd++; - } while (addr < end); - - flush_tlb_kernel_range(virt, end); -} - -static int -remap_area_sections(unsigned long virt, unsigned long pfn, - size_t size, const struct mem_type *type) -{ - unsigned long addr = virt, end = virt + size; - pgd_t *pgd; - - /* - * Remove and free any PTE-based mapping, and - * sync the current kernel mapping. - */ - unmap_area_sections(virt, size); - - pgd = pgd_offset_k(addr); - do { - pmd_t *pmd = pmd_offset((pud_t *)pgd, addr); - - set_pmd(pmd, __pmd(__pfn_to_phys(pfn) | type->prot_sect)); - pfn += SZ_4M >> PAGE_SHIFT; - flush_pmd_entry(pmd); - - addr += PGDIR_SIZE; - pgd++; - } while (addr < end); - - return 0; -} - -void __iomem *__uc32_ioremap_pfn_caller(unsigned long pfn, - unsigned long offset, size_t size, unsigned int mtype, void *caller) -{ - const struct mem_type *type; - int err; - unsigned long addr; - struct vm_struct *area; - - /* - * High mappings must be section aligned - */ - if (pfn >= 0x100000 && (__pfn_to_phys(pfn) & ~SECTION_MASK)) - return NULL; - - /* - * Don't allow RAM to be mapped - */ - if (pfn_valid(pfn)) { - WARN(1, "BUG: Your driver calls ioremap() on\n" - "system memory. This leads to architecturally\n" - "unpredictable behaviour, and ioremap() will fail in\n" - "the next kernel release. Please fix your driver.\n"); - return NULL; - } - - type = get_mem_type(mtype); - if (!type) - return NULL; - - /* - * Page align the mapping size, taking account of any offset. - */ - size = PAGE_ALIGN(offset + size); - - area = get_vm_area_caller(size, VM_IOREMAP, caller); - if (!area) - return NULL; - addr = (unsigned long)area->addr; - - if (!((__pfn_to_phys(pfn) | size | addr) & ~PMD_MASK)) { - area->flags |= VM_UNICORE_SECTION_MAPPING; - err = remap_area_sections(addr, pfn, size, type); - } else - err = ioremap_page_range(addr, addr + size, __pfn_to_phys(pfn), - __pgprot(type->prot_pte)); - - if (err) { - vunmap((void *)addr); - return NULL; - } - - flush_cache_vmap(addr, addr + size); - return (void __iomem *) (offset + addr); -} - -void __iomem *__uc32_ioremap_caller(unsigned long phys_addr, size_t size, - unsigned int mtype, void *caller) -{ - unsigned long last_addr; - unsigned long offset = phys_addr & ~PAGE_MASK; - unsigned long pfn = __phys_to_pfn(phys_addr); - - /* - * Don't allow wraparound or zero size - */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - - return __uc32_ioremap_pfn_caller(pfn, offset, size, mtype, caller); -} - -/* - * Remap an arbitrary physical address space into the kernel virtual - * address space. Needed when the kernel wants to access high addresses - * directly. - * - * NOTE! We need to allow non-page-aligned mappings too: we will obviously - * have to convert them into an offset in a page-aligned mapping, but the - * caller shouldn't need to know that small detail. - */ -void __iomem * -__uc32_ioremap_pfn(unsigned long pfn, unsigned long offset, size_t size, - unsigned int mtype) -{ - return __uc32_ioremap_pfn_caller(pfn, offset, size, mtype, - __builtin_return_address(0)); -} -EXPORT_SYMBOL(__uc32_ioremap_pfn); - -void __iomem * -__uc32_ioremap(unsigned long phys_addr, size_t size) -{ - return __uc32_ioremap_caller(phys_addr, size, MT_DEVICE, - __builtin_return_address(0)); -} -EXPORT_SYMBOL(__uc32_ioremap); - -void __uc32_iounmap(volatile void __iomem *io_addr) -{ - void *addr = (void *)(PAGE_MASK & (unsigned long)io_addr); - struct vm_struct *vm; - - /* - * If this is a section based mapping we need to handle it - * specially as the VM subsystem does not know how to handle - * such a beast. We need the lock here b/c we need to clear - * all the mappings before the area can be reclaimed - * by someone else. - */ - vm = find_vm_area(addr); - if (vm && (vm->flags & VM_IOREMAP) && - (vm->flags & VM_UNICORE_SECTION_MAPPING)) - unmap_area_sections((unsigned long)vm->addr, vm->size); - - vunmap(addr); -} -EXPORT_SYMBOL(__uc32_iounmap); diff --git a/arch/unicore32/mm/mm.h b/arch/unicore32/mm/mm.h deleted file mode 100644 index f157f5d249ab..000000000000 --- a/arch/unicore32/mm/mm.h +++ /dev/null @@ -1,31 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/mm/mm.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include - -/* the upper-most page table pointer */ -extern pmd_t *top_pmd; -extern int sysctl_overcommit_memory; - -#define TOP_PTE(x) pte_offset_kernel(top_pmd, x) - -struct mem_type { - unsigned int prot_pte; - unsigned int prot_l1; - unsigned int prot_sect; -}; - -const struct mem_type *get_mem_type(unsigned int type); - -extern void __flush_dcache_page(struct address_space *, struct page *); -extern void hook_fault_code(int nr, int (*fn) - (unsigned long, unsigned int, struct pt_regs *), - int sig, int code, const char *name); - -void __init bootmem_init(void); -void uc32_mm_memblock_reserve(void); diff --git a/arch/unicore32/mm/mmu.c b/arch/unicore32/mm/mmu.c deleted file mode 100644 index 183d5b056814..000000000000 --- a/arch/unicore32/mm/mmu.c +++ /dev/null @@ -1,513 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/mm/mmu.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include - -#include "mm.h" - -/* - * empty_zero_page is a special page that is used for - * zero-initialized data and COW. - */ -struct page *empty_zero_page; -EXPORT_SYMBOL(empty_zero_page); - -/* - * The pmd table for the upper-most set of pages. - */ -pmd_t *top_pmd; - -pgprot_t pgprot_user; -EXPORT_SYMBOL(pgprot_user); - -pgprot_t pgprot_kernel; -EXPORT_SYMBOL(pgprot_kernel); - -static int __init noalign_setup(char *__unused) -{ - cr_alignment &= ~CR_A; - cr_no_alignment &= ~CR_A; - set_cr(cr_alignment); - return 1; -} -__setup("noalign", noalign_setup); - -void adjust_cr(unsigned long mask, unsigned long set) -{ - unsigned long flags; - - mask &= ~CR_A; - - set &= mask; - - local_irq_save(flags); - - cr_no_alignment = (cr_no_alignment & ~mask) | set; - cr_alignment = (cr_alignment & ~mask) | set; - - set_cr((get_cr() & ~mask) | set); - - local_irq_restore(flags); -} - -struct map_desc { - unsigned long virtual; - unsigned long pfn; - unsigned long length; - unsigned int type; -}; - -#define PROT_PTE_DEVICE (PTE_PRESENT | PTE_YOUNG | \ - PTE_DIRTY | PTE_READ | PTE_WRITE) -#define PROT_SECT_DEVICE (PMD_TYPE_SECT | PMD_PRESENT | \ - PMD_SECT_READ | PMD_SECT_WRITE) - -static struct mem_type mem_types[] = { - [MT_DEVICE] = { /* Strongly ordered */ - .prot_pte = PROT_PTE_DEVICE, - .prot_l1 = PMD_TYPE_TABLE | PMD_PRESENT, - .prot_sect = PROT_SECT_DEVICE, - }, - /* - * MT_KUSER: pte for vecpage -- cacheable, - * and sect for unigfx mmap -- noncacheable - */ - [MT_KUSER] = { - .prot_pte = PTE_PRESENT | PTE_YOUNG | PTE_DIRTY | - PTE_CACHEABLE | PTE_READ | PTE_EXEC, - .prot_l1 = PMD_TYPE_TABLE | PMD_PRESENT, - .prot_sect = PROT_SECT_DEVICE, - }, - [MT_HIGH_VECTORS] = { - .prot_pte = PTE_PRESENT | PTE_YOUNG | PTE_DIRTY | - PTE_CACHEABLE | PTE_READ | PTE_WRITE | - PTE_EXEC, - .prot_l1 = PMD_TYPE_TABLE | PMD_PRESENT, - }, - [MT_MEMORY] = { - .prot_pte = PTE_PRESENT | PTE_YOUNG | PTE_DIRTY | - PTE_WRITE | PTE_EXEC, - .prot_l1 = PMD_TYPE_TABLE | PMD_PRESENT, - .prot_sect = PMD_TYPE_SECT | PMD_PRESENT | PMD_SECT_CACHEABLE | - PMD_SECT_READ | PMD_SECT_WRITE | PMD_SECT_EXEC, - }, - [MT_ROM] = { - .prot_sect = PMD_TYPE_SECT | PMD_PRESENT | PMD_SECT_CACHEABLE | - PMD_SECT_READ, - }, -}; - -const struct mem_type *get_mem_type(unsigned int type) -{ - return type < ARRAY_SIZE(mem_types) ? &mem_types[type] : NULL; -} -EXPORT_SYMBOL(get_mem_type); - -/* - * Adjust the PMD section entries according to the CPU in use. - */ -static void __init build_mem_type_table(void) -{ - pgprot_user = __pgprot(PTE_PRESENT | PTE_YOUNG | PTE_CACHEABLE); - pgprot_kernel = __pgprot(PTE_PRESENT | PTE_YOUNG | - PTE_DIRTY | PTE_READ | PTE_WRITE | - PTE_EXEC | PTE_CACHEABLE); -} - -#define vectors_base() (vectors_high() ? 0xffff0000 : 0) - -static pte_t * __init early_pte_alloc(pmd_t *pmd, unsigned long addr, - unsigned long prot) -{ - if (pmd_none(*pmd)) { - size_t size = PTRS_PER_PTE * sizeof(pte_t); - pte_t *pte = memblock_alloc(size, size); - - if (!pte) - panic("%s: Failed to allocate %zu bytes align=%zx\n", - __func__, size, size); - - __pmd_populate(pmd, __pa(pte) | prot); - } - BUG_ON(pmd_bad(*pmd)); - return pte_offset_kernel(pmd, addr); -} - -static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr, - unsigned long end, unsigned long pfn, - const struct mem_type *type) -{ - pte_t *pte = early_pte_alloc(pmd, addr, type->prot_l1); - do { - set_pte(pte, pfn_pte(pfn, __pgprot(type->prot_pte))); - pfn++; - } while (pte++, addr += PAGE_SIZE, addr != end); -} - -static void __init alloc_init_section(pgd_t *pgd, unsigned long addr, - unsigned long end, unsigned long phys, - const struct mem_type *type) -{ - pmd_t *pmd = pmd_offset((pud_t *)pgd, addr); - - /* - * Try a section mapping - end, addr and phys must all be aligned - * to a section boundary. - */ - if (((addr | end | phys) & ~SECTION_MASK) == 0) { - pmd_t *p = pmd; - - do { - set_pmd(pmd, __pmd(phys | type->prot_sect)); - phys += SECTION_SIZE; - } while (pmd++, addr += SECTION_SIZE, addr != end); - - flush_pmd_entry(p); - } else { - /* - * No need to loop; pte's aren't interested in the - * individual L1 entries. - */ - alloc_init_pte(pmd, addr, end, __phys_to_pfn(phys), type); - } -} - -/* - * Create the page directory entries and any necessary - * page tables for the mapping specified by `md'. We - * are able to cope here with varying sizes and address - * offsets, and we take full advantage of sections. - */ -static void __init create_mapping(struct map_desc *md) -{ - unsigned long phys, addr, length, end; - const struct mem_type *type; - pgd_t *pgd; - - if (md->virtual != vectors_base() && md->virtual < TASK_SIZE) { - printk(KERN_WARNING "BUG: not creating mapping for " - "0x%08llx at 0x%08lx in user region\n", - __pfn_to_phys((u64)md->pfn), md->virtual); - return; - } - - if ((md->type == MT_DEVICE || md->type == MT_ROM) && - md->virtual >= PAGE_OFFSET && md->virtual < VMALLOC_END) { - printk(KERN_WARNING "BUG: mapping for 0x%08llx at 0x%08lx " - "overlaps vmalloc space\n", - __pfn_to_phys((u64)md->pfn), md->virtual); - } - - type = &mem_types[md->type]; - - addr = md->virtual & PAGE_MASK; - phys = (unsigned long)__pfn_to_phys(md->pfn); - length = PAGE_ALIGN(md->length + (md->virtual & ~PAGE_MASK)); - - if (type->prot_l1 == 0 && ((addr | phys | length) & ~SECTION_MASK)) { - printk(KERN_WARNING "BUG: map for 0x%08lx at 0x%08lx can not " - "be mapped using pages, ignoring.\n", - __pfn_to_phys(md->pfn), addr); - return; - } - - pgd = pgd_offset_k(addr); - end = addr + length; - do { - unsigned long next = pgd_addr_end(addr, end); - - alloc_init_section(pgd, addr, next, phys, type); - - phys += next - addr; - addr = next; - } while (pgd++, addr != end); -} - -static void * __initdata vmalloc_min = (void *)(VMALLOC_END - SZ_128M); - -/* - * vmalloc=size forces the vmalloc area to be exactly 'size' - * bytes. This can be used to increase (or decrease) the vmalloc - * area - the default is 128m. - */ -static int __init early_vmalloc(char *arg) -{ - unsigned long vmalloc_reserve = memparse(arg, NULL); - - if (vmalloc_reserve < SZ_16M) { - vmalloc_reserve = SZ_16M; - printk(KERN_WARNING - "vmalloc area too small, limiting to %luMB\n", - vmalloc_reserve >> 20); - } - - if (vmalloc_reserve > VMALLOC_END - (PAGE_OFFSET + SZ_32M)) { - vmalloc_reserve = VMALLOC_END - (PAGE_OFFSET + SZ_32M); - printk(KERN_WARNING - "vmalloc area is too big, limiting to %luMB\n", - vmalloc_reserve >> 20); - } - - vmalloc_min = (void *)(VMALLOC_END - vmalloc_reserve); - return 0; -} -early_param("vmalloc", early_vmalloc); - -static phys_addr_t lowmem_limit __initdata = SZ_1G; - -static void __init sanity_check_meminfo(void) -{ - int i, j; - - lowmem_limit = __pa(vmalloc_min - 1) + 1; - memblock_set_current_limit(lowmem_limit); - - for (i = 0, j = 0; i < meminfo.nr_banks; i++) { - struct membank *bank = &meminfo.bank[j]; - *bank = meminfo.bank[i]; - j++; - } - meminfo.nr_banks = j; -} - -static inline void prepare_page_table(void) -{ - unsigned long addr; - phys_addr_t end; - - /* - * Clear out all the mappings below the kernel image. - */ - for (addr = 0; addr < MODULES_VADDR; addr += PGDIR_SIZE) - pmd_clear(pmd_off_k(addr)); - - for ( ; addr < PAGE_OFFSET; addr += PGDIR_SIZE) - pmd_clear(pmd_off_k(addr)); - - /* - * Find the end of the first block of lowmem. - */ - end = memblock.memory.regions[0].base + memblock.memory.regions[0].size; - if (end >= lowmem_limit) - end = lowmem_limit; - - /* - * Clear out all the kernel space mappings, except for the first - * memory bank, up to the end of the vmalloc region. - */ - for (addr = __phys_to_virt(end); - addr < VMALLOC_END; addr += PGDIR_SIZE) - pmd_clear(pmd_off_k(addr)); -} - -/* - * Reserve the special regions of memory - */ -void __init uc32_mm_memblock_reserve(void) -{ - /* - * Reserve the page tables. These are already in use, - * and can only be in node 0. - */ - memblock_reserve(__pa(swapper_pg_dir), PTRS_PER_PGD * sizeof(pgd_t)); -} - -/* - * Set up device the mappings. Since we clear out the page tables for all - * mappings above VMALLOC_END, we will remove any debug device mappings. - * This means you have to be careful how you debug this function, or any - * called function. This means you can't use any function or debugging - * method which may touch any device, otherwise the kernel _will_ crash. - */ -static void __init devicemaps_init(void) -{ - struct map_desc map; - unsigned long addr; - void *vectors; - - /* - * Allocate the vector page early. - */ - vectors = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!vectors) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); - - for (addr = VMALLOC_END; addr; addr += PGDIR_SIZE) - pmd_clear(pmd_off_k(addr)); - - /* - * Create a mapping for the machine vectors at the high-vectors - * location (0xffff0000). If we aren't using high-vectors, also - * create a mapping at the low-vectors virtual address. - */ - map.pfn = __phys_to_pfn(virt_to_phys(vectors)); - map.virtual = VECTORS_BASE; - map.length = PAGE_SIZE; - map.type = MT_HIGH_VECTORS; - create_mapping(&map); - - /* - * Create a mapping for the kuser page at the special - * location (0xbfff0000) to the same vectors location. - */ - map.pfn = __phys_to_pfn(virt_to_phys(vectors)); - map.virtual = KUSER_VECPAGE_BASE; - map.length = PAGE_SIZE; - map.type = MT_KUSER; - create_mapping(&map); - - /* - * Finally flush the caches and tlb to ensure that we're in a - * consistent state wrt the writebuffer. This also ensures that - * any write-allocated cache lines in the vector page are written - * back. After this point, we can start to touch devices again. - */ - local_flush_tlb_all(); - flush_cache_all(); -} - -static void __init map_lowmem(void) -{ - struct memblock_region *reg; - - /* Map all the lowmem memory banks. */ - for_each_memblock(memory, reg) { - phys_addr_t start = reg->base; - phys_addr_t end = start + reg->size; - struct map_desc map; - - if (end > lowmem_limit) - end = lowmem_limit; - if (start >= end) - break; - - map.pfn = __phys_to_pfn(start); - map.virtual = __phys_to_virt(start); - map.length = end - start; - map.type = MT_MEMORY; - - create_mapping(&map); - } -} - -/* - * paging_init() sets up the page tables, initialises the zone memory - * maps, and sets up the zero page, bad page and bad page tables. - */ -void __init paging_init(void) -{ - void *zero_page; - - build_mem_type_table(); - sanity_check_meminfo(); - prepare_page_table(); - map_lowmem(); - devicemaps_init(); - - top_pmd = pmd_off_k(0xffff0000); - - /* allocate the zero page. */ - zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!zero_page) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); - - bootmem_init(); - - empty_zero_page = virt_to_page(zero_page); - __flush_dcache_page(NULL, empty_zero_page); -} - -/* - * In order to soft-boot, we need to insert a 1:1 mapping in place of - * the user-mode pages. This will then ensure that we have predictable - * results when turning the mmu off - */ -void setup_mm_for_reboot(void) -{ - unsigned long base_pmdval; - pgd_t *pgd; - int i; - - /* - * We need to access to user-mode page tables here. For kernel threads - * we don't have any user-mode mappings so we use the context that we - * "borrowed". - */ - pgd = current->active_mm->pgd; - - base_pmdval = PMD_SECT_WRITE | PMD_SECT_READ | PMD_TYPE_SECT; - - for (i = 0; i < FIRST_USER_PGD_NR + USER_PTRS_PER_PGD; i++, pgd++) { - unsigned long pmdval = (i << PGDIR_SHIFT) | base_pmdval; - pmd_t *pmd; - - pmd = pmd_off(pgd, i << PGDIR_SHIFT); - set_pmd(pmd, __pmd(pmdval)); - flush_pmd_entry(pmd); - } - - local_flush_tlb_all(); -} - -/* - * Take care of architecture specific things when placing a new PTE into - * a page table, or changing an existing PTE. Basically, there are two - * things that we need to take care of: - * - * 1. If PG_dcache_clean is not set for the page, we need to ensure - * that any cache entries for the kernels virtual memory - * range are written back to the page. - * 2. If we have multiple shared mappings of the same space in - * an object, we need to deal with the cache aliasing issues. - * - * Note that the pte lock will be held. - */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep) -{ - unsigned long pfn = pte_pfn(*ptep); - struct address_space *mapping; - struct page *page; - - if (!pfn_valid(pfn)) - return; - - /* - * The zero page is never written to, so never has any dirty - * cache lines, and therefore never needs to be flushed. - */ - page = pfn_to_page(pfn); - if (page == ZERO_PAGE(0)) - return; - - mapping = page_mapping_file(page); - if (!test_and_set_bit(PG_dcache_clean, &page->flags)) - __flush_dcache_page(mapping, page); - if (mapping) - if (vma->vm_flags & VM_EXEC) - __flush_icache_all(); -} diff --git a/arch/unicore32/mm/pgd.c b/arch/unicore32/mm/pgd.c deleted file mode 100644 index f01c73e04836..000000000000 --- a/arch/unicore32/mm/pgd.c +++ /dev/null @@ -1,102 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/mm/pgd.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include - -#include -#include -#include - -#include "mm.h" - -#define FIRST_KERNEL_PGD_NR (FIRST_USER_PGD_NR + USER_PTRS_PER_PGD) - -/* - * need to get a 4k page for level 1 - */ -pgd_t *get_pgd_slow(struct mm_struct *mm) -{ - pgd_t *new_pgd, *init_pgd; - pmd_t *new_pmd, *init_pmd; - pte_t *new_pte, *init_pte; - - new_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, 0); - if (!new_pgd) - goto no_pgd; - - memset(new_pgd, 0, FIRST_KERNEL_PGD_NR * sizeof(pgd_t)); - - /* - * Copy over the kernel and IO PGD entries - */ - init_pgd = pgd_offset_k(0); - memcpy(new_pgd + FIRST_KERNEL_PGD_NR, init_pgd + FIRST_KERNEL_PGD_NR, - (PTRS_PER_PGD - FIRST_KERNEL_PGD_NR) * sizeof(pgd_t)); - - clean_dcache_area(new_pgd, PTRS_PER_PGD * sizeof(pgd_t)); - - if (!vectors_high()) { - /* - * On UniCore, first page must always be allocated since it - * contains the machine vectors. - */ - new_pmd = pmd_alloc(mm, (pud_t *)new_pgd, 0); - if (!new_pmd) - goto no_pmd; - - new_pte = pte_alloc_map(mm, new_pmd, 0); - if (!new_pte) - goto no_pte; - - init_pmd = pmd_offset((pud_t *)init_pgd, 0); - init_pte = pte_offset_map(init_pmd, 0); - set_pte(new_pte, *init_pte); - pte_unmap(init_pte); - pte_unmap(new_pte); - } - - return new_pgd; - -no_pte: - pmd_free(mm, new_pmd); - mm_dec_nr_pmds(mm); -no_pmd: - free_pages((unsigned long)new_pgd, 0); -no_pgd: - return NULL; -} - -void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd) -{ - pmd_t *pmd; - pgtable_t pte; - - if (!pgd) - return; - - /* pgd is always present and good */ - pmd = pmd_off(pgd, 0); - if (pmd_none(*pmd)) - goto free; - if (pmd_bad(*pmd)) { - pmd_ERROR(*pmd); - pmd_clear(pmd); - goto free; - } - - pte = pmd_pgtable(*pmd); - pmd_clear(pmd); - pte_free(mm, pte); - mm_dec_nr_ptes(mm); - pmd_free(mm, pmd); - mm_dec_nr_pmds(mm); -free: - free_pages((unsigned long) pgd, 0); -} diff --git a/arch/unicore32/mm/proc-macros.S b/arch/unicore32/mm/proc-macros.S deleted file mode 100644 index 3b0ae7d5bd80..000000000000 --- a/arch/unicore32/mm/proc-macros.S +++ /dev/null @@ -1,142 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/mm/proc-macros.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * We need constants.h for: - * VMA_VM_MM - * VMA_VM_FLAGS - * VM_EXEC - */ -#include -#include -#include - -/* - * the cache line sizes of the I and D cache are the same - */ -#define CACHE_LINESIZE 32 - -/* - * This is the maximum size of an area which will be invalidated - * using the single invalidate entry instructions. Anything larger - * than this, and we go for the whole cache. - * - * This value should be chosen such that we choose the cheapest - * alternative. - */ -#ifdef CONFIG_CPU_UCV2 -#define MAX_AREA_SIZE 0x800 /* 64 cache line */ -#endif - -/* - * vma_vm_mm - get mm pointer from vma pointer (vma->vm_mm) - */ - .macro vma_vm_mm, rd, rn - ldw \rd, [\rn+], #VMA_VM_MM - .endm - -/* - * vma_vm_flags - get vma->vm_flags - */ - .macro vma_vm_flags, rd, rn - ldw \rd, [\rn+], #VMA_VM_FLAGS - .endm - - .macro tsk_mm, rd, rn - ldw \rd, [\rn+], #TI_TASK - ldw \rd, [\rd+], #TSK_ACTIVE_MM - .endm - -/* - * act_mm - get current->active_mm - */ - .macro act_mm, rd - andn \rd, sp, #8128 - andn \rd, \rd, #63 - ldw \rd, [\rd+], #TI_TASK - ldw \rd, [\rd+], #TSK_ACTIVE_MM - .endm - -/* - * mmid - get context id from mm pointer (mm->context.id) - */ - .macro mmid, rd, rn - ldw \rd, [\rn+], #MM_CONTEXT_ID - .endm - -/* - * mask_asid - mask the ASID from the context ID - */ - .macro asid, rd, rn - and \rd, \rn, #255 - .endm - - .macro crval, clear, mmuset, ucset - .word \clear - .word \mmuset - .endm - -#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE -/* - * va2pa va, pa, tbl, msk, off, err - * This macro is used to translate virtual address to its physical address. - * - * va: virtual address - * pa: physical address, result is stored in this register - * tbl, msk, off: temp registers, will be destroyed - * err: jump to error label if the physical address not exist - * NOTE: all regs must be different - */ - .macro va2pa, va, pa, tbl, msk, off, err=990f - movc \pa, p0.c2, #0 - mov \off, \va >> #22 @ off <- index of 1st page table - adr \tbl, 910f @ tbl <- table of 1st page table -900: @ ---- handle 1, 2 page table - add \pa, \pa, #PAGE_OFFSET @ pa <- virt addr of page table - ldw \pa, [\pa+], \off << #2 @ pa <- the content of pt - cand.a \pa, #4 @ test exist bit - beq \err @ if not exist - and \off, \pa, #3 @ off <- the last 2 bits - add \tbl, \tbl, \off << #3 @ cmove table pointer - ldw \msk, [\tbl+], #0 @ get the mask - ldw pc, [\tbl+], #4 -930: @ ---- handle 2nd page table - and \pa, \pa, \msk @ pa <- phys addr of 2nd pt - mov \off, \va << #10 - cntlo \tbl, \msk @ use tbl as temp reg - mov \off, \off >> \tbl - mov \off, \off >> #2 @ off <- index of 2nd pt - adr \tbl, 920f @ tbl <- table of 2nd pt - b 900b -910: @ 1st level page table - .word 0xfffff000, 930b @ second level page table - .word 0xfffffc00, 930b @ second level large page table - .word 0x00000000, \err @ invalid - .word 0xffc00000, 980f @ super page - -920: @ 2nd level page table - .word 0xfffff000, 980f @ page - .word 0xffffc000, 980f @ middle page - .word 0xffff0000, 980f @ large page - .word 0x00000000, \err @ invalid -980: - andn \tbl, \va, \msk - and \pa, \pa, \msk - or \pa, \pa, \tbl -990: - .endm -#endif - - .macro dcacheline_flush, addr, t1, t2 - mov \t1, \addr << #20 - ldw \t2, =_stext @ _stext must ALIGN(4096) - add \t2, \t2, \t1 >> #20 - ldw \t1, [\t2+], #0x0000 - ldw \t1, [\t2+], #0x1000 - ldw \t1, [\t2+], #0x2000 - ldw \t1, [\t2+], #0x3000 - .endm diff --git a/arch/unicore32/mm/proc-syms.c b/arch/unicore32/mm/proc-syms.c deleted file mode 100644 index 6c081616fc3c..000000000000 --- a/arch/unicore32/mm/proc-syms.c +++ /dev/null @@ -1,19 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/unicore32/mm/proc-syms.c - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include - -#include -#include -#include - -EXPORT_SYMBOL(cpu_dcache_clean_area); -EXPORT_SYMBOL(cpu_set_pte); - -EXPORT_SYMBOL(__cpuc_coherent_kern_range); diff --git a/arch/unicore32/mm/proc-ucv2.S b/arch/unicore32/mm/proc-ucv2.S deleted file mode 100644 index 18f8c4fb21a0..000000000000 --- a/arch/unicore32/mm/proc-ucv2.S +++ /dev/null @@ -1,131 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/mm/proc-ucv2.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include -#include - -#include "proc-macros.S" - -ENTRY(cpu_proc_fin) - stm.w (lr), [sp-] - mov ip, #PSR_R_BIT | PSR_I_BIT | PRIV_MODE - mov.a asr, ip - b.l __cpuc_flush_kern_all - ldm.w (pc), [sp]+ - -/* - * cpu_reset(loc) - * - * Perform a soft reset of the system. Put the CPU into the - * same state as it would be if it had been reset, and branch - * to what would be the reset vector. - * - * - loc - location to jump to for soft reset - */ - .align 5 -ENTRY(cpu_reset) - mov ip, #0 - movc p0.c5, ip, #28 @ Cache invalidate all - nop8 - - movc p0.c6, ip, #6 @ TLB invalidate all - nop8 - - movc ip, p0.c1, #0 @ ctrl register - or ip, ip, #0x2000 @ vector base address - andn ip, ip, #0x000f @ ............idam - movc p0.c1, ip, #0 @ disable caches and mmu - nop - mov pc, r0 @ jump to loc - nop8 - -/* - * cpu_do_idle() - * - * Idle the processor (eg, wait for interrupt). - * - * IRQs are already disabled. - */ -ENTRY(cpu_do_idle) - mov r0, #0 @ PCI address - .rept 8 - ldw r1, [r0] - .endr - mov pc, lr - -ENTRY(cpu_dcache_clean_area) -#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE - csub.a r1, #MAX_AREA_SIZE - bsg 101f - mov r9, #PAGE_SZ - sub r9, r9, #1 @ PAGE_MASK -1: va2pa r0, r10, r11, r12, r13 @ r10 is PA - b 3f -2: cand.a r0, r9 - beq 1b -3: movc p0.c5, r10, #11 @ clean D entry - nop8 - add r0, r0, #CACHE_LINESIZE - add r10, r10, #CACHE_LINESIZE - sub.a r1, r1, #CACHE_LINESIZE - bua 2b - mov pc, lr -#endif -101: mov ip, #0 - movc p0.c5, ip, #10 @ Dcache clean all - nop8 - - mov pc, lr - -/* - * cpu_do_switch_mm(pgd_phys) - * - * Set the translation table base pointer to be pgd_phys - * - * - pgd_phys - physical address of new pgd - * - * It is assumed that: - * - we are not using split page tables - */ - .align 5 -ENTRY(cpu_do_switch_mm) - movc p0.c2, r0, #0 @ update page table ptr - nop8 - - movc p0.c6, ip, #6 @ TLB invalidate all - nop8 - - mov pc, lr - -/* - * cpu_set_pte(ptep, pte) - * - * Set a level 2 translation table entry. - * - * - ptep - pointer to level 2 translation table entry - * - pte - PTE value to store - */ - .align 5 -ENTRY(cpu_set_pte) - stw r1, [r0] -#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE - sub r2, r0, #PAGE_OFFSET - movc p0.c5, r2, #11 @ Dcache clean line - nop8 -#else - mov ip, #0 - movc p0.c5, ip, #10 @ Dcache clean all - nop8 - @dcacheline_flush r0, r2, ip -#endif - mov pc, lr - diff --git a/arch/unicore32/mm/tlb-ucv2.S b/arch/unicore32/mm/tlb-ucv2.S deleted file mode 100644 index 0ce9c6b6f1db..000000000000 --- a/arch/unicore32/mm/tlb-ucv2.S +++ /dev/null @@ -1,86 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/arch/unicore32/mm/tlb-ucv2.S - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - */ -#include -#include -#include -#include -#include -#include "proc-macros.S" - -/* - * __cpu_flush_user_tlb_range(start, end, vma) - * - * Invalidate a range of TLB entries in the specified address space. - * - * - start - start address (may not be aligned) - * - end - end address (exclusive, may not be aligned) - * - vma - vma_struct describing address range - */ -ENTRY(__cpu_flush_user_tlb_range) -#ifndef CONFIG_CPU_TLB_SINGLE_ENTRY_DISABLE - mov r0, r0 >> #PAGE_SHIFT @ align address - mov r0, r0 << #PAGE_SHIFT - vma_vm_flags r2, r2 @ get vma->vm_flags -1: - movc p0.c6, r0, #3 - nop8 - - cand.a r2, #VM_EXEC @ Executable area ? - beq 2f - - movc p0.c6, r0, #5 - nop8 -2: - add r0, r0, #PAGE_SZ - csub.a r0, r1 - beb 1b -#else - movc p0.c6, r0, #2 - nop8 - - cand.a r2, #VM_EXEC @ Executable area ? - beq 2f - - movc p0.c6, r0, #4 - nop8 -2: -#endif - mov pc, lr - -/* - * __cpu_flush_kern_tlb_range(start,end) - * - * Invalidate a range of kernel TLB entries - * - * - start - start address (may not be aligned) - * - end - end address (exclusive, may not be aligned) - */ -ENTRY(__cpu_flush_kern_tlb_range) -#ifndef CONFIG_CPU_TLB_SINGLE_ENTRY_DISABLE - mov r0, r0 >> #PAGE_SHIFT @ align address - mov r0, r0 << #PAGE_SHIFT -1: - movc p0.c6, r0, #3 - nop8 - - movc p0.c6, r0, #5 - nop8 - - add r0, r0, #PAGE_SZ - csub.a r0, r1 - beb 1b -#else - movc p0.c6, r0, #2 - nop8 - - movc p0.c6, r0, #4 - nop8 -#endif - mov pc, lr - diff --git a/kernel/reboot.c b/kernel/reboot.c index 491f1347bf43..e7b78d5ae1ab 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -26,7 +26,7 @@ int C_A_D = 1; struct pid *cad_pid; EXPORT_SYMBOL(cad_pid); -#if defined(CONFIG_ARM) || defined(CONFIG_UNICORE32) +#if defined(CONFIG_ARM) #define DEFAULT_REBOOT_MODE = REBOOT_HARD #else #define DEFAULT_REBOOT_MODE -- cgit v1.2.3 From bba1dc0b55ac462d24ed1228ad49800c238cd6d7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 29 Jun 2020 21:33:39 -0700 Subject: bpf: Remove redundant synchronize_rcu. bpf_free_used_maps() or close(map_fd) will trigger map_free callback. bpf_free_used_maps() is called after bpf prog is no longer executing: bpf_prog_put->call_rcu->bpf_prog_free->bpf_free_used_maps. Hence there is no need to call synchronize_rcu() to protect map elements. Note that hash_of_maps and array_of_maps update/delete inner maps via sys_bpf() that calls maybe_wait_bpf_programs() and synchronize_rcu(). Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Paul E. McKenney Link: https://lore.kernel.org/bpf/20200630043343.53195-2-alexei.starovoitov@gmail.com --- kernel/bpf/arraymap.c | 9 --------- kernel/bpf/hashtab.c | 8 +++----- kernel/bpf/lpm_trie.c | 5 ----- kernel/bpf/queue_stack_maps.c | 7 ------- kernel/bpf/reuseport_array.c | 2 -- kernel/bpf/ringbuf.c | 7 ------- kernel/bpf/stackmap.c | 3 --- 7 files changed, 3 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index ec5cd11032aa..c66e8273fccd 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -386,13 +386,6 @@ static void array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); - /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, - * so the programs (can be more than one that used this map) were - * disconnected from events. Wait for outstanding programs to complete - * and free the array - */ - synchronize_rcu(); - if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) bpf_array_free_percpu(array); @@ -546,8 +539,6 @@ static void fd_array_map_free(struct bpf_map *map) struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - synchronize_rcu(); - /* make sure it's empty */ for (i = 0; i < array->map.max_entries; i++) BUG_ON(array->ptrs[i] != NULL); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index acd06081d81d..d4378d7d442b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1290,12 +1290,10 @@ static void htab_map_free(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, - * so the programs (can be more than one that used this map) were - * disconnected from events. Wait for outstanding critical sections in - * these programs to complete + /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback. + * bpf_free_used_maps() is called after bpf prog is no longer executing. + * There is no need to synchronize_rcu() here to protect map elements. */ - synchronize_rcu(); /* some of free_htab_elem() callbacks for elements of this map may * not have executed. Wait for them. diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 1abd4e3f906d..44474bf3ab7a 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -589,11 +589,6 @@ static void trie_free(struct bpf_map *map) struct lpm_trie_node __rcu **slot; struct lpm_trie_node *node; - /* Wait for outstanding programs to complete - * update/lookup/delete/get_next_key and free the trie. - */ - synchronize_rcu(); - /* Always start at the root and walk down to a node that has no * children. Then free that node, nullify its reference in the parent * and start over. diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 80c66a6d7c54..44184f82916a 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -101,13 +101,6 @@ static void queue_stack_map_free(struct bpf_map *map) { struct bpf_queue_stack *qs = bpf_queue_stack(map); - /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, - * so the programs (can be more than one that used this map) were - * disconnected from events. Wait for outstanding critical sections in - * these programs to complete - */ - synchronize_rcu(); - bpf_map_area_free(qs); } diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index a09922f656e4..3625c4fcc65c 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -96,8 +96,6 @@ static void reuseport_array_free(struct bpf_map *map) struct sock *sk; u32 i; - synchronize_rcu(); - /* * ops->map_*_elem() will not be able to access this * array now. Hence, this function only races with diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index dbf37aff4827..13a8d3967e07 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -215,13 +215,6 @@ static void ringbuf_map_free(struct bpf_map *map) { struct bpf_ringbuf_map *rb_map; - /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, - * so the programs (can be more than one that used this map) were - * disconnected from events. Wait for outstanding critical sections in - * these programs to complete - */ - synchronize_rcu(); - rb_map = container_of(map, struct bpf_ringbuf_map, map); bpf_ringbuf_free(rb_map->rb); kfree(rb_map); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 27dc9b1b08a5..071f98d0f7c6 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -604,9 +604,6 @@ static void stack_map_free(struct bpf_map *map) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); - /* wait for bpf programs to complete before freeing stack map */ - synchronize_rcu(); - bpf_map_area_free(smap->elems); pcpu_freelist_destroy(&smap->freelist); bpf_map_area_free(smap); -- cgit v1.2.3 From d141b8bc5773cbbaf5b8530f08f94fc10fff9e8c Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 29 Jun 2020 23:28:43 -0700 Subject: perf: Expose get/put_callchain_entry() Sanitize and expose get/put_callchain_entry(). This would be used by bpf stack map. Suggested-by: Peter Zijlstra Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200630062846.664389-2-songliubraving@fb.com --- include/linux/perf_event.h | 2 ++ kernel/events/callchain.c | 13 ++++++------- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b4bb32082342..00ab5efa3833 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1244,6 +1244,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); +extern struct perf_callchain_entry *get_callchain_entry(int *rctx); +extern void put_callchain_entry(int rctx); extern int sysctl_perf_event_max_stack; extern int sysctl_perf_event_max_contexts_per_stack; diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 334d48b16c36..c6ce894e4ce9 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -149,7 +149,7 @@ void put_callchain_buffers(void) } } -static struct perf_callchain_entry *get_callchain_entry(int *rctx) +struct perf_callchain_entry *get_callchain_entry(int *rctx) { int cpu; struct callchain_cpus_entries *entries; @@ -159,8 +159,10 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) return NULL; entries = rcu_dereference(callchain_cpus_entries); - if (!entries) + if (!entries) { + put_recursion_context(this_cpu_ptr(callchain_recursion), *rctx); return NULL; + } cpu = smp_processor_id(); @@ -168,7 +170,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) (*rctx * perf_callchain_entry__sizeof())); } -static void +void put_callchain_entry(int rctx) { put_recursion_context(this_cpu_ptr(callchain_recursion), rctx); @@ -183,11 +185,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, int rctx; entry = get_callchain_entry(&rctx); - if (rctx == -1) - return NULL; - if (!entry) - goto exit_put; + return NULL; ctx.entry = entry; ctx.max_stack = max_stack; -- cgit v1.2.3 From fa28dcb82a38f8e3993b0fae9106b1a80b59e4f0 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 29 Jun 2020 23:28:44 -0700 Subject: bpf: Introduce helper bpf_get_task_stack() Introduce helper bpf_get_task_stack(), which dumps stack trace of given task. This is different to bpf_get_stack(), which gets stack track of current task. One potential use case of bpf_get_task_stack() is to call it from bpf_iter__task and dump all /proc//stack to a seq_file. bpf_get_task_stack() uses stack_trace_save_tsk() instead of get_perf_callchain() for kernel stack. The benefit of this choice is that stack_trace_save_tsk() doesn't require changes in arch/. The downside of using stack_trace_save_tsk() is that stack_trace_save_tsk() dumps the stack trace to unsigned long array. For 32-bit systems, we need to translate it to u64 array. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200630062846.664389-3-songliubraving@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 37 +++++++++++++++++++- kernel/bpf/stackmap.c | 77 +++++++++++++++++++++++++++++++++++++++--- kernel/bpf/verifier.c | 4 ++- kernel/trace/bpf_trace.c | 2 ++ scripts/bpf_helpers_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 37 +++++++++++++++++++- 7 files changed, 153 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3d2ade703a35..0cd7f6884c5c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1627,6 +1627,7 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; +extern const struct bpf_func_proto bpf_get_task_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0cb8ec948816..da9bf35a26f8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3285,6 +3285,39 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *task*, which is a valid + * pointer to struct task_struct. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_task_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * Return + * A non-negative value equal to or less than *size* on success, + * or a negative error in case of failure. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3427,7 +3460,9 @@ union bpf_attr { FN(skc_to_tcp_sock), \ FN(skc_to_tcp_timewait_sock), \ FN(skc_to_tcp_request_sock), \ - FN(skc_to_udp6_sock), + FN(skc_to_udp6_sock), \ + FN(get_task_stack), \ + /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 071f98d0f7c6..5ad72ab2276b 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -348,6 +348,40 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } } +static struct perf_callchain_entry * +get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) +{ + struct perf_callchain_entry *entry; + int rctx; + + entry = get_callchain_entry(&rctx); + + if (!entry) + return NULL; + + entry->nr = init_nr + + stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr), + sysctl_perf_event_max_stack - init_nr, 0); + + /* stack_trace_save_tsk() works on unsigned long array, while + * perf_callchain_entry uses u64 array. For 32-bit systems, it is + * necessary to fix this mismatch. + */ + if (__BITS_PER_LONG != 64) { + unsigned long *from = (unsigned long *) entry->ip; + u64 *to = entry->ip; + int i; + + /* copy data from the end to avoid using extra buffer */ + for (i = entry->nr - 1; i >= (int)init_nr; i--) + to[i] = (u64)(from[i]); + } + + put_callchain_entry(rctx); + + return entry; +} + BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { @@ -448,8 +482,8 @@ const struct bpf_func_proto bpf_get_stackid_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, - u64, flags) +static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, + void *buf, u32 size, u64 flags) { u32 init_nr, trace_nr, copy_len, elem_size, num_elem; bool user_build_id = flags & BPF_F_USER_BUILD_ID; @@ -471,13 +505,22 @@ BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, if (unlikely(size % elem_size)) goto clear; + /* cannot get valid user stack for task without user_mode regs */ + if (task && user && !user_mode(regs)) + goto err_fault; + num_elem = size / elem_size; if (sysctl_perf_event_max_stack < num_elem) init_nr = 0; else init_nr = sysctl_perf_event_max_stack - num_elem; - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, false, false); + + if (kernel && task) + trace = get_callchain_entry_for_task(task, init_nr); + else + trace = get_perf_callchain(regs, init_nr, kernel, user, + sysctl_perf_event_max_stack, + false, false); if (unlikely(!trace)) goto err_fault; @@ -505,6 +548,12 @@ clear: return err; } +BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, + u64, flags) +{ + return __bpf_get_stack(regs, NULL, buf, size, flags); +} + const struct bpf_func_proto bpf_get_stack_proto = { .func = bpf_get_stack, .gpl_only = true, @@ -515,6 +564,26 @@ const struct bpf_func_proto bpf_get_stack_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, + u32, size, u64, flags) +{ + struct pt_regs *regs = task_pt_regs(task); + + return __bpf_get_stack(regs, task, buf, size, flags); +} + +static int bpf_get_task_stack_btf_ids[5]; +const struct bpf_func_proto bpf_get_task_stack_proto = { + .func = bpf_get_task_stack, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + .btf_id = bpf_get_task_stack_btf_ids, +}; + /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7de98906ddf4..b608185e1ffd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4864,7 +4864,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (err) return err; - if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) { + if ((func_id == BPF_FUNC_get_stack || + func_id == BPF_FUNC_get_task_stack) && + !env->prog->has_callchain_buf) { const char *err_str; #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 5d59dda5f661..977ba3b6f6c6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1137,6 +1137,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_ringbuf_query_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; + case BPF_FUNC_get_task_stack: + return &bpf_get_task_stack_proto; default: return NULL; } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 6bab40ff442e..6843376733df 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -426,6 +426,7 @@ class PrinterHelpers(Printer): 'struct tcp_timewait_sock', 'struct tcp_request_sock', 'struct udp6_sock', + 'struct task_struct', 'struct __sk_buff', 'struct sk_msg_md', @@ -468,6 +469,7 @@ class PrinterHelpers(Printer): 'struct tcp_timewait_sock', 'struct tcp_request_sock', 'struct udp6_sock', + 'struct task_struct', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0cb8ec948816..da9bf35a26f8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3285,6 +3285,39 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *task*, which is a valid + * pointer to struct task_struct. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_task_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * Return + * A non-negative value equal to or less than *size* on success, + * or a negative error in case of failure. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3427,7 +3460,9 @@ union bpf_attr { FN(skc_to_tcp_sock), \ FN(skc_to_tcp_timewait_sock), \ FN(skc_to_tcp_request_sock), \ - FN(skc_to_udp6_sock), + FN(skc_to_udp6_sock), \ + FN(get_task_stack), \ + /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 2df6bb5493f83c7e818f66c384ea337c1b3da228 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 29 Jun 2020 23:28:45 -0700 Subject: bpf: Allow %pB in bpf_seq_printf() and bpf_trace_printk() This makes it easy to dump stack trace in text. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200630062846.664389-4-songliubraving@fb.com --- kernel/trace/bpf_trace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 977ba3b6f6c6..1d874d8e4384 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -376,7 +376,7 @@ static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, /* * Only limited trace_printk() conversion specifiers allowed: - * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pks %pus %s + * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pB %pks %pus %s */ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, u64, arg2, u64, arg3) @@ -420,6 +420,11 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, goto fmt_str; } + if (fmt[i + 1] == 'B') { + i++; + goto fmt_next; + } + /* disallow any further format extensions */ if (fmt[i + 1] != 0 && !isspace(fmt[i + 1]) && @@ -636,7 +641,8 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, if (fmt[i] == 'p') { if (fmt[i + 1] == 0 || fmt[i + 1] == 'K' || - fmt[i + 1] == 'x') { + fmt[i + 1] == 'x' || + fmt[i + 1] == 'B') { /* just kernel pointers */ params[fmt_cnt] = args[fmt_cnt]; fmt_cnt++; -- cgit v1.2.3 From 58fbc3c63275c6255a10ae151cc3882d9190c4e0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 30 Jun 2020 08:59:26 -0400 Subject: ring-buffer: Consolidate add_timestamp to remove some branches Reorganize a little the logic to handle adding the absolute time stamp, extended and forced time stamps, in such a way to remove a branch or two. This is just a micro optimization. Also add before and after time stamps to the rb_event_info structure to display those values in the rb_check_timestamps() code, if something were to go wrong. Suggested-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 139 ++++++++++++++++++++++++--------------------- 1 file changed, 73 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 888bc9177937..ce125cbe98a5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -416,6 +416,8 @@ struct rb_irq_work { struct rb_event_info { u64 ts; u64 delta; + u64 before; + u64 after; unsigned long length; struct buffer_page *tail_page; int add_timestamp; @@ -2619,6 +2621,33 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event); +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline bool sched_clock_stable(void) +{ + return true; +} +#endif + +static noinline void +rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info) +{ + u64 write_stamp; + + WARN_ONCE(info->delta > (1ULL << 59), + KERN_WARNING "Delta way too big! %llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s", + (unsigned long long)info->delta, + (unsigned long long)info->ts, + (unsigned long long)info->before, + (unsigned long long)info->after, + (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0), + sched_clock_stable() ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n" + "or add trace_clock=global to the kernel command line\n"); +} + /** * rb_update_event - update event type and data * @cpu_buffer: The per cpu buffer of the @event @@ -2646,6 +2675,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, bool abs = info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE); + rb_check_timestamp(cpu_buffer, info); event = rb_add_time_stamp(event, abs ? info->delta : delta, abs); length -= RB_LEN_TIME_EXTEND; delta = 0; @@ -2692,13 +2722,6 @@ static unsigned rb_calculate_event_length(unsigned length) return length; } -#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -static inline bool sched_clock_stable(void) -{ - return true; -} -#endif - static __always_inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) @@ -3092,24 +3115,6 @@ int ring_buffer_unlock_commit(struct trace_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); -static noinline void -rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, - struct rb_event_info *info) -{ - u64 write_stamp; - - WARN_ONCE(info->delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", - (unsigned long long)info->delta, - (unsigned long long)info->ts, - (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0), - sched_clock_stable() ? "" : - "If you just came from a suspend/resume,\n" - "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n" - "or add trace_clock=global to the kernel command line\n"); -} - static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) @@ -3117,7 +3122,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event; struct buffer_page *tail_page; unsigned long tail, write, w; - u64 before, after; bool a_ok; bool b_ok; @@ -3126,40 +3130,31 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK; barrier(); - b_ok = rb_time_read(&cpu_buffer->before_stamp, &before); - a_ok = rb_time_read(&cpu_buffer->write_stamp, &after); + b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); barrier(); info->ts = rb_time_stamp(cpu_buffer->buffer); - if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) { + if ((info->add_timestamp & RB_ADD_STAMP_ABSOLUTE)) { info->delta = info->ts; - info->add_timestamp = RB_ADD_STAMP_ABSOLUTE; } else { - info->delta = info->ts - after; - } - - if (likely(a_ok && b_ok)) { - if (unlikely(test_time_stamp(info->delta))) { - rb_check_timestamp(cpu_buffer, info); - info->add_timestamp |= RB_ADD_STAMP_EXTEND; + /* + * If interrupting an event time update, we may need an + * absolute timestamp. + * Don't bother if this is the start of a new page (w == 0). + */ + if (unlikely(!a_ok || !b_ok || (info->before != info->after && w))) { + info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND; + info->length += RB_LEN_TIME_EXTEND; + } else { + info->delta = info->ts - info->after; + if (unlikely(test_time_stamp(info->delta))) { + info->add_timestamp |= RB_ADD_STAMP_EXTEND; + info->length += RB_LEN_TIME_EXTEND; + } } } - /* - * If interrupting an event time update, we may need an absolute timestamp. - * Don't bother if this is the start of a new page (w == 0). - */ - if (unlikely(!a_ok || !b_ok || (before != after && w))) - info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND; - - /* - * If the time delta since the last event is too big to - * hold in the time field of the event, then we append a - * TIME EXTEND event ahead of the data event. - */ - if (unlikely(info->add_timestamp)) - info->length += RB_LEN_TIME_EXTEND; - /*B*/ rb_time_set(&cpu_buffer->before_stamp, info->ts); /*C*/ write = local_add_return(info->length, &tail_page->write); @@ -3173,10 +3168,11 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(write > BUF_PAGE_SIZE)) { if (tail != w) { /* before and after may now different, fix it up*/ - b_ok = rb_time_read(&cpu_buffer->before_stamp, &before); - a_ok = rb_time_read(&cpu_buffer->write_stamp, &after); - if (a_ok && b_ok && before != after) - (void)rb_time_cmpxchg(&cpu_buffer->before_stamp, before, after); + b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); + if (a_ok && b_ok && info->before != info->after) + (void)rb_time_cmpxchg(&cpu_buffer->before_stamp, + info->before, info->after); } return rb_move_tail(cpu_buffer, tail, info); } @@ -3193,7 +3189,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (likely(!(info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)))) /* This did not interrupt any time update */ - info->delta = info->ts - after; + info->delta = info->ts - info->after; else /* Just use full timestamp for inerrupting event */ info->delta = info->ts; @@ -3201,31 +3197,33 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(info->ts != save_before)) { /* SLOW PATH - Interrupted between C and E */ - a_ok = rb_time_read(&cpu_buffer->write_stamp, &after); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); RB_WARN_ON(cpu_buffer, !a_ok); /* Write stamp must only go forward */ - if (save_before > after) { + if (save_before > info->after) { /* * We do not care about the result, only that * it gets updated atomically. */ - (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, after, save_before); + (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, + info->after, save_before); } } } else { u64 ts; /* SLOW PATH - Interrupted between A and C */ - a_ok = rb_time_read(&cpu_buffer->write_stamp, &after); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); /* Was interrupted before here, write_stamp must be valid */ RB_WARN_ON(cpu_buffer, !a_ok); ts = rb_time_stamp(cpu_buffer->buffer); barrier(); /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && - after < ts) { + info->after < ts) { /* Nothing came after this event between C and E */ - info->delta = ts - after; - (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, after, info->ts); + info->delta = ts - info->after; + (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, + info->after, info->ts); info->ts = ts; } else { /* @@ -3277,6 +3275,7 @@ rb_reserve_next_event(struct trace_buffer *buffer, struct ring_buffer_event *event; struct rb_event_info info; int nr_loops = 0; + int add_ts_default; rb_start_commit(cpu_buffer); /* The commit page can not change after this */ @@ -3297,8 +3296,16 @@ rb_reserve_next_event(struct trace_buffer *buffer, #endif info.length = rb_calculate_event_length(length); + + if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) { + add_ts_default = RB_ADD_STAMP_ABSOLUTE; + info.length += RB_LEN_TIME_EXTEND; + } else { + add_ts_default = RB_ADD_STAMP_NONE; + } + again: - info.add_timestamp = RB_ADD_STAMP_NONE; + info.add_timestamp = add_ts_default; info.delta = 0; /* @@ -3316,7 +3323,7 @@ rb_reserve_next_event(struct trace_buffer *buffer, event = __rb_reserve_next(cpu_buffer, &info); if (unlikely(PTR_ERR(event) == -EAGAIN)) { - if (info.add_timestamp) + if (info.add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND)) info.length -= RB_LEN_TIME_EXTEND; goto again; } -- cgit v1.2.3 From 74e879373b377f15d4ecb45bf8316b77e8badc49 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 30 Jun 2020 12:47:56 -0400 Subject: ring-buffer: Move the add_timestamp into its own function Make a helper function rb_add_timestamp() that moves the adding of the extended time stamps into its own function. Also, remove the noinline and inline for the functions it calls, as recent benchmarks appear they do not make a difference (just let gcc decide). Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ce125cbe98a5..a30ca7ec2200 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2596,8 +2596,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, return NULL; } -/* Slow path, do not inline */ -static noinline struct ring_buffer_event * +/* Slow path */ +static struct ring_buffer_event * rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) { if (abs) @@ -2628,7 +2628,7 @@ static inline bool sched_clock_stable(void) } #endif -static noinline void +static void rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) { @@ -2648,6 +2648,21 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, "or add trace_clock=global to the kernel command line\n"); } +static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event **event, + struct rb_event_info *info, + u64 *delta, + unsigned int *length) +{ + bool abs = info->add_timestamp & + (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE); + + rb_check_timestamp(cpu_buffer, info); + *event = rb_add_time_stamp(*event, info->delta, abs); + *length -= RB_LEN_TIME_EXTEND; + *delta = 0; +} + /** * rb_update_event - update event type and data * @cpu_buffer: The per cpu buffer of the @event @@ -2671,15 +2686,8 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, * If we need to add a timestamp, then we * add it to the start of the reserved space. */ - if (unlikely(info->add_timestamp)) { - bool abs = info->add_timestamp & - (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE); - - rb_check_timestamp(cpu_buffer, info); - event = rb_add_time_stamp(event, abs ? info->delta : delta, abs); - length -= RB_LEN_TIME_EXTEND; - delta = 0; - } + if (unlikely(info->add_timestamp)) + rb_add_timestamp(cpu_buffer, &event, info, &delta, &length); event->time_delta = delta; length -= RB_EVNT_HDR_SIZE; -- cgit v1.2.3 From bbeba3e58f040a4297a5ba88ebf6e2b16adc3657 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 30 Jun 2020 13:05:29 -0400 Subject: ring-buffer: Call trace_clock_local() directly for RETPOLINE kernels After doing some benchmarks and examining the code, I found that the ring buffer clock calls were quite expensive, and noticed that it uses retpolines. This is because the ring buffer clock is programmable, and can be set. But in most cases it simply uses the fastest ns unit clock which is the trace_clock_local(). For RETPOLINE builds, checking if the ring buffer clock is set to trace_clock_local() and then calling it directly has brought the time of an event on my i7 box from an average of 93 nanoseconds an event down to 83 nanoseconds an event, and the minimum time from 81 nanoseconds to 68 nanoseconds! Suggested-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a30ca7ec2200..2bb96ee80120 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -970,8 +970,16 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, static inline u64 rb_time_stamp(struct trace_buffer *buffer) { + u64 ts; + + /* Skip retpolines :-( */ + if (IS_ENABLED(CONFIG_RETPOLINE) && likely(buffer->clock == trace_clock_local)) + ts = trace_clock_local(); + else + ts = buffer->clock(); + /* shift to debug/test normalization and TIME_EXTENTS */ - return buffer->clock() << DEBUG_SHIFT; + return ts << DEBUG_SHIFT; } u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu) -- cgit v1.2.3 From 29ce24519c0692ca7d998d7444a9e016a4c44fa7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 1 Jul 2020 13:10:19 -0400 Subject: ring-buffer: Do not trigger a WARN if clock going backwards is detected After tweaking the ring buffer to be a bit faster, a warning is triggering on one of my machines, and causing my tests to fail. This warning is caused when the delta (current time stamp minus previous time stamp), is larger than the max time held by the ring buffer (59 bits). If the clock were to go backwards slightly, this would then easily trigger this warning. The machine that it triggered on, the clock did go backwards by around 450 nanoseconds, and this happened after a recalibration of the TSC clock. Now that the ring buffer is faster, it detects this, and the delta that is used larger than the max, the warning is triggered and my test fails. To handle the clock going backwards, look at the saved before and after time stamps. If they are the same, it means that the current event did not interrupt another event, and that those timestamp are of a previous event that was recorded. If the max delta is triggered, look at those time stamps, make sure they are the same, then use them to compare with the current timestamp. If the current timestamp is less than the before/after time stamps, then that means the clock being used went backward. Print out a message that this has happened, but do not warn about it (and only print the message once). Still do the warning if the delta is indeed larger than what can be used. Also remove the unneeded KERN_WARNING from the WARN_ONCE() print. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2bb96ee80120..c3a2e7509527 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2642,8 +2642,7 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, { u64 write_stamp; - WARN_ONCE(info->delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s", + WARN_ONCE(1, "Delta way too big! %llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s", (unsigned long long)info->delta, (unsigned long long)info->ts, (unsigned long long)info->before, @@ -2665,7 +2664,26 @@ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer, bool abs = info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE); - rb_check_timestamp(cpu_buffer, info); + if (unlikely(info->delta > (1ULL << 59))) { + /* did the clock go backwards */ + if (info->before == info->after && info->before > info->ts) { + /* not interrupted */ + static int once; + + /* + * This is possible with a recalibrating of the TSC. + * Do not produce a call stack, but just report it. + */ + if (!once) { + once++; + pr_warn("Ring buffer clock went backwards: %llu -> %llu\n", + info->before, info->ts); + } + } else + rb_check_timestamp(cpu_buffer, info); + if (!abs) + info->delta = 0; + } *event = rb_add_time_stamp(*event, info->delta, abs); *length -= RB_LEN_TIME_EXTEND; *delta = 0; -- cgit v1.2.3 From 10dd8573b09e84b81539d939d55ebdb6a36c5f3a Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 29 Jun 2020 13:54:59 +0530 Subject: cpufreq: Register governors at core_initcall Currently, most CPUFreq governors are registered at the core_initcall time when the given governor is the default one, and the module_init time otherwise. In preparation for letting users specify the default governor on the kernel command line, change all of them to be registered at the core_initcall unconditionally, as it is already the case for the schedutil and performance governors. This will allow us to assume that builtin governors have been registered before the built-in CPUFreq drivers probe. And since all governors have similar init/exit patterns now, introduce two new macros, cpufreq_governor_{init,exit}(), to factorize the code. Acked-by: Viresh Kumar Signed-off-by: Quentin Perret Signed-off-by: Viresh Kumar [ rjw: Changelog ] Signed-off-by: Rafael J. Wysocki --- arch/powerpc/platforms/cell/cpufreq_spudemand.c | 26 ++----------------------- drivers/cpufreq/cpufreq_conservative.c | 22 +++++---------------- drivers/cpufreq/cpufreq_ondemand.c | 24 ++++++----------------- drivers/cpufreq/cpufreq_performance.c | 14 ++----------- drivers/cpufreq/cpufreq_powersave.c | 18 +++-------------- drivers/cpufreq/cpufreq_userspace.c | 18 +++-------------- include/linux/cpufreq.h | 14 +++++++++++++ kernel/sched/cpufreq_schedutil.c | 6 +----- 8 files changed, 36 insertions(+), 106 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/platforms/cell/cpufreq_spudemand.c b/arch/powerpc/platforms/cell/cpufreq_spudemand.c index 55b31eadb3c8..ca7849e113d7 100644 --- a/arch/powerpc/platforms/cell/cpufreq_spudemand.c +++ b/arch/powerpc/platforms/cell/cpufreq_spudemand.c @@ -126,30 +126,8 @@ static struct cpufreq_governor spu_governor = { .stop = spu_gov_stop, .owner = THIS_MODULE, }; - -/* - * module init and destoy - */ - -static int __init spu_gov_init(void) -{ - int ret; - - ret = cpufreq_register_governor(&spu_governor); - if (ret) - printk(KERN_ERR "registration of governor failed\n"); - return ret; -} - -static void __exit spu_gov_exit(void) -{ - cpufreq_unregister_governor(&spu_governor); -} - - -module_init(spu_gov_init); -module_exit(spu_gov_exit); +cpufreq_governor_init(spu_governor); +cpufreq_governor_exit(spu_governor); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Christian Krafft "); - diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 737ff3b9c2c0..aa39ff31ec9f 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -322,17 +322,7 @@ static struct dbs_governor cs_governor = { .start = cs_start, }; -#define CPU_FREQ_GOV_CONSERVATIVE (&cs_governor.gov) - -static int __init cpufreq_gov_dbs_init(void) -{ - return cpufreq_register_governor(CPU_FREQ_GOV_CONSERVATIVE); -} - -static void __exit cpufreq_gov_dbs_exit(void) -{ - cpufreq_unregister_governor(CPU_FREQ_GOV_CONSERVATIVE); -} +#define CPU_FREQ_GOV_CONSERVATIVE (cs_governor.gov) MODULE_AUTHOR("Alexander Clouter "); MODULE_DESCRIPTION("'cpufreq_conservative' - A dynamic cpufreq governor for " @@ -343,11 +333,9 @@ MODULE_LICENSE("GPL"); #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE struct cpufreq_governor *cpufreq_default_governor(void) { - return CPU_FREQ_GOV_CONSERVATIVE; + return &CPU_FREQ_GOV_CONSERVATIVE; } - -core_initcall(cpufreq_gov_dbs_init); -#else -module_init(cpufreq_gov_dbs_init); #endif -module_exit(cpufreq_gov_dbs_exit); + +cpufreq_governor_init(CPU_FREQ_GOV_CONSERVATIVE); +cpufreq_governor_exit(CPU_FREQ_GOV_CONSERVATIVE); diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 82a4d37ddecb..ac361a8b1d3b 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -408,7 +408,7 @@ static struct dbs_governor od_dbs_gov = { .start = od_start, }; -#define CPU_FREQ_GOV_ONDEMAND (&od_dbs_gov.gov) +#define CPU_FREQ_GOV_ONDEMAND (od_dbs_gov.gov) static void od_set_powersave_bias(unsigned int powersave_bias) { @@ -429,7 +429,7 @@ static void od_set_powersave_bias(unsigned int powersave_bias) continue; policy = cpufreq_cpu_get_raw(cpu); - if (!policy || policy->governor != CPU_FREQ_GOV_ONDEMAND) + if (!policy || policy->governor != &CPU_FREQ_GOV_ONDEMAND) continue; policy_dbs = policy->governor_data; @@ -461,16 +461,6 @@ void od_unregister_powersave_bias_handler(void) } EXPORT_SYMBOL_GPL(od_unregister_powersave_bias_handler); -static int __init cpufreq_gov_dbs_init(void) -{ - return cpufreq_register_governor(CPU_FREQ_GOV_ONDEMAND); -} - -static void __exit cpufreq_gov_dbs_exit(void) -{ - cpufreq_unregister_governor(CPU_FREQ_GOV_ONDEMAND); -} - MODULE_AUTHOR("Venkatesh Pallipadi "); MODULE_AUTHOR("Alexey Starikovskiy "); MODULE_DESCRIPTION("'cpufreq_ondemand' - A dynamic cpufreq governor for " @@ -480,11 +470,9 @@ MODULE_LICENSE("GPL"); #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND struct cpufreq_governor *cpufreq_default_governor(void) { - return CPU_FREQ_GOV_ONDEMAND; + return &CPU_FREQ_GOV_ONDEMAND; } - -core_initcall(cpufreq_gov_dbs_init); -#else -module_init(cpufreq_gov_dbs_init); #endif -module_exit(cpufreq_gov_dbs_exit); + +cpufreq_governor_init(CPU_FREQ_GOV_ONDEMAND); +cpufreq_governor_exit(CPU_FREQ_GOV_ONDEMAND); diff --git a/drivers/cpufreq/cpufreq_performance.c b/drivers/cpufreq/cpufreq_performance.c index def9afe0f5b8..71c1d9aba772 100644 --- a/drivers/cpufreq/cpufreq_performance.c +++ b/drivers/cpufreq/cpufreq_performance.c @@ -23,16 +23,6 @@ static struct cpufreq_governor cpufreq_gov_performance = { .limits = cpufreq_gov_performance_limits, }; -static int __init cpufreq_gov_performance_init(void) -{ - return cpufreq_register_governor(&cpufreq_gov_performance); -} - -static void __exit cpufreq_gov_performance_exit(void) -{ - cpufreq_unregister_governor(&cpufreq_gov_performance); -} - #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE struct cpufreq_governor *cpufreq_default_governor(void) { @@ -50,5 +40,5 @@ MODULE_AUTHOR("Dominik Brodowski "); MODULE_DESCRIPTION("CPUfreq policy governor 'performance'"); MODULE_LICENSE("GPL"); -core_initcall(cpufreq_gov_performance_init); -module_exit(cpufreq_gov_performance_exit); +cpufreq_governor_init(cpufreq_gov_performance); +cpufreq_governor_exit(cpufreq_gov_performance); diff --git a/drivers/cpufreq/cpufreq_powersave.c b/drivers/cpufreq/cpufreq_powersave.c index 1ae66019eb83..7749522355b5 100644 --- a/drivers/cpufreq/cpufreq_powersave.c +++ b/drivers/cpufreq/cpufreq_powersave.c @@ -23,16 +23,6 @@ static struct cpufreq_governor cpufreq_gov_powersave = { .owner = THIS_MODULE, }; -static int __init cpufreq_gov_powersave_init(void) -{ - return cpufreq_register_governor(&cpufreq_gov_powersave); -} - -static void __exit cpufreq_gov_powersave_exit(void) -{ - cpufreq_unregister_governor(&cpufreq_gov_powersave); -} - MODULE_AUTHOR("Dominik Brodowski "); MODULE_DESCRIPTION("CPUfreq policy governor 'powersave'"); MODULE_LICENSE("GPL"); @@ -42,9 +32,7 @@ struct cpufreq_governor *cpufreq_default_governor(void) { return &cpufreq_gov_powersave; } - -core_initcall(cpufreq_gov_powersave_init); -#else -module_init(cpufreq_gov_powersave_init); #endif -module_exit(cpufreq_gov_powersave_exit); + +cpufreq_governor_init(cpufreq_gov_powersave); +cpufreq_governor_exit(cpufreq_gov_powersave); diff --git a/drivers/cpufreq/cpufreq_userspace.c b/drivers/cpufreq/cpufreq_userspace.c index b43e7cd502c5..50a4d7846580 100644 --- a/drivers/cpufreq/cpufreq_userspace.c +++ b/drivers/cpufreq/cpufreq_userspace.c @@ -126,16 +126,6 @@ static struct cpufreq_governor cpufreq_gov_userspace = { .owner = THIS_MODULE, }; -static int __init cpufreq_gov_userspace_init(void) -{ - return cpufreq_register_governor(&cpufreq_gov_userspace); -} - -static void __exit cpufreq_gov_userspace_exit(void) -{ - cpufreq_unregister_governor(&cpufreq_gov_userspace); -} - MODULE_AUTHOR("Dominik Brodowski , " "Russell King "); MODULE_DESCRIPTION("CPUfreq policy governor 'userspace'"); @@ -146,9 +136,7 @@ struct cpufreq_governor *cpufreq_default_governor(void) { return &cpufreq_gov_userspace; } - -core_initcall(cpufreq_gov_userspace_init); -#else -module_init(cpufreq_gov_userspace_init); #endif -module_exit(cpufreq_gov_userspace_exit); + +cpufreq_governor_init(cpufreq_gov_userspace); +cpufreq_governor_exit(cpufreq_gov_userspace); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 3494f6763597..e62b022cb07e 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -577,6 +577,20 @@ unsigned int cpufreq_policy_transition_delay_us(struct cpufreq_policy *policy); int cpufreq_register_governor(struct cpufreq_governor *governor); void cpufreq_unregister_governor(struct cpufreq_governor *governor); +#define cpufreq_governor_init(__governor) \ +static int __init __governor##_init(void) \ +{ \ + return cpufreq_register_governor(&__governor); \ +} \ +core_initcall(__governor##_init) + +#define cpufreq_governor_exit(__governor) \ +static void __exit __governor##_exit(void) \ +{ \ + return cpufreq_unregister_governor(&__governor); \ +} \ +module_exit(__governor##_exit) + struct cpufreq_governor *cpufreq_default_governor(void); struct cpufreq_governor *cpufreq_fallback_governor(void); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 7fbaee24c824..402a09af9f43 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -909,11 +909,7 @@ struct cpufreq_governor *cpufreq_default_governor(void) } #endif -static int __init sugov_register(void) -{ - return cpufreq_register_governor(&schedutil_gov); -} -core_initcall(sugov_register); +cpufreq_governor_init(schedutil_gov); #ifdef CONFIG_ENERGY_MODEL extern bool sched_energy_update; -- cgit v1.2.3 From 1d50e5d0c5052446cb85a3bf11fe8ba4e8d770ca Mon Sep 17 00:00:00 2001 From: Bhupesh Sharma Date: Thu, 14 May 2020 00:22:36 +0530 Subject: crash_core, vmcoreinfo: Append 'MAX_PHYSMEM_BITS' to vmcoreinfo Right now user-space tools like 'makedumpfile' and 'crash' need to rely on a best-guess method of determining value of 'MAX_PHYSMEM_BITS' supported by underlying kernel. This value is used in user-space code to calculate the bit-space required to store a section for SPARESMEM (similar to the existing calculation method used in the kernel implementation): #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) Now, regressions have been reported in user-space utilities like 'makedumpfile' and 'crash' on arm64, with the recently added kernel support for 52-bit physical address space, as there is no clear method of determining this value in user-space (other than reading kernel CONFIG flags). As per suggestion from makedumpfile maintainer (Kazu), it makes more sense to append 'MAX_PHYSMEM_BITS' to vmcoreinfo in the core code itself rather than in arch-specific code, so that the user-space code for other archs can also benefit from this addition to the vmcoreinfo and use it as a standard way of determining 'SECTIONS_SHIFT' value in user-land. A reference 'makedumpfile' implementation which reads the 'MAX_PHYSMEM_BITS' value from vmcoreinfo in a arch-independent fashion is available here: While at it also update vmcoreinfo documentation for 'MAX_PHYSMEM_BITS' variable being added to vmcoreinfo. 'MAX_PHYSMEM_BITS' defines the maximum supported physical address space memory. Signed-off-by: Bhupesh Sharma Tested-by: John Donnelly Acked-by: Dave Young Cc: Boris Petkov Cc: Ingo Molnar Cc: Thomas Gleixner Cc: James Morse Cc: Mark Rutland Cc: Will Deacon Cc: Michael Ellerman Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Dave Anderson Cc: Kazuhito Hagio Cc: x86@kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: kexec@lists.infradead.org Link: https://lore.kernel.org/r/1589395957-24628-2-git-send-email-bhsharma@redhat.com Signed-off-by: Catalin Marinas --- Documentation/admin-guide/kdump/vmcoreinfo.rst | 5 +++++ kernel/crash_core.c | 1 + 2 files changed, 6 insertions(+) (limited to 'kernel') diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index e4ee8b2db604..2a632020f809 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -93,6 +93,11 @@ It exists in the sparse memory mapping model, and it is also somewhat similar to the mem_map variable, both of them are used to translate an address. +MAX_PHYSMEM_BITS +---------------- + +Defines the maximum supported physical address space memory. + page ---- diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 9f1557b98468..18175687133a 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -413,6 +413,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); VMCOREINFO_STRUCT_SIZE(mem_section); VMCOREINFO_OFFSET(mem_section, section_mem_map); + VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS); #endif VMCOREINFO_STRUCT_SIZE(page); VMCOREINFO_STRUCT_SIZE(pglist_data); -- cgit v1.2.3 From 046cc3dd9a2507b8e111714807ab8bf15ea5bb70 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 2 Jul 2020 19:45:37 -0700 Subject: bpf: Fix build without CONFIG_STACKTRACE Without CONFIG_STACKTRACE stack_trace_save_tsk() is not defined. Let get_callchain_entry_for_task() to always return NULL in such cases. Fixes: fa28dcb82a38 ("bpf: Introduce helper bpf_get_task_stack()") Reported-by: kernel test robot Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200703024537.79971-1-songliubraving@fb.com --- kernel/bpf/stackmap.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 5ad72ab2276b..a6c361ed7937 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -351,6 +351,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, static struct perf_callchain_entry * get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) { +#ifdef CONFIG_STACKTRACE struct perf_callchain_entry *entry; int rctx; @@ -380,6 +381,9 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) put_callchain_entry(rctx); return entry; +#else /* CONFIG_STACKTRACE */ + return NULL; +#endif } BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, -- cgit v1.2.3 From 8fa88a88d573093868565a1afba43b5ae5b3a316 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 3 Jul 2020 16:56:45 +0100 Subject: genirq: Remove preflow handler support That was put in place for sparc64, and blackfin also used it for some time; sparc64 no longer uses those, and blackfin is dead. As there are no more users, remove preflow handlers. Signed-off-by: Valentin Schneider Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200703155645.29703-3-valentin.schneider@arm.com --- include/linux/irqdesc.h | 15 --------------- include/linux/irqhandler.h | 1 - kernel/irq/Kconfig | 4 ---- kernel/irq/chip.c | 13 ------------- 4 files changed, 33 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 8f2820c5e69e..5745491303e0 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -22,7 +22,6 @@ struct pt_regs; * @irq_common_data: per irq and chip data passed down to chip functions * @kstat_irqs: irq stats per cpu * @handle_irq: highlevel irq-events handler - * @preflow_handler: handler called before the flow handler (currently used by sparc) * @action: the irq action chain * @status_use_accessors: status information * @core_internal_state__do_not_mess_with_it: core internal status information @@ -58,9 +57,6 @@ struct irq_desc { struct irq_data irq_data; unsigned int __percpu *kstat_irqs; irq_flow_handler_t handle_irq; -#ifdef CONFIG_IRQ_PREFLOW_FASTEOI - irq_preflow_handler_t preflow_handler; -#endif struct irqaction *action; /* IRQ action list */ unsigned int status_use_accessors; unsigned int core_internal_state__do_not_mess_with_it; @@ -268,15 +264,4 @@ irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, } } -#ifdef CONFIG_IRQ_PREFLOW_FASTEOI -static inline void -__irq_set_preflow_handler(unsigned int irq, irq_preflow_handler_t handler) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - desc->preflow_handler = handler; -} -#endif - #endif diff --git a/include/linux/irqhandler.h b/include/linux/irqhandler.h index 1e6f4e7123d6..c30f454a9518 100644 --- a/include/linux/irqhandler.h +++ b/include/linux/irqhandler.h @@ -10,6 +10,5 @@ struct irq_desc; struct irq_data; typedef void (*irq_flow_handler_t)(struct irq_desc *desc); -typedef void (*irq_preflow_handler_t)(struct irq_data *data); #endif diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 20512252ecc9..10a5aff4eecc 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -51,10 +51,6 @@ config GENERIC_IRQ_INJECTION config HARDIRQS_SW_RESEND bool -# Preflow handler support for fasteoi (sparc64) -config IRQ_PREFLOW_FASTEOI - bool - # Edge style eoi based handler (cell) config IRQ_EDGE_EOI_HANDLER bool diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 41e7e37a0928..75bbaa8b38f1 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -656,16 +656,6 @@ out_unlock: } EXPORT_SYMBOL_GPL(handle_level_irq); -#ifdef CONFIG_IRQ_PREFLOW_FASTEOI -static inline void preflow_handler(struct irq_desc *desc) -{ - if (desc->preflow_handler) - desc->preflow_handler(&desc->irq_data); -} -#else -static inline void preflow_handler(struct irq_desc *desc) { } -#endif - static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) { if (!(desc->istate & IRQS_ONESHOT)) { @@ -721,7 +711,6 @@ void handle_fasteoi_irq(struct irq_desc *desc) if (desc->istate & IRQS_ONESHOT) mask_irq(desc); - preflow_handler(desc); handle_irq_event(desc); cond_unmask_eoi_irq(desc, chip); @@ -1231,7 +1220,6 @@ void handle_fasteoi_ack_irq(struct irq_desc *desc) /* Start handling the irq */ desc->irq_data.chip->irq_ack(&desc->irq_data); - preflow_handler(desc); handle_irq_event(desc); cond_unmask_eoi_irq(desc, chip); @@ -1281,7 +1269,6 @@ void handle_fasteoi_mask_irq(struct irq_desc *desc) if (desc->istate & IRQS_ONESHOT) mask_irq(desc); - preflow_handler(desc); handle_irq_event(desc); cond_unmask_eoi_irq(desc, chip); -- cgit v1.2.3 From 5fec25f2cb959cb5f189d7f6127bee3efc782530 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 24 Jun 2020 16:34:57 -0500 Subject: umh: Capture the pid in umh_pipe_setup The pid in struct subprocess_info is only used by umh_clean_and_save_pid to write the pid into umh_info. Instead always capture the pid on struct umh_info in umh_pipe_setup, removing code that is specific to user mode drivers from the common user path of user mode helpers. v1: https://lkml.kernel.org/r/87h7uygf9i.fsf_-_@x220.int.ebiederm.org v2: https://lkml.kernel.org/r/875zb97iix.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-1-ebiederm@xmission.com Reviewed-by: Greg Kroah-Hartman Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. Biederman" --- include/linux/umh.h | 1 - kernel/umh.c | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/umh.h b/include/linux/umh.h index 0c08de356d0d..aae16a0ebd0f 100644 --- a/include/linux/umh.h +++ b/include/linux/umh.h @@ -25,7 +25,6 @@ struct subprocess_info { struct file *file; int wait; int retval; - pid_t pid; int (*init)(struct subprocess_info *info, struct cred *new); void (*cleanup)(struct subprocess_info *info); void *data; diff --git a/kernel/umh.c b/kernel/umh.c index 79f139a7ca03..c2a582b3a2bf 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -102,7 +102,6 @@ static int call_usermodehelper_exec_async(void *data) commit_creds(new); - sub_info->pid = task_pid_nr(current); if (sub_info->file) { retval = do_execve_file(sub_info->file, sub_info->argv, sub_info->envp); @@ -468,6 +467,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) umh_info->pipe_to_umh = to_umh[1]; umh_info->pipe_from_umh = from_umh[0]; + umh_info->pid = task_pid_nr(current); return 0; } @@ -476,13 +476,12 @@ static void umh_clean_and_save_pid(struct subprocess_info *info) struct umh_info *umh_info = info->data; /* cleanup if umh_pipe_setup() was successful but exec failed */ - if (info->pid && info->retval) { + if (info->retval) { fput(umh_info->pipe_to_umh); fput(umh_info->pipe_from_umh); } argv_free(info->argv); - umh_info->pid = info->pid; } /** -- cgit v1.2.3 From b044fa2ae50d52d8c9f9d130055c2aea032e7475 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 25 Jun 2020 10:04:25 -0500 Subject: umh: Move setting PF_UMH into umh_pipe_setup I am separating the code specific to user mode drivers from the code for ordinary user space helpers. Move setting of PF_UMH from call_usermodehelper_exec_async which is core user mode helper code into umh_pipe_setup which is user mode driver code. The code is equally as easy to write in one location as the other and the movement minimizes the impact of the user mode driver code on the core of the user mode helper code. Setting PF_UMH unconditionally is harmless as an action will only happen if it is paired with an entry on umh_list. v1: https://lkml.kernel.org/r/87bll6gf8t.fsf_-_@x220.int.ebiederm.org v2: https://lkml.kernel.org/r/87zh8l63xs.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-2-ebiederm@xmission.com Reviewed-by: Greg Kroah-Hartman Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. Biederman" --- kernel/umh.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/umh.c b/kernel/umh.c index c2a582b3a2bf..e6b9d6636850 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -102,12 +102,10 @@ static int call_usermodehelper_exec_async(void *data) commit_creds(new); - if (sub_info->file) { + if (sub_info->file) retval = do_execve_file(sub_info->file, sub_info->argv, sub_info->envp); - if (!retval) - current->flags |= PF_UMH; - } else + else retval = do_execve(getname_kernel(sub_info->path), (const char __user *const __user *)sub_info->argv, (const char __user *const __user *)sub_info->envp); @@ -468,6 +466,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) umh_info->pipe_to_umh = to_umh[1]; umh_info->pipe_from_umh = from_umh[0]; umh_info->pid = task_pid_nr(current); + current->flags |= PF_UMH; return 0; } -- cgit v1.2.3 From 3a171042aeab8702391c311d84633a6c267d566c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 25 Jun 2020 10:10:03 -0500 Subject: umh: Rename the user mode driver helpers for clarity Now that the functionality of umh_setup_pipe and umh_clean_and_save_pid has changed their names are too specific and don't make much sense. Instead name them umd_setup and umd_cleanup for the functional role in setting up user mode drivers. v1: https://lkml.kernel.org/r/875zbegf82.fsf_-_@x220.int.ebiederm.org v2: https://lkml.kernel.org/r/87tuyt63x3.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-3-ebiederm@xmission.com Reviewed-by: Greg Kroah-Hartman Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. Biederman" --- kernel/umh.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/umh.c b/kernel/umh.c index e6b9d6636850..26c3d493f168 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -429,7 +429,7 @@ struct subprocess_info *call_usermodehelper_setup_file(struct file *file, return sub_info; } -static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) +static int umd_setup(struct subprocess_info *info, struct cred *new) { struct umh_info *umh_info = info->data; struct file *from_umh[2]; @@ -470,11 +470,11 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) return 0; } -static void umh_clean_and_save_pid(struct subprocess_info *info) +static void umd_cleanup(struct subprocess_info *info) { struct umh_info *umh_info = info->data; - /* cleanup if umh_pipe_setup() was successful but exec failed */ + /* cleanup if umh_setup() was successful but exec failed */ if (info->retval) { fput(umh_info->pipe_to_umh); fput(umh_info->pipe_from_umh); @@ -520,8 +520,8 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info) } err = -ENOMEM; - sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup, - umh_clean_and_save_pid, info); + sub_info = call_usermodehelper_setup_file(file, umd_setup, umd_cleanup, + info); if (!sub_info) goto out; -- cgit v1.2.3 From 21d598280675c463ea1b264fab06e9614aacd1e1 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 24 Jun 2020 17:01:18 -0500 Subject: umh: Remove call_usermodehelper_setup_file. The only caller of call_usermodehelper_setup_file is fork_usermode_blob. In fork_usermode_blob replace call_usermodehelper_setup_file with call_usermodehelper_setup and delete fork_usermodehelper_setup_file. For this to work the argv_free is moved from umh_clean_and_save_pid to fork_usermode_blob. v1: https://lkml.kernel.org/r/87zh8qf0mp.fsf_-_@x220.int.ebiederm.org v2: https://lkml.kernel.org/r/87o8p163u1.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-4-ebiederm@xmission.com Reviewed-by: Greg Kroah-Hartman Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. Biederman" --- include/linux/umh.h | 3 --- kernel/umh.c | 42 +++++++++++------------------------------- 2 files changed, 11 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/include/linux/umh.h b/include/linux/umh.h index aae16a0ebd0f..de08af00c68a 100644 --- a/include/linux/umh.h +++ b/include/linux/umh.h @@ -39,9 +39,6 @@ call_usermodehelper_setup(const char *path, char **argv, char **envp, int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *), void *data); -struct subprocess_info *call_usermodehelper_setup_file(struct file *file, - int (*init)(struct subprocess_info *info, struct cred *new), - void (*cleanup)(struct subprocess_info *), void *data); struct umh_info { const char *cmdline; struct file *pipe_to_umh; diff --git a/kernel/umh.c b/kernel/umh.c index 26c3d493f168..b8fa9b99b366 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -402,33 +402,6 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, } EXPORT_SYMBOL(call_usermodehelper_setup); -struct subprocess_info *call_usermodehelper_setup_file(struct file *file, - int (*init)(struct subprocess_info *info, struct cred *new), - void (*cleanup)(struct subprocess_info *info), void *data) -{ - struct subprocess_info *sub_info; - struct umh_info *info = data; - const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper"; - - sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL); - if (!sub_info) - return NULL; - - sub_info->argv = argv_split(GFP_KERNEL, cmdline, NULL); - if (!sub_info->argv) { - kfree(sub_info); - return NULL; - } - - INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); - sub_info->path = "none"; - sub_info->file = file; - sub_info->init = init; - sub_info->cleanup = cleanup; - sub_info->data = data; - return sub_info; -} - static int umd_setup(struct subprocess_info *info, struct cred *new) { struct umh_info *umh_info = info->data; @@ -479,8 +452,6 @@ static void umd_cleanup(struct subprocess_info *info) fput(umh_info->pipe_to_umh); fput(umh_info->pipe_from_umh); } - - argv_free(info->argv); } /** @@ -501,7 +472,9 @@ static void umd_cleanup(struct subprocess_info *info) */ int fork_usermode_blob(void *data, size_t len, struct umh_info *info) { + const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper"; struct subprocess_info *sub_info; + char **argv = NULL; struct file *file; ssize_t written; loff_t pos = 0; @@ -520,11 +493,16 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info) } err = -ENOMEM; - sub_info = call_usermodehelper_setup_file(file, umd_setup, umd_cleanup, - info); + argv = argv_split(GFP_KERNEL, cmdline, NULL); + if (!argv) + goto out; + + sub_info = call_usermodehelper_setup("none", argv, NULL, GFP_KERNEL, + umd_setup, umd_cleanup, info); if (!sub_info) goto out; + sub_info->file = file; err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); if (!err) { mutex_lock(&umh_list_lock); @@ -532,6 +510,8 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info) mutex_unlock(&umh_list_lock); } out: + if (argv) + argv_free(argv); fput(file); return err; } -- cgit v1.2.3 From 884c5e683b67dbc52892e24c29eed864f330ec08 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 26 Jun 2020 12:23:00 -0500 Subject: umh: Separate the user mode driver and the user mode helper support This makes it clear which code is part of the core user mode helper support and which code is needed to implement user mode drivers. This makes the kernel smaller for everyone who does not use a usermode driver. v1: https://lkml.kernel.org/r/87tuyyf0ln.fsf_-_@x220.int.ebiederm.org v2: https://lkml.kernel.org/r/87imf963s6.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-5-ebiederm@xmission.com Reviewed-by: Greg Kroah-Hartman Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. Biederman" --- include/linux/bpfilter.h | 2 +- include/linux/sched.h | 8 --- include/linux/umh.h | 10 --- include/linux/usermode_driver.h | 30 +++++++++ kernel/Makefile | 1 + kernel/exit.c | 1 + kernel/umh.c | 139 -------------------------------------- kernel/usermode_driver.c | 146 ++++++++++++++++++++++++++++++++++++++++ 8 files changed, 179 insertions(+), 158 deletions(-) create mode 100644 include/linux/usermode_driver.h create mode 100644 kernel/usermode_driver.c (limited to 'kernel') diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h index d815622cd31e..d6d6206052a6 100644 --- a/include/linux/bpfilter.h +++ b/include/linux/bpfilter.h @@ -3,7 +3,7 @@ #define _LINUX_BPFILTER_H #include -#include +#include struct sock; int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval, diff --git a/include/linux/sched.h b/include/linux/sched.h index b62e6aaf28f0..59d1e92bb88e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2020,14 +2020,6 @@ static inline void rseq_execve(struct task_struct *t) #endif -void __exit_umh(struct task_struct *tsk); - -static inline void exit_umh(struct task_struct *tsk) -{ - if (unlikely(tsk->flags & PF_UMH)) - __exit_umh(tsk); -} - #ifdef CONFIG_DEBUG_RSEQ void rseq_syscall(struct pt_regs *regs); diff --git a/include/linux/umh.h b/include/linux/umh.h index de08af00c68a..73173c4a07e5 100644 --- a/include/linux/umh.h +++ b/include/linux/umh.h @@ -39,16 +39,6 @@ call_usermodehelper_setup(const char *path, char **argv, char **envp, int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *), void *data); -struct umh_info { - const char *cmdline; - struct file *pipe_to_umh; - struct file *pipe_from_umh; - struct list_head list; - void (*cleanup)(struct umh_info *info); - pid_t pid; -}; -int fork_usermode_blob(void *data, size_t len, struct umh_info *info); - extern int call_usermodehelper_exec(struct subprocess_info *info, int wait); diff --git a/include/linux/usermode_driver.h b/include/linux/usermode_driver.h new file mode 100644 index 000000000000..c5f6dc950227 --- /dev/null +++ b/include/linux/usermode_driver.h @@ -0,0 +1,30 @@ +#ifndef __LINUX_USERMODE_DRIVER_H__ +#define __LINUX_USERMODE_DRIVER_H__ + +#include + +#ifdef CONFIG_BPFILTER +void __exit_umh(struct task_struct *tsk); + +static inline void exit_umh(struct task_struct *tsk) +{ + if (unlikely(tsk->flags & PF_UMH)) + __exit_umh(tsk); +} +#else +static inline void exit_umh(struct task_struct *tsk) +{ +} +#endif + +struct umh_info { + const char *cmdline; + struct file *pipe_to_umh; + struct file *pipe_from_umh; + struct list_head list; + void (*cleanup)(struct umh_info *info); + pid_t pid; +}; +int fork_usermode_blob(void *data, size_t len, struct umh_info *info); + +#endif /* __LINUX_USERMODE_DRIVER_H__ */ diff --git a/kernel/Makefile b/kernel/Makefile index f3218bc5ec69..43928759893a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -12,6 +12,7 @@ obj-y = fork.o exec_domain.o panic.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o +obj-$(CONFIG_BPFILTER) += usermode_driver.o obj-$(CONFIG_MODULES) += kmod.o obj-$(CONFIG_MULTIUSER) += groups.o diff --git a/kernel/exit.c b/kernel/exit.c index 727150f28103..a081deea52ca 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include diff --git a/kernel/umh.c b/kernel/umh.c index b8fa9b99b366..3e4e453d45c8 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -26,8 +26,6 @@ #include #include #include -#include -#include #include @@ -38,8 +36,6 @@ static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; static DEFINE_SPINLOCK(umh_sysctl_lock); static DECLARE_RWSEM(umhelper_sem); -static LIST_HEAD(umh_list); -static DEFINE_MUTEX(umh_list_lock); static void call_usermodehelper_freeinfo(struct subprocess_info *info) { @@ -402,121 +398,6 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, } EXPORT_SYMBOL(call_usermodehelper_setup); -static int umd_setup(struct subprocess_info *info, struct cred *new) -{ - struct umh_info *umh_info = info->data; - struct file *from_umh[2]; - struct file *to_umh[2]; - int err; - - /* create pipe to send data to umh */ - err = create_pipe_files(to_umh, 0); - if (err) - return err; - err = replace_fd(0, to_umh[0], 0); - fput(to_umh[0]); - if (err < 0) { - fput(to_umh[1]); - return err; - } - - /* create pipe to receive data from umh */ - err = create_pipe_files(from_umh, 0); - if (err) { - fput(to_umh[1]); - replace_fd(0, NULL, 0); - return err; - } - err = replace_fd(1, from_umh[1], 0); - fput(from_umh[1]); - if (err < 0) { - fput(to_umh[1]); - replace_fd(0, NULL, 0); - fput(from_umh[0]); - return err; - } - - umh_info->pipe_to_umh = to_umh[1]; - umh_info->pipe_from_umh = from_umh[0]; - umh_info->pid = task_pid_nr(current); - current->flags |= PF_UMH; - return 0; -} - -static void umd_cleanup(struct subprocess_info *info) -{ - struct umh_info *umh_info = info->data; - - /* cleanup if umh_setup() was successful but exec failed */ - if (info->retval) { - fput(umh_info->pipe_to_umh); - fput(umh_info->pipe_from_umh); - } -} - -/** - * fork_usermode_blob - fork a blob of bytes as a usermode process - * @data: a blob of bytes that can be do_execv-ed as a file - * @len: length of the blob - * @info: information about usermode process (shouldn't be NULL) - * - * If info->cmdline is set it will be used as command line for the - * user process, else "usermodehelper" is used. - * - * Returns either negative error or zero which indicates success - * in executing a blob of bytes as a usermode process. In such - * case 'struct umh_info *info' is populated with two pipes - * and a pid of the process. The caller is responsible for health - * check of the user process, killing it via pid, and closing the - * pipes when user process is no longer needed. - */ -int fork_usermode_blob(void *data, size_t len, struct umh_info *info) -{ - const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper"; - struct subprocess_info *sub_info; - char **argv = NULL; - struct file *file; - ssize_t written; - loff_t pos = 0; - int err; - - file = shmem_kernel_file_setup("", len, 0); - if (IS_ERR(file)) - return PTR_ERR(file); - - written = kernel_write(file, data, len, &pos); - if (written != len) { - err = written; - if (err >= 0) - err = -ENOMEM; - goto out; - } - - err = -ENOMEM; - argv = argv_split(GFP_KERNEL, cmdline, NULL); - if (!argv) - goto out; - - sub_info = call_usermodehelper_setup("none", argv, NULL, GFP_KERNEL, - umd_setup, umd_cleanup, info); - if (!sub_info) - goto out; - - sub_info->file = file; - err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); - if (!err) { - mutex_lock(&umh_list_lock); - list_add(&info->list, &umh_list); - mutex_unlock(&umh_list_lock); - } -out: - if (argv) - argv_free(argv); - fput(file); - return err; -} -EXPORT_SYMBOL_GPL(fork_usermode_blob); - /** * call_usermodehelper_exec - start a usermode application * @sub_info: information about the subprocessa @@ -678,26 +559,6 @@ static int proc_cap_handler(struct ctl_table *table, int write, return 0; } -void __exit_umh(struct task_struct *tsk) -{ - struct umh_info *info; - pid_t pid = tsk->pid; - - mutex_lock(&umh_list_lock); - list_for_each_entry(info, &umh_list, list) { - if (info->pid == pid) { - list_del(&info->list); - mutex_unlock(&umh_list_lock); - goto out; - } - } - mutex_unlock(&umh_list_lock); - return; -out: - if (info->cleanup) - info->cleanup(info); -} - struct ctl_table usermodehelper_table[] = { { .procname = "bset", diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c new file mode 100644 index 000000000000..5b05863af855 --- /dev/null +++ b/kernel/usermode_driver.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * umd - User mode driver support + */ +#include +#include +#include + +static LIST_HEAD(umh_list); +static DEFINE_MUTEX(umh_list_lock); + +static int umd_setup(struct subprocess_info *info, struct cred *new) +{ + struct umh_info *umh_info = info->data; + struct file *from_umh[2]; + struct file *to_umh[2]; + int err; + + /* create pipe to send data to umh */ + err = create_pipe_files(to_umh, 0); + if (err) + return err; + err = replace_fd(0, to_umh[0], 0); + fput(to_umh[0]); + if (err < 0) { + fput(to_umh[1]); + return err; + } + + /* create pipe to receive data from umh */ + err = create_pipe_files(from_umh, 0); + if (err) { + fput(to_umh[1]); + replace_fd(0, NULL, 0); + return err; + } + err = replace_fd(1, from_umh[1], 0); + fput(from_umh[1]); + if (err < 0) { + fput(to_umh[1]); + replace_fd(0, NULL, 0); + fput(from_umh[0]); + return err; + } + + umh_info->pipe_to_umh = to_umh[1]; + umh_info->pipe_from_umh = from_umh[0]; + umh_info->pid = task_pid_nr(current); + current->flags |= PF_UMH; + return 0; +} + +static void umd_cleanup(struct subprocess_info *info) +{ + struct umh_info *umh_info = info->data; + + /* cleanup if umh_setup() was successful but exec failed */ + if (info->retval) { + fput(umh_info->pipe_to_umh); + fput(umh_info->pipe_from_umh); + } +} + +/** + * fork_usermode_blob - fork a blob of bytes as a usermode process + * @data: a blob of bytes that can be do_execv-ed as a file + * @len: length of the blob + * @info: information about usermode process (shouldn't be NULL) + * + * If info->cmdline is set it will be used as command line for the + * user process, else "usermodehelper" is used. + * + * Returns either negative error or zero which indicates success + * in executing a blob of bytes as a usermode process. In such + * case 'struct umh_info *info' is populated with two pipes + * and a pid of the process. The caller is responsible for health + * check of the user process, killing it via pid, and closing the + * pipes when user process is no longer needed. + */ +int fork_usermode_blob(void *data, size_t len, struct umh_info *info) +{ + const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper"; + struct subprocess_info *sub_info; + char **argv = NULL; + struct file *file; + ssize_t written; + loff_t pos = 0; + int err; + + file = shmem_kernel_file_setup("", len, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + + written = kernel_write(file, data, len, &pos); + if (written != len) { + err = written; + if (err >= 0) + err = -ENOMEM; + goto out; + } + + err = -ENOMEM; + argv = argv_split(GFP_KERNEL, cmdline, NULL); + if (!argv) + goto out; + + sub_info = call_usermodehelper_setup("none", argv, NULL, GFP_KERNEL, + umd_setup, umd_cleanup, info); + if (!sub_info) + goto out; + + sub_info->file = file; + err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); + if (!err) { + mutex_lock(&umh_list_lock); + list_add(&info->list, &umh_list); + mutex_unlock(&umh_list_lock); + } +out: + if (argv) + argv_free(argv); + fput(file); + return err; +} +EXPORT_SYMBOL_GPL(fork_usermode_blob); + +void __exit_umh(struct task_struct *tsk) +{ + struct umh_info *info; + pid_t pid = tsk->pid; + + mutex_lock(&umh_list_lock); + list_for_each_entry(info, &umh_list, list) { + if (info->pid == pid) { + list_del(&info->list); + mutex_unlock(&umh_list_lock); + goto out; + } + } + mutex_unlock(&umh_list_lock); + return; +out: + if (info->cleanup) + info->cleanup(info); +} + -- cgit v1.2.3 From 74be2d3b80af1bb264c3b9905b52c15efc03c0fe Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 26 Jun 2020 11:16:06 -0500 Subject: umd: For clarity rename umh_info umd_info This structure is only used for user mode drivers so change the prefix from umh to umd to make that clear. v1: https://lkml.kernel.org/r/87o8p6f0kw.fsf_-_@x220.int.ebiederm.org v2: https://lkml.kernel.org/r/878sg563po.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-6-ebiederm@xmission.com Reviewed-by: Greg Kroah-Hartman Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. Biederman" --- include/linux/bpfilter.h | 2 +- include/linux/usermode_driver.h | 6 +++--- kernel/usermode_driver.c | 20 ++++++++++---------- net/ipv4/bpfilter/sockopt.c | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h index d6d6206052a6..ec9972d822e0 100644 --- a/include/linux/bpfilter.h +++ b/include/linux/bpfilter.h @@ -11,7 +11,7 @@ int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval, int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen); struct bpfilter_umh_ops { - struct umh_info info; + struct umd_info info; /* since ip_getsockopt() can run in parallel, serialize access to umh */ struct mutex lock; int (*sockopt)(struct sock *sk, int optname, diff --git a/include/linux/usermode_driver.h b/include/linux/usermode_driver.h index c5f6dc950227..7131ea611bab 100644 --- a/include/linux/usermode_driver.h +++ b/include/linux/usermode_driver.h @@ -17,14 +17,14 @@ static inline void exit_umh(struct task_struct *tsk) } #endif -struct umh_info { +struct umd_info { const char *cmdline; struct file *pipe_to_umh; struct file *pipe_from_umh; struct list_head list; - void (*cleanup)(struct umh_info *info); + void (*cleanup)(struct umd_info *info); pid_t pid; }; -int fork_usermode_blob(void *data, size_t len, struct umh_info *info); +int fork_usermode_blob(void *data, size_t len, struct umd_info *info); #endif /* __LINUX_USERMODE_DRIVER_H__ */ diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c index 5b05863af855..e73550e946d6 100644 --- a/kernel/usermode_driver.c +++ b/kernel/usermode_driver.c @@ -11,7 +11,7 @@ static DEFINE_MUTEX(umh_list_lock); static int umd_setup(struct subprocess_info *info, struct cred *new) { - struct umh_info *umh_info = info->data; + struct umd_info *umd_info = info->data; struct file *from_umh[2]; struct file *to_umh[2]; int err; @@ -43,21 +43,21 @@ static int umd_setup(struct subprocess_info *info, struct cred *new) return err; } - umh_info->pipe_to_umh = to_umh[1]; - umh_info->pipe_from_umh = from_umh[0]; - umh_info->pid = task_pid_nr(current); + umd_info->pipe_to_umh = to_umh[1]; + umd_info->pipe_from_umh = from_umh[0]; + umd_info->pid = task_pid_nr(current); current->flags |= PF_UMH; return 0; } static void umd_cleanup(struct subprocess_info *info) { - struct umh_info *umh_info = info->data; + struct umd_info *umd_info = info->data; /* cleanup if umh_setup() was successful but exec failed */ if (info->retval) { - fput(umh_info->pipe_to_umh); - fput(umh_info->pipe_from_umh); + fput(umd_info->pipe_to_umh); + fput(umd_info->pipe_from_umh); } } @@ -72,12 +72,12 @@ static void umd_cleanup(struct subprocess_info *info) * * Returns either negative error or zero which indicates success * in executing a blob of bytes as a usermode process. In such - * case 'struct umh_info *info' is populated with two pipes + * case 'struct umd_info *info' is populated with two pipes * and a pid of the process. The caller is responsible for health * check of the user process, killing it via pid, and closing the * pipes when user process is no longer needed. */ -int fork_usermode_blob(void *data, size_t len, struct umh_info *info) +int fork_usermode_blob(void *data, size_t len, struct umd_info *info) { const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper"; struct subprocess_info *sub_info; @@ -126,7 +126,7 @@ EXPORT_SYMBOL_GPL(fork_usermode_blob); void __exit_umh(struct task_struct *tsk) { - struct umh_info *info; + struct umd_info *info; pid_t pid = tsk->pid; mutex_lock(&umh_list_lock); diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c index 0480918bfc7c..c0dbcc86fcdb 100644 --- a/net/ipv4/bpfilter/sockopt.c +++ b/net/ipv4/bpfilter/sockopt.c @@ -12,7 +12,7 @@ struct bpfilter_umh_ops bpfilter_ops; EXPORT_SYMBOL_GPL(bpfilter_ops); -static void bpfilter_umh_cleanup(struct umh_info *info) +static void bpfilter_umh_cleanup(struct umd_info *info) { mutex_lock(&bpfilter_ops.lock); bpfilter_ops.stop = true; -- cgit v1.2.3 From 1199c6c3da5197e9924a906b9de71b8d0ac62a01 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 25 Jun 2020 11:38:08 -0500 Subject: umd: Rename umd_info.cmdline umd_info.driver_name The only thing supplied in the cmdline today is the driver name so rename the field to clarify the code. As this value is always supplied stop trying to handle the case of a NULL cmdline. Additionally since we now have a name we can count on use the driver_name any place where the code is looking for a name of the binary. v1: https://lkml.kernel.org/r/87imfef0k3.fsf_-_@x220.int.ebiederm.org v2: https://lkml.kernel.org/r/87366d63os.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-7-ebiederm@xmission.com Reviewed-by: Greg Kroah-Hartman Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. Biederman" --- include/linux/usermode_driver.h | 2 +- kernel/usermode_driver.c | 11 ++++------- net/ipv4/bpfilter/sockopt.c | 2 +- 3 files changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/usermode_driver.h b/include/linux/usermode_driver.h index 7131ea611bab..48cf25e3145d 100644 --- a/include/linux/usermode_driver.h +++ b/include/linux/usermode_driver.h @@ -18,7 +18,7 @@ static inline void exit_umh(struct task_struct *tsk) #endif struct umd_info { - const char *cmdline; + const char *driver_name; struct file *pipe_to_umh; struct file *pipe_from_umh; struct list_head list; diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c index e73550e946d6..46d60d855e93 100644 --- a/kernel/usermode_driver.c +++ b/kernel/usermode_driver.c @@ -67,9 +67,6 @@ static void umd_cleanup(struct subprocess_info *info) * @len: length of the blob * @info: information about usermode process (shouldn't be NULL) * - * If info->cmdline is set it will be used as command line for the - * user process, else "usermodehelper" is used. - * * Returns either negative error or zero which indicates success * in executing a blob of bytes as a usermode process. In such * case 'struct umd_info *info' is populated with two pipes @@ -79,7 +76,6 @@ static void umd_cleanup(struct subprocess_info *info) */ int fork_usermode_blob(void *data, size_t len, struct umd_info *info) { - const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper"; struct subprocess_info *sub_info; char **argv = NULL; struct file *file; @@ -87,7 +83,7 @@ int fork_usermode_blob(void *data, size_t len, struct umd_info *info) loff_t pos = 0; int err; - file = shmem_kernel_file_setup("", len, 0); + file = shmem_kernel_file_setup(info->driver_name, len, 0); if (IS_ERR(file)) return PTR_ERR(file); @@ -100,11 +96,12 @@ int fork_usermode_blob(void *data, size_t len, struct umd_info *info) } err = -ENOMEM; - argv = argv_split(GFP_KERNEL, cmdline, NULL); + argv = argv_split(GFP_KERNEL, info->driver_name, NULL); if (!argv) goto out; - sub_info = call_usermodehelper_setup("none", argv, NULL, GFP_KERNEL, + sub_info = call_usermodehelper_setup(info->driver_name, argv, NULL, + GFP_KERNEL, umd_setup, umd_cleanup, info); if (!sub_info) goto out; diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c index c0dbcc86fcdb..5050de28333d 100644 --- a/net/ipv4/bpfilter/sockopt.c +++ b/net/ipv4/bpfilter/sockopt.c @@ -70,7 +70,7 @@ static int __init bpfilter_sockopt_init(void) { mutex_init(&bpfilter_ops.lock); bpfilter_ops.stop = true; - bpfilter_ops.info.cmdline = "bpfilter_umh"; + bpfilter_ops.info.driver_name = "bpfilter_umh"; bpfilter_ops.info.cleanup = &bpfilter_umh_cleanup; return 0; -- cgit v1.2.3 From e2dc9bf3f5275ca372001541e5f26af572976e65 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 25 Jun 2020 13:12:59 -0500 Subject: umd: Transform fork_usermode_blob into fork_usermode_driver Instead of loading a binary blob into a temporary file with shmem_kernel_file_setup load a binary blob into a temporary tmpfs filesystem. This means that the blob can be stored in an init section and discared, and it means the binary blob will have a filename so can be executed normally. The only tricky thing about this code is that in the helper function blob_to_mnt __fput_sync is used. That is because a file can not be executed if it is still open for write, and the ordinary delayed close for kernel threads does not happen soon enough, which causes the following exec to fail. The function umd_load_blob is not called with any locks so this should be safe. Executing the blob normally winds up correcting several problems with the user mode driver code discovered by Tetsuo Handa[1]. By passing an ordinary filename into the exec, it is no longer necessary to figure out how to turn a O_RDWR file descriptor into a properly referende counted O_EXEC file descriptor that forbids all writes. For path based LSMs there are no new special cases. [1] https://lore.kernel.org/linux-fsdevel/2a8775b4-1dd5-9d5c-aa42-9872445e0942@i-love.sakura.ne.jp/ v1: https://lkml.kernel.org/r/87d05mf0j9.fsf_-_@x220.int.ebiederm.org v2: https://lkml.kernel.org/r/87wo3p4p35.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-8-ebiederm@xmission.com Reviewed-by: Greg Kroah-Hartman Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. Biederman" --- include/linux/usermode_driver.h | 6 +- kernel/usermode_driver.c | 126 +++++++++++++++++++++++++++++++--------- net/bpfilter/bpfilter_kern.c | 14 ++++- 3 files changed, 113 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/include/linux/usermode_driver.h b/include/linux/usermode_driver.h index 48cf25e3145d..97c919b7147c 100644 --- a/include/linux/usermode_driver.h +++ b/include/linux/usermode_driver.h @@ -2,6 +2,7 @@ #define __LINUX_USERMODE_DRIVER_H__ #include +#include #ifdef CONFIG_BPFILTER void __exit_umh(struct task_struct *tsk); @@ -23,8 +24,11 @@ struct umd_info { struct file *pipe_from_umh; struct list_head list; void (*cleanup)(struct umd_info *info); + struct path wd; pid_t pid; }; -int fork_usermode_blob(void *data, size_t len, struct umd_info *info); +int umd_load_blob(struct umd_info *info, const void *data, size_t len); +int umd_unload_blob(struct umd_info *info); +int fork_usermode_driver(struct umd_info *info); #endif /* __LINUX_USERMODE_DRIVER_H__ */ diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c index 46d60d855e93..a86798759f83 100644 --- a/kernel/usermode_driver.c +++ b/kernel/usermode_driver.c @@ -4,11 +4,98 @@ */ #include #include +#include +#include +#include #include static LIST_HEAD(umh_list); static DEFINE_MUTEX(umh_list_lock); +static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name) +{ + struct file_system_type *type; + struct vfsmount *mnt; + struct file *file; + ssize_t written; + loff_t pos = 0; + + type = get_fs_type("tmpfs"); + if (!type) + return ERR_PTR(-ENODEV); + + mnt = kern_mount(type); + put_filesystem(type); + if (IS_ERR(mnt)) + return mnt; + + file = file_open_root(mnt->mnt_root, mnt, name, O_CREAT | O_WRONLY, 0700); + if (IS_ERR(file)) { + mntput(mnt); + return ERR_CAST(file); + } + + written = kernel_write(file, data, len, &pos); + if (written != len) { + int err = written; + if (err >= 0) + err = -ENOMEM; + filp_close(file, NULL); + mntput(mnt); + return ERR_PTR(err); + } + + fput(file); + + /* Flush delayed fput so exec can open the file read-only */ + flush_delayed_fput(); + task_work_run(); + return mnt; +} + +/** + * umd_load_blob - Remember a blob of bytes for fork_usermode_driver + * @info: information about usermode driver + * @data: a blob of bytes that can be executed as a file + * @len: The lentgh of the blob + * + */ +int umd_load_blob(struct umd_info *info, const void *data, size_t len) +{ + struct vfsmount *mnt; + + if (WARN_ON_ONCE(info->wd.dentry || info->wd.mnt)) + return -EBUSY; + + mnt = blob_to_mnt(data, len, info->driver_name); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + info->wd.mnt = mnt; + info->wd.dentry = mnt->mnt_root; + return 0; +} +EXPORT_SYMBOL_GPL(umd_load_blob); + +/** + * umd_unload_blob - Disassociate @info from a previously loaded blob + * @info: information about usermode driver + * + */ +int umd_unload_blob(struct umd_info *info) +{ + if (WARN_ON_ONCE(!info->wd.mnt || + !info->wd.dentry || + info->wd.mnt->mnt_root != info->wd.dentry)) + return -EINVAL; + + kern_unmount(info->wd.mnt); + info->wd.mnt = NULL; + info->wd.dentry = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(umd_unload_blob); + static int umd_setup(struct subprocess_info *info, struct cred *new) { struct umd_info *umd_info = info->data; @@ -43,6 +130,7 @@ static int umd_setup(struct subprocess_info *info, struct cred *new) return err; } + set_fs_pwd(current->fs, &umd_info->wd); umd_info->pipe_to_umh = to_umh[1]; umd_info->pipe_from_umh = from_umh[0]; umd_info->pid = task_pid_nr(current); @@ -62,39 +150,21 @@ static void umd_cleanup(struct subprocess_info *info) } /** - * fork_usermode_blob - fork a blob of bytes as a usermode process - * @data: a blob of bytes that can be do_execv-ed as a file - * @len: length of the blob - * @info: information about usermode process (shouldn't be NULL) + * fork_usermode_driver - fork a usermode driver + * @info: information about usermode driver (shouldn't be NULL) * - * Returns either negative error or zero which indicates success - * in executing a blob of bytes as a usermode process. In such - * case 'struct umd_info *info' is populated with two pipes - * and a pid of the process. The caller is responsible for health - * check of the user process, killing it via pid, and closing the - * pipes when user process is no longer needed. + * Returns either negative error or zero which indicates success in + * executing a usermode driver. In such case 'struct umd_info *info' + * is populated with two pipes and a pid of the process. The caller is + * responsible for health check of the user process, killing it via + * pid, and closing the pipes when user process is no longer needed. */ -int fork_usermode_blob(void *data, size_t len, struct umd_info *info) +int fork_usermode_driver(struct umd_info *info) { struct subprocess_info *sub_info; char **argv = NULL; - struct file *file; - ssize_t written; - loff_t pos = 0; int err; - file = shmem_kernel_file_setup(info->driver_name, len, 0); - if (IS_ERR(file)) - return PTR_ERR(file); - - written = kernel_write(file, data, len, &pos); - if (written != len) { - err = written; - if (err >= 0) - err = -ENOMEM; - goto out; - } - err = -ENOMEM; argv = argv_split(GFP_KERNEL, info->driver_name, NULL); if (!argv) @@ -106,7 +176,6 @@ int fork_usermode_blob(void *data, size_t len, struct umd_info *info) if (!sub_info) goto out; - sub_info->file = file; err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); if (!err) { mutex_lock(&umh_list_lock); @@ -116,10 +185,9 @@ int fork_usermode_blob(void *data, size_t len, struct umd_info *info) out: if (argv) argv_free(argv); - fput(file); return err; } -EXPORT_SYMBOL_GPL(fork_usermode_blob); +EXPORT_SYMBOL_GPL(fork_usermode_driver); void __exit_umh(struct task_struct *tsk) { diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index c0f0990f30b6..28883b00609d 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -77,9 +77,7 @@ static int start_umh(void) int err; /* fork usermode process */ - err = fork_usermode_blob(&bpfilter_umh_start, - &bpfilter_umh_end - &bpfilter_umh_start, - &bpfilter_ops.info); + err = fork_usermode_driver(&bpfilter_ops.info); if (err) return err; bpfilter_ops.stop = false; @@ -98,6 +96,12 @@ static int __init load_umh(void) { int err; + err = umd_load_blob(&bpfilter_ops.info, + &bpfilter_umh_start, + &bpfilter_umh_end - &bpfilter_umh_start); + if (err) + return err; + mutex_lock(&bpfilter_ops.lock); if (!bpfilter_ops.stop) { err = -EFAULT; @@ -110,6 +114,8 @@ static int __init load_umh(void) } out: mutex_unlock(&bpfilter_ops.lock); + if (err) + umd_unload_blob(&bpfilter_ops.info); return err; } @@ -122,6 +128,8 @@ static void __exit fini_umh(void) bpfilter_ops.sockopt = NULL; } mutex_unlock(&bpfilter_ops.lock); + + umd_unload_blob(&bpfilter_ops.info); } module_init(load_umh); module_exit(fini_umh); -- cgit v1.2.3 From 55e6074e3fa67e1fb9ec140904db7e6cae6eda4b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 25 Jun 2020 13:52:50 -0500 Subject: umh: Stop calling do_execve_file With the user mode driver code changed to not set subprocess_info.file there are no more users of subproces_info.file. Remove this field from struct subprocess_info and remove the only user in call_usermodehelper_exec_async that would call do_execve_file instead of do_execve if file was set. v1: https://lkml.kernel.org/r/877dvuf0i7.fsf_-_@x220.int.ebiederm.org v2: https://lkml.kernel.org/r/87r1tx4p2a.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-9-ebiederm@xmission.com Reviewed-by: Greg Kroah-Hartman Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. Biederman" --- include/linux/umh.h | 1 - kernel/umh.c | 10 +++------- 2 files changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/umh.h b/include/linux/umh.h index 73173c4a07e5..244aff638220 100644 --- a/include/linux/umh.h +++ b/include/linux/umh.h @@ -22,7 +22,6 @@ struct subprocess_info { const char *path; char **argv; char **envp; - struct file *file; int wait; int retval; int (*init)(struct subprocess_info *info, struct cred *new); diff --git a/kernel/umh.c b/kernel/umh.c index 3e4e453d45c8..6ca2096298b9 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -98,13 +98,9 @@ static int call_usermodehelper_exec_async(void *data) commit_creds(new); - if (sub_info->file) - retval = do_execve_file(sub_info->file, - sub_info->argv, sub_info->envp); - else - retval = do_execve(getname_kernel(sub_info->path), - (const char __user *const __user *)sub_info->argv, - (const char __user *const __user *)sub_info->envp); + retval = do_execve(getname_kernel(sub_info->path), + (const char __user *const __user *)sub_info->argv, + (const char __user *const __user *)sub_info->envp); out: sub_info->retval = retval; /* -- cgit v1.2.3 From 1c340ead18ee4b4a84357abdef6d4f39ee08328b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 25 Jun 2020 16:48:26 -0500 Subject: umd: Track user space drivers with struct pid Use struct pid instead of user space pid values that are prone to wrap araound. In addition track the entire thread group instead of just the first thread that is started by exec. There are no multi-threaded user mode drivers today but there is nothing preclucing user drivers from being multi-threaded, so it is just a good idea to track the entire process. Take a reference count on the tgid's in question to make it possible to remove exit_umh in a future change. As a struct pid is available directly use kill_pid_info. The prior process signalling code was iffy in using a userspace pid known to be in the initial pid namespace and then looking up it's task in whatever the current pid namespace is. It worked only because kernel threads always run in the initial pid namespace. As the tgid is now refcounted verify the tgid is NULL at the start of fork_usermode_driver to avoid the possibility of silent pid leaks. v1: https://lkml.kernel.org/r/87mu4qdlv2.fsf_-_@x220.int.ebiederm.org v2: https://lkml.kernel.org/r/a70l4oy8.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-12-ebiederm@xmission.com Reviewed-by: Greg Kroah-Hartman Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. Biederman" --- include/linux/usermode_driver.h | 2 +- kernel/exit.c | 3 ++- kernel/usermode_driver.c | 15 ++++++++++----- net/bpfilter/bpfilter_kern.c | 13 +++++-------- net/ipv4/bpfilter/sockopt.c | 3 ++- 5 files changed, 20 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/usermode_driver.h b/include/linux/usermode_driver.h index 97c919b7147c..45adbffb31d9 100644 --- a/include/linux/usermode_driver.h +++ b/include/linux/usermode_driver.h @@ -25,7 +25,7 @@ struct umd_info { struct list_head list; void (*cleanup)(struct umd_info *info); struct path wd; - pid_t pid; + struct pid *tgid; }; int umd_load_blob(struct umd_info *info, const void *data, size_t len); int umd_unload_blob(struct umd_info *info); diff --git a/kernel/exit.c b/kernel/exit.c index a081deea52ca..d3294b611df1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -805,7 +805,8 @@ void __noreturn do_exit(long code) exit_task_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); - exit_umh(tsk); + if (group_dead) + exit_umh(tsk); /* * Flush inherited counters to the parent - before the parent diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c index a86798759f83..f77f8d7ce9e3 100644 --- a/kernel/usermode_driver.c +++ b/kernel/usermode_driver.c @@ -133,7 +133,7 @@ static int umd_setup(struct subprocess_info *info, struct cred *new) set_fs_pwd(current->fs, &umd_info->wd); umd_info->pipe_to_umh = to_umh[1]; umd_info->pipe_from_umh = from_umh[0]; - umd_info->pid = task_pid_nr(current); + umd_info->tgid = get_pid(task_tgid(current)); current->flags |= PF_UMH; return 0; } @@ -146,6 +146,8 @@ static void umd_cleanup(struct subprocess_info *info) if (info->retval) { fput(umd_info->pipe_to_umh); fput(umd_info->pipe_from_umh); + put_pid(umd_info->tgid); + umd_info->tgid = NULL; } } @@ -155,9 +157,9 @@ static void umd_cleanup(struct subprocess_info *info) * * Returns either negative error or zero which indicates success in * executing a usermode driver. In such case 'struct umd_info *info' - * is populated with two pipes and a pid of the process. The caller is + * is populated with two pipes and a tgid of the process. The caller is * responsible for health check of the user process, killing it via - * pid, and closing the pipes when user process is no longer needed. + * tgid, and closing the pipes when user process is no longer needed. */ int fork_usermode_driver(struct umd_info *info) { @@ -165,6 +167,9 @@ int fork_usermode_driver(struct umd_info *info) char **argv = NULL; int err; + if (WARN_ON_ONCE(info->tgid)) + return -EBUSY; + err = -ENOMEM; argv = argv_split(GFP_KERNEL, info->driver_name, NULL); if (!argv) @@ -192,11 +197,11 @@ EXPORT_SYMBOL_GPL(fork_usermode_driver); void __exit_umh(struct task_struct *tsk) { struct umd_info *info; - pid_t pid = tsk->pid; + struct pid *tgid = task_tgid(tsk); mutex_lock(&umh_list_lock); list_for_each_entry(info, &umh_list, list) { - if (info->pid == pid) { + if (info->tgid == tgid) { list_del(&info->list); mutex_unlock(&umh_list_lock); goto out; diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index 28883b00609d..08ea77c2b137 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -15,16 +15,13 @@ extern char bpfilter_umh_end; static void shutdown_umh(void) { - struct task_struct *tsk; + struct umd_info *info = &bpfilter_ops.info; + struct pid *tgid = info->tgid; if (bpfilter_ops.stop) return; - tsk = get_pid_task(find_vpid(bpfilter_ops.info.pid), PIDTYPE_PID); - if (tsk) { - send_sig(SIGKILL, tsk, 1); - put_task_struct(tsk); - } + kill_pid(tgid, SIGKILL, 1); } static void __stop_umh(void) @@ -48,7 +45,7 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname, req.cmd = optname; req.addr = (long __force __user)optval; req.len = optlen; - if (!bpfilter_ops.info.pid) + if (!bpfilter_ops.info.tgid) goto out; n = __kernel_write(bpfilter_ops.info.pipe_to_umh, &req, sizeof(req), &pos); @@ -81,7 +78,7 @@ static int start_umh(void) if (err) return err; bpfilter_ops.stop = false; - pr_info("Loaded bpfilter_umh pid %d\n", bpfilter_ops.info.pid); + pr_info("Loaded bpfilter_umh pid %d\n", pid_nr(bpfilter_ops.info.tgid)); /* health check that usermode process started correctly */ if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, 0) != 0) { diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c index 5050de28333d..56cbc43145f6 100644 --- a/net/ipv4/bpfilter/sockopt.c +++ b/net/ipv4/bpfilter/sockopt.c @@ -18,7 +18,8 @@ static void bpfilter_umh_cleanup(struct umd_info *info) bpfilter_ops.stop = true; fput(info->pipe_to_umh); fput(info->pipe_from_umh); - info->pid = 0; + put_pid(info->tgid); + info->tgid = NULL; mutex_unlock(&bpfilter_ops.lock); } -- cgit v1.2.3 From ff2a91127b374c75ae024b31d22f23ad49d16eb4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 24 May 2020 20:57:00 +0200 Subject: fork: remove do_fork() Now that all architectures have been switched to use _do_fork() and the new struct kernel_clone_args calling convention we can remove the legacy do_fork() helper completely. The calling convention used to be brittle and do_fork() didn't buy us anything. The only calling convention accepted should be based on struct kernel_clone_args going forward. It's cleaner and uniform. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Al Viro Cc: "Matthew Wilcox (Oracle)" Cc: "Peter Zijlstra (Intel)" Signed-off-by: Christian Brauner --- include/linux/sched/task.h | 1 - kernel/fork.c | 25 +------------------------ 2 files changed, 1 insertion(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ddce0ea515d1..9f03c44941fb 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -96,7 +96,6 @@ extern void exit_files(struct task_struct *); extern void exit_itimers(struct signal_struct *); extern long _do_fork(struct kernel_clone_args *kargs); -extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); struct task_struct *fork_idle(int); struct mm_struct *copy_init_mm(void); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); diff --git a/kernel/fork.c b/kernel/fork.c index 9875aeb2ba41..0fd7eb1b38f9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2493,29 +2493,6 @@ long _do_fork(struct kernel_clone_args *args) return nr; } -#ifndef CONFIG_HAVE_COPY_THREAD_TLS -/* For compatibility with architectures that call do_fork directly rather than - * using the syscall entry points below. */ -long do_fork(unsigned long clone_flags, - unsigned long stack_start, - unsigned long stack_size, - int __user *parent_tidptr, - int __user *child_tidptr) -{ - struct kernel_clone_args args = { - .flags = (lower_32_bits(clone_flags) & ~CSIGNAL), - .pidfd = parent_tidptr, - .child_tid = child_tidptr, - .parent_tid = parent_tidptr, - .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL), - .stack = stack_start, - .stack_size = stack_size, - }; - - return _do_fork(&args); -} -#endif - /* * Create a kernel thread. */ @@ -2923,7 +2900,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. copy_* - * functions used by do_fork() cannot be used here directly + * functions used by _do_fork() cannot be used here directly * because they modify an inactive task_struct that is being * constructed. Here we are modifying the current, active, * task_struct. -- cgit v1.2.3 From 140c8180eb7c7cbda399f64474788b86db72db32 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 24 May 2020 23:34:20 +0200 Subject: arch: remove HAVE_COPY_THREAD_TLS All architectures support copy_thread_tls() now, so remove the legacy copy_thread() function and the HAVE_COPY_THREAD_TLS config option. Everyone uses the same process creation calling convention based on copy_thread_tls() and struct kernel_clone_args. This will make it easier to maintain the core process creation code under kernel/, simplifies the callpaths and makes the identical for all architectures. Cc: linux-arch@vger.kernel.org Acked-by: Thomas Bogendoerfer Acked-by: Greentime Hu Acked-by: Geert Uytterhoeven Reviewed-by: Kees Cook Signed-off-by: Christian Brauner --- arch/Kconfig | 7 ------- arch/alpha/Kconfig | 1 - arch/arc/Kconfig | 1 - arch/arm/Kconfig | 1 - arch/arm64/Kconfig | 1 - arch/c6x/Kconfig | 1 - arch/csky/Kconfig | 1 - arch/h8300/Kconfig | 1 - arch/hexagon/Kconfig | 1 - arch/ia64/Kconfig | 1 - arch/m68k/Kconfig | 1 - arch/microblaze/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/nds32/Kconfig | 1 - arch/nios2/Kconfig | 1 - arch/openrisc/Kconfig | 1 - arch/parisc/Kconfig | 1 - arch/powerpc/Kconfig | 1 - arch/riscv/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/sh/Kconfig | 1 - arch/sparc/Kconfig | 1 - arch/um/Kconfig | 1 - arch/unicore32/Kconfig | 1 - arch/x86/Kconfig | 1 - arch/xtensa/Kconfig | 1 - include/linux/sched/task.h | 15 +-------------- kernel/fork.c | 9 --------- 28 files changed, 1 insertion(+), 55 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 8cc35dc556c7..943aac2f3ebe 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -754,13 +754,6 @@ config ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT depends on MMU select ARCH_HAS_ELF_RANDOMIZE -config HAVE_COPY_THREAD_TLS - bool - help - Architecture provides copy_thread_tls to accept tls argument via - normal C parameter passing, rather than extracting the syscall - argument from pt_regs. - config HAVE_STACK_VALIDATION bool help diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index b01515c6b2ed..10862c5a8c76 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -38,7 +38,6 @@ config ALPHA select OLD_SIGSUSPEND select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67 select MMU_GATHER_NO_RANGE - select HAVE_COPY_THREAD_TLS help The Alpha is a 64-bit general-purpose processor designed and marketed by the Digital Equipment Corporation of blessed memory, diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index fddc70029727..1fa0b98ed9ce 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -29,7 +29,6 @@ config ARC select GENERIC_SMP_IDLE_THREAD select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK - select HAVE_COPY_THREAD_TLS select HAVE_DEBUG_STACKOVERFLOW select HAVE_DEBUG_KMEMLEAK select HAVE_FUTEX_CMPXCHG if FUTEX diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 2ac74904a3ce..445b5ed693f0 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -72,7 +72,6 @@ config ARM select HAVE_ARM_SMCCC if CPU_V7 select HAVE_EBPF_JIT if !CPU_ENDIAN_BE32 select HAVE_CONTEXT_TRACKING - select HAVE_COPY_THREAD_TLS select HAVE_C_RECORDMCOUNT select HAVE_DEBUG_KMEMLEAK if !XIP_KERNEL select HAVE_DMA_CONTIGUOUS if MMU diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a4a094bedcb2..de93e965727d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -148,7 +148,6 @@ config ARM64 select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL select HAVE_CONTEXT_TRACKING - select HAVE_COPY_THREAD_TLS select HAVE_DEBUG_BUGVERBOSE select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig index 9cde76a5928e..6444ebfd06a6 100644 --- a/arch/c6x/Kconfig +++ b/arch/c6x/Kconfig @@ -22,7 +22,6 @@ config C6X select GENERIC_CLOCKEVENTS select MODULES_USE_ELF_RELA select MMU_GATHER_NO_RANGE if MMU - select HAVE_COPY_THREAD_TLS config MMU def_bool n diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index bd31ab12f77d..902f1142d550 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -38,7 +38,6 @@ config CSKY select GX6605S_TIMER if CPU_CK610 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_AUDITSYSCALL - select HAVE_COPY_THREAD_TLS select HAVE_DEBUG_BUGVERBOSE select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index de0eb417a0b9..d11666d538fe 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -26,7 +26,6 @@ config H8300 select HAVE_ARCH_HASH select CPU_NO_EFFICIENT_FFS select UACCESS_MEMCPY - select HAVE_COPY_THREAD_TLS config CPU_BIG_ENDIAN def_bool y diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 19bc2f2ee331..667cfc511cf9 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -31,7 +31,6 @@ config HEXAGON select GENERIC_CLOCKEVENTS_BROADCAST select MODULES_USE_ELF_RELA select GENERIC_CPU_DEVICES - select HAVE_COPY_THREAD_TLS help Qualcomm Hexagon is a processor architecture designed for high performance and low power across a wide variety of applications. diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 1b6034b89a04..1fa2fe2ef053 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -55,7 +55,6 @@ config IA64 select HAVE_ARCH_AUDITSYSCALL select NEED_DMA_MAP_STATE select NEED_SG_DMA_LENGTH - select HAVE_COPY_THREAD_TLS select NUMA if !FLATMEM default y help diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 6ad6cdac74b3..6663f1741798 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -14,7 +14,6 @@ config M68K select HAVE_AOUT if MMU select HAVE_ASM_MODVERSIONS select HAVE_DEBUG_BUGVERBOSE - select HAVE_COPY_THREAD_TLS select GENERIC_IRQ_SHOW select GENERIC_ATOMIC64 select HAVE_UID16 diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index e3a211a41880..d262ac0c8714 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -46,7 +46,6 @@ config MICROBLAZE select CPU_NO_EFFICIENT_FFS select MMU_GATHER_NO_RANGE if MMU select SPARSE_IRQ - select HAVE_COPY_THREAD_TLS # Endianness selection choice diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 6fee1a133e9d..ca92c3ed2dc5 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -51,7 +51,6 @@ config MIPS select HAVE_CBPF_JIT if !64BIT && !CPU_MICROMIPS select HAVE_CONTEXT_TRACKING select HAVE_TIF_NOHZ - select HAVE_COPY_THREAD_TLS select HAVE_C_RECORDMCOUNT select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig index 7b6eaca81cce..e30298e99e1b 100644 --- a/arch/nds32/Kconfig +++ b/arch/nds32/Kconfig @@ -48,7 +48,6 @@ config NDS32 select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE - select HAVE_COPY_THREAD_TLS help Andes(nds32) Linux support. diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index f9a05957a883..c6645141bb2a 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -27,7 +27,6 @@ config NIOS2 select USB_ARCH_HAS_HCD if USB_SUPPORT select CPU_NO_EFFICIENT_FFS select MMU_GATHER_NO_RANGE if MMU - select HAVE_COPY_THREAD_TLS config GENERIC_CSUM def_bool y diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index 8588996165ae..7e94fe37cb2f 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -16,7 +16,6 @@ config OPENRISC select HANDLE_DOMAIN_IRQ select GPIOLIB select HAVE_ARCH_TRACEHOOK - select HAVE_COPY_THREAD_TLS select SPARSE_IRQ select GENERIC_IRQ_CHIP select GENERIC_IRQ_PROBE diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 8e4c3708773d..2667eeb6c6f1 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -62,7 +62,6 @@ config PARISC select HAVE_FTRACE_MCOUNT_RECORD if HAVE_DYNAMIC_FTRACE select HAVE_KPROBES_ON_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS - select HAVE_COPY_THREAD_TLS help The PA-RISC microprocessor is designed by Hewlett-Packard and used diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 9fa23eb320ff..3b262d87e9c4 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -186,7 +186,6 @@ config PPC select HAVE_STACKPROTECTOR if PPC32 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2) select HAVE_CONTEXT_TRACKING if PPC64 select HAVE_TIF_NOHZ if PPC64 - select HAVE_COPY_THREAD_TLS select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW select HAVE_DYNAMIC_FTRACE diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 128192e14ff2..f6a3a2bea3d8 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -52,7 +52,6 @@ config RISCV select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_ASM_MODVERSIONS - select HAVE_COPY_THREAD_TLS select HAVE_DMA_CONTIGUOUS if MMU select HAVE_EBPF_JIT if MMU select HAVE_FUTEX_CMPXCHG if FUTEX diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index c7d7ede6300c..959969759453 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -136,7 +136,6 @@ config S390 select HAVE_EBPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL - select HAVE_COPY_THREAD_TLS select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index e10118d61ce7..9fc2b010e938 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -70,7 +70,6 @@ config SUPERH select ARCH_HIBERNATION_POSSIBLE if MMU select SPARSE_IRQ select HAVE_STACKPROTECTOR - select HAVE_COPY_THREAD_TLS help The SuperH is a RISC processor targeted for use in embedded systems and consumer electronics; it was also used in the Sega Dreamcast diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 66213c0cb557..5bf2dc163540 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -48,7 +48,6 @@ config SPARC select LOCKDEP_SMALL if LOCKDEP select NEED_DMA_MAP_STATE select NEED_SG_DMA_LENGTH - select HAVE_COPY_THREAD_TLS config SPARC32 def_bool !64BIT diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 9318dc6d1a0c..ef69be17ff70 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -14,7 +14,6 @@ config UML select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_BUGVERBOSE - select HAVE_COPY_THREAD_TLS select GENERIC_IRQ_SHOW select GENERIC_CPU_DEVICES select GENERIC_CLOCKEVENTS diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index 01451cf500d2..11ba1839d198 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -22,7 +22,6 @@ config UNICORE32 select MODULES_USE_ELF_REL select NEED_DMA_MAP_STATE select MMU_GATHER_NO_RANGE if MMU - select HAVE_COPY_THREAD_TLS help UniCore-32 is 32-bit Instruction Set Architecture, including a series of low-power-consumption RISC chip diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6a0cc524882d..214b8bf39bbe 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -161,7 +161,6 @@ config X86 select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL select HAVE_CONTEXT_TRACKING if X86_64 - select HAVE_COPY_THREAD_TLS select HAVE_C_RECORDMCOUNT select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 3a9f1e80394a..b71ba910d92f 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -24,7 +24,6 @@ config XTENSA select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL select HAVE_ARCH_TRACEHOOK - select HAVE_COPY_THREAD_TLS select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS select HAVE_EXIT_THREAD diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 9f03c44941fb..77cbe14c3034 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -65,22 +65,9 @@ extern void fork_init(void); extern void release_task(struct task_struct * p); -#ifdef CONFIG_HAVE_COPY_THREAD_TLS extern int copy_thread_tls(unsigned long, unsigned long, unsigned long, struct task_struct *, unsigned long); -#else -extern int copy_thread(unsigned long, unsigned long, unsigned long, - struct task_struct *); - -/* Architectures that haven't opted into copy_thread_tls get the tls argument - * via pt_regs, so ignore the tls argument passed via C. */ -static inline int copy_thread_tls( - unsigned long clone_flags, unsigned long sp, unsigned long arg, - struct task_struct *p, unsigned long tls) -{ - return copy_thread(clone_flags, sp, arg, p); -} -#endif + extern void flush_thread(void); #ifdef CONFIG_HAVE_EXIT_THREAD diff --git a/kernel/fork.c b/kernel/fork.c index 0fd7eb1b38f9..8e52e16a1b5e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2577,15 +2577,6 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, #ifdef __ARCH_WANT_SYS_CLONE3 -/* - * copy_thread implementations handle CLONE_SETTLS by reading the TLS value from - * the registers containing the syscall arguments for clone. This doesn't work - * with clone3 since the TLS value is passed in clone_args instead. - */ -#ifndef CONFIG_HAVE_COPY_THREAD_TLS -#error clone3 requires copy_thread_tls support in arch -#endif - noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, size_t usize) -- cgit v1.2.3 From 714acdbd1c94e7e3ab90f6b6938f1ccb27b662f0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 11 Jun 2020 11:04:15 +0200 Subject: arch: rename copy_thread_tls() back to copy_thread() Now that HAVE_COPY_THREAD_TLS has been removed, rename copy_thread_tls() back simply copy_thread(). It's a simpler name, and doesn't imply that only tls is copied here. This finishes an outstanding chunk of internal process creation work since we've added clone3(). Cc: linux-arch@vger.kernel.org Acked-by: Thomas Bogendoerfer A Acked-by: Stafford Horne Acked-by: Greentime Hu Acked-by: Geert Uytterhoeven A Reviewed-by: Kees Cook Signed-off-by: Christian Brauner --- arch/alpha/kernel/process.c | 6 +++--- arch/arc/kernel/process.c | 5 +++-- arch/arm/kernel/process.c | 5 ++--- arch/arm64/kernel/process.c | 2 +- arch/c6x/kernel/process.c | 6 +++--- arch/csky/kernel/process.c | 2 +- arch/h8300/kernel/process.c | 5 ++--- arch/hexagon/kernel/process.c | 4 ++-- arch/ia64/kernel/process.c | 7 +++---- arch/m68k/kernel/process.c | 5 ++--- arch/microblaze/kernel/process.c | 4 ++-- arch/mips/kernel/process.c | 5 +++-- arch/nds32/kernel/process.c | 5 ++--- arch/nios2/kernel/process.c | 4 ++-- arch/openrisc/kernel/process.c | 6 +++--- arch/parisc/kernel/process.c | 2 +- arch/powerpc/kernel/process.c | 2 +- arch/riscv/kernel/process.c | 4 ++-- arch/s390/kernel/process.c | 4 ++-- arch/sh/kernel/process_32.c | 4 ++-- arch/sparc/kernel/process.c | 6 +++--- arch/sparc/kernel/process_32.c | 5 ++--- arch/sparc/kernel/process_64.c | 5 ++--- arch/um/kernel/process.c | 2 +- arch/unicore32/kernel/process.c | 5 ++--- arch/x86/kernel/process.c | 4 ++-- arch/x86/kernel/unwind_frame.c | 2 +- arch/xtensa/kernel/process.c | 2 +- include/linux/sched/task.h | 4 ++-- kernel/fork.c | 3 +-- 30 files changed, 59 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index dfdb6b6ba61c..7462a7911002 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -233,9 +233,9 @@ release_thread(struct task_struct *dead_task) /* * Copy architecture-specific thread state */ -int copy_thread_tls(unsigned long clone_flags, unsigned long usp, - unsigned long kthread_arg, struct task_struct *p, - unsigned long tls) +int copy_thread(unsigned long clone_flags, unsigned long usp, + unsigned long kthread_arg, struct task_struct *p, + unsigned long tls) { extern void ret_from_fork(void); extern void ret_from_kernel_thread(void); diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index 8c8e5172fecd..105420c23c8b 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -173,8 +173,9 @@ asmlinkage void ret_from_fork(void); * | user_r25 | * ------------------ <===== END of PAGE */ -int copy_thread_tls(unsigned long clone_flags, unsigned long usp, - unsigned long kthread_arg, struct task_struct *p, unsigned long tls) +int copy_thread(unsigned long clone_flags, unsigned long usp, + unsigned long kthread_arg, struct task_struct *p, + unsigned long tls) { struct pt_regs *c_regs; /* child's pt_regs */ unsigned long *childksp; /* to unwind out of __switch_to() */ diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 58eaa1f60e16..3395be186c7d 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -225,9 +225,8 @@ void release_thread(struct task_struct *dead_task) asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -int -copy_thread_tls(unsigned long clone_flags, unsigned long stack_start, - unsigned long stk_sz, struct task_struct *p, unsigned long tls) +int copy_thread(unsigned long clone_flags, unsigned long stack_start, + unsigned long stk_sz, struct task_struct *p, unsigned long tls) { struct thread_info *thread = task_thread_info(p); struct pt_regs *childregs = task_pt_regs(p); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 6089638c7d43..84ec630b8ab5 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -375,7 +375,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) asmlinkage void ret_from_fork(void) asm("ret_from_fork"); -int copy_thread_tls(unsigned long clone_flags, unsigned long stack_start, +int copy_thread(unsigned long clone_flags, unsigned long stack_start, unsigned long stk_sz, struct task_struct *p, unsigned long tls) { struct pt_regs *childregs = task_pt_regs(p); diff --git a/arch/c6x/kernel/process.c b/arch/c6x/kernel/process.c index afa3ea9a93aa..9f4fd6a40a10 100644 --- a/arch/c6x/kernel/process.c +++ b/arch/c6x/kernel/process.c @@ -104,9 +104,9 @@ void start_thread(struct pt_regs *regs, unsigned int pc, unsigned long usp) /* * Copy a new thread context in its stack. */ -int copy_thread_tls(unsigned long clone_flags, unsigned long usp, - unsigned long ustk_size, struct task_struct *p, - unsigned long tls) +int copy_thread(unsigned long clone_flags, unsigned long usp, + unsigned long ustk_size, struct task_struct *p, + unsigned long tls) { struct pt_regs *childregs; diff --git a/arch/csky/kernel/process.c b/arch/csky/kernel/process.c index 8b3fad062ab2..28cfeaaf902a 100644 --- a/arch/csky/kernel/process.c +++ b/arch/csky/kernel/process.c @@ -40,7 +40,7 @@ unsigned long thread_saved_pc(struct task_struct *tsk) return sw->r15; } -int copy_thread_tls(unsigned long clone_flags, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long kthread_arg, struct task_struct *p, diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index ae23de4dcf42..83ce3caf7313 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -105,9 +105,8 @@ void flush_thread(void) { } -int copy_thread_tls(unsigned long clone_flags, unsigned long usp, - unsigned long topstk, struct task_struct *p, - unsigned long tls) +int copy_thread(unsigned long clone_flags, unsigned long usp, + unsigned long topstk, struct task_struct *p, unsigned long tls) { struct pt_regs *childregs; diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index d756f9556dd7..d294e71d11d8 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -50,8 +50,8 @@ void arch_cpu_idle(void) /* * Copy architecture-specific thread state */ -int copy_thread_tls(unsigned long clone_flags, unsigned long usp, - unsigned long arg, struct task_struct *p, unsigned long tls) +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, + struct task_struct *p, unsigned long tls) { struct thread_info *ti = task_thread_info(p); struct hexagon_switch_stack *ss; diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 416dca619da5..9cedb56c3fda 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -311,7 +311,7 @@ ia64_load_extra (struct task_struct *task) * * sys_clone : * _do_fork _do_fork - * copy_thread_tls copy_thread_tls + * copy_thread copy_thread * * This means that the stack layout is as follows: * @@ -333,9 +333,8 @@ ia64_load_extra (struct task_struct *task) * so there is nothing to worry about. */ int -copy_thread_tls(unsigned long clone_flags, unsigned long user_stack_base, - unsigned long user_stack_size, struct task_struct *p, - unsigned long tls) +copy_thread(unsigned long clone_flags, unsigned long user_stack_base, + unsigned long user_stack_size, struct task_struct *p, unsigned long tls) { extern char ia64_ret_from_clone; struct switch_stack *child_stack, *stack; diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 0608439ba452..6492a2c54dbc 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -138,9 +138,8 @@ asmlinkage int m68k_clone3(struct pt_regs *regs) return sys_clone3((struct clone_args __user *)regs->d1, regs->d2); } -int copy_thread_tls(unsigned long clone_flags, unsigned long usp, - unsigned long arg, struct task_struct *p, - unsigned long tls) +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, + struct task_struct *p, unsigned long tls) { struct fork_frame { struct switch_stack sw; diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index c2ca9c326510..6cabeab9e2ba 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -54,8 +54,8 @@ void flush_thread(void) { } -int copy_thread_tls(unsigned long clone_flags, unsigned long usp, - unsigned long arg, struct task_struct *p, unsigned long tls) +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, + struct task_struct *p, unsigned long tls) { struct pt_regs *childregs = task_pt_regs(p); struct thread_info *ti = task_thread_info(p); diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index ff5320b79100..f5dc316a826a 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -119,8 +119,9 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) /* * Copy architecture-specific thread state */ -int copy_thread_tls(unsigned long clone_flags, unsigned long usp, - unsigned long kthread_arg, struct task_struct *p, unsigned long tls) +int copy_thread(unsigned long clone_flags, unsigned long usp, + unsigned long kthread_arg, struct task_struct *p, + unsigned long tls) { struct thread_info *ti = task_thread_info(p); struct pt_regs *childregs, *regs = current_pt_regs(); diff --git a/arch/nds32/kernel/process.c b/arch/nds32/kernel/process.c index 7dbb1bf64165..e85bbbadc0e7 100644 --- a/arch/nds32/kernel/process.c +++ b/arch/nds32/kernel/process.c @@ -149,9 +149,8 @@ void flush_thread(void) DEFINE_PER_CPU(struct task_struct *, __entry_task); asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -int copy_thread_tls(unsigned long clone_flags, unsigned long stack_start, - unsigned long stk_sz, struct task_struct *p, - unsigned long tls) +int copy_thread(unsigned long clone_flags, unsigned long stack_start, + unsigned long stk_sz, struct task_struct *p, unsigned long tls) { struct pt_regs *childregs = task_pt_regs(p); diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c index 3dde4d6d8fbe..0a42ab8e4c32 100644 --- a/arch/nios2/kernel/process.c +++ b/arch/nios2/kernel/process.c @@ -100,8 +100,8 @@ void flush_thread(void) { } -int copy_thread_tls(unsigned long clone_flags, unsigned long usp, - unsigned long arg, struct task_struct *p, unsigned long tls) +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, + struct task_struct *p, unsigned long tls) { struct pt_regs *childregs = task_pt_regs(p); struct pt_regs *regs; diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index d7010e72450c..848f74c2c47a 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -116,7 +116,7 @@ void release_thread(struct task_struct *dead_task) extern asmlinkage void ret_from_fork(void); /* - * copy_thread_tls + * copy_thread * @clone_flags: flags * @usp: user stack pointer or fn for kernel thread * @arg: arg to fn for kernel thread; always NULL for userspace thread @@ -147,8 +147,8 @@ extern asmlinkage void ret_from_fork(void); */ int -copy_thread_tls(unsigned long clone_flags, unsigned long usp, - unsigned long arg, struct task_struct *p, unsigned long tls) +copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, + struct task_struct *p, unsigned long tls) { struct pt_regs *userregs; struct pt_regs *kregs; diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index b7abb12edd3a..de6299ff1530 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -208,7 +208,7 @@ arch_initcall(parisc_idle_init); * Copy architecture-specific thread state */ int -copy_thread_tls(unsigned long clone_flags, unsigned long usp, +copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long kthread_arg, struct task_struct *p, unsigned long tls) { struct pt_regs *cregs = &(p->thread.regs); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 4650b9bb217f..794b754deec2 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1593,7 +1593,7 @@ static void setup_ksp_vsid(struct task_struct *p, unsigned long sp) /* * Copy architecture-specific thread state */ -int copy_thread_tls(unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long kthread_arg, struct task_struct *p, unsigned long tls) { diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 824d117cf202..31f39442df72 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -101,8 +101,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) return 0; } -int copy_thread_tls(unsigned long clone_flags, unsigned long usp, - unsigned long arg, struct task_struct *p, unsigned long tls) +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, + struct task_struct *p, unsigned long tls) { struct pt_regs *childregs = task_pt_regs(p); diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index eb6e23ad15a2..b06dec1267d0 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -80,8 +80,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) return 0; } -int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp, - unsigned long arg, struct task_struct *p, unsigned long tls) +int copy_thread(unsigned long clone_flags, unsigned long new_stackp, + unsigned long arg, struct task_struct *p, unsigned long tls) { struct fake_frame { diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index 537a82d80616..b0fefd8f53a6 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -115,8 +115,8 @@ EXPORT_SYMBOL(dump_fpu); asmlinkage void ret_from_fork(void); asmlinkage void ret_from_kernel_thread(void); -int copy_thread_tls(unsigned long clone_flags, unsigned long usp, - unsigned long arg, struct task_struct *p, unsigned long tls) +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, + struct task_struct *p, unsigned long tls) { struct thread_info *ti = task_thread_info(p); struct pt_regs *childregs; diff --git a/arch/sparc/kernel/process.c b/arch/sparc/kernel/process.c index 8bbe62d77b77..5234b5ccc0b9 100644 --- a/arch/sparc/kernel/process.c +++ b/arch/sparc/kernel/process.c @@ -28,7 +28,7 @@ asmlinkage long sparc_fork(struct pt_regs *regs) ret = _do_fork(&args); /* If we get an error and potentially restart the system - * call, we're screwed because copy_thread_tls() clobbered + * call, we're screwed because copy_thread() clobbered * the parent's %o1. So detect that case and restore it * here. */ @@ -53,7 +53,7 @@ asmlinkage long sparc_vfork(struct pt_regs *regs) ret = _do_fork(&args); /* If we get an error and potentially restart the system - * call, we're screwed because copy_thread_tls() clobbered + * call, we're screwed because copy_thread() clobbered * the parent's %o1. So detect that case and restore it * here. */ @@ -99,7 +99,7 @@ asmlinkage long sparc_clone(struct pt_regs *regs) ret = _do_fork(&args); /* If we get an error and potentially restart the system - * call, we're screwed because copy_thread_tls() clobbered + * call, we're screwed because copy_thread() clobbered * the parent's %o1. So detect that case and restore it * here. */ diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 3e1f7b639e9a..bd123f1de2e7 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -273,9 +273,8 @@ clone_stackframe(struct sparc_stackf __user *dst, extern void ret_from_fork(void); extern void ret_from_kernel_thread(void); -int copy_thread_tls(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p, - unsigned long tls) +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, + struct task_struct *p, unsigned long tls) { struct thread_info *ti = task_thread_info(p); struct pt_regs *childregs, *regs = current_pt_regs(); diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 278bf287c4be..04ef19b88632 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -577,9 +577,8 @@ barf: * Parent --> %o0 == childs pid, %o1 == 0 * Child --> %o0 == parents pid, %o1 == 1 */ -int copy_thread_tls(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p, - unsigned long tls) +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, + struct task_struct *p, unsigned long tls) { struct thread_info *t = task_thread_info(p); struct pt_regs *regs = current_pt_regs(); diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index e3a2cf92a373..26b5e243d3fc 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -152,7 +152,7 @@ void fork_handler(void) userspace(¤t->thread.regs.regs, current_thread_info()->aux_fp_regs); } -int copy_thread_tls(unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct * p, unsigned long tls) { void (*handler)(void); diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index 49a305565a53..d5ae5a971c19 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -219,9 +219,8 @@ void release_thread(struct task_struct *dead_task) asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread"); -int copy_thread_tls(unsigned long clone_flags, unsigned long stack_start, - unsigned long stk_sz, struct task_struct *p, - unsigned long tls) +int copy_thread(unsigned long clone_flags, unsigned long stack_start, + unsigned long stk_sz, struct task_struct *p, unsigned long tls) { struct thread_info *thread = task_thread_info(p); struct pt_regs *childregs = task_pt_regs(p); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index f362ce0d5ac0..b35cd50ed0dc 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -121,8 +121,8 @@ static int set_new_tls(struct task_struct *p, unsigned long tls) return do_set_thread_area_64(p, ARCH_SET_FS, tls); } -int copy_thread_tls(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p, unsigned long tls) +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, + struct task_struct *p, unsigned long tls) { struct inactive_task_frame *frame; struct fork_frame *fork_frame; diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 722a85f3b2dd..3070fd6561be 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -269,7 +269,7 @@ bool unwind_next_frame(struct unwind_state *state) /* * kthreads (other than the boot CPU's idle thread) have some * partial regs at the end of their stack which were placed - * there by copy_thread_tls(). But the regs don't have any + * there by copy_thread(). But the regs don't have any * useful information, so we can skip them. * * This user_mode() check is slightly broader than a PF_KTHREAD diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index b7fe6f443b42..397a7de56377 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -201,7 +201,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) * involved. Much simpler to just not copy those live frames across. */ -int copy_thread_tls(unsigned long clone_flags, unsigned long usp_thread_fn, +int copy_thread(unsigned long clone_flags, unsigned long usp_thread_fn, unsigned long thread_fn_arg, struct task_struct *p, unsigned long tls) { diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 77cbe14c3034..b6253f2ea96a 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -65,8 +65,8 @@ extern void fork_init(void); extern void release_task(struct task_struct * p); -extern int copy_thread_tls(unsigned long, unsigned long, unsigned long, - struct task_struct *, unsigned long); +extern int copy_thread(unsigned long, unsigned long, unsigned long, + struct task_struct *, unsigned long); extern void flush_thread(void); diff --git a/kernel/fork.c b/kernel/fork.c index 8e52e16a1b5e..790841eb0a21 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2104,8 +2104,7 @@ static __latent_entropy struct task_struct *copy_process( retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; - retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p, - args->tls); + retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls); if (retval) goto bad_fork_cleanup_io; -- cgit v1.2.3 From 38fd525a4c61e7ecdc9ad4dcbf7b767d0a007962 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 1 Jul 2020 07:30:06 -0500 Subject: exit: Factor thread_group_exited out of pidfd_poll Create an independent helper thread_group_exited which returns true when all threads have passed exit_notify in do_exit. AKA all of the threads are at least zombies and might be dead or completely gone. Create this helper by taking the logic out of pidfd_poll where it is already tested, and adding a READ_ONCE on the read of task->exit_state. I will be changing the user mode driver code to use this same logic to know when a user mode driver needs to be restarted. Place the new helper thread_group_exited in kernel/exit.c and EXPORT it so it can be used by modules. Link: https://lkml.kernel.org/r/20200702164140.4468-13-ebiederm@xmission.com Acked-by: Christian Brauner Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 2 ++ kernel/exit.c | 24 ++++++++++++++++++++++++ kernel/fork.c | 6 +----- 3 files changed, 27 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 0ee5e696c5d8..1bad18a1d8ba 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -674,6 +674,8 @@ static inline int thread_group_empty(struct task_struct *p) #define delay_group_leader(p) \ (thread_group_leader(p) && !thread_group_empty(p)) +extern bool thread_group_exited(struct pid *pid); + extern struct sighand_struct *__lock_task_sighand(struct task_struct *task, unsigned long *flags); diff --git a/kernel/exit.c b/kernel/exit.c index d3294b611df1..dee246c0866f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1713,6 +1713,30 @@ Efault: } #endif +/** + * thread_group_exited - check that a thread group has exited + * @pid: tgid of thread group to be checked. + * + * Test if the thread group represented by tgid has exited (all + * threads are zombies, dead or completely gone). + * + * Return: true if the thread group has exited. false otherwise. + */ +bool thread_group_exited(struct pid *pid) +{ + struct task_struct *task; + bool exited; + + rcu_read_lock(); + task = pid_task(pid, PIDTYPE_PID); + exited = !task || + (READ_ONCE(task->exit_state) && thread_group_empty(task)); + rcu_read_unlock(); + + return exited; +} +EXPORT_SYMBOL(thread_group_exited); + __weak void abort(void) { BUG(); diff --git a/kernel/fork.c b/kernel/fork.c index 142b23645d82..bf215af7a904 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1787,22 +1787,18 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) */ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) { - struct task_struct *task; struct pid *pid = file->private_data; __poll_t poll_flags = 0; poll_wait(file, &pid->wait_pidfd, pts); - rcu_read_lock(); - task = pid_task(pid, PIDTYPE_PID); /* * Inform pollers only when the whole thread group exits. * If the thread group leader exits before all other threads in the * group, then poll(2) should block, similar to the wait(2) family. */ - if (!task || (task->exit_state && thread_group_empty(task))) + if (thread_group_exited(pid)) poll_flags = EPOLLIN | EPOLLRDNORM; - rcu_read_unlock(); return poll_flags; } -- cgit v1.2.3 From 8c2f52663973e643c617663d826e2b0daa008b38 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 25 Jun 2020 17:40:40 -0500 Subject: umd: Remove exit_umh The bpfilter code no longer uses the umd_info.cleanup callback. This callback is what exit_umh exists to call. So remove exit_umh and all of it's associated booking. v1: https://lkml.kernel.org/r/87bll6dlte.fsf_-_@x220.int.ebiederm.org v2: https://lkml.kernel.org/r/87y2o53abg.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-15-ebiederm@xmission.com Reviewed-by: Greg Kroah-Hartman Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. Biederman" --- include/linux/sched.h | 1 - include/linux/usermode_driver.h | 16 ---------------- kernel/exit.c | 3 --- kernel/usermode_driver.c | 28 ---------------------------- 4 files changed, 48 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 59d1e92bb88e..edb2020875ad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1511,7 +1511,6 @@ extern struct pid *cad_pid; #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ -#define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ diff --git a/include/linux/usermode_driver.h b/include/linux/usermode_driver.h index 45adbffb31d9..073a9e0ec07d 100644 --- a/include/linux/usermode_driver.h +++ b/include/linux/usermode_driver.h @@ -4,26 +4,10 @@ #include #include -#ifdef CONFIG_BPFILTER -void __exit_umh(struct task_struct *tsk); - -static inline void exit_umh(struct task_struct *tsk) -{ - if (unlikely(tsk->flags & PF_UMH)) - __exit_umh(tsk); -} -#else -static inline void exit_umh(struct task_struct *tsk) -{ -} -#endif - struct umd_info { const char *driver_name; struct file *pipe_to_umh; struct file *pipe_from_umh; - struct list_head list; - void (*cleanup)(struct umd_info *info); struct path wd; struct pid *tgid; }; diff --git a/kernel/exit.c b/kernel/exit.c index dee246c0866f..39226a018ed7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -63,7 +63,6 @@ #include #include #include -#include #include #include @@ -805,8 +804,6 @@ void __noreturn do_exit(long code) exit_task_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); - if (group_dead) - exit_umh(tsk); /* * Flush inherited counters to the parent - before the parent diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c index f77f8d7ce9e3..cd136f86f799 100644 --- a/kernel/usermode_driver.c +++ b/kernel/usermode_driver.c @@ -9,9 +9,6 @@ #include #include -static LIST_HEAD(umh_list); -static DEFINE_MUTEX(umh_list_lock); - static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name) { struct file_system_type *type; @@ -134,7 +131,6 @@ static int umd_setup(struct subprocess_info *info, struct cred *new) umd_info->pipe_to_umh = to_umh[1]; umd_info->pipe_from_umh = from_umh[0]; umd_info->tgid = get_pid(task_tgid(current)); - current->flags |= PF_UMH; return 0; } @@ -182,11 +178,6 @@ int fork_usermode_driver(struct umd_info *info) goto out; err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); - if (!err) { - mutex_lock(&umh_list_lock); - list_add(&info->list, &umh_list); - mutex_unlock(&umh_list_lock); - } out: if (argv) argv_free(argv); @@ -194,23 +185,4 @@ out: } EXPORT_SYMBOL_GPL(fork_usermode_driver); -void __exit_umh(struct task_struct *tsk) -{ - struct umd_info *info; - struct pid *tgid = task_tgid(tsk); - - mutex_lock(&umh_list_lock); - list_for_each_entry(info, &umh_list, list) { - if (info->tgid == tgid) { - list_del(&info->list); - mutex_unlock(&umh_list_lock); - goto out; - } - } - mutex_unlock(&umh_list_lock); - return; -out: - if (info->cleanup) - info->cleanup(info); -} -- cgit v1.2.3 From 33c326014fe69304244868cc793c2c77be533125 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 29 Jun 2020 08:28:33 -0500 Subject: umd: Stop using split_argv There is exactly one argument so there is nothing to split. All split_argv does now is cause confusion and avoid the need for a cast when passing a "const char *" string to call_usermodehelper_setup. So avoid confusion and the possibility of an odd driver name causing problems by just using a fixed argv array with a cast in the call to call_usermodehelper_setup. v1: https://lkml.kernel.org/r/87sged3a9n.fsf_-_@x220.int.ebiederm.org Link: https://lkml.kernel.org/r/20200702164140.4468-16-ebiederm@xmission.com Acked-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: "Eric W. Biederman" --- kernel/usermode_driver.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c index cd136f86f799..0b35212ffc3d 100644 --- a/kernel/usermode_driver.c +++ b/kernel/usermode_driver.c @@ -160,27 +160,21 @@ static void umd_cleanup(struct subprocess_info *info) int fork_usermode_driver(struct umd_info *info) { struct subprocess_info *sub_info; - char **argv = NULL; + const char *argv[] = { info->driver_name, NULL }; int err; if (WARN_ON_ONCE(info->tgid)) return -EBUSY; err = -ENOMEM; - argv = argv_split(GFP_KERNEL, info->driver_name, NULL); - if (!argv) - goto out; - - sub_info = call_usermodehelper_setup(info->driver_name, argv, NULL, - GFP_KERNEL, + sub_info = call_usermodehelper_setup(info->driver_name, + (char **)argv, NULL, GFP_KERNEL, umd_setup, umd_cleanup, info); if (!sub_info) goto out; err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); out: - if (argv) - argv_free(argv); return err; } EXPORT_SYMBOL_GPL(fork_usermode_driver); -- cgit v1.2.3 From f5836749c9c04a10decd2742845ad4870965fdef Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 6 Jul 2020 16:01:25 -0700 Subject: bpf: Add BPF_CGROUP_INET_SOCK_RELEASE hook Sometimes it's handy to know when the socket gets freed. In particular, we'd like to try to use a smarter allocation of ports for bpf_bind and explore the possibility of limiting the number of SOCK_DGRAM sockets the process can have. Implement BPF_CGROUP_INET_SOCK_RELEASE hook that triggers on inet socket release. It triggers only for userspace sockets (not in-kernel ones) and therefore has the same semantics as the existing BPF_CGROUP_INET_SOCK_CREATE. Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200706230128.4073544-2-sdf@google.com --- include/linux/bpf-cgroup.h | 4 ++++ include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 3 +++ net/core/filter.c | 1 + net/ipv4/af_inet.c | 3 +++ 5 files changed, 12 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index c66c545e161a..2c6f26670acc 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -210,6 +210,9 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE) +#define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) \ + BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_RELEASE) + #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) \ BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND) @@ -401,6 +404,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index da9bf35a26f8..548a749aebb3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -226,6 +226,7 @@ enum bpf_attach_type { BPF_CGROUP_INET4_GETSOCKNAME, BPF_CGROUP_INET6_GETSOCKNAME, BPF_XDP_DEVMAP, + BPF_CGROUP_INET_SOCK_RELEASE, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8da159936bab..156f51ffada2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1981,6 +1981,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_PROG_TYPE_CGROUP_SOCK: switch (expected_attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: return 0; @@ -2779,6 +2780,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: return BPF_PROG_TYPE_CGROUP_SOCK; @@ -2929,6 +2931,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_POST_BIND: diff --git a/net/core/filter.c b/net/core/filter.c index c5e696e6c315..ddcc0d6209e1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6890,6 +6890,7 @@ static bool __sock_filter_check_attach_type(int off, case offsetof(struct bpf_sock, priority): switch (attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: goto full_access; default: return false; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ea6ed6d487ed..ff141d630bdf 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -411,6 +411,9 @@ int inet_release(struct socket *sock) if (sk) { long timeout; + if (!sk->sk_kern_sock) + BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk); + /* Applications forget to leave groups before exiting */ ip_mc_drop_socket(sk); -- cgit v1.2.3 From 42815808f1791eb53c3a05fb78c0a8642ecf8467 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 6 Jul 2020 17:49:09 +0200 Subject: timens: make vdso_join_timens() always succeed As discussed on-list (cf. [1]), in order to make setns() support time namespaces when attaching to multiple namespaces at once properly we need to tweak vdso_join_timens() to always succeed. So switch vdso_join_timens() to using a read lock and replacing mmap_write_lock_killable() to mmap_read_lock() as we discussed. Last cycle setns() was changed to support attaching to multiple namespaces atomically. This requires all namespaces to have a point of no return where they can't fail anymore. Specifically, _install() is allowed to perform permission checks and install the namespace into the new struct nsset that it has been given but it is not allowed to make visible changes to the affected task. Once _install() returns anything that the given namespace type requires to be setup in addition needs to ideally be done in a function that can't fail or if it fails the failure is not fatal. For time namespaces the relevant functions that fall into this category are timens_set_vvar_page() and vdso_join_timens(). Currently the latter can fail but doesn't need to. With this we can go on to implement a timens_commit() helper in a follow up patch to be used by setns(). [1]: https://lore.kernel.org/lkml/20200611110221.pgd3r5qkjrjmfqa2@wittgenstein Signed-off-by: Christian Brauner Reviewed-by: Andrei Vagin Cc: Will Deacon Cc: Vincenzo Frascino Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Mark Rutland Cc: Dmitry Safonov Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20200706154912.3248030-2-christian.brauner@ubuntu.com --- arch/x86/entry/vdso/vma.c | 5 ++--- kernel/time/namespace.c | 10 ++-------- 2 files changed, 4 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index ea7c1f0b79df..9185cb1d13b9 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -144,8 +144,7 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) struct mm_struct *mm = task->mm; struct vm_area_struct *vma; - if (mmap_write_lock_killable(mm)) - return -EINTR; + mmap_read_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) { unsigned long size = vma->vm_end - vma->vm_start; @@ -154,7 +153,7 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) zap_page_range(vma, vma->vm_start, size); } - mmap_write_unlock(mm); + mmap_read_unlock(mm); return 0; } #else diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 5d9fc22d836a..e5af6fe87af8 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -284,7 +284,6 @@ static int timens_install(struct nsset *nsset, struct ns_common *new) { struct nsproxy *nsproxy = nsset->nsproxy; struct time_namespace *ns = to_time_ns(new); - int err; if (!current_is_single_threaded()) return -EUSERS; @@ -295,9 +294,7 @@ static int timens_install(struct nsset *nsset, struct ns_common *new) timens_set_vvar_page(current, ns); - err = vdso_join_timens(current, ns); - if (err) - return err; + vdso_join_timens(current, ns); get_time_ns(ns); put_time_ns(nsproxy->time_ns); @@ -313,7 +310,6 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) { struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; struct time_namespace *ns = to_time_ns(nsc); - int err; /* create_new_namespaces() already incremented the ref counter */ if (nsproxy->time_ns == nsproxy->time_ns_for_children) @@ -321,9 +317,7 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) timens_set_vvar_page(tsk, ns); - err = vdso_join_timens(tsk, ns); - if (err) - return err; + vdso_join_timens(tsk, ns); get_time_ns(ns); put_time_ns(nsproxy->time_ns); -- cgit v1.2.3 From 5cfea9a106bbb1af919ab94456fdd02209984070 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 6 Jul 2020 17:49:10 +0200 Subject: timens: add timens_commit() helper Wrap the calls to timens_set_vvar_page() and vdso_join_timens() in timens_on_fork() and timens_install() in a new timens_commit() helper. We'll use this helper in a follow-up patch in nsproxy too. Signed-off-by: Christian Brauner Reviewed-by: Andrei Vagin Cc: Will Deacon Cc: Vincenzo Frascino Cc: Thomas Gleixner Cc: Catalin Marinas Cc: Mark Rutland Cc: Dmitry Safonov Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20200706154912.3248030-3-christian.brauner@ubuntu.com --- kernel/time/namespace.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index e5af6fe87af8..aa7b90aac2a7 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -280,6 +280,12 @@ static void timens_put(struct ns_common *ns) put_time_ns(to_time_ns(ns)); } +static void timens_commit(struct task_struct *tsk, struct time_namespace *ns) +{ + timens_set_vvar_page(tsk, ns); + vdso_join_timens(tsk, ns); +} + static int timens_install(struct nsset *nsset, struct ns_common *new) { struct nsproxy *nsproxy = nsset->nsproxy; @@ -292,9 +298,8 @@ static int timens_install(struct nsset *nsset, struct ns_common *new) !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; - timens_set_vvar_page(current, ns); - vdso_join_timens(current, ns); + timens_commit(current, ns); get_time_ns(ns); put_time_ns(nsproxy->time_ns); @@ -315,14 +320,12 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) if (nsproxy->time_ns == nsproxy->time_ns_for_children) return 0; - timens_set_vvar_page(tsk, ns); - - vdso_join_timens(tsk, ns); - get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; + timens_commit(tsk, ns); + return 0; } -- cgit v1.2.3 From 76c12881a38aaa83e1eb4ce2fada36c3a732bad4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 6 Jul 2020 17:49:11 +0200 Subject: nsproxy: support CLONE_NEWTIME with setns() So far setns() was missing time namespace support. This was partially due to it simply not being implemented but also because vdso_join_timens() could still fail which made switching to multiple namespaces atomically problematic. This is now fixed so support CLONE_NEWTIME with setns() Signed-off-by: Christian Brauner Reviewed-by: Andrei Vagin Cc: Thomas Gleixner Cc: Michael Kerrisk Cc: Serge Hallyn Cc: Dmitry Safonov Link: https://lore.kernel.org/r/20200706154912.3248030-4-christian.brauner@ubuntu.com --- include/linux/time_namespace.h | 6 ++++++ kernel/nsproxy.c | 21 +++++++++++++++++++-- kernel/time/namespace.c | 5 +---- 3 files changed, 26 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 824d54e057eb..5b6031385db0 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -33,6 +33,7 @@ extern struct time_namespace init_time_ns; #ifdef CONFIG_TIME_NS extern int vdso_join_timens(struct task_struct *task, struct time_namespace *ns); +extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns); static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { @@ -96,6 +97,11 @@ static inline int vdso_join_timens(struct task_struct *task, return 0; } +static inline void timens_commit(struct task_struct *tsk, + struct time_namespace *ns) +{ +} + static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { return NULL; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index cd356630a311..12dd41b39a7f 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -262,8 +262,8 @@ void exit_task_namespaces(struct task_struct *p) static int check_setns_flags(unsigned long flags) { if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET | CLONE_NEWUSER | CLONE_NEWPID | - CLONE_NEWCGROUP))) + CLONE_NEWNET | CLONE_NEWTIME | CLONE_NEWUSER | + CLONE_NEWPID | CLONE_NEWCGROUP))) return -EINVAL; #ifndef CONFIG_USER_NS @@ -290,6 +290,10 @@ static int check_setns_flags(unsigned long flags) if (flags & CLONE_NEWNET) return -EINVAL; #endif +#ifndef CONFIG_TIME_NS + if (flags & CLONE_NEWTIME) + return -EINVAL; +#endif return 0; } @@ -464,6 +468,14 @@ static int validate_nsset(struct nsset *nsset, struct pid *pid) } #endif +#ifdef CONFIG_TIME_NS + if (flags & CLONE_NEWTIME) { + ret = validate_ns(nsset, &nsp->time_ns->ns); + if (ret) + goto out; + } +#endif + out: if (pid_ns) put_pid_ns(pid_ns); @@ -507,6 +519,11 @@ static void commit_nsset(struct nsset *nsset) exit_sem(me); #endif +#ifdef CONFIG_TIME_NS + if (flags & CLONE_NEWTIME) + timens_commit(me, nsset->nsproxy->time_ns); +#endif + /* transfer ownership */ switch_task_namespaces(me, nsset->nsproxy); nsset->nsproxy = NULL; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index aa7b90aac2a7..afc65e6be33e 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -280,7 +280,7 @@ static void timens_put(struct ns_common *ns) put_time_ns(to_time_ns(ns)); } -static void timens_commit(struct task_struct *tsk, struct time_namespace *ns) +void timens_commit(struct task_struct *tsk, struct time_namespace *ns) { timens_set_vvar_page(tsk, ns); vdso_join_timens(tsk, ns); @@ -298,9 +298,6 @@ static int timens_install(struct nsset *nsset, struct ns_common *new) !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; - - timens_commit(current, ns); - get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; -- cgit v1.2.3 From ff9ff926889dd8026b4ba55266a010c27f68604f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:21 -0700 Subject: perf/core: Factor out functions to allocate/free the task_ctx_data The method to allocate/free the task_ctx_data is going to be changed in the following patch. Currently, the task_ctx_data is allocated/freed in several different places. To avoid repeatedly modifying the same codes in several different places, alloc_task_ctx_data() and free_task_ctx_data() are factored out to allocate/free the task_ctx_data. The modification only needs to be applied once. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-16-git-send-email-kan.liang@linux.intel.com --- kernel/events/core.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9b8f92500833..75090403f942 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1238,12 +1238,22 @@ static void get_ctx(struct perf_event_context *ctx) refcount_inc(&ctx->refcount); } +static void *alloc_task_ctx_data(struct pmu *pmu) +{ + return kzalloc(pmu->task_ctx_size, GFP_KERNEL); +} + +static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) +{ + kfree(task_ctx_data); +} + static void free_ctx(struct rcu_head *head) { struct perf_event_context *ctx; ctx = container_of(head, struct perf_event_context, rcu_head); - kfree(ctx->task_ctx_data); + free_task_ctx_data(ctx->pmu, ctx->task_ctx_data); kfree(ctx); } @@ -4471,7 +4481,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, goto errout; if (event->attach_state & PERF_ATTACH_TASK_DATA) { - task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL); + task_ctx_data = alloc_task_ctx_data(pmu); if (!task_ctx_data) { err = -ENOMEM; goto errout; @@ -4529,11 +4539,11 @@ retry: } } - kfree(task_ctx_data); + free_task_ctx_data(pmu, task_ctx_data); return ctx; errout: - kfree(task_ctx_data); + free_task_ctx_data(pmu, task_ctx_data); return ERR_PTR(err); } @@ -12497,8 +12507,7 @@ inherit_event(struct perf_event *parent_event, !child_ctx->task_ctx_data) { struct pmu *pmu = child_event->pmu; - child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size, - GFP_KERNEL); + child_ctx->task_ctx_data = alloc_task_ctx_data(pmu); if (!child_ctx->task_ctx_data) { free_event(child_event); return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 217c2a633ebb36f1cc6d249f4ef2e4a809d46818 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:22 -0700 Subject: perf/core: Use kmem_cache to allocate the PMU specific data Currently, the PMU specific data task_ctx_data is allocated by the function kzalloc() in the perf generic code. When there is no specific alignment requirement for the task_ctx_data, the method works well for now. However, there will be a problem once a specific alignment requirement is introduced in future features, e.g., the Architecture LBR XSAVE feature requires 64-byte alignment. If the specific alignment requirement is not fulfilled, the XSAVE family of instructions will fail to save/restore the xstate to/from the task_ctx_data. The function kzalloc() itself only guarantees a natural alignment. A new method to allocate the task_ctx_data has to be introduced, which has to meet the requirements as below: - must be a generic method can be used by different architectures, because the allocation of the task_ctx_data is implemented in the perf generic code; - must be an alignment-guarantee method (The alignment requirement is not changed after the boot); - must be able to allocate/free a buffer (smaller than a page size) dynamically; - should not cause extra CPU overhead or space overhead. Several options were considered as below: - One option is to allocate a larger buffer for task_ctx_data. E.g., ptr = kmalloc(size + alignment, GFP_KERNEL); ptr &= ~(alignment - 1); This option causes space overhead. - Another option is to allocate the task_ctx_data in the PMU specific code. To do so, several function pointers have to be added. As a result, both the generic structure and the PMU specific structure will become bigger. Besides, extra function calls are added when allocating/freeing the buffer. This option will increase both the space overhead and CPU overhead. - The third option is to use a kmem_cache to allocate a buffer for the task_ctx_data. The kmem_cache can be created with a specific alignment requirement by the PMU at boot time. A new pointer for kmem_cache has to be added in the generic struct pmu, which would be used to dynamically allocate a buffer for the task_ctx_data at run time. Although the new pointer is added to the struct pmu, the existing variable task_ctx_size is not required anymore. The size of the generic structure is kept the same. The third option which meets all the aforementioned requirements is used to replace kzalloc() for the PMU specific data allocation. A later patch will remove the kzalloc() method and the related variables. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-17-git-send-email-kan.liang@linux.intel.com --- include/linux/perf_event.h | 5 +++++ kernel/events/core.c | 8 +++++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 46fe5cfb5163..09915ae06d28 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -424,6 +424,11 @@ struct pmu { */ size_t task_ctx_size; + /* + * Kmem cache of PMU specific data + */ + struct kmem_cache *task_ctx_cache; + /* * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data) * can be synchronized using this function. See Intel LBR callstack support diff --git a/kernel/events/core.c b/kernel/events/core.c index 75090403f942..30d9b3182369 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1240,12 +1240,18 @@ static void get_ctx(struct perf_event_context *ctx) static void *alloc_task_ctx_data(struct pmu *pmu) { + if (pmu->task_ctx_cache) + return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL); + return kzalloc(pmu->task_ctx_size, GFP_KERNEL); } static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) { - kfree(task_ctx_data); + if (pmu->task_ctx_cache && task_ctx_data) + kmem_cache_free(pmu->task_ctx_cache, task_ctx_data); + else + kfree(task_ctx_data); } static void free_ctx(struct rcu_head *head) -- cgit v1.2.3 From 5a09928d339f3cf0973991ddc3a2798825c84c99 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:24 -0700 Subject: perf/x86: Remove task_ctx_size A new kmem_cache method has replaced the kzalloc() to allocate the PMU specific data. The task_ctx_size is not required anymore. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-19-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/core.c | 1 - arch/x86/events/intel/lbr.c | 1 - include/linux/perf_event.h | 4 ---- kernel/events/core.c | 4 +--- 4 files changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index d740c861724c..6b1228ae007d 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2371,7 +2371,6 @@ static struct pmu pmu = { .event_idx = x86_pmu_event_idx, .sched_task = x86_pmu_sched_task, - .task_ctx_size = sizeof(struct x86_perf_task_context), .swap_task_ctx = x86_pmu_swap_task_ctx, .check_period = x86_pmu_check_period, diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index e784c1d485ca..3ad528996d1c 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1672,7 +1672,6 @@ void __init intel_pmu_arch_lbr_init(void) size = sizeof(struct x86_perf_task_context_arch_lbr) + lbr_nr * sizeof(struct lbr_entry); - x86_get_pmu()->task_ctx_size = size; x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 09915ae06d28..3b22db08b6fb 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -419,10 +419,6 @@ struct pmu { */ void (*sched_task) (struct perf_event_context *ctx, bool sched_in); - /* - * PMU specific data size - */ - size_t task_ctx_size; /* * Kmem cache of PMU specific data diff --git a/kernel/events/core.c b/kernel/events/core.c index 30d9b3182369..7c436d705fbd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1243,15 +1243,13 @@ static void *alloc_task_ctx_data(struct pmu *pmu) if (pmu->task_ctx_cache) return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL); - return kzalloc(pmu->task_ctx_size, GFP_KERNEL); + return NULL; } static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) { if (pmu->task_ctx_cache && task_ctx_data) kmem_cache_free(pmu->task_ctx_cache, task_ctx_data); - else - kfree(task_ctx_data); } static void free_ctx(struct rcu_head *head) -- cgit v1.2.3 From 85c2ce9104eb93517db2037699471c517e81f9b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 30 Jun 2020 16:49:05 +0200 Subject: sched, vmlinux.lds: Increase STRUCT_ALIGNMENT to 64 bytes for GCC-4.9 For some mysterious reason GCC-4.9 has a 64 byte section alignment for structures, all other GCC versions (and Clang) tested (including 4.8 and 5.0) are fine with the 32 bytes alignment. Getting this right is important for the new SCHED_DATA macro that creates an explicitly ordered array of 'struct sched_class' in the linker script and expect pointer arithmetic to work. Fixes: c3a340f7e7ea ("sched: Have sched_class_highest define by vmlinux.lds.h") Reported-by: kernel test robot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200630144905.GX4817@hirez.programming.kicks-ass.net --- include/asm-generic/vmlinux.lds.h | 18 +++++++++++------- kernel/sched/sched.h | 3 ++- 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 66fb84c3dc7e..3ceb4b7279ec 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -108,6 +108,17 @@ #define SBSS_MAIN .sbss #endif +/* + * GCC 4.5 and later have a 32 bytes section alignment for structures. + * Except GCC 4.9, that feels the need to align on 64 bytes. + */ +#if __GNUC__ == 4 && __GNUC_MINOR__ == 9 +#define STRUCT_ALIGNMENT 64 +#else +#define STRUCT_ALIGNMENT 32 +#endif +#define STRUCT_ALIGN() . = ALIGN(STRUCT_ALIGNMENT) + /* * The order of the sched class addresses are important, as they are * used to determine the order of the priority of each sched class in @@ -123,13 +134,6 @@ *(__stop_sched_class) \ __end_sched_classes = .; -/* - * Align to a 32 byte boundary equal to the - * alignment gcc 4.5 uses for a struct - */ -#define STRUCT_ALIGNMENT 32 -#define STRUCT_ALIGN() . = ALIGN(STRUCT_ALIGNMENT) - /* The actual configuration determine if the init/exit sections * are handled as text/data or they can be discarded (which * often happens at runtime) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5aa6661ecaf1..9bef2dd01247 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -67,6 +67,7 @@ #include #include +#include #ifdef CONFIG_PARAVIRT # include @@ -1810,7 +1811,7 @@ struct sched_class { #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_change_group)(struct task_struct *p, int type); #endif -} __aligned(32); /* STRUCT_ALIGN(), vmlinux.lds.h */ +} __aligned(STRUCT_ALIGNMENT); /* STRUCT_ALIGN(), vmlinux.lds.h */ static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { -- cgit v1.2.3 From d81ae8aac85ca2e307d273f6dc7863a721bf054e Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 30 Jun 2020 12:21:22 +0100 Subject: sched/uclamp: Fix initialization of struct uclamp_rq struct uclamp_rq was zeroed out entirely in assumption that in the first call to uclamp_rq_inc() they'd be initialized correctly in accordance to default settings. But when next patch introduces a static key to skip uclamp_rq_{inc,dec}() until userspace opts in to use uclamp, schedutil will fail to perform any frequency changes because the rq->uclamp[UCLAMP_MAX].value is zeroed at init and stays as such. Which means all rqs are capped to 0 by default. Fix it by making sure we do proper initialization at init without relying on uclamp_rq_inc() doing it later. Fixes: 69842cba9ace ("sched/uclamp: Add CPU's clamp buckets refcounting") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Lukasz Luba Link: https://lkml.kernel.org/r/20200630112123.12076-2-qais.yousef@arm.com --- kernel/sched/core.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 15c980af63db..9605db70e671 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1239,6 +1239,20 @@ static void uclamp_fork(struct task_struct *p) } } +static void __init init_uclamp_rq(struct rq *rq) +{ + enum uclamp_id clamp_id; + struct uclamp_rq *uc_rq = rq->uclamp; + + for_each_clamp_id(clamp_id) { + uc_rq[clamp_id] = (struct uclamp_rq) { + .value = uclamp_none(clamp_id) + }; + } + + rq->uclamp_flags = 0; +} + static void __init init_uclamp(void) { struct uclamp_se uc_max = {}; @@ -1247,11 +1261,8 @@ static void __init init_uclamp(void) mutex_init(&uclamp_mutex); - for_each_possible_cpu(cpu) { - memset(&cpu_rq(cpu)->uclamp, 0, - sizeof(struct uclamp_rq)*UCLAMP_CNT); - cpu_rq(cpu)->uclamp_flags = 0; - } + for_each_possible_cpu(cpu) + init_uclamp_rq(cpu_rq(cpu)); for_each_clamp_id(clamp_id) { uclamp_se_set(&init_task.uclamp_req[clamp_id], -- cgit v1.2.3 From 46609ce227039fd192e0ecc7d940bed587fd2c78 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 30 Jun 2020 12:21:23 +0100 Subject: sched/uclamp: Protect uclamp fast path code with static key There is a report that when uclamp is enabled, a netperf UDP test regresses compared to a kernel compiled without uclamp. https://lore.kernel.org/lkml/20200529100806.GA3070@suse.de/ While investigating the root cause, there were no sign that the uclamp code is doing anything particularly expensive but could suffer from bad cache behavior under certain circumstances that are yet to be understood. https://lore.kernel.org/lkml/20200616110824.dgkkbyapn3io6wik@e107158-lin/ To reduce the pressure on the fast path anyway, add a static key that is by default will skip executing uclamp logic in the enqueue/dequeue_task() fast path until it's needed. As soon as the user start using util clamp by: 1. Changing uclamp value of a task with sched_setattr() 2. Modifying the default sysctl_sched_util_clamp_{min, max} 3. Modifying the default cpu.uclamp.{min, max} value in cgroup We flip the static key now that the user has opted to use util clamp. Effectively re-introducing uclamp logic in the enqueue/dequeue_task() fast path. It stays on from that point forward until the next reboot. This should help minimize the effect of util clamp on workloads that don't need it but still allow distros to ship their kernels with uclamp compiled in by default. SCHED_WARN_ON() in uclamp_rq_dec_id() was removed since now we can end up with unbalanced call to uclamp_rq_dec_id() if we flip the key while a task is running in the rq. Since we know it is harmless we just quietly return if we attempt a uclamp_rq_dec_id() when rq->uclamp[].bucket[].tasks is 0. In schedutil, we introduce a new uclamp_is_enabled() helper which takes the static key into account to ensure RT boosting behavior is retained. The following results demonstrates how this helps on 2 Sockets Xeon E5 2x10-Cores system. nouclamp uclamp uclamp-static-key Hmean send-64 162.43 ( 0.00%) 157.84 * -2.82%* 163.39 * 0.59%* Hmean send-128 324.71 ( 0.00%) 314.78 * -3.06%* 326.18 * 0.45%* Hmean send-256 641.55 ( 0.00%) 628.67 * -2.01%* 648.12 * 1.02%* Hmean send-1024 2525.28 ( 0.00%) 2448.26 * -3.05%* 2543.73 * 0.73%* Hmean send-2048 4836.14 ( 0.00%) 4712.08 * -2.57%* 4867.69 * 0.65%* Hmean send-3312 7540.83 ( 0.00%) 7425.45 * -1.53%* 7621.06 * 1.06%* Hmean send-4096 9124.53 ( 0.00%) 8948.82 * -1.93%* 9276.25 * 1.66%* Hmean send-8192 15589.67 ( 0.00%) 15486.35 * -0.66%* 15819.98 * 1.48%* Hmean send-16384 26386.47 ( 0.00%) 25752.25 * -2.40%* 26773.74 * 1.47%* The perf diff between nouclamp and uclamp-static-key when uclamp is disabled in the fast path: 8.73% -1.55% [kernel.kallsyms] [k] try_to_wake_up 0.07% +0.04% [kernel.kallsyms] [k] deactivate_task 0.13% -0.02% [kernel.kallsyms] [k] activate_task The diff between nouclamp and uclamp-static-key when uclamp is enabled in the fast path: 8.73% -0.72% [kernel.kallsyms] [k] try_to_wake_up 0.13% +0.39% [kernel.kallsyms] [k] activate_task 0.07% +0.38% [kernel.kallsyms] [k] deactivate_task Fixes: 69842cba9ace ("sched/uclamp: Add CPU's clamp buckets refcounting") Reported-by: Mel Gorman Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Tested-by: Lukasz Luba Link: https://lkml.kernel.org/r/20200630112123.12076-3-qais.yousef@arm.com --- kernel/sched/core.c | 74 +++++++++++++++++++++++++++++++++++++++- kernel/sched/cpufreq_schedutil.c | 2 +- kernel/sched/sched.h | 47 +++++++++++++++++++++++-- 3 files changed, 119 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9605db70e671..4cf30e4de653 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -796,6 +796,26 @@ unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; /* All clamps are required to be less or equal than these values */ static struct uclamp_se uclamp_default[UCLAMP_CNT]; +/* + * This static key is used to reduce the uclamp overhead in the fast path. It + * primarily disables the call to uclamp_rq_{inc, dec}() in + * enqueue/dequeue_task(). + * + * This allows users to continue to enable uclamp in their kernel config with + * minimum uclamp overhead in the fast path. + * + * As soon as userspace modifies any of the uclamp knobs, the static key is + * enabled, since we have an actual users that make use of uclamp + * functionality. + * + * The knobs that would enable this static key are: + * + * * A task modifying its uclamp value with sched_setattr(). + * * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs. + * * An admin modifying the cgroup cpu.uclamp.{min, max} + */ +DEFINE_STATIC_KEY_FALSE(sched_uclamp_used); + /* Integer rounded range for each bucket */ #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) @@ -992,10 +1012,38 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, lockdep_assert_held(&rq->lock); + /* + * If sched_uclamp_used was enabled after task @p was enqueued, + * we could end up with unbalanced call to uclamp_rq_dec_id(). + * + * In this case the uc_se->active flag should be false since no uclamp + * accounting was performed at enqueue time and we can just return + * here. + * + * Need to be careful of the following enqeueue/dequeue ordering + * problem too + * + * enqueue(taskA) + * // sched_uclamp_used gets enabled + * enqueue(taskB) + * dequeue(taskA) + * // Must not decrement bukcet->tasks here + * dequeue(taskB) + * + * where we could end up with stale data in uc_se and + * bucket[uc_se->bucket_id]. + * + * The following check here eliminates the possibility of such race. + */ + if (unlikely(!uc_se->active)) + return; + bucket = &uc_rq->bucket[uc_se->bucket_id]; + SCHED_WARN_ON(!bucket->tasks); if (likely(bucket->tasks)) bucket->tasks--; + uc_se->active = false; /* @@ -1023,6 +1071,15 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { enum uclamp_id clamp_id; + /* + * Avoid any overhead until uclamp is actually used by the userspace. + * + * The condition is constructed such that a NOP is generated when + * sched_uclamp_used is disabled. + */ + if (!static_branch_unlikely(&sched_uclamp_used)) + return; + if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -1038,6 +1095,15 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { enum uclamp_id clamp_id; + /* + * Avoid any overhead until uclamp is actually used by the userspace. + * + * The condition is constructed such that a NOP is generated when + * sched_uclamp_used is disabled. + */ + if (!static_branch_unlikely(&sched_uclamp_used)) + return; + if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -1146,8 +1212,10 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, update_root_tg = true; } - if (update_root_tg) + if (update_root_tg) { + static_branch_enable(&sched_uclamp_used); uclamp_update_root_tg(); + } /* * We update all RUNNABLE tasks only when task groups are in use. @@ -1212,6 +1280,8 @@ static void __setscheduler_uclamp(struct task_struct *p, if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) return; + static_branch_enable(&sched_uclamp_used); + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], attr->sched_util_min, true); @@ -7436,6 +7506,8 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, if (req.ret) return req.ret; + static_branch_enable(&sched_uclamp_used); + mutex_lock(&uclamp_mutex); rcu_read_lock(); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 7fbaee24c824..dc6835bc6490 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -210,7 +210,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, unsigned long dl_util, util, irq; struct rq *rq = cpu_rq(cpu); - if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) && + if (!uclamp_is_used() && type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { return max; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9bef2dd01247..b1432f608061 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -878,6 +878,8 @@ struct uclamp_rq { unsigned int value; struct uclamp_bucket bucket[UCLAMP_BUCKETS]; }; + +DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); #endif /* CONFIG_UCLAMP_TASK */ /* @@ -2365,12 +2367,35 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #ifdef CONFIG_UCLAMP_TASK unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); +/** + * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values. + * @rq: The rq to clamp against. Must not be NULL. + * @util: The util value to clamp. + * @p: The task to clamp against. Can be NULL if you want to clamp + * against @rq only. + * + * Clamps the passed @util to the max(@rq, @p) effective uclamp values. + * + * If sched_uclamp_used static key is disabled, then just return the util + * without any clamping since uclamp aggregation at the rq level in the fast + * path is disabled, rendering this operation a NOP. + * + * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It + * will return the correct effective uclamp value of the task even if the + * static key is disabled. + */ static __always_inline unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, struct task_struct *p) { - unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); - unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + unsigned long min_util; + unsigned long max_util; + + if (!static_branch_likely(&sched_uclamp_used)) + return util; + + min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); + max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); if (p) { min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN)); @@ -2387,6 +2412,19 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, return clamp(util, min_util, max_util); } + +/* + * When uclamp is compiled in, the aggregation at rq level is 'turned off' + * by default in the fast path and only gets turned on once userspace performs + * an operation that requires it. + * + * Returns true if userspace opted-in to use uclamp and aggregation at rq level + * hence is active. + */ +static inline bool uclamp_is_used(void) +{ + return static_branch_likely(&sched_uclamp_used); +} #else /* CONFIG_UCLAMP_TASK */ static inline unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, @@ -2394,6 +2432,11 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, { return util; } + +static inline bool uclamp_is_used(void) +{ + return false; +} #endif /* CONFIG_UCLAMP_TASK */ #ifdef arch_scale_freq_capacity -- cgit v1.2.3 From 9d246053a69196c7c27068870e9b4b66ac536f68 Mon Sep 17 00:00:00 2001 From: Phil Auld Date: Mon, 29 Jun 2020 15:23:03 -0400 Subject: sched: Add a tracepoint to track rq->nr_running Add a bare tracepoint trace_sched_update_nr_running_tp which tracks ->nr_running CPU's rq. This is used to accurately trace this data and provide a visualization of scheduler imbalances in, for example, the form of a heat map. The tracepoint is accessed by loading an external kernel module. An example module (forked from Qais' module and including the pelt related tracepoints) can be found at: https://github.com/auldp/tracepoints-helpers.git A script to turn the trace-cmd report output into a heatmap plot can be found at: https://github.com/jirvoz/plot-nr-running The tracepoints are added to add_nr_running() and sub_nr_running() which are in kernel/sched/sched.h. In order to avoid CREATE_TRACE_POINTS in the header a wrapper call is used and the trace/events/sched.h include is moved before sched.h in kernel/sched/core. Signed-off-by: Phil Auld Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200629192303.GC120228@lorien.usersys.redhat.com --- include/linux/sched.h | 1 + include/trace/events/sched.h | 4 ++++ kernel/sched/core.c | 13 +++++++++---- kernel/sched/fair.c | 8 ++++++-- kernel/sched/pelt.c | 2 -- kernel/sched/sched.h | 10 ++++++++++ 6 files changed, 30 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 683372943093..12b10ce51a08 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2044,6 +2044,7 @@ const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq); const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq); int sched_trace_rq_cpu(struct rq *rq); +int sched_trace_rq_nr_running(struct rq *rq); const struct cpumask *sched_trace_rd_span(struct root_domain *rd); diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 04f9a4c7b0d9..0d5ff0958d48 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -642,6 +642,10 @@ DECLARE_TRACE(sched_util_est_se_tp, TP_PROTO(struct sched_entity *se), TP_ARGS(se)); +DECLARE_TRACE(sched_update_nr_running_tp, + TP_PROTO(struct rq *rq, int change), + TP_ARGS(rq, change)); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4cf30e4de653..ff0519551188 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6,6 +6,10 @@ * * Copyright (C) 1991-2002 Linus Torvalds */ +#define CREATE_TRACE_POINTS +#include +#undef CREATE_TRACE_POINTS + #include "sched.h" #include @@ -23,9 +27,6 @@ #include "pelt.h" #include "smp.h" -#define CREATE_TRACE_POINTS -#include - /* * Export tracepoints that act as a bare tracehook (ie: have no trace event * associated with them) to allow external modules to probe them. @@ -38,6 +39,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -8195,4 +8197,7 @@ const u32 sched_prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -#undef CREATE_TRACE_POINTS +void call_trace_sched_update_nr_running(struct rq *rq, int count) +{ + trace_sched_update_nr_running_tp(rq, count); +} diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6fab1d17c575..3213cb247aff 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -22,8 +22,6 @@ */ #include "sched.h" -#include - /* * Targeted preemption latency for CPU-bound tasks: * @@ -11296,3 +11294,9 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd) #endif } EXPORT_SYMBOL_GPL(sched_trace_rd_span); + +int sched_trace_rq_nr_running(struct rq *rq) +{ + return rq ? rq->nr_running : -1; +} +EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running); diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 11bea3b08115..2c613e1cff3a 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -28,8 +28,6 @@ #include "sched.h" #include "pelt.h" -#include - /* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b1432f608061..65b72e0487bf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -76,6 +76,8 @@ #include "cpupri.h" #include "cpudeadline.h" +#include + #ifdef CONFIG_SCHED_DEBUG # define SCHED_WARN_ON(x) WARN_ONCE(x, #x) #else @@ -97,6 +99,7 @@ extern atomic_long_t calc_load_tasks; extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq, long adjust); +extern void call_trace_sched_update_nr_running(struct rq *rq, int count); /* * Helpers for converting nanosecond timing to jiffy resolution */ @@ -1973,6 +1976,9 @@ static inline void add_nr_running(struct rq *rq, unsigned count) unsigned prev_nr = rq->nr_running; rq->nr_running = prev_nr + count; + if (trace_sched_update_nr_running_tp_enabled()) { + call_trace_sched_update_nr_running(rq, count); + } #ifdef CONFIG_SMP if (prev_nr < 2 && rq->nr_running >= 2) { @@ -1987,6 +1993,10 @@ static inline void add_nr_running(struct rq *rq, unsigned count) static inline void sub_nr_running(struct rq *rq, unsigned count) { rq->nr_running -= count; + if (trace_sched_update_nr_running_tp_enabled()) { + call_trace_sched_update_nr_running(rq, count); + } + /* Check if we still need preemption */ sched_update_tick_dependency(rq); } -- cgit v1.2.3 From 05eee619ed61c8cd89633954d38c4e5653086845 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 23 Oct 2019 19:16:22 +0800 Subject: x86/kvm: Add "nopvspin" parameter to disable PV spinlocks There are cases where a guest tries to switch spinlocks to bare metal behavior (e.g. by setting "xen_nopvspin" on XEN platform and "hv_nopvspin" on HYPER_V). That feature is missed on KVM, add a new parameter "nopvspin" to disable PV spinlocks for KVM guest. The new 'nopvspin' parameter will also replace Xen and Hyper-V specific parameters in future patches. Define variable nopvsin as global because it will be used in future patches as above. Signed-off-by: Zhenzhong Duan Reviewed-by: Vitaly Kuznetsov Cc: Jonathan Corbet Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Sean Christopherson Cc: Vitaly Kuznetsov Cc: Wanpeng Li Cc: Jim Mattson Cc: Joerg Roedel Cc: Peter Zijlstra Cc: Will Deacon Signed-off-by: Paolo Bonzini --- Documentation/admin-guide/kernel-parameters.txt | 5 ++++ arch/x86/include/asm/qspinlock.h | 1 + arch/x86/kernel/kvm.c | 39 ++++++++++++++++++++----- kernel/locking/qspinlock.c | 7 +++++ 4 files changed, 45 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index fb95fad81c79..6a8934ffdaf6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5739,6 +5739,11 @@ as generic guest with no PV drivers. Currently support XEN HVM, KVM, HYPER_V and VMWARE guest. + nopvspin [X86,KVM] + Disables the qspinlock slow path using PV optimizations + which allow the hypervisor to 'idle' the guest on lock + contention. + xirc2ps_cs= [NET,PCMCIA] Format: ,,,,,[,[,[,]]] diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index 444d6fd9a6d8..d86ab942219c 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -32,6 +32,7 @@ extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); extern void __pv_init_lock_hash(void); extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock); +extern bool nopvspin; #define queued_spin_unlock queued_spin_unlock /** diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 4ef21a87f1d3..d9995931ea18 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -871,18 +871,36 @@ asm( */ void __init kvm_spinlock_init(void) { - /* Does host kernel support KVM_FEATURE_PV_UNHALT? */ - if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) + /* + * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an + * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is + * preferred over native qspinlock when vCPU is preempted. + */ + if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) { + pr_info("PV spinlocks disabled, no host support\n"); return; + } + /* + * Disable PV spinlocks and use native qspinlock when dedicated pCPUs + * are available. + */ if (kvm_para_has_hint(KVM_HINTS_REALTIME)) { - static_branch_disable(&virt_spin_lock_key); - return; + pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n"); + goto out; } - /* Don't use the pvqspinlock code if there is only 1 vCPU. */ - if (num_possible_cpus() == 1) - return; + if (num_possible_cpus() == 1) { + pr_info("PV spinlocks disabled, single CPU\n"); + goto out; + } + + if (nopvspin) { + pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n"); + goto out; + } + + pr_info("PV spinlocks enabled\n"); __pv_init_lock_hash(); pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; @@ -895,6 +913,13 @@ void __init kvm_spinlock_init(void) pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); } + /* + * When PV spinlock is enabled which is preferred over + * virt_spin_lock(), virt_spin_lock_key's value is meaningless. + * Just disable it anyway. + */ +out: + static_branch_disable(&virt_spin_lock_key); } #endif /* CONFIG_PARAVIRT_SPINLOCKS */ diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index b9515fcc9b29..cbff6ba53d56 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -581,4 +581,11 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath); #include "qspinlock_paravirt.h" #include "qspinlock.c" +bool nopvspin __initdata; +static __init int parse_nopvspin(char *arg) +{ + nopvspin = true; + return 0; +} +early_param("nopvspin", parse_nopvspin); #endif -- cgit v1.2.3 From d7481b24b816b8c3955a9eaf01b97e2bd7f61a37 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Fri, 3 Jul 2020 12:56:19 -0400 Subject: audit: issue CWD record to accompany LSM_AUDIT_DATA_* records The LSM_AUDIT_DATA_* records for PATH, FILE, IOCTL_OP, DENTRY and INODE are incomplete without the task context of the AUDIT Current Working Directory record. Add it. This record addition can't use audit_dummy_context to determine whether or not to store the record information since the LSM_AUDIT_DATA_* records are initiated by various LSMs independent of any audit rules. context->in_syscall is used to determine if it was called in user context like audit_getname. Please see the upstream issue https://github.com/linux-audit/audit-kernel/issues/96 Adapted from Vladis Dronov's v2 patch. Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- include/linux/audit.h | 9 ++++++++- kernel/auditsc.c | 17 +++++++++++++++-- security/lsm_audit.c | 5 +++++ 3 files changed, 28 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index b5478c64bc69..523f77494847 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -292,7 +292,7 @@ extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1, extern void __audit_syscall_exit(int ret_success, long ret_value); extern struct filename *__audit_reusename(const __user char *uptr); extern void __audit_getname(struct filename *name); - +extern void __audit_getcwd(void); extern void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags); extern void __audit_file(const struct file *); @@ -351,6 +351,11 @@ static inline void audit_getname(struct filename *name) if (unlikely(!audit_dummy_context())) __audit_getname(name); } +static inline void audit_getcwd(void) +{ + if (unlikely(audit_context())) + __audit_getcwd(); +} static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int aflags) { @@ -579,6 +584,8 @@ static inline struct filename *audit_reusename(const __user char *name) } static inline void audit_getname(struct filename *name) { } +static inline void audit_getcwd(void) +{ } static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int aflags) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index eae1a599ffe3..6884b50069d1 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1891,6 +1891,20 @@ __audit_reusename(const __user char *uptr) return NULL; } +inline void _audit_getcwd(struct audit_context *context) +{ + if (!context->pwd.dentry) + get_fs_pwd(current->fs, &context->pwd); +} + +void __audit_getcwd(void) +{ + struct audit_context *context = audit_context(); + + if (context->in_syscall) + _audit_getcwd(context); +} + /** * __audit_getname - add a name to the list * @name: name to add @@ -1915,8 +1929,7 @@ void __audit_getname(struct filename *name) name->aname = n; name->refcnt++; - if (!context->pwd.dentry) - get_fs_pwd(current->fs, &context->pwd); + _audit_getcwd(context); } static inline int audit_copy_fcaps(struct audit_names *name, diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 2d2bf49016f4..7c555621c2bd 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -241,6 +241,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } + audit_getcwd(); break; } case LSM_AUDIT_DATA_FILE: { @@ -254,6 +255,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } + audit_getcwd(); break; } case LSM_AUDIT_DATA_IOCTL_OP: { @@ -269,6 +271,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, } audit_log_format(ab, " ioctlcmd=0x%hx", a->u.op->cmd); + audit_getcwd(); break; } case LSM_AUDIT_DATA_DENTRY: { @@ -283,6 +286,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } + audit_getcwd(); break; } case LSM_AUDIT_DATA_INODE: { @@ -300,6 +304,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); + audit_getcwd(); break; } case LSM_AUDIT_DATA_TASK: { -- cgit v1.2.3 From 746cf3459f118592a72ef42e7551777ff17b1684 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 3 Jul 2020 10:06:09 +0800 Subject: tracing: Simplify defining of the next event id The value to be used and compared in trace_search_list() is "last + 1". Let's just define next to be "last + 1" instead of doing the addition each time. Link: https://lkml.kernel.org/r/20200703020612.12930-2-richard.weiyang@linux.alibaba.com Signed-off-by: Wei Yang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 73976de7f8cc..a35232d61601 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -675,11 +675,11 @@ static LIST_HEAD(ftrace_event_list); static int trace_search_list(struct list_head **list) { struct trace_event *e; - int last = __TRACE_LAST_TYPE; + int next = __TRACE_LAST_TYPE + 1; if (list_empty(&ftrace_event_list)) { *list = &ftrace_event_list; - return last + 1; + return next; } /* @@ -687,17 +687,17 @@ static int trace_search_list(struct list_head **list) * lets see if somebody freed one. */ list_for_each_entry(e, &ftrace_event_list, list) { - if (e->type != last + 1) + if (e->type != next) break; - last++; + next++; } /* Did we used up all 65 thousand events??? */ - if ((last + 1) > TRACE_EVENT_TYPE_MAX) + if (next > TRACE_EVENT_TYPE_MAX) return 0; *list = &e->list; - return last + 1; + return next; } void trace_event_read_lock(void) -- cgit v1.2.3 From 36b8aacf2a483ba40fbe91c830d314e0bc133044 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 3 Jul 2020 10:06:10 +0800 Subject: tracing: Save one trace_event->type by using __TRACE_LAST_TYPE Static defined trace_event->type stops at (__TRACE_LAST_TYPE - 1) and dynamic trace_event->type starts from (__TRACE_LAST_TYPE + 1). To save one trace_event->type index, let's use __TRACE_LAST_TYPE. Link: https://lkml.kernel.org/r/20200703020612.12930-3-richard.weiyang@linux.alibaba.com Signed-off-by: Wei Yang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index a35232d61601..4d1893564912 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -20,7 +20,7 @@ DECLARE_RWSEM(trace_event_sem); static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; -static int next_event_type = __TRACE_LAST_TYPE + 1; +static int next_event_type = __TRACE_LAST_TYPE; enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) { @@ -675,7 +675,7 @@ static LIST_HEAD(ftrace_event_list); static int trace_search_list(struct list_head **list) { struct trace_event *e; - int next = __TRACE_LAST_TYPE + 1; + int next = __TRACE_LAST_TYPE; if (list_empty(&ftrace_event_list)) { *list = &ftrace_event_list; -- cgit v1.2.3 From 248591f5d257a19c1cba9ab9da3536bfbc2f0149 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 24 Jun 2020 13:32:46 +0200 Subject: kcsan: Make KCSAN compatible with new IRQ state tracking The new IRQ state tracking code does not honor lockdep_off(), and as such we should again permit tracing by using non-raw functions in core.c. Update the lockdep_off() comment in report.c, to reflect the fact there is still a potential risk of deadlock due to using printk() from scheduler code. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200624113246.GA170324@elver.google.com --- kernel/kcsan/core.c | 5 ++--- kernel/kcsan/report.c | 9 +++++---- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 15f67949d11e..732623c30359 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -397,8 +397,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) } if (!kcsan_interrupt_watcher) - /* Use raw to avoid lockdep recursion via IRQ flags tracing. */ - raw_local_irq_save(irq_flags); + local_irq_save(irq_flags); watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write); if (watchpoint == NULL) { @@ -539,7 +538,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS); out_unlock: if (!kcsan_interrupt_watcher) - raw_local_irq_restore(irq_flags); + local_irq_restore(irq_flags); out: user_access_restore(ua_flags); } diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index ac5f8345bae9..6b2fb1a6d8cd 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -606,10 +606,11 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type, goto out; /* - * With TRACE_IRQFLAGS, lockdep's IRQ trace state becomes corrupted if - * we do not turn off lockdep here; this could happen due to recursion - * into lockdep via KCSAN if we detect a race in utilities used by - * lockdep. + * Because we may generate reports when we're in scheduler code, the use + * of printk() could deadlock. Until such time that all printing code + * called in print_report() is scheduler-safe, accept the risk, and just + * get our message out. As such, also disable lockdep to hide the + * warning, and avoid disabling lockdep for the rest of the kernel. */ lockdep_off(); -- cgit v1.2.3 From 859d069ee1ddd87862e1d6a356a82ed417dbeb67 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 May 2020 15:00:57 +0200 Subject: lockdep: Prepare for NMI IRQ state tracking There is no reason not to always, accurately, track IRQ state. This change also makes IRQ state tracking ignore lockdep_off(). Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200623083721.155449112@infradead.org --- kernel/locking/lockdep.c | 46 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 29a8de4c50b9..d595623c4b34 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -395,7 +395,7 @@ void lockdep_init_task(struct task_struct *task) static __always_inline void lockdep_recursion_finish(void) { - if (WARN_ON_ONCE(--current->lockdep_recursion)) + if (WARN_ON_ONCE((--current->lockdep_recursion) & LOCKDEP_RECURSION_MASK)) current->lockdep_recursion = 0; } @@ -3646,7 +3646,16 @@ static void __trace_hardirqs_on_caller(void) */ void lockdep_hardirqs_on_prepare(unsigned long ip) { - if (unlikely(!debug_locks || current->lockdep_recursion)) + if (unlikely(!debug_locks)) + return; + + /* + * NMIs do not (and cannot) track lock dependencies, nothing to do. + */ + if (unlikely(in_nmi())) + return; + + if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) return; if (unlikely(current->hardirqs_enabled)) { @@ -3692,7 +3701,27 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) { struct task_struct *curr = current; - if (unlikely(!debug_locks || curr->lockdep_recursion)) + if (unlikely(!debug_locks)) + return; + + /* + * NMIs can happen in the middle of local_irq_{en,dis}able() where the + * tracking state and hardware state are out of sync. + * + * NMIs must save lockdep_hardirqs_enabled() to restore IRQ state from, + * and not rely on hardware state like normal interrupts. + */ + if (unlikely(in_nmi())) { + /* + * Skip: + * - recursion check, because NMI can hit lockdep; + * - hardware state check, because above; + * - chain_key check, see lockdep_hardirqs_on_prepare(). + */ + goto skip_checks; + } + + if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) return; if (curr->hardirqs_enabled) { @@ -3720,6 +3749,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) DEBUG_LOCKS_WARN_ON(current->hardirq_chain_key != current->curr_chain_key); +skip_checks: /* we'll do an OFF -> ON transition: */ curr->hardirqs_enabled = 1; curr->hardirq_enable_ip = ip; @@ -3735,7 +3765,15 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) { struct task_struct *curr = current; - if (unlikely(!debug_locks || curr->lockdep_recursion)) + if (unlikely(!debug_locks)) + return; + + /* + * Matching lockdep_hardirqs_on(), allow NMIs in the middle of lockdep; + * they will restore the software state. This ensures the software + * state is consistent inside NMIs as well. + */ + if (unlikely(!in_nmi() && (current->lockdep_recursion & LOCKDEP_RECURSION_MASK))) return; /* -- cgit v1.2.3 From a21ee6055c30ce68c4e201c6496f0ed2a1936230 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2020 12:22:41 +0200 Subject: lockdep: Change hardirq{s_enabled,_context} to per-cpu variables Currently all IRQ-tracking state is in task_struct, this means that task_struct needs to be defined before we use it. Especially for lockdep_assert_irq*() this can lead to header-hell. Move the hardirq state into per-cpu variables to avoid the task_struct dependency. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200623083721.512673481@infradead.org --- include/linux/irqflags.h | 19 ++++++++++++------- include/linux/lockdep.h | 34 ++++++++++++++++++---------------- include/linux/sched.h | 2 -- kernel/fork.c | 4 +--- kernel/locking/lockdep.c | 30 +++++++++++++++--------------- kernel/softirq.c | 6 ++++++ 6 files changed, 52 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 6384d2813ded..255444fe4609 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -14,6 +14,7 @@ #include #include +#include /* Currently lockdep_softirqs_on/off is used only by lockdep */ #ifdef CONFIG_PROVE_LOCKING @@ -31,18 +32,22 @@ #endif #ifdef CONFIG_TRACE_IRQFLAGS + +DECLARE_PER_CPU(int, hardirqs_enabled); +DECLARE_PER_CPU(int, hardirq_context); + extern void trace_hardirqs_on_prepare(void); extern void trace_hardirqs_off_finish(void); extern void trace_hardirqs_on(void); extern void trace_hardirqs_off(void); -# define lockdep_hardirq_context(p) ((p)->hardirq_context) +# define lockdep_hardirq_context(p) (this_cpu_read(hardirq_context)) # define lockdep_softirq_context(p) ((p)->softirq_context) -# define lockdep_hardirqs_enabled(p) ((p)->hardirqs_enabled) +# define lockdep_hardirqs_enabled(p) (this_cpu_read(hardirqs_enabled)) # define lockdep_softirqs_enabled(p) ((p)->softirqs_enabled) -# define lockdep_hardirq_enter() \ -do { \ - if (!current->hardirq_context++) \ - current->hardirq_threaded = 0; \ +# define lockdep_hardirq_enter() \ +do { \ + if (this_cpu_inc_return(hardirq_context) == 1) \ + current->hardirq_threaded = 0; \ } while (0) # define lockdep_hardirq_threaded() \ do { \ @@ -50,7 +55,7 @@ do { \ } while (0) # define lockdep_hardirq_exit() \ do { \ - current->hardirq_context--; \ + this_cpu_dec(hardirq_context); \ } while (0) # define lockdep_softirq_enter() \ do { \ diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 3b73cf84f77d..be6cb17a8879 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -11,6 +11,7 @@ #define __LINUX_LOCKDEP_H #include +#include struct task_struct; @@ -529,28 +530,29 @@ do { \ lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) -#define lockdep_assert_irqs_enabled() do { \ - WARN_ONCE(debug_locks && !current->lockdep_recursion && \ - !current->hardirqs_enabled, \ - "IRQs not enabled as expected\n"); \ - } while (0) +DECLARE_PER_CPU(int, hardirqs_enabled); +DECLARE_PER_CPU(int, hardirq_context); -#define lockdep_assert_irqs_disabled() do { \ - WARN_ONCE(debug_locks && !current->lockdep_recursion && \ - current->hardirqs_enabled, \ - "IRQs not disabled as expected\n"); \ - } while (0) +#define lockdep_assert_irqs_enabled() \ +do { \ + WARN_ON_ONCE(debug_locks && !this_cpu_read(hardirqs_enabled)); \ +} while (0) -#define lockdep_assert_in_irq() do { \ - WARN_ONCE(debug_locks && !current->lockdep_recursion && \ - !current->hardirq_context, \ - "Not in hardirq as expected\n"); \ - } while (0) +#define lockdep_assert_irqs_disabled() \ +do { \ + WARN_ON_ONCE(debug_locks && this_cpu_read(hardirqs_enabled)); \ +} while (0) + +#define lockdep_assert_in_irq() \ +do { \ + WARN_ON_ONCE(debug_locks && !this_cpu_read(hardirq_context)); \ +} while (0) #else # define might_lock(lock) do { } while (0) # define might_lock_read(lock) do { } while (0) # define might_lock_nested(lock, subclass) do { } while (0) + # define lockdep_assert_irqs_enabled() do { } while (0) # define lockdep_assert_irqs_disabled() do { } while (0) # define lockdep_assert_in_irq() do { } while (0) @@ -560,7 +562,7 @@ do { \ # define lockdep_assert_RT_in_threaded_ctx() do { \ WARN_ONCE(debug_locks && !current->lockdep_recursion && \ - current->hardirq_context && \ + lockdep_hardirq_context(current) && \ !(current->hardirq_threaded || current->irq_config), \ "Not in threaded context on PREEMPT_RT as expected\n"); \ } while (0) diff --git a/include/linux/sched.h b/include/linux/sched.h index 692e327d7455..3903a9500926 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -990,8 +990,6 @@ struct task_struct { unsigned long hardirq_disable_ip; unsigned int hardirq_enable_event; unsigned int hardirq_disable_event; - int hardirqs_enabled; - int hardirq_context; u64 hardirq_chain_key; unsigned long softirq_disable_ip; unsigned long softirq_enable_ip; diff --git a/kernel/fork.c b/kernel/fork.c index efc5493203ae..70d9d0a4de2a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1954,8 +1954,8 @@ static __latent_entropy struct task_struct *copy_process( rt_mutex_init_task(p); + lockdep_assert_irqs_enabled(); #ifdef CONFIG_PROVE_LOCKING - DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif retval = -EAGAIN; @@ -2036,7 +2036,6 @@ static __latent_entropy struct task_struct *copy_process( #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; - p->hardirqs_enabled = 0; p->hardirq_enable_ip = 0; p->hardirq_enable_event = 0; p->hardirq_disable_ip = _THIS_IP_; @@ -2046,7 +2045,6 @@ static __latent_entropy struct task_struct *copy_process( p->softirq_enable_event = 0; p->softirq_disable_ip = 0; p->softirq_disable_event = 0; - p->hardirq_context = 0; p->softirq_context = 0; #endif diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index d595623c4b34..ab4ffbe0e9e9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2062,9 +2062,9 @@ print_bad_irq_dependency(struct task_struct *curr, pr_warn("-----------------------------------------------------\n"); pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), - curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, + lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, - curr->hardirqs_enabled, + lockdep_hardirqs_enabled(curr), curr->softirqs_enabled); print_lock(next); @@ -3658,7 +3658,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) return; - if (unlikely(current->hardirqs_enabled)) { + if (unlikely(lockdep_hardirqs_enabled(current))) { /* * Neither irq nor preemption are disabled here * so this is racy by nature but losing one hit @@ -3686,7 +3686,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) * Can't allow enabling interrupts while in an interrupt handler, * that's general bad form and such. Recursion, limited stack etc.. */ - if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) + if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context(current))) return; current->hardirq_chain_key = current->curr_chain_key; @@ -3724,7 +3724,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) return; - if (curr->hardirqs_enabled) { + if (lockdep_hardirqs_enabled(curr)) { /* * Neither irq nor preemption are disabled here * so this is racy by nature but losing one hit @@ -3751,7 +3751,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) skip_checks: /* we'll do an OFF -> ON transition: */ - curr->hardirqs_enabled = 1; + this_cpu_write(hardirqs_enabled, 1); curr->hardirq_enable_ip = ip; curr->hardirq_enable_event = ++curr->irq_events; debug_atomic_inc(hardirqs_on_events); @@ -3783,11 +3783,11 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->hardirqs_enabled) { + if (lockdep_hardirqs_enabled(curr)) { /* * We have done an ON -> OFF transition: */ - curr->hardirqs_enabled = 0; + this_cpu_write(hardirqs_enabled, 0); curr->hardirq_disable_ip = ip; curr->hardirq_disable_event = ++curr->irq_events; debug_atomic_inc(hardirqs_off_events); @@ -3832,7 +3832,7 @@ void lockdep_softirqs_on(unsigned long ip) * usage bit for all held locks, if hardirqs are * enabled too: */ - if (curr->hardirqs_enabled) + if (lockdep_hardirqs_enabled(curr)) mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); lockdep_recursion_finish(); } @@ -3881,7 +3881,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) */ if (!hlock->trylock) { if (hlock->read) { - if (curr->hardirq_context) + if (lockdep_hardirq_context(curr)) if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ_READ)) return 0; @@ -3890,7 +3890,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) LOCK_USED_IN_SOFTIRQ_READ)) return 0; } else { - if (curr->hardirq_context) + if (lockdep_hardirq_context(curr)) if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) return 0; if (curr->softirq_context) @@ -3928,7 +3928,7 @@ lock_used: static inline unsigned int task_irq_context(struct task_struct *task) { - return LOCK_CHAIN_HARDIRQ_CONTEXT * !!task->hardirq_context + + return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context(task) + LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context; } @@ -4021,7 +4021,7 @@ static inline short task_wait_context(struct task_struct *curr) * Set appropriate wait type for the context; for IRQs we have to take * into account force_irqthread as that is implied by PREEMPT_RT. */ - if (curr->hardirq_context) { + if (lockdep_hardirq_context(curr)) { /* * Check if force_irqthreads will run us threaded. */ @@ -4864,11 +4864,11 @@ static void check_flags(unsigned long flags) return; if (irqs_disabled_flags(flags)) { - if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) { + if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled(current))) { printk("possible reason: unannotated irqs-off.\n"); } } else { - if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) { + if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled(current))) { printk("possible reason: unannotated irqs-on.\n"); } } diff --git a/kernel/softirq.c b/kernel/softirq.c index c4201b7f42b1..342c53feaa7a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -107,6 +107,12 @@ static bool ksoftirqd_running(unsigned long pending) * where hardirqs are disabled legitimately: */ #ifdef CONFIG_TRACE_IRQFLAGS + +DEFINE_PER_CPU(int, hardirqs_enabled); +DEFINE_PER_CPU(int, hardirq_context); +EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled); +EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context); + void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { unsigned long flags; -- cgit v1.2.3 From f9ad4a5f3f20bee022b1bdde94e5ece6dc0b0edc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 May 2020 13:03:26 +0200 Subject: lockdep: Remove lockdep_hardirq{s_enabled,_context}() argument Now that the macros use per-cpu data, we no longer need the argument. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200623083721.571835311@infradead.org --- arch/x86/entry/common.c | 2 +- include/linux/irqflags.h | 8 ++++---- include/linux/lockdep.h | 2 +- kernel/locking/lockdep.c | 30 +++++++++++++++--------------- kernel/softirq.c | 2 +- tools/include/linux/irqflags.h | 4 ++-- 6 files changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 63c607dd6c52..4ea640363f5d 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -758,7 +758,7 @@ noinstr void idtentry_exit_user(struct pt_regs *regs) noinstr bool idtentry_enter_nmi(struct pt_regs *regs) { - bool irq_state = lockdep_hardirqs_enabled(current); + bool irq_state = lockdep_hardirqs_enabled(); __nmi_enter(); lockdep_hardirqs_off(CALLER_ADDR0); diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 255444fe4609..5811ee8a5cd8 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -40,9 +40,9 @@ DECLARE_PER_CPU(int, hardirq_context); extern void trace_hardirqs_off_finish(void); extern void trace_hardirqs_on(void); extern void trace_hardirqs_off(void); -# define lockdep_hardirq_context(p) (this_cpu_read(hardirq_context)) +# define lockdep_hardirq_context() (this_cpu_read(hardirq_context)) # define lockdep_softirq_context(p) ((p)->softirq_context) -# define lockdep_hardirqs_enabled(p) (this_cpu_read(hardirqs_enabled)) +# define lockdep_hardirqs_enabled() (this_cpu_read(hardirqs_enabled)) # define lockdep_softirqs_enabled(p) ((p)->softirqs_enabled) # define lockdep_hardirq_enter() \ do { \ @@ -109,9 +109,9 @@ do { \ # define trace_hardirqs_off_finish() do { } while (0) # define trace_hardirqs_on() do { } while (0) # define trace_hardirqs_off() do { } while (0) -# define lockdep_hardirq_context(p) 0 +# define lockdep_hardirq_context() 0 # define lockdep_softirq_context(p) 0 -# define lockdep_hardirqs_enabled(p) 0 +# define lockdep_hardirqs_enabled() 0 # define lockdep_softirqs_enabled(p) 0 # define lockdep_hardirq_enter() do { } while (0) # define lockdep_hardirq_threaded() do { } while (0) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index be6cb17a8879..fd04b9e96091 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -562,7 +562,7 @@ do { \ # define lockdep_assert_RT_in_threaded_ctx() do { \ WARN_ONCE(debug_locks && !current->lockdep_recursion && \ - lockdep_hardirq_context(current) && \ + lockdep_hardirq_context() && \ !(current->hardirq_threaded || current->irq_config), \ "Not in threaded context on PREEMPT_RT as expected\n"); \ } while (0) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index ab4ffbe0e9e9..c9ea05edce25 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2062,9 +2062,9 @@ print_bad_irq_dependency(struct task_struct *curr, pr_warn("-----------------------------------------------------\n"); pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), - lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, + lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT, curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, - lockdep_hardirqs_enabled(curr), + lockdep_hardirqs_enabled(), curr->softirqs_enabled); print_lock(next); @@ -3331,9 +3331,9 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", curr->comm, task_pid_nr(curr), - lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, + lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT, lockdep_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, - lockdep_hardirqs_enabled(curr), + lockdep_hardirqs_enabled(), lockdep_softirqs_enabled(curr)); print_lock(this); @@ -3658,7 +3658,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) return; - if (unlikely(lockdep_hardirqs_enabled(current))) { + if (unlikely(lockdep_hardirqs_enabled())) { /* * Neither irq nor preemption are disabled here * so this is racy by nature but losing one hit @@ -3686,7 +3686,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) * Can't allow enabling interrupts while in an interrupt handler, * that's general bad form and such. Recursion, limited stack etc.. */ - if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context(current))) + if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context())) return; current->hardirq_chain_key = current->curr_chain_key; @@ -3724,7 +3724,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK)) return; - if (lockdep_hardirqs_enabled(curr)) { + if (lockdep_hardirqs_enabled()) { /* * Neither irq nor preemption are disabled here * so this is racy by nature but losing one hit @@ -3783,7 +3783,7 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (lockdep_hardirqs_enabled(curr)) { + if (lockdep_hardirqs_enabled()) { /* * We have done an ON -> OFF transition: */ @@ -3832,7 +3832,7 @@ void lockdep_softirqs_on(unsigned long ip) * usage bit for all held locks, if hardirqs are * enabled too: */ - if (lockdep_hardirqs_enabled(curr)) + if (lockdep_hardirqs_enabled()) mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); lockdep_recursion_finish(); } @@ -3881,7 +3881,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) */ if (!hlock->trylock) { if (hlock->read) { - if (lockdep_hardirq_context(curr)) + if (lockdep_hardirq_context()) if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ_READ)) return 0; @@ -3890,7 +3890,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) LOCK_USED_IN_SOFTIRQ_READ)) return 0; } else { - if (lockdep_hardirq_context(curr)) + if (lockdep_hardirq_context()) if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) return 0; if (curr->softirq_context) @@ -3928,7 +3928,7 @@ lock_used: static inline unsigned int task_irq_context(struct task_struct *task) { - return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context(task) + + return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context() + LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context; } @@ -4021,7 +4021,7 @@ static inline short task_wait_context(struct task_struct *curr) * Set appropriate wait type for the context; for IRQs we have to take * into account force_irqthread as that is implied by PREEMPT_RT. */ - if (lockdep_hardirq_context(curr)) { + if (lockdep_hardirq_context()) { /* * Check if force_irqthreads will run us threaded. */ @@ -4864,11 +4864,11 @@ static void check_flags(unsigned long flags) return; if (irqs_disabled_flags(flags)) { - if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled(current))) { + if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled())) { printk("possible reason: unannotated irqs-off.\n"); } } else { - if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled(current))) { + if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled())) { printk("possible reason: unannotated irqs-on.\n"); } } diff --git a/kernel/softirq.c b/kernel/softirq.c index 342c53feaa7a..5e9aaa648a74 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -230,7 +230,7 @@ static inline bool lockdep_softirq_start(void) { bool in_hardirq = false; - if (lockdep_hardirq_context(current)) { + if (lockdep_hardirq_context()) { in_hardirq = true; lockdep_hardirq_exit(); } diff --git a/tools/include/linux/irqflags.h b/tools/include/linux/irqflags.h index 67e01bbadbfe..501262aee8ff 100644 --- a/tools/include/linux/irqflags.h +++ b/tools/include/linux/irqflags.h @@ -2,9 +2,9 @@ #ifndef _LIBLOCKDEP_LINUX_TRACE_IRQFLAGS_H_ #define _LIBLOCKDEP_LINUX_TRACE_IRQFLAGS_H_ -# define lockdep_hardirq_context(p) 0 +# define lockdep_hardirq_context() 0 # define lockdep_softirq_context(p) 0 -# define lockdep_hardirqs_enabled(p) 0 +# define lockdep_hardirqs_enabled() 0 # define lockdep_softirqs_enabled(p) 0 # define lockdep_hardirq_enter() do { } while (0) # define lockdep_hardirq_exit() do { } while (0) -- cgit v1.2.3 From c818c03b661cd769e035e41673d5543ba2ebda64 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 13 May 2020 14:11:26 -0700 Subject: seccomp: Report number of loaded filters in /proc/$pid/status A common question asked when debugging seccomp filters is "how many filters are attached to your process?" Provide a way to easily answer this question through /proc/$pid/status with a "Seccomp_filters" line. Signed-off-by: Kees Cook --- fs/proc/array.c | 2 ++ include/linux/seccomp.h | 2 ++ init/init_task.c | 3 +++ kernel/seccomp.c | 3 +++ 4 files changed, 10 insertions(+) (limited to 'kernel') diff --git a/fs/proc/array.c b/fs/proc/array.c index 55ecbeb3a721..65ec2029fa80 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -341,6 +341,8 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p) seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p)); #ifdef CONFIG_SECCOMP seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode); + seq_put_decimal_ull(m, "\nSeccomp_filters:\t", + atomic_read(&p->seccomp.filter_count)); #endif seq_puts(m, "\nSpeculation_Store_Bypass:\t"); switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) { diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 4192369b8418..2ec2720f83cc 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -13,6 +13,7 @@ #ifdef CONFIG_SECCOMP #include +#include #include struct seccomp_filter; @@ -29,6 +30,7 @@ struct seccomp_filter; */ struct seccomp { int mode; + atomic_t filter_count; struct seccomp_filter *filter; }; diff --git a/init/init_task.c b/init/init_task.c index 15089d15010a..a3eb3847e1f4 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -204,6 +204,9 @@ struct task_struct init_task #ifdef CONFIG_SECURITY .security = NULL, #endif +#ifdef CONFIG_SECCOMP + .seccomp = { .filter_count = ATOMIC_INIT(0) }, +#endif }; EXPORT_SYMBOL(init_task); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d653d8426de9..f387e5004c29 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -398,6 +398,8 @@ static inline void seccomp_sync_threads(unsigned long flags) put_seccomp_filter(thread); smp_store_release(&thread->seccomp.filter, caller->seccomp.filter); + atomic_set(&thread->seccomp.filter_count, + atomic_read(&thread->seccomp.filter_count)); /* * Don't let an unprivileged task work around @@ -544,6 +546,7 @@ static long seccomp_attach_filter(unsigned int flags, */ filter->prev = current->seccomp.filter; current->seccomp.filter = filter; + atomic_inc(¤t->seccomp.filter_count); /* Now that the new filter is in place, synchronize to all threads. */ if (flags & SECCOMP_FILTER_FLAG_TSYNC) -- cgit v1.2.3 From 9f87dcf14b82b05ff0e26970439b372ae135de0c Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Mon, 1 Jun 2020 04:25:32 -0700 Subject: seccomp: Add find_notification helper This adds a helper which can iterate through a seccomp_filter to find a notification matching an ID. It removes several replicated chunks of code. Signed-off-by: Sargun Dhillon Acked-by: Christian Brauner Reviewed-by: Tycho Andersen Cc: Matt Denton Cc: Kees Cook , Cc: Jann Horn , Cc: Robert Sesek , Cc: Chris Palmer Cc: Christian Brauner Cc: Tycho Andersen Link: https://lore.kernel.org/r/20200601112532.150158-1-sargun@sargun.me Signed-off-by: Kees Cook --- kernel/seccomp.c | 53 +++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index f387e5004c29..e711d697afb6 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -41,6 +41,7 @@ #include #include #include +#include enum notify_state { SECCOMP_NOTIFY_INIT, @@ -1024,6 +1025,23 @@ static int seccomp_notify_release(struct inode *inode, struct file *file) return 0; } +/* must be called with notif_lock held */ +static inline struct seccomp_knotif * +find_notification(struct seccomp_filter *filter, u64 id) +{ + struct seccomp_knotif *cur; + + lockdep_assert_held(&filter->notify_lock); + + list_for_each_entry(cur, &filter->notif->notifications, list) { + if (cur->id == id) + return cur; + } + + return NULL; +} + + static long seccomp_notify_recv(struct seccomp_filter *filter, void __user *buf) { @@ -1081,15 +1099,8 @@ out: * may have died when we released the lock, so we need to make * sure it's still around. */ - knotif = NULL; mutex_lock(&filter->notify_lock); - list_for_each_entry(cur, &filter->notif->notifications, list) { - if (cur->id == unotif.id) { - knotif = cur; - break; - } - } - + knotif = find_notification(filter, unotif.id); if (knotif) { knotif->state = SECCOMP_NOTIFY_INIT; up(&filter->notif->request); @@ -1104,7 +1115,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter, void __user *buf) { struct seccomp_notif_resp resp = {}; - struct seccomp_knotif *knotif = NULL, *cur; + struct seccomp_knotif *knotif; long ret; if (copy_from_user(&resp, buf, sizeof(resp))) @@ -1121,13 +1132,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter, if (ret < 0) return ret; - list_for_each_entry(cur, &filter->notif->notifications, list) { - if (cur->id == resp.id) { - knotif = cur; - break; - } - } - + knotif = find_notification(filter, resp.id); if (!knotif) { ret = -ENOENT; goto out; @@ -1153,7 +1158,7 @@ out: static long seccomp_notify_id_valid(struct seccomp_filter *filter, void __user *buf) { - struct seccomp_knotif *knotif = NULL; + struct seccomp_knotif *knotif; u64 id; long ret; @@ -1164,16 +1169,12 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter, if (ret < 0) return ret; - ret = -ENOENT; - list_for_each_entry(knotif, &filter->notif->notifications, list) { - if (knotif->id == id) { - if (knotif->state == SECCOMP_NOTIFY_SENT) - ret = 0; - goto out; - } - } + knotif = find_notification(filter, id); + if (knotif && knotif->state == SECCOMP_NOTIFY_SENT) + ret = 0; + else + ret = -ENOENT; -out: mutex_unlock(&filter->notify_lock); return ret; } -- cgit v1.2.3 From b707ddee11d1dc4518ab7f1aa5e7af9ceaa23317 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 31 May 2020 13:50:28 +0200 Subject: seccomp: rename "usage" to "refs" and document Naming the lifetime counter of a seccomp filter "usage" suggests a little too strongly that its about tasks that are using this filter while it also tracks other references such as the user notifier or ptrace. This also updates the documentation to note this fact. We'll be introducing an actual usage counter in a follow-up patch. Cc: Tycho Andersen Cc: Kees Cook Cc: Matt Denton Cc: Sargun Dhillon Cc: Jann Horn Cc: Chris Palmer Cc: Aleksa Sarai Cc: Robert Sesek Cc: Jeffrey Vander Stoep Cc: Linux Containers Signed-off-by: Christian Brauner Link: https://lore.kernel.org/r/20200531115031.391515-1-christian.brauner@ubuntu.com Signed-off-by: Kees Cook --- kernel/seccomp.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e711d697afb6..d4dd3344e312 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -107,10 +107,11 @@ struct notification { /** * struct seccomp_filter - container for seccomp BPF programs * - * @usage: reference count to manage the object lifetime. - * get/put helpers should be used when accessing an instance - * outside of a lifetime-guarded section. In general, this - * is only needed for handling filters shared across tasks. + * @refs: Reference count to manage the object lifetime. + * A filter's reference count is incremented for each directly + * attached task, once for the dependent filter, and if + * requested for the user notifier. When @refs reaches zero, + * the filter can be freed. * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate @@ -125,10 +126,10 @@ struct notification { * how namespaces work. * * seccomp_filter objects should never be modified after being attached - * to a task_struct (other than @usage). + * to a task_struct (other than @refs). */ struct seccomp_filter { - refcount_t usage; + refcount_t refs; bool log; struct seccomp_filter *prev; struct bpf_prog *prog; @@ -464,7 +465,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) return ERR_PTR(ret); } - refcount_set(&sfilter->usage, 1); + refcount_set(&sfilter->refs, 1); return sfilter; } @@ -558,7 +559,7 @@ static long seccomp_attach_filter(unsigned int flags, static void __get_seccomp_filter(struct seccomp_filter *filter) { - refcount_inc(&filter->usage); + refcount_inc(&filter->refs); } /* get_seccomp_filter - increments the reference count of the filter on @tsk */ @@ -581,7 +582,7 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter) static void __put_seccomp_filter(struct seccomp_filter *orig) { /* Clean up single-reference branches iteratively. */ - while (orig && refcount_dec_and_test(&orig->usage)) { + while (orig && refcount_dec_and_test(&orig->refs)) { struct seccomp_filter *freeme = orig; orig = orig->prev; seccomp_filter_free(freeme); -- cgit v1.2.3 From 3a15fb6ed92cb32b0a83f406aa4a96f28c9adbc3 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 31 May 2020 13:50:29 +0200 Subject: seccomp: release filter after task is fully dead The seccomp filter used to be released in free_task() which is called asynchronously via call_rcu() and assorted mechanisms. Since we need to inform tasks waiting on the seccomp notifier when a filter goes empty we will notify them as soon as a task has been marked fully dead in release_task(). To not split seccomp cleanup into two parts, move filter release out of free_task() and into release_task() after we've unhashed struct task from struct pid, exited signals, and unlinked it from the threadgroups' thread list. We'll put the empty filter notification infrastructure into it in a follow up patch. This also renames put_seccomp_filter() to seccomp_filter_release() which is a more descriptive name of what we're doing here especially once we've added the empty filter notification mechanism in there. We're also NULL-ing the task's filter tree entrypoint which seems cleaner than leaving a dangling pointer in there. Note that this shouldn't need any memory barriers since we're calling this when the task is in release_task() which means it's EXIT_DEAD. So it can't modify its seccomp filters anymore. You can also see this from the point where we're calling seccomp_filter_release(). It's after __exit_signal() and at this point, tsk->sighand will already have been NULLed which is required for thread-sync and filter installation alike. Cc: Tycho Andersen Cc: Kees Cook Cc: Matt Denton Cc: Sargun Dhillon Cc: Jann Horn Cc: Chris Palmer Cc: Aleksa Sarai Cc: Robert Sesek Cc: Jeffrey Vander Stoep Cc: Linux Containers Signed-off-by: Christian Brauner Link: https://lore.kernel.org/r/20200531115031.391515-2-christian.brauner@ubuntu.com Signed-off-by: Kees Cook --- include/linux/seccomp.h | 4 ++-- kernel/exit.c | 1 + kernel/fork.c | 1 - kernel/seccomp.c | 62 +++++++++++++++++++++++++++++-------------------- 4 files changed, 40 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 2ec2720f83cc..babcd6c02d09 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -84,10 +84,10 @@ static inline int seccomp_mode(struct seccomp *s) #endif /* CONFIG_SECCOMP */ #ifdef CONFIG_SECCOMP_FILTER -extern void put_seccomp_filter(struct task_struct *tsk); +extern void seccomp_filter_release(struct task_struct *tsk); extern void get_seccomp_filter(struct task_struct *tsk); #else /* CONFIG_SECCOMP_FILTER */ -static inline void put_seccomp_filter(struct task_struct *tsk) +static inline void seccomp_filter_release(struct task_struct *tsk) { return; } diff --git a/kernel/exit.c b/kernel/exit.c index 727150f28103..00d77e5ba700 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -217,6 +217,7 @@ repeat: } write_unlock_irq(&tasklist_lock); + seccomp_filter_release(p); proc_flush_pid(thread_pid); put_pid(thread_pid); release_thread(p); diff --git a/kernel/fork.c b/kernel/fork.c index 142b23645d82..c51a9cd824c5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -473,7 +473,6 @@ void free_task(struct task_struct *tsk) #endif rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); - put_seccomp_filter(tsk); arch_release_task_struct(tsk); if (tsk->flags & PF_KTHREAD) free_kthread_struct(tsk); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d4dd3344e312..0ca6d5243427 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -368,6 +368,42 @@ static inline pid_t seccomp_can_sync_threads(void) return 0; } +static inline void seccomp_filter_free(struct seccomp_filter *filter) +{ + if (filter) { + bpf_prog_destroy(filter->prog); + kfree(filter); + } +} + +static void __put_seccomp_filter(struct seccomp_filter *orig) +{ + /* Clean up single-reference branches iteratively. */ + while (orig && refcount_dec_and_test(&orig->refs)) { + struct seccomp_filter *freeme = orig; + orig = orig->prev; + seccomp_filter_free(freeme); + } +} + +/** + * seccomp_filter_release - Detach the task from its filter tree + * and drop its reference count during + * exit. + * + * This function should only be called when the task is exiting as + * it detaches it from its filter tree. As such, READ_ONCE() and + * barriers are not needed here, as would normally be needed. + */ +void seccomp_filter_release(struct task_struct *tsk) +{ + struct seccomp_filter *orig = tsk->seccomp.filter; + + /* Detach task from its filter tree. */ + tsk->seccomp.filter = NULL; + __put_seccomp_filter(orig); +} + /** * seccomp_sync_threads: sets all threads to use current's filter * @@ -397,7 +433,7 @@ static inline void seccomp_sync_threads(unsigned long flags) * current's path will hold a reference. (This also * allows a put before the assignment.) */ - put_seccomp_filter(thread); + __put_seccomp_filter(thread->seccomp.filter); smp_store_release(&thread->seccomp.filter, caller->seccomp.filter); atomic_set(&thread->seccomp.filter_count, @@ -571,30 +607,6 @@ void get_seccomp_filter(struct task_struct *tsk) __get_seccomp_filter(orig); } -static inline void seccomp_filter_free(struct seccomp_filter *filter) -{ - if (filter) { - bpf_prog_destroy(filter->prog); - kfree(filter); - } -} - -static void __put_seccomp_filter(struct seccomp_filter *orig) -{ - /* Clean up single-reference branches iteratively. */ - while (orig && refcount_dec_and_test(&orig->refs)) { - struct seccomp_filter *freeme = orig; - orig = orig->prev; - seccomp_filter_free(freeme); - } -} - -/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ -void put_seccomp_filter(struct task_struct *tsk) -{ - __put_seccomp_filter(tsk->seccomp.filter); -} - static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason) { clear_siginfo(info); -- cgit v1.2.3 From 76194c4e830d570d9e369d637bb907591d2b3111 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 1 Jun 2020 11:50:07 -0700 Subject: seccomp: Lift wait_queue into struct seccomp_filter Lift the wait_queue from struct notification into struct seccomp_filter. This is cleaner overall and lets us avoid having to take the notifier mutex in the future for EPOLLHUP notifications since we need to neither read nor modify the notifier specific aspects of the seccomp filter. In the exit path I'd very much like to avoid having to take the notifier mutex for each filter in the task's filter hierarchy. Cc: Tycho Andersen Cc: Kees Cook Cc: Matt Denton Cc: Sargun Dhillon Cc: Jann Horn Cc: Chris Palmer Cc: Aleksa Sarai Cc: Robert Sesek Cc: Jeffrey Vander Stoep Cc: Linux Containers Signed-off-by: Christian Brauner Signed-off-by: Kees Cook --- kernel/seccomp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 0ca6d5243427..39c1c687d8ad 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -95,13 +95,11 @@ struct seccomp_knotif { * filter->notify_lock. * @next_id: The id of the next request. * @notifications: A list of struct seccomp_knotif elements. - * @wqh: A wait queue for poll. */ struct notification { struct semaphore request; u64 next_id; struct list_head notifications; - wait_queue_head_t wqh; }; /** @@ -117,6 +115,7 @@ struct notification { * @prog: the BPF program to evaluate * @notif: the struct that holds all notification related information * @notify_lock: A lock for all notification-related accesses. + * @wqh: A wait queue for poll if a notifier is in use. * * seccomp_filter objects are organized in a tree linked via the @prev * pointer. For any task, it appears to be a singly-linked list starting @@ -135,6 +134,7 @@ struct seccomp_filter { struct bpf_prog *prog; struct notification *notif; struct mutex notify_lock; + wait_queue_head_t wqh; }; /* Limit any path through the tree to 256KB worth of instructions. */ @@ -502,6 +502,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) } refcount_set(&sfilter->refs, 1); + init_waitqueue_head(&sfilter->wqh); return sfilter; } @@ -774,7 +775,7 @@ static int seccomp_do_user_notification(int this_syscall, list_add(&n.list, &match->notif->notifications); up(&match->notif->request); - wake_up_poll(&match->notif->wqh, EPOLLIN | EPOLLRDNORM); + wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM); mutex_unlock(&match->notify_lock); /* @@ -1098,7 +1099,7 @@ static long seccomp_notify_recv(struct seccomp_filter *filter, unotif.data = *(knotif->data); knotif->state = SECCOMP_NOTIFY_SENT; - wake_up_poll(&filter->notif->wqh, EPOLLOUT | EPOLLWRNORM); + wake_up_poll(&filter->wqh, EPOLLOUT | EPOLLWRNORM); ret = 0; out: mutex_unlock(&filter->notify_lock); @@ -1217,7 +1218,7 @@ static __poll_t seccomp_notify_poll(struct file *file, __poll_t ret = 0; struct seccomp_knotif *cur; - poll_wait(file, &filter->notif->wqh, poll_tab); + poll_wait(file, &filter->wqh, poll_tab); if (mutex_lock_interruptible(&filter->notify_lock) < 0) return EPOLLERR; @@ -1261,7 +1262,6 @@ static struct file *init_listener(struct seccomp_filter *filter) sema_init(&filter->notif->request, 0); filter->notif->next_id = get_random_u64(); INIT_LIST_HEAD(&filter->notif->notifications); - init_waitqueue_head(&filter->notif->wqh); ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops, filter, O_RDWR); -- cgit v1.2.3 From 99cdb8b9a57393b5978e7a6310a2cba511dd179b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 31 May 2020 13:50:30 +0200 Subject: seccomp: notify about unused filter We've been making heavy use of the seccomp notifier to intercept and handle certain syscalls for containers. This patch allows a syscall supervisor listening on a given notifier to be notified when a seccomp filter has become unused. A container is often managed by a singleton supervisor process the so-called "monitor". This monitor process has an event loop which has various event handlers registered. If the user specified a seccomp profile that included a notifier for various syscalls then we also register a seccomp notify even handler. For any container using a separate pid namespace the lifecycle of the seccomp notifier is bound to the init process of the pid namespace, i.e. when the init process exits the filter must be unused. If a new process attaches to a container we force it to assume a seccomp profile. This can either be the same seccomp profile as the container was started with or a modified one. If the attaching process makes use of the seccomp notifier we will register a new seccomp notifier handler in the monitor's event loop. However, when the attaching process exits we can't simply delete the handler since other child processes could've been created (daemons spawned etc.) that have inherited the seccomp filter and so we need to keep the seccomp notifier fd alive in the event loop. But this is problematic since we don't get a notification when the seccomp filter has become unused and so we currently never remove the seccomp notifier fd from the event loop and just keep accumulating fds in the event loop. We've had this issue for a while but it has recently become more pressing as more and larger users make use of this. To fix this, we introduce a new "users" reference counter that tracks any tasks and dependent filters making use of a filter. When a notifier is registered waiting tasks will be notified that the filter is now empty by receiving a (E)POLLHUP event. The concept in this patch introduces is the same as for signal_struct, i.e. reference counting for life-cycle management is decoupled from reference counting taks using the object. There's probably some trickery possible but the second counter is just the correct way of doing this IMHO and has precedence. Cc: Tycho Andersen Cc: Kees Cook Cc: Matt Denton Cc: Sargun Dhillon Cc: Jann Horn Cc: Chris Palmer Cc: Aleksa Sarai Cc: Robert Sesek Cc: Jeffrey Vander Stoep Cc: Linux Containers Signed-off-by: Christian Brauner Link: https://lore.kernel.org/r/20200531115031.391515-3-christian.brauner@ubuntu.com Signed-off-by: Kees Cook --- kernel/seccomp.c | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 39c1c687d8ad..ba30bffa9301 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -110,6 +110,14 @@ struct notification { * attached task, once for the dependent filter, and if * requested for the user notifier. When @refs reaches zero, * the filter can be freed. + * @users: A filter's @users count is incremented for each directly + * attached task (filter installation, fork(), thread_sync), + * and once for the dependent filter (tracked in filter->prev). + * When it reaches zero it indicates that no direct or indirect + * users of that filter exist. No new tasks can get associated with + * this filter after reaching 0. The @users count is always smaller + * or equal to @refs. Hence, reaching 0 for @users does not mean + * the filter can be freed. * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate @@ -129,6 +137,7 @@ struct notification { */ struct seccomp_filter { refcount_t refs; + refcount_t users; bool log; struct seccomp_filter *prev; struct bpf_prog *prog; @@ -376,6 +385,15 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter) } } +static void __seccomp_filter_orphan(struct seccomp_filter *orig) +{ + while (orig && refcount_dec_and_test(&orig->users)) { + if (waitqueue_active(&orig->wqh)) + wake_up_poll(&orig->wqh, EPOLLHUP); + orig = orig->prev; + } +} + static void __put_seccomp_filter(struct seccomp_filter *orig) { /* Clean up single-reference branches iteratively. */ @@ -386,10 +404,18 @@ static void __put_seccomp_filter(struct seccomp_filter *orig) } } +static void __seccomp_filter_release(struct seccomp_filter *orig) +{ + /* Notify about any unused filters in the task's former filter tree. */ + __seccomp_filter_orphan(orig); + /* Finally drop all references to the task's former tree. */ + __put_seccomp_filter(orig); +} + /** - * seccomp_filter_release - Detach the task from its filter tree - * and drop its reference count during - * exit. + * seccomp_filter_release - Detach the task from its filter tree, + * drop its reference count, and notify + * about unused filters * * This function should only be called when the task is exiting as * it detaches it from its filter tree. As such, READ_ONCE() and @@ -401,7 +427,7 @@ void seccomp_filter_release(struct task_struct *tsk) /* Detach task from its filter tree. */ tsk->seccomp.filter = NULL; - __put_seccomp_filter(orig); + __seccomp_filter_release(orig); } /** @@ -428,12 +454,15 @@ static inline void seccomp_sync_threads(unsigned long flags) /* Get a task reference for the new leaf node. */ get_seccomp_filter(caller); + /* * Drop the task reference to the shared ancestor since * current's path will hold a reference. (This also * allows a put before the assignment.) */ - __put_seccomp_filter(thread->seccomp.filter); + __seccomp_filter_release(thread->seccomp.filter); + + /* Make our new filter tree visible. */ smp_store_release(&thread->seccomp.filter, caller->seccomp.filter); atomic_set(&thread->seccomp.filter_count, @@ -502,6 +531,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) } refcount_set(&sfilter->refs, 1); + refcount_set(&sfilter->users, 1); init_waitqueue_head(&sfilter->wqh); return sfilter; @@ -606,6 +636,7 @@ void get_seccomp_filter(struct task_struct *tsk) if (!orig) return; __get_seccomp_filter(orig); + refcount_inc(&orig->users); } static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason) @@ -1234,6 +1265,9 @@ static __poll_t seccomp_notify_poll(struct file *file, mutex_unlock(&filter->notify_lock); + if (refcount_read(&filter->users) == 0) + ret |= EPOLLHUP; + return ret; } -- cgit v1.2.3 From e68f9d49dda1744d548426c8b4335a8d693a36d0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 15 Jun 2020 22:02:56 -0700 Subject: seccomp: Use pr_fmt Avoid open-coding "seccomp: " prefixes for pr_*() calls. Signed-off-by: Kees Cook --- kernel/seccomp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ba30bffa9301..5f0e3f3a7a5d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -13,6 +13,7 @@ * Mode 2 allows user-defined system call filters in the form * of Berkeley Packet Filters/Linux Socket Filters. */ +#define pr_fmt(fmt) "seccomp: " fmt #include #include @@ -1873,7 +1874,7 @@ static int __init seccomp_sysctl_init(void) hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table); if (!hdr) - pr_warn("seccomp: sysctl registration failed\n"); + pr_warn("sysctl registration failed\n"); else kmemleak_not_leak(hdr); -- cgit v1.2.3 From 47e33c05f9f07cac3de833e531bcac9ae052c7ca Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 15 Jun 2020 15:42:46 -0700 Subject: seccomp: Fix ioctl number for SECCOMP_IOCTL_NOTIF_ID_VALID When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced it had the wrong direction flag set. While this isn't a big deal as nothing currently enforces these bits in the kernel, it should be defined correctly. Fix the define and provide support for the old command until it is no longer needed for backward compatibility. Fixes: 6a21cc50f0c7 ("seccomp: add a return code to trap to userspace") Signed-off-by: Kees Cook --- include/uapi/linux/seccomp.h | 3 ++- kernel/seccomp.c | 9 +++++++++ tools/testing/selftests/seccomp/seccomp_bpf.c | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index c1735455bc53..965290f7dcc2 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -123,5 +123,6 @@ struct seccomp_notif_resp { #define SECCOMP_IOCTL_NOTIF_RECV SECCOMP_IOWR(0, struct seccomp_notif) #define SECCOMP_IOCTL_NOTIF_SEND SECCOMP_IOWR(1, \ struct seccomp_notif_resp) -#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOR(2, __u64) +#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOW(2, __u64) + #endif /* _UAPI_LINUX_SECCOMP_H */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5f0e3f3a7a5d..0ed57e8c49d0 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -44,6 +44,14 @@ #include #include +/* + * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the + * wrong direction flag in the ioctl number. This is the broken one, + * which the kernel needs to keep supporting until all userspaces stop + * using the wrong command number. + */ +#define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR SECCOMP_IOR(2, __u64) + enum notify_state { SECCOMP_NOTIFY_INIT, SECCOMP_NOTIFY_SENT, @@ -1236,6 +1244,7 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, return seccomp_notify_recv(filter, buf); case SECCOMP_IOCTL_NOTIF_SEND: return seccomp_notify_send(filter, buf); + case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR: case SECCOMP_IOCTL_NOTIF_ID_VALID: return seccomp_notify_id_valid(filter, buf); default: diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 43884b6625fc..61f9ac200001 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -186,7 +186,7 @@ struct seccomp_metadata { #define SECCOMP_IOCTL_NOTIF_RECV SECCOMP_IOWR(0, struct seccomp_notif) #define SECCOMP_IOCTL_NOTIF_SEND SECCOMP_IOWR(1, \ struct seccomp_notif_resp) -#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOR(2, __u64) +#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOW(2, __u64) struct seccomp_notif { __u64 id; -- cgit v1.2.3 From fe4bfff86ec54773df3db79e8112e3b0f820c799 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 19 Jun 2020 12:20:15 -0700 Subject: seccomp: Use -1 marker for end of mode 1 syscall list The terminator for the mode 1 syscalls list was a 0, but that could be a valid syscall number (e.g. x86_64 __NR_read). By luck, __NR_read was listed first and the loop construct would not test it, so there was no bug. However, this is fragile. Replace the terminator with -1 instead, and make the variable name for mode 1 syscall lists more descriptive. Cc: Andy Lutomirski Cc: Will Drewry Signed-off-by: Kees Cook --- arch/mips/include/asm/seccomp.h | 4 ++-- include/asm-generic/seccomp.h | 2 +- kernel/seccomp.c | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/mips/include/asm/seccomp.h b/arch/mips/include/asm/seccomp.h index e383d7e27b93..aa809589a181 100644 --- a/arch/mips/include/asm/seccomp.h +++ b/arch/mips/include/asm/seccomp.h @@ -9,12 +9,12 @@ static inline const int *get_compat_mode1_syscalls(void) static const int syscalls_O32[] = { __NR_O32_Linux + 3, __NR_O32_Linux + 4, __NR_O32_Linux + 1, __NR_O32_Linux + 193, - 0, /* null terminated */ + -1, /* negative terminated */ }; static const int syscalls_N32[] = { __NR_N32_Linux + 0, __NR_N32_Linux + 1, __NR_N32_Linux + 58, __NR_N32_Linux + 211, - 0, /* null terminated */ + -1, /* negative terminated */ }; if (IS_ENABLED(CONFIG_MIPS32_O32) && test_thread_flag(TIF_32BIT_REGS)) diff --git a/include/asm-generic/seccomp.h b/include/asm-generic/seccomp.h index 1321ac7821d7..6b6f42bc58f9 100644 --- a/include/asm-generic/seccomp.h +++ b/include/asm-generic/seccomp.h @@ -33,7 +33,7 @@ static inline const int *get_compat_mode1_syscalls(void) static const int mode1_syscalls_32[] = { __NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32, - 0, /* null terminated */ + -1, /* negative terminated */ }; return mode1_syscalls_32; } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 0ed57e8c49d0..866a432cd746 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -742,20 +742,20 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, */ static const int mode1_syscalls[] = { __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, - 0, /* null terminated */ + -1, /* negative terminated */ }; static void __secure_computing_strict(int this_syscall) { - const int *syscall_whitelist = mode1_syscalls; + const int *allowed_syscalls = mode1_syscalls; #ifdef CONFIG_COMPAT if (in_compat_syscall()) - syscall_whitelist = get_compat_mode1_syscalls(); + allowed_syscalls = get_compat_mode1_syscalls(); #endif do { - if (*syscall_whitelist == this_syscall) + if (*allowed_syscalls == this_syscall) return; - } while (*++syscall_whitelist); + } while (*++allowed_syscalls != -1); #ifdef SECCOMP_DEBUG dump_stack(); -- cgit v1.2.3 From bc885f1ab6de0d38c6956a71b0126543b64875b0 Mon Sep 17 00:00:00 2001 From: Bruno Meneguele Date: Fri, 10 Jul 2020 14:44:23 -0300 Subject: doc:kmsg: explicitly state the return value in case of SEEK_CUR The commit 625d3449788f ("Revert "kernel/printk: add kmsg SEEK_CUR handling"") reverted a change done to the return value in case a SEEK_CUR operation was performed for kmsg buffer based on the fact that different userspace apps were handling the new return value (-ESPIPE) in different ways, breaking them. At the same time -ESPIPE was the wrong decision because kmsg /does support/ seek() but doesn't follow the "normal" behavior userspace is used to. Because of that and also considering the time -EINVAL has been used, it was decided to keep this way to avoid more userspace breakage. This patch adds an official statement to the kmsg documentation pointing to the current return value for SEEK_CUR, -EINVAL, thus userspace libraries and apps can refer to it for a definitive guide on what to expect. Signed-off-by: Bruno Meneguele Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20200710174423.10480-1-bmeneg@redhat.com --- Documentation/ABI/testing/dev-kmsg | 11 +++++++++++ kernel/printk/printk.c | 8 ++++++++ 2 files changed, 19 insertions(+) (limited to 'kernel') diff --git a/Documentation/ABI/testing/dev-kmsg b/Documentation/ABI/testing/dev-kmsg index 1e6c28b1942b..a917efc289a2 100644 --- a/Documentation/ABI/testing/dev-kmsg +++ b/Documentation/ABI/testing/dev-kmsg @@ -61,6 +61,17 @@ Description: The /dev/kmsg character device node provides userspace access SEEK_CUR is not supported, returning -ESPIPE (invalid seek) to errno whenever requested. + Other seek operations or offsets are not supported because of + the special behavior this device has. The device allows to read + or write only whole variable length messages (records) that are + stored in a ring buffer. + + Because of the non-standard behavior also the error values are + non-standard. -ESPIPE is returned for non-zero offset. -EINVAL + is returned for other operations, e.g. SEEK_CUR. This behavior + and values are historical and could not be modified without the + risk of breaking userspace. + The output format consists of a prefix carrying the syslog prefix including priority and facility, the 64 bit message sequence number and the monotonic timestamp in microseconds, diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 8c14835be46c..5f6eca65dd9a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -943,6 +943,14 @@ out: return ret; } +/* + * Be careful when modifying this function!!! + * + * Only few operations are supported because the device works only with the + * entire variable length messages (records). Non-standard values are + * returned in the other cases and has been this way for quite some time. + * User space applications might depend on this behavior. + */ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) { struct devkmsg_user *user = file->private_data; -- cgit v1.2.3 From c9a0f3b85e09dd16665b639cb884490410619434 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 11 Jul 2020 23:53:24 +0200 Subject: bpf: Resolve BTF IDs in vmlinux image Using BTF_ID_LIST macro to define lists for several helpers using BTF arguments. And running resolve_btfids on vmlinux elf object during linking, so the .BTF_ids section gets the IDs resolved. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Tested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200711215329.41165-5-jolsa@kernel.org --- Makefile | 3 ++- kernel/bpf/stackmap.c | 5 ++++- kernel/trace/bpf_trace.c | 9 +++++++-- net/core/filter.c | 9 +++++++-- scripts/link-vmlinux.sh | 6 ++++++ 5 files changed, 26 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/Makefile b/Makefile index 017e775b3288..462f1a75fcb3 100644 --- a/Makefile +++ b/Makefile @@ -448,6 +448,7 @@ OBJSIZE = $(CROSS_COMPILE)size STRIP = $(CROSS_COMPILE)strip endif PAHOLE = pahole +RESOLVE_BTFIDS = $(objtree)/tools/bpf/resolve_btfids/resolve_btfids LEX = flex YACC = bison AWK = awk @@ -510,7 +511,7 @@ GCC_PLUGINS_CFLAGS := CLANG_FLAGS := export ARCH SRCARCH CONFIG_SHELL BASH HOSTCC KBUILD_HOSTCFLAGS CROSS_COMPILE LD CC -export CPP AR NM STRIP OBJCOPY OBJDUMP OBJSIZE READELF PAHOLE LEX YACC AWK INSTALLKERNEL +export CPP AR NM STRIP OBJCOPY OBJDUMP OBJSIZE READELF PAHOLE RESOLVE_BTFIDS LEX YACC AWK INSTALLKERNEL export PERL PYTHON PYTHON3 CHECK CHECKFLAGS MAKE UTS_MACHINE HOSTCXX export KGZIP KBZIP2 KLZOP LZMA LZ4 XZ export KBUILD_HOSTCXXFLAGS KBUILD_HOSTLDFLAGS KBUILD_HOSTLDLIBS LDFLAGS_MODULE diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index a6c361ed7937..48d8e739975f 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "percpu_freelist.h" #define STACK_CREATE_FLAG_MASK \ @@ -576,7 +577,9 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, return __bpf_get_stack(regs, task, buf, size, flags); } -static int bpf_get_task_stack_btf_ids[5]; +BTF_ID_LIST(bpf_get_task_stack_btf_ids) +BTF_ID(struct, task_struct) + const struct bpf_func_proto bpf_get_task_stack_proto = { .func = bpf_get_task_stack, .gpl_only = false, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e0b7775039ab..e178e8e32b33 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -710,7 +711,9 @@ out: return err; } -static int bpf_seq_printf_btf_ids[5]; +BTF_ID_LIST(bpf_seq_printf_btf_ids) +BTF_ID(struct, seq_file) + static const struct bpf_func_proto bpf_seq_printf_proto = { .func = bpf_seq_printf, .gpl_only = true, @@ -728,7 +731,9 @@ BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) return seq_write(m, data, len) ? -EOVERFLOW : 0; } -static int bpf_seq_write_btf_ids[5]; +BTF_ID_LIST(bpf_seq_write_btf_ids) +BTF_ID(struct, seq_file) + static const struct bpf_func_proto bpf_seq_write_proto = { .func = bpf_seq_write, .gpl_only = true, diff --git a/net/core/filter.c b/net/core/filter.c index ddcc0d6209e1..4e572441e64a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -75,6 +75,7 @@ #include #include #include +#include /** * sk_filter_trim_cap - run a packet through a socket filter @@ -3779,7 +3780,9 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; -static int bpf_skb_output_btf_ids[5]; +BTF_ID_LIST(bpf_skb_output_btf_ids) +BTF_ID(struct, sk_buff) + const struct bpf_func_proto bpf_skb_output_proto = { .func = bpf_skb_event_output, .gpl_only = true, @@ -4173,7 +4176,9 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; -static int bpf_xdp_output_btf_ids[5]; +BTF_ID_LIST(bpf_xdp_output_btf_ids) +BTF_ID(struct, xdp_buff) + const struct bpf_func_proto bpf_xdp_output_proto = { .func = bpf_xdp_event_output, .gpl_only = true, diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 92dd745906f4..e26f02dbedee 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -336,6 +336,12 @@ fi vmlinux_link vmlinux "${kallsymso}" ${btf_vmlinux_bin_o} +# fill in BTF IDs +if [ -n "${CONFIG_DEBUG_INFO_BTF}" ]; then +info BTFIDS vmlinux +${RESOLVE_BTFIDS} vmlinux +fi + if [ -n "${CONFIG_BUILDTIME_TABLE_SORT}" ]; then info SORTTAB vmlinux if ! sorttable vmlinux; then -- cgit v1.2.3 From 138b9a0511c789f2451ff1d80e7fd3f9eef3a9e3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 11 Jul 2020 23:53:25 +0200 Subject: bpf: Remove btf_id helpers resolving Now when we moved the helpers btf_id arrays into .BTF_ids section, we can remove the code that resolve those IDs in runtime. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Tested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200711215329.41165-6-jolsa@kernel.org --- kernel/bpf/btf.c | 89 ++++---------------------------------------------------- 1 file changed, 5 insertions(+), 84 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4c3007f428b1..71140b73ae3c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4079,96 +4079,17 @@ error: return -EINVAL; } -static int __btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, - int arg) -{ - char fnname[KSYM_SYMBOL_LEN + 4] = "btf_"; - const struct btf_param *args; - const struct btf_type *t; - const char *tname, *sym; - u32 btf_id, i; - - if (IS_ERR(btf_vmlinux)) { - bpf_log(log, "btf_vmlinux is malformed\n"); - return -EINVAL; - } - - sym = kallsyms_lookup((long)fn, NULL, NULL, NULL, fnname + 4); - if (!sym) { - bpf_log(log, "kernel doesn't have kallsyms\n"); - return -EFAULT; - } - - for (i = 1; i <= btf_vmlinux->nr_types; i++) { - t = btf_type_by_id(btf_vmlinux, i); - if (BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) - continue; - tname = __btf_name_by_offset(btf_vmlinux, t->name_off); - if (!strcmp(tname, fnname)) - break; - } - if (i > btf_vmlinux->nr_types) { - bpf_log(log, "helper %s type is not found\n", fnname); - return -ENOENT; - } - - t = btf_type_by_id(btf_vmlinux, t->type); - if (!btf_type_is_ptr(t)) - return -EFAULT; - t = btf_type_by_id(btf_vmlinux, t->type); - if (!btf_type_is_func_proto(t)) - return -EFAULT; - - args = (const struct btf_param *)(t + 1); - if (arg >= btf_type_vlen(t)) { - bpf_log(log, "bpf helper %s doesn't have %d-th argument\n", - fnname, arg); - return -EINVAL; - } - - t = btf_type_by_id(btf_vmlinux, args[arg].type); - if (!btf_type_is_ptr(t) || !t->type) { - /* anything but the pointer to struct is a helper config bug */ - bpf_log(log, "ARG_PTR_TO_BTF is misconfigured\n"); - return -EFAULT; - } - btf_id = t->type; - t = btf_type_by_id(btf_vmlinux, t->type); - /* skip modifiers */ - while (btf_type_is_modifier(t)) { - btf_id = t->type; - t = btf_type_by_id(btf_vmlinux, t->type); - } - if (!btf_type_is_struct(t)) { - bpf_log(log, "ARG_PTR_TO_BTF is not a struct\n"); - return -EFAULT; - } - bpf_log(log, "helper %s arg%d has btf_id %d struct %s\n", fnname + 4, - arg, btf_id, __btf_name_by_offset(btf_vmlinux, t->name_off)); - return btf_id; -} - int btf_resolve_helper_id(struct bpf_verifier_log *log, const struct bpf_func_proto *fn, int arg) { - int *btf_id = &fn->btf_id[arg]; - int ret; + int id; if (fn->arg_type[arg] != ARG_PTR_TO_BTF_ID) return -EINVAL; - - ret = READ_ONCE(*btf_id); - if (ret) - return ret; - /* ok to race the search. The result is the same */ - ret = __btf_resolve_helper_id(log, fn->func, arg); - if (!ret) { - /* Function argument cannot be type 'void' */ - bpf_log(log, "BTF resolution bug\n"); - return -EFAULT; - } - WRITE_ONCE(*btf_id, ret); - return ret; + id = fn->btf_id[arg]; + if (!id || id > btf_vmlinux->nr_types) + return -EINVAL; + return id; } static int __get_type_size(struct btf *btf, u32 btf_id, -- cgit v1.2.3 From 49f4e6720748c778795df62d222d0200b61345be Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 11 Jul 2020 23:53:26 +0200 Subject: bpf: Use BTF_ID to resolve bpf_ctx_convert struct This way the ID is resolved during compile time, and we can remove the runtime name search. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Tested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200711215329.41165-7-jolsa@kernel.org --- kernel/bpf/btf.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 71140b73ae3c..a710e3ee1f18 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -3621,12 +3622,15 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, return kern_ctx_type->type; } +BTF_ID_LIST(bpf_ctx_convert_btf_id) +BTF_ID(struct, bpf_ctx_convert) + struct btf *btf_parse_vmlinux(void) { struct btf_verifier_env *env = NULL; struct bpf_verifier_log *log; struct btf *btf = NULL; - int err, btf_id; + int err; env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); if (!env) @@ -3659,14 +3663,8 @@ struct btf *btf_parse_vmlinux(void) if (err) goto errout; - /* find struct bpf_ctx_convert for type checking later */ - btf_id = btf_find_by_name_kind(btf, "bpf_ctx_convert", BTF_KIND_STRUCT); - if (btf_id < 0) { - err = btf_id; - goto errout; - } /* btf_parse_vmlinux() runs under bpf_verifier_lock */ - bpf_ctx_convert.t = btf_type_by_id(btf, btf_id); + bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]); /* find bpf map structs for map_ptr access checking */ err = btf_vmlinux_map_ids_init(btf, log); -- cgit v1.2.3 From 4969f8a073977123504609d7310b42a588297aa4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 9 Jun 2020 16:21:38 -0700 Subject: pidfd: Add missing sock updates for pidfd_getfd() The sock counting (sock_update_netprioidx() and sock_update_classid()) was missing from pidfd's implementation of received fd installation. Add a call to the new __receive_sock() helper. Cc: Christian Brauner Cc: Christoph Hellwig Cc: Sargun Dhillon Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org Fixes: 8649c322f75c ("pid: Implement pidfd_getfd syscall") Signed-off-by: Kees Cook --- kernel/pid.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index f1496b757162..ee58530d1aca 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -42,6 +42,7 @@ #include #include #include +#include struct pid init_struct_pid = { .count = REFCOUNT_INIT(1), @@ -642,10 +643,12 @@ static int pidfd_getfd(struct pid *pid, int fd) } ret = get_unused_fd_flags(O_CLOEXEC); - if (ret < 0) + if (ret < 0) { fput(file); - else + } else { + __receive_sock(file); fd_install(ret, file); + } return ret; } -- cgit v1.2.3 From 910d2f16ac90463a1f5b03d53246c443e2b354b9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 9 Jun 2020 16:21:38 -0700 Subject: pidfd: Replace open-coded receive_fd() Replace the open-coded version of receive_fd() with a call to the new helper. Thanks to Vamshi K Sthambamkadi for catching a missed fput() in an earlier version of this patch. Cc: Christoph Hellwig Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Sargun Dhillon Acked-by: Christian Brauner Signed-off-by: Kees Cook --- kernel/pid.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index ee58530d1aca..da5aea5f04fa 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -636,19 +636,8 @@ static int pidfd_getfd(struct pid *pid, int fd) if (IS_ERR(file)) return PTR_ERR(file); - ret = security_file_receive(file); - if (ret) { - fput(file); - return ret; - } - - ret = get_unused_fd_flags(O_CLOEXEC); - if (ret < 0) { - fput(file); - } else { - __receive_sock(file); - fd_install(ret, file); - } + ret = receive_fd(file, O_CLOEXEC); + fput(file); return ret; } -- cgit v1.2.3 From ac5a72ea5c8989871e61f6bb0852e0f91de51ebe Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Mon, 13 Jul 2020 12:52:33 +0100 Subject: bpf: Use dedicated bpf_trace_printk event instead of trace_printk() The bpf helper bpf_trace_printk() uses trace_printk() under the hood. This leads to an alarming warning message originating from trace buffer allocation which occurs the first time a program using bpf_trace_printk() is loaded. We can instead create a trace event for bpf_trace_printk() and enable it in-kernel when/if we encounter a program using the bpf_trace_printk() helper. With this approach, trace_printk() is not used directly and no warning message appears. This work was started by Steven (see Link) and finished by Alan; added Steven's Signed-off-by with his permission. Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Alan Maguire Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20200628194334.6238b933@oasis.local.home Link: https://lore.kernel.org/bpf/1594641154-18897-2-git-send-email-alan.maguire@oracle.com --- kernel/trace/Makefile | 2 ++ kernel/trace/bpf_trace.c | 42 +++++++++++++++++++++++++++++++++++++----- kernel/trace/bpf_trace.h | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 5 deletions(-) create mode 100644 kernel/trace/bpf_trace.h (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 6575bb0a0434..aeba5ee7325a 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -31,6 +31,8 @@ ifdef CONFIG_GCOV_PROFILE_FTRACE GCOV_PROFILE := y endif +CFLAGS_bpf_trace.o := -I$(src) + CFLAGS_trace_benchmark.o := -I$(src) CFLAGS_trace_events_filter.o := -I$(src) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e178e8e32b33..3cc0dcb60ca2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -20,6 +21,9 @@ #include "trace_probe.h" #include "trace.h" +#define CREATE_TRACE_POINTS +#include "bpf_trace.h" + #define bpf_event_rcu_dereference(p) \ rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex)) @@ -375,6 +379,30 @@ static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, } } +static DEFINE_RAW_SPINLOCK(trace_printk_lock); + +#define BPF_TRACE_PRINTK_SIZE 1024 + +static inline __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...) +{ + static char buf[BPF_TRACE_PRINTK_SIZE]; + unsigned long flags; + va_list ap; + int ret; + + raw_spin_lock_irqsave(&trace_printk_lock, flags); + va_start(ap, fmt); + ret = vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + /* vsnprintf() will not append null for zero-length strings */ + if (ret == 0) + buf[0] = '\0'; + trace_bpf_trace_printk(buf); + raw_spin_unlock_irqrestore(&trace_printk_lock, flags); + + return ret; +} + /* * Only limited trace_printk() conversion specifiers allowed: * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pB %pks %pus %s @@ -484,8 +512,7 @@ fmt_next: */ #define __BPF_TP_EMIT() __BPF_ARG3_TP() #define __BPF_TP(...) \ - __trace_printk(0 /* Fake ip */, \ - fmt, ##__VA_ARGS__) + bpf_do_trace_printk(fmt, ##__VA_ARGS__) #define __BPF_ARG1_TP(...) \ ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \ @@ -522,10 +549,15 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { const struct bpf_func_proto *bpf_get_trace_printk_proto(void) { /* - * this program might be calling bpf_trace_printk, - * so allocate per-cpu printk buffers + * This program might be calling bpf_trace_printk, + * so enable the associated bpf_trace/bpf_trace_printk event. + * Repeat this each time as it is possible a user has + * disabled bpf_trace_printk events. By loading a program + * calling bpf_trace_printk() however the user has expressed + * the intent to see such events. */ - trace_printk_init_buffers(); + if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1)) + pr_warn_ratelimited("could not enable bpf_trace_printk events"); return &bpf_trace_printk_proto; } diff --git a/kernel/trace/bpf_trace.h b/kernel/trace/bpf_trace.h new file mode 100644 index 000000000000..9acbc11ac7bb --- /dev/null +++ b/kernel/trace/bpf_trace.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bpf_trace + +#if !defined(_TRACE_BPF_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) + +#define _TRACE_BPF_TRACE_H + +#include + +TRACE_EVENT(bpf_trace_printk, + + TP_PROTO(const char *bpf_string), + + TP_ARGS(bpf_string), + + TP_STRUCT__entry( + __string(bpf_string, bpf_string) + ), + + TP_fast_assign( + __assign_str(bpf_string, bpf_string); + ), + + TP_printk("%s", __get_str(bpf_string)) +); + +#endif /* _TRACE_BPF_TRACE_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE bpf_trace + +#include -- cgit v1.2.3 From 6ada7ba2fa7710eead191b69fd948569b05d0f61 Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Mon, 22 Jun 2020 17:09:13 +0800 Subject: PM: hibernate: fix white space in a few places In hibernate.c, some places lack of spaces while some places have redundant spaces. So fix them. Signed-off-by: Xiang Chen Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 02ec716a4927..5714f51ba9f8 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1062,7 +1062,7 @@ power_attr(disk); static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), + return sprintf(buf, "%d:%d\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); } @@ -1162,7 +1162,7 @@ static ssize_t reserved_size_store(struct kobject *kobj, power_attr(reserved_size); -static struct attribute * g[] = { +static struct attribute *g[] = { &disk_attr.attr, &resume_offset_attr.attr, &resume_attr.attr, @@ -1190,7 +1190,7 @@ static int __init resume_setup(char *str) if (noresume) return 1; - strncpy( resume_file, str, 255 ); + strncpy(resume_file, str, 255); return 1; } -- cgit v1.2.3 From 02d7f4005e942f75c63302d54a659e673edd00df Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 10 Jul 2020 22:12:10 +0300 Subject: PM: sleep: spread "const char *" correctness Fixed string literals can be referred to as "const char *". Signed-off-by: Alexey Dobriyan [ rjw: Minor subject edit ] Signed-off-by: Rafael J. Wysocki --- kernel/power/power.h | 2 +- kernel/power/snapshot.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index ba2094db6294..32fc89ac96c3 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -32,7 +32,7 @@ static inline int init_header_complete(struct swsusp_info *info) return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE); } -static inline char *check_image_kernel(struct swsusp_info *info) +static inline const char *check_image_kernel(struct swsusp_info *info) { return arch_hibernation_header_restore(info) ? "architecture specific data" : NULL; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 881128b9351e..cef154261fe2 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -2023,7 +2023,7 @@ static int init_header_complete(struct swsusp_info *info) return 0; } -static char *check_image_kernel(struct swsusp_info *info) +static const char *check_image_kernel(struct swsusp_info *info) { if (info->version_code != LINUX_VERSION_CODE) return "kernel version"; @@ -2176,7 +2176,7 @@ static void mark_unsafe_pages(struct memory_bitmap *bm) static int check_header(struct swsusp_info *info) { - char *reason; + const char *reason; reason = check_image_kernel(info); if (!reason && info->num_physpages != get_num_physpages()) -- cgit v1.2.3 From 7cf97b12545503992020796c74bd84078eb39299 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Tue, 2 Jun 2020 18:10:43 -0700 Subject: seccomp: Introduce addfd ioctl to seccomp user notifier The current SECCOMP_RET_USER_NOTIF API allows for syscall supervision over an fd. It is often used in settings where a supervising task emulates syscalls on behalf of a supervised task in userspace, either to further restrict the supervisee's syscall abilities or to circumvent kernel enforced restrictions the supervisor deems safe to lift (e.g. actually performing a mount(2) for an unprivileged container). While SECCOMP_RET_USER_NOTIF allows for the interception of any syscall, only a certain subset of syscalls could be correctly emulated. Over the last few development cycles, the set of syscalls which can't be emulated has been reduced due to the addition of pidfd_getfd(2). With this we are now able to, for example, intercept syscalls that require the supervisor to operate on file descriptors of the supervisee such as connect(2). However, syscalls that cause new file descriptors to be installed can not currently be correctly emulated since there is no way for the supervisor to inject file descriptors into the supervisee. This patch adds a new addfd ioctl to remove this restriction by allowing the supervisor to install file descriptors into the intercepted task. By implementing this feature via seccomp the supervisor effectively instructs the supervisee to install a set of file descriptors into its own file descriptor table during the intercepted syscall. This way it is possible to intercept syscalls such as open() or accept(), and install (or replace, like dup2(2)) the supervisor's resulting fd into the supervisee. One replacement use-case would be to redirect the stdout and stderr of a supervisee into log file descriptors opened by the supervisor. The ioctl handling is based on the discussions[1] of how Extensible Arguments should interact with ioctls. Instead of building size into the addfd structure, make it a function of the ioctl command (which is how sizes are normally passed to ioctls). To support forward and backward compatibility, just mask out the direction and size, and match everything. The size (and any future direction) checks are done along with copy_struct_from_user() logic. As a note, the seccomp_notif_addfd structure is laid out based on 8-byte alignment without requiring packing as there have been packing issues with uapi highlighted before[2][3]. Although we could overload the newfd field and use -1 to indicate that it is not to be used, doing so requires changing the size of the fd field, and introduces struct packing complexity. [1]: https://lore.kernel.org/lkml/87o8w9bcaf.fsf@mid.deneb.enyo.de/ [2]: https://lore.kernel.org/lkml/a328b91d-fd8f-4f27-b3c2-91a9c45f18c0@rasmusvillemoes.dk/ [3]: https://lore.kernel.org/lkml/20200612104629.GA15814@ircssh-2.c.rugged-nimbus-611.internal Cc: Christoph Hellwig Cc: Christian Brauner Cc: Tycho Andersen Cc: Jann Horn Cc: Robert Sesek Cc: Chris Palmer Cc: Al Viro Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-api@vger.kernel.org Suggested-by: Matt Denton Link: https://lore.kernel.org/r/20200603011044.7972-4-sargun@sargun.me Signed-off-by: Sargun Dhillon Reviewed-by: Will Drewry Co-developed-by: Kees Cook Signed-off-by: Kees Cook --- include/linux/seccomp.h | 4 + include/uapi/linux/seccomp.h | 22 ++++++ kernel/seccomp.c | 175 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 199 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index babcd6c02d09..881c90b6aa25 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -10,6 +10,10 @@ SECCOMP_FILTER_FLAG_NEW_LISTENER | \ SECCOMP_FILTER_FLAG_TSYNC_ESRCH) +/* sizeof() the first published struct seccomp_notif_addfd */ +#define SECCOMP_NOTIFY_ADDFD_SIZE_VER0 24 +#define SECCOMP_NOTIFY_ADDFD_SIZE_LATEST SECCOMP_NOTIFY_ADDFD_SIZE_VER0 + #ifdef CONFIG_SECCOMP #include diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 965290f7dcc2..6ba18b82a02e 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -113,6 +113,25 @@ struct seccomp_notif_resp { __u32 flags; }; +/* valid flags for seccomp_notif_addfd */ +#define SECCOMP_ADDFD_FLAG_SETFD (1UL << 0) /* Specify remote fd */ + +/** + * struct seccomp_notif_addfd + * @id: The ID of the seccomp notification + * @flags: SECCOMP_ADDFD_FLAG_* + * @srcfd: The local fd number + * @newfd: Optional remote FD number if SETFD option is set, otherwise 0. + * @newfd_flags: The O_* flags the remote FD should have applied + */ +struct seccomp_notif_addfd { + __u64 id; + __u32 flags; + __u32 srcfd; + __u32 newfd; + __u32 newfd_flags; +}; + #define SECCOMP_IOC_MAGIC '!' #define SECCOMP_IO(nr) _IO(SECCOMP_IOC_MAGIC, nr) #define SECCOMP_IOR(nr, type) _IOR(SECCOMP_IOC_MAGIC, nr, type) @@ -124,5 +143,8 @@ struct seccomp_notif_resp { #define SECCOMP_IOCTL_NOTIF_SEND SECCOMP_IOWR(1, \ struct seccomp_notif_resp) #define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOW(2, __u64) +/* On success, the return value is the remote process's added fd number */ +#define SECCOMP_IOCTL_NOTIF_ADDFD SECCOMP_IOW(3, \ + struct seccomp_notif_addfd) #endif /* _UAPI_LINUX_SECCOMP_H */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 866a432cd746..3ee59ce0a323 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -87,10 +87,42 @@ struct seccomp_knotif { long val; u32 flags; - /* Signals when this has entered SECCOMP_NOTIFY_REPLIED */ + /* + * Signals when this has changed states, such as the listener + * dying, a new seccomp addfd message, or changing to REPLIED + */ struct completion ready; struct list_head list; + + /* outstanding addfd requests */ + struct list_head addfd; +}; + +/** + * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages + * + * @file: A reference to the file to install in the other task + * @fd: The fd number to install it at. If the fd number is -1, it means the + * installing process should allocate the fd as normal. + * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC + * is allowed. + * @ret: The return value of the installing process. It is set to the fd num + * upon success (>= 0). + * @completion: Indicates that the installing process has completed fd + * installation, or gone away (either due to successful + * reply, or signal) + * + */ +struct seccomp_kaddfd { + struct file *file; + int fd; + unsigned int flags; + + /* To only be set on reply */ + int ret; + struct completion completion; + struct list_head list; }; /** @@ -793,6 +825,17 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter) return filter->notif->next_id++; } +static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd) +{ + /* + * Remove the notification, and reset the list pointers, indicating + * that it has been handled. + */ + list_del_init(&addfd->list); + addfd->ret = receive_fd_replace(addfd->fd, addfd->file, addfd->flags); + complete(&addfd->completion); +} + static int seccomp_do_user_notification(int this_syscall, struct seccomp_filter *match, const struct seccomp_data *sd) @@ -801,6 +844,7 @@ static int seccomp_do_user_notification(int this_syscall, u32 flags = 0; long ret = 0; struct seccomp_knotif n = {}; + struct seccomp_kaddfd *addfd, *tmp; mutex_lock(&match->notify_lock); err = -ENOSYS; @@ -813,6 +857,7 @@ static int seccomp_do_user_notification(int this_syscall, n.id = seccomp_next_notify_id(match); init_completion(&n.ready); list_add(&n.list, &match->notif->notifications); + INIT_LIST_HEAD(&n.addfd); up(&match->notif->request); wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM); @@ -821,17 +866,34 @@ static int seccomp_do_user_notification(int this_syscall, /* * This is where we wait for a reply from userspace. */ +wait: err = wait_for_completion_interruptible(&n.ready); mutex_lock(&match->notify_lock); if (err == 0) { + /* Check if we were woken up by a addfd message */ + addfd = list_first_entry_or_null(&n.addfd, + struct seccomp_kaddfd, list); + if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) { + seccomp_handle_addfd(addfd); + mutex_unlock(&match->notify_lock); + goto wait; + } ret = n.val; err = n.error; flags = n.flags; } + /* If there were any pending addfd calls, clear them out */ + list_for_each_entry_safe(addfd, tmp, &n.addfd, list) { + /* The process went away before we got a chance to handle it */ + addfd->ret = -ESRCH; + list_del_init(&addfd->list); + complete(&addfd->completion); + } + /* * Note that it's possible the listener died in between the time when - * we were notified of a respons (or a signal) and when we were able to + * we were notified of a response (or a signal) and when we were able to * re-acquire the lock, so only delete from the list if the * notification actually exists. * @@ -1069,6 +1131,11 @@ static int seccomp_notify_release(struct inode *inode, struct file *file) knotif->error = -ENOSYS; knotif->val = 0; + /* + * We do not need to wake up any pending addfd messages, as + * the notifier will do that for us, as this just looks + * like a standard reply. + */ complete(&knotif->ready); } @@ -1233,12 +1300,109 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter, return ret; } +static long seccomp_notify_addfd(struct seccomp_filter *filter, + struct seccomp_notif_addfd __user *uaddfd, + unsigned int size) +{ + struct seccomp_notif_addfd addfd; + struct seccomp_knotif *knotif; + struct seccomp_kaddfd kaddfd; + int ret; + + BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0); + BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST); + + if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 || size >= PAGE_SIZE) + return -EINVAL; + + ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size); + if (ret) + return ret; + + if (addfd.newfd_flags & ~O_CLOEXEC) + return -EINVAL; + + if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD) + return -EINVAL; + + if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD)) + return -EINVAL; + + kaddfd.file = fget(addfd.srcfd); + if (!kaddfd.file) + return -EBADF; + + kaddfd.flags = addfd.newfd_flags; + kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ? + addfd.newfd : -1; + init_completion(&kaddfd.completion); + + ret = mutex_lock_interruptible(&filter->notify_lock); + if (ret < 0) + goto out; + + knotif = find_notification(filter, addfd.id); + if (!knotif) { + ret = -ENOENT; + goto out_unlock; + } + + /* + * We do not want to allow for FD injection to occur before the + * notification has been picked up by a userspace handler, or after + * the notification has been replied to. + */ + if (knotif->state != SECCOMP_NOTIFY_SENT) { + ret = -EINPROGRESS; + goto out_unlock; + } + + list_add(&kaddfd.list, &knotif->addfd); + complete(&knotif->ready); + mutex_unlock(&filter->notify_lock); + + /* Now we wait for it to be processed or be interrupted */ + ret = wait_for_completion_interruptible(&kaddfd.completion); + if (ret == 0) { + /* + * We had a successful completion. The other side has already + * removed us from the addfd queue, and + * wait_for_completion_interruptible has a memory barrier upon + * success that lets us read this value directly without + * locking. + */ + ret = kaddfd.ret; + goto out; + } + + mutex_lock(&filter->notify_lock); + /* + * Even though we were woken up by a signal and not a successful + * completion, a completion may have happened in the mean time. + * + * We need to check again if the addfd request has been handled, + * and if not, we will remove it from the queue. + */ + if (list_empty(&kaddfd.list)) + ret = kaddfd.ret; + else + list_del(&kaddfd.list); + +out_unlock: + mutex_unlock(&filter->notify_lock); +out: + fput(kaddfd.file); + + return ret; +} + static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct seccomp_filter *filter = file->private_data; void __user *buf = (void __user *)arg; + /* Fixed-size ioctls */ switch (cmd) { case SECCOMP_IOCTL_NOTIF_RECV: return seccomp_notify_recv(filter, buf); @@ -1247,6 +1411,13 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR: case SECCOMP_IOCTL_NOTIF_ID_VALID: return seccomp_notify_id_valid(filter, buf); + } + + /* Extensible Argument ioctls */ +#define EA_IOCTL(cmd) ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK)) + switch (EA_IOCTL(cmd)) { + case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD): + return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd)); default: return -EINVAL; } -- cgit v1.2.3 From d3fa60d7bfdc9a0ff1a524bdef96b3db1fd62022 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Jul 2020 09:45:11 +0200 Subject: dma-mapping: move the remaining DMA API calls out of line For a long time the DMA API has been implemented inline in dma-mapping.h, but the function bodies can be quite large. Move them all out of line. This also removes all the dma_direct_* exports as those are just implementation details and should never be used by drivers directly. Signed-off-by: Christoph Hellwig Tested-by: Alexey Kardashevskiy Reviewed-by: Alexey Kardashevskiy --- include/linux/dma-direct.h | 58 +++++++++++ include/linux/dma-mapping.h | 247 ++++---------------------------------------- kernel/dma/direct.c | 9 -- kernel/dma/mapping.c | 164 +++++++++++++++++++++++++++++ 4 files changed, 244 insertions(+), 234 deletions(-) (limited to 'kernel') diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index ab2e20cba951..2b045c509146 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -87,4 +87,62 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, unsigned long attrs); int dma_direct_supported(struct device *dev, u64 mask); bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr); +dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, enum dma_data_direction dir, + unsigned long attrs); +int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, + enum dma_data_direction dir, unsigned long attrs); +dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, + size_t size, enum dma_data_direction dir, unsigned long attrs); + +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ + defined(CONFIG_SWIOTLB) +void dma_direct_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir); +void dma_direct_sync_sg_for_device(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir); +#else +static inline void dma_direct_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ +} +static inline void dma_direct_sync_sg_for_device(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ +} +#endif + +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ + defined(CONFIG_SWIOTLB) +void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, unsigned long attrs); +void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, unsigned long attrs); +void dma_direct_sync_single_for_cpu(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir); +void dma_direct_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir); +#else +static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ +} +static inline void dma_direct_unmap_sg(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir, + unsigned long attrs) +{ +} +static inline void dma_direct_sync_single_for_cpu(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ +} +static inline void dma_direct_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ +} +#endif + +size_t dma_direct_max_mapping_size(struct device *dev); + #endif /* _LINUX_DMA_DIRECT_H */ diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index a33ed3954ed4..bd0a6f5ee445 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -188,73 +188,6 @@ static inline int dma_mmap_from_global_coherent(struct vm_area_struct *vma, } #endif /* CONFIG_DMA_DECLARE_COHERENT */ -static inline bool dma_is_direct(const struct dma_map_ops *ops) -{ - return likely(!ops); -} - -/* - * All the dma_direct_* declarations are here just for the indirect call bypass, - * and must not be used directly drivers! - */ -dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs); -int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, - enum dma_data_direction dir, unsigned long attrs); -dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir, unsigned long attrs); - -#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ - defined(CONFIG_SWIOTLB) -void dma_direct_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir); -void dma_direct_sync_sg_for_device(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir); -#else -static inline void dma_direct_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ -} -static inline void dma_direct_sync_sg_for_device(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir) -{ -} -#endif - -#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ - defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ - defined(CONFIG_SWIOTLB) -void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs); -void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction dir, unsigned long attrs); -void dma_direct_sync_single_for_cpu(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir); -void dma_direct_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir); -#else -static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ -} -static inline void dma_direct_unmap_sg(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir, - unsigned long attrs) -{ -} -static inline void dma_direct_sync_single_for_cpu(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ -} -static inline void dma_direct_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir) -{ -} -#endif - -size_t dma_direct_max_mapping_size(struct device *dev); - #ifdef CONFIG_HAS_DMA #include @@ -271,164 +204,6 @@ static inline void set_dma_ops(struct device *dev, dev->dma_ops = dma_ops; } -static inline dma_addr_t dma_map_page_attrs(struct device *dev, - struct page *page, size_t offset, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - dma_addr_t addr; - - BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) - addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); - else - addr = ops->map_page(dev, page, offset, size, dir, attrs); - debug_dma_map_page(dev, page, offset, size, dir, addr); - - return addr; -} - -static inline void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) - dma_direct_unmap_page(dev, addr, size, dir, attrs); - else if (ops->unmap_page) - ops->unmap_page(dev, addr, size, dir, attrs); - debug_dma_unmap_page(dev, addr, size, dir); -} - -/* - * dma_maps_sg_attrs returns 0 on error and > 0 on success. - * It should never return a value < 0. - */ -static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir, - unsigned long attrs) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - int ents; - - BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) - ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); - else - ents = ops->map_sg(dev, sg, nents, dir, attrs); - BUG_ON(ents < 0); - debug_dma_map_sg(dev, sg, nents, ents, dir); - - return ents; -} - -static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir, - unsigned long attrs) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - debug_dma_unmap_sg(dev, sg, nents, dir); - if (dma_is_direct(ops)) - dma_direct_unmap_sg(dev, sg, nents, dir, attrs); - else if (ops->unmap_sg) - ops->unmap_sg(dev, sg, nents, dir, attrs); -} - -static inline dma_addr_t dma_map_resource(struct device *dev, - phys_addr_t phys_addr, - size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - dma_addr_t addr = DMA_MAPPING_ERROR; - - BUG_ON(!valid_dma_direction(dir)); - - /* Don't allow RAM to be mapped */ - if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) - return DMA_MAPPING_ERROR; - - if (dma_is_direct(ops)) - addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs); - else if (ops->map_resource) - addr = ops->map_resource(dev, phys_addr, size, dir, attrs); - - debug_dma_map_resource(dev, phys_addr, size, dir, addr); - return addr; -} - -static inline void dma_unmap_resource(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (!dma_is_direct(ops) && ops->unmap_resource) - ops->unmap_resource(dev, addr, size, dir, attrs); - debug_dma_unmap_resource(dev, addr, size, dir); -} - -static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, - size_t size, - enum dma_data_direction dir) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) - dma_direct_sync_single_for_cpu(dev, addr, size, dir); - else if (ops->sync_single_for_cpu) - ops->sync_single_for_cpu(dev, addr, size, dir); - debug_dma_sync_single_for_cpu(dev, addr, size, dir); -} - -static inline void dma_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, - enum dma_data_direction dir) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) - dma_direct_sync_single_for_device(dev, addr, size, dir); - else if (ops->sync_single_for_device) - ops->sync_single_for_device(dev, addr, size, dir); - debug_dma_sync_single_for_device(dev, addr, size, dir); -} - -static inline void -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) - dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir); - else if (ops->sync_sg_for_cpu) - ops->sync_sg_for_cpu(dev, sg, nelems, dir); - debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir); -} - -static inline void -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) - dma_direct_sync_sg_for_device(dev, sg, nelems, dir); - else if (ops->sync_sg_for_device) - ops->sync_sg_for_device(dev, sg, nelems, dir); - debug_dma_sync_sg_for_device(dev, sg, nelems, dir); - -} static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { @@ -439,6 +214,28 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) return 0; } +dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, + size_t offset, size_t size, enum dma_data_direction dir, + unsigned long attrs); +void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs); +int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, unsigned long attrs); +void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + unsigned long attrs); +dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, + size_t size, enum dma_data_direction dir, unsigned long attrs); +void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs); +void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir); +void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir); +void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir); +void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir); void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs); void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 67f060b86a73..e545d14fccee 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -315,7 +315,6 @@ void dma_direct_sync_single_for_device(struct device *dev, if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_device(paddr, size, dir); } -EXPORT_SYMBOL(dma_direct_sync_single_for_device); void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) @@ -335,7 +334,6 @@ void dma_direct_sync_sg_for_device(struct device *dev, dir); } } -EXPORT_SYMBOL(dma_direct_sync_sg_for_device); #endif #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ @@ -354,7 +352,6 @@ void dma_direct_sync_single_for_cpu(struct device *dev, if (unlikely(is_swiotlb_buffer(paddr))) swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU); } -EXPORT_SYMBOL(dma_direct_sync_single_for_cpu); void dma_direct_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) @@ -376,7 +373,6 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_cpu_all(); } -EXPORT_SYMBOL(dma_direct_sync_sg_for_cpu); void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -389,7 +385,6 @@ void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, if (unlikely(is_swiotlb_buffer(phys))) swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs); } -EXPORT_SYMBOL(dma_direct_unmap_page); void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) @@ -401,7 +396,6 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir, attrs); } -EXPORT_SYMBOL(dma_direct_unmap_sg); #endif dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, @@ -428,7 +422,6 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, arch_sync_dma_for_device(phys, size, dir); return dma_addr; } -EXPORT_SYMBOL(dma_direct_map_page); int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) @@ -450,7 +443,6 @@ out_unmap: dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); return 0; } -EXPORT_SYMBOL(dma_direct_map_sg); dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -467,7 +459,6 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, return dma_addr; } -EXPORT_SYMBOL(dma_direct_map_resource); int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index a8c18c9a796f..b53953024512 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -105,6 +105,170 @@ void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, } EXPORT_SYMBOL(dmam_alloc_attrs); +static inline bool dma_is_direct(const struct dma_map_ops *ops) +{ + return likely(!ops); +} + +dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, + size_t offset, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + dma_addr_t addr; + + BUG_ON(!valid_dma_direction(dir)); + if (dma_is_direct(ops)) + addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); + else + addr = ops->map_page(dev, page, offset, size, dir, attrs); + debug_dma_map_page(dev, page, offset, size, dir, addr); + + return addr; +} +EXPORT_SYMBOL(dma_map_page_attrs); + +void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_is_direct(ops)) + dma_direct_unmap_page(dev, addr, size, dir, attrs); + else if (ops->unmap_page) + ops->unmap_page(dev, addr, size, dir, attrs); + debug_dma_unmap_page(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_unmap_page_attrs); + +/* + * dma_maps_sg_attrs returns 0 on error and > 0 on success. + * It should never return a value < 0. + */ +int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + int ents; + + BUG_ON(!valid_dma_direction(dir)); + if (dma_is_direct(ops)) + ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); + else + ents = ops->map_sg(dev, sg, nents, dir, attrs); + BUG_ON(ents < 0); + debug_dma_map_sg(dev, sg, nents, ents, dir); + + return ents; +} +EXPORT_SYMBOL(dma_map_sg_attrs); + +void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + debug_dma_unmap_sg(dev, sg, nents, dir); + if (dma_is_direct(ops)) + dma_direct_unmap_sg(dev, sg, nents, dir, attrs); + else if (ops->unmap_sg) + ops->unmap_sg(dev, sg, nents, dir, attrs); +} +EXPORT_SYMBOL(dma_unmap_sg_attrs); + +dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + dma_addr_t addr = DMA_MAPPING_ERROR; + + BUG_ON(!valid_dma_direction(dir)); + + /* Don't allow RAM to be mapped */ + if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) + return DMA_MAPPING_ERROR; + + if (dma_is_direct(ops)) + addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs); + else if (ops->map_resource) + addr = ops->map_resource(dev, phys_addr, size, dir, attrs); + + debug_dma_map_resource(dev, phys_addr, size, dir, addr); + return addr; +} +EXPORT_SYMBOL(dma_map_resource); + +void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (!dma_is_direct(ops) && ops->unmap_resource) + ops->unmap_resource(dev, addr, size, dir, attrs); + debug_dma_unmap_resource(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_unmap_resource); + +void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_is_direct(ops)) + dma_direct_sync_single_for_cpu(dev, addr, size, dir); + else if (ops->sync_single_for_cpu) + ops->sync_single_for_cpu(dev, addr, size, dir); + debug_dma_sync_single_for_cpu(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_sync_single_for_cpu); + +void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_is_direct(ops)) + dma_direct_sync_single_for_device(dev, addr, size, dir); + else if (ops->sync_single_for_device) + ops->sync_single_for_device(dev, addr, size, dir); + debug_dma_sync_single_for_device(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_sync_single_for_device); + +void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_is_direct(ops)) + dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir); + else if (ops->sync_sg_for_cpu) + ops->sync_sg_for_cpu(dev, sg, nelems, dir); + debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir); +} +EXPORT_SYMBOL(dma_sync_sg_for_cpu); + +void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (dma_is_direct(ops)) + dma_direct_sync_sg_for_device(dev, sg, nelems, dir); + else if (ops->sync_sg_for_device) + ops->sync_sg_for_device(dev, sg, nelems, dir); + debug_dma_sync_sg_for_device(dev, sg, nelems, dir); +} +EXPORT_SYMBOL(dma_sync_sg_for_device); + /* * Create scatter-list for the already allocated DMA buffer. */ -- cgit v1.2.3 From b4174173005972f8f6497883d08d87e0aba1b604 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Jul 2020 09:47:08 +0200 Subject: dma-mapping: inline the fast path dma-direct calls Inline the single page map/unmap/sync dma-direct calls into the now out of line generic wrappers. This restores the behavior of a single function call that we had before moving the generic calls out of line. Besides the dma-mapping callers there are just a few callers in IOMMU drivers that have a bypass mode, and more of those are going to be switched to the generic bypass soon. Signed-off-by: Christoph Hellwig Tested-by: Alexey Kardashevskiy Reviewed-by: Alexey Kardashevskiy --- include/linux/dma-direct.h | 92 ++++++++++++++++++++++++++++++++++------------ kernel/dma/direct.c | 65 -------------------------------- 2 files changed, 69 insertions(+), 88 deletions(-) (limited to 'kernel') diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 2b045c509146..5a3ce2a24794 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -1,10 +1,16 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * Internals of the DMA direct mapping implementation. Only for use by the + * DMA mapping code and IOMMU drivers. + */ #ifndef _LINUX_DMA_DIRECT_H #define _LINUX_DMA_DIRECT_H 1 #include +#include #include /* for min_low_pfn */ #include +#include extern unsigned int zone_dma_bits; @@ -87,25 +93,17 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, unsigned long attrs); int dma_direct_supported(struct device *dev, u64 mask); bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr); -dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs); int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs); dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, size_t size, enum dma_data_direction dir, unsigned long attrs); +size_t dma_direct_max_mapping_size(struct device *dev); #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ defined(CONFIG_SWIOTLB) -void dma_direct_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir); -void dma_direct_sync_sg_for_device(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir); +void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir); #else -static inline void dma_direct_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ -} static inline void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { @@ -115,34 +113,82 @@ static inline void dma_direct_sync_sg_for_device(struct device *dev, #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ defined(CONFIG_SWIOTLB) -void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs); void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs); -void dma_direct_sync_single_for_cpu(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir); void dma_direct_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir); #else -static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ -} static inline void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { } +static inline void dma_direct_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ +} +#endif + +static inline void dma_direct_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ + phys_addr_t paddr = dma_to_phys(dev, addr); + + if (unlikely(is_swiotlb_buffer(paddr))) + swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); + + if (!dev_is_dma_coherent(dev)) + arch_sync_dma_for_device(paddr, size, dir); +} + static inline void dma_direct_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { + phys_addr_t paddr = dma_to_phys(dev, addr); + + if (!dev_is_dma_coherent(dev)) { + arch_sync_dma_for_cpu(paddr, size, dir); + arch_sync_dma_for_cpu_all(); + } + + if (unlikely(is_swiotlb_buffer(paddr))) + swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU); } -static inline void dma_direct_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir) + +static inline dma_addr_t dma_direct_map_page(struct device *dev, + struct page *page, unsigned long offset, size_t size, + enum dma_data_direction dir, unsigned long attrs) { + phys_addr_t phys = page_to_phys(page) + offset; + dma_addr_t dma_addr = phys_to_dma(dev, phys); + + if (unlikely(swiotlb_force == SWIOTLB_FORCE)) + return swiotlb_map(dev, phys, size, dir, attrs); + + if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + if (swiotlb_force != SWIOTLB_NO_FORCE) + return swiotlb_map(dev, phys, size, dir, attrs); + + dev_WARN_ONCE(dev, 1, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + return DMA_MAPPING_ERROR; + } + + if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + arch_sync_dma_for_device(phys, size, dir); + return dma_addr; } -#endif -size_t dma_direct_max_mapping_size(struct device *dev); +static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + phys_addr_t phys = dma_to_phys(dev, addr); + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + dma_direct_sync_single_for_cpu(dev, addr, size, dir); + + if (unlikely(is_swiotlb_buffer(phys))) + swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs); +} #endif /* _LINUX_DMA_DIRECT_H */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index e545d14fccee..bb0041e99659 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -10,11 +10,9 @@ #include #include #include -#include #include #include #include -#include /* * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use it @@ -304,18 +302,6 @@ void dma_direct_free(struct device *dev, size_t size, #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ defined(CONFIG_SWIOTLB) -void dma_direct_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ - phys_addr_t paddr = dma_to_phys(dev, addr); - - if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); - - if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_device(paddr, size, dir); -} - void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { @@ -339,20 +325,6 @@ void dma_direct_sync_sg_for_device(struct device *dev, #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ defined(CONFIG_SWIOTLB) -void dma_direct_sync_single_for_cpu(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ - phys_addr_t paddr = dma_to_phys(dev, addr); - - if (!dev_is_dma_coherent(dev)) { - arch_sync_dma_for_cpu(paddr, size, dir); - arch_sync_dma_for_cpu_all(); - } - - if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU); -} - void dma_direct_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { @@ -374,18 +346,6 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, arch_sync_dma_for_cpu_all(); } -void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - phys_addr_t phys = dma_to_phys(dev, addr); - - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - dma_direct_sync_single_for_cpu(dev, addr, size, dir); - - if (unlikely(is_swiotlb_buffer(phys))) - swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs); -} - void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { @@ -398,31 +358,6 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, } #endif -dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - phys_addr_t phys = page_to_phys(page) + offset; - dma_addr_t dma_addr = phys_to_dma(dev, phys); - - if (unlikely(swiotlb_force == SWIOTLB_FORCE)) - return swiotlb_map(dev, phys, size, dir, attrs); - - if (unlikely(!dma_capable(dev, dma_addr, size, true))) { - if (swiotlb_force != SWIOTLB_NO_FORCE) - return swiotlb_map(dev, phys, size, dir, attrs); - - dev_WARN_ONCE(dev, 1, - "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - return DMA_MAPPING_ERROR; - } - - if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_device(phys, size, dir); - return dma_addr; -} - int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { -- cgit v1.2.3 From 9b74ebb2b0f259474da65fa0178c657e5fa5c640 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 14 Jul 2020 15:56:34 +0200 Subject: cpumap: Use non-locked version __ptr_ring_consume_batched Commit 77361825bb01 ("bpf: cpumap use ptr_ring_consume_batched") changed away from using single frame ptr_ring dequeue (__ptr_ring_consume) to consume a batched, but it uses a locked version, which as the comment explain isn't needed. Change to use the non-locked version __ptr_ring_consume_batched. Fixes: 77361825bb01 ("bpf: cpumap use ptr_ring_consume_batched") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/a9c7d06f9a009e282209f0c8c7b2c5d9b9ad60b9.1594734381.git.lorenzo@kernel.org --- kernel/bpf/cpumap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index bd8658055c16..323c91c4fab0 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -259,7 +259,7 @@ static int cpu_map_kthread_run(void *data) * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. */ - n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); + n = __ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); for (i = 0; i < n; i++) { void *f = frames[i]; -- cgit v1.2.3 From 644bfe51fa49c22244d24e896cd3fe3ee2f2cfd1 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 14 Jul 2020 15:56:37 +0200 Subject: cpumap: Formalize map value as a named struct As it has been already done for devmap, introduce 'struct bpf_cpumap_val' to formalize the expected values that can be passed in for a CPUMAP. Update cpumap code to use the struct. Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/754f950674665dae6139c061d28c1d982aaf4170.1594734381.git.lorenzo@kernel.org --- include/uapi/linux/bpf.h | 9 +++++++++ kernel/bpf/cpumap.c | 28 +++++++++++++++------------- tools/include/uapi/linux/bpf.h | 9 +++++++++ 3 files changed, 33 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5e386389913a..109623527358 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3849,6 +3849,15 @@ struct bpf_devmap_val { } bpf_prog; }; +/* CPUMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. + */ +struct bpf_cpumap_val { + __u32 qsize; /* queue size to remote target CPU */ +}; + enum sk_action { SK_DROP = 0, SK_PASS, diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 323c91c4fab0..ff48dc00e8d0 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -52,7 +52,6 @@ struct xdp_bulk_queue { struct bpf_cpu_map_entry { u32 cpu; /* kthread CPU and map index */ int map_id; /* Back reference to map */ - u32 qsize; /* Queue size placeholder for map lookup */ /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ struct xdp_bulk_queue __percpu *bulkq; @@ -62,10 +61,13 @@ struct bpf_cpu_map_entry { /* Queue with potential multi-producers, and single-consumer kthread */ struct ptr_ring *queue; struct task_struct *kthread; - struct work_struct kthread_stop_wq; + + struct bpf_cpumap_val value; atomic_t refcnt; /* Control when this struct can be free'ed */ struct rcu_head rcu; + + struct work_struct kthread_stop_wq; }; struct bpf_cpu_map { @@ -307,8 +309,8 @@ static int cpu_map_kthread_run(void *data) return 0; } -static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, - int map_id) +static struct bpf_cpu_map_entry * +__cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; @@ -338,13 +340,13 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, if (!rcpu->queue) goto free_bulkq; - err = ptr_ring_init(rcpu->queue, qsize, gfp); + err = ptr_ring_init(rcpu->queue, value->qsize, gfp); if (err) goto free_queue; rcpu->cpu = cpu; rcpu->map_id = map_id; - rcpu->qsize = qsize; + rcpu->value.qsize = value->qsize; /* Setup kthread */ rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, @@ -437,12 +439,12 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + struct bpf_cpumap_val cpumap_value = {}; struct bpf_cpu_map_entry *rcpu; - /* Array index key correspond to CPU number */ u32 key_cpu = *(u32 *)key; - /* Value is the queue size */ - u32 qsize = *(u32 *)value; + + memcpy(&cpumap_value, value, map->value_size); if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; @@ -450,18 +452,18 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, return -E2BIG; if (unlikely(map_flags == BPF_NOEXIST)) return -EEXIST; - if (unlikely(qsize > 16384)) /* sanity limit on qsize */ + if (unlikely(cpumap_value.qsize > 16384)) /* sanity limit on qsize */ return -EOVERFLOW; /* Make sure CPU is a valid possible cpu */ if (key_cpu >= nr_cpumask_bits || !cpu_possible(key_cpu)) return -ENODEV; - if (qsize == 0) { + if (cpumap_value.qsize == 0) { rcpu = NULL; /* Same as deleting */ } else { /* Updating qsize cause re-allocation of bpf_cpu_map_entry */ - rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); + rcpu = __cpu_map_entry_alloc(&cpumap_value, key_cpu, map->id); if (!rcpu) return -ENOMEM; rcpu->cmap = cmap; @@ -523,7 +525,7 @@ static void *cpu_map_lookup_elem(struct bpf_map *map, void *key) struct bpf_cpu_map_entry *rcpu = __cpu_map_lookup_elem(map, *(u32 *)key); - return rcpu ? &rcpu->qsize : NULL; + return rcpu ? &rcpu->value : NULL; } static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 5e386389913a..109623527358 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3849,6 +3849,15 @@ struct bpf_devmap_val { } bpf_prog; }; +/* CPUMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. + */ +struct bpf_cpumap_val { + __u32 qsize; /* queue size to remote target CPU */ +}; + enum sk_action { SK_DROP = 0, SK_PASS, -- cgit v1.2.3 From 9216477449f33cdbc9c9a99d49f500b7fbb81702 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 14 Jul 2020 15:56:38 +0200 Subject: bpf: cpumap: Add the possibility to attach an eBPF program to cpumap Introduce the capability to attach an eBPF program to cpumap entries. The idea behind this feature is to add the possibility to define on which CPU run the eBPF program if the underlying hw does not support RSS. Current supported verdicts are XDP_DROP and XDP_PASS. This patch has been tested on Marvell ESPRESSObin using xdp_redirect_cpu sample available in the kernel tree to identify possible performance regressions. Results show there are no observable differences in packet-per-second: $./xdp_redirect_cpu --progname xdp_cpu_map0 --dev eth0 --cpu 1 rx: 354.8 Kpps rx: 356.0 Kpps rx: 356.8 Kpps rx: 356.3 Kpps rx: 356.6 Kpps rx: 356.6 Kpps rx: 356.7 Kpps rx: 355.8 Kpps rx: 356.8 Kpps rx: 356.8 Kpps Co-developed-by: Jesper Dangaard Brouer Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/5c9febdf903d810b3415732e5cd98491d7d9067a.1594734381.git.lorenzo@kernel.org --- include/linux/bpf.h | 6 ++ include/net/xdp.h | 5 ++ include/trace/events/xdp.h | 14 +++-- include/uapi/linux/bpf.h | 5 ++ kernel/bpf/cpumap.c | 121 ++++++++++++++++++++++++++++++++++++----- net/core/dev.c | 9 +++ tools/include/uapi/linux/bpf.h | 5 ++ 7 files changed, 148 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c67c88ad35f8..54ad426dbea1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1272,6 +1272,7 @@ struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_flush(void); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); +bool cpu_map_prog_allowed(struct bpf_map *map); /* Return map's numa specified by userspace */ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) @@ -1432,6 +1433,11 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, return 0; } +static inline bool cpu_map_prog_allowed(struct bpf_map *map) +{ + return false; +} + static inline struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) { diff --git a/include/net/xdp.h b/include/net/xdp.h index 5b383c450858..83b9e0142b52 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -98,6 +98,11 @@ struct xdp_frame { struct net_device *dev_rx; /* used by cpumap */ }; +struct xdp_cpumap_stats { + unsigned int pass; + unsigned int drop; +}; + /* Clear kernel pointers in xdp_frame */ static inline void xdp_scrub_frame(struct xdp_frame *frame) { diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index b73d3e141323..e2c99f5bee39 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -177,9 +177,9 @@ DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err, TRACE_EVENT(xdp_cpumap_kthread, TP_PROTO(int map_id, unsigned int processed, unsigned int drops, - int sched), + int sched, struct xdp_cpumap_stats *xdp_stats), - TP_ARGS(map_id, processed, drops, sched), + TP_ARGS(map_id, processed, drops, sched, xdp_stats), TP_STRUCT__entry( __field(int, map_id) @@ -188,6 +188,8 @@ TRACE_EVENT(xdp_cpumap_kthread, __field(unsigned int, drops) __field(unsigned int, processed) __field(int, sched) + __field(unsigned int, xdp_pass) + __field(unsigned int, xdp_drop) ), TP_fast_assign( @@ -197,16 +199,20 @@ TRACE_EVENT(xdp_cpumap_kthread, __entry->drops = drops; __entry->processed = processed; __entry->sched = sched; + __entry->xdp_pass = xdp_stats->pass; + __entry->xdp_drop = xdp_stats->drop; ), TP_printk("kthread" " cpu=%d map_id=%d action=%s" " processed=%u drops=%u" - " sched=%d", + " sched=%d" + " xdp_pass=%u xdp_drop=%u", __entry->cpu, __entry->map_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->processed, __entry->drops, - __entry->sched) + __entry->sched, + __entry->xdp_pass, __entry->xdp_drop) ); TRACE_EVENT(xdp_cpumap_enqueue, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 109623527358..c010b57fce3f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -227,6 +227,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_GETSOCKNAME, BPF_XDP_DEVMAP, BPF_CGROUP_INET_SOCK_RELEASE, + BPF_XDP_CPUMAP, __MAX_BPF_ATTACH_TYPE }; @@ -3856,6 +3857,10 @@ struct bpf_devmap_val { */ struct bpf_cpumap_val { __u32 qsize; /* queue size to remote target CPU */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; }; enum sk_action { diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index ff48dc00e8d0..b3a8aea81ee5 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -63,6 +63,7 @@ struct bpf_cpu_map_entry { struct task_struct *kthread; struct bpf_cpumap_val value; + struct bpf_prog *prog; atomic_t refcnt; /* Control when this struct can be free'ed */ struct rcu_head rcu; @@ -82,6 +83,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { + u32 value_size = attr->value_size; struct bpf_cpu_map *cmap; int err = -ENOMEM; u64 cost; @@ -92,7 +94,9 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) + (value_size != offsetofend(struct bpf_cpumap_val, qsize) && + value_size != offsetofend(struct bpf_cpumap_val, bpf_prog.fd)) || + attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); cmap = kzalloc(sizeof(*cmap), GFP_USER); @@ -214,6 +218,8 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring) static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) { if (atomic_dec_and_test(&rcpu->refcnt)) { + if (rcpu->prog) + bpf_prog_put(rcpu->prog); /* The queue should be empty at this point */ __cpu_map_ring_cleanup(rcpu->queue); ptr_ring_cleanup(rcpu->queue, NULL); @@ -222,6 +228,62 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) } } +static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, + void **frames, int n, + struct xdp_cpumap_stats *stats) +{ + struct xdp_rxq_info rxq; + struct xdp_buff xdp; + int i, nframes = 0; + + if (!rcpu->prog) + return n; + + rcu_read_lock(); + + xdp_set_return_frame_no_direct(); + xdp.rxq = &rxq; + + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + u32 act; + int err; + + rxq.dev = xdpf->dev_rx; + rxq.mem = xdpf->mem; + /* TODO: report queue_index to xdp_rxq_info */ + + xdp_convert_frame_to_buff(xdpf, &xdp); + + act = bpf_prog_run_xdp(rcpu->prog, &xdp); + switch (act) { + case XDP_PASS: + err = xdp_update_frame_from_buff(&xdp, xdpf); + if (err < 0) { + xdp_return_frame(xdpf); + stats->drop++; + } else { + frames[nframes++] = xdpf; + stats->pass++; + } + break; + default: + bpf_warn_invalid_xdp_action(act); + /* fallthrough */ + case XDP_DROP: + xdp_return_frame(xdpf); + stats->drop++; + break; + } + } + + xdp_clear_return_frame_no_direct(); + + rcu_read_unlock(); + + return nframes; +} + #define CPUMAP_BATCH 8 static int cpu_map_kthread_run(void *data) @@ -236,11 +298,12 @@ static int cpu_map_kthread_run(void *data) * kthread_stop signal until queue is empty. */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { + struct xdp_cpumap_stats stats = {}; /* zero stats */ + gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; unsigned int drops = 0, sched = 0; void *frames[CPUMAP_BATCH]; void *skbs[CPUMAP_BATCH]; - gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; - int i, n, m; + int i, n, m, nframes; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -261,8 +324,8 @@ static int cpu_map_kthread_run(void *data) * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. */ - n = __ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); - + n = __ptr_ring_consume_batched(rcpu->queue, frames, + CPUMAP_BATCH); for (i = 0; i < n; i++) { void *f = frames[i]; struct page *page = virt_to_page(f); @@ -274,15 +337,19 @@ static int cpu_map_kthread_run(void *data) prefetchw(page); } - m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs); - if (unlikely(m == 0)) { - for (i = 0; i < n; i++) - skbs[i] = NULL; /* effect: xdp_return_frame */ - drops = n; + /* Support running another XDP prog on this CPU */ + nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, n, &stats); + if (nframes) { + m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs); + if (unlikely(m == 0)) { + for (i = 0; i < nframes; i++) + skbs[i] = NULL; /* effect: xdp_return_frame */ + drops += nframes; + } } local_bh_disable(); - for (i = 0; i < n; i++) { + for (i = 0; i < nframes; i++) { struct xdp_frame *xdpf = frames[i]; struct sk_buff *skb = skbs[i]; int ret; @@ -299,7 +366,7 @@ static int cpu_map_kthread_run(void *data) drops++; } /* Feedback loop via tracepoint */ - trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched); + trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched, &stats); local_bh_enable(); /* resched point, may call do_softirq() */ } @@ -309,13 +376,38 @@ static int cpu_map_kthread_run(void *data) return 0; } +bool cpu_map_prog_allowed(struct bpf_map *map) +{ + return map->map_type == BPF_MAP_TYPE_CPUMAP && + map->value_size != offsetofend(struct bpf_cpumap_val, qsize); +} + +static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd) +{ + struct bpf_prog *prog; + + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->expected_attach_type != BPF_XDP_CPUMAP) { + bpf_prog_put(prog); + return -EINVAL; + } + + rcpu->value.bpf_prog.id = prog->aux->id; + rcpu->prog = prog; + + return 0; +} + static struct bpf_cpu_map_entry * __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) { + int numa, err, i, fd = value->bpf_prog.fd; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; struct xdp_bulk_queue *bq; - int numa, err, i; /* Have map->numa_node, but choose node of redirect target CPU */ numa = cpu_to_node(cpu); @@ -357,6 +449,9 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */ get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */ + if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd)) + goto free_ptr_ring; + /* Make sure kthread runs on a single CPU */ kthread_bind(rcpu->kthread, cpu); wake_up_process(rcpu->kthread); diff --git a/net/core/dev.c b/net/core/dev.c index b61075828358..b820527f0a8d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5448,6 +5448,8 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) for (i = 0; i < new->aux->used_map_cnt; i++) { if (dev_map_can_have_prog(new->aux->used_maps[i])) return -EINVAL; + if (cpu_map_prog_allowed(new->aux->used_maps[i])) + return -EINVAL; } } @@ -8875,6 +8877,13 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return -EINVAL; } + if (prog->expected_attach_type == BPF_XDP_CPUMAP) { + NL_SET_ERR_MSG(extack, + "BPF_XDP_CPUMAP programs can not be attached to a device"); + bpf_prog_put(prog); + return -EINVAL; + } + /* prog->aux->id may be 0 for orphaned device-bound progs */ if (prog->aux->id && prog->aux->id == prog_id) { bpf_prog_put(prog); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 109623527358..c010b57fce3f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -227,6 +227,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_GETSOCKNAME, BPF_XDP_DEVMAP, BPF_CGROUP_INET_SOCK_RELEASE, + BPF_XDP_CPUMAP, __MAX_BPF_ATTACH_TYPE }; @@ -3856,6 +3857,10 @@ struct bpf_devmap_val { */ struct bpf_cpumap_val { __u32 qsize; /* queue size to remote target CPU */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; }; enum sk_action { -- cgit v1.2.3 From 28b1520ebf81ced970141640d90279ac7b9f1f9a Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 14 Jul 2020 15:56:39 +0200 Subject: bpf: cpumap: Implement XDP_REDIRECT for eBPF programs attached to map entries Introduce XDP_REDIRECT support for eBPF programs attached to cpumap entries. This patch has been tested on Marvell ESPRESSObin using a modified version of xdp_redirect_cpu sample in order to attach a XDP program to CPUMAP entries to perform a redirect on the mvneta interface. In particular the following scenario has been tested: rq (cpu0) --> mvneta - XDP_REDIRECT (cpu0) --> CPUMAP - XDP_REDIRECT (cpu1) --> mvneta $./xdp_redirect_cpu -p xdp_cpu_map0 -d eth0 -c 1 -e xdp_redirect \ -f xdp_redirect_kern.o -m tx_port -r eth0 tx: 285.2 Kpps rx: 285.2 Kpps Attaching a simple XDP program on eth0 to perform XDP_TX gives comparable results: tx: 288.4 Kpps rx: 288.4 Kpps Co-developed-by: Jesper Dangaard Brouer Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/2cf8373a731867af302b00c4ff16c122630c4980.1594734381.git.lorenzo@kernel.org --- include/net/xdp.h | 1 + include/trace/events/xdp.h | 6 ++++-- kernel/bpf/cpumap.c | 17 +++++++++++++++-- 3 files changed, 20 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/net/xdp.h b/include/net/xdp.h index 83b9e0142b52..5be0d4d65b94 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -99,6 +99,7 @@ struct xdp_frame { }; struct xdp_cpumap_stats { + unsigned int redirect; unsigned int pass; unsigned int drop; }; diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index e2c99f5bee39..cd24e8a59529 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -190,6 +190,7 @@ TRACE_EVENT(xdp_cpumap_kthread, __field(int, sched) __field(unsigned int, xdp_pass) __field(unsigned int, xdp_drop) + __field(unsigned int, xdp_redirect) ), TP_fast_assign( @@ -201,18 +202,19 @@ TRACE_EVENT(xdp_cpumap_kthread, __entry->sched = sched; __entry->xdp_pass = xdp_stats->pass; __entry->xdp_drop = xdp_stats->drop; + __entry->xdp_redirect = xdp_stats->redirect; ), TP_printk("kthread" " cpu=%d map_id=%d action=%s" " processed=%u drops=%u" " sched=%d" - " xdp_pass=%u xdp_drop=%u", + " xdp_pass=%u xdp_drop=%u xdp_redirect=%u", __entry->cpu, __entry->map_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->processed, __entry->drops, __entry->sched, - __entry->xdp_pass, __entry->xdp_drop) + __entry->xdp_pass, __entry->xdp_drop, __entry->xdp_redirect) ); TRACE_EVENT(xdp_cpumap_enqueue, diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index b3a8aea81ee5..4c95d0615ca2 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -239,7 +239,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, if (!rcpu->prog) return n; - rcu_read_lock(); + rcu_read_lock_bh(); xdp_set_return_frame_no_direct(); xdp.rxq = &rxq; @@ -267,6 +267,16 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, stats->pass++; } break; + case XDP_REDIRECT: + err = xdp_do_redirect(xdpf->dev_rx, &xdp, + rcpu->prog); + if (unlikely(err)) { + xdp_return_frame(xdpf); + stats->drop++; + } else { + stats->redirect++; + } + break; default: bpf_warn_invalid_xdp_action(act); /* fallthrough */ @@ -277,9 +287,12 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, } } + if (stats->redirect) + xdp_do_flush_map(); + xdp_clear_return_frame_no_direct(); - rcu_read_unlock(); + rcu_read_unlock_bh(); /* resched point, may call do_softirq() */ return nframes; } -- cgit v1.2.3 From 3f649ab728cda8038259d8f14492fe400fbab911 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 Jun 2020 13:09:38 -0700 Subject: treewide: Remove uninitialized_var() usage Using uninitialized_var() is dangerous as it papers over real bugs[1] (or can in the future), and suppresses unrelated compiler warnings (e.g. "unused variable"). If the compiler thinks it is uninitialized, either simply initialize the variable or make compiler changes. In preparation for removing[2] the[3] macro[4], remove all remaining needless uses with the following script: git grep '\buninitialized_var\b' | cut -d: -f1 | sort -u | \ xargs perl -pi -e \ 's/\buninitialized_var\(([^\)]+)\)/\1/g; s:\s*/\* (GCC be quiet|to make compiler happy) \*/$::g;' drivers/video/fbdev/riva/riva_hw.c was manually tweaked to avoid pathological white-space. No outstanding warnings were found building allmodconfig with GCC 9.3.0 for x86_64, i386, arm64, arm, powerpc, powerpc64le, s390x, mips, sparc64, alpha, and m68k. [1] https://lore.kernel.org/lkml/20200603174714.192027-1-glider@google.com/ [2] https://lore.kernel.org/lkml/CA+55aFw+Vbj0i=1TGqCR5vQkCzWJ0QxK6CernOU6eedsudAixw@mail.gmail.com/ [3] https://lore.kernel.org/lkml/CA+55aFwgbgqhbp1fkxvRKEpzyR5J8n1vKT1VZdz9knmPuXhOeg@mail.gmail.com/ [4] https://lore.kernel.org/lkml/CA+55aFz2500WfbKXAx8s67wrm9=yVJu65TpLgN_ybYNv0VEOKA@mail.gmail.com/ Reviewed-by: Leon Romanovsky # drivers/infiniband and mlx4/mlx5 Acked-by: Jason Gunthorpe # IB Acked-by: Kalle Valo # wireless drivers Reviewed-by: Chao Yu # erofs Signed-off-by: Kees Cook --- arch/arm/mach-sa1100/assabet.c | 2 +- arch/arm/mm/alignment.c | 2 +- arch/ia64/kernel/process.c | 2 +- arch/ia64/mm/discontig.c | 2 +- arch/ia64/mm/tlb.c | 2 +- arch/mips/lib/dump_tlb.c | 2 +- arch/mips/mm/init.c | 2 +- arch/mips/mm/tlb-r4k.c | 6 +++--- arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +- arch/powerpc/kvm/powerpc.c | 2 +- arch/powerpc/platforms/52xx/mpc52xx_pic.c | 2 +- arch/s390/kernel/smp.c | 2 +- arch/x86/kernel/quirks.c | 10 +++++----- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/mmu/paging_tmpl.h | 2 +- arch/x86/kvm/x86.c | 2 +- block/blk-merge.c | 2 +- drivers/acpi/acpi_pad.c | 2 +- drivers/ata/libata-scsi.c | 2 +- drivers/atm/zatm.c | 2 +- drivers/block/drbd/drbd_nl.c | 6 +++--- drivers/block/rbd.c | 2 +- drivers/clk/clk-gate.c | 2 +- drivers/firewire/ohci.c | 14 +++++++------- drivers/gpu/drm/bridge/sil-sii8620.c | 2 +- drivers/gpu/drm/drm_edid.c | 2 +- drivers/gpu/drm/exynos/exynos_drm_dsi.c | 6 +++--- drivers/gpu/drm/i915/display/intel_fbc.c | 2 +- drivers/gpu/drm/i915/gt/intel_lrc.c | 2 +- drivers/gpu/drm/i915/intel_uncore.c | 2 +- drivers/gpu/drm/rockchip/dw-mipi-dsi-rockchip.c | 4 ++-- drivers/i2c/busses/i2c-rk3x.c | 2 +- drivers/ide/ide-acpi.c | 2 +- drivers/ide/ide-atapi.c | 2 +- drivers/ide/ide-io-std.c | 4 ++-- drivers/ide/ide-io.c | 8 ++++---- drivers/ide/ide-sysfs.c | 2 +- drivers/ide/umc8672.c | 2 +- drivers/idle/intel_idle.c | 2 +- drivers/infiniband/core/uverbs_cmd.c | 4 ++-- drivers/infiniband/hw/cxgb4/cm.c | 2 +- drivers/infiniband/hw/cxgb4/cq.c | 2 +- drivers/infiniband/hw/mlx4/qp.c | 6 +++--- drivers/infiniband/hw/mlx5/cq.c | 6 +++--- drivers/infiniband/hw/mlx5/devx.c | 2 +- drivers/infiniband/hw/mlx5/wr.c | 2 +- drivers/infiniband/hw/mthca/mthca_qp.c | 10 +++++----- drivers/infiniband/sw/siw/siw_qp_rx.c | 2 +- drivers/input/serio/serio_raw.c | 2 +- drivers/iommu/intel/iommu.c | 2 +- drivers/md/dm-io.c | 2 +- drivers/md/dm-ioctl.c | 2 +- drivers/md/dm-snap-persistent.c | 2 +- drivers/md/dm-table.c | 2 +- drivers/md/dm-writecache.c | 2 +- drivers/md/raid5.c | 2 +- drivers/media/dvb-frontends/rtl2832.c | 2 +- drivers/media/tuners/qt1010.c | 4 ++-- drivers/media/usb/gspca/vicam.c | 2 +- drivers/media/usb/uvc/uvc_video.c | 8 ++++---- drivers/memstick/host/jmb38x_ms.c | 2 +- drivers/memstick/host/tifm_ms.c | 2 +- drivers/mmc/host/sdhci.c | 2 +- drivers/mtd/nand/raw/nand_ecc.c | 2 +- drivers/mtd/nand/raw/s3c2410.c | 2 +- drivers/mtd/parsers/afs.c | 4 ++-- drivers/mtd/ubi/eba.c | 2 +- drivers/net/can/janz-ican3.c | 2 +- drivers/net/ethernet/broadcom/bnx2.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c | 4 ++-- drivers/net/ethernet/neterion/s2io.c | 2 +- drivers/net/ethernet/qlogic/qla3xxx.c | 2 +- drivers/net/ethernet/sun/cassini.c | 2 +- drivers/net/ethernet/sun/niu.c | 6 +++--- drivers/net/wan/z85230.c | 2 +- drivers/net/wireless/ath/ath10k/core.c | 2 +- drivers/net/wireless/ath/ath6kl/init.c | 2 +- drivers/net/wireless/ath/ath9k/init.c | 2 +- drivers/net/wireless/broadcom/b43/debugfs.c | 2 +- drivers/net/wireless/broadcom/b43/dma.c | 2 +- drivers/net/wireless/broadcom/b43/lo.c | 2 +- drivers/net/wireless/broadcom/b43/phy_n.c | 2 +- drivers/net/wireless/broadcom/b43/xmit.c | 12 ++++++------ drivers/net/wireless/broadcom/b43legacy/debugfs.c | 2 +- drivers/net/wireless/broadcom/b43legacy/main.c | 2 +- drivers/net/wireless/intel/iwlegacy/3945.c | 2 +- drivers/net/wireless/intel/iwlegacy/4965-mac.c | 2 +- drivers/net/wireless/realtek/rtlwifi/rtl8192cu/hw.c | 4 ++-- drivers/pci/pcie/aer.c | 2 +- drivers/platform/x86/hdaps.c | 4 ++-- drivers/scsi/dc395x.c | 2 +- drivers/scsi/pm8001/pm8001_hwi.c | 2 +- drivers/scsi/pm8001/pm80xx_hwi.c | 2 +- drivers/ssb/driver_chipcommon.c | 4 ++-- drivers/tty/cyclades.c | 2 +- drivers/tty/isicom.c | 2 +- drivers/usb/musb/cppi_dma.c | 2 +- drivers/usb/storage/sddr55.c | 4 ++-- drivers/vhost/net.c | 6 +++--- drivers/video/fbdev/matrox/matroxfb_maven.c | 6 +++--- drivers/video/fbdev/pm3fb.c | 6 +++--- drivers/video/fbdev/riva/riva_hw.c | 3 +-- drivers/virtio/virtio_ring.c | 6 +++--- fs/afs/dir.c | 2 +- fs/afs/security.c | 2 +- fs/dlm/netlink.c | 2 +- fs/erofs/data.c | 4 ++-- fs/erofs/zdata.c | 2 +- fs/fat/dir.c | 2 +- fs/fuse/control.c | 4 ++-- fs/fuse/cuse.c | 2 +- fs/fuse/file.c | 2 +- fs/gfs2/aops.c | 2 +- fs/gfs2/bmap.c | 2 +- fs/gfs2/lops.c | 2 +- fs/hfsplus/unicode.c | 2 +- fs/isofs/namei.c | 4 ++-- fs/jffs2/erase.c | 2 +- fs/nfsd/nfsctl.c | 2 +- fs/ocfs2/alloc.c | 4 ++-- fs/ocfs2/dir.c | 14 +++++++------- fs/ocfs2/extent_map.c | 4 ++-- fs/ocfs2/namei.c | 2 +- fs/ocfs2/refcounttree.c | 2 +- fs/ocfs2/xattr.c | 2 +- fs/omfs/file.c | 2 +- fs/overlayfs/copy_up.c | 2 +- fs/ubifs/commit.c | 6 +++--- fs/ubifs/dir.c | 2 +- fs/ubifs/file.c | 4 ++-- fs/ubifs/journal.c | 4 ++-- fs/ubifs/lpt.c | 2 +- fs/ubifs/tnc.c | 6 +++--- fs/ubifs/tnc_misc.c | 4 ++-- fs/udf/balloc.c | 2 +- fs/xfs/xfs_bmap_util.c | 2 +- include/net/flow_offload.h | 2 +- kernel/async.c | 4 ++-- kernel/audit.c | 2 +- kernel/debug/kdb/kdb_io.c | 2 +- kernel/dma/debug.c | 2 +- kernel/events/core.c | 2 +- kernel/events/uprobes.c | 2 +- kernel/exit.c | 2 +- kernel/futex.c | 14 +++++++------- kernel/locking/lockdep.c | 16 ++++++++-------- kernel/trace/ring_buffer.c | 2 +- lib/radix-tree.c | 2 +- lib/test_lockup.c | 2 +- mm/frontswap.c | 2 +- mm/ksm.c | 2 +- mm/memcontrol.c | 2 +- mm/memory.c | 2 +- mm/mempolicy.c | 4 ++-- mm/page_alloc.c | 2 +- mm/percpu.c | 2 +- mm/slub.c | 4 ++-- mm/swap.c | 4 ++-- net/dccp/options.c | 2 +- net/ipv4/netfilter/nf_socket_ipv4.c | 6 +++--- net/ipv6/ip6_flowlabel.c | 2 +- net/ipv6/netfilter/nf_socket_ipv6.c | 2 +- net/netfilter/nf_conntrack_ftp.c | 2 +- net/netfilter/nfnetlink_log.c | 2 +- net/netfilter/nfnetlink_queue.c | 4 ++-- net/sched/cls_flow.c | 2 +- net/sched/sch_cake.c | 2 +- net/sched/sch_cbq.c | 2 +- net/sched/sch_fq_codel.c | 2 +- net/sched/sch_fq_pie.c | 2 +- net/sched/sch_hfsc.c | 2 +- net/sched/sch_htb.c | 2 +- net/sched/sch_sfq.c | 2 +- net/sunrpc/svcsock.c | 4 ++-- net/sunrpc/xprtsock.c | 10 +++++----- net/tls/tls_sw.c | 2 +- sound/core/control_compat.c | 2 +- sound/isa/sb/sb16_csp.c | 2 +- sound/usb/endpoint.c | 2 +- 179 files changed, 278 insertions(+), 279 deletions(-) (limited to 'kernel') diff --git a/arch/arm/mach-sa1100/assabet.c b/arch/arm/mach-sa1100/assabet.c index aa265ede5730..2012fa8c28cf 100644 --- a/arch/arm/mach-sa1100/assabet.c +++ b/arch/arm/mach-sa1100/assabet.c @@ -652,7 +652,7 @@ static void __init map_sa1100_gpio_regs( void ) */ static void __init get_assabet_scr(void) { - unsigned long uninitialized_var(scr), i; + unsigned long scr, i; GPDR |= 0x3fc; /* Configure GPIO 9:2 as outputs */ GPSR = 0x3fc; /* Write 0xFF to GPIO 9:2 */ diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c index 81a627e6e1c5..f4bfc1cac91a 100644 --- a/arch/arm/mm/alignment.c +++ b/arch/arm/mm/alignment.c @@ -799,7 +799,7 @@ static int alignment_get_thumb(struct pt_regs *regs, u16 *ip, u16 *inst) static int do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { - union offset_union uninitialized_var(offset); + union offset_union offset; unsigned long instrptr; int (*handler)(unsigned long addr, u32 instr, struct pt_regs *regs); unsigned int type; diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 96dfb9e4b16f..da55b41ae33e 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -445,7 +445,7 @@ static void do_copy_task_regs (struct task_struct *task, struct unw_frame_info *info, void *arg) { unsigned long mask, sp, nat_bits = 0, ar_rnat, urbs_end, cfm; - unsigned long uninitialized_var(ip); /* GCC be quiet */ + unsigned long ip; elf_greg_t *dst = arg; struct pt_regs *pt; char nat; diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index dd8284bcbf16..da810ca234da 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -180,7 +180,7 @@ static void *per_cpu_node_setup(void *cpu_data, int node) void __init setup_per_cpu_areas(void) { struct pcpu_alloc_info *ai; - struct pcpu_group_info *uninitialized_var(gi); + struct pcpu_group_info *gi; unsigned int *cpu_map; void *base; unsigned long base_offset; diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c index 72cc568bc841..71c19918e387 100644 --- a/arch/ia64/mm/tlb.c +++ b/arch/ia64/mm/tlb.c @@ -369,7 +369,7 @@ EXPORT_SYMBOL(flush_tlb_range); void ia64_tlb_init(void) { - ia64_ptce_info_t uninitialized_var(ptce_info); /* GCC be quiet */ + ia64_ptce_info_t ptce_info; u64 tr_pgbits; long status; pal_vm_info_1_u_t vm_info_1; diff --git a/arch/mips/lib/dump_tlb.c b/arch/mips/lib/dump_tlb.c index 5a418ba5e75f..4256423632c4 100644 --- a/arch/mips/lib/dump_tlb.c +++ b/arch/mips/lib/dump_tlb.c @@ -79,7 +79,7 @@ static void dump_tlb(int first, int last) unsigned int pagemask, guestctl1 = 0, c0, c1, i; unsigned long asidmask = cpu_asid_mask(¤t_cpu_data); int asidwidth = DIV_ROUND_UP(ilog2(asidmask) + 1, 4); - unsigned long uninitialized_var(s_mmid); + unsigned long s_mmid; #ifdef CONFIG_32BIT bool xpa = cpu_has_xpa && (read_c0_pagegrain() & PG_ELPA); int pwidth = xpa ? 11 : 8; diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 336b58173dc7..6c7bbfe35ba3 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -83,7 +83,7 @@ void setup_zero_pages(void) static void *__kmap_pgprot(struct page *page, unsigned long addr, pgprot_t prot) { enum fixed_addresses idx; - unsigned int uninitialized_var(old_mmid); + unsigned int old_mmid; unsigned long vaddr, flags, entrylo; unsigned long old_ctx; pte_t pte; diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c index 6677dcb72580..38e2894d5fa3 100644 --- a/arch/mips/mm/tlb-r4k.c +++ b/arch/mips/mm/tlb-r4k.c @@ -119,7 +119,7 @@ void local_flush_tlb_range(struct vm_area_struct *vma, unsigned long start, if (size <= (current_cpu_data.tlbsizeftlbsets ? current_cpu_data.tlbsize / 8 : current_cpu_data.tlbsize / 2)) { - unsigned long old_entryhi, uninitialized_var(old_mmid); + unsigned long old_entryhi, old_mmid; int newpid = cpu_asid(cpu, mm); old_entryhi = read_c0_entryhi(); @@ -213,7 +213,7 @@ void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long page) int cpu = smp_processor_id(); if (cpu_context(cpu, vma->vm_mm) != 0) { - unsigned long uninitialized_var(old_mmid); + unsigned long old_mmid; unsigned long flags, old_entryhi; int idx; @@ -382,7 +382,7 @@ void add_wired_entry(unsigned long entrylo0, unsigned long entrylo1, #ifdef CONFIG_XPA panic("Broken for XPA kernels"); #else - unsigned int uninitialized_var(old_mmid); + unsigned int old_mmid; unsigned long flags; unsigned long wired; unsigned long old_pagemask; diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 6a73714759ba..777aa5625d5f 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -33,7 +33,7 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid, gva_t eaddr, void *to, void *from, unsigned long n) { - int uninitialized_var(old_pid), old_lpid; + int old_pid, old_lpid; unsigned long quadrant, ret = n; bool is_load = !!to; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index dd7d141e33e8..aaa7b62f2f82 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -1110,7 +1110,7 @@ static inline u32 dp_to_sp(u64 fprd) static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - u64 uninitialized_var(gpr); + u64 gpr; if (run->mmio.len > sizeof(gpr)) { printk(KERN_ERR "bad MMIO length: %d\n", run->mmio.len); diff --git a/arch/powerpc/platforms/52xx/mpc52xx_pic.c b/arch/powerpc/platforms/52xx/mpc52xx_pic.c index fc98912f42cf..76a8102bdb98 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_pic.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_pic.c @@ -340,7 +340,7 @@ static int mpc52xx_irqhost_map(struct irq_domain *h, unsigned int virq, { int l1irq; int l2irq; - struct irq_chip *uninitialized_var(irqchip); + struct irq_chip *irqchip; void *hndlr; int type; u32 reg; diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index e6be63ff162a..c2181471ac63 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -146,7 +146,7 @@ static int pcpu_sigp_retry(struct pcpu *pcpu, u8 order, u32 parm) static inline int pcpu_stopped(struct pcpu *pcpu) { - u32 uninitialized_var(status); + u32 status; if (__pcpu_sigp(pcpu->address, SIGP_SENSE, 0, &status) != SIGP_CC_STATUS_STORED) diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 896d74cb5081..1b10717c9321 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -95,7 +95,7 @@ static void ich_force_hpet_resume(void) static void ich_force_enable_hpet(struct pci_dev *dev) { u32 val; - u32 uninitialized_var(rcba); + u32 rcba; int err = 0; if (hpet_address || force_hpet_address) @@ -185,7 +185,7 @@ static void hpet_print_force_info(void) static void old_ich_force_hpet_resume(void) { u32 val; - u32 uninitialized_var(gen_cntl); + u32 gen_cntl; if (!force_hpet_address || !cached_dev) return; @@ -207,7 +207,7 @@ static void old_ich_force_hpet_resume(void) static void old_ich_force_enable_hpet(struct pci_dev *dev) { u32 val; - u32 uninitialized_var(gen_cntl); + u32 gen_cntl; if (hpet_address || force_hpet_address) return; @@ -298,7 +298,7 @@ static void vt8237_force_hpet_resume(void) static void vt8237_force_enable_hpet(struct pci_dev *dev) { - u32 uninitialized_var(val); + u32 val; if (hpet_address || force_hpet_address) return; @@ -429,7 +429,7 @@ static void nvidia_force_hpet_resume(void) static void nvidia_force_enable_hpet(struct pci_dev *dev) { - u32 uninitialized_var(val); + u32 val; if (hpet_address || force_hpet_address) return; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 76817d13c86e..deafcced65d2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1986,7 +1986,7 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, unsigned long data) { u64 *sptep; - struct rmap_iterator uninitialized_var(iter); + struct rmap_iterator iter; int young = 0; for_each_rmap_spte(rmap_head, &iter, sptep) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index bd70ece1ef8b..275564a0ebdb 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -314,7 +314,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, { int ret; pt_element_t pte; - pt_element_t __user *uninitialized_var(ptep_user); + pt_element_t __user *ptep_user; gfn_t table_gfn; u64 pt_access, pte_access; unsigned index, accessed_dirty, pte_pkey; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3b92db412335..598d5be960c9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9927,7 +9927,7 @@ void kvm_arch_sync_events(struct kvm *kvm) int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) { int i, r; - unsigned long hva, uninitialized_var(old_npages); + unsigned long hva, old_npages; struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *slot; diff --git a/block/blk-merge.c b/block/blk-merge.c index f0b0bae075a0..006402edef6b 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -473,7 +473,7 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, struct scatterlist *sglist, struct scatterlist **sg) { - struct bio_vec uninitialized_var(bvec), bvprv = { NULL }; + struct bio_vec bvec, bvprv = { NULL }; struct bvec_iter iter; int nsegs = 0; bool new_bio = false; diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c index e7dc0133f817..6cc4c92d9ff9 100644 --- a/drivers/acpi/acpi_pad.c +++ b/drivers/acpi/acpi_pad.c @@ -88,7 +88,7 @@ static void round_robin_cpu(unsigned int tsk_index) cpumask_var_t tmp; int cpu; unsigned long min_weight = -1; - unsigned long uninitialized_var(preferred_cpu); + unsigned long preferred_cpu; if (!alloc_cpumask_var(&tmp, GFP_KERNEL)) return; diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 46336084b1a9..ec233208585b 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -93,7 +93,7 @@ static ssize_t ata_scsi_park_show(struct device *device, struct ata_link *link; struct ata_device *dev; unsigned long now; - unsigned int uninitialized_var(msecs); + unsigned int msecs; int rc = 0; ap = ata_shost_to_port(sdev->host); diff --git a/drivers/atm/zatm.c b/drivers/atm/zatm.c index 57f97b95a453..165eebe06e39 100644 --- a/drivers/atm/zatm.c +++ b/drivers/atm/zatm.c @@ -940,7 +940,7 @@ static int open_tx_first(struct atm_vcc *vcc) vcc->qos.txtp.max_pcr >= ATM_OC3_PCR); if (unlimited && zatm_dev->ubr != -1) zatm_vcc->shaper = zatm_dev->ubr; else { - int uninitialized_var(pcr); + int pcr; if (unlimited) vcc->qos.txtp.max_sdu = ATM_MAX_AAL5_PDU; if ((zatm_vcc->shaper = alloc_shaper(vcc->dev,&pcr, diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index da4a3ebe04ef..c0017cc51ecc 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -3423,7 +3423,7 @@ int drbd_adm_dump_devices(struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *resource_filter; struct drbd_resource *resource; - struct drbd_device *uninitialized_var(device); + struct drbd_device *device; int minor, err, retcode; struct drbd_genlmsghdr *dh; struct device_info device_info; @@ -3512,7 +3512,7 @@ int drbd_adm_dump_connections(struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *resource_filter; struct drbd_resource *resource = NULL, *next_resource; - struct drbd_connection *uninitialized_var(connection); + struct drbd_connection *connection; int err = 0, retcode; struct drbd_genlmsghdr *dh; struct connection_info connection_info; @@ -3674,7 +3674,7 @@ int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *resource_filter; struct drbd_resource *resource; - struct drbd_device *uninitialized_var(device); + struct drbd_device *device; struct drbd_peer_device *peer_device = NULL; int minor, err, retcode; struct drbd_genlmsghdr *dh; diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4f61e9209461..d9c0e7d154f9 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1993,7 +1993,7 @@ static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req, struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; struct ceph_osd_data *osd_data; u64 objno; - u8 state, new_state, uninitialized_var(current_state); + u8 state, new_state, current_state; bool has_current_state; void *p; diff --git a/drivers/clk/clk-gate.c b/drivers/clk/clk-gate.c index 2ca1f2ac38a6..070dc47e95a1 100644 --- a/drivers/clk/clk-gate.c +++ b/drivers/clk/clk-gate.c @@ -56,7 +56,7 @@ static void clk_gate_endisable(struct clk_hw *hw, int enable) { struct clk_gate *gate = to_clk_gate(hw); int set = gate->flags & CLK_GATE_SET_TO_DISABLE ? 1 : 0; - unsigned long uninitialized_var(flags); + unsigned long flags; u32 reg; set ^= enable; diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 54fdc39cd0bc..7dde21b18b04 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -1099,7 +1099,7 @@ static void context_tasklet(unsigned long data) static int context_add_buffer(struct context *ctx) { struct descriptor_buffer *desc; - dma_addr_t uninitialized_var(bus_addr); + dma_addr_t bus_addr; int offset; /* @@ -1289,7 +1289,7 @@ static int at_context_queue_packet(struct context *ctx, struct fw_packet *packet) { struct fw_ohci *ohci = ctx->ohci; - dma_addr_t d_bus, uninitialized_var(payload_bus); + dma_addr_t d_bus, payload_bus; struct driver_data *driver_data; struct descriptor *d, *last; __le32 *header; @@ -2445,7 +2445,7 @@ static int ohci_set_config_rom(struct fw_card *card, { struct fw_ohci *ohci; __be32 *next_config_rom; - dma_addr_t uninitialized_var(next_config_rom_bus); + dma_addr_t next_config_rom_bus; ohci = fw_ohci(card); @@ -2933,10 +2933,10 @@ static struct fw_iso_context *ohci_allocate_iso_context(struct fw_card *card, int type, int channel, size_t header_size) { struct fw_ohci *ohci = fw_ohci(card); - struct iso_context *uninitialized_var(ctx); - descriptor_callback_t uninitialized_var(callback); - u64 *uninitialized_var(channels); - u32 *uninitialized_var(mask), uninitialized_var(regs); + struct iso_context *ctx; + descriptor_callback_t callback; + u64 *channels; + u32 *mask, regs; int index, ret = -EBUSY; spin_lock_irq(&ohci->lock); diff --git a/drivers/gpu/drm/bridge/sil-sii8620.c b/drivers/gpu/drm/bridge/sil-sii8620.c index 92acd336aa89..6cd8e012de5d 100644 --- a/drivers/gpu/drm/bridge/sil-sii8620.c +++ b/drivers/gpu/drm/bridge/sil-sii8620.c @@ -986,7 +986,7 @@ static void sii8620_set_auto_zone(struct sii8620 *ctx) static void sii8620_stop_video(struct sii8620 *ctx) { - u8 uninitialized_var(val); + u8 val; sii8620_write_seq_static(ctx, REG_TPI_INTR_EN, 0, diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c index fed653f13c26..b98fa573e706 100644 --- a/drivers/gpu/drm/drm_edid.c +++ b/drivers/gpu/drm/drm_edid.c @@ -3051,7 +3051,7 @@ static int drm_cvt_modes(struct drm_connector *connector, const u8 empty[3] = { 0, 0, 0 }; for (i = 0; i < 4; i++) { - int uninitialized_var(width), height; + int width, height; cvt = &(timing->data.other_data.data.cvt[i]); if (!memcmp(cvt->code, empty, 3)) diff --git a/drivers/gpu/drm/exynos/exynos_drm_dsi.c b/drivers/gpu/drm/exynos/exynos_drm_dsi.c index ee96a95fb6be..7a6f6df5e954 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_dsi.c +++ b/drivers/gpu/drm/exynos/exynos_drm_dsi.c @@ -547,9 +547,9 @@ static unsigned long exynos_dsi_pll_find_pms(struct exynos_dsi *dsi, unsigned long best_freq = 0; u32 min_delta = 0xffffffff; u8 p_min, p_max; - u8 _p, uninitialized_var(best_p); - u16 _m, uninitialized_var(best_m); - u8 _s, uninitialized_var(best_s); + u8 _p, best_p; + u16 _m, best_m; + u8 _s, best_s; p_min = DIV_ROUND_UP(fin, (12 * MHZ)); p_max = fin / (6 * MHZ); diff --git a/drivers/gpu/drm/i915/display/intel_fbc.c b/drivers/gpu/drm/i915/display/intel_fbc.c index 1c26673acb2d..c3a83c06c1b5 100644 --- a/drivers/gpu/drm/i915/display/intel_fbc.c +++ b/drivers/gpu/drm/i915/display/intel_fbc.c @@ -474,7 +474,7 @@ static int intel_fbc_alloc_cfb(struct drm_i915_private *dev_priv, unsigned int size, unsigned int fb_cpp) { struct intel_fbc *fbc = &dev_priv->fbc; - struct drm_mm_node *uninitialized_var(compressed_llb); + struct drm_mm_node *compressed_llb; int ret; drm_WARN_ON(&dev_priv->drm, diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 7c3d8ef4a47c..234bf45c290b 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -1103,7 +1103,7 @@ static struct i915_request * __unwind_incomplete_requests(struct intel_engine_cs *engine) { struct i915_request *rq, *rn, *active = NULL; - struct list_head *uninitialized_var(pl); + struct list_head *pl; int prio = I915_PRIORITY_INVALID; lockdep_assert_held(&engine->active.lock); diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c index a61cb8ca4d50..c8fd2bcb17ee 100644 --- a/drivers/gpu/drm/i915/intel_uncore.c +++ b/drivers/gpu/drm/i915/intel_uncore.c @@ -1991,7 +1991,7 @@ int __intel_wait_for_register_fw(struct intel_uncore *uncore, unsigned int slow_timeout_ms, u32 *out_value) { - u32 uninitialized_var(reg_value); + u32 reg_value; #define done (((reg_value = intel_uncore_read_fw(uncore, reg)) & mask) == value) int ret; diff --git a/drivers/gpu/drm/rockchip/dw-mipi-dsi-rockchip.c b/drivers/gpu/drm/rockchip/dw-mipi-dsi-rockchip.c index 3feff0c45b3f..542dcf7eddd6 100644 --- a/drivers/gpu/drm/rockchip/dw-mipi-dsi-rockchip.c +++ b/drivers/gpu/drm/rockchip/dw-mipi-dsi-rockchip.c @@ -517,8 +517,8 @@ dw_mipi_dsi_get_lane_mbps(void *priv_data, const struct drm_display_mode *mode, unsigned long best_freq = 0; unsigned long fvco_min, fvco_max, fin, fout; unsigned int min_prediv, max_prediv; - unsigned int _prediv, uninitialized_var(best_prediv); - unsigned long _fbdiv, uninitialized_var(best_fbdiv); + unsigned int _prediv, best_prediv; + unsigned long _fbdiv, best_fbdiv; unsigned long min_delta = ULONG_MAX; dsi->format = format; diff --git a/drivers/i2c/busses/i2c-rk3x.c b/drivers/i2c/busses/i2c-rk3x.c index bc698240c4aa..15324bfbc6cb 100644 --- a/drivers/i2c/busses/i2c-rk3x.c +++ b/drivers/i2c/busses/i2c-rk3x.c @@ -415,7 +415,7 @@ static void rk3x_i2c_handle_read(struct rk3x_i2c *i2c, unsigned int ipd) { unsigned int i; unsigned int len = i2c->msg->len - i2c->processed; - u32 uninitialized_var(val); + u32 val; u8 byte; /* we only care for MBRF here. */ diff --git a/drivers/ide/ide-acpi.c b/drivers/ide/ide-acpi.c index 7d4e5c08f133..05e18d658141 100644 --- a/drivers/ide/ide-acpi.c +++ b/drivers/ide/ide-acpi.c @@ -180,7 +180,7 @@ err: static acpi_handle ide_acpi_hwif_get_handle(ide_hwif_t *hwif) { struct device *dev = hwif->gendev.parent; - acpi_handle uninitialized_var(dev_handle); + acpi_handle dev_handle; u64 pcidevfn; acpi_handle chan_handle; int err; diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 80bc3bf82f4d..2162bc80f09e 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -609,7 +609,7 @@ static int ide_delayed_transfer_pc(ide_drive_t *drive) static ide_startstop_t ide_transfer_pc(ide_drive_t *drive) { - struct ide_atapi_pc *uninitialized_var(pc); + struct ide_atapi_pc *pc; ide_hwif_t *hwif = drive->hwif; struct request *rq = hwif->rq; ide_expiry_t *expiry; diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c index 18c20a7aa0ce..94bdcf1ea186 100644 --- a/drivers/ide/ide-io-std.c +++ b/drivers/ide/ide-io-std.c @@ -173,7 +173,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0; if (io_32bit) { - unsigned long uninitialized_var(flags); + unsigned long flags; if ((io_32bit & 2) && !mmio) { local_irq_save(flags); @@ -217,7 +217,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0; if (io_32bit) { - unsigned long uninitialized_var(flags); + unsigned long flags; if ((io_32bit & 2) && !mmio) { local_irq_save(flags); diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index c31f1d2b3b07..1a53c7a75224 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -622,12 +622,12 @@ static int drive_is_ready(ide_drive_t *drive) void ide_timer_expiry (struct timer_list *t) { ide_hwif_t *hwif = from_timer(hwif, t, timer); - ide_drive_t *uninitialized_var(drive); + ide_drive_t *drive; ide_handler_t *handler; unsigned long flags; int wait = -1; int plug_device = 0; - struct request *uninitialized_var(rq_in_flight); + struct request *rq_in_flight; spin_lock_irqsave(&hwif->lock, flags); @@ -780,13 +780,13 @@ irqreturn_t ide_intr (int irq, void *dev_id) { ide_hwif_t *hwif = (ide_hwif_t *)dev_id; struct ide_host *host = hwif->host; - ide_drive_t *uninitialized_var(drive); + ide_drive_t *drive; ide_handler_t *handler; unsigned long flags; ide_startstop_t startstop; irqreturn_t irq_ret = IRQ_NONE; int plug_device = 0; - struct request *uninitialized_var(rq_in_flight); + struct request *rq_in_flight; if (host->host_flags & IDE_HFLAG_SERIALIZE) { if (hwif != host->cur_port) diff --git a/drivers/ide/ide-sysfs.c b/drivers/ide/ide-sysfs.c index b9dfeb2e8bd6..c08a8a0916e2 100644 --- a/drivers/ide/ide-sysfs.c +++ b/drivers/ide/ide-sysfs.c @@ -131,7 +131,7 @@ static struct device_attribute *ide_port_attrs[] = { int ide_sysfs_register_port(ide_hwif_t *hwif) { - int i, uninitialized_var(rc); + int i, rc; for (i = 0; ide_port_attrs[i]; i++) { rc = device_create_file(hwif->portdev, ide_port_attrs[i]); diff --git a/drivers/ide/umc8672.c b/drivers/ide/umc8672.c index 870e235e30af..cf996f788292 100644 --- a/drivers/ide/umc8672.c +++ b/drivers/ide/umc8672.c @@ -108,7 +108,7 @@ static void umc_set_speeds(u8 speeds[]) static void umc_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive) { ide_hwif_t *mate = hwif->mate; - unsigned long uninitialized_var(flags); + unsigned long flags; const u8 pio = drive->pio_mode - XFER_PIO_0; printk("%s: setting umc8672 to PIO mode%d (speed %d)\n", diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index f4495841bf68..aae53e650638 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -132,7 +132,7 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state = &drv->states[index]; unsigned long eax = flg2MWAIT(state->flags); unsigned long ecx = 1; /* break on interrupt flag */ - bool uninitialized_var(tick); + bool tick; int cpu = smp_processor_id(); /* diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index b48b3f6e632d..76e7ec0f0775 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1584,7 +1584,7 @@ static int ib_uverbs_open_qp(struct uverbs_attr_bundle *attrs) struct ib_uverbs_create_qp_resp resp; struct ib_uqp_object *obj; struct ib_xrcd *xrcd; - struct ib_uobject *uninitialized_var(xrcd_uobj); + struct ib_uobject *xrcd_uobj; struct ib_qp *qp; struct ib_qp_open_attr attr = {}; int ret; @@ -3406,7 +3406,7 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs, struct ib_usrq_object *obj; struct ib_pd *pd; struct ib_srq *srq; - struct ib_uobject *uninitialized_var(xrcd_uobj); + struct ib_uobject *xrcd_uobj; struct ib_srq_init_attr attr; int ret; struct ib_device *ib_dev; diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 30e08bcc9afb..77bc02a9228e 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -3282,7 +3282,7 @@ static int get_lladdr(struct net_device *dev, struct in6_addr *addr, static int pick_local_ip6addrs(struct c4iw_dev *dev, struct iw_cm_id *cm_id) { - struct in6_addr uninitialized_var(addr); + struct in6_addr addr; struct sockaddr_in6 *la6 = (struct sockaddr_in6 *)&cm_id->m_local_addr; struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)&cm_id->m_remote_addr; diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index b1bb61c65f4f..352b8af1998a 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -754,7 +754,7 @@ skip_cqe: static int __c4iw_poll_cq_one(struct c4iw_cq *chp, struct c4iw_qp *qhp, struct ib_wc *wc, struct c4iw_srq *srq) { - struct t4_cqe uninitialized_var(cqe); + struct t4_cqe cqe; struct t4_wq *wq = qhp ? &qhp->wq : NULL; u32 credit = 0; u8 cqe_flushed; diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index cf51e3cbd969..f9ca6e000a81 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -3541,11 +3541,11 @@ static int _mlx4_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, int nreq; int err = 0; unsigned ind; - int uninitialized_var(size); - unsigned uninitialized_var(seglen); + int size; + unsigned seglen; __be32 dummy; __be32 *lso_wqe; - __be32 uninitialized_var(lso_hdr_sz); + __be32 lso_hdr_sz; __be32 blh; int i; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 0c18cb6a2f14..0133ebb8d740 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -925,8 +925,8 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_cq *cq = to_mcq(ibcq); u32 out[MLX5_ST_SZ_DW(create_cq_out)]; - int uninitialized_var(index); - int uninitialized_var(inlen); + int index; + int inlen; u32 *cqb = NULL; void *cqc; int cqe_size; @@ -1246,7 +1246,7 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) __be64 *pas; int page_shift; int inlen; - int uninitialized_var(cqe_size); + int cqe_size; unsigned long flags; if (!MLX5_CAP_GEN(dev->mdev, cq_resize)) { diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 9454a66c12cc..655ea9c984e1 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -2536,7 +2536,7 @@ static ssize_t devx_async_event_read(struct file *filp, char __user *buf, { struct devx_async_event_file *ev_file = filp->private_data; struct devx_event_subscription *event_sub; - struct devx_async_event_data *uninitialized_var(event); + struct devx_async_event_data *event; int ret = 0; size_t eventsz; bool omit_data; diff --git a/drivers/infiniband/hw/mlx5/wr.c b/drivers/infiniband/hw/mlx5/wr.c index 2c6df1c43b55..bc35dbe4855b 100644 --- a/drivers/infiniband/hw/mlx5/wr.c +++ b/drivers/infiniband/hw/mlx5/wr.c @@ -1249,7 +1249,7 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, struct mlx5_wqe_xrc_seg *xrc; struct mlx5_bf *bf; void *cur_edge; - int uninitialized_var(size); + int size; unsigned long flags; unsigned int idx; int err = 0; diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index d04c245359eb..c6e95d0d760a 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -1639,8 +1639,8 @@ int mthca_tavor_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, * without initializing f0 and size0, and they are in fact * never used uninitialized. */ - int uninitialized_var(size0); - u32 uninitialized_var(f0); + int size0; + u32 f0; int ind; u8 op0 = 0; @@ -1835,7 +1835,7 @@ int mthca_tavor_post_receive(struct ib_qp *ibqp, const struct ib_recv_wr *wr, * without initializing size0, and it is in fact never used * uninitialized. */ - int uninitialized_var(size0); + int size0; int ind; void *wqe; void *prev_wqe; @@ -1943,8 +1943,8 @@ int mthca_arbel_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, * without initializing f0 and size0, and they are in fact * never used uninitialized. */ - int uninitialized_var(size0); - u32 uninitialized_var(f0); + int size0; + u32 f0; int ind; u8 op0 = 0; diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c index 7271d705f4b0..857be5a7d0bd 100644 --- a/drivers/infiniband/sw/siw/siw_qp_rx.c +++ b/drivers/infiniband/sw/siw/siw_qp_rx.c @@ -333,7 +333,7 @@ static struct siw_wqe *siw_rqe_get(struct siw_qp *qp) struct siw_srq *srq; struct siw_wqe *wqe = NULL; bool srq_event = false; - unsigned long uninitialized_var(flags); + unsigned long flags; srq = qp->srq; if (srq) { diff --git a/drivers/input/serio/serio_raw.c b/drivers/input/serio/serio_raw.c index e9647ebff187..1e4770094415 100644 --- a/drivers/input/serio/serio_raw.c +++ b/drivers/input/serio/serio_raw.c @@ -159,7 +159,7 @@ static ssize_t serio_raw_read(struct file *file, char __user *buffer, { struct serio_raw_client *client = file->private_data; struct serio_raw *serio_raw = client->serio_raw; - char uninitialized_var(c); + char c; ssize_t read = 0; int error; diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index d759e7234e98..c29fd0991857 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2236,7 +2236,7 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, unsigned long nr_pages, int prot) { struct dma_pte *first_pte = NULL, *pte = NULL; - phys_addr_t uninitialized_var(pteval); + phys_addr_t pteval; unsigned long sg_res = 0; unsigned int largepage_lvl = 0; unsigned long lvl_pages = 0; diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 81ffc59d05c9..4312007d2d34 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -306,7 +306,7 @@ static void do_region(int op, int op_flags, unsigned region, struct request_queue *q = bdev_get_queue(where->bdev); unsigned short logical_block_size = queue_logical_block_size(q); sector_t num_sectors; - unsigned int uninitialized_var(special_cmd_max_sectors); + unsigned int special_cmd_max_sectors; /* * Reject unsupported discard and write same requests. diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 489935d5f22d..056d891a32a9 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1844,7 +1844,7 @@ static int ctl_ioctl(struct file *file, uint command, struct dm_ioctl __user *us int ioctl_flags; int param_flags; unsigned int cmd; - struct dm_ioctl *uninitialized_var(param); + struct dm_ioctl *param; ioctl_fn fn = NULL; size_t input_param_size; struct dm_ioctl param_kernel; diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 963d3774c93e..247089c2be25 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -613,7 +613,7 @@ static int persistent_read_metadata(struct dm_exception_store *store, chunk_t old, chunk_t new), void *callback_context) { - int r, uninitialized_var(new_snapshot); + int r, new_snapshot; struct pstore *ps = get_info(store); /* diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 8277b959e00b..89d7cda07640 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -642,7 +642,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table, */ unsigned short remaining = 0; - struct dm_target *uninitialized_var(ti); + struct dm_target *ti; struct queue_limits ti_limits; unsigned i; diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 30505d70f423..ec83edff25bf 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -1752,7 +1752,7 @@ static void writecache_writeback(struct work_struct *work) { struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work); struct blk_plug plug; - struct wc_entry *f, *uninitialized_var(g), *e = NULL; + struct wc_entry *f, *g, *e = NULL; struct rb_node *node, *next_node; struct list_head skipped; struct writeback_list wbl; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ab8067f9ce8c..401b366af076 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2601,7 +2601,7 @@ static void raid5_end_write_request(struct bio *bi) struct stripe_head *sh = bi->bi_private; struct r5conf *conf = sh->raid_conf; int disks = sh->disks, i; - struct md_rdev *uninitialized_var(rdev); + struct md_rdev *rdev; sector_t first_bad; int bad_sectors; int replacement = 0; diff --git a/drivers/media/dvb-frontends/rtl2832.c b/drivers/media/dvb-frontends/rtl2832.c index 6ec277421390..e5bffaaeed38 100644 --- a/drivers/media/dvb-frontends/rtl2832.c +++ b/drivers/media/dvb-frontends/rtl2832.c @@ -640,7 +640,7 @@ static int rtl2832_read_status(struct dvb_frontend *fe, enum fe_status *status) struct i2c_client *client = dev->client; struct dtv_frontend_properties *c = &fe->dtv_property_cache; int ret; - u32 uninitialized_var(tmp); + u32 tmp; u8 u8tmp, buf[2]; u16 u16tmp; diff --git a/drivers/media/tuners/qt1010.c b/drivers/media/tuners/qt1010.c index 85bbdd4ecdbb..e48faf942830 100644 --- a/drivers/media/tuners/qt1010.c +++ b/drivers/media/tuners/qt1010.c @@ -215,7 +215,7 @@ static int qt1010_set_params(struct dvb_frontend *fe) static int qt1010_init_meas1(struct qt1010_priv *priv, u8 oper, u8 reg, u8 reg_init_val, u8 *retval) { - u8 i, val1, uninitialized_var(val2); + u8 i, val1, val2; int err; qt1010_i2c_oper_t i2c_data[] = { @@ -250,7 +250,7 @@ static int qt1010_init_meas1(struct qt1010_priv *priv, static int qt1010_init_meas2(struct qt1010_priv *priv, u8 reg_init_val, u8 *retval) { - u8 i, uninitialized_var(val); + u8 i, val; int err; qt1010_i2c_oper_t i2c_data[] = { { QT1010_WR, 0x07, reg_init_val }, diff --git a/drivers/media/usb/gspca/vicam.c b/drivers/media/usb/gspca/vicam.c index 179b2ec3df57..d98343fd33fe 100644 --- a/drivers/media/usb/gspca/vicam.c +++ b/drivers/media/usb/gspca/vicam.c @@ -225,7 +225,7 @@ static int sd_init(struct gspca_dev *gspca_dev) { int ret; const struct ihex_binrec *rec; - const struct firmware *uninitialized_var(fw); + const struct firmware *fw; u8 *firmware_buf; ret = request_ihex_firmware(&fw, VICAM_FIRMWARE, diff --git a/drivers/media/usb/uvc/uvc_video.c b/drivers/media/usb/uvc/uvc_video.c index 8fa77a81dd7f..a65d5353a441 100644 --- a/drivers/media/usb/uvc/uvc_video.c +++ b/drivers/media/usb/uvc/uvc_video.c @@ -765,9 +765,9 @@ static void uvc_video_stats_decode(struct uvc_streaming *stream, unsigned int header_size; bool has_pts = false; bool has_scr = false; - u16 uninitialized_var(scr_sof); - u32 uninitialized_var(scr_stc); - u32 uninitialized_var(pts); + u16 scr_sof; + u32 scr_stc; + u32 pts; if (stream->stats.stream.nb_frames == 0 && stream->stats.frame.nb_packets == 0) @@ -1828,7 +1828,7 @@ static int uvc_video_start_transfer(struct uvc_streaming *stream, struct usb_host_endpoint *best_ep = NULL; unsigned int best_psize = UINT_MAX; unsigned int bandwidth; - unsigned int uninitialized_var(altsetting); + unsigned int altsetting; int intfnum = stream->intfnum; /* Isochronous endpoint, select the alternate setting. */ diff --git a/drivers/memstick/host/jmb38x_ms.c b/drivers/memstick/host/jmb38x_ms.c index 0a9c5ddf2f59..383091517ed7 100644 --- a/drivers/memstick/host/jmb38x_ms.c +++ b/drivers/memstick/host/jmb38x_ms.c @@ -314,7 +314,7 @@ static int jmb38x_ms_transfer_data(struct jmb38x_ms_host *host) } while (length) { - unsigned int uninitialized_var(p_off); + unsigned int p_off; if (host->req->long_data) { pg = nth_page(sg_page(&host->req->sg), diff --git a/drivers/memstick/host/tifm_ms.c b/drivers/memstick/host/tifm_ms.c index 5b966b54d6e9..fc35c7404429 100644 --- a/drivers/memstick/host/tifm_ms.c +++ b/drivers/memstick/host/tifm_ms.c @@ -198,7 +198,7 @@ static unsigned int tifm_ms_transfer_data(struct tifm_ms *host) host->block_pos); while (length) { - unsigned int uninitialized_var(p_off); + unsigned int p_off; if (host->req->long_data) { pg = nth_page(sg_page(&host->req->sg), diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 37b1158c1c0c..1ee866a38794 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -493,7 +493,7 @@ static void sdhci_read_block_pio(struct sdhci_host *host) { unsigned long flags; size_t blksize, len, chunk; - u32 uninitialized_var(scratch); + u32 scratch; u8 *buf; DBG("PIO reading\n"); diff --git a/drivers/mtd/nand/raw/nand_ecc.c b/drivers/mtd/nand/raw/nand_ecc.c index 09fdced659f5..b6a46b1b7781 100644 --- a/drivers/mtd/nand/raw/nand_ecc.c +++ b/drivers/mtd/nand/raw/nand_ecc.c @@ -131,7 +131,7 @@ void __nand_calculate_ecc(const unsigned char *buf, unsigned int eccsize, /* rp0..rp15..rp17 are the various accumulated parities (per byte) */ uint32_t rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7; uint32_t rp8, rp9, rp10, rp11, rp12, rp13, rp14, rp15, rp16; - uint32_t uninitialized_var(rp17); /* to make compiler happy */ + uint32_t rp17; uint32_t par; /* the cumulative parity for all data */ uint32_t tmppar; /* the cumulative parity for this iteration; for rp12, rp14 and rp16 at the end of the diff --git a/drivers/mtd/nand/raw/s3c2410.c b/drivers/mtd/nand/raw/s3c2410.c index f86dff311464..d0dd0c446e4d 100644 --- a/drivers/mtd/nand/raw/s3c2410.c +++ b/drivers/mtd/nand/raw/s3c2410.c @@ -291,7 +291,7 @@ static int s3c2410_nand_setrate(struct s3c2410_nand_info *info) int tacls_max = (info->cpu_type == TYPE_S3C2412) ? 8 : 4; int tacls, twrph0, twrph1; unsigned long clkrate = clk_get_rate(info->clk); - unsigned long uninitialized_var(set), cfg, uninitialized_var(mask); + unsigned long set, cfg, mask; unsigned long flags; /* calculate the timing information for the controller */ diff --git a/drivers/mtd/parsers/afs.c b/drivers/mtd/parsers/afs.c index 752b6cf005f7..980e332bdac4 100644 --- a/drivers/mtd/parsers/afs.c +++ b/drivers/mtd/parsers/afs.c @@ -126,8 +126,8 @@ static int afs_parse_v1_partition(struct mtd_info *mtd, * Static checks cannot see that we bail out if we have an error * reading the footer. */ - u_int uninitialized_var(iis_ptr); - u_int uninitialized_var(img_ptr); + u_int iis_ptr; + u_int img_ptr; u_int ptr; size_t sz; int ret; diff --git a/drivers/mtd/ubi/eba.c b/drivers/mtd/ubi/eba.c index 5133e1be5331..0edecfdbd01f 100644 --- a/drivers/mtd/ubi/eba.c +++ b/drivers/mtd/ubi/eba.c @@ -599,7 +599,7 @@ int ubi_eba_read_leb(struct ubi_device *ubi, struct ubi_volume *vol, int lnum, int err, pnum, scrub = 0, vol_id = vol->vol_id; struct ubi_vid_io_buf *vidb; struct ubi_vid_hdr *vid_hdr; - uint32_t uninitialized_var(crc); + uint32_t crc; err = leb_read_lock(ubi, vol_id, lnum); if (err) diff --git a/drivers/net/can/janz-ican3.c b/drivers/net/can/janz-ican3.c index a761092e6ac9..f929db893957 100644 --- a/drivers/net/can/janz-ican3.c +++ b/drivers/net/can/janz-ican3.c @@ -1451,7 +1451,7 @@ static int ican3_napi(struct napi_struct *napi, int budget) /* process all communication messages */ while (true) { - struct ican3_msg uninitialized_var(msg); + struct ican3_msg msg; ret = ican3_recv_msg(mod, &msg); if (ret) break; diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c index e1c236cab2a7..c8cc14eadbb4 100644 --- a/drivers/net/ethernet/broadcom/bnx2.c +++ b/drivers/net/ethernet/broadcom/bnx2.c @@ -1455,7 +1455,7 @@ bnx2_test_and_disable_2g5(struct bnx2 *bp) static void bnx2_enable_forced_2g5(struct bnx2 *bp) { - u32 uninitialized_var(bmcr); + u32 bmcr; int err; if (!(bp->phy_flags & BNX2_PHY_FLAG_2_5G_CAPABLE)) @@ -1499,7 +1499,7 @@ bnx2_enable_forced_2g5(struct bnx2 *bp) static void bnx2_disable_forced_2g5(struct bnx2 *bp) { - u32 uninitialized_var(bmcr); + u32 bmcr; int err; if (!(bp->phy_flags & BNX2_PHY_FLAG_2_5G_CAPABLE)) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index 5ddd18639a1e..c410a0ce35c8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -529,8 +529,8 @@ static int req_pages_handler(struct notifier_block *nb, int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot) { - u16 uninitialized_var(func_id); - s32 uninitialized_var(npages); + u16 func_id; + s32 npages; int err; err = mlx5_cmd_query_pages(dev, &func_id, &npages, boot); diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c index 67e62603fe3b..15b8b1bf8163 100644 --- a/drivers/net/ethernet/neterion/s2io.c +++ b/drivers/net/ethernet/neterion/s2io.c @@ -7276,7 +7276,7 @@ static int rx_osm_handler(struct ring_info *ring_data, struct RxD_t * rxdp) int ring_no = ring_data->ring_no; u16 l3_csum, l4_csum; unsigned long long err = rxdp->Control_1 & RXD_T_CODE; - struct lro *uninitialized_var(lro); + struct lro *lro; u8 err_mask; struct swStat *swstats = &sp->mac_control.stats_info->sw_stat; diff --git a/drivers/net/ethernet/qlogic/qla3xxx.c b/drivers/net/ethernet/qlogic/qla3xxx.c index 0fade19e00d4..0d0e38debbc2 100644 --- a/drivers/net/ethernet/qlogic/qla3xxx.c +++ b/drivers/net/ethernet/qlogic/qla3xxx.c @@ -3769,7 +3769,7 @@ static int ql3xxx_probe(struct pci_dev *pdev, struct net_device *ndev = NULL; struct ql3_adapter *qdev = NULL; static int cards_found; - int uninitialized_var(pci_using_dac), err; + int pci_using_dac, err; err = pci_enable_device(pdev); if (err) { diff --git a/drivers/net/ethernet/sun/cassini.c b/drivers/net/ethernet/sun/cassini.c index debd3c3fa6fb..015fdb851cdb 100644 --- a/drivers/net/ethernet/sun/cassini.c +++ b/drivers/net/ethernet/sun/cassini.c @@ -2271,7 +2271,7 @@ static int cas_rx_ringN(struct cas *cp, int ring, int budget) drops = 0; while (1) { struct cas_rx_comp *rxc = rxcs + entry; - struct sk_buff *uninitialized_var(skb); + struct sk_buff *skb; int type, len; u64 words[4]; int i, dring; diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c index 9a5004f674c7..1b697e4cd7dc 100644 --- a/drivers/net/ethernet/sun/niu.c +++ b/drivers/net/ethernet/sun/niu.c @@ -429,7 +429,7 @@ static int serdes_init_niu_1g_serdes(struct niu *np) struct niu_link_config *lp = &np->link_config; u16 pll_cfg, pll_sts; int max_retry = 100; - u64 uninitialized_var(sig), mask, val; + u64 sig, mask, val; u32 tx_cfg, rx_cfg; unsigned long i; int err; @@ -526,7 +526,7 @@ static int serdes_init_niu_10g_serdes(struct niu *np) struct niu_link_config *lp = &np->link_config; u32 tx_cfg, rx_cfg, pll_cfg, pll_sts; int max_retry = 100; - u64 uninitialized_var(sig), mask, val; + u64 sig, mask, val; unsigned long i; int err; @@ -714,7 +714,7 @@ static int esr_write_glue0(struct niu *np, unsigned long chan, u32 val) static int esr_reset(struct niu *np) { - u32 uninitialized_var(reset); + u32 reset; int err; err = mdio_write(np, np->port, NIU_ESR_DEV_ADDR, diff --git a/drivers/net/wan/z85230.c b/drivers/net/wan/z85230.c index 7ad3d24195ba..138930c66ad2 100644 --- a/drivers/net/wan/z85230.c +++ b/drivers/net/wan/z85230.c @@ -702,7 +702,7 @@ EXPORT_SYMBOL(z8530_nop); irqreturn_t z8530_interrupt(int irq, void *dev_id) { struct z8530_dev *dev=dev_id; - u8 uninitialized_var(intr); + u8 intr; static volatile int locker=0; int work=0; struct z8530_irqhandler *irqs; diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c index 22b6937ac225..340ce327ac14 100644 --- a/drivers/net/wireless/ath/ath10k/core.c +++ b/drivers/net/wireless/ath/ath10k/core.c @@ -2240,7 +2240,7 @@ static int ath10k_init_uart(struct ath10k *ar) static int ath10k_init_hw_params(struct ath10k *ar) { - const struct ath10k_hw_params *uninitialized_var(hw_params); + const struct ath10k_hw_params *hw_params; int i; for (i = 0; i < ARRAY_SIZE(ath10k_hw_params_list); i++) { diff --git a/drivers/net/wireless/ath/ath6kl/init.c b/drivers/net/wireless/ath/ath6kl/init.c index aa1c71a76ef7..811fad6d60c0 100644 --- a/drivers/net/wireless/ath/ath6kl/init.c +++ b/drivers/net/wireless/ath/ath6kl/init.c @@ -1575,7 +1575,7 @@ static int ath6kl_init_upload(struct ath6kl *ar) int ath6kl_init_hw_params(struct ath6kl *ar) { - const struct ath6kl_hw *uninitialized_var(hw); + const struct ath6kl_hw *hw; int i; for (i = 0; i < ARRAY_SIZE(hw_list); i++) { diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c index 289a2444d534..4d72cd7daaa2 100644 --- a/drivers/net/wireless/ath/ath9k/init.c +++ b/drivers/net/wireless/ath/ath9k/init.c @@ -230,7 +230,7 @@ static unsigned int ath9k_reg_rmw(void *hw_priv, u32 reg_offset, u32 set, u32 cl struct ath_hw *ah = hw_priv; struct ath_common *common = ath9k_hw_common(ah); struct ath_softc *sc = (struct ath_softc *) common->priv; - unsigned long uninitialized_var(flags); + unsigned long flags; u32 val; if (NR_CPUS > 1 && ah->config.serialize_regmode == SER_REG_MODE_ON) { diff --git a/drivers/net/wireless/broadcom/b43/debugfs.c b/drivers/net/wireless/broadcom/b43/debugfs.c index dc1819ca52ac..89a25aefb327 100644 --- a/drivers/net/wireless/broadcom/b43/debugfs.c +++ b/drivers/net/wireless/broadcom/b43/debugfs.c @@ -493,7 +493,7 @@ static ssize_t b43_debugfs_read(struct file *file, char __user *userbuf, struct b43_wldev *dev; struct b43_debugfs_fops *dfops; struct b43_dfs_file *dfile; - ssize_t uninitialized_var(ret); + ssize_t ret; char *buf; const size_t bufsize = 1024 * 16; /* 16 kiB buffer */ const size_t buforder = get_order(bufsize); diff --git a/drivers/net/wireless/broadcom/b43/dma.c b/drivers/net/wireless/broadcom/b43/dma.c index 9733c64bf978..ca671fc13116 100644 --- a/drivers/net/wireless/broadcom/b43/dma.c +++ b/drivers/net/wireless/broadcom/b43/dma.c @@ -37,7 +37,7 @@ static u32 b43_dma_address(struct b43_dma *dma, dma_addr_t dmaaddr, enum b43_addrtype addrtype) { - u32 uninitialized_var(addr); + u32 addr; switch (addrtype) { case B43_DMA_ADDR_LOW: diff --git a/drivers/net/wireless/broadcom/b43/lo.c b/drivers/net/wireless/broadcom/b43/lo.c index 5d97cf06eceb..338b6545a1e7 100644 --- a/drivers/net/wireless/broadcom/b43/lo.c +++ b/drivers/net/wireless/broadcom/b43/lo.c @@ -729,7 +729,7 @@ struct b43_lo_calib *b43_calibrate_lo_setting(struct b43_wldev *dev, }; int max_rx_gain; struct b43_lo_calib *cal; - struct lo_g_saved_values uninitialized_var(saved_regs); + struct lo_g_saved_values saved_regs; /* Values from the "TXCTL Register and Value Table" */ u16 txctl_reg; u16 txctl_value; diff --git a/drivers/net/wireless/broadcom/b43/phy_n.c b/drivers/net/wireless/broadcom/b43/phy_n.c index 46db91846007..39de18d3ce91 100644 --- a/drivers/net/wireless/broadcom/b43/phy_n.c +++ b/drivers/net/wireless/broadcom/b43/phy_n.c @@ -5643,7 +5643,7 @@ static int b43_nphy_rev2_cal_rx_iq(struct b43_wldev *dev, u8 rfctl[2]; u8 afectl_core; u16 tmp[6]; - u16 uninitialized_var(cur_hpf1), uninitialized_var(cur_hpf2), cur_lna; + u16 cur_hpf1, cur_hpf2, cur_lna; u32 real, imag; enum nl80211_band band; diff --git a/drivers/net/wireless/broadcom/b43/xmit.c b/drivers/net/wireless/broadcom/b43/xmit.c index 55babc6d1091..7651b1bdb592 100644 --- a/drivers/net/wireless/broadcom/b43/xmit.c +++ b/drivers/net/wireless/broadcom/b43/xmit.c @@ -422,10 +422,10 @@ int b43_generate_txhdr(struct b43_wldev *dev, if ((rates[0].flags & IEEE80211_TX_RC_USE_RTS_CTS) || (rates[0].flags & IEEE80211_TX_RC_USE_CTS_PROTECT)) { unsigned int len; - struct ieee80211_hdr *uninitialized_var(hdr); + struct ieee80211_hdr *hdr; int rts_rate, rts_rate_fb; int rts_rate_ofdm, rts_rate_fb_ofdm; - struct b43_plcp_hdr6 *uninitialized_var(plcp); + struct b43_plcp_hdr6 *plcp; struct ieee80211_rate *rts_cts_rate; rts_cts_rate = ieee80211_get_rts_cts_rate(dev->wl->hw, info); @@ -436,7 +436,7 @@ int b43_generate_txhdr(struct b43_wldev *dev, rts_rate_fb_ofdm = b43_is_ofdm_rate(rts_rate_fb); if (rates[0].flags & IEEE80211_TX_RC_USE_CTS_PROTECT) { - struct ieee80211_cts *uninitialized_var(cts); + struct ieee80211_cts *cts; switch (dev->fw.hdr_format) { case B43_FW_HDR_598: @@ -458,7 +458,7 @@ int b43_generate_txhdr(struct b43_wldev *dev, mac_ctl |= B43_TXH_MAC_SENDCTS; len = sizeof(struct ieee80211_cts); } else { - struct ieee80211_rts *uninitialized_var(rts); + struct ieee80211_rts *rts; switch (dev->fw.hdr_format) { case B43_FW_HDR_598: @@ -637,8 +637,8 @@ void b43_rx(struct b43_wldev *dev, struct sk_buff *skb, const void *_rxhdr) const struct b43_rxhdr_fw4 *rxhdr = _rxhdr; __le16 fctl; u16 phystat0, phystat3; - u16 uninitialized_var(chanstat), uninitialized_var(mactime); - u32 uninitialized_var(macstat); + u16 chanstat, mactime; + u32 macstat; u16 chanid; int padding, rate_idx; diff --git a/drivers/net/wireless/broadcom/b43legacy/debugfs.c b/drivers/net/wireless/broadcom/b43legacy/debugfs.c index fa133dfb2ecb..e7e4293c01f2 100644 --- a/drivers/net/wireless/broadcom/b43legacy/debugfs.c +++ b/drivers/net/wireless/broadcom/b43legacy/debugfs.c @@ -190,7 +190,7 @@ static ssize_t b43legacy_debugfs_read(struct file *file, char __user *userbuf, struct b43legacy_wldev *dev; struct b43legacy_debugfs_fops *dfops; struct b43legacy_dfs_file *dfile; - ssize_t uninitialized_var(ret); + ssize_t ret; char *buf; const size_t bufsize = 1024 * 16; /* 16 KiB buffer */ const size_t buforder = get_order(bufsize); diff --git a/drivers/net/wireless/broadcom/b43legacy/main.c b/drivers/net/wireless/broadcom/b43legacy/main.c index 5208a39fd6f7..220c11d34c23 100644 --- a/drivers/net/wireless/broadcom/b43legacy/main.c +++ b/drivers/net/wireless/broadcom/b43legacy/main.c @@ -2580,7 +2580,7 @@ static void b43legacy_put_phy_into_reset(struct b43legacy_wldev *dev) static int b43legacy_switch_phymode(struct b43legacy_wl *wl, unsigned int new_mode) { - struct b43legacy_wldev *uninitialized_var(up_dev); + struct b43legacy_wldev *up_dev; struct b43legacy_wldev *down_dev; int err; bool gmode = false; diff --git a/drivers/net/wireless/intel/iwlegacy/3945.c b/drivers/net/wireless/intel/iwlegacy/3945.c index 2ac494f5ae22..fd63eba47ba2 100644 --- a/drivers/net/wireless/intel/iwlegacy/3945.c +++ b/drivers/net/wireless/intel/iwlegacy/3945.c @@ -2100,7 +2100,7 @@ il3945_txpower_set_from_eeprom(struct il_priv *il) /* set tx power value for all OFDM rates */ for (rate_idx = 0; rate_idx < IL_OFDM_RATES; rate_idx++) { - s32 uninitialized_var(power_idx); + s32 power_idx; int rc; /* use channel group's clip-power table, diff --git a/drivers/net/wireless/intel/iwlegacy/4965-mac.c b/drivers/net/wireless/intel/iwlegacy/4965-mac.c index da6d4202611c..a159d1d18c2c 100644 --- a/drivers/net/wireless/intel/iwlegacy/4965-mac.c +++ b/drivers/net/wireless/intel/iwlegacy/4965-mac.c @@ -2769,7 +2769,7 @@ il4965_hdl_tx(struct il_priv *il, struct il_rx_buf *rxb) struct ieee80211_tx_info *info; struct il4965_tx_resp *tx_resp = (void *)&pkt->u.raw[0]; u32 status = le32_to_cpu(tx_resp->u.status); - int uninitialized_var(tid); + int tid; int sta_id; int freed; u8 *qc = NULL; diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192cu/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192cu/hw.c index 5b071b70bc08..0ae9cfc65272 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8192cu/hw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192cu/hw.c @@ -618,8 +618,8 @@ static void _rtl92cu_init_chipn_two_out_ep_priority(struct ieee80211_hw *hw, u8 queue_sel) { u16 beq, bkq, viq, voq, mgtq, hiq; - u16 uninitialized_var(valuehi); - u16 uninitialized_var(valuelow); + u16 valuehi; + u16 valuelow; switch (queue_sel) { case (TX_SELE_HQ | TX_SELE_LQ): diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 3acf56683915..14af4c97c626 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1138,7 +1138,7 @@ static irqreturn_t aer_isr(int irq, void *context) { struct pcie_device *dev = (struct pcie_device *)context; struct aer_rpc *rpc = get_service_data(dev); - struct aer_err_source uninitialized_var(e_src); + struct aer_err_source e_src; if (kfifo_is_empty(&rpc->aer_fifo)) return IRQ_NONE; diff --git a/drivers/platform/x86/hdaps.c b/drivers/platform/x86/hdaps.c index 04c4da6692d7..a72270932ec3 100644 --- a/drivers/platform/x86/hdaps.c +++ b/drivers/platform/x86/hdaps.c @@ -365,7 +365,7 @@ static ssize_t hdaps_variance_show(struct device *dev, static ssize_t hdaps_temp1_show(struct device *dev, struct device_attribute *attr, char *buf) { - u8 uninitialized_var(temp); + u8 temp; int ret; ret = hdaps_readb_one(HDAPS_PORT_TEMP1, &temp); @@ -378,7 +378,7 @@ static ssize_t hdaps_temp1_show(struct device *dev, static ssize_t hdaps_temp2_show(struct device *dev, struct device_attribute *attr, char *buf) { - u8 uninitialized_var(temp); + u8 temp; int ret; ret = hdaps_readb_one(HDAPS_PORT_TEMP2, &temp); diff --git a/drivers/scsi/dc395x.c b/drivers/scsi/dc395x.c index e95f5b3bef4d..37c6cc374079 100644 --- a/drivers/scsi/dc395x.c +++ b/drivers/scsi/dc395x.c @@ -4126,7 +4126,7 @@ static int adapter_sg_tables_alloc(struct AdapterCtlBlk *acb) const unsigned srbs_per_page = PAGE_SIZE/SEGMENTX_LEN; int srb_idx = 0; unsigned i = 0; - struct SGentry *uninitialized_var(ptr); + struct SGentry *ptr; for (i = 0; i < DC395x_MAX_SRB_CNT; i++) acb->srb_array[i].segment_x = NULL; diff --git a/drivers/scsi/pm8001/pm8001_hwi.c b/drivers/scsi/pm8001/pm8001_hwi.c index fb9848e1d481..0b4499210b95 100644 --- a/drivers/scsi/pm8001/pm8001_hwi.c +++ b/drivers/scsi/pm8001/pm8001_hwi.c @@ -4202,7 +4202,7 @@ static int process_oq(struct pm8001_hba_info *pm8001_ha, u8 vec) { struct outbound_queue_table *circularQ; void *pMsg1 = NULL; - u8 uninitialized_var(bc); + u8 bc; u32 ret = MPI_IO_STATUS_FAIL; unsigned long flags; diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c index 4d205ebaee87..05c944a3bdca 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.c +++ b/drivers/scsi/pm8001/pm80xx_hwi.c @@ -4182,7 +4182,7 @@ static int process_oq(struct pm8001_hba_info *pm8001_ha, u8 vec) { struct outbound_queue_table *circularQ; void *pMsg1 = NULL; - u8 uninitialized_var(bc); + u8 bc; u32 ret = MPI_IO_STATUS_FAIL; unsigned long flags; u32 regval; diff --git a/drivers/ssb/driver_chipcommon.c b/drivers/ssb/driver_chipcommon.c index 3861cb659cb9..6c647ba4ba0b 100644 --- a/drivers/ssb/driver_chipcommon.c +++ b/drivers/ssb/driver_chipcommon.c @@ -119,7 +119,7 @@ void ssb_chipco_set_clockmode(struct ssb_chipcommon *cc, static enum ssb_clksrc chipco_pctl_get_slowclksrc(struct ssb_chipcommon *cc) { struct ssb_bus *bus = cc->dev->bus; - u32 uninitialized_var(tmp); + u32 tmp; if (cc->dev->id.revision < 6) { if (bus->bustype == SSB_BUSTYPE_SSB || @@ -149,7 +149,7 @@ static enum ssb_clksrc chipco_pctl_get_slowclksrc(struct ssb_chipcommon *cc) /* Get maximum or minimum (depending on get_max flag) slowclock frequency. */ static int chipco_pctl_clockfreqlimit(struct ssb_chipcommon *cc, int get_max) { - int uninitialized_var(limit); + int limit; enum ssb_clksrc clocksrc; int divisor = 1; u32 tmp; diff --git a/drivers/tty/cyclades.c b/drivers/tty/cyclades.c index a6aabfd6e2da..097266342e5e 100644 --- a/drivers/tty/cyclades.c +++ b/drivers/tty/cyclades.c @@ -3643,7 +3643,7 @@ static int cy_pci_probe(struct pci_dev *pdev, struct cyclades_card *card; void __iomem *addr0 = NULL, *addr2 = NULL; char *card_name = NULL; - u32 uninitialized_var(mailbox); + u32 mailbox; unsigned int device_id, nchan = 0, card_no, i, j; unsigned char plx_ver; int retval, irq; diff --git a/drivers/tty/isicom.c b/drivers/tty/isicom.c index fc38f96475bf..3b2f9fb01aa0 100644 --- a/drivers/tty/isicom.c +++ b/drivers/tty/isicom.c @@ -1514,7 +1514,7 @@ static unsigned int card_count; static int isicom_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { - unsigned int uninitialized_var(signature), index; + unsigned int signature, index; int retval = -EPERM; struct isi_board *board = NULL; diff --git a/drivers/usb/musb/cppi_dma.c b/drivers/usb/musb/cppi_dma.c index b4d6d9bb3239..c545b27ea568 100644 --- a/drivers/usb/musb/cppi_dma.c +++ b/drivers/usb/musb/cppi_dma.c @@ -1146,7 +1146,7 @@ irqreturn_t cppi_interrupt(int irq, void *dev_id) struct musb_hw_ep *hw_ep = NULL; u32 rx, tx; int i, index; - unsigned long uninitialized_var(flags); + unsigned long flags; cppi = container_of(musb->dma_controller, struct cppi, controller); if (cppi->irq) diff --git a/drivers/usb/storage/sddr55.c b/drivers/usb/storage/sddr55.c index ba955d65eb0e..c8a988d2cfdd 100644 --- a/drivers/usb/storage/sddr55.c +++ b/drivers/usb/storage/sddr55.c @@ -554,8 +554,8 @@ static int sddr55_reset(struct us_data *us) static unsigned long sddr55_get_capacity(struct us_data *us) { - unsigned char uninitialized_var(manufacturerID); - unsigned char uninitialized_var(deviceID); + unsigned char manufacturerID; + unsigned char deviceID; int result; struct sddr55_card_info *info = (struct sddr55_card_info *)us->extra; diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index e992decfec53..eea902b83afe 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -862,7 +862,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) struct tun_msg_ctl ctl; size_t len, total_len = 0; int err; - struct vhost_net_ubuf_ref *uninitialized_var(ubufs); + struct vhost_net_ubuf_ref *ubufs; bool zcopy_used; int sent_pkts = 0; @@ -1042,7 +1042,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq, /* len is always initialized before use since we are always called with * datalen > 0. */ - u32 uninitialized_var(len); + u32 len; while (datalen > 0 && headcount < quota) { if (unlikely(seg >= UIO_MAXIOV)) { @@ -1099,7 +1099,7 @@ static void handle_rx(struct vhost_net *net) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX]; struct vhost_virtqueue *vq = &nvq->vq; - unsigned uninitialized_var(in), log; + unsigned in, log; struct vhost_log *vq_log; struct msghdr msg = { .msg_name = NULL, diff --git a/drivers/video/fbdev/matrox/matroxfb_maven.c b/drivers/video/fbdev/matrox/matroxfb_maven.c index eda893b7a2e9..9a98c4a6ba33 100644 --- a/drivers/video/fbdev/matrox/matroxfb_maven.c +++ b/drivers/video/fbdev/matrox/matroxfb_maven.c @@ -300,7 +300,7 @@ static int matroxfb_mavenclock(const struct matrox_pll_ctl *ctl, unsigned int* in, unsigned int* feed, unsigned int* post, unsigned int* htotal2) { unsigned int fvco; - unsigned int uninitialized_var(p); + unsigned int p; fvco = matroxfb_PLL_mavenclock(&maven1000_pll, ctl, htotal, vtotal, in, feed, &p, htotal2); if (!fvco) @@ -732,8 +732,8 @@ static int maven_find_exact_clocks(unsigned int ht, unsigned int vt, for (x = 0; x < 8; x++) { unsigned int c; - unsigned int uninitialized_var(a), uninitialized_var(b), - uninitialized_var(h2); + unsigned int a, b, + h2; unsigned int h = ht + 2 + x; if (!matroxfb_mavenclock((m->mode == MATROXFB_OUTPUT_MODE_PAL) ? &maven_PAL : &maven_NTSC, h, vt, &a, &b, &c, &h2)) { diff --git a/drivers/video/fbdev/pm3fb.c b/drivers/video/fbdev/pm3fb.c index 7497bd36334c..a8faf46adeb1 100644 --- a/drivers/video/fbdev/pm3fb.c +++ b/drivers/video/fbdev/pm3fb.c @@ -821,9 +821,9 @@ static void pm3fb_write_mode(struct fb_info *info) wmb(); { - unsigned char uninitialized_var(m); /* ClkPreScale */ - unsigned char uninitialized_var(n); /* ClkFeedBackScale */ - unsigned char uninitialized_var(p); /* ClkPostScale */ + unsigned char m; /* ClkPreScale */ + unsigned char n; /* ClkFeedBackScale */ + unsigned char p; /* ClkPostScale */ unsigned long pixclock = PICOS2KHZ(info->var.pixclock); (void)pm3fb_calculate_clock(pixclock, &m, &n, &p); diff --git a/drivers/video/fbdev/riva/riva_hw.c b/drivers/video/fbdev/riva/riva_hw.c index 08c9ee46978e..4168ac464565 100644 --- a/drivers/video/fbdev/riva/riva_hw.c +++ b/drivers/video/fbdev/riva/riva_hw.c @@ -1245,8 +1245,7 @@ int CalcStateExt ) { int pixelDepth; - int uninitialized_var(VClk),uninitialized_var(m), - uninitialized_var(n), uninitialized_var(p); + int VClk, m, n, p; /* * Save mode parameters. diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 58b96baa8d48..a2de775801af 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -424,7 +424,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, struct vring_virtqueue *vq = to_vvq(_vq); struct scatterlist *sg; struct vring_desc *desc; - unsigned int i, n, avail, descs_used, uninitialized_var(prev), err_idx; + unsigned int i, n, avail, descs_used, prev, err_idx; int head; bool indirect; @@ -1101,8 +1101,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, struct vring_packed_desc *desc; struct scatterlist *sg; unsigned int i, n, c, descs_used, err_idx; - __le16 uninitialized_var(head_flags), flags; - u16 head, id, uninitialized_var(prev), curr, avail_used_flags; + __le16 head_flags, flags; + u16 head, id, prev, curr, avail_used_flags; START_USE(vq); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 96757f3abd74..1d2e61e0ab04 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -1053,7 +1053,7 @@ static int afs_d_revalidate_rcu(struct dentry *dentry) static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) { struct afs_vnode *vnode, *dir; - struct afs_fid uninitialized_var(fid); + struct afs_fid fid; struct dentry *parent; struct inode *inode; struct key *key; diff --git a/fs/afs/security.c b/fs/afs/security.c index 90d852704328..9cf3102f370c 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -399,7 +399,7 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key, int afs_permission(struct inode *inode, int mask) { struct afs_vnode *vnode = AFS_FS_I(inode); - afs_access_t uninitialized_var(access); + afs_access_t access; struct key *key; int ret = 0; diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index e7f550327d5d..e338c407cb75 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -113,7 +113,7 @@ static void fill_data(struct dlm_lock_data *data, struct dlm_lkb *lkb) void dlm_timeout_warn(struct dlm_lkb *lkb) { - struct sk_buff *uninitialized_var(send_skb); + struct sk_buff *send_skb; struct dlm_lock_data *data; size_t size; int rv; diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 64b56c7df023..d0542151e8c4 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -265,7 +265,7 @@ submit_bio_out: */ static int erofs_raw_access_readpage(struct file *file, struct page *page) { - erofs_off_t uninitialized_var(last_block); + erofs_off_t last_block; struct bio *bio; trace_erofs_readpage(page, true); @@ -282,7 +282,7 @@ static int erofs_raw_access_readpage(struct file *file, struct page *page) static void erofs_raw_access_readahead(struct readahead_control *rac) { - erofs_off_t uninitialized_var(last_block); + erofs_off_t last_block; struct bio *bio = NULL; struct page *page; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index be50a4d9d273..24a26aaf847f 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1161,7 +1161,7 @@ static void z_erofs_submit_queue(struct super_block *sb, struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; void *bi_private; /* since bio will be NULL, no need to initialize last_index */ - pgoff_t uninitialized_var(last_index); + pgoff_t last_index; unsigned int nr_bios = 0; struct bio *bio = NULL; diff --git a/fs/fat/dir.c b/fs/fat/dir.c index b4ddf48fa444..c4a274285858 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -1284,7 +1284,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots, struct super_block *sb = dir->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */ - struct msdos_dir_entry *uninitialized_var(de); + struct msdos_dir_entry *de; int err, free_slots, i, nr_bhs; loff_t pos, i_pos; diff --git a/fs/fuse/control.c b/fs/fuse/control.c index c23f6f243ad4..a1303ad303ba 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -120,7 +120,7 @@ static ssize_t fuse_conn_max_background_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - unsigned uninitialized_var(val); + unsigned val; ssize_t ret; ret = fuse_conn_limit_write(file, buf, count, ppos, &val, @@ -162,7 +162,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - unsigned uninitialized_var(val); + unsigned val; struct fuse_conn *fc; ssize_t ret; diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 030f094910c3..2cc17816d7b1 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -270,7 +270,7 @@ static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp) static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo) { char *end = p + len; - char *uninitialized_var(key), *uninitialized_var(val); + char *key, *val; int rc; while (true) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e573b0cd2737..32301fe626ba 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2963,7 +2963,7 @@ static void fuse_register_polled_file(struct fuse_conn *fc, { spin_lock(&fc->lock); if (RB_EMPTY_NODE(&ff->polled_node)) { - struct rb_node **link, *uninitialized_var(parent); + struct rb_node **link, *parent; link = fuse_find_polled_node(fc, ff->kh, &parent); BUG_ON(*link); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 72c9560f4467..baa17c781870 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -335,7 +335,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, int done = 0; struct pagevec pvec; int nr_pages; - pgoff_t uninitialized_var(writeback_index); + pgoff_t writeback_index; pgoff_t index; pgoff_t end; pgoff_t done_index; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 6306eaae378b..8dfe09f52cbc 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1761,7 +1761,7 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length) u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift; __u16 start_list[GFS2_MAX_META_HEIGHT]; __u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL; - unsigned int start_aligned, uninitialized_var(end_aligned); + unsigned int start_aligned, end_aligned; unsigned int strip_h = ip->i_height - 1; u32 btotal = 0; int ret, state; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index cb2a11b458c6..ed1da4323967 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -419,7 +419,7 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, struct page *page) { struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); - struct gfs2_log_header_host uninitialized_var(lh); + struct gfs2_log_header_host lh; void *kaddr = kmap_atomic(page); unsigned int offset; bool ret = false; diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index c8d1b2be7854..73342c925a4b 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -398,7 +398,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str) astr = str->name; len = str->len; while (len > 0) { - int uninitialized_var(dsize); + int dsize; size = asc2unichar(sb, astr, len, &c); astr += size; len -= size; diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c index cac468f04820..402769881c32 100644 --- a/fs/isofs/namei.c +++ b/fs/isofs/namei.c @@ -152,8 +152,8 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry, struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { int found; - unsigned long uninitialized_var(block); - unsigned long uninitialized_var(offset); + unsigned long block; + unsigned long offset; struct inode *inode; struct page *page; diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index 83b8f06b4a64..7e9abdb89712 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -401,7 +401,7 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb { size_t retlen; int ret; - uint32_t uninitialized_var(bad_offset); + uint32_t bad_offset; switch (jffs2_block_check_erase(c, jeb, &bad_offset)) { case -EAGAIN: goto refile; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index b68e96681522..bb1804bab1e1 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -351,7 +351,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size) static ssize_t write_filehandle(struct file *file, char *buf, size_t size) { char *dname, *path; - int uninitialized_var(maxsize); + int maxsize; char *mesg = buf; int len; struct auth_domain *dom; diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 2f834add165b..4c1b90442d6f 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -4707,7 +4707,7 @@ int ocfs2_insert_extent(handle_t *handle, struct ocfs2_alloc_context *meta_ac) { int status; - int uninitialized_var(free_records); + int free_records; struct buffer_head *last_eb_bh = NULL; struct ocfs2_insert_type insert = {0, }; struct ocfs2_extent_rec rec; @@ -7051,7 +7051,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, int need_free = 0; u32 bit_off, num; handle_t *handle; - u64 uninitialized_var(block); + u64 block; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 5761060d2ba8..bdfba9db558a 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -848,9 +848,9 @@ static int ocfs2_dx_dir_lookup(struct inode *inode, u64 *ret_phys_blkno) { int ret = 0; - unsigned int cend, uninitialized_var(clen); - u32 uninitialized_var(cpos); - u64 uninitialized_var(blkno); + unsigned int cend, clen; + u32 cpos; + u64 blkno; u32 name_hash = hinfo->major_hash; ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno, @@ -894,7 +894,7 @@ static int ocfs2_dx_dir_search(const char *name, int namelen, struct ocfs2_dir_lookup_result *res) { int ret, i, found; - u64 uninitialized_var(phys); + u64 phys; struct buffer_head *dx_leaf_bh = NULL; struct ocfs2_dx_leaf *dx_leaf; struct ocfs2_dx_entry *dx_entry = NULL; @@ -4393,9 +4393,9 @@ out: int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh) { int ret; - unsigned int uninitialized_var(clen); - u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos); - u64 uninitialized_var(blkno); + unsigned int clen; + u32 major_hash = UINT_MAX, p_cpos, cpos; + u64 blkno; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct buffer_head *dx_root_bh = NULL; struct ocfs2_dx_root_block *dx_root; diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index a94852af5510..7b93e9c766f6 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -403,7 +403,7 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, { int i, ret, tree_height, len; struct ocfs2_dinode *di; - struct ocfs2_extent_block *uninitialized_var(eb); + struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; struct ocfs2_extent_rec *rec; struct buffer_head *eb_bh = NULL; @@ -599,7 +599,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, unsigned int *extent_flags) { int ret; - unsigned int uninitialized_var(hole_len), flags = 0; + unsigned int hole_len, flags = 0; struct buffer_head *di_bh = NULL; struct ocfs2_extent_rec rec; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 5381020aaa9a..c46bf7f581a1 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2498,7 +2498,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, struct buffer_head *new_di_bh = NULL; struct ocfs2_alloc_context *inode_ac = NULL; struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; - u64 uninitialized_var(di_blkno), suballoc_loc; + u64 di_blkno, suballoc_loc; u16 suballoc_bit; status = ocfs2_inode_lock(dir, &parent_di_bh, 1); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index cfb77f70c888..3b397fa9c9e8 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -1063,7 +1063,7 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, struct buffer_head **ret_bh) { int ret = 0, i, found; - u32 low_cpos, uninitialized_var(cpos_end); + u32 low_cpos, cpos_end; struct ocfs2_extent_list *el; struct ocfs2_extent_rec *rec = NULL; struct ocfs2_extent_block *eb = NULL; diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 90c830e3758e..9ccd19d8f7b1 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1211,7 +1211,7 @@ static int ocfs2_xattr_block_get(struct inode *inode, struct ocfs2_xattr_value_root *xv; size_t size; int ret = -ENODATA, name_offset, name_len, i; - int uninitialized_var(block_off); + int block_off; xs->bucket = ocfs2_xattr_bucket_new(inode); if (!xs->bucket) { diff --git a/fs/omfs/file.c b/fs/omfs/file.c index d7b5f09d298c..2c7b70ee1388 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -220,7 +220,7 @@ static int omfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh; sector_t next, offset; int ret; - u64 uninitialized_var(new_block); + u64 new_block; u32 max_extents; int extent_count; struct omfs_extent *oe; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 79dd052c7dbf..8e1c308524a2 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -787,7 +787,7 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) struct path upperpath, datapath; int err; char *capability = NULL; - ssize_t uninitialized_var(cap_size); + ssize_t cap_size; ovl_path_upper(c->dentry, &upperpath); if (WARN_ON(upperpath.dentry == NULL)) diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index ad292c5a43a9..b5cdac9b0368 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -552,11 +552,11 @@ out: */ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot) { - int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt; + int lnum, offs, len, err = 0, last_level, child_cnt; int first = 1, iip; struct ubifs_debug_info *d = c->dbg; - union ubifs_key uninitialized_var(lower_key), upper_key, l_key, u_key; - unsigned long long uninitialized_var(last_sqnum); + union ubifs_key lower_key, upper_key, l_key, u_key; + unsigned long long last_sqnum; struct ubifs_idx_node *idx; struct list_head list; struct idx_node *i; diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index ef85ec167a84..9d042942d8b2 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1260,7 +1260,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, struct ubifs_budget_req ino_req = { .dirtied_ino = 1, .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; struct timespec64 time; - unsigned int uninitialized_var(saved_nlink); + unsigned int saved_nlink; struct fscrypt_name old_nm, new_nm; /* diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 49fe062ce45e..b77d1637bbbc 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -222,7 +222,7 @@ static int write_begin_slow(struct address_space *mapping, struct ubifs_info *c = inode->i_sb->s_fs_info; pgoff_t index = pos >> PAGE_SHIFT; struct ubifs_budget_req req = { .new_page = 1 }; - int uninitialized_var(err), appending = !!(pos + len > inode->i_size); + int err, appending = !!(pos + len > inode->i_size); struct page *page; dbg_gen("ino %lu, pos %llu, len %u, i_size %lld", @@ -426,7 +426,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); pgoff_t index = pos >> PAGE_SHIFT; - int uninitialized_var(err), appending = !!(pos + len > inode->i_size); + int err, appending = !!(pos + len > inode->i_size); int skipped_read = 0; struct page *page; diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index e5ec1afe1c66..2e6264318bd9 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -1222,7 +1222,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ; int last_reference = !!(new_inode && new_inode->i_nlink == 0); int move = (old_dir != new_dir); - struct ubifs_inode *uninitialized_var(new_ui); + struct ubifs_inode *new_ui; u8 hash_old_dir[UBIFS_HASH_ARR_SZ]; u8 hash_new_dir[UBIFS_HASH_ARR_SZ]; u8 hash_new_inode[UBIFS_HASH_ARR_SZ]; @@ -1507,7 +1507,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, union ubifs_key key, to_key; struct ubifs_ino_node *ino; struct ubifs_trun_node *trun; - struct ubifs_data_node *uninitialized_var(dn); + struct ubifs_data_node *dn; int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode); struct ubifs_inode *ui = ubifs_inode(inode); ino_t inum = inode->i_ino; diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index e21abf250951..6e0a153b7194 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -275,7 +275,7 @@ uint32_t ubifs_unpack_bits(const struct ubifs_info *c, uint8_t **addr, int *pos, const int k = 32 - nrbits; uint8_t *p = *addr; int b = *pos; - uint32_t uninitialized_var(val); + uint32_t val; const int bytes = (nrbits + b + 7) >> 3; ubifs_assert(c, nrbits > 0); diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index e8e7b0e9532e..f609f6cdde70 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -892,7 +892,7 @@ static int fallible_resolve_collision(struct ubifs_info *c, int adding) { struct ubifs_znode *o_znode = NULL, *znode = *zn; - int uninitialized_var(o_n), err, cmp, unsure = 0, nn = *n; + int o_n, err, cmp, unsure = 0, nn = *n; cmp = fallible_matches_name(c, &znode->zbranch[nn], nm); if (unlikely(cmp < 0)) @@ -1514,8 +1514,8 @@ out: */ int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu) { - int n, err = 0, lnum = -1, uninitialized_var(offs); - int uninitialized_var(len); + int n, err = 0, lnum = -1, offs; + int len; unsigned int block = key_block(c, &bu->key); struct ubifs_znode *znode; diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c index 49cb34c3f324..ccaf94ea5be3 100644 --- a/fs/ubifs/tnc_misc.c +++ b/fs/ubifs/tnc_misc.c @@ -126,8 +126,8 @@ int ubifs_search_zbranch(const struct ubifs_info *c, const struct ubifs_znode *znode, const union ubifs_key *key, int *n) { - int beg = 0, end = znode->child_cnt, uninitialized_var(mid); - int uninitialized_var(cmp); + int beg = 0, end = znode->child_cnt, mid; + int cmp; const struct ubifs_zbranch *zbr = &znode->zbranch[0]; ubifs_assert(c, end > beg); diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 02f03fadb75b..8e597db4d971 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -564,7 +564,7 @@ static udf_pblk_t udf_table_new_block(struct super_block *sb, udf_pblk_t newblock = 0; uint32_t adsize; uint32_t elen, goal_elen = 0; - struct kernel_lb_addr eloc, uninitialized_var(goal_eloc); + struct kernel_lb_addr eloc, goal_eloc; struct extent_position epos, goal_epos; int8_t etype; struct udf_inode_info *iinfo = UDF_I(table); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index f37f5cc4b19f..30525861c596 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -126,7 +126,7 @@ xfs_bmap_rtalloc( * pick an extent that will space things out in the rt area. */ if (ap->eof && ap->offset == 0) { - xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */ + xfs_rtblock_t rtx; /* realtime extent no */ error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); if (error) diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 6315324b9dc2..56e4afb2e729 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -305,7 +305,7 @@ flow_action_mixed_hw_stats_check(const struct flow_action *action, struct netlink_ext_ack *extack) { const struct flow_action_entry *action_entry; - u8 uninitialized_var(last_hw_stats); + u8 last_hw_stats; int i; if (flow_offload_has_one_action(action)) diff --git a/kernel/async.c b/kernel/async.c index 4f9c1d614016..33258e6e20f8 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -111,7 +111,7 @@ static void async_run_entry_fn(struct work_struct *work) struct async_entry *entry = container_of(work, struct async_entry, work); unsigned long flags; - ktime_t uninitialized_var(calltime), delta, rettime; + ktime_t calltime, delta, rettime; /* 1) run (and print duration) */ if (initcall_debug && system_state < SYSTEM_RUNNING) { @@ -287,7 +287,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain); */ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain) { - ktime_t uninitialized_var(starttime), delta, endtime; + ktime_t starttime, delta, endtime; if (initcall_debug && system_state < SYSTEM_RUNNING) { pr_debug("async_waiting @ %i\n", task_pid_nr(current)); diff --git a/kernel/audit.c b/kernel/audit.c index 8c201f414226..ec38479f9228 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1800,7 +1800,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, { struct audit_buffer *ab; struct timespec64 t; - unsigned int uninitialized_var(serial); + unsigned int serial; if (audit_initialized != AUDIT_INITIALIZED) return NULL; diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 683a799618ad..9d847ab851db 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -591,7 +591,7 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) int this_cpu, old_cpu; char *cp, *cp2, *cphold = NULL, replaced_byte = ' '; char *moreprompt = "more> "; - unsigned long uninitialized_var(flags); + unsigned long flags; /* Serialize kdb_printf if multiple cpus try to write at once. * But if any cpu goes recursive in kdb, just print the output, diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 36c962a86bf2..d628ab09d97b 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -882,7 +882,7 @@ static int device_dma_allocations(struct device *dev, struct dma_debug_entry **o static int dma_debug_device_change(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; - struct dma_debug_entry *uninitialized_var(entry); + struct dma_debug_entry *entry; int count; if (dma_debug_disabled()) diff --git a/kernel/events/core.c b/kernel/events/core.c index 856d98c36f56..851fc5e0e24b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11483,7 +11483,7 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event *group_leader = NULL, *output_event = NULL; struct perf_event *event, *sibling; struct perf_event_attr attr; - struct perf_event_context *ctx, *uninitialized_var(gctx); + struct perf_event_context *ctx, *gctx; struct file *event_file = NULL; struct fd group = {NULL, 0}; struct task_struct *task = NULL; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index bb0862873dba..e84eb52b646b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2189,7 +2189,7 @@ static void handle_swbp(struct pt_regs *regs) { struct uprobe *uprobe; unsigned long bp_vaddr; - int uninitialized_var(is_swbp); + int is_swbp; bp_vaddr = uprobe_get_swbp_addr(regs); if (bp_vaddr == get_trampoline_vaddr()) diff --git a/kernel/exit.c b/kernel/exit.c index 727150f28103..7bcd571618dd 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -93,7 +93,7 @@ static void __exit_signal(struct task_struct *tsk) struct signal_struct *sig = tsk->signal; bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; - struct tty_struct *uninitialized_var(tty); + struct tty_struct *tty; u64 utime, stime; sighand = rcu_dereference_check(tsk->sighand, diff --git a/kernel/futex.c b/kernel/futex.c index e646661f6282..05e88562de68 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1326,7 +1326,7 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval, static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) { int err; - u32 uninitialized_var(curval); + u32 curval; if (unlikely(should_fail_futex(true))) return -EFAULT; @@ -1496,7 +1496,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) */ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) { - u32 uninitialized_var(curval), newval; + u32 curval, newval; struct task_struct *new_owner; bool postunlock = false; DEFINE_WAKE_Q(wake_q); @@ -2370,7 +2370,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, struct task_struct *argowner) { struct futex_pi_state *pi_state = q->pi_state; - u32 uval, uninitialized_var(curval), newval; + u32 uval, curval, newval; struct task_struct *oldowner, *newowner; u32 newtid; int ret, err = 0; @@ -2996,7 +2996,7 @@ uaddr_faulted: */ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { - u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current); + u32 curval, uval, vpid = task_pid_vnr(current); union futex_key key = FUTEX_KEY_INIT; struct futex_hash_bucket *hb; struct futex_q *top_waiter; @@ -3479,7 +3479,7 @@ err_unlock: static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, bool pi, bool pending_op) { - u32 uval, uninitialized_var(nval), mval; + u32 uval, nval, mval; int err; /* Futex address must be 32bit aligned */ @@ -3609,7 +3609,7 @@ static void exit_robust_list(struct task_struct *curr) struct robust_list_head __user *head = curr->robust_list; struct robust_list __user *entry, *next_entry, *pending; unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; - unsigned int uninitialized_var(next_pi); + unsigned int next_pi; unsigned long futex_offset; int rc; @@ -3909,7 +3909,7 @@ static void compat_exit_robust_list(struct task_struct *curr) struct compat_robust_list_head __user *head = curr->compat_robust_list; struct robust_list __user *entry, *next_entry, *pending; unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; - unsigned int uninitialized_var(next_pi); + unsigned int next_pi; compat_uptr_t uentry, next_uentry, upending; compat_long_t futex_offset; int rc; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 29a8de4c50b9..84ed1d1d5013 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1723,7 +1723,7 @@ static int noop_count(struct lock_list *entry, void *data) static unsigned long __lockdep_count_forward_deps(struct lock_list *this) { unsigned long count = 0; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; __bfs_forwards(this, (void *)&count, noop_count, &target_entry); @@ -1749,7 +1749,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) static unsigned long __lockdep_count_backward_deps(struct lock_list *this) { unsigned long count = 0; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; __bfs_backwards(this, (void *)&count, noop_count, &target_entry); @@ -1804,7 +1804,7 @@ check_noncircular(struct held_lock *src, struct held_lock *target, struct lock_trace **const trace) { int ret; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; struct lock_list src_entry = { .class = hlock_class(src), .parent = NULL, @@ -1842,7 +1842,7 @@ static noinline int check_redundant(struct held_lock *src, struct held_lock *target) { int ret; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; struct lock_list src_entry = { .class = hlock_class(src), .parent = NULL, @@ -2244,8 +2244,8 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, { unsigned long usage_mask = 0, forward_mask, backward_mask; enum lock_usage_bit forward_bit = 0, backward_bit = 0; - struct lock_list *uninitialized_var(target_entry1); - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry1; + struct lock_list *target_entry; struct lock_list this, that; int ret; @@ -3438,7 +3438,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, { int ret; struct lock_list root; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; root.parent = NULL; root.class = hlock_class(this); @@ -3465,7 +3465,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, { int ret; struct lock_list root; - struct lock_list *uninitialized_var(target_entry); + struct lock_list *target_entry; root.parent = NULL; root.class = hlock_class(this); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 00867ff82412..f15471ce969e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -577,7 +577,7 @@ static void rb_wake_up_waiters(struct irq_work *work) */ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) { - struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer); + struct ring_buffer_per_cpu *cpu_buffer; DEFINE_WAIT(wait); struct rb_irq_work *work; int ret = 0; diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 34e406fe561f..8e4a3a4397f2 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -1029,7 +1029,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, { struct radix_tree_node *node, *parent; unsigned long maxindex; - int uninitialized_var(offset); + int offset; radix_tree_load_root(root, &node, &maxindex); if (index > maxindex) diff --git a/lib/test_lockup.c b/lib/test_lockup.c index bd7c7ff39f6b..ff26f36d729f 100644 --- a/lib/test_lockup.c +++ b/lib/test_lockup.c @@ -168,7 +168,7 @@ static int master_cpu; static void test_lock(bool master, bool verbose) { - u64 uninitialized_var(wait_start); + u64 wait_start; if (measure_lock_wait) wait_start = local_clock(); diff --git a/mm/frontswap.c b/mm/frontswap.c index bfa3a339253e..9d977b1fc016 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -446,7 +446,7 @@ static int __frontswap_shrink(unsigned long target_pages, void frontswap_shrink(unsigned long target_pages) { unsigned long pages_to_unuse = 0; - int uninitialized_var(type), ret; + int type, ret; /* * we don't want to hold swap_lock while doing a very diff --git a/mm/ksm.c b/mm/ksm.c index 4102034cd55a..5fb176d497ea 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2387,7 +2387,7 @@ next_mm: static void ksm_do_scan(unsigned int scan_npages) { struct rmap_item *rmap_item; - struct page *uninitialized_var(page); + struct page *page; while (scan_npages-- && likely(!freezing(current))) { cond_resched(); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 19622328e4b5..c888d9b5c745 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1004,7 +1004,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, struct mem_cgroup_reclaim_cookie *reclaim) { - struct mem_cgroup_reclaim_iter *uninitialized_var(iter); + struct mem_cgroup_reclaim_iter *iter; struct cgroup_subsys_state *css = NULL; struct mem_cgroup *memcg = NULL; struct mem_cgroup *pos = NULL; diff --git a/mm/memory.c b/mm/memory.c index 87ec87cdc1ff..5b5887b0e7f4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2205,7 +2205,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, { pte_t *pte; int err = 0; - spinlock_t *uninitialized_var(ptl); + spinlock_t *ptl; if (create) { pte = (mm == &init_mm) ? diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 381320671677..b9e85d467352 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1234,7 +1234,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, static struct page *new_page(struct page *page, unsigned long start) { struct vm_area_struct *vma; - unsigned long uninitialized_var(address); + unsigned long address; vma = find_vma(current->mm, start); while (vma) { @@ -1629,7 +1629,7 @@ static int kernel_get_mempolicy(int __user *policy, unsigned long flags) { int err; - int uninitialized_var(pval); + int pval; nodemask_t nodes; addr = untagged_addr(addr); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 48eb0f1410d4..b52a3a2a5edd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -961,7 +961,7 @@ static inline void __free_one_page(struct page *page, int migratetype, bool report) { struct capture_control *capc = task_capc(zone); - unsigned long uninitialized_var(buddy_pfn); + unsigned long buddy_pfn; unsigned long combined_pfn; unsigned int max_order; struct page *buddy; diff --git a/mm/percpu.c b/mm/percpu.c index 696367b18222..b626766160ce 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2513,7 +2513,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( const size_t static_size = __per_cpu_end - __per_cpu_start; int nr_groups = 1, nr_units = 0; size_t size_sum, min_unit_size, alloc_size; - int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ + int upa, max_upa, best_upa; /* units_per_alloc */ int last_allocs, group, unit; unsigned int cpu, tcpu; struct pcpu_alloc_info *ai; diff --git a/mm/slub.c b/mm/slub.c index ef303070d175..f226d66408ee 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1218,7 +1218,7 @@ static noinline int free_debug_processing( struct kmem_cache_node *n = get_node(s, page_to_nid(page)); void *object = head; int cnt = 0; - unsigned long uninitialized_var(flags); + unsigned long flags; int ret = 0; spin_lock_irqsave(&n->list_lock, flags); @@ -2901,7 +2901,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, struct page new; unsigned long counters; struct kmem_cache_node *n = NULL; - unsigned long uninitialized_var(flags); + unsigned long flags; stat(s, FREE_SLOWPATH); diff --git a/mm/swap.c b/mm/swap.c index a82efc33411f..de257c0a89b1 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -830,8 +830,8 @@ void release_pages(struct page **pages, int nr) LIST_HEAD(pages_to_free); struct pglist_data *locked_pgdat = NULL; struct lruvec *lruvec; - unsigned long uninitialized_var(flags); - unsigned int uninitialized_var(lock_batch); + unsigned long flags; + unsigned int lock_batch; for (i = 0; i < nr; i++) { struct page *page = pages[i]; diff --git a/net/dccp/options.c b/net/dccp/options.c index 3b42f5c6a63d..9fed0ae21e63 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -56,7 +56,7 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, (dh->dccph_doff * 4); struct dccp_options_received *opt_recv = &dp->dccps_options_received; unsigned char opt, len; - unsigned char *uninitialized_var(value); + unsigned char *value; u32 elapsed_time; __be32 opt_val; int rc; diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c index c94445b44d8c..2d42e4c35a20 100644 --- a/net/ipv4/netfilter/nf_socket_ipv4.c +++ b/net/ipv4/netfilter/nf_socket_ipv4.c @@ -84,11 +84,11 @@ nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff, struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb, const struct net_device *indev) { - __be32 uninitialized_var(daddr), uninitialized_var(saddr); - __be16 uninitialized_var(dport), uninitialized_var(sport); + __be32 daddr, saddr; + __be16 dport, sport; const struct iphdr *iph = ip_hdr(skb); struct sk_buff *data_skb = NULL; - u8 uninitialized_var(protocol); + u8 protocol; #if IS_ENABLED(CONFIG_NF_CONNTRACK) enum ip_conntrack_info ctinfo; struct nf_conn const *ct; diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index ce4fbba4acce..73bb047e6037 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -535,7 +535,7 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) { - int uninitialized_var(err); + int err; struct net *net = sock_net(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct in6_flowlabel_req freq; diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c index b9df879c48d3..6fd54744cbc3 100644 --- a/net/ipv6/netfilter/nf_socket_ipv6.c +++ b/net/ipv6/netfilter/nf_socket_ipv6.c @@ -97,7 +97,7 @@ nf_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff, struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, const struct net_device *indev) { - __be16 uninitialized_var(dport), uninitialized_var(sport); + __be16 dport, sport; const struct in6_addr *daddr = NULL, *saddr = NULL; struct ipv6hdr *iph = ipv6_hdr(skb); struct sk_buff *data_skb = NULL; diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 9eca90414bb7..b22801f97bce 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -382,7 +382,7 @@ static int help(struct sk_buff *skb, int ret; u32 seq; int dir = CTINFO2DIR(ctinfo); - unsigned int uninitialized_var(matchlen), uninitialized_var(matchoff); + unsigned int matchlen, matchoff; struct nf_ct_ftp_master *ct_ftp_info = nfct_help_data(ct); struct nf_conntrack_expect *exp; union nf_inet_addr *daddr; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 0ba020ca38e6..f02992419850 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -689,7 +689,7 @@ nfulnl_log_packet(struct net *net, struct nfnl_log_net *log = nfnl_log_pernet(net); const struct nfnl_ct_hook *nfnl_ct = NULL; struct nf_conn *ct = NULL; - enum ip_conntrack_info uninitialized_var(ctinfo); + enum ip_conntrack_info ctinfo; if (li_user && li_user->type == NF_LOG_TYPE_ULOG) li = li_user; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 3243a31f6e82..dadfc06245a3 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -388,7 +388,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, struct net_device *indev; struct net_device *outdev; struct nf_conn *ct = NULL; - enum ip_conntrack_info uninitialized_var(ctinfo); + enum ip_conntrack_info ctinfo; struct nfnl_ct_hook *nfnl_ct; bool csum_verify; char *secdata = NULL; @@ -1168,7 +1168,7 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl, struct nfqnl_instance *queue; unsigned int verdict; struct nf_queue_entry *entry; - enum ip_conntrack_info uninitialized_var(ctinfo); + enum ip_conntrack_info ctinfo; struct nfnl_ct_hook *nfnl_ct; struct nf_conn *ct = NULL; struct nfnl_queue_net *q = nfnl_queue_pernet(net); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 80ae7b9fa90a..354c3cc90741 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -225,7 +225,7 @@ static u32 flow_get_skgid(const struct sk_buff *skb) static u32 flow_get_vlan_tag(const struct sk_buff *skb) { - u16 uninitialized_var(tag); + u16 tag; if (vlan_get_tag(skb, &tag) < 0) return 0; diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index ca813697728e..4f823a8223d8 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1692,7 +1692,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, { struct cake_sched_data *q = qdisc_priv(sch); int len = qdisc_pkt_len(skb); - int uninitialized_var(ret); + int ret; struct sk_buff *ack = NULL; ktime_t now = ktime_get(); struct cake_tin_data *b; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 39b427dc7512..ce4519358106 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -360,7 +360,7 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct cbq_sched_data *q = qdisc_priv(sch); - int uninitialized_var(ret); + int ret; struct cbq_class *cl = cbq_classify(skb, sch, &ret); #ifdef CONFIG_NET_CLS_ACT diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 459a784056c0..985d5208f563 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -187,7 +187,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct fq_codel_sched_data *q = qdisc_priv(sch); unsigned int idx, prev_backlog, prev_qlen; struct fq_codel_flow *flow; - int uninitialized_var(ret); + int ret; unsigned int pkt_len; bool memory_limited; diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index fb760cee824e..4d307f17e084 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -130,7 +130,7 @@ static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, { struct fq_pie_sched_data *q = qdisc_priv(sch); struct fq_pie_flow *sel_flow; - int uninitialized_var(ret); + int ret; u8 memory_limited = false; u8 enqueue = false; u32 pkt_len; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 433f2190960f..92ad4115e473 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1533,7 +1533,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { unsigned int len = qdisc_pkt_len(skb); struct hfsc_class *cl; - int uninitialized_var(err); + int err; bool first; cl = hfsc_classify(skb, sch, &err); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 8184c87da8be..6feab225b4ba 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -579,7 +579,7 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { - int uninitialized_var(ret); + int ret; unsigned int len = qdisc_pkt_len(skb); struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = htb_classify(skb, sch, &ret); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 5a6def5e4e6d..15f400dcb400 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -349,7 +349,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) unsigned int hash, dropped; sfq_index x, qlen; struct sfq_slot *slot; - int uninitialized_var(ret); + int ret; struct sk_buff *head; int delta; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 5c4ec9386f81..b97947e9ca45 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -563,7 +563,7 @@ static int svc_udp_sendto(struct svc_rqst *rqstp) .msg_control = cmh, .msg_controllen = sizeof(buffer), }; - unsigned int uninitialized_var(sent); + unsigned int sent; int err; svc_udp_release_rqst(rqstp); @@ -1080,7 +1080,7 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) struct msghdr msg = { .msg_flags = 0, }; - unsigned int uninitialized_var(sent); + unsigned int sent; int err; svc_tcp_release_rqst(rqstp); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 914508ea9b84..c57aef829403 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -496,8 +496,8 @@ xs_read_stream_request(struct sock_xprt *transport, struct msghdr *msg, int flags, struct rpc_rqst *req) { struct xdr_buf *buf = &req->rq_private_buf; - size_t want, uninitialized_var(read); - ssize_t uninitialized_var(ret); + size_t want, read; + ssize_t ret; xs_read_header(transport, buf); @@ -844,7 +844,7 @@ static int xs_local_send_request(struct rpc_rqst *req) struct msghdr msg = { .msg_flags = XS_SENDMSG_FLAGS, }; - unsigned int uninitialized_var(sent); + unsigned int sent; int status; /* Close the stream if the previous transmission was incomplete */ @@ -915,7 +915,7 @@ static int xs_udp_send_request(struct rpc_rqst *req) .msg_namelen = xprt->addrlen, .msg_flags = XS_SENDMSG_FLAGS, }; - unsigned int uninitialized_var(sent); + unsigned int sent; int status; xs_pktdump("packet data:", @@ -999,7 +999,7 @@ static int xs_tcp_send_request(struct rpc_rqst *req) .msg_flags = XS_SENDMSG_FLAGS, }; bool vm_wait = false; - unsigned int uninitialized_var(sent); + unsigned int sent; int status; /* Close the stream if the previous transmission was incomplete */ diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 24f64bc0de18..710bd44eaa49 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -670,7 +670,7 @@ static int tls_push_record(struct sock *sk, int flags, struct tls_prot_info *prot = &tls_ctx->prot_info; struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec, *tmp = NULL; - u32 i, split_point, uninitialized_var(orig_end); + u32 i, split_point, orig_end; struct sk_msg *msg_pl, *msg_en; struct aead_request *req; bool split; diff --git a/sound/core/control_compat.c b/sound/core/control_compat.c index d55be1db1a8a..02df1d7db9a1 100644 --- a/sound/core/control_compat.c +++ b/sound/core/control_compat.c @@ -223,7 +223,7 @@ static int copy_ctl_value_from_user(struct snd_card *card, { struct snd_ctl_elem_value32 __user *data32 = userdata; int i, type, size; - int uninitialized_var(count); + int count; unsigned int indirect; if (copy_from_user(&data->id, &data32->id, sizeof(data->id))) diff --git a/sound/isa/sb/sb16_csp.c b/sound/isa/sb/sb16_csp.c index 4ad0ff0c4508..270af863e198 100644 --- a/sound/isa/sb/sb16_csp.c +++ b/sound/isa/sb/sb16_csp.c @@ -102,7 +102,7 @@ static void info_read(struct snd_info_entry *entry, struct snd_info_buffer *buff int snd_sb_csp_new(struct snd_sb *chip, int device, struct snd_hwdep ** rhwdep) { struct snd_sb_csp *p; - int uninitialized_var(version); + int version; int err; struct snd_hwdep *hw; diff --git a/sound/usb/endpoint.c b/sound/usb/endpoint.c index 9bea7d3f99f8..9468a2de8c9b 100644 --- a/sound/usb/endpoint.c +++ b/sound/usb/endpoint.c @@ -335,7 +335,7 @@ static void queue_pending_output_urbs(struct snd_usb_endpoint *ep) while (test_bit(EP_FLAG_RUNNING, &ep->flags)) { unsigned long flags; - struct snd_usb_packet_info *uninitialized_var(packet); + struct snd_usb_packet_info *packet; struct snd_urb_ctx *ctx = NULL; int err, i; -- cgit v1.2.3 From 3d2e83a2a6a0657c1cf145fa6ba23620715d6c36 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:41 +0200 Subject: timers: Preserve higher bits of expiration on index calculation The higher bits of the timer expiration are cropped while calling calc_index() due to the implicit cast from unsigned long to unsigned int. This loss shouldn't have consequences on the current code since all the computation to calculate the index is done on the lower 32 bits. However to prepare for returning the actual bucket expiration from calc_index() in order to properly fix base->next_expiry updates, the higher bits need to be preserved. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200717140551.29076-3-frederic@kernel.org --- kernel/time/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index df1ff803acc4..bcdc3045138d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -487,7 +487,7 @@ static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) * Helper function to calculate the array index for a given expiry * time. */ -static inline unsigned calc_index(unsigned expires, unsigned lvl) +static inline unsigned calc_index(unsigned long expires, unsigned lvl) { expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); return LVL_OFFS(lvl) + (expires & LVL_MASK); -- cgit v1.2.3 From 1f32cab0db4bdf6491eb4a60838f278e01c31698 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Fri, 17 Jul 2020 16:05:42 +0200 Subject: timers: Use only bucket expiry for base->next_expiry value The bucket expiry time is the effective expriy time of timers and is greater than or equal to the requested timer expiry time. This is due to the guarantee that timers never expire early and the reduced expiry granularity in the secondary wheel levels. When a timer is enqueued, trigger_dyntick_cpu() checks whether the timer is the new first timer. This check compares next_expiry with the requested timer expiry value and not with the effective expiry value of the bucket into which the timer was queued. Storing the requested timer expiry value in base->next_expiry can lead to base->clk going backwards if the requested timer expiry value is smaller than base->clk. Commit 30c66fc30ee7 ("timer: Prevent base->clk from moving backward") worked around this by preventing the store when timer->expiry is before base->clk, but did not fix the underlying problem. Use the expiry value of the bucket into which the timer is queued to do the new first timer check. This fixes the base->clk going backward problem. The workaround of commit 30c66fc30ee7 ("timer: Prevent base->clk from moving backward") in trigger_dyntick_cpu() is not longer necessary as the timers bucket expiry is guaranteed to be greater than or equal base->clk. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200717140551.29076-4-frederic@kernel.org --- kernel/time/timer.c | 64 ++++++++++++++++++++++++++++------------------------- 1 file changed, 34 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index bcdc3045138d..a7a3cf737411 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -487,35 +487,39 @@ static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) * Helper function to calculate the array index for a given expiry * time. */ -static inline unsigned calc_index(unsigned long expires, unsigned lvl) +static inline unsigned calc_index(unsigned long expires, unsigned lvl, + unsigned long *bucket_expiry) { expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); + *bucket_expiry = expires << LVL_SHIFT(lvl); return LVL_OFFS(lvl) + (expires & LVL_MASK); } -static int calc_wheel_index(unsigned long expires, unsigned long clk) +static int calc_wheel_index(unsigned long expires, unsigned long clk, + unsigned long *bucket_expiry) { unsigned long delta = expires - clk; unsigned int idx; if (delta < LVL_START(1)) { - idx = calc_index(expires, 0); + idx = calc_index(expires, 0, bucket_expiry); } else if (delta < LVL_START(2)) { - idx = calc_index(expires, 1); + idx = calc_index(expires, 1, bucket_expiry); } else if (delta < LVL_START(3)) { - idx = calc_index(expires, 2); + idx = calc_index(expires, 2, bucket_expiry); } else if (delta < LVL_START(4)) { - idx = calc_index(expires, 3); + idx = calc_index(expires, 3, bucket_expiry); } else if (delta < LVL_START(5)) { - idx = calc_index(expires, 4); + idx = calc_index(expires, 4, bucket_expiry); } else if (delta < LVL_START(6)) { - idx = calc_index(expires, 5); + idx = calc_index(expires, 5, bucket_expiry); } else if (delta < LVL_START(7)) { - idx = calc_index(expires, 6); + idx = calc_index(expires, 6, bucket_expiry); } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { - idx = calc_index(expires, 7); + idx = calc_index(expires, 7, bucket_expiry); } else if ((long) delta < 0) { idx = clk & LVL_MASK; + *bucket_expiry = clk; } else { /* * Force expire obscene large timeouts to expire at the @@ -524,7 +528,7 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk) if (delta >= WHEEL_TIMEOUT_CUTOFF) expires = clk + WHEEL_TIMEOUT_MAX; - idx = calc_index(expires, LVL_DEPTH - 1); + idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry); } return idx; } @@ -544,16 +548,18 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, } static void -__internal_add_timer(struct timer_base *base, struct timer_list *timer) +__internal_add_timer(struct timer_base *base, struct timer_list *timer, + unsigned long *bucket_expiry) { unsigned int idx; - idx = calc_wheel_index(timer->expires, base->clk); + idx = calc_wheel_index(timer->expires, base->clk, bucket_expiry); enqueue_timer(base, timer, idx); } static void -trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) +trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer, + unsigned long bucket_expiry) { if (!is_timers_nohz_active()) return; @@ -576,31 +582,29 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) if (!base->is_idle) return; - /* Check whether this is the new first expiring timer: */ - if (time_after_eq(timer->expires, base->next_expiry)) + /* + * Check whether this is the new first expiring timer. The + * effective expiry time of the timer is required here + * (bucket_expiry) instead of timer->expires. + */ + if (time_after_eq(bucket_expiry, base->next_expiry)) return; /* * Set the next expiry time and kick the CPU so it can reevaluate the * wheel: */ - if (time_before(timer->expires, base->clk)) { - /* - * Prevent from forward_timer_base() moving the base->clk - * backward - */ - base->next_expiry = base->clk; - } else { - base->next_expiry = timer->expires; - } + base->next_expiry = bucket_expiry; wake_up_nohz_cpu(base->cpu); } static void internal_add_timer(struct timer_base *base, struct timer_list *timer) { - __internal_add_timer(base, timer); - trigger_dyntick_cpu(base, timer); + unsigned long bucket_expiry; + + __internal_add_timer(base, timer, &bucket_expiry); + trigger_dyntick_cpu(base, timer, bucket_expiry); } #ifdef CONFIG_DEBUG_OBJECTS_TIMERS @@ -959,9 +963,9 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, static inline int __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options) { + unsigned long clk = 0, flags, bucket_expiry; struct timer_base *base, *new_base; unsigned int idx = UINT_MAX; - unsigned long clk = 0, flags; int ret = 0; BUG_ON(!timer->function); @@ -1000,7 +1004,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option } clk = base->clk; - idx = calc_wheel_index(expires, clk); + idx = calc_wheel_index(expires, clk, &bucket_expiry); /* * Retrieve and compare the array index of the pending @@ -1059,7 +1063,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option */ if (idx != UINT_MAX && clk == base->clk) { enqueue_timer(base, timer, idx); - trigger_dyntick_cpu(base, timer); + trigger_dyntick_cpu(base, timer, bucket_expiry); } else { internal_add_timer(base, timer); } -- cgit v1.2.3 From 9a2b764b06c880678416d803d027f575ae40ec99 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:43 +0200 Subject: timers: Move trigger_dyntick_cpu() to enqueue_timer() Consolidate the code by calling trigger_dyntick_cpu() from enqueue_timer() instead of calling it from all its callers. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-5-frederic@kernel.org --- kernel/time/timer.c | 61 ++++++++++++++++++++++------------------------------- 1 file changed, 25 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a7a3cf737411..2af08a169564 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -533,30 +533,6 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk, return idx; } -/* - * Enqueue the timer into the hash bucket, mark it pending in - * the bitmap and store the index in the timer flags. - */ -static void enqueue_timer(struct timer_base *base, struct timer_list *timer, - unsigned int idx) -{ - hlist_add_head(&timer->entry, base->vectors + idx); - __set_bit(idx, base->pending_map); - timer_set_idx(timer, idx); - - trace_timer_start(timer, timer->expires, timer->flags); -} - -static void -__internal_add_timer(struct timer_base *base, struct timer_list *timer, - unsigned long *bucket_expiry) -{ - unsigned int idx; - - idx = calc_wheel_index(timer->expires, base->clk, bucket_expiry); - enqueue_timer(base, timer, idx); -} - static void trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer, unsigned long bucket_expiry) @@ -598,15 +574,31 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer, wake_up_nohz_cpu(base->cpu); } -static void -internal_add_timer(struct timer_base *base, struct timer_list *timer) +/* + * Enqueue the timer into the hash bucket, mark it pending in + * the bitmap, store the index in the timer flags then wake up + * the target CPU if needed. + */ +static void enqueue_timer(struct timer_base *base, struct timer_list *timer, + unsigned int idx, unsigned long bucket_expiry) { - unsigned long bucket_expiry; + hlist_add_head(&timer->entry, base->vectors + idx); + __set_bit(idx, base->pending_map); + timer_set_idx(timer, idx); - __internal_add_timer(base, timer, &bucket_expiry); + trace_timer_start(timer, timer->expires, timer->flags); trigger_dyntick_cpu(base, timer, bucket_expiry); } +static void internal_add_timer(struct timer_base *base, struct timer_list *timer) +{ + unsigned long bucket_expiry; + unsigned int idx; + + idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry); + enqueue_timer(base, timer, idx, bucket_expiry); +} + #ifdef CONFIG_DEBUG_OBJECTS_TIMERS static struct debug_obj_descr timer_debug_descr; @@ -1057,16 +1049,13 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option /* * If 'idx' was calculated above and the base time did not advance * between calculating 'idx' and possibly switching the base, only - * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise - * we need to (re)calculate the wheel index via - * internal_add_timer(). + * enqueue_timer() is required. Otherwise we need to (re)calculate + * the wheel index via internal_add_timer(). */ - if (idx != UINT_MAX && clk == base->clk) { - enqueue_timer(base, timer, idx); - trigger_dyntick_cpu(base, timer, bucket_expiry); - } else { + if (idx != UINT_MAX && clk == base->clk) + enqueue_timer(base, timer, idx, bucket_expiry); + else internal_add_timer(base, timer); - } out_unlock: raw_spin_unlock_irqrestore(&base->lock, flags); -- cgit v1.2.3 From 4468897211628865ee2392acb5ad281f74176f63 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:44 +0200 Subject: timers: Add comments about calc_index() ceiling work calc_index() adds 1 unit of the level granularity to the expiry passed in parameter to ensure that the timer doesn't expire too early. Add a comment to explain that and the resulting layout in the wheel. Suggested-by: Thomas Gleixner Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-6-frederic@kernel.org --- kernel/time/timer.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2af08a169564..af1c08b0b168 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -156,7 +156,8 @@ EXPORT_SYMBOL(jiffies_64); /* * The time start value for each level to select the bucket at enqueue - * time. + * time. We start from the last possible delta of the previous level + * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()). */ #define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT)) @@ -490,6 +491,15 @@ static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) static inline unsigned calc_index(unsigned long expires, unsigned lvl, unsigned long *bucket_expiry) { + + /* + * The timer wheel has to guarantee that a timer does not fire + * early. Early expiry can happen due to: + * - Timer is armed at the edge of a tick + * - Truncation of the expiry time in the outer wheel levels + * + * Round up with level granularity to prevent this. + */ expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); *bucket_expiry = expires << LVL_SHIFT(lvl); return LVL_OFFS(lvl) + (expires & LVL_MASK); -- cgit v1.2.3 From 001ec1b3925da0d51847c23fc0aa4129282db526 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:45 +0200 Subject: timers: Optimize _next_timer_interrupt() level iteration If a level has a timer that expires before reaching the next level, there is no need to iterate further. The next level is reached when the 3 lower bits of the current level are cleared. If the next event happens before/during that, the next levels won't provide an earlier expiration. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-7-frederic@kernel.org --- kernel/time/timer.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index af1c08b0b168..9abc41715fd2 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1526,6 +1526,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) clk = base->clk; for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { int pos = next_pending_bucket(base, offset, clk & LVL_MASK); + unsigned long lvl_clk = clk & LVL_CLK_MASK; if (pos >= 0) { unsigned long tmp = clk + (unsigned long) pos; @@ -1533,6 +1534,13 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) tmp <<= LVL_SHIFT(lvl); if (time_before(tmp, next)) next = tmp; + + /* + * If the next expiration happens before we reach + * the next level, no need to check further. + */ + if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK)) + break; } /* * Clock for the next level. If the current level clock lower @@ -1570,7 +1578,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) * So the simple check whether the lower bits of the current * level are 0 or not is sufficient for all cases. */ - adj = clk & LVL_CLK_MASK ? 1 : 0; + adj = lvl_clk ? 1 : 0; clk >>= LVL_CLK_SHIFT; clk += adj; } -- cgit v1.2.3 From dc2a0f1fb2a06df09f5094f29aea56b763aa7cca Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:46 +0200 Subject: timers: Always keep track of next expiry So far next expiry was only tracked while the CPU was in nohz_idle mode in order to cope with missing ticks that can't increment the base->clk periodically anymore. This logic is going to be expanded beyond nohz in order to spare timer softirqs so do it unconditionally. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-8-frederic@kernel.org --- kernel/time/timer.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 9abc41715fd2..76fd9644638b 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -544,8 +544,7 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk, } static void -trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer, - unsigned long bucket_expiry) +trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) { if (!is_timers_nohz_active()) return; @@ -565,23 +564,8 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer, * timer is not deferrable. If the other CPU is on the way to idle * then it can't set base->is_idle as we hold the base lock: */ - if (!base->is_idle) - return; - - /* - * Check whether this is the new first expiring timer. The - * effective expiry time of the timer is required here - * (bucket_expiry) instead of timer->expires. - */ - if (time_after_eq(bucket_expiry, base->next_expiry)) - return; - - /* - * Set the next expiry time and kick the CPU so it can reevaluate the - * wheel: - */ - base->next_expiry = bucket_expiry; - wake_up_nohz_cpu(base->cpu); + if (base->is_idle) + wake_up_nohz_cpu(base->cpu); } /* @@ -592,12 +576,26 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer, static void enqueue_timer(struct timer_base *base, struct timer_list *timer, unsigned int idx, unsigned long bucket_expiry) { + hlist_add_head(&timer->entry, base->vectors + idx); __set_bit(idx, base->pending_map); timer_set_idx(timer, idx); trace_timer_start(timer, timer->expires, timer->flags); - trigger_dyntick_cpu(base, timer, bucket_expiry); + + /* + * Check whether this is the new first expiring timer. The + * effective expiry time of the timer is required here + * (bucket_expiry) instead of timer->expires. + */ + if (time_before(bucket_expiry, base->next_expiry)) { + /* + * Set the next expiry time and kick the CPU so it + * can reevaluate the wheel: + */ + base->next_expiry = bucket_expiry; + trigger_dyntick_cpu(base, timer); + } } static void internal_add_timer(struct timer_base *base, struct timer_list *timer) @@ -1493,7 +1491,6 @@ static int __collect_expired_timers(struct timer_base *base, return levels; } -#ifdef CONFIG_NO_HZ_COMMON /* * Find the next pending bucket of a level. Search from level start (@offset) * + @clk upwards and if nothing there, search from start of the level @@ -1585,6 +1582,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) return next; } +#ifdef CONFIG_NO_HZ_COMMON /* * Check, if the next hrtimer event is before the next timer wheel * event: @@ -1790,6 +1788,7 @@ static inline void __run_timers(struct timer_base *base) levels = collect_expired_timers(base, heads); base->clk++; + base->next_expiry = __next_timer_interrupt(base); while (levels--) expire_timers(base, heads + levels); @@ -2042,6 +2041,7 @@ static void __init init_timer_cpu(int cpu) base->cpu = cpu; raw_spin_lock_init(&base->lock); base->clk = jiffies; + base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; timer_base_init_expiry_lock(base); } } -- cgit v1.2.3 From 90d52f65f303091be17b5f4ffab7090b2064b4a1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:47 +0200 Subject: timers: Reuse next expiry cache after nohz exit Now that the next expiry it tracked unconditionally when a timer is added, this information can be reused on a tick firing after exiting nohz. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-9-frederic@kernel.org --- kernel/time/timer.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 76fd9644638b..13f48ee708aa 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1706,13 +1706,11 @@ static int collect_expired_timers(struct timer_base *base, * the next expiring timer. */ if ((long)(now - base->clk) > 2) { - unsigned long next = __next_timer_interrupt(base); - /* * If the next timer is ahead of time forward to current * jiffies, otherwise forward to the next expiry time: */ - if (time_after(next, now)) { + if (time_after(base->next_expiry, now)) { /* * The call site will increment base->clk and then * terminate the expiry loop immediately. @@ -1720,7 +1718,7 @@ static int collect_expired_timers(struct timer_base *base, base->clk = now; return 0; } - base->clk = next; + base->clk = base->next_expiry; } return __collect_expired_timers(base, heads); } -- cgit v1.2.3 From 1f8a4212dc83f8353843fabf6465fd918372fbbf Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:48 +0200 Subject: timers: Expand clk forward logic beyond nohz As for next_expiry, the base->clk catch up logic will be expanded beyond NOHZ in order to avoid triggering useless softirqs. If softirqs should only fire to expire pending timers, periodic base->clk increments must be skippable for random amounts of time. Therefore prepare to catch-up with missing updates whenever an up-to-date base clock is needed. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-10-frederic@kernel.org --- kernel/time/timer.c | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 13f48ee708aa..1be92b53b75f 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -888,19 +888,12 @@ get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { -#ifdef CONFIG_NO_HZ_COMMON unsigned long jnow; - /* - * We only forward the base when we are idle or have just come out of - * idle (must_forward_clk logic), and have a delta between base clock - * and jiffies. In the common case, run_timers will take care of it. - */ - if (likely(!base->must_forward_clk)) + if (!base->must_forward_clk) return; jnow = READ_ONCE(jiffies); - base->must_forward_clk = base->is_idle; if ((long)(jnow - base->clk) < 2) return; @@ -915,7 +908,6 @@ static inline void forward_timer_base(struct timer_base *base) return; base->clk = base->next_expiry; } -#endif } @@ -1667,10 +1659,8 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) * logic is only maintained for the BASE_STD base, deferrable * timers may still see large granularity skew (by design). */ - if ((expires - basem) > TICK_NSEC) { - base->must_forward_clk = true; + if ((expires - basem) > TICK_NSEC) base->is_idle = true; - } } raw_spin_unlock(&base->lock); @@ -1769,16 +1759,7 @@ static inline void __run_timers(struct timer_base *base) /* * timer_base::must_forward_clk must be cleared before running * timers so that any timer functions that call mod_timer() will - * not try to forward the base. Idle tracking / clock forwarding - * logic is only used with BASE_STD timers. - * - * The must_forward_clk flag is cleared unconditionally also for - * the deferrable base. The deferrable base is not affected by idle - * tracking and never forwarded, so clearing the flag is a NOOP. - * - * The fact that the deferrable base is never forwarded can cause - * large variations in granularity for deferrable timers, but they - * can be deferred for long periods due to idle anyway. + * not try to forward the base. */ base->must_forward_clk = false; @@ -1791,6 +1772,7 @@ static inline void __run_timers(struct timer_base *base) while (levels--) expire_timers(base, heads + levels); } + base->must_forward_clk = true; raw_spin_unlock_irq(&base->lock); timer_base_unlock_expiry(base); } -- cgit v1.2.3 From d4f7dae87096dfe722bf32aa82076ece1063746c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:49 +0200 Subject: timers: Spare timer softirq until next expiry Now that the core timer infrastructure doesn't depend anymore on periodic base->clk increments, even when the CPU is not in NO_HZ mode, timer softirqs can be skipped until there are timers to expire. Some spurious softirqs can still remain since base->next_expiry doesn't keep track of canceled timers but this still reduces the number of softirqs significantly: ~15 times less for HZ=1000 and ~5 times less for HZ=100. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-11-frederic@kernel.org --- kernel/time/timer.c | 49 ++++++++----------------------------------------- 1 file changed, 8 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 1be92b53b75f..4f78a7bff9e1 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1458,10 +1458,10 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) } } -static int __collect_expired_timers(struct timer_base *base, - struct hlist_head *heads) +static int collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) { - unsigned long clk = base->clk; + unsigned long clk = base->clk = base->next_expiry; struct hlist_head *vec; int i, levels = 0; unsigned int idx; @@ -1684,40 +1684,6 @@ void timer_clear_idle(void) */ base->is_idle = false; } - -static int collect_expired_timers(struct timer_base *base, - struct hlist_head *heads) -{ - unsigned long now = READ_ONCE(jiffies); - - /* - * NOHZ optimization. After a long idle sleep we need to forward the - * base to current jiffies. Avoid a loop by searching the bitfield for - * the next expiring timer. - */ - if ((long)(now - base->clk) > 2) { - /* - * If the next timer is ahead of time forward to current - * jiffies, otherwise forward to the next expiry time: - */ - if (time_after(base->next_expiry, now)) { - /* - * The call site will increment base->clk and then - * terminate the expiry loop immediately. - */ - base->clk = now; - return 0; - } - base->clk = base->next_expiry; - } - return __collect_expired_timers(base, heads); -} -#else -static inline int collect_expired_timers(struct timer_base *base, - struct hlist_head *heads) -{ - return __collect_expired_timers(base, heads); -} #endif /* @@ -1750,7 +1716,7 @@ static inline void __run_timers(struct timer_base *base) struct hlist_head heads[LVL_DEPTH]; int levels; - if (!time_after_eq(jiffies, base->clk)) + if (time_before(jiffies, base->next_expiry)) return; timer_base_lock_expiry(base); @@ -1763,7 +1729,8 @@ static inline void __run_timers(struct timer_base *base) */ base->must_forward_clk = false; - while (time_after_eq(jiffies, base->clk)) { + while (time_after_eq(jiffies, base->clk) && + time_after_eq(jiffies, base->next_expiry)) { levels = collect_expired_timers(base, heads); base->clk++; @@ -1798,12 +1765,12 @@ void run_local_timers(void) hrtimer_run_queues(); /* Raise the softirq only if required. */ - if (time_before(jiffies, base->clk)) { + if (time_before(jiffies, base->next_expiry)) { if (!IS_ENABLED(CONFIG_NO_HZ_COMMON)) return; /* CPU is awake, so check the deferrable base. */ base++; - if (time_before(jiffies, base->clk)) + if (time_before(jiffies, base->next_expiry)) return; } raise_softirq(TIMER_SOFTIRQ); -- cgit v1.2.3 From 0975fb565b8b8f9e0c96d0de39fcb954833ea5e0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:50 +0200 Subject: timers: Remove must_forward_clk There is no reason to keep this guard around. The code makes sure that base->clk remains sane and won't be forwarded beyond jiffies nor set backward. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-12-frederic@kernel.org --- kernel/time/timer.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 4f78a7bff9e1..8b3fb52d8c47 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -205,7 +205,6 @@ struct timer_base { unsigned long next_expiry; unsigned int cpu; bool is_idle; - bool must_forward_clk; DECLARE_BITMAP(pending_map, WHEEL_SIZE); struct hlist_head vectors[WHEEL_SIZE]; } ____cacheline_aligned; @@ -888,12 +887,13 @@ get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { - unsigned long jnow; + unsigned long jnow = READ_ONCE(jiffies); - if (!base->must_forward_clk) - return; - - jnow = READ_ONCE(jiffies); + /* + * No need to forward if we are close enough below jiffies. + * Also while executing timers, base->clk is 1 offset ahead + * of jiffies to avoid endless requeuing to current jffies. + */ if ((long)(jnow - base->clk) < 2) return; @@ -1722,16 +1722,8 @@ static inline void __run_timers(struct timer_base *base) timer_base_lock_expiry(base); raw_spin_lock_irq(&base->lock); - /* - * timer_base::must_forward_clk must be cleared before running - * timers so that any timer functions that call mod_timer() will - * not try to forward the base. - */ - base->must_forward_clk = false; - while (time_after_eq(jiffies, base->clk) && time_after_eq(jiffies, base->next_expiry)) { - levels = collect_expired_timers(base, heads); base->clk++; base->next_expiry = __next_timer_interrupt(base); @@ -1739,7 +1731,6 @@ static inline void __run_timers(struct timer_base *base) while (levels--) expire_timers(base, heads + levels); } - base->must_forward_clk = true; raw_spin_unlock_irq(&base->lock); timer_base_unlock_expiry(base); } @@ -1935,7 +1926,6 @@ int timers_prepare_cpu(unsigned int cpu) base->clk = jiffies; base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; base->is_idle = false; - base->must_forward_clk = true; } return 0; } -- cgit v1.2.3 From 36cd28a4cdd05d47ccb62a2d86e8f37839cc879a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2020 16:05:51 +0200 Subject: timers: Lower base clock forwarding threshold There is nothing that prevents from forwarding the base clock if it's one jiffy off. The reason for this arbitrary limit of two jiffies is historical and does not longer exist. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20200717140551.29076-13-frederic@kernel.org --- kernel/time/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 8b3fb52d8c47..77e21e98ec32 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -894,7 +894,7 @@ static inline void forward_timer_base(struct timer_base *base) * Also while executing timers, base->clk is 1 offset ahead * of jiffies to avoid endless requeuing to current jffies. */ - if ((long)(jnow - base->clk) < 2) + if ((long)(jnow - base->clk) < 1) return; /* -- cgit v1.2.3 From 9180bd467f9abdb44afde650d07e3b9dd66d837c Mon Sep 17 00:00:00 2001 From: André Almeida Date: Thu, 2 Jul 2020 17:28:40 -0300 Subject: futex: Remove put_futex_key() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since 4b39f99c ("futex: Remove {get,drop}_futex_key_refs()"), put_futex_key() is empty. Remove all references for this function and the then redundant labels. Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200702202843.520764-2-andrealmeid@collabora.com --- kernel/futex.c | 61 ++++++++++++---------------------------------------------- 1 file changed, 12 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index e646661f6282..bd9adfca5d51 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -677,10 +677,6 @@ out: return err; } -static inline void put_futex_key(union futex_key *key) -{ -} - /** * fault_in_user_writeable() - Fault in user address and verify RW access * @uaddr: pointer to faulting user space address @@ -1617,7 +1613,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) /* Make sure we really have tasks to wakeup */ if (!hb_waiters_pending(hb)) - goto out_put_key; + goto out; spin_lock(&hb->lock); @@ -1640,8 +1636,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) spin_unlock(&hb->lock); wake_up_q(&wake_q); -out_put_key: - put_futex_key(&key); out: return ret; } @@ -1712,7 +1706,7 @@ retry: goto out; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) - goto out_put_key1; + goto out; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -1730,13 +1724,13 @@ retry_private: * an MMU, but we might get them from range checking */ ret = op_ret; - goto out_put_keys; + goto out; } if (op_ret == -EFAULT) { ret = fault_in_user_writeable(uaddr2); if (ret) - goto out_put_keys; + goto out; } if (!(flags & FLAGS_SHARED)) { @@ -1744,8 +1738,6 @@ retry_private: goto retry_private; } - put_futex_key(&key2); - put_futex_key(&key1); cond_resched(); goto retry; } @@ -1781,10 +1773,6 @@ retry_private: out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); -out_put_keys: - put_futex_key(&key2); -out_put_key1: - put_futex_key(&key1); out: return ret; } @@ -1996,7 +1984,7 @@ retry: ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, requeue_pi ? FUTEX_WRITE : FUTEX_READ); if (unlikely(ret != 0)) - goto out_put_key1; + goto out; /* * The check above which compares uaddrs is not sufficient for @@ -2004,7 +1992,7 @@ retry: */ if (requeue_pi && match_futex(&key1, &key2)) { ret = -EINVAL; - goto out_put_keys; + goto out; } hb1 = hash_futex(&key1); @@ -2025,13 +2013,11 @@ retry_private: ret = get_user(curval, uaddr1); if (ret) - goto out_put_keys; + goto out; if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(&key2); - put_futex_key(&key1); goto retry; } if (curval != *cmpval) { @@ -2090,8 +2076,6 @@ retry_private: case -EFAULT: double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); - put_futex_key(&key2); - put_futex_key(&key1); ret = fault_in_user_writeable(uaddr2); if (!ret) goto retry; @@ -2106,8 +2090,6 @@ retry_private: */ double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); - put_futex_key(&key2); - put_futex_key(&key1); /* * Handle the case where the owner is in the middle of * exiting. Wait for the exit to complete otherwise @@ -2217,10 +2199,6 @@ out_unlock: wake_up_q(&wake_q); hb_waiters_dec(hb2); -out_put_keys: - put_futex_key(&key2); -out_put_key1: - put_futex_key(&key1); out: return ret ? ret : task_count; } @@ -2697,7 +2675,6 @@ retry_private: if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(&q->key); goto retry; } @@ -2707,8 +2684,6 @@ retry_private: } out: - if (ret) - put_futex_key(&q->key); return ret; } @@ -2853,7 +2828,6 @@ retry_private: * - EAGAIN: The user space value changed. */ queue_unlock(hb); - put_futex_key(&q.key); /* * Handle the case where the owner is in the middle of * exiting. Wait for the exit to complete otherwise @@ -2961,13 +2935,11 @@ no_block: put_pi_state(pi_state); } - goto out_put_key; + goto out; out_unlock_put_key: queue_unlock(hb); -out_put_key: - put_futex_key(&q.key); out: if (to) { hrtimer_cancel(&to->timer); @@ -2980,12 +2952,11 @@ uaddr_faulted: ret = fault_in_user_writeable(uaddr); if (ret) - goto out_put_key; + goto out; if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(&q.key); goto retry; } @@ -3114,16 +3085,13 @@ retry: out_unlock: spin_unlock(&hb->lock); out_putkey: - put_futex_key(&key); return ret; pi_retry: - put_futex_key(&key); cond_resched(); goto retry; pi_faulted: - put_futex_key(&key); ret = fault_in_user_writeable(uaddr); if (!ret) @@ -3265,7 +3233,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) - goto out_key2; + goto out; /* * The check above which compares uaddrs is not sufficient for @@ -3274,7 +3242,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (match_futex(&q.key, &key2)) { queue_unlock(hb); ret = -EINVAL; - goto out_put_keys; + goto out; } /* Queue the futex_q, drop the hb lock, wait for wakeup. */ @@ -3284,7 +3252,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); spin_unlock(&hb->lock); if (ret) - goto out_put_keys; + goto out; /* * In order for us to be here, we know our q.key == key2, and since @@ -3374,11 +3342,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ret = -EWOULDBLOCK; } -out_put_keys: - put_futex_key(&q.key); -out_key2: - put_futex_key(&key2); - out: if (to) { hrtimer_cancel(&to->timer); -- cgit v1.2.3 From d7c5ed73b19c4640426d9c106f70ec2cb532034d Mon Sep 17 00:00:00 2001 From: André Almeida Date: Thu, 2 Jul 2020 17:28:41 -0300 Subject: futex: Remove needless goto's MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As stated in the coding style documentation, "if there is no cleanup needed then just return directly", instead of jumping to a label and then returning. Remove such goto's and replace with a return statement. When there's a ternary operator on the return value, replace it with the result of the operation when it is logically possible to determine it by the control flow. Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200702202843.520764-3-andrealmeid@collabora.com --- kernel/futex.c | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index bd9adfca5d51..362fbca6d614 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1607,13 +1607,13 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; hb = hash_futex(&key); /* Make sure we really have tasks to wakeup */ if (!hb_waiters_pending(hb)) - goto out; + return ret; spin_lock(&hb->lock); @@ -1636,7 +1636,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) spin_unlock(&hb->lock); wake_up_q(&wake_q); -out: return ret; } @@ -1703,10 +1702,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, retry: ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) - goto out; + return ret; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -1724,13 +1723,13 @@ retry_private: * an MMU, but we might get them from range checking */ ret = op_ret; - goto out; + return ret; } if (op_ret == -EFAULT) { ret = fault_in_user_writeable(uaddr2); if (ret) - goto out; + return ret; } if (!(flags & FLAGS_SHARED)) { @@ -1773,7 +1772,6 @@ retry_private: out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); -out: return ret; } @@ -1980,20 +1978,18 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, retry: ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, requeue_pi ? FUTEX_WRITE : FUTEX_READ); if (unlikely(ret != 0)) - goto out; + return ret; /* * The check above which compares uaddrs is not sufficient for * shared futexes. We need to compare the keys: */ - if (requeue_pi && match_futex(&key1, &key2)) { - ret = -EINVAL; - goto out; - } + if (requeue_pi && match_futex(&key1, &key2)) + return -EINVAL; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -2013,7 +2009,7 @@ retry_private: ret = get_user(curval, uaddr1); if (ret) - goto out; + return ret; if (!(flags & FLAGS_SHARED)) goto retry_private; @@ -2079,7 +2075,7 @@ retry_private: ret = fault_in_user_writeable(uaddr2); if (!ret) goto retry; - goto out; + return ret; case -EBUSY: case -EAGAIN: /* @@ -2198,8 +2194,6 @@ out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); hb_waiters_dec(hb2); - -out: return ret ? ret : task_count; } @@ -2545,7 +2539,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) */ if (q->pi_state->owner != current) ret = fixup_pi_state_owner(uaddr, q, current); - goto out; + return ret ? ret : locked; } /* @@ -2558,7 +2552,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) */ if (q->pi_state->owner == current) { ret = fixup_pi_state_owner(uaddr, q, NULL); - goto out; + return ret; } /* @@ -2572,8 +2566,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) q->pi_state->owner); } -out: - return ret ? ret : locked; + return ret; } /** @@ -2670,7 +2663,7 @@ retry_private: ret = get_user(uval, uaddr); if (ret) - goto out; + return ret; if (!(flags & FLAGS_SHARED)) goto retry_private; @@ -2683,7 +2676,6 @@ retry_private: ret = -EWOULDBLOCK; } -out: return ret; } -- cgit v1.2.3 From 9261308598ad28b9a8a2237d881833e9f217244e Mon Sep 17 00:00:00 2001 From: André Almeida Date: Thu, 2 Jul 2020 17:28:43 -0300 Subject: futex: Consistently use fshared as boolean MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since fshared is only conveying true/false values, declare it as bool. In get_futex_key() the usage of fshared can be restricted to the first part of the function. If fshared is false the function is terminated early and the subsequent code can use a constant 'true' instead of the variable. Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200702202843.520764-5-andrealmeid@collabora.com --- kernel/futex.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 362fbca6d614..cda91755b77d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -476,7 +476,7 @@ static u64 get_inode_sequence_number(struct inode *inode) /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex - * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED + * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED * @key: address where result is stored. * @rw: mapping needs to be read/write (values: FUTEX_READ, * FUTEX_WRITE) @@ -500,8 +500,8 @@ static u64 get_inode_sequence_number(struct inode *inode) * * lock_page() might sleep, the caller should not hold a spinlock. */ -static int -get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw) +static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, + enum futex_access rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -538,7 +538,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a again: /* Ignore any VERIFY_READ mapping (futex common case) */ - if (unlikely(should_fail_futex(fshared))) + if (unlikely(should_fail_futex(true))) return -EFAULT; err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); @@ -626,7 +626,7 @@ again: * A RO anonymous page will never change and thus doesn't make * sense for futex operations. */ - if (unlikely(should_fail_futex(fshared)) || ro) { + if (unlikely(should_fail_futex(true)) || ro) { err = -EFAULT; goto out; } -- cgit v1.2.3 From 9a71df495c3d29dab596bb590e73fd8b20106e2d Mon Sep 17 00:00:00 2001 From: André Almeida Date: Thu, 2 Jul 2020 17:28:42 -0300 Subject: futex: Remove unused or redundant includes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since 82af7aca ("Removal of FUTEX_FD"), some includes related to file operations aren't needed anymore. More investigation around the includes showed that a lot of includes aren't required for compilation, possible due to redundant includes. Simplify the code by removing unused includes. Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200702202843.520764-4-andrealmeid@collabora.com --- kernel/futex.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index cda91755b77d..4616d4ad609d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -32,30 +32,13 @@ * "But they come in a choice of three flavours!" */ #include -#include -#include -#include -#include #include -#include -#include -#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include #include #include -#include #include -- cgit v1.2.3 From ce3aa9cc5109363099b7c4ac82e2c9768afcaf31 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Fri, 17 Jul 2020 12:35:22 +0200 Subject: bpf, netns: Handle multiple link attachments Extend the BPF netns link callbacks to rebuild (grow/shrink) or update the prog_array at given position when link gets attached/updated/released. This let's us lift the limit of having just one link attached for the new attach type introduced by subsequent patch. No functional changes intended. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200717103536.397595-2-jakub@cloudflare.com --- include/linux/bpf.h | 3 ++ kernel/bpf/core.c | 55 ++++++++++++++++++++++++++++ kernel/bpf/net_namespace.c | 90 +++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 139 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 54ad426dbea1..c8c9eabcd106 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -928,6 +928,9 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs, void bpf_prog_array_delete_safe(struct bpf_prog_array *progs, struct bpf_prog *old_prog); +int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index); +int bpf_prog_array_update_at(struct bpf_prog_array *array, int index, + struct bpf_prog *prog); int bpf_prog_array_copy_info(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt, u32 *prog_cnt); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9df4cc9a2907..7be02e555ab9 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1958,6 +1958,61 @@ void bpf_prog_array_delete_safe(struct bpf_prog_array *array, } } +/** + * bpf_prog_array_delete_safe_at() - Replaces the program at the given + * index into the program array with + * a dummy no-op program. + * @array: a bpf_prog_array + * @index: the index of the program to replace + * + * Skips over dummy programs, by not counting them, when calculating + * the the position of the program to replace. + * + * Return: + * * 0 - Success + * * -EINVAL - Invalid index value. Must be a non-negative integer. + * * -ENOENT - Index out of range + */ +int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index) +{ + return bpf_prog_array_update_at(array, index, &dummy_bpf_prog.prog); +} + +/** + * bpf_prog_array_update_at() - Updates the program at the given index + * into the program array. + * @array: a bpf_prog_array + * @index: the index of the program to update + * @prog: the program to insert into the array + * + * Skips over dummy programs, by not counting them, when calculating + * the position of the program to update. + * + * Return: + * * 0 - Success + * * -EINVAL - Invalid index value. Must be a non-negative integer. + * * -ENOENT - Index out of range + */ +int bpf_prog_array_update_at(struct bpf_prog_array *array, int index, + struct bpf_prog *prog) +{ + struct bpf_prog_array_item *item; + + if (unlikely(index < 0)) + return -EINVAL; + + for (item = array->items; item->prog; item++) { + if (item->prog == &dummy_bpf_prog.prog) + continue; + if (!index) { + WRITE_ONCE(item->prog, prog); + return 0; + } + index--; + } + return -ENOENT; +} + int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 310241ca7991..e9c8e26ac8f2 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -36,12 +36,50 @@ static void netns_bpf_run_array_detach(struct net *net, bpf_prog_array_free(run_array); } +static int link_index(struct net *net, enum netns_bpf_attach_type type, + struct bpf_netns_link *link) +{ + struct bpf_netns_link *pos; + int i = 0; + + list_for_each_entry(pos, &net->bpf.links[type], node) { + if (pos == link) + return i; + i++; + } + return -ENOENT; +} + +static int link_count(struct net *net, enum netns_bpf_attach_type type) +{ + struct list_head *pos; + int i = 0; + + list_for_each(pos, &net->bpf.links[type]) + i++; + return i; +} + +static void fill_prog_array(struct net *net, enum netns_bpf_attach_type type, + struct bpf_prog_array *prog_array) +{ + struct bpf_netns_link *pos; + unsigned int i = 0; + + list_for_each_entry(pos, &net->bpf.links[type], node) { + prog_array->items[i].prog = pos->link.prog; + i++; + } +} + static void bpf_netns_link_release(struct bpf_link *link) { struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); enum netns_bpf_attach_type type = net_link->netns_type; + struct bpf_prog_array *old_array, *new_array; struct net *net; + int cnt, idx; mutex_lock(&netns_bpf_mutex); @@ -53,9 +91,27 @@ static void bpf_netns_link_release(struct bpf_link *link) if (!net) goto out_unlock; - netns_bpf_run_array_detach(net, type); + /* Remember link position in case of safe delete */ + idx = link_index(net, type, net_link); list_del(&net_link->node); + cnt = link_count(net, type); + if (!cnt) { + netns_bpf_run_array_detach(net, type); + goto out_unlock; + } + + old_array = rcu_dereference_protected(net->bpf.run_array[type], + lockdep_is_held(&netns_bpf_mutex)); + new_array = bpf_prog_array_alloc(cnt, GFP_KERNEL); + if (!new_array) { + WARN_ON(bpf_prog_array_delete_safe_at(old_array, idx)); + goto out_unlock; + } + fill_prog_array(net, type, new_array); + rcu_assign_pointer(net->bpf.run_array[type], new_array); + bpf_prog_array_free(old_array); + out_unlock: mutex_unlock(&netns_bpf_mutex); } @@ -77,7 +133,7 @@ static int bpf_netns_link_update_prog(struct bpf_link *link, enum netns_bpf_attach_type type = net_link->netns_type; struct bpf_prog_array *run_array; struct net *net; - int ret = 0; + int idx, ret; if (old_prog && old_prog != link->prog) return -EPERM; @@ -95,7 +151,10 @@ static int bpf_netns_link_update_prog(struct bpf_link *link, run_array = rcu_dereference_protected(net->bpf.run_array[type], lockdep_is_held(&netns_bpf_mutex)); - WRITE_ONCE(run_array->items[0].prog, new_prog); + idx = link_index(net, type, net_link); + ret = bpf_prog_array_update_at(run_array, idx, new_prog); + if (ret) + goto out_unlock; old_prog = xchg(&link->prog, new_prog); bpf_prog_put(old_prog); @@ -309,18 +368,28 @@ int netns_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) return ret; } +static int netns_bpf_max_progs(enum netns_bpf_attach_type type) +{ + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + return 1; + default: + return 0; + } +} + static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, enum netns_bpf_attach_type type) { struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); struct bpf_prog_array *run_array; - int err; + int cnt, err; mutex_lock(&netns_bpf_mutex); - /* Allow attaching only one prog or link for now */ - if (!list_empty(&net->bpf.links[type])) { + cnt = link_count(net, type); + if (cnt >= netns_bpf_max_progs(type)) { err = -E2BIG; goto out_unlock; } @@ -341,16 +410,19 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, if (err) goto out_unlock; - run_array = bpf_prog_array_alloc(1, GFP_KERNEL); + run_array = bpf_prog_array_alloc(cnt + 1, GFP_KERNEL); if (!run_array) { err = -ENOMEM; goto out_unlock; } - run_array->items[0].prog = link->prog; - rcu_assign_pointer(net->bpf.run_array[type], run_array); list_add_tail(&net_link->node, &net->bpf.links[type]); + fill_prog_array(net, type, run_array); + run_array = rcu_replace_pointer(net->bpf.run_array[type], run_array, + lockdep_is_held(&netns_bpf_mutex)); + bpf_prog_array_free(run_array); + out_unlock: mutex_unlock(&netns_bpf_mutex); return err; -- cgit v1.2.3 From e9ddbb7707ff5891616240026062b8c1e29864ca Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Fri, 17 Jul 2020 12:35:23 +0200 Subject: bpf: Introduce SK_LOOKUP program type with a dedicated attach point Add a new program type BPF_PROG_TYPE_SK_LOOKUP with a dedicated attach type BPF_SK_LOOKUP. The new program kind is to be invoked by the transport layer when looking up a listening socket for a new connection request for connection oriented protocols, or when looking up an unconnected socket for a packet for connection-less protocols. When called, SK_LOOKUP BPF program can select a socket that will receive the packet. This serves as a mechanism to overcome the limits of what bind() API allows to express. Two use-cases driving this work are: (1) steer packets destined to an IP range, on fixed port to a socket 192.0.2.0/24, port 80 -> NGINX socket (2) steer packets destined to an IP address, on any port to a socket 198.51.100.1, any port -> L7 proxy socket In its run-time context program receives information about the packet that triggered the socket lookup. Namely IP version, L4 protocol identifier, and address 4-tuple. Context can be further extended to include ingress interface identifier. To select a socket BPF program fetches it from a map holding socket references, like SOCKMAP or SOCKHASH, and calls bpf_sk_assign(ctx, sk, ...) helper to record the selection. Transport layer then uses the selected socket as a result of socket lookup. In its basic form, SK_LOOKUP acts as a filter and hence must return either SK_PASS or SK_DROP. If the program returns with SK_PASS, transport should look for a socket to receive the packet, or use the one selected by the program if available, while SK_DROP informs the transport layer that the lookup should fail. This patch only enables the user to attach an SK_LOOKUP program to a network namespace. Subsequent patches hook it up to run on local delivery path in ipv4 and ipv6 stacks. Suggested-by: Marek Majkowski Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200717103536.397595-3-jakub@cloudflare.com --- include/linux/bpf-netns.h | 3 + include/linux/bpf.h | 1 + include/linux/bpf_types.h | 2 + include/linux/filter.h | 17 +++++ include/uapi/linux/bpf.h | 77 +++++++++++++++++++ kernel/bpf/net_namespace.c | 5 ++ kernel/bpf/syscall.c | 9 +++ kernel/bpf/verifier.c | 13 +++- net/core/filter.c | 180 +++++++++++++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 9 ++- 10 files changed, 312 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf-netns.h b/include/linux/bpf-netns.h index 47d5b0c708c9..722f799c1a2e 100644 --- a/include/linux/bpf-netns.h +++ b/include/linux/bpf-netns.h @@ -8,6 +8,7 @@ enum netns_bpf_attach_type { NETNS_BPF_INVALID = -1, NETNS_BPF_FLOW_DISSECTOR = 0, + NETNS_BPF_SK_LOOKUP, MAX_NETNS_BPF_ATTACH_TYPE }; @@ -17,6 +18,8 @@ to_netns_bpf_attach_type(enum bpf_attach_type attach_type) switch (attach_type) { case BPF_FLOW_DISSECTOR: return NETNS_BPF_FLOW_DISSECTOR; + case BPF_SK_LOOKUP: + return NETNS_BPF_SK_LOOKUP; default: return NETNS_BPF_INVALID; } diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c8c9eabcd106..adb16bdc5f0a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -249,6 +249,7 @@ enum bpf_arg_type { ARG_PTR_TO_INT, /* pointer to int */ ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ + ARG_PTR_TO_SOCKET_OR_NULL, /* pointer to bpf_sock (fullsock) or NULL */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index a18ae82a298a..a52a5688418e 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -64,6 +64,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2, #ifdef CONFIG_INET BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport, struct sk_reuseport_md, struct sk_reuseport_kern) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_LOOKUP, sk_lookup, + struct bpf_sk_lookup, struct bpf_sk_lookup_kern) #endif #if defined(CONFIG_BPF_JIT) BPF_PROG_TYPE(BPF_PROG_TYPE_STRUCT_OPS, bpf_struct_ops, diff --git a/include/linux/filter.h b/include/linux/filter.h index 0b0144752d78..fa1ea12ad2cd 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1278,4 +1278,21 @@ struct bpf_sockopt_kern { s32 retval; }; +struct bpf_sk_lookup_kern { + u16 family; + u16 protocol; + struct { + __be32 saddr; + __be32 daddr; + } v4; + struct { + const struct in6_addr *saddr; + const struct in6_addr *daddr; + } v6; + __be16 sport; + u16 dport; + struct sock *selected_sk; + bool no_reuseport; +}; + #endif /* __LINUX_FILTER_H__ */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7ac3992dacfe..54d0c886e3ba 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -189,6 +189,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_STRUCT_OPS, BPF_PROG_TYPE_EXT, BPF_PROG_TYPE_LSM, + BPF_PROG_TYPE_SK_LOOKUP, }; enum bpf_attach_type { @@ -228,6 +229,7 @@ enum bpf_attach_type { BPF_XDP_DEVMAP, BPF_CGROUP_INET_SOCK_RELEASE, BPF_XDP_CPUMAP, + BPF_SK_LOOKUP, __MAX_BPF_ATTACH_TYPE }; @@ -3069,6 +3071,10 @@ union bpf_attr { * * long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) * Description + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SCHED_CLS** and + * **BPF_PROG_TYPE_SCHED_ACT** programs. + * * Assign the *sk* to the *skb*. When combined with appropriate * routing configuration to receive the packet towards the socket, * will cause *skb* to be delivered to the specified socket. @@ -3094,6 +3100,56 @@ union bpf_attr { * **-ESOCKTNOSUPPORT** if the socket type is not supported * (reuseport). * + * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags) + * Description + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SK_LOOKUP** programs. + * + * Select the *sk* as a result of a socket lookup. + * + * For the operation to succeed passed socket must be compatible + * with the packet description provided by the *ctx* object. + * + * L4 protocol (**IPPROTO_TCP** or **IPPROTO_UDP**) must + * be an exact match. While IP family (**AF_INET** or + * **AF_INET6**) must be compatible, that is IPv6 sockets + * that are not v6-only can be selected for IPv4 packets. + * + * Only TCP listeners and UDP unconnected sockets can be + * selected. *sk* can also be NULL to reset any previous + * selection. + * + * *flags* argument can combination of following values: + * + * * **BPF_SK_LOOKUP_F_REPLACE** to override the previous + * socket selection, potentially done by a BPF program + * that ran before us. + * + * * **BPF_SK_LOOKUP_F_NO_REUSEPORT** to skip + * load-balancing within reuseport group for the socket + * being selected. + * + * On success *ctx->sk* will point to the selected socket. + * + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EAFNOSUPPORT** if socket family (*sk->family*) is + * not compatible with packet family (*ctx->family*). + * + * * **-EEXIST** if socket has been already selected, + * potentially by another program, and + * **BPF_SK_LOOKUP_F_REPLACE** flag was not specified. + * + * * **-EINVAL** if unsupported flags were specified. + * + * * **-EPROTOTYPE** if socket L4 protocol + * (*sk->protocol*) doesn't match packet protocol + * (*ctx->protocol*). + * + * * **-ESOCKTNOSUPPORT** if socket is not in allowed + * state (TCP listening or UDP unconnected). + * * u64 bpf_ktime_get_boot_ns(void) * Description * Return the time elapsed since system boot, in nanoseconds. @@ -3607,6 +3663,12 @@ enum { BPF_RINGBUF_HDR_SZ = 8, }; +/* BPF_FUNC_sk_assign flags in bpf_sk_lookup context. */ +enum { + BPF_SK_LOOKUP_F_REPLACE = (1ULL << 0), + BPF_SK_LOOKUP_F_NO_REUSEPORT = (1ULL << 1), +}; + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, @@ -4349,4 +4411,19 @@ struct bpf_pidns_info { __u32 pid; __u32 tgid; }; + +/* User accessible data for SK_LOOKUP programs. Add new fields at the end. */ +struct bpf_sk_lookup { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + + __u32 family; /* Protocol family (AF_INET, AF_INET6) */ + __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ + __u32 remote_ip4; /* Network byte order */ + __u32 remote_ip6[4]; /* Network byte order */ + __u32 remote_port; /* Network byte order */ + __u32 local_ip4; /* Network byte order */ + __u32 local_ip6[4]; /* Network byte order */ + __u32 local_port; /* Host byte order */ +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index e9c8e26ac8f2..38b368bccda2 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -373,6 +373,8 @@ static int netns_bpf_max_progs(enum netns_bpf_attach_type type) switch (type) { case NETNS_BPF_FLOW_DISSECTOR: return 1; + case NETNS_BPF_SK_LOOKUP: + return 64; default: return 0; } @@ -403,6 +405,9 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, case NETNS_BPF_FLOW_DISSECTOR: err = flow_dissector_bpf_prog_attach_check(net, link->prog); break; + case NETNS_BPF_SK_LOOKUP: + err = 0; /* nothing to check */ + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7ea9dfbebd8c..d07417d17712 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2022,6 +2022,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_SK_LOOKUP: + if (expected_attach_type == BPF_SK_LOOKUP) + return 0; + return -EINVAL; case BPF_PROG_TYPE_EXT: if (expected_attach_type) return -EINVAL; @@ -2756,6 +2760,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_SK_LOOKUP: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: if (!capable(CAP_NET_ADMIN)) @@ -2817,6 +2822,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_CGROUP_SOCKOPT; case BPF_TRACE_ITER: return BPF_PROG_TYPE_TRACING; + case BPF_SK_LOOKUP: + return BPF_PROG_TYPE_SK_LOOKUP; default: return BPF_PROG_TYPE_UNSPEC; } @@ -2953,6 +2960,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); case BPF_FLOW_DISSECTOR: + case BPF_SK_LOOKUP: return netns_bpf_prog_query(attr, uattr); default: return -EINVAL; @@ -3891,6 +3899,7 @@ static int link_create(union bpf_attr *attr) ret = tracing_bpf_link_attach(attr, prog); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; default: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3c1efc9d08fd..9a6703bc3f36 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3878,10 +3878,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, } meta->ref_obj_id = reg->ref_obj_id; } - } else if (arg_type == ARG_PTR_TO_SOCKET) { + } else if (arg_type == ARG_PTR_TO_SOCKET || + arg_type == ARG_PTR_TO_SOCKET_OR_NULL) { expected_type = PTR_TO_SOCKET; - if (type != expected_type) - goto err_type; + if (!(register_is_null(reg) && + arg_type == ARG_PTR_TO_SOCKET_OR_NULL)) { + if (type != expected_type) + goto err_type; + } } else if (arg_type == ARG_PTR_TO_BTF_ID) { expected_type = PTR_TO_BTF_ID; if (type != expected_type) @@ -7354,6 +7358,9 @@ static int check_return_code(struct bpf_verifier_env *env) return -ENOTSUPP; } break; + case BPF_PROG_TYPE_SK_LOOKUP: + range = tnum_range(SK_DROP, SK_PASS); + break; case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value * depends on the to-be-replaced kernel func or bpf program. diff --git a/net/core/filter.c b/net/core/filter.c index bdd2382e655d..d099436b3ff5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9229,6 +9229,186 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = { const struct bpf_prog_ops sk_reuseport_prog_ops = { }; + +BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx, + struct sock *, sk, u64, flags) +{ + if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE | + BPF_SK_LOOKUP_F_NO_REUSEPORT))) + return -EINVAL; + if (unlikely(sk && sk_is_refcounted(sk))) + return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */ + if (unlikely(sk && sk->sk_state == TCP_ESTABLISHED)) + return -ESOCKTNOSUPPORT; /* reject connected sockets */ + + /* Check if socket is suitable for packet L3/L4 protocol */ + if (sk && sk->sk_protocol != ctx->protocol) + return -EPROTOTYPE; + if (sk && sk->sk_family != ctx->family && + (sk->sk_family == AF_INET || ipv6_only_sock(sk))) + return -EAFNOSUPPORT; + + if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE)) + return -EEXIST; + + /* Select socket as lookup result */ + ctx->selected_sk = sk; + ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT; + return 0; +} + +static const struct bpf_func_proto bpf_sk_lookup_assign_proto = { + .func = bpf_sk_lookup_assign, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_SOCKET_OR_NULL, + .arg3_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; + case BPF_FUNC_sk_assign: + return &bpf_sk_lookup_assign_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static bool sk_lookup_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(struct bpf_sk_lookup)) + return false; + if (off % size != 0) + return false; + if (type != BPF_READ) + return false; + + switch (off) { + case offsetof(struct bpf_sk_lookup, sk): + info->reg_type = PTR_TO_SOCKET_OR_NULL; + return size == sizeof(__u64); + + case bpf_ctx_range(struct bpf_sk_lookup, family): + case bpf_ctx_range(struct bpf_sk_lookup, protocol): + case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4): + case bpf_ctx_range(struct bpf_sk_lookup, local_ip4): + case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): + case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): + case bpf_ctx_range(struct bpf_sk_lookup, remote_port): + case bpf_ctx_range(struct bpf_sk_lookup, local_port): + bpf_ctx_record_field_size(info, sizeof(__u32)); + return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32)); + + default: + return false; + } +} + +static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sk_lookup, sk): + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, selected_sk)); + break; + + case offsetof(struct bpf_sk_lookup, family): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + family, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, protocol): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + protocol, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, remote_ip4): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + v4.saddr, 4, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, local_ip4): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + v4.daddr, 4, target_size)); + break; + + case bpf_ctx_range_till(struct bpf_sk_lookup, + remote_ip6[0], remote_ip6[3]): { +#if IS_ENABLED(CONFIG_IPV6) + int off = si->off; + + off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]); + off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, v6.saddr)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + } + case bpf_ctx_range_till(struct bpf_sk_lookup, + local_ip6[0], local_ip6[3]): { +#if IS_ENABLED(CONFIG_IPV6) + int off = si->off; + + off -= offsetof(struct bpf_sk_lookup, local_ip6[0]); + off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, v6.daddr)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + } + case offsetof(struct bpf_sk_lookup, remote_port): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + sport, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, local_port): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + dport, 2, target_size)); + break; + } + + return insn - insn_buf; +} + +const struct bpf_prog_ops sk_lookup_prog_ops = { +}; + +const struct bpf_verifier_ops sk_lookup_verifier_ops = { + .get_func_proto = sk_lookup_func_proto, + .is_valid_access = sk_lookup_is_valid_access, + .convert_ctx_access = sk_lookup_convert_ctx_access, +}; + #endif /* CONFIG_INET */ DEFINE_BPF_DISPATCHER(xdp) diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 6843376733df..5bfa448b4704 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -404,6 +404,7 @@ class PrinterHelpers(Printer): type_fwds = [ 'struct bpf_fib_lookup', + 'struct bpf_sk_lookup', 'struct bpf_perf_event_data', 'struct bpf_perf_event_value', 'struct bpf_pidns_info', @@ -450,6 +451,7 @@ class PrinterHelpers(Printer): 'struct bpf_perf_event_data', 'struct bpf_perf_event_value', 'struct bpf_pidns_info', + 'struct bpf_sk_lookup', 'struct bpf_sock', 'struct bpf_sock_addr', 'struct bpf_sock_ops', @@ -487,6 +489,11 @@ class PrinterHelpers(Printer): 'struct sk_msg_buff': 'struct sk_msg_md', 'struct xdp_buff': 'struct xdp_md', } + # Helpers overloaded for different context types. + overloaded_helpers = [ + 'bpf_get_socket_cookie', + 'bpf_sk_assign', + ] def print_header(self): header = '''\ @@ -543,7 +550,7 @@ class PrinterHelpers(Printer): for i, a in enumerate(proto['args']): t = a['type'] n = a['name'] - if proto['name'] == 'bpf_get_socket_cookie' and i == 0: + if proto['name'] in self.overloaded_helpers and i == 0: t = 'void' n = 'ctx' one_arg = '{}{}'.format(comma, self.map_type(t)) -- cgit v1.2.3 From 1559b4aa1db443096af493c7d621dc156054babe Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Fri, 17 Jul 2020 12:35:25 +0200 Subject: inet: Run SK_LOOKUP BPF program on socket lookup Run a BPF program before looking up a listening socket on the receive path. Program selects a listening socket to yield as result of socket lookup by calling bpf_sk_assign() helper and returning SK_PASS code. Program can revert its decision by assigning a NULL socket with bpf_sk_assign(). Alternatively, BPF program can also fail the lookup by returning with SK_DROP, or let the lookup continue as usual with SK_PASS on return, when no socket has been selected with bpf_sk_assign(). This lets the user match packets with listening sockets freely at the last possible point on the receive path, where we know that packets are destined for local delivery after undergoing policing, filtering, and routing. With BPF code selecting the socket, directing packets destined to an IP range or to a port range to a single socket becomes possible. In case multiple programs are attached, they are run in series in the order in which they were attached. The end result is determined from return codes of all the programs according to following rules: 1. If any program returned SK_PASS and selected a valid socket, the socket is used as result of socket lookup. 2. If more than one program returned SK_PASS and selected a socket, last selection takes effect. 3. If any program returned SK_DROP, and no program returned SK_PASS and selected a socket, socket lookup fails with -ECONNREFUSED. 4. If all programs returned SK_PASS and none of them selected a socket, socket lookup continues to htable-based lookup. Suggested-by: Marek Majkowski Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200717103536.397595-5-jakub@cloudflare.com --- include/linux/filter.h | 91 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/net_namespace.c | 32 +++++++++++++++- net/core/filter.c | 3 ++ net/ipv4/inet_hashtables.c | 31 ++++++++++++++++ 4 files changed, 156 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index fa1ea12ad2cd..c4f54c216347 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1295,4 +1295,95 @@ struct bpf_sk_lookup_kern { bool no_reuseport; }; +extern struct static_key_false bpf_sk_lookup_enabled; + +/* Runners for BPF_SK_LOOKUP programs to invoke on socket lookup. + * + * Allowed return values for a BPF SK_LOOKUP program are SK_PASS and + * SK_DROP. Their meaning is as follows: + * + * SK_PASS && ctx.selected_sk != NULL: use selected_sk as lookup result + * SK_PASS && ctx.selected_sk == NULL: continue to htable-based socket lookup + * SK_DROP : terminate lookup with -ECONNREFUSED + * + * This macro aggregates return values and selected sockets from + * multiple BPF programs according to following rules in order: + * + * 1. If any program returned SK_PASS and a non-NULL ctx.selected_sk, + * macro result is SK_PASS and last ctx.selected_sk is used. + * 2. If any program returned SK_DROP return value, + * macro result is SK_DROP. + * 3. Otherwise result is SK_PASS and ctx.selected_sk is NULL. + * + * Caller must ensure that the prog array is non-NULL, and that the + * array as well as the programs it contains remain valid. + */ +#define BPF_PROG_SK_LOOKUP_RUN_ARRAY(array, ctx, func) \ + ({ \ + struct bpf_sk_lookup_kern *_ctx = &(ctx); \ + struct bpf_prog_array_item *_item; \ + struct sock *_selected_sk = NULL; \ + bool _no_reuseport = false; \ + struct bpf_prog *_prog; \ + bool _all_pass = true; \ + u32 _ret; \ + \ + migrate_disable(); \ + _item = &(array)->items[0]; \ + while ((_prog = READ_ONCE(_item->prog))) { \ + /* restore most recent selection */ \ + _ctx->selected_sk = _selected_sk; \ + _ctx->no_reuseport = _no_reuseport; \ + \ + _ret = func(_prog, _ctx); \ + if (_ret == SK_PASS && _ctx->selected_sk) { \ + /* remember last non-NULL socket */ \ + _selected_sk = _ctx->selected_sk; \ + _no_reuseport = _ctx->no_reuseport; \ + } else if (_ret == SK_DROP && _all_pass) { \ + _all_pass = false; \ + } \ + _item++; \ + } \ + _ctx->selected_sk = _selected_sk; \ + _ctx->no_reuseport = _no_reuseport; \ + migrate_enable(); \ + _all_pass || _selected_sk ? SK_PASS : SK_DROP; \ + }) + +static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol, + const __be32 saddr, const __be16 sport, + const __be32 daddr, const u16 dport, + struct sock **psk) +{ + struct bpf_prog_array *run_array; + struct sock *selected_sk = NULL; + bool no_reuseport = false; + + rcu_read_lock(); + run_array = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]); + if (run_array) { + struct bpf_sk_lookup_kern ctx = { + .family = AF_INET, + .protocol = protocol, + .v4.saddr = saddr, + .v4.daddr = daddr, + .sport = sport, + .dport = dport, + }; + u32 act; + + act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, BPF_PROG_RUN); + if (act == SK_PASS) { + selected_sk = ctx.selected_sk; + no_reuseport = ctx.no_reuseport; + } else { + selected_sk = ERR_PTR(-ECONNREFUSED); + } + } + rcu_read_unlock(); + *psk = selected_sk; + return no_reuseport; +} + #endif /* __LINUX_FILTER_H__ */ diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 38b368bccda2..4e1bcaa2c3cb 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -25,6 +25,28 @@ struct bpf_netns_link { /* Protects updates to netns_bpf */ DEFINE_MUTEX(netns_bpf_mutex); +static void netns_bpf_attach_type_unneed(enum netns_bpf_attach_type type) +{ + switch (type) { + case NETNS_BPF_SK_LOOKUP: + static_branch_dec(&bpf_sk_lookup_enabled); + break; + default: + break; + } +} + +static void netns_bpf_attach_type_need(enum netns_bpf_attach_type type) +{ + switch (type) { + case NETNS_BPF_SK_LOOKUP: + static_branch_inc(&bpf_sk_lookup_enabled); + break; + default: + break; + } +} + /* Must be called with netns_bpf_mutex held. */ static void netns_bpf_run_array_detach(struct net *net, enum netns_bpf_attach_type type) @@ -91,6 +113,9 @@ static void bpf_netns_link_release(struct bpf_link *link) if (!net) goto out_unlock; + /* Mark attach point as unused */ + netns_bpf_attach_type_unneed(type); + /* Remember link position in case of safe delete */ idx = link_index(net, type, net_link); list_del(&net_link->node); @@ -428,6 +453,9 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, lockdep_is_held(&netns_bpf_mutex)); bpf_prog_array_free(run_array); + /* Mark attach point as used */ + netns_bpf_attach_type_need(type); + out_unlock: mutex_unlock(&netns_bpf_mutex); return err; @@ -503,8 +531,10 @@ static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) mutex_lock(&netns_bpf_mutex); for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) { netns_bpf_run_array_detach(net, type); - list_for_each_entry(net_link, &net->bpf.links[type], node) + list_for_each_entry(net_link, &net->bpf.links[type], node) { net_link->net = NULL; /* auto-detach link */ + netns_bpf_attach_type_unneed(type); + } if (net->bpf.progs[type]) bpf_prog_put(net->bpf.progs[type]); } diff --git a/net/core/filter.c b/net/core/filter.c index d099436b3ff5..2bd129b5ae74 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9230,6 +9230,9 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = { const struct bpf_prog_ops sk_reuseport_prog_ops = { }; +DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled); +EXPORT_SYMBOL(bpf_sk_lookup_enabled); + BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx, struct sock *, sk, u64, flags) { diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index ab64834837c8..4eb4cd8d20dd 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -299,6 +299,29 @@ static struct sock *inet_lhash2_lookup(struct net *net, return result; } +static inline struct sock *inet_lookup_run_bpf(struct net *net, + struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, + __be32 saddr, __be16 sport, + __be32 daddr, u16 hnum) +{ + struct sock *sk, *reuse_sk; + bool no_reuseport; + + if (hashinfo != &tcp_hashinfo) + return NULL; /* only TCP is supported */ + + no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP, + saddr, sport, daddr, hnum, &sk); + if (no_reuseport || IS_ERR_OR_NULL(sk)) + return sk; + + reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); + if (reuse_sk) + sk = reuse_sk; + return sk; +} + struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -310,6 +333,14 @@ struct sock *__inet_lookup_listener(struct net *net, struct sock *result = NULL; unsigned int hash2; + /* Lookup redirect from BPF */ + if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { + result = inet_lookup_run_bpf(net, hashinfo, skb, doff, + saddr, sport, daddr, hnum); + if (result) + goto done; + } + hash2 = ipv4_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); -- cgit v1.2.3 From 2f9237d4f6df49b74c51cdac555b0a9979d0c237 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Jul 2020 09:30:00 +0200 Subject: dma-mapping: make support for dma ops optional Avoid the overhead of the dma ops support for tiny builds that only use the direct mapping. Signed-off-by: Christoph Hellwig Tested-by: Alexey Kardashevskiy Reviewed-by: Alexey Kardashevskiy --- arch/alpha/Kconfig | 1 + arch/arm/Kconfig | 1 + arch/ia64/Kconfig | 1 + arch/mips/Kconfig | 1 + arch/parisc/Kconfig | 1 + arch/powerpc/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/sparc/Kconfig | 1 + arch/x86/Kconfig | 1 + drivers/infiniband/core/device.c | 6 +++++- drivers/iommu/Kconfig | 2 ++ drivers/macintosh/macio_asic.c | 4 ++-- drivers/misc/mic/Kconfig | 4 ++++ drivers/vdpa/Kconfig | 1 + drivers/xen/Kconfig | 1 + include/linux/device.h | 3 ++- include/linux/dma-mapping.h | 12 +++++++++++- kernel/dma/Kconfig | 4 ++++ kernel/dma/Makefile | 3 ++- 19 files changed, 43 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 10862c5a8c76..9c5f06e8eb9b 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -7,6 +7,7 @@ config ALPHA select ARCH_NO_PREEMPT select ARCH_NO_SG_CHAIN select ARCH_USE_CMPXCHG_LOCKREF + select DMA_OPS if PCI select FORCE_PCI if !ALPHA_JENSEN select PCI_DOMAINS if PCI select PCI_SYSCALL if PCI diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 2ac74904a3ce..bee35b0187e4 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -41,6 +41,7 @@ config ARM select CPU_PM if SUSPEND || CPU_IDLE select DCACHE_WORD_ACCESS if HAVE_EFFICIENT_UNALIGNED_ACCESS select DMA_DECLARE_COHERENT + select DMA_OPS select DMA_REMAP if MMU select EDAC_SUPPORT select EDAC_ATOMIC_SCRUB diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 1fa2fe2ef053..5b4ec80bf586 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -192,6 +192,7 @@ config IA64_SGI_UV config IA64_HP_SBA_IOMMU bool "HP SBA IOMMU support" + select DMA_OPS default y help Say Y here to add support for the SBA IOMMU found on HP zx1 and diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 6fee1a133e9d..8a458105e445 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -367,6 +367,7 @@ config MACH_JAZZ select ARC_PROMLIB select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO + select DMA_OPS select FW_ARC select FW_ARC32 select ARCH_MAY_HAVE_PC_FDC diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 8e4c3708773d..38c1eafc1f1a 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -14,6 +14,7 @@ config PARISC select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_NO_SG_CHAIN select ARCH_SUPPORTS_MEMORY_FAILURE + select DMA_OPS select RTC_CLASS select RTC_DRV_GENERIC select INIT_ALL_POSSIBLE diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 9fa23eb320ff..e9b091d35872 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -151,6 +151,7 @@ config PPC select BUILDTIME_TABLE_SORT select CLONE_BACKWARDS select DCACHE_WORD_ACCESS if PPC64 && CPU_LITTLE_ENDIAN + select DMA_OPS if PPC64 select DYNAMIC_FTRACE if FUNCTION_TRACER select EDAC_ATOMIC_SCRUB select EDAC_SUPPORT diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index c7d7ede6300c..687fe23f61cc 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -113,6 +113,7 @@ config S390 select ARCH_WANT_IPC_PARSE_VERSION select BUILDTIME_TABLE_SORT select CLONE_BACKWARDS2 + select DMA_OPS if PCI select DYNAMIC_FTRACE if FUNCTION_TRACER select GENERIC_CLOCKEVENTS select GENERIC_CPU_AUTOPROBE diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 5bf2dc163540..5db1faaaee31 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -15,6 +15,7 @@ config SPARC default y select ARCH_MIGHT_HAVE_PC_PARPORT if SPARC64 && PCI select ARCH_MIGHT_HAVE_PC_SERIO + select DMA_OPS select OF select OF_PROMTREE select HAVE_ASM_MODVERSIONS diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 883da0abf779..96ab92754158 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -909,6 +909,7 @@ config DMI config GART_IOMMU bool "Old AMD GART IOMMU support" + select DMA_OPS select IOMMU_HELPER select SWIOTLB depends on X86_64 && PCI && AMD_NB diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 905a2beaf885..2927a9d16eaa 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1183,6 +1183,8 @@ static void setup_dma_device(struct ib_device *device) struct device *parent = device->dev.parent; WARN_ON_ONCE(device->dma_device); + +#ifdef CONFIG_DMA_OPS if (device->dev.dma_ops) { /* * The caller provided custom DMA operations. Copy the @@ -1203,7 +1205,9 @@ static void setup_dma_device(struct ib_device *device) else WARN_ON_ONCE(true); } - } else { + } else +#endif /* CONFIG_DMA_OPS */ + { /* * The caller did not provide custom DMA operations. Use the * DMA mapping operations of the parent device. diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index b0f308cb7f7c..b622af72448f 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -97,6 +97,7 @@ config OF_IOMMU # IOMMU-agnostic DMA-mapping layer config IOMMU_DMA bool + select DMA_OPS select IOMMU_API select IOMMU_IOVA select IRQ_MSI_IOMMU @@ -183,6 +184,7 @@ config DMAR_TABLE config INTEL_IOMMU bool "Support for Intel IOMMU using DMA Remapping Devices" depends on PCI_MSI && ACPI && (X86 || IA64) + select DMA_OPS select IOMMU_API select IOMMU_IOVA select NEED_DMA_MAP_STATE diff --git a/drivers/macintosh/macio_asic.c b/drivers/macintosh/macio_asic.c index 92d142d2b75f..49af60bdac92 100644 --- a/drivers/macintosh/macio_asic.c +++ b/drivers/macintosh/macio_asic.c @@ -382,7 +382,7 @@ static struct macio_dev * macio_add_one_device(struct macio_chip *chip, dma_set_max_seg_size(&dev->ofdev.dev, 65536); dma_set_seg_boundary(&dev->ofdev.dev, 0xffffffff); -#ifdef CONFIG_PCI +#if defined(CONFIG_PCI) && defined(CONFIG_DMA_OPS) /* Set the DMA ops to the ones from the PCI device, this could be * fishy if we didn't know that on PowerMac it's always direct ops * or iommu ops that will work fine @@ -391,7 +391,7 @@ static struct macio_dev * macio_add_one_device(struct macio_chip *chip, */ dev->ofdev.dev.archdata = chip->lbus.pdev->dev.archdata; dev->ofdev.dev.dma_ops = chip->lbus.pdev->dev.dma_ops; -#endif /* CONFIG_PCI */ +#endif /* CONFIG_PCI && CONFIG_DMA_OPS */ #ifdef DEBUG printk("preparing mdev @%p, ofdev @%p, dev @%p, kobj @%p\n", diff --git a/drivers/misc/mic/Kconfig b/drivers/misc/mic/Kconfig index 8f201d019f5a..b9bb086785db 100644 --- a/drivers/misc/mic/Kconfig +++ b/drivers/misc/mic/Kconfig @@ -4,6 +4,7 @@ menu "Intel MIC & related support" config INTEL_MIC_BUS tristate "Intel MIC Bus Driver" depends on 64BIT && PCI && X86 + select DMA_OPS help This option is selected by any driver which registers a device or driver on the MIC Bus, such as CONFIG_INTEL_MIC_HOST, @@ -19,6 +20,7 @@ config INTEL_MIC_BUS config SCIF_BUS tristate "SCIF Bus Driver" depends on 64BIT && PCI && X86 + select DMA_OPS help This option is selected by any driver which registers a device or driver on the SCIF Bus, such as CONFIG_INTEL_MIC_HOST @@ -33,6 +35,7 @@ config SCIF_BUS config VOP_BUS tristate "VOP Bus Driver" + select DMA_OPS help This option is selected by any driver which registers a device or driver on the VOP Bus, such as CONFIG_INTEL_MIC_HOST @@ -49,6 +52,7 @@ config INTEL_MIC_HOST tristate "Intel MIC Host Driver" depends on 64BIT && PCI && X86 depends on INTEL_MIC_BUS && SCIF_BUS && MIC_COSM && VOP_BUS + select DMA_OPS help This enables Host Driver support for the Intel Many Integrated Core (MIC) family of PCIe form factor coprocessor devices that diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig index 3e1ceb8e9f2b..d93a69b12f81 100644 --- a/drivers/vdpa/Kconfig +++ b/drivers/vdpa/Kconfig @@ -11,6 +11,7 @@ if VDPA config VDPA_SIM tristate "vDPA device simulator" depends on RUNTIME_TESTING_MENU && HAS_DMA + select DMA_OPS select VHOST_RING default n help diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 727f11eb46b2..1d339ef92422 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -179,6 +179,7 @@ config XEN_GRANT_DMA_ALLOC config SWIOTLB_XEN def_bool y + select DMA_OPS select SWIOTLB config XEN_PCIDEV_BACKEND diff --git a/include/linux/device.h b/include/linux/device.h index 15460a5ac024..4c4af98321eb 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -568,8 +568,9 @@ struct device { #ifdef CONFIG_GENERIC_MSI_IRQ struct list_head msi_list; #endif - +#ifdef CONFIG_DMA_OPS const struct dma_map_ops *dma_ops; +#endif u64 *dma_mask; /* dma mask (if dma'able device) */ u64 coherent_dma_mask;/* Like dma_mask, but for alloc_coherent mappings as diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index bd0a6f5ee445..39da883c8619 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -191,6 +191,7 @@ static inline int dma_mmap_from_global_coherent(struct vm_area_struct *vma, #ifdef CONFIG_HAS_DMA #include +#ifdef CONFIG_DMA_OPS static inline const struct dma_map_ops *get_dma_ops(struct device *dev) { if (dev->dma_ops) @@ -203,7 +204,16 @@ static inline void set_dma_ops(struct device *dev, { dev->dma_ops = dma_ops; } - +#else /* CONFIG_DMA_OPS */ +static inline const struct dma_map_ops *get_dma_ops(struct device *dev) +{ + return NULL; +} +static inline void set_dma_ops(struct device *dev, + const struct dma_map_ops *dma_ops) +{ +} +#endif /* CONFIG_DMA_OPS */ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 1da3f44f2565..5cfb2428593a 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -5,6 +5,9 @@ config HAS_DMA depends on !NO_DMA default y +config DMA_OPS + bool + config NEED_SG_DMA_LENGTH bool @@ -60,6 +63,7 @@ config DMA_NONCOHERENT_CACHE_SYNC config DMA_VIRT_OPS bool depends on HAS_DMA + select DMA_OPS config SWIOTLB bool diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index 370f63344e9c..32c7c1942bbd 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_HAS_DMA) += mapping.o direct.o dummy.o +obj-$(CONFIG_HAS_DMA) += mapping.o direct.o +obj-$(CONFIG_DMA_OPS) += dummy.o obj-$(CONFIG_DMA_CMA) += contiguous.o obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o obj-$(CONFIG_DMA_VIRT_OPS) += virt.o -- cgit v1.2.3 From d35834c64820c7ef397f8a244061d4450720540e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Mar 2020 18:19:30 +0100 Subject: dma-mapping: add a dma_ops_bypass flag to struct device Several IOMMU drivers have a bypass mode where they can use a direct mapping if the devices DMA mask is large enough. Add generic support to the core dma-mapping code to do that to switch those drivers to a common solution. Signed-off-by: Christoph Hellwig Tested-by: Alexey Kardashevskiy Reviewed-by: Alexey Kardashevskiy --- include/linux/device.h | 8 ++++++ kernel/dma/Kconfig | 8 ++++++ kernel/dma/mapping.c | 74 +++++++++++++++++++++++++++++++++++--------------- 3 files changed, 68 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/device.h b/include/linux/device.h index 4c4af98321eb..1f71acf37f78 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -523,6 +523,11 @@ struct dev_links_info { * sync_state() callback. * @dma_coherent: this particular device is dma coherent, even if the * architecture supports non-coherent devices. + * @dma_ops_bypass: If set to %true then the dma_ops are bypassed for the + * streaming DMA operations (->map_* / ->unmap_* / ->sync_*), + * and optionall (if the coherent mask is large enough) also + * for dma allocations. This flag is managed by the dma ops + * instance from ->dma_supported. * * At the lowest level, every device in a Linux system is represented by an * instance of struct device. The device structure contains the information @@ -623,6 +628,9 @@ struct device { defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) bool dma_coherent:1; #endif +#ifdef CONFIG_DMA_OPS_BYPASS + bool dma_ops_bypass : 1; +#endif }; static inline struct device *kobj_to_dev(struct kobject *kobj) diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 5cfb2428593a..f4770fcfa62b 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -8,6 +8,14 @@ config HAS_DMA config DMA_OPS bool +# +# IOMMU drivers that can bypass the IOMMU code and optionally use the direct +# mapping fast path should select this option and set the dma_ops_bypass +# flag in struct device where applicable +# +config DMA_OPS_BYPASS + bool + config NEED_SG_DMA_LENGTH bool diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index b53953024512..0d129421e75f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -105,9 +105,35 @@ void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, } EXPORT_SYMBOL(dmam_alloc_attrs); -static inline bool dma_is_direct(const struct dma_map_ops *ops) +static bool dma_go_direct(struct device *dev, dma_addr_t mask, + const struct dma_map_ops *ops) { - return likely(!ops); + if (likely(!ops)) + return true; +#ifdef CONFIG_DMA_OPS_BYPASS + if (dev->dma_ops_bypass) + return min_not_zero(mask, dev->bus_dma_limit) >= + dma_direct_get_required_mask(dev); +#endif + return false; +} + + +/* + * Check if the devices uses a direct mapping for streaming DMA operations. + * This allows IOMMU drivers to set a bypass mode if the DMA mask is large + * enough. + */ +static inline bool dma_alloc_direct(struct device *dev, + const struct dma_map_ops *ops) +{ + return dma_go_direct(dev, dev->coherent_dma_mask, ops); +} + +static inline bool dma_map_direct(struct device *dev, + const struct dma_map_ops *ops) +{ + return dma_go_direct(dev, *dev->dma_mask, ops); } dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, @@ -118,7 +144,7 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, dma_addr_t addr; BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); else addr = ops->map_page(dev, page, offset, size, dir, attrs); @@ -134,7 +160,7 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) dma_direct_unmap_page(dev, addr, size, dir, attrs); else if (ops->unmap_page) ops->unmap_page(dev, addr, size, dir, attrs); @@ -153,7 +179,7 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, int ents; BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); else ents = ops->map_sg(dev, sg, nents, dir, attrs); @@ -172,7 +198,7 @@ void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, BUG_ON(!valid_dma_direction(dir)); debug_dma_unmap_sg(dev, sg, nents, dir); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) dma_direct_unmap_sg(dev, sg, nents, dir, attrs); else if (ops->unmap_sg) ops->unmap_sg(dev, sg, nents, dir, attrs); @@ -191,7 +217,7 @@ dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) return DMA_MAPPING_ERROR; - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs); else if (ops->map_resource) addr = ops->map_resource(dev, phys_addr, size, dir, attrs); @@ -207,7 +233,7 @@ void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (!dma_is_direct(ops) && ops->unmap_resource) + if (!dma_map_direct(dev, ops) && ops->unmap_resource) ops->unmap_resource(dev, addr, size, dir, attrs); debug_dma_unmap_resource(dev, addr, size, dir); } @@ -219,7 +245,7 @@ void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) dma_direct_sync_single_for_cpu(dev, addr, size, dir); else if (ops->sync_single_for_cpu) ops->sync_single_for_cpu(dev, addr, size, dir); @@ -233,7 +259,7 @@ void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) dma_direct_sync_single_for_device(dev, addr, size, dir); else if (ops->sync_single_for_device) ops->sync_single_for_device(dev, addr, size, dir); @@ -247,7 +273,7 @@ void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir); else if (ops->sync_sg_for_cpu) ops->sync_sg_for_cpu(dev, sg, nelems, dir); @@ -261,7 +287,7 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) dma_direct_sync_sg_for_device(dev, sg, nelems, dir); else if (ops->sync_sg_for_device) ops->sync_sg_for_device(dev, sg, nelems, dir); @@ -302,7 +328,7 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) return dma_direct_get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs); if (!ops->get_sgtable) @@ -372,7 +398,7 @@ bool dma_can_mmap(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) return dma_direct_can_mmap(dev); return ops->mmap != NULL; } @@ -397,7 +423,7 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) return dma_direct_mmap(dev, vma, cpu_addr, dma_addr, size, attrs); if (!ops->mmap) @@ -410,7 +436,7 @@ u64 dma_get_required_mask(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) return dma_direct_get_required_mask(dev); if (ops->get_required_mask) return ops->get_required_mask(dev); @@ -441,7 +467,7 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, /* let the implementation decide on the zone to allocate from: */ flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs); else if (ops->alloc) cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); @@ -473,7 +499,7 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, return; debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) dma_direct_free(dev, size, cpu_addr, dma_handle, attrs); else if (ops->free) ops->free(dev, size, cpu_addr, dma_handle, attrs); @@ -484,7 +510,11 @@ int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + /* + * ->dma_supported sets the bypass flag, so we must always call + * into the method here unless the device is truly direct mapped. + */ + if (!ops) return dma_direct_supported(dev, mask); if (!ops->dma_supported) return 1; @@ -540,7 +570,7 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size, BUG_ON(!valid_dma_direction(dir)); - if (dma_is_direct(ops)) + if (dma_alloc_direct(dev, ops)) arch_dma_cache_sync(dev, vaddr, size, dir); else if (ops->cache_sync) ops->cache_sync(dev, vaddr, size, dir); @@ -552,7 +582,7 @@ size_t dma_max_mapping_size(struct device *dev) const struct dma_map_ops *ops = get_dma_ops(dev); size_t size = SIZE_MAX; - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) size = dma_direct_max_mapping_size(dev); else if (ops && ops->max_mapping_size) size = ops->max_mapping_size(dev); @@ -565,7 +595,7 @@ bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) + if (dma_map_direct(dev, ops)) return dma_direct_need_sync(dev, dma_addr); return ops->sync_single_for_cpu || ops->sync_single_for_device; } -- cgit v1.2.3 From 23efed6fa7519e29a01f12723b0cfa8118dc87cb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Jul 2020 17:00:16 +0200 Subject: dma-debug: use named initializers for dir2name Make dir2name a little more readable and maintainable by using named initializers. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/debug.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 36c962a86bf2..4dd0d9028ec8 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -144,8 +144,12 @@ static const char *type2name[] = { [dma_debug_resource] = "resource", }; -static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE", - "DMA_FROM_DEVICE", "DMA_NONE" }; +static const char *dir2name[] = { + [DMA_BIDIRECTIONAL] = "DMA_BIDIRECTIONAL", + [DMA_TO_DEVICE] = "DMA_TO_DEVICE", + [DMA_FROM_DEVICE] = "DMA_FROM_DEVICE", + [DMA_NONE] = "DMA_NONE", +}; /* * The access to some variables in this macro is racy. We can't use atomic_t -- cgit v1.2.3 From 1caef81da05a84a40dbf02110e967ce6d1135ff6 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Sun, 19 Jul 2020 12:04:12 +0200 Subject: pid: use checkpoint_restore_ns_capable() for set_tid Use the newly introduced capability CAP_CHECKPOINT_RESTORE to allow using clone3() with set_tid set. Signed-off-by: Adrian Reber Signed-off-by: Nicolas Viennot Reviewed-by: Serge Hallyn Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20200719100418.2112740-3-areber@redhat.com Signed-off-by: Christian Brauner --- kernel/pid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index f1496b757162..450d40469b1c 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -198,7 +198,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, if (tid != 1 && !tmp->child_reaper) goto out_free; retval = -EPERM; - if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN)) + if (!checkpoint_restore_ns_capable(tmp->user_ns)) goto out_free; set_tid_size--; } -- cgit v1.2.3 From b9a3db92e1a1f281724e2f34b849d18d1a53bece Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Sun, 19 Jul 2020 12:04:13 +0200 Subject: pid_namespace: use checkpoint_restore_ns_capable() for ns_last_pid Use the newly introduced capability CAP_CHECKPOINT_RESTORE to allow writing to ns_last_pid. Signed-off-by: Adrian Reber Signed-off-by: Nicolas Viennot Reviewed-by: Serge Hallyn Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20200719100418.2112740-4-areber@redhat.com Signed-off-by: Christian Brauner --- kernel/pid_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 0e5ac162c3a8..ac135bd600eb 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -269,7 +269,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, struct ctl_table tmp = *table; int ret, next; - if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) + if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns)) return -EPERM; /* -- cgit v1.2.3 From ebd6de6812387a2db9a52842cfbe004da1dd3be8 Mon Sep 17 00:00:00 2001 From: Nicolas Viennot Date: Sun, 19 Jul 2020 12:04:15 +0200 Subject: prctl: Allow local CAP_CHECKPOINT_RESTORE to change /proc/self/exe Originally, only a local CAP_SYS_ADMIN could change the exe link, making it difficult for doing checkpoint/restore without CAP_SYS_ADMIN. This commit adds CAP_CHECKPOINT_RESTORE in addition to CAP_SYS_ADMIN for permitting changing the exe link. The following describes the history of the /proc/self/exe permission checks as it may be difficult to understand what decisions lead to this point. * [1] May 2012: This commit introduces the ability of changing /proc/self/exe if the user is CAP_SYS_RESOURCE capable. In the related discussion [2], no clear thread model is presented for what could happen if the /proc/self/exe changes multiple times, or why would the admin be at the mercy of userspace. * [3] Oct 2014: This commit introduces a new API to change /proc/self/exe. The permission no longer checks for CAP_SYS_RESOURCE, but instead checks if the current user is root (uid=0) in its local namespace. In the related discussion [4] it is said that "Controlling exe_fd without privileges may turn out to be dangerous. At least things like tomoyo examine it for making policy decisions (see tomoyo_manager())." * [5] Dec 2016: This commit removes the restriction to change /proc/self/exe at most once. The related discussion [6] informs that the audit subsystem relies on the exe symlink, presumably audit_log_d_path_exe() in kernel/audit.c. * [7] May 2017: This commit changed the check from uid==0 to local CAP_SYS_ADMIN. No discussion. * [8] July 2020: A PoC to spoof any program's /proc/self/exe via ptrace is demonstrated Overall, the concrete points that were made to retain capability checks around changing the exe symlink is that tomoyo_manager() and audit_log_d_path_exe() uses the exe_file path. Christian Brauner said that relying on /proc//exe being immutable (or guarded by caps) in a sake of security is a bit misleading. It can only be used as a hint without any guarantees of what code is being executed once execve() returns to userspace. Christian suggested that in the future, we could call audit_log() or similar to inform the admin of all exe link changes, instead of attempting to provide security guarantees via permission checks. However, this proposed change requires the understanding of the security implications in the tomoyo/audit subsystems. [1] b32dfe377102 ("c/r: prctl: add ability to set new mm_struct::exe_file") [2] https://lore.kernel.org/patchwork/patch/292515/ [3] f606b77f1a9e ("prctl: PR_SET_MM -- introduce PR_SET_MM_MAP operation") [4] https://lore.kernel.org/patchwork/patch/479359/ [5] 3fb4afd9a504 ("prctl: remove one-shot limitation for changing exe link") [6] https://lore.kernel.org/patchwork/patch/697304/ [7] 4d28df6152aa ("prctl: Allow local CAP_SYS_ADMIN changing exe_file") [8] https://github.com/nviennot/run_as_exe Signed-off-by: Nicolas Viennot Signed-off-by: Adrian Reber Link: https://lore.kernel.org/r/20200719100418.2112740-6-areber@redhat.com Signed-off-by: Christian Brauner --- kernel/sys.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 00a96746e28a..a3f4ef0bbda3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2007,11 +2007,14 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (prctl_map.exe_fd != (u32)-1) { /* - * Make sure the caller has the rights to - * change /proc/pid/exe link: only local sys admin should - * be allowed to. + * Check if the current user is checkpoint/restore capable. + * At the time of this writing, it checks for CAP_SYS_ADMIN + * or CAP_CHECKPOINT_RESTORE. + * Note that a user with access to ptrace can masquerade an + * arbitrary program as any executable, even setuid ones. + * This may have implications in the tomoyo subsystem. */ - if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + if (!checkpoint_restore_ns_capable(current_user_ns())) return -EINVAL; error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); -- cgit v1.2.3 From 227175b2c914bfa4a9aa5b210c7cabd42c5858ac Mon Sep 17 00:00:00 2001 From: Nicolas Viennot Date: Sun, 19 Jul 2020 12:04:16 +0200 Subject: prctl: exe link permission error changed from -EINVAL to -EPERM This brings consistency with the rest of the prctl() syscall where -EPERM is returned when failing a capability check. Signed-off-by: Nicolas Viennot Signed-off-by: Adrian Reber Reviewed-by: Serge Hallyn Link: https://lore.kernel.org/r/20200719100418.2112740-7-areber@redhat.com Signed-off-by: Christian Brauner --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index a3f4ef0bbda3..ca11af9d815d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2015,7 +2015,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data * This may have implications in the tomoyo subsystem. */ if (!checkpoint_restore_ns_capable(current_user_ns())) - return -EINVAL; + return -EPERM; error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); if (error) -- cgit v1.2.3 From 1b86abc1c645ad5c9c7bf70910cb3ce73939d2d7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jul 2020 13:11:24 +0800 Subject: sched_clock: Expose struct clock_read_data In order to support perf_event_mmap_page::cap_time features, an architecture needs, aside from a userspace readable counter register, to expose the exact clock data so that userspace can convert the counter register into a correct timestamp. Provide struct clock_read_data and two (seqcount) helpers so that architectures (arm64 in specific) can expose the numbers to userspace. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Leo Yan Link: https://lore.kernel.org/r/20200716051130.4359-2-leo.yan@linaro.org Signed-off-by: Will Deacon --- include/linux/sched_clock.h | 28 ++++++++++++++++++++++++++++ kernel/time/sched_clock.c | 41 +++++++++++++---------------------------- 2 files changed, 41 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched_clock.h b/include/linux/sched_clock.h index 0bb04a96a6d4..528718e4ed52 100644 --- a/include/linux/sched_clock.h +++ b/include/linux/sched_clock.h @@ -6,6 +6,34 @@ #define LINUX_SCHED_CLOCK #ifdef CONFIG_GENERIC_SCHED_CLOCK +/** + * struct clock_read_data - data required to read from sched_clock() + * + * @epoch_ns: sched_clock() value at last update + * @epoch_cyc: Clock cycle value at last update. + * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit + * clocks. + * @read_sched_clock: Current clock source (or dummy source when suspended). + * @mult: Multipler for scaled math conversion. + * @shift: Shift value for scaled math conversion. + * + * Care must be taken when updating this structure; it is read by + * some very hot code paths. It occupies <=40 bytes and, when combined + * with the seqcount used to synchronize access, comfortably fits into + * a 64 byte cache line. + */ +struct clock_read_data { + u64 epoch_ns; + u64 epoch_cyc; + u64 sched_clock_mask; + u64 (*read_sched_clock)(void); + u32 mult; + u32 shift; +}; + +extern struct clock_read_data *sched_clock_read_begin(unsigned int *seq); +extern int sched_clock_read_retry(unsigned int seq); + extern void generic_sched_clock_init(void); extern void sched_clock_register(u64 (*read)(void), int bits, diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index fa3f800d7d76..0acaadc3156c 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -19,31 +19,6 @@ #include "timekeeping.h" -/** - * struct clock_read_data - data required to read from sched_clock() - * - * @epoch_ns: sched_clock() value at last update - * @epoch_cyc: Clock cycle value at last update. - * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit - * clocks. - * @read_sched_clock: Current clock source (or dummy source when suspended). - * @mult: Multipler for scaled math conversion. - * @shift: Shift value for scaled math conversion. - * - * Care must be taken when updating this structure; it is read by - * some very hot code paths. It occupies <=40 bytes and, when combined - * with the seqcount used to synchronize access, comfortably fits into - * a 64 byte cache line. - */ -struct clock_read_data { - u64 epoch_ns; - u64 epoch_cyc; - u64 sched_clock_mask; - u64 (*read_sched_clock)(void); - u32 mult; - u32 shift; -}; - /** * struct clock_data - all data needed for sched_clock() (including * registration of a new clock source) @@ -93,6 +68,17 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) return (cyc * mult) >> shift; } +struct clock_read_data *sched_clock_read_begin(unsigned int *seq) +{ + *seq = raw_read_seqcount(&cd.seq); + return cd.read_data + (*seq & 1); +} + +int sched_clock_read_retry(unsigned int seq) +{ + return read_seqcount_retry(&cd.seq, seq); +} + unsigned long long notrace sched_clock(void) { u64 cyc, res; @@ -100,13 +86,12 @@ unsigned long long notrace sched_clock(void) struct clock_read_data *rd; do { - seq = raw_read_seqcount(&cd.seq); - rd = cd.read_data + (seq & 1); + rd = sched_clock_read_begin(&seq); cyc = (rd->read_sched_clock() - rd->epoch_cyc) & rd->sched_clock_mask; res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); - } while (read_seqcount_retry(&cd.seq, seq)); + } while (sched_clock_read_retry(seq)); return res; } -- cgit v1.2.3 From aadd6e5caaacd6feca9691ba30536e7de5a7d152 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 16 Jul 2020 13:11:25 +0800 Subject: time/sched_clock: Use raw_read_seqcount_latch() sched_clock uses seqcount_t latching to switch between two storage places protected by the sequence counter. This allows it to have interruptible, NMI-safe, seqcount_t write side critical sections. Since 7fc26327b756 ("seqlock: Introduce raw_read_seqcount_latch()"), raw_read_seqcount_latch() became the standardized way for seqcount_t latch read paths. Due to the dependent load, it also has one read memory barrier less than the currently used raw_read_seqcount() API. Use raw_read_seqcount_latch() for the seqcount_t latch read path. Signed-off-by: Ahmed S. Darwish Signed-off-by: Leo Yan Link: https://lkml.kernel.org/r/20200625085745.GD117543@hirez.programming.kicks-ass.net Link: https://lkml.kernel.org/r/20200715092345.GA231464@debian-buster-darwi.lab.linutronix.de Link: https://lore.kernel.org/r/20200716051130.4359-3-leo.yan@linaro.org References: 1809bfa44e10 ("timers, sched/clock: Avoid deadlock during read from NMI") Signed-off-by: Will Deacon --- kernel/time/sched_clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 0acaadc3156c..0deaf4b79fb4 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -70,7 +70,7 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) struct clock_read_data *sched_clock_read_begin(unsigned int *seq) { - *seq = raw_read_seqcount(&cd.seq); + *seq = raw_read_seqcount_latch(&cd.seq); return cd.read_data + (*seq & 1); } -- cgit v1.2.3 From 4834177e633258fbf3c5754b1220f01c705b79eb Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Thu, 9 Jul 2020 01:19:11 -0500 Subject: ima: Support additional conditionals in the KEXEC_CMDLINE hook function Take the properties of the kexec kernel's inode and the current task ownership into consideration when matching a KEXEC_CMDLINE operation to the rules in the IMA policy. This allows for some uniformity when writing IMA policy rules for KEXEC_KERNEL_CHECK, KEXEC_INITRAMFS_CHECK, and KEXEC_CMDLINE operations. Prior to this patch, it was not possible to write a set of rules like this: dont_measure func=KEXEC_KERNEL_CHECK obj_type=foo_t dont_measure func=KEXEC_INITRAMFS_CHECK obj_type=foo_t dont_measure func=KEXEC_CMDLINE obj_type=foo_t measure func=KEXEC_KERNEL_CHECK measure func=KEXEC_INITRAMFS_CHECK measure func=KEXEC_CMDLINE The inode information associated with the kernel being loaded by a kexec_kernel_load(2) syscall can now be included in the decision to measure or not Additonally, the uid, euid, and subj_* conditionals can also now be used in KEXEC_CMDLINE rules. There was no technical reason as to why those conditionals weren't being considered previously other than ima_match_rules() didn't have a valid inode to use so it immediately bailed out for KEXEC_CMDLINE operations rather than going through the full list of conditional comparisons. Signed-off-by: Tyler Hicks Cc: Eric Biederman Cc: kexec@lists.infradead.org Reviewed-by: Lakshmi Ramasubramanian Signed-off-by: Mimi Zohar --- include/linux/ima.h | 4 ++-- kernel/kexec_file.c | 2 +- security/integrity/ima/ima.h | 2 +- security/integrity/ima/ima_api.c | 2 +- security/integrity/ima/ima_appraise.c | 2 +- security/integrity/ima/ima_asymmetric_keys.c | 2 +- security/integrity/ima/ima_main.c | 23 +++++++++++++++++------ security/integrity/ima/ima_policy.c | 17 ++++++----------- security/integrity/ima/ima_queue_keys.c | 2 +- 9 files changed, 31 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/ima.h b/include/linux/ima.h index 9164e1534ec9..d15100de6cdd 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -25,7 +25,7 @@ extern int ima_post_read_file(struct file *file, void *buf, loff_t size, enum kernel_read_file_id id); extern void ima_post_path_mknod(struct dentry *dentry); extern int ima_file_hash(struct file *file, char *buf, size_t buf_size); -extern void ima_kexec_cmdline(const void *buf, int size); +extern void ima_kexec_cmdline(int kernel_fd, const void *buf, int size); #ifdef CONFIG_IMA_KEXEC extern void ima_add_kexec_buffer(struct kimage *image); @@ -103,7 +103,7 @@ static inline int ima_file_hash(struct file *file, char *buf, size_t buf_size) return -EOPNOTSUPP; } -static inline void ima_kexec_cmdline(const void *buf, int size) {} +static inline void ima_kexec_cmdline(int kernel_fd, const void *buf, int size) {} #endif /* CONFIG_IMA */ #ifndef CONFIG_IMA_KEXEC diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index bb05fd52de85..07df431c1f21 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -287,7 +287,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, goto out; } - ima_kexec_cmdline(image->cmdline_buf, + ima_kexec_cmdline(kernel_fd, image->cmdline_buf, image->cmdline_buf_len - 1); } diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index ea7e77536f3c..576ae2c6d418 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -265,7 +265,7 @@ void ima_store_measurement(struct integrity_iint_cache *iint, struct file *file, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig, int pcr, struct ima_template_desc *template_desc); -void process_buffer_measurement(const void *buf, int size, +void process_buffer_measurement(struct inode *inode, const void *buf, int size, const char *eventname, enum ima_hooks func, int pcr, const char *keyring); void ima_audit_measurement(struct integrity_iint_cache *iint, diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c index bf22de8b7ce0..4f39fb93f278 100644 --- a/security/integrity/ima/ima_api.c +++ b/security/integrity/ima/ima_api.c @@ -162,7 +162,7 @@ err_out: /** * ima_get_action - appraise & measure decision based on policy. - * @inode: pointer to inode to measure + * @inode: pointer to the inode associated with the object being validated * @cred: pointer to credentials structure to validate * @secid: secid of the task being validated * @mask: contains the permission mask (MAY_READ, MAY_WRITE, MAY_EXEC, diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c index a9649b04b9f1..6c52bf7ea7f0 100644 --- a/security/integrity/ima/ima_appraise.c +++ b/security/integrity/ima/ima_appraise.c @@ -328,7 +328,7 @@ int ima_check_blacklist(struct integrity_iint_cache *iint, rc = is_binary_blacklisted(digest, digestsize); if ((rc == -EPERM) && (iint->flags & IMA_MEASURE)) - process_buffer_measurement(digest, digestsize, + process_buffer_measurement(NULL, digest, digestsize, "blacklisted-hash", NONE, pcr, NULL); } diff --git a/security/integrity/ima/ima_asymmetric_keys.c b/security/integrity/ima/ima_asymmetric_keys.c index aaae80c4e376..1c68c500c26f 100644 --- a/security/integrity/ima/ima_asymmetric_keys.c +++ b/security/integrity/ima/ima_asymmetric_keys.c @@ -58,7 +58,7 @@ void ima_post_key_create_or_update(struct key *keyring, struct key *key, * if the IMA policy is configured to measure a key linked * to the given keyring. */ - process_buffer_measurement(payload, payload_len, + process_buffer_measurement(NULL, payload, payload_len, keyring->description, KEY_CHECK, 0, keyring->description); } diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 8351b2fd48e0..8a91711ca79b 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -726,6 +726,7 @@ int ima_load_data(enum kernel_load_data_id id) /* * process_buffer_measurement - Measure the buffer to ima log. + * @inode: inode associated with the object being measured (NULL for KEY_CHECK) * @buf: pointer to the buffer that needs to be added to the log. * @size: size of buffer(in bytes). * @eventname: event name to be used for the buffer entry. @@ -735,7 +736,7 @@ int ima_load_data(enum kernel_load_data_id id) * * Based on policy, the buffer is measured into the ima log. */ -void process_buffer_measurement(const void *buf, int size, +void process_buffer_measurement(struct inode *inode, const void *buf, int size, const char *eventname, enum ima_hooks func, int pcr, const char *keyring) { @@ -768,7 +769,7 @@ void process_buffer_measurement(const void *buf, int size, */ if (func) { security_task_getsecid(current, &secid); - action = ima_get_action(NULL, current_cred(), secid, 0, func, + action = ima_get_action(inode, current_cred(), secid, 0, func, &pcr, &template, keyring); if (!(action & IMA_MEASURE)) return; @@ -823,16 +824,26 @@ out: /** * ima_kexec_cmdline - measure kexec cmdline boot args + * @kernel_fd: file descriptor of the kexec kernel being loaded * @buf: pointer to buffer * @size: size of buffer * * Buffers can only be measured, not appraised. */ -void ima_kexec_cmdline(const void *buf, int size) +void ima_kexec_cmdline(int kernel_fd, const void *buf, int size) { - if (buf && size != 0) - process_buffer_measurement(buf, size, "kexec-cmdline", - KEXEC_CMDLINE, 0, NULL); + struct fd f; + + if (!buf || !size) + return; + + f = fdget(kernel_fd); + if (!f.file) + return; + + process_buffer_measurement(file_inode(f.file), buf, size, + "kexec-cmdline", KEXEC_CMDLINE, 0, NULL); + fdput(f); } static int __init init_ima(void) diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index dcd1aaac4ff0..9284055ee13a 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -443,13 +443,9 @@ static bool ima_match_rules(struct ima_rule_entry *rule, struct inode *inode, { int i; - if ((func == KEXEC_CMDLINE) || (func == KEY_CHECK)) { - if ((rule->flags & IMA_FUNC) && (rule->func == func)) { - if (func == KEY_CHECK) - return ima_match_keyring(rule, keyring, cred); - return true; - } - return false; + if (func == KEY_CHECK) { + return (rule->flags & IMA_FUNC) && (rule->func == func) && + ima_match_keyring(rule, keyring, cred); } if ((rule->flags & IMA_FUNC) && (rule->func != func && func != POST_SETATTR)) @@ -1035,10 +1031,9 @@ static bool ima_validate_rule(struct ima_rule_entry *entry) if (entry->action & ~(MEASURE | DONT_MEASURE)) return false; - if (entry->flags & ~(IMA_FUNC | IMA_PCR)) - return false; - - if (ima_rule_contains_lsm_cond(entry)) + if (entry->flags & ~(IMA_FUNC | IMA_FSMAGIC | IMA_UID | + IMA_FOWNER | IMA_FSUUID | IMA_EUID | + IMA_PCR | IMA_FSNAME)) return false; break; diff --git a/security/integrity/ima/ima_queue_keys.c b/security/integrity/ima/ima_queue_keys.c index 56ce24a18b66..69a8626a35c0 100644 --- a/security/integrity/ima/ima_queue_keys.c +++ b/security/integrity/ima/ima_queue_keys.c @@ -158,7 +158,7 @@ void ima_process_queued_keys(void) list_for_each_entry_safe(entry, tmp, &ima_keys, list) { if (!timer_expired) - process_buffer_measurement(entry->payload, + process_buffer_measurement(NULL, entry->payload, entry->payload_len, entry->keyring_name, KEY_CHECK, 0, -- cgit v1.2.3 From be619f7f063a49c656f620a46af4f8ea3e759e91 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 13 Jul 2020 12:06:48 -0500 Subject: exec: Implement kernel_execve To allow the kernel not to play games with set_fs to call exec implement kernel_execve. The function kernel_execve takes pointers into kernel memory and copies the values pointed to onto the new userspace stack. The calls with arguments from kernel space of do_execve are replaced with calls to kernel_execve. The calls do_execve and do_execveat are made static as there are now no callers outside of exec. The comments that mention do_execve are updated to refer to kernel_execve or execve depending on the circumstances. In addition to correcting the comments, this makes it easy to grep for do_execve and verify it is not used. Inspired-by: https://lkml.kernel.org/r/20200627072704.2447163-1-hch@lst.de Reviewed-by: Kees Cook Link: https://lkml.kernel.org/r/87wo365ikj.fsf@x220.int.ebiederm.org Signed-off-by: "Eric W. Biederman" --- arch/x86/entry/entry_32.S | 2 +- arch/x86/entry/entry_64.S | 2 +- arch/x86/kernel/unwind_frame.c | 2 +- fs/exec.c | 88 +++++++++++++++++++++++++++++++++++++++++- include/linux/binfmts.h | 9 +---- init/main.c | 4 +- kernel/umh.c | 6 +-- security/tomoyo/common.h | 2 +- security/tomoyo/domain.c | 4 +- security/tomoyo/tomoyo.c | 4 +- 10 files changed, 100 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 024d7d276cd4..8f4e085ee06d 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -854,7 +854,7 @@ SYM_CODE_START(ret_from_fork) CALL_NOSPEC ebx /* * A kernel thread is allowed to return here after successfully - * calling do_execve(). Exit to userspace to complete the execve() + * calling kernel_execve(). Exit to userspace to complete the execve() * syscall. */ movl $0, PT_EAX(%esp) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d2a00c97e53f..73c7e255256b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -293,7 +293,7 @@ SYM_CODE_START(ret_from_fork) CALL_NOSPEC rbx /* * A kernel thread is allowed to return here after successfully - * calling do_execve(). Exit to userspace to complete the execve() + * calling kernel_execve(). Exit to userspace to complete the execve() * syscall. */ movq $0, RAX(%rsp) diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 722a85f3b2dd..e40b4942157f 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -275,7 +275,7 @@ bool unwind_next_frame(struct unwind_state *state) * This user_mode() check is slightly broader than a PF_KTHREAD * check because it also catches the awkward situation where a * newly forked kthread transitions into a user task by calling - * do_execve(), which eventually clears PF_KTHREAD. + * kernel_execve(), which eventually clears PF_KTHREAD. */ if (!user_mode(regs)) goto the_end; diff --git a/fs/exec.c b/fs/exec.c index f8135dc149b3..3698252719a3 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -448,6 +448,23 @@ static int count(struct user_arg_ptr argv, int max) return i; } +static int count_strings_kernel(const char *const *argv) +{ + int i; + + if (!argv) + return 0; + + for (i = 0; argv[i]; ++i) { + if (i >= MAX_ARG_STRINGS) + return -E2BIG; + if (fatal_signal_pending(current)) + return -ERESTARTNOHAND; + cond_resched(); + } + return i; +} + static int bprm_stack_limits(struct linux_binprm *bprm) { unsigned long limit, ptr_size; @@ -624,6 +641,20 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) } EXPORT_SYMBOL(copy_string_kernel); +static int copy_strings_kernel(int argc, const char *const *argv, + struct linux_binprm *bprm) +{ + while (argc-- > 0) { + int ret = copy_string_kernel(argv[argc], bprm); + if (ret < 0) + return ret; + if (fatal_signal_pending(current)) + return -ERESTARTNOHAND; + cond_resched(); + } + return 0; +} + #ifdef CONFIG_MMU /* @@ -1991,7 +2022,60 @@ out_ret: return retval; } -int do_execve(struct filename *filename, +int kernel_execve(const char *kernel_filename, + const char *const *argv, const char *const *envp) +{ + struct filename *filename; + struct linux_binprm *bprm; + int fd = AT_FDCWD; + int retval; + + filename = getname_kernel(kernel_filename); + if (IS_ERR(filename)) + return PTR_ERR(filename); + + bprm = alloc_bprm(fd, filename); + if (IS_ERR(bprm)) { + retval = PTR_ERR(bprm); + goto out_ret; + } + + retval = count_strings_kernel(argv); + if (retval < 0) + goto out_free; + bprm->argc = retval; + + retval = count_strings_kernel(envp); + if (retval < 0) + goto out_free; + bprm->envc = retval; + + retval = bprm_stack_limits(bprm); + if (retval < 0) + goto out_free; + + retval = copy_string_kernel(bprm->filename, bprm); + if (retval < 0) + goto out_free; + bprm->exec = bprm->p; + + retval = copy_strings_kernel(bprm->envc, envp, bprm); + if (retval < 0) + goto out_free; + + retval = copy_strings_kernel(bprm->argc, argv, bprm); + if (retval < 0) + goto out_free; + + retval = bprm_execve(bprm, fd, filename, 0); +out_free: + free_bprm(bprm); +out_ret: + putname(filename); + return retval; +} + +static int do_execve(struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp) { @@ -2000,7 +2084,7 @@ int do_execve(struct filename *filename, return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } -int do_execveat(int fd, struct filename *filename, +static int do_execveat(int fd, struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp, int flags) diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 8e9e1b0c8eb8..0571701ab1c5 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -135,12 +135,7 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm); extern void set_binfmt(struct linux_binfmt *new); extern ssize_t read_code(struct file *, unsigned long, loff_t, size_t); -extern int do_execve(struct filename *, - const char __user * const __user *, - const char __user * const __user *); -extern int do_execveat(int, struct filename *, - const char __user * const __user *, - const char __user * const __user *, - int); +int kernel_execve(const char *filename, + const char *const *argv, const char *const *envp); #endif /* _LINUX_BINFMTS_H */ diff --git a/init/main.c b/init/main.c index 0ead83e86b5a..78ccec5c28f3 100644 --- a/init/main.c +++ b/init/main.c @@ -1329,9 +1329,7 @@ static int run_init_process(const char *init_filename) pr_debug(" with environment:\n"); for (p = envp_init; *p; p++) pr_debug(" %s\n", *p); - return do_execve(getname_kernel(init_filename), - (const char __user *const __user *)argv_init, - (const char __user *const __user *)envp_init); + return kernel_execve(init_filename, argv_init, envp_init); } static int try_to_run_init_process(const char *init_filename) diff --git a/kernel/umh.c b/kernel/umh.c index 6ca2096298b9..a25433f9cd9a 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -98,9 +98,9 @@ static int call_usermodehelper_exec_async(void *data) commit_creds(new); - retval = do_execve(getname_kernel(sub_info->path), - (const char __user *const __user *)sub_info->argv, - (const char __user *const __user *)sub_info->envp); + retval = kernel_execve(sub_info->path, + (const char *const *)sub_info->argv, + (const char *const *)sub_info->envp); out: sub_info->retval = retval; /* diff --git a/security/tomoyo/common.h b/security/tomoyo/common.h index 050473df5809..85246b9df7ca 100644 --- a/security/tomoyo/common.h +++ b/security/tomoyo/common.h @@ -425,7 +425,7 @@ struct tomoyo_request_info { struct tomoyo_obj_info *obj; /* * For holding parameters specific to execve() request. - * NULL if not dealing do_execve(). + * NULL if not dealing execve(). */ struct tomoyo_execve *ee; struct tomoyo_domain_info *domain; diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index 7869d6a9980b..53b3e1f5f227 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c @@ -767,7 +767,7 @@ retry: /* * Check for domain transition preference if "file execute" matched. - * If preference is given, make do_execve() fail if domain transition + * If preference is given, make execve() fail if domain transition * has failed, for domain transition preference should be used with * destination domain defined. */ @@ -810,7 +810,7 @@ force_reset_domain: snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "<%s>", candidate->name); /* - * Make do_execve() fail if domain transition across namespaces + * Make execve() fail if domain transition across namespaces * has failed. */ reject_on_transition_failure = true; diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c index f9adddc42ac8..1f3cd432d830 100644 --- a/security/tomoyo/tomoyo.c +++ b/security/tomoyo/tomoyo.c @@ -93,7 +93,7 @@ static int tomoyo_bprm_check_security(struct linux_binprm *bprm) struct tomoyo_task *s = tomoyo_task(current); /* - * Execute permission is checked against pathname passed to do_execve() + * Execute permission is checked against pathname passed to execve() * using current domain. */ if (!s->old_domain_info) { @@ -307,7 +307,7 @@ static int tomoyo_file_fcntl(struct file *file, unsigned int cmd, */ static int tomoyo_file_open(struct file *f) { - /* Don't check read permission here if called from do_execve(). */ + /* Don't check read permission here if called from execve(). */ if (current->in_execve) return 0; return tomoyo_check_open_permission(tomoyo_domain(), &f->f_path, -- cgit v1.2.3 From f1d9b23cabc61e58509164c3c3132556476491d2 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 13 Jul 2020 15:51:59 -0400 Subject: audit: purge audit_log_string from the intra-kernel audit API audit_log_string() was inteded to be an internal audit function and since there are only two internal uses, remove them. Purge all external uses of it by restructuring code to use an existing audit_log_format() or using audit_log_format(). Please see the upstream issue https://github.com/linux-audit/audit-kernel/issues/84 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- include/linux/audit.h | 5 ----- kernel/audit.c | 4 ++-- security/apparmor/audit.c | 10 ++++------ security/apparmor/file.c | 25 +++++++------------------ security/apparmor/ipc.c | 46 +++++++++++++++++++++++----------------------- security/apparmor/net.c | 14 ++++++++------ security/lsm_audit.c | 4 ++-- 7 files changed, 46 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 523f77494847..b3d859831a31 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -694,9 +694,4 @@ static inline bool audit_loginuid_set(struct task_struct *tsk) return uid_valid(audit_get_loginuid(tsk)); } -static inline void audit_log_string(struct audit_buffer *ab, const char *buf) -{ - audit_log_n_string(ab, buf, strlen(buf)); -} - #endif diff --git a/kernel/audit.c b/kernel/audit.c index 8c201f414226..a2f3e34aa724 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2080,13 +2080,13 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, /* We will allow 11 spaces for ' (deleted)' to be appended */ pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); if (!pathname) { - audit_log_string(ab, ""); + audit_log_format(ab, "\"\""); return; } p = d_path(path, pathname, PATH_MAX+11); if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ /* FIXME: can we save some information here? */ - audit_log_string(ab, ""); + audit_log_format(ab, "\"\""); } else audit_log_untrustedstring(ab, p); kfree(pathname); diff --git a/security/apparmor/audit.c b/security/apparmor/audit.c index 597732503815..f7e97c7e80f3 100644 --- a/security/apparmor/audit.c +++ b/security/apparmor/audit.c @@ -57,18 +57,16 @@ static void audit_pre(struct audit_buffer *ab, void *ca) struct common_audit_data *sa = ca; if (aa_g_audit_header) { - audit_log_format(ab, "apparmor="); - audit_log_string(ab, aa_audit_type[aad(sa)->type]); + audit_log_format(ab, "apparmor=\"%s\"", + aa_audit_type[aad(sa)->type]); } if (aad(sa)->op) { - audit_log_format(ab, " operation="); - audit_log_string(ab, aad(sa)->op); + audit_log_format(ab, " operation=\"%s\"", aad(sa)->op); } if (aad(sa)->info) { - audit_log_format(ab, " info="); - audit_log_string(ab, aad(sa)->info); + audit_log_format(ab, " info=\"%s\"", aad(sa)->info); if (aad(sa)->error) audit_log_format(ab, " error=%d", aad(sa)->error); } diff --git a/security/apparmor/file.c b/security/apparmor/file.c index 9a2d14b7c9f8..92acf9a49405 100644 --- a/security/apparmor/file.c +++ b/security/apparmor/file.c @@ -34,20 +34,6 @@ static u32 map_mask_to_chr_mask(u32 mask) return m; } -/** - * audit_file_mask - convert mask to permission string - * @buffer: buffer to write string to (NOT NULL) - * @mask: permission mask to convert - */ -static void audit_file_mask(struct audit_buffer *ab, u32 mask) -{ - char str[10]; - - aa_perm_mask_to_str(str, sizeof(str), aa_file_perm_chrs, - map_mask_to_chr_mask(mask)); - audit_log_string(ab, str); -} - /** * file_audit_cb - call back for file specific audit fields * @ab: audit_buffer (NOT NULL) @@ -57,14 +43,17 @@ static void file_audit_cb(struct audit_buffer *ab, void *va) { struct common_audit_data *sa = va; kuid_t fsuid = current_fsuid(); + char str[10]; if (aad(sa)->request & AA_AUDIT_FILE_MASK) { - audit_log_format(ab, " requested_mask="); - audit_file_mask(ab, aad(sa)->request); + aa_perm_mask_to_str(str, sizeof(str), aa_file_perm_chrs, + map_mask_to_chr_mask(aad(sa)->request)); + audit_log_format(ab, " requested_mask=\"%s\"", str); } if (aad(sa)->denied & AA_AUDIT_FILE_MASK) { - audit_log_format(ab, " denied_mask="); - audit_file_mask(ab, aad(sa)->denied); + aa_perm_mask_to_str(str, sizeof(str), aa_file_perm_chrs, + map_mask_to_chr_mask(aad(sa)->denied)); + audit_log_format(ab, " denied_mask=\"%s\"", str); } if (aad(sa)->request & AA_AUDIT_FILE_MASK) { audit_log_format(ab, " fsuid=%d", diff --git a/security/apparmor/ipc.c b/security/apparmor/ipc.c index 4ecedffbdd33..fe36d112aad9 100644 --- a/security/apparmor/ipc.c +++ b/security/apparmor/ipc.c @@ -20,25 +20,23 @@ /** * audit_ptrace_mask - convert mask to permission string - * @buffer: buffer to write string to (NOT NULL) * @mask: permission mask to convert + * + * Returns: pointer to static string */ -static void audit_ptrace_mask(struct audit_buffer *ab, u32 mask) +static const char *audit_ptrace_mask(u32 mask) { switch (mask) { case MAY_READ: - audit_log_string(ab, "read"); - break; + return "read"; case MAY_WRITE: - audit_log_string(ab, "trace"); - break; + return "trace"; case AA_MAY_BE_READ: - audit_log_string(ab, "readby"); - break; + return "readby"; case AA_MAY_BE_TRACED: - audit_log_string(ab, "tracedby"); - break; + return "tracedby"; } + return ""; } /* call back to audit ptrace fields */ @@ -47,12 +45,12 @@ static void audit_ptrace_cb(struct audit_buffer *ab, void *va) struct common_audit_data *sa = va; if (aad(sa)->request & AA_PTRACE_PERM_MASK) { - audit_log_format(ab, " requested_mask="); - audit_ptrace_mask(ab, aad(sa)->request); + audit_log_format(ab, " requested_mask=\"%s\"", + audit_ptrace_mask(aad(sa)->request)); if (aad(sa)->denied & AA_PTRACE_PERM_MASK) { - audit_log_format(ab, " denied_mask="); - audit_ptrace_mask(ab, aad(sa)->denied); + audit_log_format(ab, " denied_mask=\"%s\"", + audit_ptrace_mask(aad(sa)->denied)); } } audit_log_format(ab, " peer="); @@ -142,16 +140,18 @@ static inline int map_signal_num(int sig) } /** - * audit_file_mask - convert mask to permission string - * @buffer: buffer to write string to (NOT NULL) + * audit_signal_mask - convert mask to permission string * @mask: permission mask to convert + * + * Returns: pointer to static string */ -static void audit_signal_mask(struct audit_buffer *ab, u32 mask) +static const char *audit_signal_mask(u32 mask) { if (mask & MAY_READ) - audit_log_string(ab, "receive"); + return "receive"; if (mask & MAY_WRITE) - audit_log_string(ab, "send"); + return "send"; + return ""; } /** @@ -164,11 +164,11 @@ static void audit_signal_cb(struct audit_buffer *ab, void *va) struct common_audit_data *sa = va; if (aad(sa)->request & AA_SIGNAL_PERM_MASK) { - audit_log_format(ab, " requested_mask="); - audit_signal_mask(ab, aad(sa)->request); + audit_log_format(ab, " requested_mask=\"%s\"", + audit_signal_mask(aad(sa)->request)); if (aad(sa)->denied & AA_SIGNAL_PERM_MASK) { - audit_log_format(ab, " denied_mask="); - audit_signal_mask(ab, aad(sa)->denied); + audit_log_format(ab, " denied_mask=\"%s\"", + audit_signal_mask(aad(sa)->denied)); } } if (aad(sa)->signal == SIGUNKNOWN) diff --git a/security/apparmor/net.c b/security/apparmor/net.c index d8afc39f663a..fa0e85568450 100644 --- a/security/apparmor/net.c +++ b/security/apparmor/net.c @@ -72,16 +72,18 @@ void audit_net_cb(struct audit_buffer *ab, void *va) { struct common_audit_data *sa = va; - audit_log_format(ab, " family="); if (address_family_names[sa->u.net->family]) - audit_log_string(ab, address_family_names[sa->u.net->family]); + audit_log_format(ab, " family=\"%s\"", + address_family_names[sa->u.net->family]); else - audit_log_format(ab, "\"unknown(%d)\"", sa->u.net->family); - audit_log_format(ab, " sock_type="); + audit_log_format(ab, " family=\"unknown(%d)\"", + sa->u.net->family); if (sock_type_names[aad(sa)->net.type]) - audit_log_string(ab, sock_type_names[aad(sa)->net.type]); + audit_log_format(ab, " sock_type=\"%s\"", + sock_type_names[aad(sa)->net.type]); else - audit_log_format(ab, "\"unknown(%d)\"", aad(sa)->net.type); + audit_log_format(ab, " sock_type=\"unknown(%d)\"", + aad(sa)->net.type); audit_log_format(ab, " protocol=%d", aad(sa)->net.protocol); if (aad(sa)->request & NET_PERMS_MASK) { diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 7c555621c2bd..53d0d183db8f 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -432,8 +432,8 @@ static void dump_common_audit_data(struct audit_buffer *ab, a->u.ibendport->port); break; case LSM_AUDIT_DATA_LOCKDOWN: - audit_log_format(ab, " lockdown_reason="); - audit_log_string(ab, lockdown_reasons[a->u.reason]); + audit_log_format(ab, " lockdown_reason=\"%s\"", + lockdown_reasons[a->u.reason]); break; } /* switch (a->type) */ } -- cgit v1.2.3 From b43870c74f3fdf0cd06bf5f1b7a5ed70a2cd4ed2 Mon Sep 17 00:00:00 2001 From: Max Englander Date: Sat, 4 Jul 2020 15:15:28 +0000 Subject: audit: report audit wait metric in audit status reply In environments where the preservation of audit events and predictable usage of system memory are prioritized, admins may use a combination of --backlog_wait_time and -b options at the risk of degraded performance resulting from backlog waiting. In some cases, this risk may be preferred to lost events or unbounded memory usage. Ideally, this risk can be mitigated by making adjustments when backlog waiting is detected. However, detection can be difficult using the currently available metrics. For example, an admin attempting to debug degraded performance may falsely believe a full backlog indicates backlog waiting. It may turn out the backlog frequently fills up but drains quickly. To make it easier to reliably track degraded performance to backlog waiting, this patch makes the following changes: Add a new field backlog_wait_time_total to the audit status reply. Initialize this field to zero. Add to this field the total time spent by the current task on scheduled timeouts while the backlog limit is exceeded. Reset field to zero upon request via AUDIT_SET. Tested on Ubuntu 18.04 using complementary changes to the audit-userspace and audit-testsuite: - https://github.com/linux-audit/audit-userspace/pull/134 - https://github.com/linux-audit/audit-testsuite/pull/97 Signed-off-by: Max Englander Signed-off-by: Paul Moore --- include/uapi/linux/audit.h | 18 +++++++++++------- kernel/audit.c | 35 +++++++++++++++++++++++++---------- 2 files changed, 36 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 9b6a973f4cc3..cd2d8279a5e4 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -333,14 +333,15 @@ enum { }; /* Status symbols */ - /* Mask values */ -#define AUDIT_STATUS_ENABLED 0x0001 -#define AUDIT_STATUS_FAILURE 0x0002 -#define AUDIT_STATUS_PID 0x0004 + /* Mask values */ +#define AUDIT_STATUS_ENABLED 0x0001 +#define AUDIT_STATUS_FAILURE 0x0002 +#define AUDIT_STATUS_PID 0x0004 #define AUDIT_STATUS_RATE_LIMIT 0x0008 -#define AUDIT_STATUS_BACKLOG_LIMIT 0x0010 -#define AUDIT_STATUS_BACKLOG_WAIT_TIME 0x0020 -#define AUDIT_STATUS_LOST 0x0040 +#define AUDIT_STATUS_BACKLOG_LIMIT 0x0010 +#define AUDIT_STATUS_BACKLOG_WAIT_TIME 0x0020 +#define AUDIT_STATUS_LOST 0x0040 +#define AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL 0x0080 #define AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT 0x00000001 #define AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME 0x00000002 @@ -467,6 +468,9 @@ struct audit_status { __u32 feature_bitmap; /* bitmap of kernel audit features */ }; __u32 backlog_wait_time;/* message queue wait timeout */ + __u32 backlog_wait_time_actual;/* time spent waiting while + * message limit exceeded + */ }; struct audit_features { diff --git a/kernel/audit.c b/kernel/audit.c index a2f3e34aa724..d72663ac248c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -136,6 +136,11 @@ u32 audit_sig_sid = 0; */ static atomic_t audit_lost = ATOMIC_INIT(0); +/* Monotonically increasing sum of time the kernel has spent + * waiting while the backlog limit is exceeded. + */ +static atomic_t audit_backlog_wait_time_actual = ATOMIC_INIT(0); + /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; @@ -1201,17 +1206,18 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_GET: { struct audit_status s; memset(&s, 0, sizeof(s)); - s.enabled = audit_enabled; - s.failure = audit_failure; + s.enabled = audit_enabled; + s.failure = audit_failure; /* NOTE: use pid_vnr() so the PID is relative to the current * namespace */ - s.pid = auditd_pid_vnr(); - s.rate_limit = audit_rate_limit; - s.backlog_limit = audit_backlog_limit; - s.lost = atomic_read(&audit_lost); - s.backlog = skb_queue_len(&audit_queue); - s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; - s.backlog_wait_time = audit_backlog_wait_time; + s.pid = auditd_pid_vnr(); + s.rate_limit = audit_rate_limit; + s.backlog_limit = audit_backlog_limit; + s.lost = atomic_read(&audit_lost); + s.backlog = skb_queue_len(&audit_queue); + s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; + s.backlog_wait_time = audit_backlog_wait_time; + s.backlog_wait_time_actual = atomic_read(&audit_backlog_wait_time_actual); audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } @@ -1315,6 +1321,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_log_config_change("lost", 0, lost, 1); return lost; } + if (s.mask == AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) { + u32 actual = atomic_xchg(&audit_backlog_wait_time_actual, 0); + + audit_log_config_change("backlog_wait_time_actual", 0, actual, 1); + return actual; + } break; } case AUDIT_GET_FEATURE: @@ -1826,12 +1838,15 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, /* sleep if we are allowed and we haven't exhausted our * backlog wait limit */ if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) { + long rtime = stime; + DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(&audit_backlog_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); - stime = schedule_timeout(stime); + stime = schedule_timeout(rtime); + atomic_add(rtime - stime, &audit_backlog_wait_time_actual); remove_wait_queue(&audit_backlog_wait, &wait); } else { if (audit_rate_check() && printk_ratelimit()) -- cgit v1.2.3 From 343ead287dde0064c430ee1db477726f52e0269c Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Tue, 21 Jul 2020 12:07:16 +0200 Subject: bpf, netns: Fix build without CONFIG_INET When CONFIG_NET is set but CONFIG_INET isn't, build fails with: ld: kernel/bpf/net_namespace.o: in function `netns_bpf_attach_type_unneed': kernel/bpf/net_namespace.c:32: undefined reference to `bpf_sk_lookup_enabled' ld: kernel/bpf/net_namespace.o: in function `netns_bpf_attach_type_need': kernel/bpf/net_namespace.c:43: undefined reference to `bpf_sk_lookup_enabled' This is because without CONFIG_INET bpf_sk_lookup_enabled symbol is not available. Wrap references to bpf_sk_lookup_enabled with preprocessor conditionals. Fixes: 1559b4aa1db4 ("inet: Run SK_LOOKUP BPF program on socket lookup") Reported-by: Randy Dunlap Reported-by: Stephen Rothwell Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: Randy Dunlap # build-tested Link: https://lore.kernel.org/bpf/20200721100716.720477-1-jakub@cloudflare.com --- kernel/bpf/net_namespace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 4e1bcaa2c3cb..71405edd667c 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -28,9 +28,11 @@ DEFINE_MUTEX(netns_bpf_mutex); static void netns_bpf_attach_type_unneed(enum netns_bpf_attach_type type) { switch (type) { +#ifdef CONFIG_INET case NETNS_BPF_SK_LOOKUP: static_branch_dec(&bpf_sk_lookup_enabled); break; +#endif default: break; } @@ -39,9 +41,11 @@ static void netns_bpf_attach_type_unneed(enum netns_bpf_attach_type type) static void netns_bpf_attach_type_need(enum netns_bpf_attach_type type) { switch (type) { +#ifdef CONFIG_INET case NETNS_BPF_SK_LOOKUP: static_branch_inc(&bpf_sk_lookup_enabled); break; +#endif default: break; } -- cgit v1.2.3 From c576b9c77bea9a8ad11ddc277af3edbf15733ddd Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sun, 19 Jul 2020 17:52:41 +0200 Subject: bpf: cpumap: Fix possible rcpu kthread hung Fix the following cpumap kthread hung. The issue is currently occurring when __cpu_map_load_bpf_program fails (e.g if the bpf prog has not BPF_XDP_CPUMAP as expected_attach_type) $./test_progs -n 101 101/1 cpumap_with_progs:OK 101 xdp_cpumap_attach:OK Summary: 1/1 PASSED, 0 SKIPPED, 0 FAILED [ 369.996478] INFO: task cpumap/0/map:7:205 blocked for more than 122 seconds. [ 369.998463] Not tainted 5.8.0-rc4-01472-ge57892f50a07 #212 [ 370.000102] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 370.001918] cpumap/0/map:7 D 0 205 2 0x00004000 [ 370.003228] Call Trace: [ 370.003930] __schedule+0x5c7/0xf50 [ 370.004901] ? io_schedule_timeout+0xb0/0xb0 [ 370.005934] ? static_obj+0x31/0x80 [ 370.006788] ? mark_held_locks+0x24/0x90 [ 370.007752] ? cpu_map_bpf_prog_run_xdp+0x6c0/0x6c0 [ 370.008930] schedule+0x6f/0x160 [ 370.009728] schedule_preempt_disabled+0x14/0x20 [ 370.010829] kthread+0x17b/0x240 [ 370.011433] ? kthread_create_worker_on_cpu+0xd0/0xd0 [ 370.011944] ret_from_fork+0x1f/0x30 [ 370.012348] Showing all locks held in the system: [ 370.013025] 1 lock held by khungtaskd/33: [ 370.013432] #0: ffffffff82b24720 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x28/0x1c3 [ 370.014461] ============================================= Fixes: 9216477449f3 ("bpf: cpumap: Add the possibility to attach an eBPF program to cpumap") Reported-by: Jakub Sitnicki Signed-off-by: Lorenzo Bianconi Signed-off-by: Alexei Starovoitov Tested-by: Jakub Sitnicki Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/e54f2aabf959f298939e5507b09c48f8c2e380be.1595170625.git.lorenzo@kernel.org --- kernel/bpf/cpumap.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 4c95d0615ca2..f1c46529929b 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -453,24 +453,27 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) rcpu->map_id = map_id; rcpu->value.qsize = value->qsize; + if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd)) + goto free_ptr_ring; + /* Setup kthread */ rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, "cpumap/%d/map:%d", cpu, map_id); if (IS_ERR(rcpu->kthread)) - goto free_ptr_ring; + goto free_prog; get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */ get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */ - if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd)) - goto free_ptr_ring; - /* Make sure kthread runs on a single CPU */ kthread_bind(rcpu->kthread, cpu); wake_up_process(rcpu->kthread); return rcpu; +free_prog: + if (rcpu->prog) + bpf_prog_put(rcpu->prog); free_ptr_ring: ptr_ring_cleanup(rcpu->queue, NULL); free_queue: -- cgit v1.2.3 From bc4f0548f683a3d53359cef15f088d2d5bb4bc39 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 20 Jul 2020 09:33:58 -0700 Subject: bpf: Compute bpf_skc_to_*() helper socket btf ids at build time Currently, socket types (struct tcp_sock, udp_sock, etc.) used by bpf_skc_to_*() helpers are computed when vmlinux_btf is first built in the kernel. Commit 5a2798ab32ba ("bpf: Add BTF_ID_LIST/BTF_ID/BTF_ID_UNUSED macros") implemented a mechanism to compute btf_ids at kernel build time which can simplify kernel implementation and reduce runtime overhead by removing in-kernel btf_id calculation. This patch did exactly this, removing in-kernel btf_id computation and utilizing build-time btf_id computation. If CONFIG_DEBUG_INFO_BTF is not defined, BTF_ID_LIST will define an array with size of 5, which is not enough for btf_sock_ids. So define its own static array if CONFIG_DEBUG_INFO_BTF is not defined. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200720163358.1393023-1-yhs@fb.com --- include/linux/bpf.h | 4 ---- kernel/bpf/btf.c | 1 - net/core/filter.c | 49 ++++++++++++++++++------------------------------- 3 files changed, 18 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index adb16bdc5f0a..1df1c0fd3f28 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1541,7 +1541,6 @@ static inline bool bpf_map_is_dev_bound(struct bpf_map *map) struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr); void bpf_map_offload_map_free(struct bpf_map *map); -void init_btf_sock_ids(struct btf *btf); #else static inline int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) @@ -1567,9 +1566,6 @@ static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) static inline void bpf_map_offload_map_free(struct bpf_map *map) { } -static inline void init_btf_sock_ids(struct btf *btf) -{ -} #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ #if defined(CONFIG_BPF_STREAM_PARSER) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 03d6d43bb1d6..315cde73421b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3672,7 +3672,6 @@ struct btf *btf_parse_vmlinux(void) goto errout; bpf_struct_ops_init(btf, log); - init_btf_sock_ids(btf); btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); diff --git a/net/core/filter.c b/net/core/filter.c index 2bd129b5ae74..5a65fb4b95ff 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9426,19 +9426,19 @@ void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) * sock_common as the first argument in its memory layout. */ #define BTF_SOCK_TYPE_xxx \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET, "inet_sock") \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_CONN, "inet_connection_sock") \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_REQ, "inet_request_sock") \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_TW, "inet_timewait_sock") \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_REQ, "request_sock") \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK, "sock") \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK_COMMON, "sock_common") \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP, "tcp_sock") \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_REQ, "tcp_request_sock") \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_TW, "tcp_timewait_sock") \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, "tcp6_sock") \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, "udp_sock") \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, "udp6_sock") + BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET, inet_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_CONN, inet_connection_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_REQ, inet_request_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_TW, inet_timewait_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_REQ, request_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK, sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK_COMMON, sock_common) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP, tcp_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_REQ, tcp_request_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_TW, tcp_timewait_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, tcp6_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, udp_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock) enum { #define BTF_SOCK_TYPE(name, str) name, @@ -9447,26 +9447,13 @@ BTF_SOCK_TYPE_xxx MAX_BTF_SOCK_TYPE, }; -static int btf_sock_ids[MAX_BTF_SOCK_TYPE]; - -#ifdef CONFIG_BPF_SYSCALL -static const char *bpf_sock_types[] = { -#define BTF_SOCK_TYPE(name, str) str, +#ifdef CONFIG_DEBUG_INFO_BTF +BTF_ID_LIST(btf_sock_ids) +#define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type) BTF_SOCK_TYPE_xxx #undef BTF_SOCK_TYPE -}; - -void init_btf_sock_ids(struct btf *btf) -{ - int i, btf_id; - - for (i = 0; i < MAX_BTF_SOCK_TYPE; i++) { - btf_id = btf_find_by_name_kind(btf, bpf_sock_types[i], - BTF_KIND_STRUCT); - if (btf_id > 0) - btf_sock_ids[i] = btf_id; - } -} +#else +static u32 btf_sock_ids[MAX_BTF_SOCK_TYPE]; #endif static bool check_arg_btf_id(u32 btf_id, u32 arg) -- cgit v1.2.3 From 951cf368bcb11d6f817709660cf5cd914072c36f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 20 Jul 2020 09:34:03 -0700 Subject: bpf: net: Use precomputed btf_id for bpf iterators One additional field btf_id is added to struct bpf_ctx_arg_aux to store the precomputed btf_ids. The btf_id is computed at build time with BTF_ID_LIST or BTF_ID_LIST_GLOBAL macro definitions. All existing bpf iterators are changed to used pre-compute btf_ids. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200720163403.1393551-1-yhs@fb.com --- include/linux/bpf.h | 1 + kernel/bpf/btf.c | 5 +++-- kernel/bpf/map_iter.c | 7 ++++++- kernel/bpf/task_iter.c | 12 ++++++++++-- net/ipv4/tcp_ipv4.c | 4 +++- net/ipv4/udp.c | 4 +++- net/ipv6/route.c | 7 ++++++- net/netlink/af_netlink.c | 7 ++++++- 8 files changed, 38 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1df1c0fd3f28..bae557ff2da8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -668,6 +668,7 @@ struct bpf_jit_poke_descriptor { struct bpf_ctx_arg_aux { u32 offset; enum bpf_reg_type reg_type; + u32 btf_id; }; struct bpf_prog_aux { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 315cde73421b..ee36b7f60936 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3817,16 +3817,17 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; /* this is a pointer to another type */ - info->reg_type = PTR_TO_BTF_ID; for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; if (ctx_arg_info->offset == off) { info->reg_type = ctx_arg_info->reg_type; - break; + info->btf_id = ctx_arg_info->btf_id; + return true; } } + info->reg_type = PTR_TO_BTF_ID; if (tgt_prog) { ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg); if (ret > 0) { diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index c69071e334bf..8a7af11b411f 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -4,6 +4,7 @@ #include #include #include +#include struct bpf_iter_seq_map_info { u32 mid; @@ -81,7 +82,10 @@ static const struct seq_operations bpf_map_seq_ops = { .show = bpf_map_seq_show, }; -static const struct bpf_iter_reg bpf_map_reg_info = { +BTF_ID_LIST(btf_bpf_map_id) +BTF_ID(struct, bpf_map) + +static struct bpf_iter_reg bpf_map_reg_info = { .target = "bpf_map", .seq_ops = &bpf_map_seq_ops, .init_seq_private = NULL, @@ -96,6 +100,7 @@ static const struct bpf_iter_reg bpf_map_reg_info = { static int __init bpf_map_iter_init(void) { + bpf_map_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_map_id; return bpf_iter_reg_target(&bpf_map_reg_info); } diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 4dbf2b6035f8..2feecf095609 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -7,6 +7,7 @@ #include #include #include +#include struct bpf_iter_seq_task_common { struct pid_namespace *ns; @@ -312,7 +313,11 @@ static const struct seq_operations task_file_seq_ops = { .show = task_file_seq_show, }; -static const struct bpf_iter_reg task_reg_info = { +BTF_ID_LIST(btf_task_file_ids) +BTF_ID(struct, task_struct) +BTF_ID(struct, file) + +static struct bpf_iter_reg task_reg_info = { .target = "task", .seq_ops = &task_seq_ops, .init_seq_private = init_seq_pidns, @@ -325,7 +330,7 @@ static const struct bpf_iter_reg task_reg_info = { }, }; -static const struct bpf_iter_reg task_file_reg_info = { +static struct bpf_iter_reg task_file_reg_info = { .target = "task_file", .seq_ops = &task_file_seq_ops, .init_seq_private = init_seq_pidns, @@ -344,10 +349,13 @@ static int __init task_iter_init(void) { int ret; + task_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0]; ret = bpf_iter_reg_target(&task_reg_info); if (ret) return ret; + task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0]; + task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1]; return bpf_iter_reg_target(&task_file_reg_info); } late_initcall(task_iter_init); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 116c11a0aaed..a7f1b41482f8 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -76,6 +76,7 @@ #include #include #include +#include #include #include @@ -2954,7 +2955,7 @@ static void bpf_iter_fini_tcp(void *priv_data) bpf_iter_fini_seq_net(priv_data); } -static const struct bpf_iter_reg tcp_reg_info = { +static struct bpf_iter_reg tcp_reg_info = { .target = "tcp", .seq_ops = &bpf_iter_tcp_seq_ops, .init_seq_private = bpf_iter_init_tcp, @@ -2969,6 +2970,7 @@ static const struct bpf_iter_reg tcp_reg_info = { static void __init bpf_iter_register(void) { + tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]; if (bpf_iter_reg_target(&tcp_reg_info)) pr_warn("Warning: could not register bpf iterator tcp\n"); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b738c63d7a77..b5231ab350e0 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -106,6 +106,7 @@ #include #include #include +#include #include #include #include "udp_impl.h" @@ -3232,7 +3233,7 @@ static void bpf_iter_fini_udp(void *priv_data) bpf_iter_fini_seq_net(priv_data); } -static const struct bpf_iter_reg udp_reg_info = { +static struct bpf_iter_reg udp_reg_info = { .target = "udp", .seq_ops = &bpf_iter_udp_seq_ops, .init_seq_private = bpf_iter_init_udp, @@ -3247,6 +3248,7 @@ static const struct bpf_iter_reg udp_reg_info = { static void __init bpf_iter_register(void) { + udp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UDP]; if (bpf_iter_reg_target(&udp_reg_info)) pr_warn("Warning: could not register bpf iterator udp\n"); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 427b81cbc164..33f5efbad0a9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -61,6 +61,7 @@ #include #include #include +#include #ifdef CONFIG_SYSCTL #include @@ -6423,7 +6424,10 @@ void __init ip6_route_init_special_entries(void) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) -static const struct bpf_iter_reg ipv6_route_reg_info = { +BTF_ID_LIST(btf_fib6_info_id) +BTF_ID(struct, fib6_info) + +static struct bpf_iter_reg ipv6_route_reg_info = { .target = "ipv6_route", .seq_ops = &ipv6_route_seq_ops, .init_seq_private = bpf_iter_init_seq_net, @@ -6438,6 +6442,7 @@ static const struct bpf_iter_reg ipv6_route_reg_info = { static int __init bpf_iter_register(void) { + ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id; return bpf_iter_reg_target(&ipv6_route_reg_info); } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 4f2c3b14ddbf..3cd58f0c2de4 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include @@ -2803,7 +2804,10 @@ static const struct rhashtable_params netlink_rhashtable_params = { }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) -static const struct bpf_iter_reg netlink_reg_info = { +BTF_ID_LIST(btf_netlink_sock_id) +BTF_ID(struct, netlink_sock) + +static struct bpf_iter_reg netlink_reg_info = { .target = "netlink", .seq_ops = &netlink_seq_ops, .init_seq_private = bpf_iter_init_seq_net, @@ -2818,6 +2822,7 @@ static const struct bpf_iter_reg netlink_reg_info = { static int __init bpf_iter_register(void) { + netlink_reg_info.ctx_arg_info[0].btf_id = *btf_netlink_sock_id; return bpf_iter_reg_target(&netlink_reg_info); } #endif -- cgit v1.2.3 From 58877d347b58c9e971112df5eb311c13bb0acb28 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Jul 2020 14:52:11 +0200 Subject: sched: Better document ttwu() Dave hit the problem fixed by commit: b6e13e85829f ("sched/core: Fix ttwu() race") and failed to understand much of the code involved. Per his request a few comments to (hopefully) clarify things. Requested-by: Dave Chinner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200702125211.GQ4800@hirez.programming.kicks-ass.net --- include/linux/sched.h | 12 ++-- kernel/sched/core.c | 188 +++++++++++++++++++++++++++++++++++++++++++------- kernel/sched/sched.h | 10 +++ 3 files changed, 179 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 12b10ce51a08..5033813fecd5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -154,24 +154,24 @@ struct task_group; * * for (;;) { * set_current_state(TASK_UNINTERRUPTIBLE); - * if (!need_sleep) - * break; + * if (CONDITION) + * break; * * schedule(); * } * __set_current_state(TASK_RUNNING); * * If the caller does not need such serialisation (because, for instance, the - * condition test and condition change and wakeup are under the same lock) then + * CONDITION test and condition change and wakeup are under the same lock) then * use __set_current_state(). * * The above is typically ordered against the wakeup, which does: * - * need_sleep = false; + * CONDITION = 1; * wake_up_state(p, TASK_UNINTERRUPTIBLE); * - * where wake_up_state() executes a full memory barrier before accessing the - * task state. + * where wake_up_state()/try_to_wake_up() executes a full memory barrier before + * accessing p->state. * * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is, * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 08d02ce26b71..12db8fbd9c97 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -79,6 +79,100 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; + +/* + * Serialization rules: + * + * Lock order: + * + * p->pi_lock + * rq->lock + * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls) + * + * rq1->lock + * rq2->lock where: rq1 < rq2 + * + * Regular state: + * + * Normal scheduling state is serialized by rq->lock. __schedule() takes the + * local CPU's rq->lock, it optionally removes the task from the runqueue and + * always looks at the local rq data structures to find the most elegible task + * to run next. + * + * Task enqueue is also under rq->lock, possibly taken from another CPU. + * Wakeups from another LLC domain might use an IPI to transfer the enqueue to + * the local CPU to avoid bouncing the runqueue state around [ see + * ttwu_queue_wakelist() ] + * + * Task wakeup, specifically wakeups that involve migration, are horribly + * complicated to avoid having to take two rq->locks. + * + * Special state: + * + * System-calls and anything external will use task_rq_lock() which acquires + * both p->pi_lock and rq->lock. As a consequence the state they change is + * stable while holding either lock: + * + * - sched_setaffinity()/ + * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed + * - set_user_nice(): p->se.load, p->*prio + * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio, + * p->se.load, p->rt_priority, + * p->dl.dl_{runtime, deadline, period, flags, bw, density} + * - sched_setnuma(): p->numa_preferred_nid + * - sched_move_task()/ + * cpu_cgroup_fork(): p->sched_task_group + * - uclamp_update_active() p->uclamp* + * + * p->state <- TASK_*: + * + * is changed locklessly using set_current_state(), __set_current_state() or + * set_special_state(), see their respective comments, or by + * try_to_wake_up(). This latter uses p->pi_lock to serialize against + * concurrent self. + * + * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }: + * + * is set by activate_task() and cleared by deactivate_task(), under + * rq->lock. Non-zero indicates the task is runnable, the special + * ON_RQ_MIGRATING state is used for migration without holding both + * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). + * + * p->on_cpu <- { 0, 1 }: + * + * is set by prepare_task() and cleared by finish_task() such that it will be + * set before p is scheduled-in and cleared after p is scheduled-out, both + * under rq->lock. Non-zero indicates the task is running on its CPU. + * + * [ The astute reader will observe that it is possible for two tasks on one + * CPU to have ->on_cpu = 1 at the same time. ] + * + * task_cpu(p): is changed by set_task_cpu(), the rules are: + * + * - Don't call set_task_cpu() on a blocked task: + * + * We don't care what CPU we're not running on, this simplifies hotplug, + * the CPU assignment of blocked tasks isn't required to be valid. + * + * - for try_to_wake_up(), called under p->pi_lock: + * + * This allows try_to_wake_up() to only take one rq->lock, see its comment. + * + * - for migration called under rq->lock: + * [ see task_on_rq_migrating() in task_rq_lock() ] + * + * o move_queued_task() + * o detach_task() + * + * - for migration called under double_rq_lock(): + * + * o __migrate_swap_task() + * o push_rt_task() / pull_rt_task() + * o push_dl_task() / pull_dl_task() + * o dl_task_offline_migration() + * + */ + /* * __task_rq_lock - lock the rq @p resides on. */ @@ -1543,8 +1637,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, { lockdep_assert_held(&rq->lock); - WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); - dequeue_task(rq, p, DEQUEUE_NOCLOCK); + deactivate_task(rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, new_cpu); rq_unlock(rq, rf); @@ -1552,8 +1645,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, rq_lock(rq, rf); BUG_ON(task_cpu(p) != new_cpu); - enqueue_task(rq, p, 0); - p->on_rq = TASK_ON_RQ_QUEUED; + activate_task(rq, p, 0); check_preempt_curr(rq, p, 0); return rq; @@ -2318,12 +2410,31 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, } /* - * Called in case the task @p isn't fully descheduled from its runqueue, - * in this case we must do a remote wakeup. Its a 'light' wakeup though, - * since all we need to do is flip p->state to TASK_RUNNING, since - * the task is still ->on_rq. + * Consider @p being inside a wait loop: + * + * for (;;) { + * set_current_state(TASK_UNINTERRUPTIBLE); + * + * if (CONDITION) + * break; + * + * schedule(); + * } + * __set_current_state(TASK_RUNNING); + * + * between set_current_state() and schedule(). In this case @p is still + * runnable, so all that needs doing is change p->state back to TASK_RUNNING in + * an atomic manner. + * + * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq + * then schedule() must still happen and p->state can be changed to + * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we + * need to do a full wakeup with enqueue. + * + * Returns: %true when the wakeup is done, + * %false otherwise. */ -static int ttwu_remote(struct task_struct *p, int wake_flags) +static int ttwu_runnable(struct task_struct *p, int wake_flags) { struct rq_flags rf; struct rq *rq; @@ -2464,6 +2575,14 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) return false; } + +#else /* !CONFIG_SMP */ + +static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) +{ + return false; +} + #endif /* CONFIG_SMP */ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) @@ -2471,10 +2590,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) struct rq *rq = cpu_rq(cpu); struct rq_flags rf; -#if defined(CONFIG_SMP) if (ttwu_queue_wakelist(p, cpu, wake_flags)) return; -#endif rq_lock(rq, &rf); update_rq_clock(rq); @@ -2530,8 +2647,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * migration. However the means are completely different as there is no lock * chain to provide order. Instead we do: * - * 1) smp_store_release(X->on_cpu, 0) - * 2) smp_cond_load_acquire(!X->on_cpu) + * 1) smp_store_release(X->on_cpu, 0) -- finish_task() + * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up() * * Example: * @@ -2571,15 +2688,33 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * @state: the mask of task states that can be woken * @wake_flags: wake modifier flags (WF_*) * - * If (@state & @p->state) @p->state = TASK_RUNNING. + * Conceptually does: + * + * If (@state & @p->state) @p->state = TASK_RUNNING. * * If the task was not queued/runnable, also place it back on a runqueue. * - * Atomic against schedule() which would dequeue a task, also see - * set_current_state(). + * This function is atomic against schedule() which would dequeue the task. + * + * It issues a full memory barrier before accessing @p->state, see the comment + * with set_current_state(). + * + * Uses p->pi_lock to serialize against concurrent wake-ups. * - * This function executes a full memory barrier before accessing the task - * state; see set_current_state(). + * Relies on p->pi_lock stabilizing: + * - p->sched_class + * - p->cpus_ptr + * - p->sched_task_group + * in order to do migration, see its use of select_task_rq()/set_task_cpu(). + * + * Tries really hard to only take one task_rq(p)->lock for performance. + * Takes rq->lock in: + * - ttwu_runnable() -- old rq, unavoidable, see comment there; + * - ttwu_queue() -- new rq, for enqueue of the task; + * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us. + * + * As a consequence we race really badly with just about everything. See the + * many memory barriers and their comments for details. * * Return: %true if @p->state changes (an actual wakeup was done), * %false otherwise. @@ -2595,7 +2730,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* * We're waking current, this means 'p->on_rq' and 'task_cpu(p) * == smp_processor_id()'. Together this means we can special - * case the whole 'p->on_rq && ttwu_remote()' case below + * case the whole 'p->on_rq && ttwu_runnable()' case below * without taking any locks. * * In particular: @@ -2616,8 +2751,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* * If we are going to wake up a thread waiting for CONDITION we * need to ensure that CONDITION=1 done by the caller can not be - * reordered with p->state check below. This pairs with mb() in - * set_current_state() the waiting thread does. + * reordered with p->state check below. This pairs with smp_store_mb() + * in set_current_state() that the waiting thread does. */ raw_spin_lock_irqsave(&p->pi_lock, flags); smp_mb__after_spinlock(); @@ -2652,7 +2787,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). */ smp_rmb(); - if (READ_ONCE(p->on_rq) && ttwu_remote(p, wake_flags)) + if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) goto unlock; if (p->in_iowait) { @@ -3222,8 +3357,10 @@ static inline void prepare_task(struct task_struct *next) /* * Claim the task as running, we do this before switching to it * such that any running task will have this set. + * + * See the ttwu() WF_ON_CPU case and its ordering comment. */ - next->on_cpu = 1; + WRITE_ONCE(next->on_cpu, 1); #endif } @@ -3231,8 +3368,9 @@ static inline void finish_task(struct task_struct *prev) { #ifdef CONFIG_SMP /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely + * This must be the very last reference to @prev from this CPU. After + * p->on_cpu is cleared, the task can be moved to a different CPU. We + * must ensure this doesn't happen until the switch is completely * finished. * * In particular, the load of prev->state in finish_task_switch() must diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 65b72e0487bf..9f33c77258ea 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1203,6 +1203,16 @@ struct rq_flags { #endif }; +/* + * Lockdep annotation that avoids accidental unlocks; it's like a + * sticky/continuous lockdep_assert_held(). + * + * This avoids code that has access to 'struct rq *rq' (basically everything in + * the scheduler) from accidentally unlocking the rq if they do not also have a + * copy of the (on-stack) 'struct rq_flags rf'. + * + * Also see Documentation/locking/lockdep-design.rst. + */ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) { rf->cookie = lockdep_pin_lock(&rq->lock); -- cgit v1.2.3 From 46132e3ac58cb2ee48051ed80bffc0070ad59b2e Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 1 Jul 2020 14:34:18 -0400 Subject: sched: nohz: stop passing around unused "ticks" parameter. The "ticks" parameter was added in commit 0f004f5a696a ("sched: Cure more NO_HZ load average woes") since calc_global_nohz() was called and needed the "ticks" argument. But in commit c308b56b5398 ("sched: Fix nohz load accounting -- again!") it became unused as the function calc_global_nohz() dropped using "ticks". Fixes: c308b56b5398 ("sched: Fix nohz load accounting -- again!") Signed-off-by: Paul Gortmaker Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593628458-32290-1-git-send-email-paul.gortmaker@windriver.com --- include/linux/sched/loadavg.h | 2 +- kernel/sched/loadavg.c | 2 +- kernel/time/timekeeping.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h index 4859bea47a7b..83ec54b65e79 100644 --- a/include/linux/sched/loadavg.h +++ b/include/linux/sched/loadavg.h @@ -43,6 +43,6 @@ extern unsigned long calc_load_n(unsigned long load, unsigned long exp, #define LOAD_INT(x) ((x) >> FSHIFT) #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) -extern void calc_global_load(unsigned long ticks); +extern void calc_global_load(void); #endif /* _LINUX_SCHED_LOADAVG_H */ diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index de22da666ac7..d2a655643a02 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -347,7 +347,7 @@ static inline void calc_global_nohz(void) { } * * Called from the global timer code. */ -void calc_global_load(unsigned long ticks) +void calc_global_load(void) { unsigned long sample_window; long active, delta; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d20d489841c8..63a632f9896c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2193,7 +2193,7 @@ EXPORT_SYMBOL(ktime_get_coarse_ts64); void do_timer(unsigned long ticks) { jiffies_64 += ticks; - calc_global_load(ticks); + calc_global_load(); } /** -- cgit v1.2.3 From 3edecfef028536cb19a120ec8788bd8a11f93b9e Mon Sep 17 00:00:00 2001 From: Peter Puhov Date: Tue, 14 Jul 2020 08:59:41 -0400 Subject: sched/fair: update_pick_idlest() Select group with lowest group_util when idle_cpus are equal In slow path, when selecting idlest group, if both groups have type group_has_spare, only idle_cpus count gets compared. As a result, if multiple tasks are created in a tight loop, and go back to sleep immediately (while waiting for all tasks to be created), they may be scheduled on the same core, because CPU is back to idle when the new fork happen. For example: sudo perf record -e sched:sched_wakeup_new -- \ sysbench threads --threads=4 run ... total number of events: 61582 ... sudo perf script sysbench 129378 [006] 74586.633466: sched:sched_wakeup_new: sysbench:129380 [120] success=1 CPU:007 sysbench 129378 [006] 74586.634718: sched:sched_wakeup_new: sysbench:129381 [120] success=1 CPU:007 sysbench 129378 [006] 74586.635957: sched:sched_wakeup_new: sysbench:129382 [120] success=1 CPU:007 sysbench 129378 [006] 74586.637183: sched:sched_wakeup_new: sysbench:129383 [120] success=1 CPU:007 This may have negative impact on performance for workloads with frequent creation of multiple threads. In this patch we are using group_util to select idlest group if both groups have equal number of idle_cpus. Comparing the number of idle cpu is not enough in this case, because the newly forked thread sleeps immediately and before we select the cpu for the next one. This is shown in the trace where the same CPU7 is selected for all wakeup_new events. That's why, looking at utilization when there is the same number of CPU is a good way to see where the previous task was placed. Using nr_running doesn't solve the problem because the newly forked task is not running and the cpu would not have been idle in this case and an idle CPU would have been selected instead. With this patch newly created tasks would be better distributed. With this patch: sudo perf record -e sched:sched_wakeup_new -- \ sysbench threads --threads=4 run ... total number of events: 74401 ... sudo perf script sysbench 129455 [006] 75232.853257: sched:sched_wakeup_new: sysbench:129457 [120] success=1 CPU:008 sysbench 129455 [006] 75232.854489: sched:sched_wakeup_new: sysbench:129458 [120] success=1 CPU:009 sysbench 129455 [006] 75232.855732: sched:sched_wakeup_new: sysbench:129459 [120] success=1 CPU:010 sysbench 129455 [006] 75232.856980: sched:sched_wakeup_new: sysbench:129460 [120] success=1 CPU:011 We tested this patch with following benchmarks: master: 'commit b3a9e3b9622a ("Linux 5.8-rc1")' 100 iterations of: perf bench -f simple futex wake -s -t 128 -w 1 Lower result is better | | BASELINE | +PATCH | DELTA (%) | |---------|------------|----------|-------------| | mean | 0.33 | 0.313 | +5.152 | | std (%) | 10.433 | 7.563 | | 100 iterations of: sysbench threads --threads=8 run Higher result is better | | BASELINE | +PATCH | DELTA (%) | |---------|------------|----------|-------------| | mean | 5235.02 | 5863.73 | +12.01 | | std (%) | 8.166 | 10.265 | | 100 iterations of: sysbench mutex --mutex-num=1 --threads=8 run Lower result is better | | BASELINE | +PATCH | DELTA (%) | |---------|------------|----------|-------------| | mean | 0.413 | 0.404 | +2.179 | | std (%) | 3.791 | 1.816 | | Signed-off-by: Peter Puhov Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200714125941.4174-1-peter.puhov@linaro.org --- kernel/sched/fair.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 98a53a2fe354..2ba8f230feb9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8711,8 +8711,14 @@ static bool update_pick_idlest(struct sched_group *idlest, case group_has_spare: /* Select group with most idle CPUs */ - if (idlest_sgs->idle_cpus >= sgs->idle_cpus) + if (idlest_sgs->idle_cpus > sgs->idle_cpus) return false; + + /* Select group with lowest group_util */ + if (idlest_sgs->idle_cpus == sgs->idle_cpus && + idlest_sgs->group_util <= sgs->group_util) + return false; + break; } -- cgit v1.2.3 From 589343569d7b58a64ec2446e6686c8e79aea7fcf Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 16 Jul 2020 15:04:57 +0800 Subject: smp: Fix a potential usage of stale nr_cpus The get_option() maybe return 0, it means that the nr_cpus is not initialized. Then we will use the stale nr_cpus to initialize the nr_cpu_ids. So fix it. Signed-off-by: Muchun Song Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200716070457.53255-1-songmuchun@bytedance.com --- kernel/smp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index aa17eedff5be..d0ae8eb6bf8b 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -634,8 +634,7 @@ static int __init nrcpus(char *str) { int nr_cpus; - get_option(&str, &nr_cpus); - if (nr_cpus > 0 && nr_cpus < nr_cpu_ids) + if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids) nr_cpu_ids = nr_cpus; return 0; -- cgit v1.2.3 From 25980c7a79af42f2daa73e2f475ebf4cbac8253e Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Sun, 12 Jul 2020 17:59:15 +0100 Subject: arch_topology, sched/core: Cleanup thermal pressure definition The following commit: 14533a16c46d ("thermal/cpu-cooling, sched/core: Move the arch_set_thermal_pressure() API to generic scheduler code") moved the definition of arch_set_thermal_pressure() to sched/core.c, but kept its declaration in linux/arch_topology.h. When building e.g. an x86 kernel with CONFIG_SCHED_THERMAL_PRESSURE=y, cpufreq_cooling.c ends up getting the declaration of arch_set_thermal_pressure() from include/linux/arch_topology.h, which is somewhat awkward. On top of this, sched/core.c unconditionally defines o The thermal_pressure percpu variable o arch_set_thermal_pressure() while arch_scale_thermal_pressure() does nothing unless redefined by the architecture. arch_*() functions are meant to be defined by architectures, so revert the aforementioned commit and re-implement it in a way that keeps arch_set_thermal_pressure() architecture-definable, and doesn't define the thermal pressure percpu variable for kernels that don't need it (CONFIG_SCHED_THERMAL_PRESSURE=n). Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200712165917.9168-2-valentin.schneider@arm.com --- arch/arm/include/asm/topology.h | 3 ++- arch/arm64/include/asm/topology.h | 3 ++- drivers/base/arch_topology.c | 11 +++++++++++ include/linux/arch_topology.h | 4 ++-- include/linux/sched/topology.h | 7 +++++++ kernel/sched/core.c | 11 ----------- 6 files changed, 24 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index 435aba289fc5..e0593cf095d0 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -16,8 +16,9 @@ /* Enable topology flag updates */ #define arch_update_cpu_topology topology_update_cpu_topology -/* Replace task scheduler's default thermal pressure retrieve API */ +/* Replace task scheduler's default thermal pressure API */ #define arch_scale_thermal_pressure topology_get_thermal_pressure +#define arch_set_thermal_pressure topology_set_thermal_pressure #else diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 0cc835ddfcd1..e042f6527981 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -34,8 +34,9 @@ void topology_scale_freq_tick(void); /* Enable topology flag updates */ #define arch_update_cpu_topology topology_update_cpu_topology -/* Replace task scheduler's default thermal pressure retrieve API */ +/* Replace task scheduler's default thermal pressure API */ #define arch_scale_thermal_pressure topology_get_thermal_pressure +#define arch_set_thermal_pressure topology_set_thermal_pressure #include diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 4d0a0038b476..75f72d684294 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -54,6 +54,17 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity) per_cpu(cpu_scale, cpu) = capacity; } +DEFINE_PER_CPU(unsigned long, thermal_pressure); + +void topology_set_thermal_pressure(const struct cpumask *cpus, + unsigned long th_pressure) +{ + int cpu; + + for_each_cpu(cpu, cpus) + WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); +} + static ssize_t cpu_capacity_show(struct device *dev, struct device_attribute *attr, char *buf) diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 0566cb3314ef..69b1dabe39dc 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -39,8 +39,8 @@ static inline unsigned long topology_get_thermal_pressure(int cpu) return per_cpu(thermal_pressure, cpu); } -void arch_set_thermal_pressure(struct cpumask *cpus, - unsigned long th_pressure); +void topology_set_thermal_pressure(const struct cpumask *cpus, + unsigned long th_pressure); struct cpu_topology { int thread_id; diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index fb11091129b3..764222d637b7 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -232,6 +232,13 @@ unsigned long arch_scale_thermal_pressure(int cpu) } #endif +#ifndef arch_set_thermal_pressure +static __always_inline +void arch_set_thermal_pressure(const struct cpumask *cpus, + unsigned long th_pressure) +{ } +#endif + static inline int task_node(const struct task_struct *p) { return cpu_to_node(task_cpu(p)); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 12db8fbd9c97..bd8e5211d31f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3869,17 +3869,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -DEFINE_PER_CPU(unsigned long, thermal_pressure); - -void arch_set_thermal_pressure(struct cpumask *cpus, - unsigned long th_pressure) -{ - int cpu; - - for_each_cpu(cpu, cpus) - WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); -} - /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. -- cgit v1.2.3 From bd25b4886ddcebec92591f298ce2ce345d7f2ea9 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 14 Jul 2020 16:13:51 -0400 Subject: padata: remove start function padata_start() is only used right after pcrypt allocates an instance with all possible CPUs, when PADATA_INVALID can't happen, so there's no need for a separate "start" step. It can be done during allocation to save text, make using padata easier, and avoid unneeded calls in the future. Signed-off-by: Daniel Jordan Cc: Herbert Xu Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- crypto/pcrypt.c | 3 --- include/linux/padata.h | 1 - kernel/padata.c | 26 +------------------------- 3 files changed, 1 insertion(+), 29 deletions(-) (limited to 'kernel') diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c index cbc383a1a3fe..fb9127ec5357 100644 --- a/crypto/pcrypt.c +++ b/crypto/pcrypt.c @@ -355,9 +355,6 @@ static int __init pcrypt_init(void) if (err) goto err_deinit_pencrypt; - padata_start(pencrypt); - padata_start(pdecrypt); - return crypto_register_template(&pcrypt_tmpl); err_deinit_pencrypt: diff --git a/include/linux/padata.h b/include/linux/padata.h index 7302efff5e65..20294cddc739 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -204,6 +204,5 @@ extern void padata_do_serial(struct padata_priv *padata); extern void __init padata_do_multithreaded(struct padata_mt_job *job); extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, cpumask_var_t cpumask); -extern int padata_start(struct padata_instance *pinst); extern void padata_stop(struct padata_instance *pinst); #endif diff --git a/kernel/padata.c b/kernel/padata.c index 4373f7adaa40..931762316612 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -789,30 +789,6 @@ out: } EXPORT_SYMBOL(padata_set_cpumask); -/** - * padata_start - start the parallel processing - * - * @pinst: padata instance to start - * - * Return: 0 on success or negative error code - */ -int padata_start(struct padata_instance *pinst) -{ - int err = 0; - - mutex_lock(&pinst->lock); - - if (pinst->flags & PADATA_INVALID) - err = -EINVAL; - - __padata_start(pinst); - - mutex_unlock(&pinst->lock); - - return err; -} -EXPORT_SYMBOL(padata_start); - /** * padata_stop - stop the parallel processing * @@ -1100,7 +1076,7 @@ static struct padata_instance *padata_alloc(const char *name, if (padata_setup_cpumasks(pinst)) goto err_free_rcpumask_cbcpu; - pinst->flags = 0; + __padata_start(pinst); kobject_init(&pinst->kobj, &padata_attr_type); mutex_init(&pinst->lock); -- cgit v1.2.3 From 350ef051d4edd884e8dea0be9f3685b4b32142fb Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 14 Jul 2020 16:13:52 -0400 Subject: padata: remove stop function padata_stop() has two callers and is unnecessary in both cases. When pcrypt calls it before padata_free(), it's being unloaded so there are no outstanding padata jobs[0]. When __padata_free() calls it, it's either along the same path or else pcrypt initialization failed, which of course means there are also no outstanding jobs. Removing it simplifies padata and saves text. [0] https://lore.kernel.org/linux-crypto/20191119225017.mjrak2fwa5vccazl@gondor.apana.org.au/ Signed-off-by: Daniel Jordan Cc: Herbert Xu Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- Documentation/core-api/padata.rst | 16 ++-------------- crypto/pcrypt.c | 12 +++--------- include/linux/padata.h | 1 - kernel/padata.c | 14 -------------- 4 files changed, 5 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/Documentation/core-api/padata.rst b/Documentation/core-api/padata.rst index 0830e5b0e821..771d50330e5b 100644 --- a/Documentation/core-api/padata.rst +++ b/Documentation/core-api/padata.rst @@ -31,18 +31,7 @@ padata_instance structure for overall control of how jobs are to be run:: 'name' simply identifies the instance. -There are functions for enabling and disabling the instance:: - - int padata_start(struct padata_instance *pinst); - void padata_stop(struct padata_instance *pinst); - -These functions are setting or clearing the "PADATA_INIT" flag; if that flag is -not set, other functions will refuse to work. padata_start() returns zero on -success (flag set) or -EINVAL if the padata cpumask contains no active CPU -(flag not set). padata_stop() clears the flag and blocks until the padata -instance is unused. - -Finally, complete padata initialization by allocating a padata_shell:: +Then, complete padata initialization by allocating a padata_shell:: struct padata_shell *padata_alloc_shell(struct padata_instance *pinst); @@ -155,11 +144,10 @@ submitted. Destroying ---------- -Cleaning up a padata instance predictably involves calling the three free +Cleaning up a padata instance predictably involves calling the two free functions that correspond to the allocation in reverse:: void padata_free_shell(struct padata_shell *ps); - void padata_stop(struct padata_instance *pinst); void padata_free(struct padata_instance *pinst); It is the user's responsibility to ensure all outstanding jobs are complete diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c index fb9127ec5357..2d4ac9d44902 100644 --- a/crypto/pcrypt.c +++ b/crypto/pcrypt.c @@ -327,12 +327,6 @@ static int pcrypt_init_padata(struct padata_instance **pinst, const char *name) return ret; } -static void pcrypt_fini_padata(struct padata_instance *pinst) -{ - padata_stop(pinst); - padata_free(pinst); -} - static struct crypto_template pcrypt_tmpl = { .name = "pcrypt", .create = pcrypt_create, @@ -358,7 +352,7 @@ static int __init pcrypt_init(void) return crypto_register_template(&pcrypt_tmpl); err_deinit_pencrypt: - pcrypt_fini_padata(pencrypt); + padata_free(pencrypt); err_unreg_kset: kset_unregister(pcrypt_kset); err: @@ -369,8 +363,8 @@ static void __exit pcrypt_exit(void) { crypto_unregister_template(&pcrypt_tmpl); - pcrypt_fini_padata(pencrypt); - pcrypt_fini_padata(pdecrypt); + padata_free(pencrypt); + padata_free(pdecrypt); kset_unregister(pcrypt_kset); } diff --git a/include/linux/padata.h b/include/linux/padata.h index 20294cddc739..7d53208b43da 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -204,5 +204,4 @@ extern void padata_do_serial(struct padata_priv *padata); extern void __init padata_do_multithreaded(struct padata_mt_job *job); extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, cpumask_var_t cpumask); -extern void padata_stop(struct padata_instance *pinst); #endif diff --git a/kernel/padata.c b/kernel/padata.c index 931762316612..8f55e717ba50 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -789,19 +789,6 @@ out: } EXPORT_SYMBOL(padata_set_cpumask); -/** - * padata_stop - stop the parallel processing - * - * @pinst: padata instance to stop - */ -void padata_stop(struct padata_instance *pinst) -{ - mutex_lock(&pinst->lock); - __padata_stop(pinst); - mutex_unlock(&pinst->lock); -} -EXPORT_SYMBOL(padata_stop); - #ifdef CONFIG_HOTPLUG_CPU static int __padata_add_cpu(struct padata_instance *pinst, int cpu) @@ -883,7 +870,6 @@ static void __padata_free(struct padata_instance *pinst) WARN_ON(!list_empty(&pinst->pslist)); - padata_stop(pinst); free_cpumask_var(pinst->rcpumask.cbcpu); free_cpumask_var(pinst->rcpumask.pcpu); free_cpumask_var(pinst->cpumask.pcpu); -- cgit v1.2.3 From cec00e6e120f4f0743d2bf0638418684c74b16e0 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 14 Jul 2020 16:13:53 -0400 Subject: padata: inline single call of pd_setup_cpumasks() pd_setup_cpumasks() has only one caller. Move its contents inline to prepare for the next cleanup. Signed-off-by: Daniel Jordan Cc: Herbert Xu Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 8f55e717ba50..27f90a3c4dc6 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -441,28 +441,6 @@ static int padata_setup_cpumasks(struct padata_instance *pinst) return err; } -static int pd_setup_cpumasks(struct parallel_data *pd, - const struct cpumask *pcpumask, - const struct cpumask *cbcpumask) -{ - int err = -ENOMEM; - - if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) - goto out; - if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) - goto free_pcpu_mask; - - cpumask_copy(pd->cpumask.pcpu, pcpumask); - cpumask_copy(pd->cpumask.cbcpu, cbcpumask); - - return 0; - -free_pcpu_mask: - free_cpumask_var(pd->cpumask.pcpu); -out: - return err; -} - static void __init padata_mt_helper(struct work_struct *w) { struct padata_work *pw = container_of(w, struct padata_work, pw_work); @@ -613,8 +591,14 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) goto err_free_pqueue; pd->ps = ps; - if (pd_setup_cpumasks(pd, pcpumask, cbcpumask)) + + if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) goto err_free_squeue; + if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) + goto err_free_pcpu; + + cpumask_copy(pd->cpumask.pcpu, pcpumask); + cpumask_copy(pd->cpumask.cbcpu, cbcpumask); padata_init_pqueues(pd); padata_init_squeues(pd); @@ -626,6 +610,8 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) return pd; +err_free_pcpu: + free_cpumask_var(pd->cpumask.pcpu); err_free_squeue: free_percpu(pd->squeue); err_free_pqueue: -- cgit v1.2.3 From d69e037bcc4a7e31fdd40ae416aa1bd768dd7d99 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 14 Jul 2020 16:13:54 -0400 Subject: padata: remove effective cpumasks from the instance A padata instance has effective cpumasks that store the user-supplied masks ANDed with the online mask, but this middleman is unnecessary. parallel_data keeps the same information around. Removing this saves text and code churn in future changes. Signed-off-by: Daniel Jordan Cc: Herbert Xu Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- include/linux/padata.h | 2 -- kernel/padata.c | 30 +++--------------------------- 2 files changed, 3 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/padata.h b/include/linux/padata.h index 7d53208b43da..a941b96b7119 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -167,7 +167,6 @@ struct padata_mt_job { * @serial_wq: The workqueue used for serial work. * @pslist: List of padata_shell objects attached to this instance. * @cpumask: User supplied cpumasks for parallel and serial works. - * @rcpumask: Actual cpumasks based on user cpumask and cpu_online_mask. * @kobj: padata instance kernel object. * @lock: padata instance lock. * @flags: padata flags. @@ -179,7 +178,6 @@ struct padata_instance { struct workqueue_struct *serial_wq; struct list_head pslist; struct padata_cpumask cpumask; - struct padata_cpumask rcpumask; struct kobject kobj; struct mutex lock; u8 flags; diff --git a/kernel/padata.c b/kernel/padata.c index 27f90a3c4dc6..4f0a57e5738c 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -571,13 +571,8 @@ static void padata_init_pqueues(struct parallel_data *pd) static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) { struct padata_instance *pinst = ps->pinst; - const struct cpumask *cbcpumask; - const struct cpumask *pcpumask; struct parallel_data *pd; - cbcpumask = pinst->rcpumask.cbcpu; - pcpumask = pinst->rcpumask.pcpu; - pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); if (!pd) goto err; @@ -597,8 +592,8 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) goto err_free_pcpu; - cpumask_copy(pd->cpumask.pcpu, pcpumask); - cpumask_copy(pd->cpumask.cbcpu, cbcpumask); + cpumask_and(pd->cpumask.pcpu, pinst->cpumask.pcpu, cpu_online_mask); + cpumask_and(pd->cpumask.cbcpu, pinst->cpumask.cbcpu, cpu_online_mask); padata_init_pqueues(pd); padata_init_squeues(pd); @@ -668,12 +663,6 @@ static int padata_replace(struct padata_instance *pinst) pinst->flags |= PADATA_RESET; - cpumask_and(pinst->rcpumask.pcpu, pinst->cpumask.pcpu, - cpu_online_mask); - - cpumask_and(pinst->rcpumask.cbcpu, pinst->cpumask.cbcpu, - cpu_online_mask); - list_for_each_entry(ps, &pinst->pslist, list) { err = padata_replace_one(ps); if (err) @@ -856,8 +845,6 @@ static void __padata_free(struct padata_instance *pinst) WARN_ON(!list_empty(&pinst->pslist)); - free_cpumask_var(pinst->rcpumask.cbcpu); - free_cpumask_var(pinst->rcpumask.pcpu); free_cpumask_var(pinst->cpumask.pcpu); free_cpumask_var(pinst->cpumask.cbcpu); destroy_workqueue(pinst->serial_wq); @@ -1033,20 +1020,13 @@ static struct padata_instance *padata_alloc(const char *name, !padata_validate_cpumask(pinst, cbcpumask)) goto err_free_masks; - if (!alloc_cpumask_var(&pinst->rcpumask.pcpu, GFP_KERNEL)) - goto err_free_masks; - if (!alloc_cpumask_var(&pinst->rcpumask.cbcpu, GFP_KERNEL)) - goto err_free_rcpumask_pcpu; - INIT_LIST_HEAD(&pinst->pslist); cpumask_copy(pinst->cpumask.pcpu, pcpumask); cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); - cpumask_and(pinst->rcpumask.pcpu, pcpumask, cpu_online_mask); - cpumask_and(pinst->rcpumask.cbcpu, cbcpumask, cpu_online_mask); if (padata_setup_cpumasks(pinst)) - goto err_free_rcpumask_cbcpu; + goto err_free_masks; __padata_start(pinst); @@ -1064,10 +1044,6 @@ static struct padata_instance *padata_alloc(const char *name, return pinst; -err_free_rcpumask_cbcpu: - free_cpumask_var(pinst->rcpumask.cbcpu); -err_free_rcpumask_pcpu: - free_cpumask_var(pinst->rcpumask.pcpu); err_free_masks: free_cpumask_var(pinst->cpumask.pcpu); free_cpumask_var(pinst->cpumask.cbcpu); -- cgit v1.2.3 From 3f257191d31d5eaf154ebdb696efc238837ddd51 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 14 Jul 2020 16:13:55 -0400 Subject: padata: fold padata_alloc_possible() into padata_alloc() There's no reason to have two interfaces when there's only one caller. Removing _possible saves text and simplifies future changes. Signed-off-by: Daniel Jordan Cc: Herbert Xu Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- Documentation/core-api/padata.rst | 2 +- crypto/pcrypt.c | 2 +- include/linux/padata.h | 2 +- kernel/padata.c | 33 +++++---------------------------- 4 files changed, 8 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/Documentation/core-api/padata.rst b/Documentation/core-api/padata.rst index 771d50330e5b..35175710b43c 100644 --- a/Documentation/core-api/padata.rst +++ b/Documentation/core-api/padata.rst @@ -27,7 +27,7 @@ padata_instance structure for overall control of how jobs are to be run:: #include - struct padata_instance *padata_alloc_possible(const char *name); + struct padata_instance *padata_alloc(const char *name); 'name' simply identifies the instance. diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c index 2d4ac9d44902..d569c7ed6c80 100644 --- a/crypto/pcrypt.c +++ b/crypto/pcrypt.c @@ -316,7 +316,7 @@ static int pcrypt_init_padata(struct padata_instance **pinst, const char *name) { int ret = -ENOMEM; - *pinst = padata_alloc_possible(name); + *pinst = padata_alloc(name); if (!*pinst) return ret; diff --git a/include/linux/padata.h b/include/linux/padata.h index a941b96b7119..070a7d43e8af 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -192,7 +192,7 @@ extern void __init padata_init(void); static inline void __init padata_init(void) {} #endif -extern struct padata_instance *padata_alloc_possible(const char *name); +extern struct padata_instance *padata_alloc(const char *name); extern void padata_free(struct padata_instance *pinst); extern struct padata_shell *padata_alloc_shell(struct padata_instance *pinst); extern void padata_free_shell(struct padata_shell *ps); diff --git a/kernel/padata.c b/kernel/padata.c index 4f0a57e5738c..1c0b97891edb 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -979,18 +979,12 @@ static struct kobj_type padata_attr_type = { }; /** - * padata_alloc - allocate and initialize a padata instance and specify - * cpumasks for serial and parallel workers. - * + * padata_alloc - allocate and initialize a padata instance * @name: used to identify the instance - * @pcpumask: cpumask that will be used for padata parallelization - * @cbcpumask: cpumask that will be used for padata serialization * * Return: new instance on success, NULL on error */ -static struct padata_instance *padata_alloc(const char *name, - const struct cpumask *pcpumask, - const struct cpumask *cbcpumask) +struct padata_instance *padata_alloc(const char *name) { struct padata_instance *pinst; @@ -1016,14 +1010,11 @@ static struct padata_instance *padata_alloc(const char *name, free_cpumask_var(pinst->cpumask.pcpu); goto err_free_serial_wq; } - if (!padata_validate_cpumask(pinst, pcpumask) || - !padata_validate_cpumask(pinst, cbcpumask)) - goto err_free_masks; INIT_LIST_HEAD(&pinst->pslist); - cpumask_copy(pinst->cpumask.pcpu, pcpumask); - cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); + cpumask_copy(pinst->cpumask.pcpu, cpu_possible_mask); + cpumask_copy(pinst->cpumask.cbcpu, cpu_possible_mask); if (padata_setup_cpumasks(pinst)) goto err_free_masks; @@ -1057,21 +1048,7 @@ err_free_inst: err: return NULL; } - -/** - * padata_alloc_possible - Allocate and initialize padata instance. - * Use the cpu_possible_mask for serial and - * parallel workers. - * - * @name: used to identify the instance - * - * Return: new instance on success, NULL on error - */ -struct padata_instance *padata_alloc_possible(const char *name) -{ - return padata_alloc(name, cpu_possible_mask, cpu_possible_mask); -} -EXPORT_SYMBOL(padata_alloc_possible); +EXPORT_SYMBOL(padata_alloc); /** * padata_free - free a padata instance -- cgit v1.2.3 From f601c725a6ac072608f47083e7c8115a3f504f95 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 14 Jul 2020 16:13:56 -0400 Subject: padata: remove padata_parallel_queue Only its reorder field is actually used now, so remove the struct and embed @reorder directly in parallel_data. No functional change, just a cleanup. Signed-off-by: Daniel Jordan Cc: Herbert Xu Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- include/linux/padata.h | 15 ++------------- kernel/padata.c | 46 ++++++++++++++++++++-------------------------- 2 files changed, 22 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/include/linux/padata.h b/include/linux/padata.h index 070a7d43e8af..a433f13fc4bf 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -66,17 +66,6 @@ struct padata_serial_queue { struct parallel_data *pd; }; -/** - * struct padata_parallel_queue - The percpu padata parallel queue - * - * @reorder: List to wait for reordering after parallel processing. - * @num_obj: Number of objects that are processed by this cpu. - */ -struct padata_parallel_queue { - struct padata_list reorder; - atomic_t num_obj; -}; - /** * struct padata_cpumask - The cpumasks for the parallel/serial workers * @@ -93,7 +82,7 @@ struct padata_cpumask { * that depends on the cpumask in use. * * @ps: padata_shell object. - * @pqueue: percpu padata queues used for parallelization. + * @reorder_list: percpu reorder lists * @squeue: percpu padata queues used for serialuzation. * @refcnt: Number of objects holding a reference on this parallel_data. * @seq_nr: Sequence number of the parallelized data object. @@ -105,7 +94,7 @@ struct padata_cpumask { */ struct parallel_data { struct padata_shell *ps; - struct padata_parallel_queue __percpu *pqueue; + struct padata_list __percpu *reorder_list; struct padata_serial_queue __percpu *squeue; atomic_t refcnt; unsigned int seq_nr; diff --git a/kernel/padata.c b/kernel/padata.c index 1c0b97891edb..16cb894dc272 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -250,13 +250,11 @@ EXPORT_SYMBOL(padata_do_parallel); static struct padata_priv *padata_find_next(struct parallel_data *pd, bool remove_object) { - struct padata_parallel_queue *next_queue; struct padata_priv *padata; struct padata_list *reorder; int cpu = pd->cpu; - next_queue = per_cpu_ptr(pd->pqueue, cpu); - reorder = &next_queue->reorder; + reorder = per_cpu_ptr(pd->reorder_list, cpu); spin_lock(&reorder->lock); if (list_empty(&reorder->list)) { @@ -291,7 +289,7 @@ static void padata_reorder(struct parallel_data *pd) int cb_cpu; struct padata_priv *padata; struct padata_serial_queue *squeue; - struct padata_parallel_queue *next_queue; + struct padata_list *reorder; /* * We need to ensure that only one cpu can work on dequeueing of @@ -339,9 +337,8 @@ static void padata_reorder(struct parallel_data *pd) */ smp_mb(); - next_queue = per_cpu_ptr(pd->pqueue, pd->cpu); - if (!list_empty(&next_queue->reorder.list) && - padata_find_next(pd, false)) + reorder = per_cpu_ptr(pd->reorder_list, pd->cpu); + if (!list_empty(&reorder->list) && padata_find_next(pd, false)) queue_work(pinst->serial_wq, &pd->reorder_work); } @@ -401,17 +398,16 @@ void padata_do_serial(struct padata_priv *padata) { struct parallel_data *pd = padata->pd; int hashed_cpu = padata_cpu_hash(pd, padata->seq_nr); - struct padata_parallel_queue *pqueue = per_cpu_ptr(pd->pqueue, - hashed_cpu); + struct padata_list *reorder = per_cpu_ptr(pd->reorder_list, hashed_cpu); struct padata_priv *cur; - spin_lock(&pqueue->reorder.lock); + spin_lock(&reorder->lock); /* Sort in ascending order of sequence number. */ - list_for_each_entry_reverse(cur, &pqueue->reorder.list, list) + list_for_each_entry_reverse(cur, &reorder->list, list) if (cur->seq_nr < padata->seq_nr) break; list_add(&padata->list, &cur->list); - spin_unlock(&pqueue->reorder.lock); + spin_unlock(&reorder->lock); /* * Ensure the addition to the reorder list is ordered correctly @@ -553,17 +549,15 @@ static void padata_init_squeues(struct parallel_data *pd) } } -/* Initialize all percpu queues used by parallel workers */ -static void padata_init_pqueues(struct parallel_data *pd) +/* Initialize per-CPU reorder lists */ +static void padata_init_reorder_list(struct parallel_data *pd) { int cpu; - struct padata_parallel_queue *pqueue; + struct padata_list *list; for_each_cpu(cpu, pd->cpumask.pcpu) { - pqueue = per_cpu_ptr(pd->pqueue, cpu); - - __padata_list_init(&pqueue->reorder); - atomic_set(&pqueue->num_obj, 0); + list = per_cpu_ptr(pd->reorder_list, cpu); + __padata_list_init(list); } } @@ -577,13 +571,13 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) if (!pd) goto err; - pd->pqueue = alloc_percpu(struct padata_parallel_queue); - if (!pd->pqueue) + pd->reorder_list = alloc_percpu(struct padata_list); + if (!pd->reorder_list) goto err_free_pd; pd->squeue = alloc_percpu(struct padata_serial_queue); if (!pd->squeue) - goto err_free_pqueue; + goto err_free_reorder_list; pd->ps = ps; @@ -595,7 +589,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) cpumask_and(pd->cpumask.pcpu, pinst->cpumask.pcpu, cpu_online_mask); cpumask_and(pd->cpumask.cbcpu, pinst->cpumask.cbcpu, cpu_online_mask); - padata_init_pqueues(pd); + padata_init_reorder_list(pd); padata_init_squeues(pd); pd->seq_nr = -1; atomic_set(&pd->refcnt, 1); @@ -609,8 +603,8 @@ err_free_pcpu: free_cpumask_var(pd->cpumask.pcpu); err_free_squeue: free_percpu(pd->squeue); -err_free_pqueue: - free_percpu(pd->pqueue); +err_free_reorder_list: + free_percpu(pd->reorder_list); err_free_pd: kfree(pd); err: @@ -621,7 +615,7 @@ static void padata_free_pd(struct parallel_data *pd) { free_cpumask_var(pd->cpumask.pcpu); free_cpumask_var(pd->cpumask.cbcpu); - free_percpu(pd->pqueue); + free_percpu(pd->reorder_list); free_percpu(pd->squeue); kfree(pd); } -- cgit v1.2.3 From 072e133d554bb74929f902582c8cf66d9fd12771 Mon Sep 17 00:00:00 2001 From: Peter Enderborg Date: Thu, 16 Jul 2020 09:15:10 +0200 Subject: tracefs: Remove unnecessary debug_fs checks. This is a preparation for debugfs restricted mode. We don't need debugfs to trace, the removed check stop tracefs to work if debugfs is not initialised. We instead tries to automount within debugfs and relay on it's handling. The code path is to create a backward compatibility from when tracefs was part of debugfs, it is now standalone and does not need debugfs. When debugfs is in restricted it is compiled in but not active and return EPERM to clients and tracefs wont work if it assumes it is active it is compiled in kernel. Reported-by: kernel test robot Signed-off-by: Peter Enderborg Reviewed-by: Greg Kroah-Hartman Acked-by: Steven Rostedt (VMware) Link: https://lore.kernel.org/r/20200716071511.26864-2-peter.enderborg@sony.com Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bb62269724d5..848f67a5f16d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8945,9 +8945,7 @@ struct dentry *tracing_init_dentry(void) if (tr->dir) return NULL; - if (WARN_ON(!tracefs_initialized()) || - (IS_ENABLED(CONFIG_DEBUG_FS) && - WARN_ON(!debugfs_initialized()))) + if (WARN_ON(!tracefs_initialized())) return ERR_PTR(-ENODEV); /* -- cgit v1.2.3 From 31cd0e119d50cf27ebe214d1a8f7ca36692f13a5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 23 Jul 2020 17:16:41 +0200 Subject: timers: Recalculate next timer interrupt only when necessary The nohz tick code recalculates the timer wheel's next expiry on each idle loop iteration. On the other hand, the base next expiry is now always cached and updated upon timer enqueue and execution. Only timer dequeue may leave base->next_expiry out of date (but then its stale value won't ever go past the actual next expiry to be recalculated). Since recalculating the next_expiry isn't a free operation, especially when the last wheel level is reached to find out that no timer has been enqueued at all, reuse the next expiry cache when it is known to be reliable, which it is most of the time. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200723151641.12236-1-frederic@kernel.org --- kernel/time/timer.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 77e21e98ec32..96d802e9769e 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -204,6 +204,7 @@ struct timer_base { unsigned long clk; unsigned long next_expiry; unsigned int cpu; + bool next_expiry_recalc; bool is_idle; DECLARE_BITMAP(pending_map, WHEEL_SIZE); struct hlist_head vectors[WHEEL_SIZE]; @@ -593,6 +594,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, * can reevaluate the wheel: */ base->next_expiry = bucket_expiry; + base->next_expiry_recalc = false; trigger_dyntick_cpu(base, timer); } } @@ -836,8 +838,10 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base, if (!timer_pending(timer)) return 0; - if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) + if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) { __clear_bit(idx, base->pending_map); + base->next_expiry_recalc = true; + } detach_timer(timer, clear_pending); return 1; @@ -1571,6 +1575,9 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) clk >>= LVL_CLK_SHIFT; clk += adj; } + + base->next_expiry_recalc = false; + return next; } @@ -1631,9 +1638,11 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) return expires; raw_spin_lock(&base->lock); - nextevt = __next_timer_interrupt(base); + if (base->next_expiry_recalc) + base->next_expiry = __next_timer_interrupt(base); + nextevt = base->next_expiry; is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); - base->next_expiry = nextevt; + /* * We have a fresh next event. Check whether we can forward the * base. We can only do that when @basej is past base->clk @@ -1725,6 +1734,12 @@ static inline void __run_timers(struct timer_base *base) while (time_after_eq(jiffies, base->clk) && time_after_eq(jiffies, base->next_expiry)) { levels = collect_expired_timers(base, heads); + /* + * The only possible reason for not finding any expired + * timer at this clk is that all matching timers have been + * dequeued. + */ + WARN_ON_ONCE(!levels && !base->next_expiry_recalc); base->clk++; base->next_expiry = __next_timer_interrupt(base); -- cgit v1.2.3 From 142781e108b13b2b0e8f035cfb5bfbbc8f14d887 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Jul 2020 23:59:56 +0200 Subject: entry: Provide generic syscall entry functionality On syscall entry certain work needs to be done: - Establish state (lockdep, context tracking, tracing) - Conditional work (ptrace, seccomp, audit...) This code is needlessly duplicated and different in all architectures. Provide a generic version based on the x86 implementation which has all the RCU and instrumentation bits right. As interrupt/exception entry from user space needs parts of the same functionality, provide a function for this as well. syscall_enter_from_user_mode() and irqentry_enter_from_user_mode() must be called right after the low level ASM entry. The calling code must be non-instrumentable. After the functions returns state is correct and the subsequent functions can be instrumented. Signed-off-by: Thomas Gleixner Acked-by: Kees Cook Link: https://lkml.kernel.org/r/20200722220519.513463269@linutronix.de --- arch/Kconfig | 3 ++ include/linux/entry-common.h | 121 +++++++++++++++++++++++++++++++++++++++++++ kernel/Makefile | 1 + kernel/entry/Makefile | 12 +++++ kernel/entry/common.c | 88 +++++++++++++++++++++++++++++++ 5 files changed, 225 insertions(+) create mode 100644 include/linux/entry-common.h create mode 100644 kernel/entry/Makefile create mode 100644 kernel/entry/common.c (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 8cc35dc556c7..852a527f418f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -27,6 +27,9 @@ config HAVE_IMA_KEXEC config HOTPLUG_SMT bool +config GENERIC_ENTRY + bool + config OPROFILE tristate "OProfile system profiling" depends on PROFILING diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h new file mode 100644 index 000000000000..42fc8e4632bb --- /dev/null +++ b/include/linux/entry-common.h @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_ENTRYCOMMON_H +#define __LINUX_ENTRYCOMMON_H + +#include +#include +#include +#include + +#include + +/* + * Define dummy _TIF work flags if not defined by the architecture or for + * disabled functionality. + */ +#ifndef _TIF_SYSCALL_EMU +# define _TIF_SYSCALL_EMU (0) +#endif + +#ifndef _TIF_SYSCALL_TRACEPOINT +# define _TIF_SYSCALL_TRACEPOINT (0) +#endif + +#ifndef _TIF_SECCOMP +# define _TIF_SECCOMP (0) +#endif + +#ifndef _TIF_SYSCALL_AUDIT +# define _TIF_SYSCALL_AUDIT (0) +#endif + +/* + * TIF flags handled in syscall_enter_from_usermode() + */ +#ifndef ARCH_SYSCALL_ENTER_WORK +# define ARCH_SYSCALL_ENTER_WORK (0) +#endif + +#define SYSCALL_ENTER_WORK \ + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \ + _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_EMU | \ + ARCH_SYSCALL_ENTER_WORK) + +/** + * arch_check_user_regs - Architecture specific sanity check for user mode regs + * @regs: Pointer to currents pt_regs + * + * Defaults to an empty implementation. Can be replaced by architecture + * specific code. + * + * Invoked from syscall_enter_from_user_mode() in the non-instrumentable + * section. Use __always_inline so the compiler cannot push it out of line + * and make it instrumentable. + */ +static __always_inline void arch_check_user_regs(struct pt_regs *regs); + +#ifndef arch_check_user_regs +static __always_inline void arch_check_user_regs(struct pt_regs *regs) {} +#endif + +/** + * arch_syscall_enter_tracehook - Wrapper around tracehook_report_syscall_entry() + * @regs: Pointer to currents pt_regs + * + * Returns: 0 on success or an error code to skip the syscall. + * + * Defaults to tracehook_report_syscall_entry(). Can be replaced by + * architecture specific code. + * + * Invoked from syscall_enter_from_user_mode() + */ +static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs); + +#ifndef arch_syscall_enter_tracehook +static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs) +{ + return tracehook_report_syscall_entry(regs); +} +#endif + +/** + * syscall_enter_from_user_mode - Check and handle work before invoking + * a syscall + * @regs: Pointer to currents pt_regs + * @syscall: The syscall number + * + * Invoked from architecture specific syscall entry code with interrupts + * disabled. The calling code has to be non-instrumentable. When the + * function returns all state is correct and the subsequent functions can be + * instrumented. + * + * Returns: The original or a modified syscall number + * + * If the returned syscall number is -1 then the syscall should be + * skipped. In this case the caller may invoke syscall_set_error() or + * syscall_set_return_value() first. If neither of those are called and -1 + * is returned, then the syscall will fail with ENOSYS. + * + * The following functionality is handled here: + * + * 1) Establish state (lockdep, RCU (context tracking), tracing) + * 2) TIF flag dependent invocations of arch_syscall_enter_tracehook(), + * __secure_computing(), trace_sys_enter() + * 3) Invocation of audit_syscall_entry() + */ +long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall); + +/** + * irqentry_enter_from_user_mode - Establish state before invoking the irq handler + * @regs: Pointer to currents pt_regs + * + * Invoked from architecture specific entry code with interrupts disabled. + * Can only be called when the interrupt entry came from user mode. The + * calling code must be non-instrumentable. When the function returns all + * state is correct and the subsequent functions can be instrumented. + * + * The function establishes state (lockdep, RCU (context tracking), tracing) + */ +void irqentry_enter_from_user_mode(struct pt_regs *regs); + +#endif diff --git a/kernel/Makefile b/kernel/Makefile index f3218bc5ec69..fde2000d0d0d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -48,6 +48,7 @@ obj-y += irq/ obj-y += rcu/ obj-y += livepatch/ obj-y += dma/ +obj-y += entry/ obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile new file mode 100644 index 000000000000..c207d202bf3a --- /dev/null +++ b/kernel/entry/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 + +# Prevent the noinstr section from being pestered by sanitizer and other goodies +# as long as these things cannot be disabled per function. +KASAN_SANITIZE := n +UBSAN_SANITIZE := n +KCOV_INSTRUMENT := n + +CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong +CFLAGS_common.o += -fno-stack-protector + +obj-$(CONFIG_GENERIC_ENTRY) += common.o diff --git a/kernel/entry/common.c b/kernel/entry/common.c new file mode 100644 index 000000000000..1d636dee2fec --- /dev/null +++ b/kernel/entry/common.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +#define CREATE_TRACE_POINTS +#include + +/** + * enter_from_user_mode - Establish state when coming from user mode + * + * Syscall/interrupt entry disables interrupts, but user mode is traced as + * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. + * + * 1) Tell lockdep that interrupts are disabled + * 2) Invoke context tracking if enabled to reactivate RCU + * 3) Trace interrupts off state + */ +static __always_inline void enter_from_user_mode(struct pt_regs *regs) +{ + arch_check_user_regs(regs); + lockdep_hardirqs_off(CALLER_ADDR0); + + CT_WARN_ON(ct_state() != CONTEXT_USER); + user_exit_irqoff(); + + instrumentation_begin(); + trace_hardirqs_off_finish(); + instrumentation_end(); +} + +static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) +{ + if (unlikely(audit_context())) { + unsigned long args[6]; + + syscall_get_arguments(current, regs, args); + audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]); + } +} + +static long syscall_trace_enter(struct pt_regs *regs, long syscall, + unsigned long ti_work) +{ + long ret = 0; + + /* Handle ptrace */ + if (ti_work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) { + ret = arch_syscall_enter_tracehook(regs); + if (ret || (ti_work & _TIF_SYSCALL_EMU)) + return -1L; + } + + /* Do seccomp after ptrace, to catch any tracer changes. */ + if (ti_work & _TIF_SECCOMP) { + ret = __secure_computing(NULL); + if (ret == -1L) + return ret; + } + + if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT)) + trace_sys_enter(regs, syscall); + + syscall_enter_audit(regs, syscall); + + return ret ? : syscall; +} + +noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) +{ + unsigned long ti_work; + + enter_from_user_mode(regs); + instrumentation_begin(); + + local_irq_enable(); + ti_work = READ_ONCE(current_thread_info()->flags); + if (ti_work & SYSCALL_ENTER_WORK) + syscall = syscall_trace_enter(regs, syscall, ti_work); + instrumentation_end(); + + return syscall; +} + +noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) +{ + enter_from_user_mode(regs); +} -- cgit v1.2.3 From a9f3a74a29af095f3e1b89e9176f8127912ae0f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Jul 2020 23:59:57 +0200 Subject: entry: Provide generic syscall exit function Like syscall entry all architectures have similar and pointlessly different code to handle pending work before returning from a syscall to user space. 1) One-time syscall exit work: - rseq syscall exit - audit - syscall tracing - tracehook (single stepping) 2) Preparatory work - Exit to user mode loop (common TIF handling). - Architecture specific one time work arch_exit_to_user_mode_prepare() - Address limit and lockdep checks 3) Final transition (lockdep, tracing, context tracking, RCU). Invokes arch_exit_to_user_mode() to handle e.g. speculation mitigations Provide a generic version based on the x86 code which has all the RCU and instrumentation protections right. Provide a variant for interrupt return to user mode as well which shares the above #2 and #3 work items. After syscall_exit_to_user_mode() and irqentry_exit_to_user_mode() the architecture code just has to return to user space. The code after returning from these functions must not be instrumented. Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Link: https://lkml.kernel.org/r/20200722220519.613977173@linutronix.de --- include/linux/entry-common.h | 189 +++++++++++++++++++++++++++++++++++++++++++ kernel/entry/common.c | 169 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 358 insertions(+) (limited to 'kernel') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 42fc8e4632bb..c4a57be9cde4 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -29,6 +29,14 @@ # define _TIF_SYSCALL_AUDIT (0) #endif +#ifndef _TIF_PATCH_PENDING +# define _TIF_PATCH_PENDING (0) +#endif + +#ifndef _TIF_UPROBE +# define _TIF_UPROBE (0) +#endif + /* * TIF flags handled in syscall_enter_from_usermode() */ @@ -41,6 +49,29 @@ _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_EMU | \ ARCH_SYSCALL_ENTER_WORK) +/* + * TIF flags handled in syscall_exit_to_user_mode() + */ +#ifndef ARCH_SYSCALL_EXIT_WORK +# define ARCH_SYSCALL_EXIT_WORK (0) +#endif + +#define SYSCALL_EXIT_WORK \ + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + _TIF_SYSCALL_TRACEPOINT | ARCH_SYSCALL_EXIT_WORK) + +/* + * TIF flags handled in exit_to_user_mode_loop() + */ +#ifndef ARCH_EXIT_TO_USER_MODE_WORK +# define ARCH_EXIT_TO_USER_MODE_WORK (0) +#endif + +#define EXIT_TO_USER_MODE_WORK \ + (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ + _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | \ + ARCH_EXIT_TO_USER_MODE_WORK) + /** * arch_check_user_regs - Architecture specific sanity check for user mode regs * @regs: Pointer to currents pt_regs @@ -105,6 +136,149 @@ static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs */ long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall); +/** + * local_irq_enable_exit_to_user - Exit to user variant of local_irq_enable() + * @ti_work: Cached TIF flags gathered with interrupts disabled + * + * Defaults to local_irq_enable(). Can be supplied by architecture specific + * code. + */ +static inline void local_irq_enable_exit_to_user(unsigned long ti_work); + +#ifndef local_irq_enable_exit_to_user +static inline void local_irq_enable_exit_to_user(unsigned long ti_work) +{ + local_irq_enable(); +} +#endif + +/** + * local_irq_disable_exit_to_user - Exit to user variant of local_irq_disable() + * + * Defaults to local_irq_disable(). Can be supplied by architecture specific + * code. + */ +static inline void local_irq_disable_exit_to_user(void); + +#ifndef local_irq_disable_exit_to_user +static inline void local_irq_disable_exit_to_user(void) +{ + local_irq_disable(); +} +#endif + +/** + * arch_exit_to_user_mode_work - Architecture specific TIF work for exit + * to user mode. + * @regs: Pointer to currents pt_regs + * @ti_work: Cached TIF flags gathered with interrupts disabled + * + * Invoked from exit_to_user_mode_loop() with interrupt enabled + * + * Defaults to NOOP. Can be supplied by architecture specific code. + */ +static inline void arch_exit_to_user_mode_work(struct pt_regs *regs, + unsigned long ti_work); + +#ifndef arch_exit_to_user_mode_work +static inline void arch_exit_to_user_mode_work(struct pt_regs *regs, + unsigned long ti_work) +{ +} +#endif + +/** + * arch_exit_to_user_mode_prepare - Architecture specific preparation for + * exit to user mode. + * @regs: Pointer to currents pt_regs + * @ti_work: Cached TIF flags gathered with interrupts disabled + * + * Invoked from exit_to_user_mode_prepare() with interrupt disabled as the last + * function before return. Defaults to NOOP. + */ +static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, + unsigned long ti_work); + +#ifndef arch_exit_to_user_mode_prepare +static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, + unsigned long ti_work) +{ +} +#endif + +/** + * arch_exit_to_user_mode - Architecture specific final work before + * exit to user mode. + * + * Invoked from exit_to_user_mode() with interrupt disabled as the last + * function before return. Defaults to NOOP. + * + * This needs to be __always_inline because it is non-instrumentable code + * invoked after context tracking switched to user mode. + * + * An architecture implementation must not do anything complex, no locking + * etc. The main purpose is for speculation mitigations. + */ +static __always_inline void arch_exit_to_user_mode(void); + +#ifndef arch_exit_to_user_mode +static __always_inline void arch_exit_to_user_mode(void) { } +#endif + +/** + * arch_do_signal - Architecture specific signal delivery function + * @regs: Pointer to currents pt_regs + * + * Invoked from exit_to_user_mode_loop(). + */ +void arch_do_signal(struct pt_regs *regs); + +/** + * arch_syscall_exit_tracehook - Wrapper around tracehook_report_syscall_exit() + * @regs: Pointer to currents pt_regs + * @step: Indicator for single step + * + * Defaults to tracehook_report_syscall_exit(). Can be replaced by + * architecture specific code. + * + * Invoked from syscall_exit_to_user_mode() + */ +static inline void arch_syscall_exit_tracehook(struct pt_regs *regs, bool step); + +#ifndef arch_syscall_exit_tracehook +static inline void arch_syscall_exit_tracehook(struct pt_regs *regs, bool step) +{ + tracehook_report_syscall_exit(regs, step); +} +#endif + +/** + * syscall_exit_to_user_mode - Handle work before returning to user mode + * @regs: Pointer to currents pt_regs + * + * Invoked with interrupts enabled and fully valid regs. Returns with all + * work handled, interrupts disabled such that the caller can immediately + * switch to user mode. Called from architecture specific syscall and ret + * from fork code. + * + * The call order is: + * 1) One-time syscall exit work: + * - rseq syscall exit + * - audit + * - syscall tracing + * - tracehook (single stepping) + * + * 2) Preparatory work + * - Exit to user mode loop (common TIF handling). Invokes + * arch_exit_to_user_mode_work() for architecture specific TIF work + * - Architecture specific one time work arch_exit_to_user_mode_prepare() + * - Address limit and lockdep checks + * + * 3) Final transition (lockdep, tracing, context tracking, RCU). Invokes + * arch_exit_to_user_mode() to handle e.g. speculation mitigations + */ +void syscall_exit_to_user_mode(struct pt_regs *regs); + /** * irqentry_enter_from_user_mode - Establish state before invoking the irq handler * @regs: Pointer to currents pt_regs @@ -118,4 +292,19 @@ long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall); */ void irqentry_enter_from_user_mode(struct pt_regs *regs); +/** + * irqentry_exit_to_user_mode - Interrupt exit work + * @regs: Pointer to current's pt_regs + * + * Invoked with interrupts disbled and fully valid regs. Returns with all + * work handled, interrupts disabled such that the caller can immediately + * switch to user mode. Called from architecture specific interrupt + * handling code. + * + * The call order is #2 and #3 as described in syscall_exit_to_user_mode(). + * Interrupt exit is not invoking #1 which is the syscall specific one time + * work. + */ +void irqentry_exit_to_user_mode(struct pt_regs *regs); + #endif diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 1d636dee2fec..0a051bb91a56 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -2,6 +2,8 @@ #include #include +#include +#include #define CREATE_TRACE_POINTS #include @@ -82,7 +84,174 @@ noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) return syscall; } +/** + * exit_to_user_mode - Fixup state when exiting to user mode + * + * Syscall/interupt exit enables interrupts, but the kernel state is + * interrupts disabled when this is invoked. Also tell RCU about it. + * + * 1) Trace interrupts on state + * 2) Invoke context tracking if enabled to adjust RCU state + * 3) Invoke architecture specific last minute exit code, e.g. speculation + * mitigations, etc. + * 4) Tell lockdep that interrupts are enabled + */ +static __always_inline void exit_to_user_mode(void) +{ + instrumentation_begin(); + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + instrumentation_end(); + + user_enter_irqoff(); + arch_exit_to_user_mode(); + lockdep_hardirqs_on(CALLER_ADDR0); +} + +/* Workaround to allow gradual conversion of architecture code */ +void __weak arch_do_signal(struct pt_regs *regs) { } + +static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work) +{ + /* + * Before returning to user space ensure that all pending work + * items have been completed. + */ + while (ti_work & EXIT_TO_USER_MODE_WORK) { + + local_irq_enable_exit_to_user(ti_work); + + if (ti_work & _TIF_NEED_RESCHED) + schedule(); + + if (ti_work & _TIF_UPROBE) + uprobe_notify_resume(regs); + + if (ti_work & _TIF_PATCH_PENDING) + klp_update_patch_state(current); + + if (ti_work & _TIF_SIGPENDING) + arch_do_signal(regs); + + if (ti_work & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + rseq_handle_notify_resume(NULL, regs); + } + + /* Architecture specific TIF work */ + arch_exit_to_user_mode_work(regs, ti_work); + + /* + * Disable interrupts and reevaluate the work flags as they + * might have changed while interrupts and preemption was + * enabled above. + */ + local_irq_disable_exit_to_user(); + ti_work = READ_ONCE(current_thread_info()->flags); + } + + /* Return the latest work state for arch_exit_to_user_mode() */ + return ti_work; +} + +static void exit_to_user_mode_prepare(struct pt_regs *regs) +{ + unsigned long ti_work = READ_ONCE(current_thread_info()->flags); + + lockdep_assert_irqs_disabled(); + + if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) + ti_work = exit_to_user_mode_loop(regs, ti_work); + + arch_exit_to_user_mode_prepare(regs, ti_work); + + /* Ensure that the address limit is intact and no locks are held */ + addr_limit_user_check(); + lockdep_assert_irqs_disabled(); + lockdep_sys_exit(); +} + +#ifndef _TIF_SINGLESTEP +static inline bool report_single_step(unsigned long ti_work) +{ + return false; +} +#else +/* + * If TIF_SYSCALL_EMU is set, then the only reason to report is when + * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall + * instruction has been already reported in syscall_enter_from_usermode(). + */ +#define SYSEMU_STEP (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU) + +static inline bool report_single_step(unsigned long ti_work) +{ + return (ti_work & SYSEMU_STEP) == _TIF_SINGLESTEP; +} +#endif + +static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work) +{ + bool step; + + audit_syscall_exit(regs); + + if (ti_work & _TIF_SYSCALL_TRACEPOINT) + trace_sys_exit(regs, syscall_get_return_value(current, regs)); + + step = report_single_step(ti_work); + if (step || ti_work & _TIF_SYSCALL_TRACE) + arch_syscall_exit_tracehook(regs, step); +} + +/* + * Syscall specific exit to user mode preparation. Runs with interrupts + * enabled. + */ +static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) +{ + u32 cached_flags = READ_ONCE(current_thread_info()->flags); + unsigned long nr = syscall_get_nr(current, regs); + + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); + + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { + if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr)) + local_irq_enable(); + } + + rseq_syscall(regs); + + /* + * Do one-time syscall specific work. If these work items are + * enabled, we want to run them exactly once per syscall exit with + * interrupts enabled. + */ + if (unlikely(cached_flags & SYSCALL_EXIT_WORK)) + syscall_exit_work(regs, cached_flags); +} + +__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) +{ + instrumentation_begin(); + syscall_exit_to_user_mode_prepare(regs); + local_irq_disable_exit_to_user(); + exit_to_user_mode_prepare(regs); + instrumentation_end(); + exit_to_user_mode(); +} + noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) { enter_from_user_mode(regs); } + +noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) +{ + instrumentation_begin(); + exit_to_user_mode_prepare(regs); + instrumentation_end(); + exit_to_user_mode(); +} -- cgit v1.2.3 From a5497bab5f72dce38a259a53fd3ac1239a7ecf40 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Jul 2020 23:59:58 +0200 Subject: entry: Provide generic interrupt entry/exit code Like the syscall entry/exit code interrupt/exception entry after the real low level ASM bits should not be different accross architectures. Provide a generic version based on the x86 code. irqentry_enter() is called after the low level entry code and irqentry_exit() must be invoked right before returning to the low level code which just contains the actual return logic. The code before irqentry_enter() and irqentry_exit() must not be instrumented. Code after irqentry_enter() and before irqentry_exit() can be instrumented. irqentry_enter() invokes irqentry_enter_from_user_mode() if the interrupt/exception came from user mode. If if entered from kernel mode it handles the kernel mode variant of establishing state for lockdep, RCU and tracing depending on the kernel context it interrupted (idle, non-idle). Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200722220519.723703209@linutronix.de --- include/linux/entry-common.h | 62 +++++++++++++++++++++++ kernel/entry/common.c | 117 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 179 insertions(+) (limited to 'kernel') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index c4a57be9cde4..efebbffcd5cc 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -307,4 +307,66 @@ void irqentry_enter_from_user_mode(struct pt_regs *regs); */ void irqentry_exit_to_user_mode(struct pt_regs *regs); +#ifndef irqentry_state +typedef struct irqentry_state { + bool exit_rcu; +} irqentry_state_t; +#endif + +/** + * irqentry_enter - Handle state tracking on ordinary interrupt entries + * @regs: Pointer to pt_regs of interrupted context + * + * Invokes: + * - lockdep irqflag state tracking as low level ASM entry disabled + * interrupts. + * + * - Context tracking if the exception hit user mode. + * + * - The hardirq tracer to keep the state consistent as low level ASM + * entry disabled interrupts. + * + * As a precondition, this requires that the entry came from user mode, + * idle, or a kernel context in which RCU is watching. + * + * For kernel mode entries RCU handling is done conditional. If RCU is + * watching then the only RCU requirement is to check whether the tick has + * to be restarted. If RCU is not watching then rcu_irq_enter() has to be + * invoked on entry and rcu_irq_exit() on exit. + * + * Avoiding the rcu_irq_enter/exit() calls is an optimization but also + * solves the problem of kernel mode pagefaults which can schedule, which + * is not possible after invoking rcu_irq_enter() without undoing it. + * + * For user mode entries irqentry_enter_from_user_mode() is invoked to + * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit + * would not be possible. + * + * Returns: An opaque object that must be passed to idtentry_exit() + */ +irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs); + +/** + * irqentry_exit_cond_resched - Conditionally reschedule on return from interrupt + * + * Conditional reschedule with additional sanity checks. + */ +void irqentry_exit_cond_resched(void); + +/** + * irqentry_exit - Handle return from exception that used irqentry_enter() + * @regs: Pointer to pt_regs (exception entry regs) + * @state: Return value from matching call to irqentry_enter() + * + * Depending on the return target (kernel/user) this runs the necessary + * preemption and work checks if possible and reguired and returns to + * the caller with interrupts disabled and no further work pending. + * + * This is the last action before returning to the low level ASM code which + * just needs to return to the appropriate context. + * + * Counterpart to irqentry_enter(). + */ +void noinstr irqentry_exit(struct pt_regs *regs, irqentry_state_t state); + #endif diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 0a051bb91a56..495f5c051b03 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -255,3 +255,120 @@ noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) instrumentation_end(); exit_to_user_mode(); } + +irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs) +{ + irqentry_state_t ret = { + .exit_rcu = false, + }; + + if (user_mode(regs)) { + irqentry_enter_from_user_mode(regs); + return ret; + } + + /* + * If this entry hit the idle task invoke rcu_irq_enter() whether + * RCU is watching or not. + * + * Interupts can nest when the first interrupt invokes softirq + * processing on return which enables interrupts. + * + * Scheduler ticks in the idle task can mark quiescent state and + * terminate a grace period, if and only if the timer interrupt is + * not nested into another interrupt. + * + * Checking for __rcu_is_watching() here would prevent the nesting + * interrupt to invoke rcu_irq_enter(). If that nested interrupt is + * the tick then rcu_flavor_sched_clock_irq() would wrongfully + * assume that it is the first interupt and eventually claim + * quiescient state and end grace periods prematurely. + * + * Unconditionally invoke rcu_irq_enter() so RCU state stays + * consistent. + * + * TINY_RCU does not support EQS, so let the compiler eliminate + * this part when enabled. + */ + if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { + /* + * If RCU is not watching then the same careful + * sequence vs. lockdep and tracing is required + * as in irq_enter_from_user_mode(). + */ + lockdep_hardirqs_off(CALLER_ADDR0); + rcu_irq_enter(); + instrumentation_begin(); + trace_hardirqs_off_finish(); + instrumentation_end(); + + ret.exit_rcu = true; + return ret; + } + + /* + * If RCU is watching then RCU only wants to check whether it needs + * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick() + * already contains a warning when RCU is not watching, so no point + * in having another one here. + */ + instrumentation_begin(); + rcu_irq_enter_check_tick(); + /* Use the combo lockdep/tracing function */ + trace_hardirqs_off(); + instrumentation_end(); + + return ret; +} + +void irqentry_exit_cond_resched(void) +{ + if (!preempt_count()) { + /* Sanity check RCU and thread stack */ + rcu_irq_exit_check_preempt(); + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + WARN_ON_ONCE(!on_thread_stack()); + if (need_resched()) + preempt_schedule_irq(); + } +} + +void noinstr irqentry_exit(struct pt_regs *regs, irqentry_state_t state) +{ + lockdep_assert_irqs_disabled(); + + /* Check whether this returns to user mode */ + if (user_mode(regs)) { + irqentry_exit_to_user_mode(regs); + } else if (!regs_irqs_disabled(regs)) { + /* + * If RCU was not watching on entry this needs to be done + * carefully and needs the same ordering of lockdep/tracing + * and RCU as the return to user mode path. + */ + if (state.exit_rcu) { + instrumentation_begin(); + /* Tell the tracer that IRET will enable interrupts */ + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + instrumentation_end(); + rcu_irq_exit(); + lockdep_hardirqs_on(CALLER_ADDR0); + return; + } + + instrumentation_begin(); + if (IS_ENABLED(CONFIG_PREEMPTION)) + irqentry_exit_cond_resched(); + /* Covers both tracing and lockdep */ + trace_hardirqs_on(); + instrumentation_end(); + } else { + /* + * IRQ flags state is correct already. Just tell RCU if it + * was not watching on entry. + */ + if (state.exit_rcu) + rcu_irq_exit(); + } +} -- cgit v1.2.3 From 935ace2fb5cc49ae88bd1f1735ddc51cdc2ebfb3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Jul 2020 23:59:59 +0200 Subject: entry: Provide infrastructure for work before transitioning to guest mode Entering a guest is similar to exiting to user space. Pending work like handling signals, rescheduling, task work etc. needs to be handled before that. Provide generic infrastructure to avoid duplication of the same handling code all over the place. The transfer to guest mode handling is different from the exit to usermode handling, e.g. vs. rseq and live patching, so a separate function is used. The initial list of work items handled is: TIF_SIGPENDING, TIF_NEED_RESCHED, TIF_NOTIFY_RESUME Architecture specific TIF flags can be added via defines in the architecture specific include files. The calling convention is also different from the syscall/interrupt entry functions as KVM invokes this from the outer vcpu_run() loop with interrupts and preemption enabled. To prevent missing a pending work item it invokes a check for pending TIF work from interrupt disabled code right before transitioning to guest mode. The lockdep, RCU and tracing state handling is also done directly around the switch to and from guest mode. Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200722220519.833296398@linutronix.de --- include/linux/entry-kvm.h | 80 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/kvm_host.h | 8 +++++ kernel/entry/Makefile | 3 +- kernel/entry/kvm.c | 51 ++++++++++++++++++++++++++++++ virt/kvm/Kconfig | 3 ++ 5 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 include/linux/entry-kvm.h create mode 100644 kernel/entry/kvm.c (limited to 'kernel') diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h new file mode 100644 index 000000000000..0cef17afb41a --- /dev/null +++ b/include/linux/entry-kvm.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_ENTRYKVM_H +#define __LINUX_ENTRYKVM_H + +#include + +/* Transfer to guest mode work */ +#ifdef CONFIG_KVM_XFER_TO_GUEST_WORK + +#ifndef ARCH_XFER_TO_GUEST_MODE_WORK +# define ARCH_XFER_TO_GUEST_MODE_WORK (0) +#endif + +#define XFER_TO_GUEST_MODE_WORK \ + (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ + _TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK) + +struct kvm_vcpu; + +/** + * arch_xfer_to_guest_mode_handle_work - Architecture specific xfer to guest + * mode work handling function. + * @vcpu: Pointer to current's VCPU data + * @ti_work: Cached TIF flags gathered in xfer_to_guest_mode_handle_work() + * + * Invoked from xfer_to_guest_mode_handle_work(). Defaults to NOOP. Can be + * replaced by architecture specific code. + */ +static inline int arch_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu, + unsigned long ti_work); + +#ifndef arch_xfer_to_guest_mode_work +static inline int arch_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu, + unsigned long ti_work) +{ + return 0; +} +#endif + +/** + * xfer_to_guest_mode_handle_work - Check and handle pending work which needs + * to be handled before going to guest mode + * @vcpu: Pointer to current's VCPU data + * + * Returns: 0 or an error code + */ +int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu); + +/** + * __xfer_to_guest_mode_work_pending - Check if work is pending + * + * Returns: True if work pending, False otherwise. + * + * Bare variant of xfer_to_guest_mode_work_pending(). Can be called from + * interrupt enabled code for racy quick checks with care. + */ +static inline bool __xfer_to_guest_mode_work_pending(void) +{ + unsigned long ti_work = READ_ONCE(current_thread_info()->flags); + + return !!(ti_work & XFER_TO_GUEST_MODE_WORK); +} + +/** + * xfer_to_guest_mode_work_pending - Check if work is pending which needs to be + * handled before returning to guest mode + * + * Returns: True if work pending, False otherwise. + * + * Has to be invoked with interrupts disabled before the transition to + * guest mode. + */ +static inline bool xfer_to_guest_mode_work_pending(void) +{ + lockdep_assert_irqs_disabled(); + return __xfer_to_guest_mode_work_pending(); +} +#endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */ + +#endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d564855243d8..ac83e9c1d82c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1439,4 +1439,12 @@ int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn, uintptr_t data, const char *name, struct task_struct **thread_ptr); +#ifdef CONFIG_KVM_XFER_TO_GUEST_WORK +static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu) +{ + vcpu->run->exit_reason = KVM_EXIT_INTR; + vcpu->stat.signal_exits++; +} +#endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */ + #endif diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile index c207d202bf3a..34c8a3f1c735 100644 --- a/kernel/entry/Makefile +++ b/kernel/entry/Makefile @@ -9,4 +9,5 @@ KCOV_INSTRUMENT := n CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong CFLAGS_common.o += -fno-stack-protector -obj-$(CONFIG_GENERIC_ENTRY) += common.o +obj-$(CONFIG_GENERIC_ENTRY) += common.o +obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c new file mode 100644 index 000000000000..eb1a8a4c867c --- /dev/null +++ b/kernel/entry/kvm.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) +{ + do { + int ret; + + if (ti_work & _TIF_SIGPENDING) { + kvm_handle_signal_exit(vcpu); + return -EINTR; + } + + if (ti_work & _TIF_NEED_RESCHED) + schedule(); + + if (ti_work & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(NULL); + } + + ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work); + if (ret) + return ret; + + ti_work = READ_ONCE(current_thread_info()->flags); + } while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched()); + return 0; +} + +int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu) +{ + unsigned long ti_work; + + /* + * This is invoked from the outer guest loop with interrupts and + * preemption enabled. + * + * KVM invokes xfer_to_guest_mode_work_pending() with interrupts + * disabled in the inner loop before going into guest mode. No need + * to disable interrupts here. + */ + ti_work = READ_ONCE(current_thread_info()->flags); + if (!(ti_work & XFER_TO_GUEST_MODE_WORK)) + return 0; + + return xfer_to_guest_mode_work(vcpu, ti_work); +} +EXPORT_SYMBOL_GPL(xfer_to_guest_mode_handle_work); diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index aad9284c043a..1c37ccd5d402 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -60,3 +60,6 @@ config HAVE_KVM_VCPU_RUN_PID_CHANGE config HAVE_KVM_NO_POLL bool + +config KVM_XFER_TO_GUEST_WORK + bool -- cgit v1.2.3 From e5ebffe18e5add85acb23c7a0f74509749967204 Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Sun, 19 Jul 2020 17:10:45 -0600 Subject: dyndbg: rename __verbose section to __dyndbg dyndbg populates its callsite info into __verbose section, change that to a more specific and descriptive name, __dyndbg. Also, per checkpatch: simplify __attribute(..) to __section(__dyndbg) declaration. and 1 spelling fix, decriptor Acked-by: Signed-off-by: Jim Cromie Link: https://lore.kernel.org/r/20200719231058.1586423-6-jim.cromie@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/asm-generic/vmlinux.lds.h | 6 +++--- include/linux/dynamic_debug.h | 4 ++-- kernel/module.c | 2 +- lib/dynamic_debug.c | 12 ++++++------ 4 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index db600ef218d7..05af5cef1ad6 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -320,9 +320,9 @@ *(__tracepoints) \ /* implement dynamic printk debug */ \ . = ALIGN(8); \ - __start___verbose = .; \ - KEEP(*(__verbose)) \ - __stop___verbose = .; \ + __start___dyndbg = .; \ + KEEP(*(__dyndbg)) \ + __stop___dyndbg = .; \ LIKELY_PROFILE() \ BRANCH_PROFILE() \ TRACE_PRINTKS() \ diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index abcd5fde30eb..aa9ff9e1c0b3 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -80,7 +80,7 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, #define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) \ static struct _ddebug __aligned(8) \ - __attribute__((section("__verbose"))) name = { \ + __section(__dyndbg) name = { \ .modname = KBUILD_MODNAME, \ .function = __func__, \ .filename = __FILE__, \ @@ -133,7 +133,7 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, /* * "Factory macro" for generating a call to func, guarded by a - * DYNAMIC_DEBUG_BRANCH. The dynamic debug decriptor will be + * DYNAMIC_DEBUG_BRANCH. The dynamic debug descriptor will be * initialized using the fmt argument. The function will be called with * the address of the descriptor as first argument, followed by all * the varargs. Note that fmt is repeated in invocations of this diff --git a/kernel/module.c b/kernel/module.c index aa183c9ac0a2..e7b4ff7e4fd0 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3237,7 +3237,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) if (section_addr(info, "__obsparm")) pr_warn("%s: Ignoring obsolete parameters\n", mod->name); - info->debug = section_objs(info, "__verbose", + info->debug = section_objs(info, "__dyndbg", sizeof(*info->debug), &info->num_debug); return 0; diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index c97872cffc8e..66c0bdf06ce7 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -39,8 +39,8 @@ #include -extern struct _ddebug __start___verbose[]; -extern struct _ddebug __stop___verbose[]; +extern struct _ddebug __start___dyndbg[]; +extern struct _ddebug __stop___dyndbg[]; struct ddebug_table { struct list_head link; @@ -1019,7 +1019,7 @@ static int __init dynamic_debug_init(void) int n = 0, entries = 0, modct = 0; int verbose_bytes = 0; - if (&__start___verbose == &__stop___verbose) { + if (&__start___dyndbg == &__stop___dyndbg) { if (IS_ENABLED(CONFIG_DYNAMIC_DEBUG)) { pr_warn("_ddebug table is empty in a CONFIG_DYNAMIC_DEBUG build\n"); return 1; @@ -1028,10 +1028,10 @@ static int __init dynamic_debug_init(void) ddebug_init_success = 1; return 0; } - iter = __start___verbose; + iter = __start___dyndbg; modname = iter->modname; iter_start = iter; - for (; iter < __stop___verbose; iter++) { + for (; iter < __stop___dyndbg; iter++) { entries++; verbose_bytes += strlen(iter->modname) + strlen(iter->function) + strlen(iter->filename) + strlen(iter->format); @@ -1054,7 +1054,7 @@ static int __init dynamic_debug_init(void) ddebug_init_success = 1; vpr_info("%d modules, %d entries and %d bytes in ddebug tables, %d bytes in (readonly) verbose section\n", modct, entries, (int)(modct * sizeof(struct ddebug_table)), - verbose_bytes + (int)(__stop___verbose - __start___verbose)); + verbose_bytes + (int)(__stop___dyndbg - __start___dyndbg)); /* apply ddebug_query boot param, dont unload tables on err */ if (ddebug_setup_string[0] != '\0') { -- cgit v1.2.3 From 13efa616124f7eec7d6a58adeeef31864aa03879 Mon Sep 17 00:00:00 2001 From: Qinglang Miao Date: Sat, 25 Jul 2020 16:56:29 +0800 Subject: sched/uclamp: Remove unnecessary mutex_init() The uclamp_mutex lock is initialized statically via DEFINE_MUTEX(), it is unnecessary to initialize it runtime via mutex_init(). Signed-off-by: Qinglang Miao Signed-off-by: Ingo Molnar Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Dietmar Eggemann Link: https://lore.kernel.org/r/20200725085629.98292-1-miaoqinglang@huawei.com --- kernel/sched/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bd8e5211d31f..678253450ebb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1425,8 +1425,6 @@ static void __init init_uclamp(void) enum uclamp_id clamp_id; int cpu; - mutex_init(&uclamp_mutex); - for_each_possible_cpu(cpu) init_uclamp_rq(cpu_rq(cpu)); -- cgit v1.2.3 From a7ef9b28aa8d72a1656fa6f0a01bbd1493886317 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Sat, 25 Jul 2020 19:51:10 +0100 Subject: locking/lockdep: Fix overflow in presentation of average lock-time Though the number of lock-acquisitions is tracked as unsigned long, this is passed as the divisor to div_s64() which interprets it as a s32, giving nonsense values with more than 2 billion acquisitons. E.g. acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg ------------------------------------------------------------------------- 2350439395 0.07 353.38 649647067.36 0.-32 Signed-off-by: Chris Wilson Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20200725185110.11588-1-chris@chris-wilson.co.uk --- kernel/locking/lockdep_proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 5525cd3ba0c8..02ef87f50df2 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -423,7 +423,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt) seq_time(m, lt->min); seq_time(m, lt->max); seq_time(m, lt->total); - seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0); + seq_time(m, lt->nr ? div64_u64(lt->total, lt->nr) : 0); } static void seq_stats(struct seq_file *m, struct lock_stat_data *data) -- cgit v1.2.3 From 3f9969f2c040ba2ba635b6b5a7051f404bcc634d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 22 Jul 2020 12:51:56 -0700 Subject: bpf: Fix pos computation for bpf_iter seq_ops->start() Currently, the pos pointer in bpf iterator map/task/task_file seq_ops->start() is always incremented. This is incorrect. It should be increased only if *pos is 0 (for SEQ_START_TOKEN) since these start() function actually returns the first real object. If *pos is not 0, it merely found the object based on the state in seq->private, and not really advancing the *pos. This patch fixed this issue by only incrementing *pos if it is 0. Note that the old *pos calculation, although not correct, does not affect correctness of bpf_iter as bpf_iter seq_file->read() does not support llseek. This patch also renamed "mid" in bpf_map iterator seq_file private data to "map_id" for better clarity. Fixes: 6086d29def80 ("bpf: Add bpf_map iterator") Fixes: eaaacd23910f ("bpf: Add task and task/file iterator targets") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200722195156.4029817-1-yhs@fb.com --- kernel/bpf/map_iter.c | 16 ++++++---------- kernel/bpf/task_iter.c | 6 ++++-- 2 files changed, 10 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 8a7af11b411f..5926c76d854e 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -7,7 +7,7 @@ #include struct bpf_iter_seq_map_info { - u32 mid; + u32 map_id; }; static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos) @@ -15,27 +15,23 @@ static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos) struct bpf_iter_seq_map_info *info = seq->private; struct bpf_map *map; - map = bpf_map_get_curr_or_next(&info->mid); + map = bpf_map_get_curr_or_next(&info->map_id); if (!map) return NULL; - ++*pos; + if (*pos == 0) + ++*pos; return map; } static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_iter_seq_map_info *info = seq->private; - struct bpf_map *map; ++*pos; - ++info->mid; + ++info->map_id; bpf_map_put((struct bpf_map *)v); - map = bpf_map_get_curr_or_next(&info->mid); - if (!map) - return NULL; - - return map; + return bpf_map_get_curr_or_next(&info->map_id); } struct bpf_iter__bpf_map { diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 2feecf095609..1039e52ebd8b 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -51,7 +51,8 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos) if (!task) return NULL; - ++*pos; + if (*pos == 0) + ++*pos; return task; } @@ -210,7 +211,8 @@ static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) return NULL; } - ++*pos; + if (*pos == 0) + ++*pos; info->task = task; info->files = files; -- cgit v1.2.3 From a228a64fc1e4428e2b96dc68e9ad3c447095c9e7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Jul 2020 18:10:18 -0700 Subject: bpf: Add bpf_prog iterator It's mostly a copy paste of commit 6086d29def80 ("bpf: Add bpf_map iterator") that is use to implement bpf_seq_file opreations to traverse all bpf programs. v1->v2: Tweak to use build time btf_id Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Daniel Borkmann --- include/linux/bpf.h | 1 + kernel/bpf/Makefile | 2 +- kernel/bpf/prog_iter.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 19 +++++++++ 4 files changed, 124 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/prog_iter.c (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bae557ff2da8..72221aea1c60 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1117,6 +1117,7 @@ int generic_map_delete_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); struct bpf_map *bpf_map_get_curr_or_next(u32 *id); +struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); extern int sysctl_unprivileged_bpf_disabled; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 1131a921e1a6..e6eb9c0402da 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,7 +2,7 @@ obj-y := core.o CFLAGS_core.o += $(call cc-disable-warning, override-init) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o diff --git a/kernel/bpf/prog_iter.c b/kernel/bpf/prog_iter.c new file mode 100644 index 000000000000..6541b577d69f --- /dev/null +++ b/kernel/bpf/prog_iter.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ +#include +#include +#include +#include +#include + +struct bpf_iter_seq_prog_info { + u32 prog_id; +}; + +static void *bpf_prog_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_prog_info *info = seq->private; + struct bpf_prog *prog; + + prog = bpf_prog_get_curr_or_next(&info->prog_id); + if (!prog) + return NULL; + + if (*pos == 0) + ++*pos; + return prog; +} + +static void *bpf_prog_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_prog_info *info = seq->private; + + ++*pos; + ++info->prog_id; + bpf_prog_put((struct bpf_prog *)v); + return bpf_prog_get_curr_or_next(&info->prog_id); +} + +struct bpf_iter__bpf_prog { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_prog *, prog); +}; + +DEFINE_BPF_ITER_FUNC(bpf_prog, struct bpf_iter_meta *meta, struct bpf_prog *prog) + +static int __bpf_prog_seq_show(struct seq_file *seq, void *v, bool in_stop) +{ + struct bpf_iter__bpf_prog ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + ctx.meta = &meta; + ctx.prog = v; + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + return ret; +} + +static int bpf_prog_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_prog_seq_show(seq, v, false); +} + +static void bpf_prog_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)__bpf_prog_seq_show(seq, v, true); + else + bpf_prog_put((struct bpf_prog *)v); +} + +static const struct seq_operations bpf_prog_seq_ops = { + .start = bpf_prog_seq_start, + .next = bpf_prog_seq_next, + .stop = bpf_prog_seq_stop, + .show = bpf_prog_seq_show, +}; + +BTF_ID_LIST(btf_bpf_prog_id) +BTF_ID(struct, bpf_prog) + +static struct bpf_iter_reg bpf_prog_reg_info = { + .target = "bpf_prog", + .seq_ops = &bpf_prog_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_prog_info), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_prog, prog), + PTR_TO_BTF_ID_OR_NULL }, + }, +}; + +static int __init bpf_prog_iter_init(void) +{ + bpf_prog_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_prog_id; + return bpf_iter_reg_target(&bpf_prog_reg_info); +} + +late_initcall(bpf_prog_iter_init); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d07417d17712..ee290b1f2d9e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3044,6 +3044,25 @@ again: return map; } +struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id) +{ + struct bpf_prog *prog; + + spin_lock_bh(&prog_idr_lock); +again: + prog = idr_get_next(&prog_idr, id); + if (prog) { + prog = bpf_prog_inc_not_zero(prog); + if (IS_ERR(prog)) { + (*id)++; + goto again; + } + } + spin_unlock_bh(&prog_idr_lock); + + return prog; +} + #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id struct bpf_prog *bpf_prog_by_id(u32 id) -- cgit v1.2.3 From 14fc6bd6b79c430f615500d0fe6cea4722110db8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:09 -0700 Subject: bpf: Refactor bpf_iter_reg to have separate seq_info member There is no functionality change for this patch. Struct bpf_iter_reg is used to register a bpf_iter target, which includes information for both prog_load, link_create and seq_file creation. This patch puts fields related seq_file creation into a different structure. This will be useful for map elements iterator where one iterator covers different map types and different map types may have different seq_ops, init/fini private_data function and private_data size. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184109.590030-1-yhs@fb.com --- include/linux/bpf.h | 17 ++++++++++------- kernel/bpf/bpf_iter.c | 12 ++++++------ kernel/bpf/map_iter.c | 8 ++++++-- kernel/bpf/prog_iter.c | 8 ++++++-- kernel/bpf/task_iter.c | 16 ++++++++++++---- net/ipv4/tcp_ipv4.c | 8 ++++++-- net/ipv4/udp.c | 8 ++++++-- net/ipv6/route.c | 8 ++++++-- net/netlink/af_netlink.c | 8 ++++++-- 9 files changed, 64 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 72221aea1c60..127067f71fd4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -37,6 +37,15 @@ struct seq_operations; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; +typedef int (*bpf_iter_init_seq_priv_t)(void *private_data); +typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); +struct bpf_iter_seq_info { + const struct seq_operations *seq_ops; + bpf_iter_init_seq_priv_t init_seq_private; + bpf_iter_fini_seq_priv_t fini_seq_private; + u32 seq_priv_size; +}; + /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { /* funcs callable from userspace (via syscall) */ @@ -1189,18 +1198,12 @@ int bpf_obj_get_user(const char __user *pathname, int flags); extern int bpf_iter_ ## target(args); \ int __init bpf_iter_ ## target(args) { return 0; } -typedef int (*bpf_iter_init_seq_priv_t)(void *private_data); -typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); - #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; - const struct seq_operations *seq_ops; - bpf_iter_init_seq_priv_t init_seq_private; - bpf_iter_fini_seq_priv_t fini_seq_private; - u32 seq_priv_size; u32 ctx_arg_info_size; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; + const struct bpf_iter_seq_info *seq_info; }; struct bpf_iter_meta { diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index dd612b80b9fe..5b2387d6aa1f 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -218,8 +218,8 @@ static int iter_release(struct inode *inode, struct file *file) iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); - if (iter_priv->tinfo->reg_info->fini_seq_private) - iter_priv->tinfo->reg_info->fini_seq_private(seq->private); + if (iter_priv->tinfo->reg_info->seq_info->fini_seq_private) + iter_priv->tinfo->reg_info->seq_info->fini_seq_private(seq->private); bpf_prog_put(iter_priv->prog); seq->private = iter_priv; @@ -433,16 +433,16 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) tinfo = link->tinfo; total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + - tinfo->reg_info->seq_priv_size; - priv_data = __seq_open_private(file, tinfo->reg_info->seq_ops, + tinfo->reg_info->seq_info->seq_priv_size; + priv_data = __seq_open_private(file, tinfo->reg_info->seq_info->seq_ops, total_priv_dsize); if (!priv_data) { err = -ENOMEM; goto release_prog; } - if (tinfo->reg_info->init_seq_private) { - err = tinfo->reg_info->init_seq_private(priv_data->target_private); + if (tinfo->reg_info->seq_info->init_seq_private) { + err = tinfo->reg_info->seq_info->init_seq_private(priv_data->target_private); if (err) goto release_seq_file; } diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 5926c76d854e..1a69241fb1e2 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -81,17 +81,21 @@ static const struct seq_operations bpf_map_seq_ops = { BTF_ID_LIST(btf_bpf_map_id) BTF_ID(struct, bpf_map) -static struct bpf_iter_reg bpf_map_reg_info = { - .target = "bpf_map", +static const struct bpf_iter_seq_info bpf_map_seq_info = { .seq_ops = &bpf_map_seq_ops, .init_seq_private = NULL, .fini_seq_private = NULL, .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), +}; + +static struct bpf_iter_reg bpf_map_reg_info = { + .target = "bpf_map", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map, map), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &bpf_map_seq_info, }; static int __init bpf_map_iter_init(void) diff --git a/kernel/bpf/prog_iter.c b/kernel/bpf/prog_iter.c index 6541b577d69f..53a73c841c13 100644 --- a/kernel/bpf/prog_iter.c +++ b/kernel/bpf/prog_iter.c @@ -81,17 +81,21 @@ static const struct seq_operations bpf_prog_seq_ops = { BTF_ID_LIST(btf_bpf_prog_id) BTF_ID(struct, bpf_prog) -static struct bpf_iter_reg bpf_prog_reg_info = { - .target = "bpf_prog", +static const struct bpf_iter_seq_info bpf_prog_seq_info = { .seq_ops = &bpf_prog_seq_ops, .init_seq_private = NULL, .fini_seq_private = NULL, .seq_priv_size = sizeof(struct bpf_iter_seq_prog_info), +}; + +static struct bpf_iter_reg bpf_prog_reg_info = { + .target = "bpf_prog", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_prog, prog), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &bpf_prog_seq_info, }; static int __init bpf_prog_iter_init(void) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 1039e52ebd8b..6d9cd23869bf 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -319,25 +319,32 @@ BTF_ID_LIST(btf_task_file_ids) BTF_ID(struct, task_struct) BTF_ID(struct, file) -static struct bpf_iter_reg task_reg_info = { - .target = "task", +static const struct bpf_iter_seq_info task_seq_info = { .seq_ops = &task_seq_ops, .init_seq_private = init_seq_pidns, .fini_seq_private = fini_seq_pidns, .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), +}; + +static struct bpf_iter_reg task_reg_info = { + .target = "task", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__task, task), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &task_seq_info, }; -static struct bpf_iter_reg task_file_reg_info = { - .target = "task_file", +static const struct bpf_iter_seq_info task_file_seq_info = { .seq_ops = &task_file_seq_ops, .init_seq_private = init_seq_pidns, .fini_seq_private = fini_seq_pidns, .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), +}; + +static struct bpf_iter_reg task_file_reg_info = { + .target = "task_file", .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__task_file, task), @@ -345,6 +352,7 @@ static struct bpf_iter_reg task_file_reg_info = { { offsetof(struct bpf_iter__task_file, file), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &task_file_seq_info, }; static int __init task_iter_init(void) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f8913923a6c0..cb288fdcf2ca 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2947,17 +2947,21 @@ static void bpf_iter_fini_tcp(void *priv_data) bpf_iter_fini_seq_net(priv_data); } -static struct bpf_iter_reg tcp_reg_info = { - .target = "tcp", +static const struct bpf_iter_seq_info tcp_seq_info = { .seq_ops = &bpf_iter_tcp_seq_ops, .init_seq_private = bpf_iter_init_tcp, .fini_seq_private = bpf_iter_fini_tcp, .seq_priv_size = sizeof(struct tcp_iter_state), +}; + +static struct bpf_iter_reg tcp_reg_info = { + .target = "tcp", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__tcp, sk_common), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &tcp_seq_info, }; static void __init bpf_iter_register(void) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0fb5e4ea133f..1bc50ec2caef 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -3208,17 +3208,21 @@ static void bpf_iter_fini_udp(void *priv_data) bpf_iter_fini_seq_net(priv_data); } -static struct bpf_iter_reg udp_reg_info = { - .target = "udp", +static const struct bpf_iter_seq_info udp_seq_info = { .seq_ops = &bpf_iter_udp_seq_ops, .init_seq_private = bpf_iter_init_udp, .fini_seq_private = bpf_iter_fini_udp, .seq_priv_size = sizeof(struct udp_iter_state), +}; + +static struct bpf_iter_reg udp_reg_info = { + .target = "udp", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__udp, udp_sk), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &udp_seq_info, }; static void __init bpf_iter_register(void) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 33f5efbad0a9..8bfc57b0802a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6427,17 +6427,21 @@ DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *r BTF_ID_LIST(btf_fib6_info_id) BTF_ID(struct, fib6_info) -static struct bpf_iter_reg ipv6_route_reg_info = { - .target = "ipv6_route", +static const struct bpf_iter_seq_info ipv6_route_seq_info = { .seq_ops = &ipv6_route_seq_ops, .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct ipv6_route_iter), +}; + +static struct bpf_iter_reg ipv6_route_reg_info = { + .target = "ipv6_route", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__ipv6_route, rt), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &ipv6_route_seq_info, }; static int __init bpf_iter_register(void) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index d8921b833744..b5f30d7d30d0 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2807,17 +2807,21 @@ static const struct rhashtable_params netlink_rhashtable_params = { BTF_ID_LIST(btf_netlink_sock_id) BTF_ID(struct, netlink_sock) -static struct bpf_iter_reg netlink_reg_info = { - .target = "netlink", +static const struct bpf_iter_seq_info netlink_seq_info = { .seq_ops = &netlink_seq_ops, .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct nl_seq_iter), +}; + +static struct bpf_iter_reg netlink_reg_info = { + .target = "netlink", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__netlink, sk), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &netlink_seq_info, }; static int __init bpf_iter_register(void) -- cgit v1.2.3 From f9c792729581bd8b8473af163e8ab426c2c61d89 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:10 -0700 Subject: bpf: Refactor to provide aux info to bpf_iter_init_seq_priv_t This patch refactored target bpf_iter_init_seq_priv_t callback function to accept additional information. This will be needed in later patches for map element targets since a particular map should be passed to traverse elements for that particular map. In the future, other information may be passed to target as well, e.g., pid, cgroup id, etc. to customize the iterator. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184110.590156-1-yhs@fb.com --- fs/proc/proc_net.c | 2 +- include/linux/bpf.h | 7 ++++++- include/linux/proc_fs.h | 3 ++- kernel/bpf/bpf_iter.c | 2 +- kernel/bpf/task_iter.c | 2 +- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv4/udp.c | 4 ++-- 7 files changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index dba63b2429f0..ed8a6306990c 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -98,7 +98,7 @@ static const struct proc_ops proc_net_seq_ops = { .proc_release = seq_release_net, }; -int bpf_iter_init_seq_net(void *priv_data) +int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux) { #ifdef CONFIG_NET_NS struct seq_net_private *p = priv_data; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 127067f71fd4..ef52717336cf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -33,11 +33,13 @@ struct btf; struct btf_type; struct exception_table_entry; struct seq_operations; +struct bpf_iter_aux_info; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; -typedef int (*bpf_iter_init_seq_priv_t)(void *private_data); +typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, + struct bpf_iter_aux_info *aux); typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); struct bpf_iter_seq_info { const struct seq_operations *seq_ops; @@ -1198,6 +1200,9 @@ int bpf_obj_get_user(const char __user *pathname, int flags); extern int bpf_iter_ ## target(args); \ int __init bpf_iter_ ## target(args) { return 0; } +struct bpf_iter_aux_info { +}; + #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index d1eed1b43651..2df965cd0974 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -133,7 +133,8 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo void *data); extern struct pid *tgid_pidfd_to_pid(const struct file *file); -extern int bpf_iter_init_seq_net(void *priv_data); +struct bpf_iter_aux_info; +extern int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux); extern void bpf_iter_fini_seq_net(void *priv_data); #ifdef CONFIG_PROC_PID_ARCH_STATUS diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 5b2387d6aa1f..8fa94cb1b5a0 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -442,7 +442,7 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) } if (tinfo->reg_info->seq_info->init_seq_private) { - err = tinfo->reg_info->seq_info->init_seq_private(priv_data->target_private); + err = tinfo->reg_info->seq_info->init_seq_private(priv_data->target_private, NULL); if (err) goto release_seq_file; } diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 6d9cd23869bf..232df29793e9 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -293,7 +293,7 @@ static void task_file_seq_stop(struct seq_file *seq, void *v) } } -static int init_seq_pidns(void *priv_data) +static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux) { struct bpf_iter_seq_task_common *common = priv_data; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index cb288fdcf2ca..5084333b5ab6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2921,7 +2921,7 @@ static struct pernet_operations __net_initdata tcp_sk_ops = { DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, struct sock_common *sk_common, uid_t uid) -static int bpf_iter_init_tcp(void *priv_data) +static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) { struct tcp_iter_state *st = priv_data; struct tcp_seq_afinfo *afinfo; @@ -2933,7 +2933,7 @@ static int bpf_iter_init_tcp(void *priv_data) afinfo->family = AF_UNSPEC; st->bpf_seq_afinfo = afinfo; - ret = bpf_iter_init_seq_net(priv_data); + ret = bpf_iter_init_seq_net(priv_data, aux); if (ret) kfree(afinfo); return ret; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1bc50ec2caef..7ce31beccfc2 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -3181,7 +3181,7 @@ static struct pernet_operations __net_initdata udp_sysctl_ops = { DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta, struct udp_sock *udp_sk, uid_t uid, int bucket) -static int bpf_iter_init_udp(void *priv_data) +static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux) { struct udp_iter_state *st = priv_data; struct udp_seq_afinfo *afinfo; @@ -3194,7 +3194,7 @@ static int bpf_iter_init_udp(void *priv_data) afinfo->family = AF_UNSPEC; afinfo->udp_table = &udp_table; st->bpf_seq_afinfo = afinfo; - ret = bpf_iter_init_seq_net(priv_data); + ret = bpf_iter_init_seq_net(priv_data, aux); if (ret) kfree(afinfo); return ret; -- cgit v1.2.3 From afbf21dce668ef59482037596eaffbe5041e094c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:11 -0700 Subject: bpf: Support readonly/readwrite buffers in verifier Readonly and readwrite buffer register states are introduced. Totally four states, PTR_TO_RDONLY_BUF[_OR_NULL] and PTR_TO_RDWR_BUF[_OR_NULL] are supported. As suggested by their respective names, PTR_TO_RDONLY_BUF[_OR_NULL] are for readonly buffers and PTR_TO_RDWR_BUF[_OR_NULL] for read/write buffers. These new register states will be used by later bpf map element iterator. New register states share some similarity to PTR_TO_TP_BUFFER as it will calculate accessed buffer size during verification time. The accessed buffer size will be later compared to other metrics during later attach/link_create time. Similar to reg_state PTR_TO_BTF_ID_OR_NULL in bpf iterator programs, PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL reg_types can be set at prog->aux->bpf_ctx_arg_aux, and bpf verifier will retrieve the values during btf_ctx_access(). Later bpf map element iterator implementation will show how such information will be assigned during target registeration time. The verifier is also enhanced such that PTR_TO_RDONLY_BUF can be passed to ARG_PTR_TO_MEM[_OR_NULL] helper argument, and PTR_TO_RDWR_BUF can be passed to ARG_PTR_TO_MEM[_OR_NULL] or ARG_PTR_TO_UNINIT_MEM. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184111.590274-1-yhs@fb.com --- include/linux/bpf.h | 6 ++++ kernel/bpf/btf.c | 13 ++++++++ kernel/bpf/verifier.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 104 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ef52717336cf..f9c4bb08f616 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -353,6 +353,10 @@ enum bpf_reg_type { PTR_TO_BTF_ID_OR_NULL, /* reg points to kernel struct or NULL */ PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_MEM_OR_NULL, /* reg points to valid memory region or NULL */ + PTR_TO_RDONLY_BUF, /* reg points to a readonly buffer */ + PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */ + PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ + PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ }; /* The information passed from prog-specific *_is_valid_access @@ -694,6 +698,8 @@ struct bpf_prog_aux { u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ u32 attach_btf_id; /* in-kernel BTF type id to attach to */ u32 ctx_arg_info_size; + u32 max_rdonly_access; + u32 max_rdwr_access; const struct bpf_ctx_arg_aux *ctx_arg_info; struct bpf_prog *linked_prog; bool verifier_zext; /* Zero extensions has been inserted by verifier. */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ee36b7f60936..0fd6bb62be3a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3806,6 +3806,19 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, btf_kind_str[BTF_INFO_KIND(t->info)]); return false; } + + /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */ + for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { + const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; + + if (ctx_arg_info->offset == off && + (ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL || + ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) { + info->reg_type = ctx_arg_info->reg_type; + return true; + } + } + if (t->type == 0) /* This is a pointer to void. * It is the same as scalar from the verifier safety pov. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9a6703bc3f36..8d6979db48d8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -409,7 +409,9 @@ static bool reg_type_may_be_null(enum bpf_reg_type type) type == PTR_TO_SOCK_COMMON_OR_NULL || type == PTR_TO_TCP_SOCK_OR_NULL || type == PTR_TO_BTF_ID_OR_NULL || - type == PTR_TO_MEM_OR_NULL; + type == PTR_TO_MEM_OR_NULL || + type == PTR_TO_RDONLY_BUF_OR_NULL || + type == PTR_TO_RDWR_BUF_OR_NULL; } static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) @@ -503,6 +505,10 @@ static const char * const reg_type_str[] = { [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", [PTR_TO_MEM] = "mem", [PTR_TO_MEM_OR_NULL] = "mem_or_null", + [PTR_TO_RDONLY_BUF] = "rdonly_buf", + [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null", + [PTR_TO_RDWR_BUF] = "rdwr_buf", + [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null", }; static char slot_type_char[] = { @@ -2173,6 +2179,10 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: case PTR_TO_BTF_ID_OR_NULL: + case PTR_TO_RDONLY_BUF: + case PTR_TO_RDONLY_BUF_OR_NULL: + case PTR_TO_RDWR_BUF: + case PTR_TO_RDWR_BUF_OR_NULL: return true; default: return false; @@ -3052,14 +3062,15 @@ int check_ctx_reg(struct bpf_verifier_env *env, return 0; } -static int check_tp_buffer_access(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, - int regno, int off, int size) +static int __check_buffer_access(struct bpf_verifier_env *env, + const char *buf_info, + const struct bpf_reg_state *reg, + int regno, int off, int size) { if (off < 0) { verbose(env, - "R%d invalid tracepoint buffer access: off=%d, size=%d", - regno, off, size); + "R%d invalid %s buffer access: off=%d, size=%d", + regno, buf_info, off, size); return -EACCES; } if (!tnum_is_const(reg->var_off) || reg->var_off.value) { @@ -3071,12 +3082,45 @@ static int check_tp_buffer_access(struct bpf_verifier_env *env, regno, off, tn_buf); return -EACCES; } + + return 0; +} + +static int check_tp_buffer_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int regno, int off, int size) +{ + int err; + + err = __check_buffer_access(env, "tracepoint", reg, regno, off, size); + if (err) + return err; + if (off + size > env->prog->aux->max_tp_access) env->prog->aux->max_tp_access = off + size; return 0; } +static int check_buffer_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int regno, int off, int size, + bool zero_size_allowed, + const char *buf_info, + u32 *max_access) +{ + int err; + + err = __check_buffer_access(env, buf_info, reg, regno, off, size); + if (err) + return err; + + if (off + size > *max_access) + *max_access = off + size; + + return 0; +} + /* BPF architecture zero extends alu32 ops into 64-bit registesr */ static void zext_32_to_64(struct bpf_reg_state *reg) { @@ -3427,6 +3471,23 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (reg->type == CONST_PTR_TO_MAP) { err = check_ptr_to_map_access(env, regs, regno, off, size, t, value_regno); + } else if (reg->type == PTR_TO_RDONLY_BUF) { + if (t == BPF_WRITE) { + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str[reg->type]); + return -EACCES; + } + err = check_buffer_access(env, reg, regno, off, size, "rdonly", + false, + &env->prog->aux->max_rdonly_access); + if (!err && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_RDWR_BUF) { + err = check_buffer_access(env, reg, regno, off, size, "rdwr", + false, + &env->prog->aux->max_rdwr_access); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -3668,6 +3729,18 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_mem_region_access(env, regno, reg->off, access_size, reg->mem_size, zero_size_allowed); + case PTR_TO_RDONLY_BUF: + if (meta && meta->raw_mode) + return -EACCES; + return check_buffer_access(env, reg, regno, reg->off, + access_size, zero_size_allowed, + "rdonly", + &env->prog->aux->max_rdonly_access); + case PTR_TO_RDWR_BUF: + return check_buffer_access(env, reg, regno, reg->off, + access_size, zero_size_allowed, + "rdwr", + &env->prog->aux->max_rdwr_access); default: /* scalar_value|ptr_to_stack or invalid ptr */ return check_stack_boundary(env, regno, access_size, zero_size_allowed, meta); @@ -3933,6 +4006,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, else if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && type != PTR_TO_MEM && + type != PTR_TO_RDONLY_BUF && + type != PTR_TO_RDWR_BUF && type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; @@ -6806,6 +6881,10 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, reg->type = PTR_TO_BTF_ID; } else if (reg->type == PTR_TO_MEM_OR_NULL) { reg->type = PTR_TO_MEM; + } else if (reg->type == PTR_TO_RDONLY_BUF_OR_NULL) { + reg->type = PTR_TO_RDONLY_BUF; + } else if (reg->type == PTR_TO_RDWR_BUF_OR_NULL) { + reg->type = PTR_TO_RDWR_BUF; } if (is_null) { /* We don't need id and ref_obj_id from this point -- cgit v1.2.3 From a5cbe05a6673b85bed2a63ffcfea6a96c6410cff Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:12 -0700 Subject: bpf: Implement bpf iterator for map elements The bpf iterator for map elements are implemented. The bpf program will receive four parameters: bpf_iter_meta *meta: the meta data bpf_map *map: the bpf_map whose elements are traversed void *key: the key of one element void *value: the value of the same element Here, meta and map pointers are always valid, and key has register type PTR_TO_RDONLY_BUF_OR_NULL and value has register type PTR_TO_RDWR_BUF_OR_NULL. The kernel will track the access range of key and value during verification time. Later, these values will be compared against the values in the actual map to ensure all accesses are within range. A new field iter_seq_info is added to bpf_map_ops which is used to add map type specific information, i.e., seq_ops, init/fini seq_file func and seq_file private data size. Subsequent patches will have actual implementation for bpf_map_ops->iter_seq_info. In user space, BPF_ITER_LINK_MAP_FD needs to be specified in prog attr->link_create.flags, which indicates that attr->link_create.target_fd is a map_fd. The reason for such an explicit flag is for possible future cases where one bpf iterator may allow more than one possible customization, e.g., pid and cgroup id for task_file. Current kernel internal implementation only allows the target to register at most one required bpf_iter_link_info. To support the above case, optional bpf_iter_link_info's are needed, the target can be extended to register such link infos, and user provided link_info needs to match one of target supported ones. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184112.590360-1-yhs@fb.com --- include/linux/bpf.h | 16 ++++++++ include/uapi/linux/bpf.h | 7 ++++ kernel/bpf/bpf_iter.c | 85 ++++++++++++++++++++++++++++++++++-------- kernel/bpf/map_iter.c | 30 ++++++++++++++- tools/include/uapi/linux/bpf.h | 7 ++++ 5 files changed, 128 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f9c4bb08f616..4175cf1f4665 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -107,6 +107,9 @@ struct bpf_map_ops { /* BTF name and id of struct allocated by map_alloc */ const char * const map_btf_name; int *map_btf_id; + + /* bpf_iter info used to open a seq_file */ + const struct bpf_iter_seq_info *iter_seq_info; }; struct bpf_map_memory { @@ -1207,12 +1210,18 @@ int bpf_obj_get_user(const char __user *pathname, int flags); int __init bpf_iter_ ## target(args) { return 0; } struct bpf_iter_aux_info { + struct bpf_map *map; }; +typedef int (*bpf_iter_check_target_t)(struct bpf_prog *prog, + struct bpf_iter_aux_info *aux); + #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; + bpf_iter_check_target_t check_target; u32 ctx_arg_info_size; + enum bpf_iter_link_info req_linfo; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; const struct bpf_iter_seq_info *seq_info; }; @@ -1223,6 +1232,13 @@ struct bpf_iter_meta { u64 seq_num; }; +struct bpf_iter__bpf_map_elem { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_map *, map); + __bpf_md_ptr(void *, key); + __bpf_md_ptr(void *, value); +}; + int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info); bool bpf_iter_prog_supported(struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 54d0c886e3ba..828c2f6438f2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -246,6 +246,13 @@ enum bpf_link_type { MAX_BPF_LINK_TYPE, }; +enum bpf_iter_link_info { + BPF_ITER_LINK_UNSPEC = 0, + BPF_ITER_LINK_MAP_FD = 1, + + MAX_BPF_ITER_LINK_INFO, +}; + /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command * * NONE(default): No further bpf programs allowed in the subtree. diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 8fa94cb1b5a0..363b9cafc2d8 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -14,11 +14,13 @@ struct bpf_iter_target_info { struct bpf_iter_link { struct bpf_link link; + struct bpf_iter_aux_info aux; struct bpf_iter_target_info *tinfo; }; struct bpf_iter_priv_data { struct bpf_iter_target_info *tinfo; + const struct bpf_iter_seq_info *seq_info; struct bpf_prog *prog; u64 session_id; u64 seq_num; @@ -35,7 +37,8 @@ static DEFINE_MUTEX(link_mutex); /* incremented on every opened seq_file */ static atomic64_t session_id; -static int prepare_seq_file(struct file *file, struct bpf_iter_link *link); +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, + const struct bpf_iter_seq_info *seq_info); static void bpf_iter_inc_seq_num(struct seq_file *seq) { @@ -199,11 +202,25 @@ done: return copied; } +static const struct bpf_iter_seq_info * +__get_seq_info(struct bpf_iter_link *link) +{ + const struct bpf_iter_seq_info *seq_info; + + if (link->aux.map) { + seq_info = link->aux.map->ops->iter_seq_info; + if (seq_info) + return seq_info; + } + + return link->tinfo->reg_info->seq_info; +} + static int iter_open(struct inode *inode, struct file *file) { struct bpf_iter_link *link = inode->i_private; - return prepare_seq_file(file, link); + return prepare_seq_file(file, link, __get_seq_info(link)); } static int iter_release(struct inode *inode, struct file *file) @@ -218,8 +235,8 @@ static int iter_release(struct inode *inode, struct file *file) iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); - if (iter_priv->tinfo->reg_info->seq_info->fini_seq_private) - iter_priv->tinfo->reg_info->seq_info->fini_seq_private(seq->private); + if (iter_priv->seq_info->fini_seq_private) + iter_priv->seq_info->fini_seq_private(seq->private); bpf_prog_put(iter_priv->prog); seq->private = iter_priv; @@ -318,6 +335,11 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) static void bpf_iter_link_release(struct bpf_link *link) { + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + + if (iter_link->aux.map) + bpf_map_put_with_uref(iter_link->aux.map); } static void bpf_iter_link_dealloc(struct bpf_link *link) @@ -370,14 +392,13 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_link_primer link_primer; struct bpf_iter_target_info *tinfo; + struct bpf_iter_aux_info aux = {}; struct bpf_iter_link *link; + u32 prog_btf_id, target_fd; bool existed = false; - u32 prog_btf_id; + struct bpf_map *map; int err; - if (attr->link_create.target_fd || attr->link_create.flags) - return -EINVAL; - prog_btf_id = prog->aux->attach_btf_id; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { @@ -390,6 +411,13 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) if (!existed) return -ENOENT; + /* Make sure user supplied flags are target expected. */ + target_fd = attr->link_create.target_fd; + if (attr->link_create.flags != tinfo->reg_info->req_linfo) + return -EINVAL; + if (!attr->link_create.flags && target_fd) + return -EINVAL; + link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); if (!link) return -ENOMEM; @@ -403,21 +431,45 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return err; } + if (tinfo->reg_info->req_linfo == BPF_ITER_LINK_MAP_FD) { + map = bpf_map_get_with_uref(target_fd); + if (IS_ERR(map)) { + err = PTR_ERR(map); + goto cleanup_link; + } + + aux.map = map; + err = tinfo->reg_info->check_target(prog, &aux); + if (err) { + bpf_map_put_with_uref(map); + goto cleanup_link; + } + + link->aux.map = map; + } + return bpf_link_settle(&link_primer); + +cleanup_link: + bpf_link_cleanup(&link_primer); + return err; } static void init_seq_meta(struct bpf_iter_priv_data *priv_data, struct bpf_iter_target_info *tinfo, + const struct bpf_iter_seq_info *seq_info, struct bpf_prog *prog) { priv_data->tinfo = tinfo; + priv_data->seq_info = seq_info; priv_data->prog = prog; priv_data->session_id = atomic64_inc_return(&session_id); priv_data->seq_num = 0; priv_data->done_stop = false; } -static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, + const struct bpf_iter_seq_info *seq_info) { struct bpf_iter_priv_data *priv_data; struct bpf_iter_target_info *tinfo; @@ -433,21 +485,21 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) tinfo = link->tinfo; total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + - tinfo->reg_info->seq_info->seq_priv_size; - priv_data = __seq_open_private(file, tinfo->reg_info->seq_info->seq_ops, + seq_info->seq_priv_size; + priv_data = __seq_open_private(file, seq_info->seq_ops, total_priv_dsize); if (!priv_data) { err = -ENOMEM; goto release_prog; } - if (tinfo->reg_info->seq_info->init_seq_private) { - err = tinfo->reg_info->seq_info->init_seq_private(priv_data->target_private, NULL); + if (seq_info->init_seq_private) { + err = seq_info->init_seq_private(priv_data->target_private, &link->aux); if (err) goto release_seq_file; } - init_seq_meta(priv_data, tinfo, prog); + init_seq_meta(priv_data, tinfo, seq_info, prog); seq = file->private_data; seq->private = priv_data->target_private; @@ -463,6 +515,7 @@ release_prog: int bpf_iter_new_fd(struct bpf_link *link) { + struct bpf_iter_link *iter_link; struct file *file; unsigned int flags; int err, fd; @@ -481,8 +534,8 @@ int bpf_iter_new_fd(struct bpf_link *link) goto free_fd; } - err = prepare_seq_file(file, - container_of(link, struct bpf_iter_link, link)); + iter_link = container_of(link, struct bpf_iter_link, link); + err = prepare_seq_file(file, iter_link, __get_seq_info(iter_link)); if (err) goto free_file; diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 1a69241fb1e2..8a1f9b3355d0 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -98,10 +98,38 @@ static struct bpf_iter_reg bpf_map_reg_info = { .seq_info = &bpf_map_seq_info, }; +static int bpf_iter_check_map(struct bpf_prog *prog, + struct bpf_iter_aux_info *aux) +{ + return -EINVAL; +} + +DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta, + struct bpf_map *map, void *key, void *value) + +static const struct bpf_iter_reg bpf_map_elem_reg_info = { + .target = "bpf_map_elem", + .check_target = bpf_iter_check_map, + .req_linfo = BPF_ITER_LINK_MAP_FD, + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_map_elem, key), + PTR_TO_RDONLY_BUF_OR_NULL }, + { offsetof(struct bpf_iter__bpf_map_elem, value), + PTR_TO_RDWR_BUF_OR_NULL }, + }, +}; + static int __init bpf_map_iter_init(void) { + int ret; + bpf_map_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_map_id; - return bpf_iter_reg_target(&bpf_map_reg_info); + ret = bpf_iter_reg_target(&bpf_map_reg_info); + if (ret) + return ret; + + return bpf_iter_reg_target(&bpf_map_elem_reg_info); } late_initcall(bpf_map_iter_init); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 54d0c886e3ba..828c2f6438f2 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -246,6 +246,13 @@ enum bpf_link_type { MAX_BPF_LINK_TYPE, }; +enum bpf_iter_link_info { + BPF_ITER_LINK_UNSPEC = 0, + BPF_ITER_LINK_MAP_FD = 1, + + MAX_BPF_ITER_LINK_INFO, +}; + /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command * * NONE(default): No further bpf programs allowed in the subtree. -- cgit v1.2.3 From d6c4503cc29638f328e1a6e6fefbdbda401c28fc Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:14 -0700 Subject: bpf: Implement bpf iterator for hash maps The bpf iterators for hash, percpu hash, lru hash and lru percpu hash are implemented. During link time, bpf_iter_reg->check_target() will check map type and ensure the program access key/value region is within the map defined key/value size limit. For percpu hash and lru hash maps, the bpf program will receive values for all cpus. The map element bpf iterator infrastructure will prepare value properly before passing the value pointer to the bpf program. This patch set supports readonly map keys and read/write map values. It does not support deleting map elements, e.g., from hash tables. If there is a user case for this, the following mechanism can be used to support map deletion for hashtab, etc. - permit a new bpf program return value, e.g., 2, to let bpf iterator know the map element should be removed. - since bucket lock is taken, the map element will be queued. - once bucket lock is released after all elements under this bucket are traversed, all to-be-deleted map elements can be deleted. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184114.590470-1-yhs@fb.com --- kernel/bpf/hashtab.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/map_iter.c | 24 ++++++- 2 files changed, 217 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d4378d7d442b..024276787055 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1612,6 +1612,196 @@ htab_lru_map_lookup_and_delete_batch(struct bpf_map *map, true, false); } +struct bpf_iter_seq_hash_map_info { + struct bpf_map *map; + struct bpf_htab *htab; + void *percpu_value_buf; // non-zero means percpu hash + unsigned long flags; + u32 bucket_id; + u32 skip_elems; +}; + +static struct htab_elem * +bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, + struct htab_elem *prev_elem) +{ + const struct bpf_htab *htab = info->htab; + unsigned long flags = info->flags; + u32 skip_elems = info->skip_elems; + u32 bucket_id = info->bucket_id; + struct hlist_nulls_head *head; + struct hlist_nulls_node *n; + struct htab_elem *elem; + struct bucket *b; + u32 i, count; + + if (bucket_id >= htab->n_buckets) + return NULL; + + /* try to find next elem in the same bucket */ + if (prev_elem) { + /* no update/deletion on this bucket, prev_elem should be still valid + * and we won't skip elements. + */ + n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node)); + elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node); + if (elem) + return elem; + + /* not found, unlock and go to the next bucket */ + b = &htab->buckets[bucket_id++]; + htab_unlock_bucket(htab, b, flags); + skip_elems = 0; + } + + for (i = bucket_id; i < htab->n_buckets; i++) { + b = &htab->buckets[i]; + flags = htab_lock_bucket(htab, b); + + count = 0; + head = &b->head; + hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { + if (count >= skip_elems) { + info->flags = flags; + info->bucket_id = i; + info->skip_elems = count; + return elem; + } + count++; + } + + htab_unlock_bucket(htab, b, flags); + skip_elems = 0; + } + + info->bucket_id = i; + info->skip_elems = 0; + return NULL; +} + +static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_hash_map_info *info = seq->private; + struct htab_elem *elem; + + elem = bpf_hash_map_seq_find_next(info, NULL); + if (!elem) + return NULL; + + if (*pos == 0) + ++*pos; + return elem; +} + +static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_hash_map_info *info = seq->private; + + ++*pos; + ++info->skip_elems; + return bpf_hash_map_seq_find_next(info, v); +} + +static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) +{ + struct bpf_iter_seq_hash_map_info *info = seq->private; + u32 roundup_key_size, roundup_value_size; + struct bpf_iter__bpf_map_elem ctx = {}; + struct bpf_map *map = info->map; + struct bpf_iter_meta meta; + int ret = 0, off = 0, cpu; + struct bpf_prog *prog; + void __percpu *pptr; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, elem == NULL); + if (prog) { + ctx.meta = &meta; + ctx.map = info->map; + if (elem) { + roundup_key_size = round_up(map->key_size, 8); + ctx.key = elem->key; + if (!info->percpu_value_buf) { + ctx.value = elem->key + roundup_key_size; + } else { + roundup_value_size = round_up(map->value_size, 8); + pptr = htab_elem_get_ptr(elem, map->key_size); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(info->percpu_value_buf + off, + per_cpu_ptr(pptr, cpu), + roundup_value_size); + off += roundup_value_size; + } + ctx.value = info->percpu_value_buf; + } + } + ret = bpf_iter_run_prog(prog, &ctx); + } + + return ret; +} + +static int bpf_hash_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_hash_map_seq_show(seq, v); +} + +static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_seq_hash_map_info *info = seq->private; + + if (!v) + (void)__bpf_hash_map_seq_show(seq, NULL); + else + htab_unlock_bucket(info->htab, + &info->htab->buckets[info->bucket_id], + info->flags); +} + +static int bpf_iter_init_hash_map(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct bpf_iter_seq_hash_map_info *seq_info = priv_data; + struct bpf_map *map = aux->map; + void *value_buf; + u32 buf_size; + + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + buf_size = round_up(map->value_size, 8) * num_possible_cpus(); + value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); + if (!value_buf) + return -ENOMEM; + + seq_info->percpu_value_buf = value_buf; + } + + seq_info->map = map; + seq_info->htab = container_of(map, struct bpf_htab, map); + return 0; +} + +static void bpf_iter_fini_hash_map(void *priv_data) +{ + struct bpf_iter_seq_hash_map_info *seq_info = priv_data; + + kfree(seq_info->percpu_value_buf); +} + +static const struct seq_operations bpf_hash_map_seq_ops = { + .start = bpf_hash_map_seq_start, + .next = bpf_hash_map_seq_next, + .stop = bpf_hash_map_seq_stop, + .show = bpf_hash_map_seq_show, +}; + +static const struct bpf_iter_seq_info iter_seq_info = { + .seq_ops = &bpf_hash_map_seq_ops, + .init_seq_private = bpf_iter_init_hash_map, + .fini_seq_private = bpf_iter_fini_hash_map, + .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info), +}; + static int htab_map_btf_id; const struct bpf_map_ops htab_map_ops = { .map_alloc_check = htab_map_alloc_check, @@ -1626,6 +1816,7 @@ const struct bpf_map_ops htab_map_ops = { BATCH_OPS(htab), .map_btf_name = "bpf_htab", .map_btf_id = &htab_map_btf_id, + .iter_seq_info = &iter_seq_info, }; static int htab_lru_map_btf_id; @@ -1643,6 +1834,7 @@ const struct bpf_map_ops htab_lru_map_ops = { BATCH_OPS(htab_lru), .map_btf_name = "bpf_htab", .map_btf_id = &htab_lru_map_btf_id, + .iter_seq_info = &iter_seq_info, }; /* Called from eBPF program */ @@ -1760,6 +1952,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { BATCH_OPS(htab_percpu), .map_btf_name = "bpf_htab", .map_btf_id = &htab_percpu_map_btf_id, + .iter_seq_info = &iter_seq_info, }; static int htab_lru_percpu_map_btf_id; @@ -1775,6 +1968,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { BATCH_OPS(htab_lru_percpu), .map_btf_name = "bpf_htab", .map_btf_id = &htab_lru_percpu_map_btf_id, + .iter_seq_info = &iter_seq_info, }; static int fd_htab_map_alloc_check(union bpf_attr *attr) diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 8a1f9b3355d0..bcb68b55bf65 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -101,7 +101,29 @@ static struct bpf_iter_reg bpf_map_reg_info = { static int bpf_iter_check_map(struct bpf_prog *prog, struct bpf_iter_aux_info *aux) { - return -EINVAL; + u32 key_acc_size, value_acc_size, key_size, value_size; + struct bpf_map *map = aux->map; + bool is_percpu = false; + + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) + is_percpu = true; + else if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH) + return -EINVAL; + + key_acc_size = prog->aux->max_rdonly_access; + value_acc_size = prog->aux->max_rdwr_access; + key_size = map->key_size; + if (!is_percpu) + value_size = map->value_size; + else + value_size = round_up(map->value_size, 8) * num_possible_cpus(); + + if (key_acc_size > key_size || value_acc_size > value_size) + return -EACCES; + + return 0; } DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta, -- cgit v1.2.3 From d3cc2ab546adc6e52b65f36f7c34820d2830d0c9 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:15 -0700 Subject: bpf: Implement bpf iterator for array maps The bpf iterators for array and percpu array are implemented. Similar to hash maps, for percpu array map, bpf program will receive values from all cpus. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184115.590532-1-yhs@fb.com --- kernel/bpf/arraymap.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/map_iter.c | 6 ++- 2 files changed, 142 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index c66e8273fccd..8ff419b632a6 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -487,6 +487,142 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) vma->vm_pgoff + pgoff); } +struct bpf_iter_seq_array_map_info { + struct bpf_map *map; + void *percpu_value_buf; + u32 index; +}; + +static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_array_map_info *info = seq->private; + struct bpf_map *map = info->map; + struct bpf_array *array; + u32 index; + + if (info->index >= map->max_entries) + return NULL; + + if (*pos == 0) + ++*pos; + array = container_of(map, struct bpf_array, map); + index = info->index & array->index_mask; + if (info->percpu_value_buf) + return array->pptrs[index]; + return array->value + array->elem_size * index; +} + +static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_array_map_info *info = seq->private; + struct bpf_map *map = info->map; + struct bpf_array *array; + u32 index; + + ++*pos; + ++info->index; + if (info->index >= map->max_entries) + return NULL; + + array = container_of(map, struct bpf_array, map); + index = info->index & array->index_mask; + if (info->percpu_value_buf) + return array->pptrs[index]; + return array->value + array->elem_size * index; +} + +static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) +{ + struct bpf_iter_seq_array_map_info *info = seq->private; + struct bpf_iter__bpf_map_elem ctx = {}; + struct bpf_map *map = info->map; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int off = 0, cpu = 0; + void __percpu **pptr; + u32 size; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, v == NULL); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.map = info->map; + if (v) { + ctx.key = &info->index; + + if (!info->percpu_value_buf) { + ctx.value = v; + } else { + pptr = v; + size = round_up(map->value_size, 8); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(info->percpu_value_buf + off, + per_cpu_ptr(pptr, cpu), + size); + off += size; + } + ctx.value = info->percpu_value_buf; + } + } + + return bpf_iter_run_prog(prog, &ctx); +} + +static int bpf_array_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_array_map_seq_show(seq, v); +} + +static void bpf_array_map_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)__bpf_array_map_seq_show(seq, NULL); +} + +static int bpf_iter_init_array_map(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct bpf_iter_seq_array_map_info *seq_info = priv_data; + struct bpf_map *map = aux->map; + void *value_buf; + u32 buf_size; + + if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + buf_size = round_up(map->value_size, 8) * num_possible_cpus(); + value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); + if (!value_buf) + return -ENOMEM; + + seq_info->percpu_value_buf = value_buf; + } + + seq_info->map = map; + return 0; +} + +static void bpf_iter_fini_array_map(void *priv_data) +{ + struct bpf_iter_seq_array_map_info *seq_info = priv_data; + + kfree(seq_info->percpu_value_buf); +} + +static const struct seq_operations bpf_array_map_seq_ops = { + .start = bpf_array_map_seq_start, + .next = bpf_array_map_seq_next, + .stop = bpf_array_map_seq_stop, + .show = bpf_array_map_seq_show, +}; + +static const struct bpf_iter_seq_info iter_seq_info = { + .seq_ops = &bpf_array_map_seq_ops, + .init_seq_private = bpf_iter_init_array_map, + .fini_seq_private = bpf_iter_fini_array_map, + .seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info), +}; + static int array_map_btf_id; const struct bpf_map_ops array_map_ops = { .map_alloc_check = array_map_alloc_check, @@ -506,6 +642,7 @@ const struct bpf_map_ops array_map_ops = { .map_update_batch = generic_map_update_batch, .map_btf_name = "bpf_array", .map_btf_id = &array_map_btf_id, + .iter_seq_info = &iter_seq_info, }; static int percpu_array_map_btf_id; @@ -521,6 +658,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_check_btf = array_map_check_btf, .map_btf_name = "bpf_array", .map_btf_id = &percpu_array_map_btf_id, + .iter_seq_info = &iter_seq_info, }; static int fd_array_map_alloc_check(union bpf_attr *attr) diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index bcb68b55bf65..fbe1f557cb88 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -106,10 +106,12 @@ static int bpf_iter_check_map(struct bpf_prog *prog, bool is_percpu = false; if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) is_percpu = true; else if (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_LRU_HASH) + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY) return -EINVAL; key_acc_size = prog->aux->max_rdonly_access; -- cgit v1.2.3 From 7b04d6d60fcfb5b2200ffebb9cfb90927bdfeec7 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 23 Jul 2020 11:06:44 -0700 Subject: bpf: Separate bpf_get_[stack|stackid] for perf events BPF Calling get_perf_callchain() on perf_events from PEBS entries may cause unwinder errors. To fix this issue, the callchain is fetched early. Such perf_events are marked with __PERF_SAMPLE_CALLCHAIN_EARLY. Similarly, calling bpf_get_[stack|stackid] on perf_events from PEBS may also cause unwinder errors. To fix this, add separate version of these two helpers, bpf_get_[stack|stackid]_pe. These two hepers use callchain in bpf_perf_event_data_kern->data->callchain. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723180648.1429892-2-songliubraving@fb.com --- include/linux/bpf.h | 2 + kernel/bpf/stackmap.c | 184 ++++++++++++++++++++++++++++++++++++++++++----- kernel/trace/bpf_trace.c | 4 +- 3 files changed, 170 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4175cf1f4665..8357be349133 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1675,6 +1675,8 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_get_task_stack_proto; +extern const struct bpf_func_proto bpf_get_stackid_proto_pe; +extern const struct bpf_func_proto bpf_get_stack_proto_pe; extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 48d8e739975f..5beb2f8c23da 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -387,11 +388,10 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) #endif } -BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, - u64, flags) +static long __bpf_get_stackid(struct bpf_map *map, + struct perf_callchain_entry *trace, u64 flags) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); - struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; u32 max_depth = map->value_size / stack_map_data_size(map); /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ @@ -399,21 +399,9 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u32 skip = flags & BPF_F_SKIP_FIELD_MASK; u32 hash, id, trace_nr, trace_len; bool user = flags & BPF_F_USER_STACK; - bool kernel = !user; u64 *ips; bool hash_matches; - if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | - BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) - return -EINVAL; - - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, false, false); - - if (unlikely(!trace)) - /* couldn't fetch the stack trace */ - return -EFAULT; - /* get_perf_callchain() guarantees that trace->nr >= init_nr * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth */ @@ -478,6 +466,30 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, return id; } +BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, + u64, flags) +{ + u32 max_depth = map->value_size / stack_map_data_size(map); + /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ + u32 init_nr = sysctl_perf_event_max_stack - max_depth; + bool user = flags & BPF_F_USER_STACK; + struct perf_callchain_entry *trace; + bool kernel = !user; + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) + return -EINVAL; + + trace = get_perf_callchain(regs, init_nr, kernel, user, + sysctl_perf_event_max_stack, false, false); + + if (unlikely(!trace)) + /* couldn't fetch the stack trace */ + return -EFAULT; + + return __bpf_get_stackid(map, trace, flags); +} + const struct bpf_func_proto bpf_get_stackid_proto = { .func = bpf_get_stackid, .gpl_only = true, @@ -487,7 +499,77 @@ const struct bpf_func_proto bpf_get_stackid_proto = { .arg3_type = ARG_ANYTHING, }; +static __u64 count_kernel_ip(struct perf_callchain_entry *trace) +{ + __u64 nr_kernel = 0; + + while (nr_kernel < trace->nr) { + if (trace->ip[nr_kernel] == PERF_CONTEXT_USER) + break; + nr_kernel++; + } + return nr_kernel; +} + +BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, + struct bpf_map *, map, u64, flags) +{ + struct perf_event *event = ctx->event; + struct perf_callchain_entry *trace; + bool kernel, user; + __u64 nr_kernel; + int ret; + + /* perf_sample_data doesn't have callchain, use bpf_get_stackid */ + if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + return bpf_get_stackid((unsigned long)(ctx->regs), + (unsigned long) map, flags, 0, 0); + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) + return -EINVAL; + + user = flags & BPF_F_USER_STACK; + kernel = !user; + + trace = ctx->data->callchain; + if (unlikely(!trace)) + return -EFAULT; + + nr_kernel = count_kernel_ip(trace); + + if (kernel) { + __u64 nr = trace->nr; + + trace->nr = nr_kernel; + ret = __bpf_get_stackid(map, trace, flags); + + /* restore nr */ + trace->nr = nr; + } else { /* user */ + u64 skip = flags & BPF_F_SKIP_FIELD_MASK; + + skip += nr_kernel; + if (skip > BPF_F_SKIP_FIELD_MASK) + return -EFAULT; + + flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; + ret = __bpf_get_stackid(map, trace, flags); + } + return ret; +} + +const struct bpf_func_proto bpf_get_stackid_proto_pe = { + .func = bpf_get_stackid_pe, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, + struct perf_callchain_entry *trace_in, void *buf, u32 size, u64 flags) { u32 init_nr, trace_nr, copy_len, elem_size, num_elem; @@ -520,7 +602,9 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, else init_nr = sysctl_perf_event_max_stack - num_elem; - if (kernel && task) + if (trace_in) + trace = trace_in; + else if (kernel && task) trace = get_callchain_entry_for_task(task, init_nr); else trace = get_perf_callchain(regs, init_nr, kernel, user, @@ -556,7 +640,7 @@ clear: BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, u64, flags) { - return __bpf_get_stack(regs, NULL, buf, size, flags); + return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); } const struct bpf_func_proto bpf_get_stack_proto = { @@ -574,7 +658,7 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, { struct pt_regs *regs = task_pt_regs(task); - return __bpf_get_stack(regs, task, buf, size, flags); + return __bpf_get_stack(regs, task, NULL, buf, size, flags); } BTF_ID_LIST(bpf_get_task_stack_btf_ids) @@ -591,6 +675,70 @@ const struct bpf_func_proto bpf_get_task_stack_proto = { .btf_id = bpf_get_task_stack_btf_ids, }; +BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, + void *, buf, u32, size, u64, flags) +{ + struct perf_event *event = ctx->event; + struct perf_callchain_entry *trace; + bool kernel, user; + int err = -EINVAL; + __u64 nr_kernel; + + if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + return __bpf_get_stack(ctx->regs, NULL, NULL, buf, size, flags); + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_USER_BUILD_ID))) + goto clear; + + user = flags & BPF_F_USER_STACK; + kernel = !user; + + err = -EFAULT; + trace = ctx->data->callchain; + if (unlikely(!trace)) + goto clear; + + nr_kernel = count_kernel_ip(trace); + + if (kernel) { + __u64 nr = trace->nr; + + trace->nr = nr_kernel; + err = __bpf_get_stack(ctx->regs, NULL, trace, buf, + size, flags); + + /* restore nr */ + trace->nr = nr; + } else { /* user */ + u64 skip = flags & BPF_F_SKIP_FIELD_MASK; + + skip += nr_kernel; + if (skip > BPF_F_SKIP_FIELD_MASK) + goto clear; + + flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; + err = __bpf_get_stack(ctx->regs, NULL, trace, buf, + size, flags); + } + return err; + +clear: + memset(buf, 0, size); + return err; + +} + +const struct bpf_func_proto bpf_get_stack_proto_pe = { + .func = bpf_get_stack_pe, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3cc0dcb60ca2..cb91ef902cc4 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1411,9 +1411,9 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: - return &bpf_get_stackid_proto_tp; + return &bpf_get_stackid_proto_pe; case BPF_FUNC_get_stack: - return &bpf_get_stack_proto_tp; + return &bpf_get_stack_proto_pe; case BPF_FUNC_perf_prog_read_value: return &bpf_perf_prog_read_value_proto; case BPF_FUNC_read_branch_records: -- cgit v1.2.3 From 5d99cb2c86775b4780c02a339a9578bf9471ead9 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 23 Jul 2020 11:06:45 -0700 Subject: bpf: Fail PERF_EVENT_IOC_SET_BPF when bpf_get_[stack|stackid] cannot work bpf_get_[stack|stackid] on perf_events with precise_ip uses callchain attached to perf_sample_data. If this callchain is not presented, do not allow attaching BPF program that calls bpf_get_[stack|stackid] to this event. In the error case, -EPROTO is returned so that libbpf can identify this error and print proper hint message. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723180648.1429892-3-songliubraving@fb.com --- include/linux/filter.h | 3 ++- kernel/bpf/verifier.c | 3 +++ kernel/events/core.c | 18 ++++++++++++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index d07a6e973a7d..0a355b005bf4 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -533,7 +533,8 @@ struct bpf_prog { is_func:1, /* program is a bpf function */ kprobe_override:1, /* Do we override a kprobe? */ has_callchain_buf:1, /* callchain buffer allocated? */ - enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */ + enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ + call_get_stack:1; /* Do we call bpf_get_stack() or bpf_get_stackid() */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8d6979db48d8..cd14e70f2d07 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4962,6 +4962,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn env->prog->has_callchain_buf = true; } + if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack) + env->prog->call_get_stack = true; + if (changes_data) clear_all_pkt_pointers(env); return 0; diff --git a/kernel/events/core.c b/kernel/events/core.c index 856d98c36f56..ddcfd2fb5cc5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9544,6 +9544,24 @@ static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) if (IS_ERR(prog)) return PTR_ERR(prog); + if (event->attr.precise_ip && + prog->call_get_stack && + (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) || + event->attr.exclude_callchain_kernel || + event->attr.exclude_callchain_user)) { + /* + * On perf_event with precise_ip, calling bpf_get_stack() + * may trigger unwinder warnings and occasional crashes. + * bpf_get_[stack|stackid] works around this issue by using + * callchain attached to perf_sample_data. If the + * perf_event does not full (kernel and user) callchain + * attached to perf_sample_data, do not allow attaching BPF + * program that calls bpf_get_[stack|stackid]. + */ + bpf_prog_put(prog); + return -EPROTO; + } + event->prog = prog; event->orig_overflow_handler = READ_ONCE(event->overflow_handler); WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); -- cgit v1.2.3 From 7d9c3427894fe70d1347b4820476bf37736d2ff0 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Thu, 23 Jul 2020 23:47:43 -0500 Subject: bpf: Make cgroup storages shared between programs on the same cgroup This change comes in several parts: One, the restriction that the CGROUP_STORAGE map can only be used by one program is removed. This results in the removal of the field 'aux' in struct bpf_cgroup_storage_map, and removal of relevant code associated with the field, and removal of now-noop functions bpf_free_cgroup_storage and bpf_cgroup_storage_release. Second, we permit a key of type u64 as the key to the map. Providing such a key type indicates that the map should ignore attach type when comparing map keys. However, for simplicity newly linked storage will still have the attach type at link time in its key struct. cgroup_storage_check_btf is adapted to accept u64 as the type of the key. Third, because the storages are now shared, the storages cannot be unconditionally freed on program detach. There could be two ways to solve this issue: * A. Reference count the usage of the storages, and free when the last program is detached. * B. Free only when the storage is impossible to be referred to again, i.e. when either the cgroup_bpf it is attached to, or the map itself, is freed. Option A has the side effect that, when the user detach and reattach a program, whether the program gets a fresh storage depends on whether there is another program attached using that storage. This could trigger races if the user is multi-threaded, and since nondeterminism in data races is evil, go with option B. The both the map and the cgroup_bpf now tracks their associated storages, and the storage unlink and free are removed from cgroup_bpf_detach and added to cgroup_bpf_release and cgroup_storage_map_free. The latter also new holds the cgroup_mutex to prevent any races with the former. Fourth, on attach, we reuse the old storage if the key already exists in the map, via cgroup_storage_lookup. If the storage does not exist yet, we create a new one, and publish it at the last step in the attach process. This does not create a race condition because for the whole attach the cgroup_mutex is held. We keep track of an array of new storages that was allocated and if the process fails only the new storages would get freed. Signed-off-by: YiFei Zhu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/d5401c6106728a00890401190db40020a1f84ff1.1595565795.git.zhuyifei@google.com --- include/linux/bpf-cgroup.h | 12 ++- kernel/bpf/cgroup.c | 67 ++++++++------ kernel/bpf/core.c | 12 --- kernel/bpf/local_storage.c | 216 ++++++++++++++++++++++++--------------------- 4 files changed, 164 insertions(+), 143 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 2c6f26670acc..64f367044e25 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -46,7 +46,8 @@ struct bpf_cgroup_storage { }; struct bpf_cgroup_storage_map *map; struct bpf_cgroup_storage_key key; - struct list_head list; + struct list_head list_map; + struct list_head list_cg; struct rb_node node; struct rcu_head rcu; }; @@ -78,6 +79,9 @@ struct cgroup_bpf { struct list_head progs[MAX_BPF_ATTACH_TYPE]; u32 flags[MAX_BPF_ATTACH_TYPE]; + /* list of cgroup shared storages */ + struct list_head storages; + /* temp storage for effective prog array used by prog_attach/detach */ struct bpf_prog_array *inactive; @@ -161,6 +165,9 @@ static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage this_cpu_write(bpf_cgroup_storage[stype], storage[stype]); } +struct bpf_cgroup_storage * +cgroup_storage_lookup(struct bpf_cgroup_storage_map *map, + void *key, bool locked); struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, enum bpf_cgroup_storage_type stype); void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage); @@ -169,7 +176,6 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, enum bpf_attach_type type); void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map); -void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *map); int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, @@ -383,8 +389,6 @@ static inline void bpf_cgroup_storage_set( struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) {} static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map) { return 0; } -static inline void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, - struct bpf_map *map) {} static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return NULL; } static inline void bpf_cgroup_storage_free( diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ac53102e244a..957cce1d5168 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -37,17 +37,34 @@ static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[]) } static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[], - struct bpf_prog *prog) + struct bpf_cgroup_storage *new_storages[], + enum bpf_attach_type type, + struct bpf_prog *prog, + struct cgroup *cgrp) { enum bpf_cgroup_storage_type stype; + struct bpf_cgroup_storage_key key; + struct bpf_map *map; + + key.cgroup_inode_id = cgroup_id(cgrp); + key.attach_type = type; for_each_cgroup_storage_type(stype) { + map = prog->aux->cgroup_storage[stype]; + if (!map) + continue; + + storages[stype] = cgroup_storage_lookup((void *)map, &key, false); + if (storages[stype]) + continue; + storages[stype] = bpf_cgroup_storage_alloc(prog, stype); if (IS_ERR(storages[stype])) { - storages[stype] = NULL; - bpf_cgroup_storages_free(storages); + bpf_cgroup_storages_free(new_storages); return -ENOMEM; } + + new_storages[stype] = storages[stype]; } return 0; @@ -63,7 +80,7 @@ static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[], } static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[], - struct cgroup* cgrp, + struct cgroup *cgrp, enum bpf_attach_type attach_type) { enum bpf_cgroup_storage_type stype; @@ -72,14 +89,6 @@ static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[], bpf_cgroup_storage_link(storages[stype], cgrp, attach_type); } -static void bpf_cgroup_storages_unlink(struct bpf_cgroup_storage *storages[]) -{ - enum bpf_cgroup_storage_type stype; - - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_unlink(storages[stype]); -} - /* Called when bpf_cgroup_link is auto-detached from dying cgroup. * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It * doesn't free link memory, which will eventually be done by bpf_link's @@ -101,22 +110,23 @@ static void cgroup_bpf_release(struct work_struct *work) struct cgroup *p, *cgrp = container_of(work, struct cgroup, bpf.release_work); struct bpf_prog_array *old_array; + struct list_head *storages = &cgrp->bpf.storages; + struct bpf_cgroup_storage *storage, *stmp; + unsigned int type; mutex_lock(&cgroup_mutex); for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { struct list_head *progs = &cgrp->bpf.progs[type]; - struct bpf_prog_list *pl, *tmp; + struct bpf_prog_list *pl, *pltmp; - list_for_each_entry_safe(pl, tmp, progs, node) { + list_for_each_entry_safe(pl, pltmp, progs, node) { list_del(&pl->node); if (pl->prog) bpf_prog_put(pl->prog); if (pl->link) bpf_cgroup_link_auto_detach(pl->link); - bpf_cgroup_storages_unlink(pl->storage); - bpf_cgroup_storages_free(pl->storage); kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } @@ -126,6 +136,11 @@ static void cgroup_bpf_release(struct work_struct *work) bpf_prog_array_free(old_array); } + list_for_each_entry_safe(storage, stmp, storages, list_cg) { + bpf_cgroup_storage_unlink(storage); + bpf_cgroup_storage_free(storage); + } + mutex_unlock(&cgroup_mutex); for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) @@ -290,6 +305,8 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) for (i = 0; i < NR; i++) INIT_LIST_HEAD(&cgrp->bpf.progs[i]); + INIT_LIST_HEAD(&cgrp->bpf.storages); + for (i = 0; i < NR; i++) if (compute_effective_progs(cgrp, i, &arrays[i])) goto cleanup; @@ -422,7 +439,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; - struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_prog_list *pl; int err; @@ -455,17 +472,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, if (IS_ERR(pl)) return PTR_ERR(pl); - if (bpf_cgroup_storages_alloc(storage, prog ? : link->link.prog)) + if (bpf_cgroup_storages_alloc(storage, new_storage, type, + prog ? : link->link.prog, cgrp)) return -ENOMEM; if (pl) { old_prog = pl->prog; - bpf_cgroup_storages_unlink(pl->storage); - bpf_cgroup_storages_assign(old_storage, pl->storage); } else { pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { - bpf_cgroup_storages_free(storage); + bpf_cgroup_storages_free(new_storage); return -ENOMEM; } list_add_tail(&pl->node, progs); @@ -480,12 +496,11 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, if (err) goto cleanup; - bpf_cgroup_storages_free(old_storage); if (old_prog) bpf_prog_put(old_prog); else static_branch_inc(&cgroup_bpf_enabled_key); - bpf_cgroup_storages_link(pl->storage, cgrp, type); + bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; cleanup: @@ -493,9 +508,7 @@ cleanup: pl->prog = old_prog; pl->link = NULL; } - bpf_cgroup_storages_free(pl->storage); - bpf_cgroup_storages_assign(pl->storage, old_storage); - bpf_cgroup_storages_link(pl->storage, cgrp, type); + bpf_cgroup_storages_free(new_storage); if (!old_prog) { list_del(&pl->node); kfree(pl); @@ -679,8 +692,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, /* now can actually delete it from this cgroup list */ list_del(&pl->node); - bpf_cgroup_storages_unlink(pl->storage); - bpf_cgroup_storages_free(pl->storage); kfree(pl); if (list_empty(progs)) /* last program was detached, reset flags to zero */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7be02e555ab9..bde93344164d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2097,24 +2097,12 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array, : 0; } -static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux) -{ - enum bpf_cgroup_storage_type stype; - - for_each_cgroup_storage_type(stype) { - if (!aux->cgroup_storage[stype]) - continue; - bpf_cgroup_storage_release(aux, aux->cgroup_storage[stype]); - } -} - void __bpf_free_used_maps(struct bpf_prog_aux *aux, struct bpf_map **used_maps, u32 len) { struct bpf_map *map; u32 i; - bpf_free_cgroup_storage(aux); for (i = 0; i < len; i++) { map = used_maps[i]; if (map->ops->map_poke_untrack) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 51bd5a8cb01b..3b2c70197d78 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -9,6 +9,8 @@ #include #include +#include "../cgroup/cgroup-internal.h" + DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); #ifdef CONFIG_CGROUP_BPF @@ -20,7 +22,6 @@ struct bpf_cgroup_storage_map { struct bpf_map map; spinlock_t lock; - struct bpf_prog_aux *aux; struct rb_root root; struct list_head list; }; @@ -30,24 +31,41 @@ static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map) return container_of(map, struct bpf_cgroup_storage_map, map); } -static int bpf_cgroup_storage_key_cmp( - const struct bpf_cgroup_storage_key *key1, - const struct bpf_cgroup_storage_key *key2) +static bool attach_type_isolated(const struct bpf_map *map) { - if (key1->cgroup_inode_id < key2->cgroup_inode_id) - return -1; - else if (key1->cgroup_inode_id > key2->cgroup_inode_id) - return 1; - else if (key1->attach_type < key2->attach_type) - return -1; - else if (key1->attach_type > key2->attach_type) - return 1; + return map->key_size == sizeof(struct bpf_cgroup_storage_key); +} + +static int bpf_cgroup_storage_key_cmp(const struct bpf_cgroup_storage_map *map, + const void *_key1, const void *_key2) +{ + if (attach_type_isolated(&map->map)) { + const struct bpf_cgroup_storage_key *key1 = _key1; + const struct bpf_cgroup_storage_key *key2 = _key2; + + if (key1->cgroup_inode_id < key2->cgroup_inode_id) + return -1; + else if (key1->cgroup_inode_id > key2->cgroup_inode_id) + return 1; + else if (key1->attach_type < key2->attach_type) + return -1; + else if (key1->attach_type > key2->attach_type) + return 1; + } else { + const __u64 *cgroup_inode_id1 = _key1; + const __u64 *cgroup_inode_id2 = _key2; + + if (*cgroup_inode_id1 < *cgroup_inode_id2) + return -1; + else if (*cgroup_inode_id1 > *cgroup_inode_id2) + return 1; + } return 0; } -static struct bpf_cgroup_storage *cgroup_storage_lookup( - struct bpf_cgroup_storage_map *map, struct bpf_cgroup_storage_key *key, - bool locked) +struct bpf_cgroup_storage * +cgroup_storage_lookup(struct bpf_cgroup_storage_map *map, + void *key, bool locked) { struct rb_root *root = &map->root; struct rb_node *node; @@ -61,7 +79,7 @@ static struct bpf_cgroup_storage *cgroup_storage_lookup( storage = container_of(node, struct bpf_cgroup_storage, node); - switch (bpf_cgroup_storage_key_cmp(key, &storage->key)) { + switch (bpf_cgroup_storage_key_cmp(map, key, &storage->key)) { case -1: node = node->rb_left; break; @@ -93,7 +111,7 @@ static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, this = container_of(*new, struct bpf_cgroup_storage, node); parent = *new; - switch (bpf_cgroup_storage_key_cmp(&storage->key, &this->key)) { + switch (bpf_cgroup_storage_key_cmp(map, &storage->key, &this->key)) { case -1: new = &((*new)->rb_left); break; @@ -111,10 +129,9 @@ static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, return 0; } -static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key) +static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; storage = cgroup_storage_lookup(map, key, false); @@ -124,17 +141,13 @@ static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key) return &READ_ONCE(storage->buf)->data[0]; } -static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, +static int cgroup_storage_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; struct bpf_storage_buffer *new; - if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST))) - return -EINVAL; - - if (unlikely(flags & BPF_NOEXIST)) + if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST))) return -EINVAL; if (unlikely((flags & BPF_F_LOCK) && @@ -167,11 +180,10 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, return 0; } -int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key, +int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key, void *value) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; int cpu, off = 0; u32 size; @@ -197,11 +209,10 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key, return 0; } -int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key, +int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key, void *value, u64 map_flags) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; int cpu, off = 0; u32 size; @@ -232,12 +243,10 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key, return 0; } -static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key, +static int cgroup_storage_get_next_key(struct bpf_map *_map, void *key, void *_next_key) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); - struct bpf_cgroup_storage_key *key = _key; - struct bpf_cgroup_storage_key *next = _next_key; struct bpf_cgroup_storage *storage; spin_lock_bh(&map->lock); @@ -250,17 +259,23 @@ static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key, if (!storage) goto enoent; - storage = list_next_entry(storage, list); + storage = list_next_entry(storage, list_map); if (!storage) goto enoent; } else { storage = list_first_entry(&map->list, - struct bpf_cgroup_storage, list); + struct bpf_cgroup_storage, list_map); } spin_unlock_bh(&map->lock); - next->attach_type = storage->key.attach_type; - next->cgroup_inode_id = storage->key.cgroup_inode_id; + + if (attach_type_isolated(&map->map)) { + struct bpf_cgroup_storage_key *next = _next_key; + *next = storage->key; + } else { + __u64 *next = _next_key; + *next = storage->key.cgroup_inode_id; + } return 0; enoent: @@ -275,7 +290,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) struct bpf_map_memory mem; int ret; - if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) + if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) && + attr->key_size != sizeof(__u64)) return ERR_PTR(-EINVAL); if (attr->value_size == 0) @@ -318,6 +334,17 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) static void cgroup_storage_map_free(struct bpf_map *_map) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct list_head *storages = &map->list; + struct bpf_cgroup_storage *storage, *stmp; + + mutex_lock(&cgroup_mutex); + + list_for_each_entry_safe(storage, stmp, storages, list_map) { + bpf_cgroup_storage_unlink(storage); + bpf_cgroup_storage_free(storage); + } + + mutex_unlock(&cgroup_mutex); WARN_ON(!RB_EMPTY_ROOT(&map->root)); WARN_ON(!list_empty(&map->list)); @@ -335,49 +362,63 @@ static int cgroup_storage_check_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type) { - struct btf_member *m; - u32 offset, size; - - /* Key is expected to be of struct bpf_cgroup_storage_key type, - * which is: - * struct bpf_cgroup_storage_key { - * __u64 cgroup_inode_id; - * __u32 attach_type; - * }; - */ + if (attach_type_isolated(map)) { + struct btf_member *m; + u32 offset, size; + + /* Key is expected to be of struct bpf_cgroup_storage_key type, + * which is: + * struct bpf_cgroup_storage_key { + * __u64 cgroup_inode_id; + * __u32 attach_type; + * }; + */ + + /* + * Key_type must be a structure with two fields. + */ + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT || + BTF_INFO_VLEN(key_type->info) != 2) + return -EINVAL; + + /* + * The first field must be a 64 bit integer at 0 offset. + */ + m = (struct btf_member *)(key_type + 1); + size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id); + if (!btf_member_is_reg_int(btf, key_type, m, 0, size)) + return -EINVAL; + + /* + * The second field must be a 32 bit integer at 64 bit offset. + */ + m++; + offset = offsetof(struct bpf_cgroup_storage_key, attach_type); + size = sizeof_field(struct bpf_cgroup_storage_key, attach_type); + if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) + return -EINVAL; + } else { + u32 int_data; - /* - * Key_type must be a structure with two fields. - */ - if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT || - BTF_INFO_VLEN(key_type->info) != 2) - return -EINVAL; + /* + * Key is expected to be u64, which stores the cgroup_inode_id + */ - /* - * The first field must be a 64 bit integer at 0 offset. - */ - m = (struct btf_member *)(key_type + 1); - size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id); - if (!btf_member_is_reg_int(btf, key_type, m, 0, size)) - return -EINVAL; + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + return -EINVAL; - /* - * The second field must be a 32 bit integer at 64 bit offset. - */ - m++; - offset = offsetof(struct bpf_cgroup_storage_key, attach_type); - size = sizeof_field(struct bpf_cgroup_storage_key, attach_type); - if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) - return -EINVAL; + int_data = *(u32 *)(key_type + 1); + if (BTF_INT_BITS(int_data) != 64 || BTF_INT_OFFSET(int_data)) + return -EINVAL; + } return 0; } -static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *_key, +static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; int cpu; @@ -426,38 +467,13 @@ const struct bpf_map_ops cgroup_storage_map_ops = { int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map) { enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); - struct bpf_cgroup_storage_map *map = map_to_storage(_map); - int ret = -EBUSY; - - spin_lock_bh(&map->lock); - if (map->aux && map->aux != aux) - goto unlock; if (aux->cgroup_storage[stype] && aux->cgroup_storage[stype] != _map) - goto unlock; + return -EBUSY; - map->aux = aux; aux->cgroup_storage[stype] = _map; - ret = 0; -unlock: - spin_unlock_bh(&map->lock); - - return ret; -} - -void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *_map) -{ - enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); - struct bpf_cgroup_storage_map *map = map_to_storage(_map); - - spin_lock_bh(&map->lock); - if (map->aux == aux) { - WARN_ON(aux->cgroup_storage[stype] != _map); - map->aux = NULL; - aux->cgroup_storage[stype] = NULL; - } - spin_unlock_bh(&map->lock); + return 0; } static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages) @@ -578,7 +594,8 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, spin_lock_bh(&map->lock); WARN_ON(cgroup_storage_insert(map, storage)); - list_add(&storage->list, &map->list); + list_add(&storage->list_map, &map->list); + list_add(&storage->list_cg, &cgroup->bpf.storages); spin_unlock_bh(&map->lock); } @@ -596,7 +613,8 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage) root = &map->root; rb_erase(&storage->node, root); - list_del(&storage->list); + list_del(&storage->list_map); + list_del(&storage->list_cg); spin_unlock_bh(&map->lock); } -- cgit v1.2.3 From dfcdf0e9ad2e006196986f363c99b2097aec5ef0 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Fri, 24 Jul 2020 16:17:53 -0500 Subject: bpf/local_storage: Fix build without CONFIG_CGROUP local_storage.o has its compile guard as CONFIG_BPF_SYSCALL, which does not imply that CONFIG_CGROUP is on. Including cgroup-internal.h when CONFIG_CGROUP is off cause a compilation failure. Fixes: f67cfc233706 ("bpf: Make cgroup storages shared between programs on the same cgroup") Reported-by: kernel test robot Signed-off-by: YiFei Zhu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200724211753.902969-1-zhuyifei1999@gmail.com --- kernel/bpf/local_storage.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 3b2c70197d78..571bb351ed3b 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -9,12 +9,12 @@ #include #include -#include "../cgroup/cgroup-internal.h" - DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); #ifdef CONFIG_CGROUP_BPF +#include "../cgroup/cgroup-internal.h" + #define LOCAL_STORAGE_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) -- cgit v1.2.3 From 2b9b305fcdda1810bdffeb599361174eb2cd0b7c Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 24 Jul 2020 13:05:02 -0700 Subject: bpf: Fix build on architectures with special bpf_user_pt_regs_t Architectures like s390, powerpc, arm64, riscv have speical definition of bpf_user_pt_regs_t. So we need to cast the pointer before passing it to bpf_get_stack(). This is similar to bpf_get_stack_tp(). Fixes: 03d42fd2d83f ("bpf: Separate bpf_get_[stack|stackid] for perf events BPF") Reported-by: kernel test robot Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200724200503.3629591-1-songliubraving@fb.com --- kernel/bpf/stackmap.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 5beb2f8c23da..4fd830a62be2 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -678,6 +678,7 @@ const struct bpf_func_proto bpf_get_task_stack_proto = { BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, void *, buf, u32, size, u64, flags) { + struct pt_regs *regs = (struct pt_regs *)(ctx->regs); struct perf_event *event = ctx->event; struct perf_callchain_entry *trace; bool kernel, user; @@ -685,7 +686,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, __u64 nr_kernel; if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) - return __bpf_get_stack(ctx->regs, NULL, NULL, buf, size, flags); + return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_USER_BUILD_ID))) @@ -705,8 +706,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, __u64 nr = trace->nr; trace->nr = nr_kernel; - err = __bpf_get_stack(ctx->regs, NULL, trace, buf, - size, flags); + err = __bpf_get_stack(regs, NULL, trace, buf, size, flags); /* restore nr */ trace->nr = nr; @@ -718,8 +718,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, goto clear; flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; - err = __bpf_get_stack(ctx->regs, NULL, trace, buf, - size, flags); + err = __bpf_get_stack(regs, NULL, trace, buf, size, flags); } return err; -- cgit v1.2.3 From aa8d3a716b59db6c1ad6c68fb8aa05e31980da60 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 Jul 2020 23:45:57 -0700 Subject: bpf, xdp: Add bpf_link-based XDP attachment API Add bpf_link-based API (bpf_xdp_link) to attach BPF XDP program through BPF_LINK_CREATE command. bpf_xdp_link is mutually exclusive with direct BPF program attachment, previous BPF program should be detached prior to attempting to create a new bpf_xdp_link attachment (for a given XDP mode). Once BPF link is attached, it can't be replaced by other BPF program attachment or link attachment. It will be detached only when the last BPF link FD is closed. bpf_xdp_link will be auto-detached when net_device is shutdown, similarly to how other BPF links behave (cgroup, flow_dissector). At that point bpf_link will become defunct, but won't be destroyed until last FD is closed. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200722064603.3350758-5-andriin@fb.com --- include/linux/netdevice.h | 4 ++ include/uapi/linux/bpf.h | 7 +- kernel/bpf/syscall.c | 5 ++ net/core/dev.c | 169 ++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 178 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cad44b40c776..7d3c412fcfe5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -888,6 +888,7 @@ struct bpf_prog_offload_ops; struct netlink_ext_ack; struct xdp_umem; struct xdp_dev_bulk_queue; +struct bpf_xdp_link; enum bpf_xdp_mode { XDP_MODE_SKB = 0, @@ -898,6 +899,7 @@ enum bpf_xdp_mode { struct bpf_xdp_entity { struct bpf_prog *prog; + struct bpf_xdp_link *link; }; struct netdev_bpf { @@ -3831,7 +3833,9 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, int expected_fd, u32 flags); +int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); + int xdp_umem_query(struct net_device *dev, u16 queue_id); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 828c2f6438f2..87823fb9c123 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -230,6 +230,7 @@ enum bpf_attach_type { BPF_CGROUP_INET_SOCK_RELEASE, BPF_XDP_CPUMAP, BPF_SK_LOOKUP, + BPF_XDP, __MAX_BPF_ATTACH_TYPE }; @@ -242,6 +243,7 @@ enum bpf_link_type { BPF_LINK_TYPE_CGROUP = 3, BPF_LINK_TYPE_ITER = 4, BPF_LINK_TYPE_NETNS = 5, + BPF_LINK_TYPE_XDP = 6, MAX_BPF_LINK_TYPE, }; @@ -614,7 +616,10 @@ union bpf_attr { struct { /* struct used by BPF_LINK_CREATE command */ __u32 prog_fd; /* eBPF program to attach */ - __u32 target_fd; /* object to attach to */ + union { + __u32 target_fd; /* object to attach to */ + __u32 target_ifindex; /* target ifindex */ + }; __u32 attach_type; /* attach type */ __u32 flags; /* extra flags */ } link_create; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ee290b1f2d9e..0e8c88db7e7a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2824,6 +2824,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_TRACING; case BPF_SK_LOOKUP: return BPF_PROG_TYPE_SK_LOOKUP; + case BPF_XDP: + return BPF_PROG_TYPE_XDP; default: return BPF_PROG_TYPE_UNSPEC; } @@ -3921,6 +3923,9 @@ static int link_create(union bpf_attr *attr) case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; + case BPF_PROG_TYPE_XDP: + ret = bpf_xdp_link_attach(attr, prog); + break; default: ret = -EINVAL; } diff --git a/net/core/dev.c b/net/core/dev.c index 521ce031ee35..e24248f3d675 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8716,6 +8716,12 @@ int dev_change_proto_down_generic(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down_generic); +struct bpf_xdp_link { + struct bpf_link link; + struct net_device *dev; /* protected by rtnl_lock, no refcnt held */ + int flags; +}; + static enum bpf_xdp_mode dev_xdp_mode(u32 flags) { if (flags & XDP_FLAGS_HW_MODE) @@ -8738,9 +8744,19 @@ static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) }; } +static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, + enum bpf_xdp_mode mode) +{ + return dev->xdp_state[mode].link; +} + static struct bpf_prog *dev_xdp_prog(struct net_device *dev, enum bpf_xdp_mode mode) { + struct bpf_xdp_link *link = dev_xdp_link(dev, mode); + + if (link) + return link->link.prog; return dev->xdp_state[mode].prog; } @@ -8751,9 +8767,17 @@ u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) return prog ? prog->aux->id : 0; } +static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode, + struct bpf_xdp_link *link) +{ + dev->xdp_state[mode].link = link; + dev->xdp_state[mode].prog = NULL; +} + static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode, struct bpf_prog *prog) { + dev->xdp_state[mode].link = NULL; dev->xdp_state[mode].prog = prog; } @@ -8793,6 +8817,7 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, static void dev_xdp_uninstall(struct net_device *dev) { + struct bpf_xdp_link *link; struct bpf_prog *prog; enum bpf_xdp_mode mode; bpf_op_t bpf_op; @@ -8810,14 +8835,20 @@ static void dev_xdp_uninstall(struct net_device *dev) WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); - bpf_prog_put(prog); - dev_xdp_set_prog(dev, mode, NULL); + /* auto-detach link from net device */ + link = dev_xdp_link(dev, mode); + if (link) + link->dev = NULL; + else + bpf_prog_put(prog); + + dev_xdp_set_link(dev, mode, NULL); } } static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack, - struct bpf_prog *new_prog, struct bpf_prog *old_prog, - u32 flags) + struct bpf_xdp_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog, u32 flags) { struct bpf_prog *cur_prog; enum bpf_xdp_mode mode; @@ -8826,6 +8857,14 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack ASSERT_RTNL(); + /* either link or prog attachment, never both */ + if (link && (new_prog || old_prog)) + return -EINVAL; + /* link supports only XDP mode flags */ + if (link && (flags & ~XDP_FLAGS_MODES)) { + NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); + return -EINVAL; + } /* just one XDP mode bit should be set, zero defaults to SKB mode */ if (hweight32(flags & XDP_FLAGS_MODES) > 1) { NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); @@ -8838,7 +8877,18 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack } mode = dev_xdp_mode(flags); + /* can't replace attached link */ + if (dev_xdp_link(dev, mode)) { + NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link"); + return -EBUSY; + } + cur_prog = dev_xdp_prog(dev, mode); + /* can't replace attached prog with link */ + if (link && cur_prog) { + NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link"); + return -EBUSY; + } if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) { NL_SET_ERR_MSG(extack, "Active program does not match expected"); return -EEXIST; @@ -8848,6 +8898,10 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack return -EBUSY; } + /* put effective new program into new_prog */ + if (link) + new_prog = link->link.prog; + if (new_prog) { bool offload = mode == XDP_MODE_HW; enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB @@ -8884,13 +8938,116 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack return err; } - dev_xdp_set_prog(dev, mode, new_prog); + if (link) + dev_xdp_set_link(dev, mode, link); + else + dev_xdp_set_prog(dev, mode, new_prog); if (cur_prog) bpf_prog_put(cur_prog); return 0; } +static int dev_xdp_attach_link(struct net_device *dev, + struct netlink_ext_ack *extack, + struct bpf_xdp_link *link) +{ + return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags); +} + +static int dev_xdp_detach_link(struct net_device *dev, + struct netlink_ext_ack *extack, + struct bpf_xdp_link *link) +{ + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; + + ASSERT_RTNL(); + + mode = dev_xdp_mode(link->flags); + if (dev_xdp_link(dev, mode) != link) + return -EINVAL; + + bpf_op = dev_xdp_bpf_op(dev, mode); + WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); + dev_xdp_set_link(dev, mode, NULL); + return 0; +} + +static void bpf_xdp_link_release(struct bpf_link *link) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + + rtnl_lock(); + + /* if racing with net_device's tear down, xdp_link->dev might be + * already NULL, in which case link was already auto-detached + */ + if (xdp_link->dev) + WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); + + rtnl_unlock(); +} + +static void bpf_xdp_link_dealloc(struct bpf_link *link) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + + kfree(xdp_link); +} + +static const struct bpf_link_ops bpf_xdp_link_lops = { + .release = bpf_xdp_link_release, + .dealloc = bpf_xdp_link_dealloc, +}; + +int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_link_primer link_primer; + struct bpf_xdp_link *link; + struct net_device *dev; + int err, fd; + + dev = dev_get_by_index(net, attr->link_create.target_ifindex); + if (!dev) + return -EINVAL; + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto out_put_dev; + } + + bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog); + link->dev = dev; + link->flags = attr->link_create.flags; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + goto out_put_dev; + } + + rtnl_lock(); + err = dev_xdp_attach_link(dev, NULL, link); + rtnl_unlock(); + + if (err) { + bpf_link_cleanup(&link_primer); + goto out_put_dev; + } + + fd = bpf_link_settle(&link_primer); + /* link itself doesn't hold dev's refcnt to not complicate shutdown */ + dev_put(dev); + return fd; + +out_put_dev: + dev_put(dev); + return err; +} + /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device @@ -8927,7 +9084,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, } } - err = dev_xdp_attach(dev, extack, new_prog, old_prog, flags); + err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags); err_out: if (err && new_prog) -- cgit v1.2.3 From aadfc2f957cb470a5a7e52cc41a2fa86e784bcd2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Jul 2020 11:19:51 +0200 Subject: entry: Correct 'noinstr' attributes The noinstr attribute is to be specified before the return type in the same way 'inline' is used. Similar cases were recently fixed for x86 in commit 7f6fa101dfac ("x86: Correct noinstr qualifiers"), but the generic entry code was based on the the original version and did not carry the fix over. Fixes: a5497bab5f72 ("entry: Provide generic interrupt entry/exit code") Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200725091951.744848-3-mingo@kernel.org --- kernel/entry/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 495f5c051b03..9852e0d62d95 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -256,7 +256,7 @@ noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) exit_to_user_mode(); } -irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs) +noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) { irqentry_state_t ret = { .exit_rcu = false, @@ -333,7 +333,7 @@ void irqentry_exit_cond_resched(void) } } -void noinstr irqentry_exit(struct pt_regs *regs, irqentry_state_t state) +noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) { lockdep_assert_irqs_disabled(); -- cgit v1.2.3 From 45e9504f109b84986ba9d25a273a5d5bd3f3e9d9 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Thu, 16 Jul 2020 16:39:05 +0800 Subject: genirq/irqdomain: Remove redundant NULL pointer check on fwnode The is_fwnode_irqchip() helper will check if the fwnode_handle is empty. There is no need to perform a redundant check outside of it. Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200716083905.287-1-yuzenghui@huawei.com --- kernel/irq/irqdomain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index a4c2c915511d..155460fc0147 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -142,7 +142,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, if (!domain) return NULL; - if (fwnode && is_fwnode_irqchip(fwnode)) { + if (is_fwnode_irqchip(fwnode)) { fwid = container_of(fwnode, struct irqchip_fwid, fwnode); switch (fwid->type) { -- cgit v1.2.3 From 8a667928cfae337b85bee4fb0dbc09f3c2715856 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 10 Jul 2020 23:18:22 +0000 Subject: irqdomain: Export irq_domain_update_bus_token Add export for irq_domain_update_bus_token() so that we can allow drivers like the qcom-pdc driver to be loadable as a module. Signed-off-by: John Stultz Signed-off-by: Marc Zyngier Reviewed-by: Linus Walleij Cc: Andy Gross Cc: Bjorn Andersson Cc: Joerg Roedel Cc: Thomas Gleixner Cc: Jason Cooper Cc: Marc Zyngier Cc: Linus Walleij Cc: Maulik Shah Cc: Lina Iyer Cc: Saravana Kannan Cc: Todd Kjos Cc: Greg Kroah-Hartman Cc: linux-arm-msm@vger.kernel.org Cc: iommu@lists.linux-foundation.org Cc: linux-gpio@vger.kernel.org Link: https://lore.kernel.org/r/20200710231824.60699-2-john.stultz@linaro.org --- kernel/irq/irqdomain.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 155460fc0147..76cd7ebd1178 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -281,6 +281,7 @@ void irq_domain_update_bus_token(struct irq_domain *domain, mutex_unlock(&irq_domain_mutex); } +EXPORT_SYMBOL_GPL(irq_domain_update_bus_token); /** * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs -- cgit v1.2.3 From 8d16f5b979660c0fdcfa21a418cc03f1fde60cf7 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 10 Jul 2020 23:18:23 +0000 Subject: genirq: Export irq_chip_retrigger_hierarchy and irq_chip_set_vcpu_affinity_parent Add EXPORT_SYMBOL_GPL entries for irq_chip_retrigger_hierarchy() and irq_chip_set_vcpu_affinity_parent() so that we can allow drivers like the qcom-pdc driver to be loadable as a module. Signed-off-by: John Stultz Signed-off-by: Marc Zyngier Reviewed-by: Linus Walleij Cc: Andy Gross Cc: Bjorn Andersson Cc: Joerg Roedel Cc: Thomas Gleixner Cc: Jason Cooper Cc: Marc Zyngier Cc: Linus Walleij Cc: Maulik Shah Cc: Lina Iyer Cc: Saravana Kannan Cc: Todd Kjos Cc: Greg Kroah-Hartman Cc: linux-arm-msm@vger.kernel.org Cc: iommu@lists.linux-foundation.org Cc: linux-gpio@vger.kernel.org Link: https://lore.kernel.org/r/20200710231824.60699-3-john.stultz@linaro.org --- kernel/irq/chip.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 41e7e37a0928..ba6ce66d7ed6 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1478,6 +1478,7 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data) return 0; } +EXPORT_SYMBOL_GPL(irq_chip_retrigger_hierarchy); /** * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt @@ -1492,7 +1493,7 @@ int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info) return -ENOSYS; } - +EXPORT_SYMBOL_GPL(irq_chip_set_vcpu_affinity_parent); /** * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt * @data: Pointer to interrupt specific data -- cgit v1.2.3 From ed00495333ccc80fc8fb86fb43773c3c2a499466 Mon Sep 17 00:00:00 2001 From: "peterz@infradead.org" Date: Mon, 27 Jul 2020 14:48:52 +0200 Subject: locking/lockdep: Fix TRACE_IRQFLAGS vs. NMIs Prior to commit: 859d069ee1dd ("lockdep: Prepare for NMI IRQ state tracking") IRQ state tracking was disabled in NMIs due to nmi_enter() doing lockdep_off() -- with the obvious requirement that NMI entry call nmi_enter() before trace_hardirqs_off(). [ AFAICT, PowerPC and SH violate this order on their NMI entry ] However, that commit explicitly changed lockdep_hardirqs_*() to ignore lockdep_off() and breaks every architecture that has irq-tracing in it's NMI entry that hasn't been fixed up (x86 being the only fixed one at this point). The reason for this change is that by ignoring lockdep_off() we can: - get rid of 'current->lockdep_recursion' in lockdep_assert_irqs*() which was going to to give header-recursion issues with the seqlock rework. - allow these lockdep_assert_*() macros to function in NMI context. Restore the previous state of things and allow an architecture to opt-in to the NMI IRQ tracking support, however instead of relying on lockdep_off(), rely on in_nmi(), both are part of nmi_enter() and so over-all entry ordering doesn't need to change. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200727124852.GK119549@hirez.programming.kicks-ass.net --- arch/x86/Kconfig.debug | 3 +++ kernel/locking/lockdep.c | 8 +++++++- lib/Kconfig.debug | 6 ++++++ 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 0dd319e6e5b4..ee1d3c5834c6 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -3,6 +3,9 @@ config TRACE_IRQFLAGS_SUPPORT def_bool y +config TRACE_IRQFLAGS_NMI_SUPPORT + def_bool y + config EARLY_PRINTK_USB bool diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index d595623c4b34..8b0b28b4546b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3712,6 +3712,9 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) * and not rely on hardware state like normal interrupts. */ if (unlikely(in_nmi())) { + if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI)) + return; + /* * Skip: * - recursion check, because NMI can hit lockdep; @@ -3773,7 +3776,10 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) * they will restore the software state. This ensures the software * state is consistent inside NMIs as well. */ - if (unlikely(!in_nmi() && (current->lockdep_recursion & LOCKDEP_RECURSION_MASK))) + if (in_nmi()) { + if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI)) + return; + } else if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK) return; /* diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 9ad9210d70a1..fa964b51f066 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1325,11 +1325,17 @@ config WW_MUTEX_SELFTEST endmenu # lock debugging config TRACE_IRQFLAGS + depends on TRACE_IRQFLAGS_SUPPORT bool help Enables hooks to interrupt enabling and disabling for either tracing or lock debugging. +config TRACE_IRQFLAGS_NMI + def_bool y + depends on TRACE_IRQFLAGS + depends on TRACE_IRQFLAGS_NMI_SUPPORT + config STACKTRACE bool "Stack backtrace support" depends on STACKTRACE_SUPPORT -- cgit v1.2.3 From f0c7baca180046824e07fc5f1326e83a8fd150c7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Jul 2020 22:44:41 +0200 Subject: genirq/affinity: Make affinity setting if activated opt-in John reported that on a RK3288 system the perf per CPU interrupts are all affine to CPU0 and provided the analysis: "It looks like what happens is that because the interrupts are not per-CPU in the hardware, armpmu_request_irq() calls irq_force_affinity() while the interrupt is deactivated and then request_irq() with IRQF_PERCPU | IRQF_NOBALANCING. Now when irq_startup() runs with IRQ_STARTUP_NORMAL, it calls irq_setup_affinity() which returns early because IRQF_PERCPU and IRQF_NOBALANCING are set, leaving the interrupt on its original CPU." This was broken by the recent commit which blocked interrupt affinity setting in hardware before activation of the interrupt. While this works in general, it does not work for this particular case. As contrary to the initial analysis not all interrupt chip drivers implement an activate callback, the safe cure is to make the deferred interrupt affinity setting at activation time opt-in. Implement the necessary core logic and make the two irqchip implementations for which this is required opt-in. In hindsight this would have been the right thing to do, but ... Fixes: baedb87d1b53 ("genirq/affinity: Handle affinity setting on inactive interrupts correctly") Reported-by: John Keeping Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Acked-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/87blk4tzgm.fsf@nanos.tec.linutronix.de --- arch/x86/kernel/apic/vector.c | 4 ++++ drivers/irqchip/irq-gic-v3-its.c | 5 ++++- include/linux/irq.h | 13 +++++++++++++ kernel/irq/manage.c | 6 +++++- 4 files changed, 26 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 7649da2478d8..dae32d948bf2 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -560,6 +560,10 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, * as that can corrupt the affinity move state. */ irqd_set_handle_enforce_irqctx(irqd); + + /* Don't invoke affinity setter on deactivated interrupts */ + irqd_set_affinity_on_activate(irqd); + /* * Legacy vectors are already assigned when the IOAPIC * takes them over. They stay on the same vector. This is diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index beac4caefad9..103d850b5595 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -3523,6 +3523,7 @@ static int its_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, msi_alloc_info_t *info = args; struct its_device *its_dev = info->scratchpad[0].ptr; struct its_node *its = its_dev->its; + struct irq_data *irqd; irq_hw_number_t hwirq; int err; int i; @@ -3542,7 +3543,9 @@ static int its_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, &its_irq_chip, its_dev); - irqd_set_single_target(irq_desc_get_irq_data(irq_to_desc(virq + i))); + irqd = irq_get_irq_data(virq + i); + irqd_set_single_target(irqd); + irqd_set_affinity_on_activate(irqd); pr_debug("ID:%d pID:%d vID:%d\n", (int)(hwirq + i - its_dev->event_map.lpi_base), (int)(hwirq + i), virq + i); diff --git a/include/linux/irq.h b/include/linux/irq.h index 8d5bc2c237d7..1b7f4dfee35b 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -213,6 +213,8 @@ struct irq_data { * required * IRQD_HANDLE_ENFORCE_IRQCTX - Enforce that handle_irq_*() is only invoked * from actual interrupt context. + * IRQD_AFFINITY_ON_ACTIVATE - Affinity is set on activation. Don't call + * irq_chip::irq_set_affinity() when deactivated. */ enum { IRQD_TRIGGER_MASK = 0xf, @@ -237,6 +239,7 @@ enum { IRQD_CAN_RESERVE = (1 << 26), IRQD_MSI_NOMASK_QUIRK = (1 << 27), IRQD_HANDLE_ENFORCE_IRQCTX = (1 << 28), + IRQD_AFFINITY_ON_ACTIVATE = (1 << 29), }; #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) @@ -421,6 +424,16 @@ static inline bool irqd_msi_nomask_quirk(struct irq_data *d) return __irqd_to_state(d) & IRQD_MSI_NOMASK_QUIRK; } +static inline void irqd_set_affinity_on_activate(struct irq_data *d) +{ + __irqd_to_state(d) |= IRQD_AFFINITY_ON_ACTIVATE; +} + +static inline bool irqd_affinity_on_activate(struct irq_data *d) +{ + return __irqd_to_state(d) & IRQD_AFFINITY_ON_ACTIVATE; +} + #undef __irqd_to_state static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2a9fec53e159..48c38e09c673 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -320,12 +320,16 @@ static bool irq_set_affinity_deactivated(struct irq_data *data, struct irq_desc *desc = irq_data_to_desc(data); /* + * Handle irq chips which can handle affinity only in activated + * state correctly + * * If the interrupt is not yet activated, just store the affinity * mask and do not call the chip driver at all. On activation the * driver has to make sure anyway that the interrupt is in a * useable state so startup works. */ - if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || irqd_is_activated(data)) + if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || + irqd_is_activated(data) || !irqd_affinity_on_activate(data)) return false; cpumask_copy(desc->irq_common_data.affinity, mask); -- cgit v1.2.3 From aa251fc5b936d3ddb4b4c4b36427eb9aa3347c82 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 25 Jul 2020 13:30:55 +0100 Subject: genirq/debugfs: Add missing irqchip flags Recently introduced irqchip flags lack the corresponding printouts in debugfs. Add them. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/874kpvydxc.wl-maz@kernel.org --- kernel/irq/debugfs.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 4f9f844074db..b95ff5d5f4bd 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -112,6 +112,7 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_AFFINITY_SET), BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING), BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), + BIT_MASK_DESCR(IRQD_AFFINITY_ON_ACTIVATE), BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), BIT_MASK_DESCR(IRQD_CAN_RESERVE), BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK), @@ -120,6 +121,10 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_WAKEUP_STATE), BIT_MASK_DESCR(IRQD_WAKEUP_ARMED), + + BIT_MASK_DESCR(IRQD_DEFAULT_TRIGGER_SET), + + BIT_MASK_DESCR(IRQD_HANDLE_ENFORCE_IRQCTX), }; static const struct irq_bit_descr irqdesc_states[] = { -- cgit v1.2.3 From b54cecf5e2293d15620f7b3f8d1bf486243d5643 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 7 Jun 2020 12:10:40 +0300 Subject: fsnotify: pass dir argument to handle_event() callback The 'inode' argument to handle_event(), sometimes referred to as 'to_tell' is somewhat obsolete. It is a remnant from the times when a group could only have an inode mark associated with an event. We now pass an iter_info array to the callback, with all marks associated with an event. Most backends ignore this argument, with two exceptions: 1. dnotify uses it for sanity check that event is on directory 2. fanotify uses it to report fid of directory on directory entry modification events Remove the 'inode' argument and add a 'dir' argument. The callback function signature is deliberately changed, because the meaning of the argument has changed and the arguments have been documented. The 'dir' argument is set to when 'file_name' is specified and it is referring to the directory that the 'file_name' entry belongs to. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/nfsd/filecache.c | 6 +++--- fs/notify/dnotify/dnotify.c | 8 ++++---- fs/notify/fanotify/fanotify.c | 23 +++++++++++------------ fs/notify/fsnotify.c | 26 ++++++++++++-------------- fs/notify/inotify/inotify.h | 6 +++--- fs/notify/inotify/inotify_fsnotify.c | 7 +++---- fs/notify/inotify/inotify_user.c | 4 ++-- include/linux/fsnotify_backend.h | 16 +++++++++++++--- kernel/audit_fsnotify.c | 10 +++++----- kernel/audit_tree.c | 6 +++--- kernel/audit_watch.c | 6 +++--- 11 files changed, 62 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index ace8e5c30952..bbc7892d2928 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -598,9 +598,9 @@ static struct notifier_block nfsd_file_lease_notifier = { }; static int -nfsd_file_fsnotify_handle_event(struct fsnotify_group *group, - struct inode *to_tell, - u32 mask, const void *data, int data_type, +nfsd_file_fsnotify_handle_event(struct fsnotify_group *group, u32 mask, + const void *data, int data_type, + struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) { diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 7a42c2ebe28d..2d2eadfb5186 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -70,9 +70,9 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) * destroy the dnotify struct if it was not registered to receive multiple * events. */ -static int dnotify_handle_event(struct fsnotify_group *group, - struct inode *inode, - u32 mask, const void *data, int data_type, +static int dnotify_handle_event(struct fsnotify_group *group, u32 mask, + const void *data, int data_type, + struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) { @@ -84,7 +84,7 @@ static int dnotify_handle_event(struct fsnotify_group *group, __u32 test_mask = mask & ~FS_EVENT_ON_CHILD; /* not a dir, dnotify doesn't care */ - if (!S_ISDIR(inode->i_mode)) + if (!dir && !(mask & FS_ISDIR)) return 0; if (WARN_ON(fsnotify_iter_vfsmount_mark(iter_info))) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 41f5fc9a8f19..e417c64c365b 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -335,11 +335,11 @@ out: * FS_ATTRIB reports the child inode even if reported on a watched parent. * FS_CREATE reports the modified dir inode and not the created inode. */ -static struct inode *fanotify_fid_inode(struct inode *to_tell, u32 event_mask, - const void *data, int data_type) +static struct inode *fanotify_fid_inode(u32 event_mask, const void *data, + int data_type, struct inode *dir) { if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS) - return to_tell; + return dir; return fsnotify_data_inode(data, data_type); } @@ -416,14 +416,14 @@ static struct fanotify_event *fanotify_alloc_name_event(struct inode *id, } static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, - struct inode *inode, u32 mask, - const void *data, int data_type, + u32 mask, const void *data, + int data_type, struct inode *dir, const struct qstr *file_name, __kernel_fsid_t *fsid) { struct fanotify_event *event = NULL; gfp_t gfp = GFP_KERNEL_ACCOUNT; - struct inode *id = fanotify_fid_inode(inode, mask, data, data_type); + struct inode *id = fanotify_fid_inode(mask, data, data_type, dir); const struct path *path = fsnotify_data_path(data, data_type); /* @@ -507,9 +507,9 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info) return fsid; } -static int fanotify_handle_event(struct fsnotify_group *group, - struct inode *inode, - u32 mask, const void *data, int data_type, +static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, + const void *data, int data_type, + struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) { @@ -546,8 +546,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, if (!mask) return 0; - pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, - mask); + pr_debug("%s: group=%p mask=%x\n", __func__, group, mask); if (fanotify_is_perm_event(mask)) { /* @@ -565,7 +564,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, return 0; } - event = fanotify_alloc_event(group, inode, mask, data, data_type, + event = fanotify_alloc_event(group, mask, data, data_type, dir, file_name, &fsid); ret = -ENOMEM; if (unlikely(!event)) { diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 30628a72ca01..c4ac4d13e10f 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -185,11 +185,9 @@ notify_child: } EXPORT_SYMBOL_GPL(__fsnotify_parent); -static int send_to_group(struct inode *to_tell, - __u32 mask, const void *data, - int data_is, u32 cookie, - const struct qstr *file_name, - struct fsnotify_iter_info *iter_info) +static int send_to_group(__u32 mask, const void *data, int data_type, + struct inode *dir, const struct qstr *file_name, + u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_group *group = NULL; __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); @@ -225,15 +223,14 @@ static int send_to_group(struct inode *to_tell, } } - pr_debug("%s: group=%p to_tell=%p mask=%x marks_mask=%x marks_ignored_mask=%x" - " data=%p data_is=%d cookie=%d\n", - __func__, group, to_tell, mask, marks_mask, marks_ignored_mask, - data, data_is, cookie); + pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignored_mask=%x data=%p data_type=%d dir=%p cookie=%d\n", + __func__, group, mask, marks_mask, marks_ignored_mask, + data, data_type, dir, cookie); if (!(test_mask & marks_mask & ~marks_ignored_mask)) return 0; - return group->ops->handle_event(group, to_tell, mask, data, data_is, + return group->ops->handle_event(group, mask, data, data_type, dir, file_name, cookie, iter_info); } @@ -317,12 +314,13 @@ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info) * out to all of the registered fsnotify_group. Those groups can then use the * notification event in whatever means they feel necessary. */ -int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, +int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_type, const struct qstr *file_name, u32 cookie) { - const struct path *path = fsnotify_data_path(data, data_is); + const struct path *path = fsnotify_data_path(data, data_type); struct fsnotify_iter_info iter_info = {}; struct super_block *sb = to_tell->i_sb; + struct inode *dir = file_name ? to_tell : NULL; struct mount *mnt = NULL; int ret = 0; __u32 test_mask, marks_mask; @@ -375,8 +373,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, * That's why this traversal is so complicated... */ while (fsnotify_iter_select_report_types(&iter_info)) { - ret = send_to_group(to_tell, mask, data, data_is, cookie, - file_name, &iter_info); + ret = send_to_group(mask, data, data_type, dir, file_name, + cookie, &iter_info); if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) goto out; diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index 3f246f7b8a92..4327d0e9c364 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -24,9 +24,9 @@ static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse) extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group); -extern int inotify_handle_event(struct fsnotify_group *group, - struct inode *inode, - u32 mask, const void *data, int data_type, +extern int inotify_handle_event(struct fsnotify_group *group, u32 mask, + const void *data, int data_type, + struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info); diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 9b481460a2dc..dfd455798a1b 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -55,9 +55,8 @@ static int inotify_merge(struct list_head *list, return event_compare(last_event, event); } -int inotify_handle_event(struct fsnotify_group *group, - struct inode *inode, - u32 mask, const void *data, int data_type, +int inotify_handle_event(struct fsnotify_group *group, u32 mask, + const void *data, int data_type, struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) { @@ -82,7 +81,7 @@ int inotify_handle_event(struct fsnotify_group *group, alloc_len += len + 1; } - pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, + pr_debug("%s: group=%p mark=%p mask=%x\n", __func__, group, inode_mark, mask); i_mark = container_of(inode_mark, struct inotify_inode_mark, diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index f88bbcc9efeb..5385d5817dd9 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -490,8 +490,8 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, fsn_mark); /* Queue ignore event for the watch */ - inotify_handle_event(group, NULL, FS_IN_IGNORED, NULL, - FSNOTIFY_EVENT_NONE, NULL, 0, &iter_info); + inotify_handle_event(group, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, + NULL, NULL, 0, &iter_info); i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); /* remove this mark from the idr */ diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 97300f3b8ff0..0de130cbf72d 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -108,6 +108,17 @@ struct mem_cgroup; * these operations for each relevant group. * * handle_event - main call for a group to handle an fs event + * @group: group to notify + * @mask: event type and flags + * @data: object that event happened on + * @data_type: type of object for fanotify_data_XXX() accessors + * @dir: optional directory associated with event - + * if @file_name is not NULL, this is the directory that + * @file_name is relative to + * @file_name: optional file name associated with event + * @cookie: inotify rename cookie + * @iter_info: array of marks from this group that are interested in the event + * * free_group_priv - called when a group refcnt hits 0 to clean up the private union * freeing_mark - called when a mark is being destroyed for some reason. The group * MUST be holding a reference on each mark and that reference must be @@ -115,9 +126,8 @@ struct mem_cgroup; * userspace messages that marks have been removed. */ struct fsnotify_ops { - int (*handle_event)(struct fsnotify_group *group, - struct inode *inode, - u32 mask, const void *data, int data_type, + int (*handle_event)(struct fsnotify_group *group, u32 mask, + const void *data, int data_type, struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info); void (*free_group_priv)(struct fsnotify_group *group); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 3596448bfdab..30ca239285a3 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -152,11 +152,11 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark) } /* Update mark data in audit rules based on fsnotify events. */ -static int audit_mark_handle_event(struct fsnotify_group *group, - struct inode *to_tell, - u32 mask, const void *data, int data_type, - const struct qstr *dname, u32 cookie, - struct fsnotify_iter_info *iter_info) +static int audit_mark_handle_event(struct fsnotify_group *group, u32 mask, + const void *data, int data_type, + struct inode *dir, + const struct qstr *dname, u32 cookie, + struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); struct audit_fsnotify_mark *audit_mark; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index e49c912f862d..2ce2ac1ce100 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -1037,9 +1037,9 @@ static void evict_chunk(struct audit_chunk *chunk) audit_schedule_prune(); } -static int audit_tree_handle_event(struct fsnotify_group *group, - struct inode *to_tell, - u32 mask, const void *data, int data_type, +static int audit_tree_handle_event(struct fsnotify_group *group, u32 mask, + const void *data, int data_type, + struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) { diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index e09c551ae52d..61fd601f1edf 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -464,9 +464,9 @@ void audit_remove_watch_rule(struct audit_krule *krule) } /* Update watch data in audit rules based on fsnotify events. */ -static int audit_watch_handle_event(struct fsnotify_group *group, - struct inode *to_tell, - u32 mask, const void *data, int data_type, +static int audit_watch_handle_event(struct fsnotify_group *group, u32 mask, + const void *data, int data_type, + struct inode *dir, const struct qstr *dname, u32 cookie, struct fsnotify_iter_info *iter_info) { -- cgit v1.2.3 From b4e9c9549f62329d2412f899635fddc5212b9cd4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 1 Jun 2020 19:42:40 -0400 Subject: introduction of regset ->get() wrappers, switching ELF coredumps to those Two new helpers: given a process and regset, dump into a buffer. regset_get() takes a buffer and size, regset_get_alloc() takes size and allocates a buffer. Return value in both cases is the amount of data actually dumped in case of success or -E... on error. In both cases the size is capped by regset->n * regset->size, so ->get() is called with offset 0 and size no more than what regset expects. binfmt_elf.c callers of ->get() are switched to using those; the other caller (copy_regset_to_user()) will need some preparations to switch. Signed-off-by: Al Viro --- fs/binfmt_elf.c | 54 ++++++++++++++++++++++++-------------------------- include/linux/regset.h | 9 +++++++++ kernel/Makefile | 2 +- kernel/regset.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 90 insertions(+), 29 deletions(-) create mode 100644 kernel/regset.c (limited to 'kernel') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 9fe3b51c116a..e922a6abdca8 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1821,7 +1821,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, long signr, size_t *total) { unsigned int i; - unsigned int regset0_size = regset_size(t->task, &view->regsets[0]); + int regset0_size; /* * NT_PRSTATUS is the one special case, because the regset data @@ -1830,8 +1830,10 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, * We assume that regset 0 is NT_PRSTATUS. */ fill_prstatus(&t->prstatus, t->task, signr); - (void) view->regsets[0].get(t->task, &view->regsets[0], 0, regset0_size, - &t->prstatus.pr_reg, NULL); + regset0_size = regset_get(t->task, &view->regsets[0], + sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg); + if (regset0_size < 0) + return 0; fill_note(&t->notes[0], "CORE", NT_PRSTATUS, PRSTATUS_SIZE(t->prstatus, regset0_size), &t->prstatus); @@ -1846,32 +1848,28 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, */ for (i = 1; i < view->n; ++i) { const struct user_regset *regset = &view->regsets[i]; + int note_type = regset->core_note_type; + bool is_fpreg = note_type == NT_PRFPREG; + void *data; + int ret; + do_thread_regset_writeback(t->task, regset); - if (regset->core_note_type && regset->get && - (!regset->active || regset->active(t->task, regset) > 0)) { - int ret; - size_t size = regset_size(t->task, regset); - void *data = kzalloc(size, GFP_KERNEL); - if (unlikely(!data)) - return 0; - ret = regset->get(t->task, regset, - 0, size, data, NULL); - if (unlikely(ret)) - kfree(data); - else { - if (regset->core_note_type != NT_PRFPREG) - fill_note(&t->notes[i], "LINUX", - regset->core_note_type, - size, data); - else { - SET_PR_FPVALID(&t->prstatus, - 1, regset0_size); - fill_note(&t->notes[i], "CORE", - NT_PRFPREG, size, data); - } - *total += notesize(&t->notes[i]); - } - } + if (!note_type) // not for coredumps + continue; + if (regset->active && regset->active(t->task, regset) <= 0) + continue; + + ret = regset_get_alloc(t->task, regset, ~0U, &data); + if (ret < 0) + continue; + + if (is_fpreg) + SET_PR_FPVALID(&t->prstatus, 1, regset0_size); + + fill_note(&t->notes[i], is_fpreg ? "CORE" : "LINUX", + note_type, ret, data); + + *total += notesize(&t->notes[i]); } return 1; diff --git a/include/linux/regset.h b/include/linux/regset.h index 46d6ae68c455..968a032922d5 100644 --- a/include/linux/regset.h +++ b/include/linux/regset.h @@ -353,6 +353,15 @@ static inline int user_regset_copyin_ignore(unsigned int *pos, return 0; } +extern int regset_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int size, void *data); + +extern int regset_get_alloc(struct task_struct *target, + const struct user_regset *regset, + unsigned int size, + void **data); + /** * copy_regset_to_user - fetch a thread's user_regset data into user memory * @target: thread to be examined diff --git a/kernel/Makefile b/kernel/Makefile index f3218bc5ec69..e6e03380a0f1 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o smpboot.o ucount.o + async.o range.o smpboot.o ucount.o regset.o obj-$(CONFIG_MODULES) += kmod.o obj-$(CONFIG_MULTIUSER) += groups.o diff --git a/kernel/regset.c b/kernel/regset.c new file mode 100644 index 000000000000..6b39fa0993ec --- /dev/null +++ b/kernel/regset.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include + +static int __regset_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int size, + void **data) +{ + void *p = *data, *to_free = NULL; + int res; + + if (!regset->get) + return -EOPNOTSUPP; + if (size > regset->n * regset->size) + size = regset->n * regset->size; + if (!p) { + to_free = p = kzalloc(size, GFP_KERNEL); + if (!p) + return -ENOMEM; + } + res = regset->get(target, regset, 0, size, p, NULL); + if (unlikely(res < 0)) { + kfree(to_free); + return res; + } + *data = p; + if (regset->get_size) { // arm64-only kludge, will go away + unsigned max_size = regset->get_size(target, regset); + if (size > max_size) + size = max_size; + } + return size; +} + +int regset_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int size, + void *data) +{ + return __regset_get(target, regset, size, &data); +} +EXPORT_SYMBOL(regset_get); + +int regset_get_alloc(struct task_struct *target, + const struct user_regset *regset, + unsigned int size, + void **data) +{ + *data = NULL; + return __regset_get(target, regset, size, data); +} +EXPORT_SYMBOL(regset_get_alloc); -- cgit v1.2.3 From dc12d7968f9c9540494deb1285854b18ca4465ec Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 17 Feb 2020 12:25:14 -0500 Subject: copy_regset_to_user(): do all copyout at once. Turn copy_regset_to_user() into regset_get_alloc() + copy_to_user(). Now all ->get() calls have a kernel buffer as destination. Note that we'd already eliminated the callers of copy_regset_to_user() with non-zero offset; now that argument is simply unused. Uninlined, while we are at it. Signed-off-by: Al Viro --- include/linux/regset.h | 29 ++++------------------------- kernel/regset.c | 26 ++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/regset.h b/include/linux/regset.h index 968a032922d5..af57c1db1924 100644 --- a/include/linux/regset.h +++ b/include/linux/regset.h @@ -362,31 +362,10 @@ extern int regset_get_alloc(struct task_struct *target, unsigned int size, void **data); -/** - * copy_regset_to_user - fetch a thread's user_regset data into user memory - * @target: thread to be examined - * @view: &struct user_regset_view describing user thread machine state - * @setno: index in @view->regsets - * @offset: offset into the regset data, in bytes - * @size: amount of data to copy, in bytes - * @data: user-mode pointer to copy into - */ -static inline int copy_regset_to_user(struct task_struct *target, - const struct user_regset_view *view, - unsigned int setno, - unsigned int offset, unsigned int size, - void __user *data) -{ - const struct user_regset *regset = &view->regsets[setno]; - - if (!regset->get) - return -EOPNOTSUPP; - - if (!access_ok(data, size)) - return -EFAULT; - - return regset->get(target, regset, offset, size, NULL, data); -} +extern int copy_regset_to_user(struct task_struct *target, + const struct user_regset_view *view, + unsigned int setno, unsigned int offset, + unsigned int size, void __user *data); /** * copy_regset_from_user - store into thread's user_regset data from user memory diff --git a/kernel/regset.c b/kernel/regset.c index 6b39fa0993ec..0a610983ce43 100644 --- a/kernel/regset.c +++ b/kernel/regset.c @@ -52,3 +52,29 @@ int regset_get_alloc(struct task_struct *target, return __regset_get(target, regset, size, data); } EXPORT_SYMBOL(regset_get_alloc); + +/** + * copy_regset_to_user - fetch a thread's user_regset data into user memory + * @target: thread to be examined + * @view: &struct user_regset_view describing user thread machine state + * @setno: index in @view->regsets + * @offset: offset into the regset data, in bytes + * @size: amount of data to copy, in bytes + * @data: user-mode pointer to copy into + */ +int copy_regset_to_user(struct task_struct *target, + const struct user_regset_view *view, + unsigned int setno, + unsigned int offset, unsigned int size, + void __user *data) +{ + const struct user_regset *regset = &view->regsets[setno]; + void *buf; + int ret; + + ret = regset_get_alloc(target, regset, size, &buf); + if (ret > 0) + ret = copy_to_user(data, buf, ret) ? -EFAULT : 0; + kfree(buf); + return ret; +} -- cgit v1.2.3 From 7717cb9bdd0421faa432a4e0d499fdba6e2394c8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 20 Feb 2020 20:48:16 -0500 Subject: regset: new method and helpers for it ->regset_get() takes task+regset+buffer, returns the amount of free space left in the buffer on success and -E... on error. buffer is represented as struct membuf - a pair of (kernel) pointer and amount of space left Primitives for writing to such: * membuf_write(buf, data, size) * membuf_zero(buf, size) * membuf_store(buf, value) These are implemented as inlines (in case of membuf_store - a macro). All writes are sequential; they become no-ops when there's no space left. Return value of all primitives is the amount of space left after the operation, so they can be used as return values of ->regset_get(). Example of use: // stores pt_regs of task + 64 bytes worth of zeroes + 32bit PID of task int foo_get(struct task_struct *task, const struct regset *regset, struct membuf to) { membuf_write(&to, task_pt_regs(task), sizeof(struct pt_regs)); membuf_zero(&to, 64); return membuf_store(&to, (u32)task_tgid_vnr(task)); } regset_get()/regset_get_alloc() taught to use that thing if present. By the end of the series all users of ->get() will be converted; then ->get() and ->get_size() can go. Note that unlike ->get() this thing always starts at offset 0 and, since it only writes to kernel buffer, can't fail on copyout. It can, of course, fail for other reasons, but those tend to be less numerous. The caller guarantees that the buffer size won't be bigger than regset->n * regset->size. That simplifies life for quite a few instances. Signed-off-by: Al Viro --- include/linux/regset.h | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/regset.c | 12 +++++++++++- 2 files changed, 62 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/regset.h b/include/linux/regset.h index af57c1db1924..f6125a7d949d 100644 --- a/include/linux/regset.h +++ b/include/linux/regset.h @@ -17,6 +17,52 @@ struct task_struct; struct user_regset; +struct membuf { + void *p; + size_t left; +}; + +static inline int membuf_zero(struct membuf *s, size_t size) +{ + if (s->left) { + if (size > s->left) + size = s->left; + memset(s->p, 0, size); + s->p += size; + s->left -= size; + } + return s->left; +} + +static inline int membuf_write(struct membuf *s, const void *v, size_t size) +{ + if (s->left) { + if (size > s->left) + size = s->left; + memcpy(s->p, v, size); + s->p += size; + s->left -= size; + } + return s->left; +} + +/* current s->p must be aligned for v; v must be a scalar */ +#define membuf_store(s, v) \ +({ \ + struct membuf *__s = (s); \ + if (__s->left) { \ + typeof(v) __v = (v); \ + size_t __size = sizeof(__v); \ + if (unlikely(__size > __s->left)) { \ + __size = __s->left; \ + memcpy(__s->p, &__v, __size); \ + } else { \ + *(typeof(__v + 0) *)__s->p = __v; \ + } \ + __s->p += __size; \ + __s->left -= __size; \ + } \ + __s->left;}) /** * user_regset_active_fn - type of @active function in &struct user_regset @@ -57,6 +103,10 @@ typedef int user_regset_get_fn(struct task_struct *target, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf); +typedef int user_regset_get2_fn(struct task_struct *target, + const struct user_regset *regset, + struct membuf to); + /** * user_regset_set_fn - type of @set function in &struct user_regset * @target: thread being examined @@ -186,6 +236,7 @@ typedef unsigned int user_regset_get_size_fn(struct task_struct *target, */ struct user_regset { user_regset_get_fn *get; + user_regset_get2_fn *regset_get; user_regset_set_fn *set; user_regset_active_fn *active; user_regset_writeback_fn *writeback; diff --git a/kernel/regset.c b/kernel/regset.c index 0a610983ce43..eaeaefbbd39e 100644 --- a/kernel/regset.c +++ b/kernel/regset.c @@ -11,7 +11,7 @@ static int __regset_get(struct task_struct *target, void *p = *data, *to_free = NULL; int res; - if (!regset->get) + if (!regset->get && !regset->regset_get) return -EOPNOTSUPP; if (size > regset->n * regset->size) size = regset->n * regset->size; @@ -20,6 +20,16 @@ static int __regset_get(struct task_struct *target, if (!p) return -ENOMEM; } + if (regset->regset_get) { + res = regset->regset_get(target, regset, + (struct membuf){.p = p, .left = size}); + if (res < 0) { + kfree(to_free); + return res; + } + *data = p; + return size - res; + } res = regset->get(target, regset, 0, size, p, NULL); if (unlikely(res < 0)) { kfree(to_free); -- cgit v1.2.3 From 1e6986c9db21265bac1435a344b4446c51a3f4d8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Jun 2020 15:34:20 -0400 Subject: regset: kill ->get() no instances left Signed-off-by: Al Viro --- include/linux/regset.h | 22 ---------------------- kernel/regset.c | 24 +++++------------------- 2 files changed, 5 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/include/linux/regset.h b/include/linux/regset.h index f6125a7d949d..2a4a555b1617 100644 --- a/include/linux/regset.h +++ b/include/linux/regset.h @@ -82,27 +82,6 @@ static inline int membuf_write(struct membuf *s, const void *v, size_t size) typedef int user_regset_active_fn(struct task_struct *target, const struct user_regset *regset); -/** - * user_regset_get_fn - type of @get function in &struct user_regset - * @target: thread being examined - * @regset: regset being examined - * @pos: offset into the regset data to access, in bytes - * @count: amount of data to copy, in bytes - * @kbuf: if not %NULL, a kernel-space pointer to copy into - * @ubuf: if @kbuf is %NULL, a user-space pointer to copy into - * - * Fetch register values. Return %0 on success; -%EIO or -%ENODEV - * are usual failure returns. The @pos and @count values are in - * bytes, but must be properly aligned. If @kbuf is non-null, that - * buffer is used and @ubuf is ignored. If @kbuf is %NULL, then - * ubuf gives a userland pointer to access directly, and an -%EFAULT - * return value is possible. - */ -typedef int user_regset_get_fn(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf); - typedef int user_regset_get2_fn(struct task_struct *target, const struct user_regset *regset, struct membuf to); @@ -235,7 +214,6 @@ typedef unsigned int user_regset_get_size_fn(struct task_struct *target, * omitted when there is an @active function and it returns zero. */ struct user_regset { - user_regset_get_fn *get; user_regset_get2_fn *regset_get; user_regset_set_fn *set; user_regset_active_fn *active; diff --git a/kernel/regset.c b/kernel/regset.c index eaeaefbbd39e..586823786f39 100644 --- a/kernel/regset.c +++ b/kernel/regset.c @@ -11,7 +11,7 @@ static int __regset_get(struct task_struct *target, void *p = *data, *to_free = NULL; int res; - if (!regset->get && !regset->regset_get) + if (!regset->regset_get) return -EOPNOTSUPP; if (size > regset->n * regset->size) size = regset->n * regset->size; @@ -20,28 +20,14 @@ static int __regset_get(struct task_struct *target, if (!p) return -ENOMEM; } - if (regset->regset_get) { - res = regset->regset_get(target, regset, - (struct membuf){.p = p, .left = size}); - if (res < 0) { - kfree(to_free); - return res; - } - *data = p; - return size - res; - } - res = regset->get(target, regset, 0, size, p, NULL); - if (unlikely(res < 0)) { + res = regset->regset_get(target, regset, + (struct membuf){.p = p, .left = size}); + if (res < 0) { kfree(to_free); return res; } *data = p; - if (regset->get_size) { // arm64-only kludge, will go away - unsigned max_size = regset->get_size(target, regset); - if (size > max_size) - size = max_size; - } - return size; + return size - res; } int regset_get(struct task_struct *target, -- cgit v1.2.3 From 82ace1efb3cb1d49a1681cc6e31156047d5ae1f2 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Jul 2020 15:58:44 +0300 Subject: fsnotify: create helper fsnotify_inode() Simple helper to consolidate biolerplate code. Link: https://lore.kernel.org/r/20200722125849.17418-5-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/kernfs/file.c | 6 ++---- fs/notify/fsnotify.c | 2 +- include/linux/fsnotify.h | 26 +++++++++++--------------- kernel/trace/trace.c | 3 +-- 4 files changed, 15 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 5b1468bc509e..1d185bffc52f 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -910,10 +910,8 @@ repeat: kernfs_put(parent); } - if (!p_inode) { - fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE, - NULL, 0); - } + if (!p_inode) + fsnotify_inode(inode, FS_MODIFY); iput(inode); } diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 0b0e01e04349..ba172b742de5 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -74,7 +74,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb) iput(iput_inode); /* for each watch, send FS_UNMOUNT and then remove it */ - fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0); + fsnotify_inode(inode, FS_UNMOUNT); fsnotify_inode_delete(inode); diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 9b2566d273a9..f9db7c9b3ef1 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -38,6 +38,14 @@ static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry, fsnotify_name(dir, mask, d_inode(dentry), &dentry->d_name, 0); } +static inline void fsnotify_inode(struct inode *inode, __u32 mask) +{ + if (S_ISDIR(inode->i_mode)) + mask |= FS_ISDIR; + + fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); +} + /* Notify this dentry's parent about a child's events. */ static inline int fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) @@ -105,12 +113,7 @@ static inline int fsnotify_perm(struct file *file, int mask) */ static inline void fsnotify_link_count(struct inode *inode) { - __u32 mask = FS_ATTRIB; - - if (S_ISDIR(inode->i_mode)) - mask |= FS_ISDIR; - - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); + fsnotify_inode(inode, FS_ATTRIB); } /* @@ -125,7 +128,6 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, u32 fs_cookie = fsnotify_get_cookie(); __u32 old_dir_mask = FS_MOVED_FROM; __u32 new_dir_mask = FS_MOVED_TO; - __u32 mask = FS_MOVE_SELF; const struct qstr *new_name = &moved->d_name; if (old_dir == new_dir) @@ -134,7 +136,6 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, if (isdir) { old_dir_mask |= FS_ISDIR; new_dir_mask |= FS_ISDIR; - mask |= FS_ISDIR; } fsnotify_name(old_dir, old_dir_mask, source, old_name, fs_cookie); @@ -144,7 +145,7 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, fsnotify_link_count(target); if (source) - fsnotify(source, mask, source, FSNOTIFY_EVENT_INODE, NULL, 0); + fsnotify_inode(source, FS_MOVE_SELF); audit_inode_child(new_dir, moved, AUDIT_TYPE_CHILD_CREATE); } @@ -169,12 +170,7 @@ static inline void fsnotify_vfsmount_delete(struct vfsmount *mnt) */ static inline void fsnotify_inoderemove(struct inode *inode) { - __u32 mask = FS_DELETE_SELF; - - if (S_ISDIR(inode->i_mode)) - mask |= FS_ISDIR; - - fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0); + fsnotify_inode(inode, FS_DELETE_SELF); __fsnotify_inode_delete(inode); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bb62269724d5..0c655c039506 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1543,8 +1543,7 @@ static void latency_fsnotify_workfn(struct work_struct *work) { struct trace_array *tr = container_of(work, struct trace_array, fsnotify_work); - fsnotify(tr->d_max_latency->d_inode, FS_MODIFY, - tr->d_max_latency->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0); + fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY); } static void latency_fsnotify_workfn_irq(struct irq_work *iwork) -- cgit v1.2.3 From 7dbe6080167860df0bf8627e55fd5154f366cc7a Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Jul 2020 15:58:43 +0300 Subject: audit: do not set FS_EVENT_ON_CHILD in audit marks mask The audit group marks mask does not contain any events possible on a child so setting the flag FS_EVENT_ON_CHILD in the mask is counter productive. It may lead to the undesired outcome of setting the dentry flag DCACHE_FSNOTIFY_PARENT_WATCHED on a directory inode even though it is not watching children, because the audit mark contribute the flag FS_EVENT_ON_CHILD to the inode's fsnotify_mask and another mark could be contributing an event that is possible on child to the inode's mask. Furthermore in the following patches we want to use FS_EVENT_ON_CHILD for non-dir inodes for other purposes so stop using the flag. Link: https://lore.kernel.org/r/20200722125849.17418-4-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- kernel/audit_fsnotify.c | 2 +- kernel/audit_watch.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 30ca239285a3..bd3a6b79316a 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -36,7 +36,7 @@ static struct fsnotify_group *audit_fsnotify_group; /* fsnotify events we care about. */ #define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ - FS_MOVE_SELF | FS_EVENT_ON_CHILD) + FS_MOVE_SELF) static void audit_fsnotify_mark_free(struct audit_fsnotify_mark *audit_mark) { diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 61fd601f1edf..e23d54bcc587 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -53,7 +53,7 @@ static struct fsnotify_group *audit_watch_group; /* fsnotify events we care about. */ #define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ - FS_MOVE_SELF | FS_EVENT_ON_CHILD | FS_UNMOUNT) + FS_MOVE_SELF | FS_UNMOUNT) static void audit_free_parent(struct audit_parent *parent) { -- cgit v1.2.3 From b9a1b9772509cbc6f6aa8bcd0b019f6347a2b631 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Jul 2020 15:58:48 +0300 Subject: fsnotify: create method handle_inode_event() in fsnotify_operations The method handle_event() grew a lot of complexity due to the design of fanotify and merging of ignore masks. Most backends do not care about this complex functionality, so we can hide this complexity from them. Introduce a method handle_inode_event() that serves those backends and passes a single inode mark and less arguments. This change converts all backends except fanotify and inotify to use the simplified handle_inode_event() method. In pricipal, inotify could have also used the new method, but that would require passing more arguments on the simple helper (data, data_type, cookie), so we leave it with the handle_event() method. Link: https://lore.kernel.org/r/20200722125849.17418-9-amir73il@gmail.com Suggested-by: Jan Kara Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/nfsd/filecache.c | 12 ++++------ fs/notify/dnotify/dnotify.c | 38 +++++++---------------------- fs/notify/fsnotify.c | 52 ++++++++++++++++++++++++++++++++++++++-- include/linux/fsnotify_backend.h | 19 ++++++++++++--- kernel/audit_fsnotify.c | 20 +++++++--------- kernel/audit_tree.c | 10 ++++---- kernel/audit_watch.c | 17 ++++++------- 7 files changed, 97 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index bbc7892d2928..c8b9d2667ee6 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -598,14 +598,10 @@ static struct notifier_block nfsd_file_lease_notifier = { }; static int -nfsd_file_fsnotify_handle_event(struct fsnotify_group *group, u32 mask, - const void *data, int data_type, - struct inode *dir, - const struct qstr *file_name, u32 cookie, - struct fsnotify_iter_info *iter_info) +nfsd_file_fsnotify_handle_event(struct fsnotify_mark *mark, u32 mask, + struct inode *inode, struct inode *dir, + const struct qstr *name) { - struct inode *inode = fsnotify_data_inode(data, data_type); - trace_nfsd_file_fsnotify_handle_event(inode, mask); /* Should be no marks on non-regular files */ @@ -626,7 +622,7 @@ nfsd_file_fsnotify_handle_event(struct fsnotify_group *group, u32 mask, static const struct fsnotify_ops nfsd_file_fsnotify_ops = { - .handle_event = nfsd_file_fsnotify_handle_event, + .handle_inode_event = nfsd_file_fsnotify_handle_event, .free_mark = nfsd_file_mark_free, }; diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index ca78d3f78da8..5dcda8f20c04 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -70,8 +70,9 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) * destroy the dnotify struct if it was not registered to receive multiple * events. */ -static void dnotify_one_event(struct fsnotify_group *group, u32 mask, - struct fsnotify_mark *inode_mark) +static int dnotify_handle_event(struct fsnotify_mark *inode_mark, u32 mask, + struct inode *inode, struct inode *dir, + const struct qstr *name) { struct dnotify_mark *dn_mark; struct dnotify_struct *dn; @@ -79,6 +80,10 @@ static void dnotify_one_event(struct fsnotify_group *group, u32 mask, struct fown_struct *fown; __u32 test_mask = mask & ~FS_EVENT_ON_CHILD; + /* not a dir, dnotify doesn't care */ + if (!dir && !(mask & FS_ISDIR)) + return 0; + dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); spin_lock(&inode_mark->lock); @@ -100,33 +105,6 @@ static void dnotify_one_event(struct fsnotify_group *group, u32 mask, } spin_unlock(&inode_mark->lock); -} - -static int dnotify_handle_event(struct fsnotify_group *group, u32 mask, - const void *data, int data_type, - struct inode *dir, - const struct qstr *file_name, u32 cookie, - struct fsnotify_iter_info *iter_info) -{ - struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); - struct fsnotify_mark *child_mark = fsnotify_iter_child_mark(iter_info); - - /* not a dir, dnotify doesn't care */ - if (!dir && !(mask & FS_ISDIR)) - return 0; - - if (WARN_ON(fsnotify_iter_vfsmount_mark(iter_info))) - return 0; - - /* - * Some events can be sent on both parent dir and subdir marks - * (e.g. DN_ATTRIB). If both parent dir and subdir are watching, - * report the event once to parent dir and once to subdir. - */ - if (inode_mark) - dnotify_one_event(group, mask, inode_mark); - if (child_mark) - dnotify_one_event(group, mask, child_mark); return 0; } @@ -143,7 +121,7 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) } static const struct fsnotify_ops dnotify_fsnotify_ops = { - .handle_event = dnotify_handle_event, + .handle_inode_event = dnotify_handle_event, .free_mark = dnotify_free_mark, }; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 494d5d70323f..a960ec3a569a 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -230,6 +230,49 @@ notify: } EXPORT_SYMBOL_GPL(__fsnotify_parent); +static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, + const void *data, int data_type, + struct inode *dir, const struct qstr *name, + u32 cookie, struct fsnotify_iter_info *iter_info) +{ + struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); + struct fsnotify_mark *child_mark = fsnotify_iter_child_mark(iter_info); + struct inode *inode = fsnotify_data_inode(data, data_type); + const struct fsnotify_ops *ops = group->ops; + int ret; + + if (WARN_ON_ONCE(!ops->handle_inode_event)) + return 0; + + if (WARN_ON_ONCE(fsnotify_iter_sb_mark(iter_info)) || + WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info))) + return 0; + + /* + * An event can be sent on child mark iterator instead of inode mark + * iterator because of other groups that have interest of this inode + * and have marks on both parent and child. We can simplify this case. + */ + if (!inode_mark) { + inode_mark = child_mark; + child_mark = NULL; + dir = NULL; + name = NULL; + } + + ret = ops->handle_inode_event(inode_mark, mask, inode, dir, name); + if (ret || !child_mark) + return ret; + + /* + * Some events can be sent on both parent dir and child marks + * (e.g. FS_ATTRIB). If both parent dir and child are watching, + * report the event once to parent dir with name and once to child + * without name. + */ + return ops->handle_inode_event(child_mark, mask, inode, NULL, NULL); +} + static int send_to_group(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) @@ -275,8 +318,13 @@ static int send_to_group(__u32 mask, const void *data, int data_type, if (!(test_mask & marks_mask & ~marks_ignored_mask)) return 0; - return group->ops->handle_event(group, mask, data, data_type, dir, - file_name, cookie, iter_info); + if (group->ops->handle_event) { + return group->ops->handle_event(group, mask, data, data_type, dir, + file_name, cookie, iter_info); + } + + return fsnotify_handle_event(group, mask, data, data_type, dir, + file_name, cookie, iter_info); } static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector **connp) diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 32104cfc27a5..f8529a3a2923 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -128,17 +128,30 @@ struct mem_cgroup; * @cookie: inotify rename cookie * @iter_info: array of marks from this group that are interested in the event * + * handle_inode_event - simple variant of handle_event() for groups that only + * have inode marks and don't have ignore mask + * @mark: mark to notify + * @mask: event type and flags + * @inode: inode that event happened on + * @dir: optional directory associated with event - + * if @file_name is not NULL, this is the directory that + * @file_name is relative to. + * @file_name: optional file name associated with event + * * free_group_priv - called when a group refcnt hits 0 to clean up the private union * freeing_mark - called when a mark is being destroyed for some reason. The group - * MUST be holding a reference on each mark and that reference must be - * dropped in this function. inotify uses this function to send - * userspace messages that marks have been removed. + * MUST be holding a reference on each mark and that reference must be + * dropped in this function. inotify uses this function to send + * userspace messages that marks have been removed. */ struct fsnotify_ops { int (*handle_event)(struct fsnotify_group *group, u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info); + int (*handle_inode_event)(struct fsnotify_mark *mark, u32 mask, + struct inode *inode, struct inode *dir, + const struct qstr *file_name); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); void (*free_event)(struct fsnotify_event *event); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index bd3a6b79316a..bfcfcd61adb6 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -152,35 +152,31 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark) } /* Update mark data in audit rules based on fsnotify events. */ -static int audit_mark_handle_event(struct fsnotify_group *group, u32 mask, - const void *data, int data_type, - struct inode *dir, - const struct qstr *dname, u32 cookie, - struct fsnotify_iter_info *iter_info) +static int audit_mark_handle_event(struct fsnotify_mark *inode_mark, u32 mask, + struct inode *inode, struct inode *dir, + const struct qstr *dname) { - struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); struct audit_fsnotify_mark *audit_mark; - const struct inode *inode = fsnotify_data_inode(data, data_type); audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark); - BUG_ON(group != audit_fsnotify_group); - - if (WARN_ON(!inode)) + if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group) || + WARN_ON_ONCE(!inode)) return 0; if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) { if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL)) return 0; audit_update_mark(audit_mark, inode); - } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) + } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) { audit_autoremove_mark_rule(audit_mark); + } return 0; } static const struct fsnotify_ops audit_mark_fsnotify_ops = { - .handle_event = audit_mark_handle_event, + .handle_inode_event = audit_mark_handle_event, .free_mark = audit_fsnotify_free_mark, }; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 2ce2ac1ce100..025d24abf15d 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -1037,11 +1037,9 @@ static void evict_chunk(struct audit_chunk *chunk) audit_schedule_prune(); } -static int audit_tree_handle_event(struct fsnotify_group *group, u32 mask, - const void *data, int data_type, - struct inode *dir, - const struct qstr *file_name, u32 cookie, - struct fsnotify_iter_info *iter_info) +static int audit_tree_handle_event(struct fsnotify_mark *mark, u32 mask, + struct inode *inode, struct inode *dir, + const struct qstr *file_name) { return 0; } @@ -1070,7 +1068,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *mark, } static const struct fsnotify_ops audit_tree_ops = { - .handle_event = audit_tree_handle_event, + .handle_inode_event = audit_tree_handle_event, .freeing_mark = audit_tree_freeing_mark, .free_mark = audit_tree_destroy_watch, }; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index e23d54bcc587..246e5ba704c0 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -464,20 +464,17 @@ void audit_remove_watch_rule(struct audit_krule *krule) } /* Update watch data in audit rules based on fsnotify events. */ -static int audit_watch_handle_event(struct fsnotify_group *group, u32 mask, - const void *data, int data_type, - struct inode *dir, - const struct qstr *dname, u32 cookie, - struct fsnotify_iter_info *iter_info) +static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask, + struct inode *inode, struct inode *dir, + const struct qstr *dname) { - struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); - const struct inode *inode = fsnotify_data_inode(data, data_type); struct audit_parent *parent; parent = container_of(inode_mark, struct audit_parent, mark); - BUG_ON(group != audit_watch_group); - WARN_ON(!inode); + if (WARN_ON_ONCE(inode_mark->group != audit_watch_group) || + WARN_ON_ONCE(!inode)) + return 0; if (mask & (FS_CREATE|FS_MOVED_TO) && inode) audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); @@ -490,7 +487,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, u32 mask, } static const struct fsnotify_ops audit_watch_fsnotify_ops = { - .handle_event = audit_watch_handle_event, + .handle_inode_event = audit_watch_handle_event, .free_mark = audit_watch_free_mark, }; -- cgit v1.2.3 From f6dfbe31e8fa5cbd5bc89df9d7f0fa0af7e69981 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 27 Jul 2020 18:54:11 +0100 Subject: bpf: Fix swapped arguments in calls to check_buffer_access There are a couple of arguments of the boolean flag zero_size_allowed and the char pointer buf_info when calling to function check_buffer_access that are swapped by mistake. Fix these by swapping them to correct the argument ordering. Fixes: afbf21dce668 ("bpf: Support readonly/readwrite buffers in verifier") Addresses-Coverity: ("Array compared to 0") Signed-off-by: Colin Ian King Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200727175411.155179-1-colin.king@canonical.com --- kernel/bpf/verifier.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cd14e70f2d07..88bb25d08bf8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3477,14 +3477,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn regno, reg_type_str[reg->type]); return -EACCES; } - err = check_buffer_access(env, reg, regno, off, size, "rdonly", - false, + err = check_buffer_access(env, reg, regno, off, size, false, + "rdonly", &env->prog->aux->max_rdonly_access); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_RDWR_BUF) { - err = check_buffer_access(env, reg, regno, off, size, "rdwr", - false, + err = check_buffer_access(env, reg, regno, off, size, false, + "rdwr", &env->prog->aux->max_rdwr_access); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); -- cgit v1.2.3 From 112a0e4171e111e963aada3fe790c71accf4d705 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 28 Jul 2020 16:34:00 +0900 Subject: kprobes: Remove unnecessary module_mutex locking from kprobe_optimizer() Since we already lock both kprobe_mutex and text_mutex in the optimizer, text will not be changed and the module unloading will be stopped inside kprobes_module_callback(). The mutex_lock() has originally been introduced to avoid conflict with text modification, at that point we didn't hold text_mutex. But after: f1c6ece23729 ("kprobes: Fix potential deadlock in kprobe_optimizer()") We started holding the text_mutex and don't need the modules mutex anyway. So remove the module_mutex locking. [ mingo: Amended the changelog. ] Suggested-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Ingo Molnar Cc: Jarkko Sakkinen Link: https://lore.kernel.org/r/20200728163400.e00b09c594763349f99ce6cb@kernel.org --- kernel/kprobes.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 146c648eb943..e87679a48ba2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -598,8 +598,6 @@ static void kprobe_optimizer(struct work_struct *work) mutex_lock(&kprobe_mutex); cpus_read_lock(); mutex_lock(&text_mutex); - /* Lock modules while optimizing kprobes */ - mutex_lock(&module_mutex); /* * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) @@ -624,7 +622,6 @@ static void kprobe_optimizer(struct work_struct *work) /* Step 4: Free cleaned kprobes after quiesence period */ do_free_cleaned_kprobes(); - mutex_unlock(&module_mutex); mutex_unlock(&text_mutex); cpus_read_unlock(); -- cgit v1.2.3 From 21a6ee14a8f277766618ef07154432b46528113e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 28 Jul 2020 19:17:55 +0800 Subject: sched: Remove duplicated tick_nohz_full_enabled() check In sched_update_tick_dependency() there's two calls that check whether nohz_full is enabled: tick_nohz_full_cpu() does it implicitly, while there's also an explicit call to tick_nohz_full_enabled(). Remove the duplicated, open coded check. [ mingo: Amended the changelog. ] Signed-off-by: Miaohe Lin Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/1595935075-14223-1-git-send-email-linmiaohe@huawei.com --- kernel/sched/sched.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9f33c77258ea..296efd30d8c9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1961,12 +1961,7 @@ extern int __init sched_tick_offload_init(void); */ static inline void sched_update_tick_dependency(struct rq *rq) { - int cpu; - - if (!tick_nohz_full_enabled()) - return; - - cpu = cpu_of(rq); + int cpu = cpu_of(rq); if (!tick_nohz_full_cpu(cpu)) return; -- cgit v1.2.3 From 274b3f7bf34415eed106e479e4815e897ce5d763 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 22 Jul 2020 16:33:43 +0200 Subject: dma-contiguous: cleanup dma_alloc_contiguous Split out a cma_alloc_aligned helper to deal with the "interesting" calling conventions for cma_alloc, which then allows to the main function to be written straight forward. This also takes advantage of the fact that NULL dev arguments have been gone from the DMA API for a while. Signed-off-by: Christoph Hellwig Reviewed-by: Nicolin Chen Reviewed-by: Barry Song --- kernel/dma/contiguous.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 15bc5026c485..cff7e60968b9 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -215,6 +215,13 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, return cma_release(dev_get_cma_area(dev), pages, count); } +static struct page *cma_alloc_aligned(struct cma *cma, size_t size, gfp_t gfp) +{ + unsigned int align = min(get_order(size), CONFIG_CMA_ALIGNMENT); + + return cma_alloc(cma, size >> PAGE_SHIFT, align, gfp & __GFP_NOWARN); +} + /** * dma_alloc_contiguous() - allocate contiguous pages * @dev: Pointer to device for which the allocation is performed. @@ -231,24 +238,14 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, */ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) { - size_t count = size >> PAGE_SHIFT; - struct page *page = NULL; - struct cma *cma = NULL; - - if (dev && dev->cma_area) - cma = dev->cma_area; - else if (count > 1) - cma = dma_contiguous_default_area; - /* CMA can be used only in the context which permits sleeping */ - if (cma && gfpflags_allow_blocking(gfp)) { - size_t align = get_order(size); - size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT); - - page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN); - } - - return page; + if (!gfpflags_allow_blocking(gfp)) + return NULL; + if (dev->cma_area) + return cma_alloc_aligned(dev->cma_area, size, gfp); + if (size <= PAGE_SIZE || !dma_contiguous_default_area) + return NULL; + return cma_alloc_aligned(dma_contiguous_default_area, size, gfp); } /** -- cgit v1.2.3 From 310ad7970a0dec847563dc6dba9e7e587d545622 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Jul 2020 12:05:27 -0700 Subject: bpf: Fix build without CONFIG_NET when using BPF XDP link Entire net/core subsystem is not built without CONFIG_NET. linux/netdevice.h just assumes that it's always there, so the easiest way to fix this is to conditionally compile out bpf_xdp_link_attach() use in bpf/syscall.c. Fixes: aa8d3a716b59 ("bpf, xdp: Add bpf_link-based XDP attachment API") Reported-by: Randy Dunlap Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Randy Dunlap # build-tested Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200728190527.110830-1-andriin@fb.com --- kernel/bpf/syscall.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0e8c88db7e7a..cd3d599e9e90 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3923,9 +3923,11 @@ static int link_create(union bpf_attr *attr) case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; +#ifdef CONFIG_NET case BPF_PROG_TYPE_XDP: ret = bpf_xdp_link_attach(attr, prog); break; +#endif default: ret = -EINVAL; } -- cgit v1.2.3 From 48001ea50d17f3eb06a552e9ecf21f7fc01b25da Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 20 Jul 2020 15:08:18 -0700 Subject: PM, libnvdimm: Add runtime firmware activation support Abstract platform specific mechanics for nvdimm firmware activation behind a handful of generic ops. At the bus level ->activate_state() indicates the unified state (idle, busy, armed) of all DIMMs on the bus, and ->capability() indicates the system state expectations for activate. At the DIMM level ->activate_state() indicates the per-DIMM state, ->activate_result() indicates the outcome of the last activation attempt, and ->arm() attempts to transition the DIMM from 'idle' to 'armed'. A new hibernate_quiet_exec() facility is added to support firmware activation in an OS defined system quiesce state. It leverages the fact that the hibernate-freeze state wants to assert that a memory hibernation snapshot can be taken. This is in contrast to a platform firmware defined quiesce state that may forcefully quiet the memory controller independent of whether an individual device-driver properly supports hibernate-freeze. The libnvdimm sysfs interface is extended to support detection of a firmware activate capability. The mechanism supports enumeration and triggering of firmware activate, optionally in the hibernate_quiet_exec() context. [rafael: hibernate_quiet_exec() proposal] [vishal: fix up sparse warning, grammar in Documentation/] Cc: Pavel Machek Cc: Ira Weiny Cc: Len Brown Cc: Jonathan Corbet Cc: Dave Jiang Cc: Vishal Verma Reported-by: kernel test robot Co-developed-by: "Rafael J. Wysocki" Signed-off-by: "Rafael J. Wysocki" Signed-off-by: Dan Williams Signed-off-by: Vishal Verma --- Documentation/ABI/testing/sysfs-bus-nvdimm | 2 + .../driver-api/nvdimm/firmware-activate.rst | 86 ++++++++++++ drivers/nvdimm/core.c | 149 +++++++++++++++++++++ drivers/nvdimm/dimm_devs.c | 115 ++++++++++++++++ drivers/nvdimm/nd-core.h | 1 + include/linux/libnvdimm.h | 44 ++++++ include/linux/suspend.h | 6 + kernel/power/hibernate.c | 97 ++++++++++++++ 8 files changed, 500 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-bus-nvdimm create mode 100644 Documentation/driver-api/nvdimm/firmware-activate.rst (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-bus-nvdimm b/Documentation/ABI/testing/sysfs-bus-nvdimm new file mode 100644 index 000000000000..d64380262be8 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-nvdimm @@ -0,0 +1,2 @@ +The libnvdimm sub-system implements a common sysfs interface for +platform nvdimm resources. See Documentation/driver-api/nvdimm/. diff --git a/Documentation/driver-api/nvdimm/firmware-activate.rst b/Documentation/driver-api/nvdimm/firmware-activate.rst new file mode 100644 index 000000000000..7ee7decbbdc3 --- /dev/null +++ b/Documentation/driver-api/nvdimm/firmware-activate.rst @@ -0,0 +1,86 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================== +NVDIMM Runtime Firmware Activation +================================== + +Some persistent memory devices run a firmware locally on the device / +"DIMM" to perform tasks like media management, capacity provisioning, +and health monitoring. The process of updating that firmware typically +involves a reboot because it has implications for in-flight memory +transactions. However, reboots are disruptive and at least the Intel +persistent memory platform implementation, described by the Intel ACPI +DSM specification [1], has added support for activating firmware at +runtime. + +A native sysfs interface is implemented in libnvdimm to allow platform +to advertise and control their local runtime firmware activation +capability. + +The libnvdimm bus object, ndbusX, implements an ndbusX/firmware/activate +attribute that shows the state of the firmware activation as one of 'idle', +'armed', 'overflow', and 'busy'. + +- idle: + No devices are set / armed to activate firmware + +- armed: + At least one device is armed + +- busy: + In the busy state armed devices are in the process of transitioning + back to idle and completing an activation cycle. + +- overflow: + If the platform has a concept of incremental work needed to perform + the activation it could be the case that too many DIMMs are armed for + activation. In that scenario the potential for firmware activation to + timeout is indicated by the 'overflow' state. + +The 'ndbusX/firmware/activate' property can be written with a value of +either 'live', or 'quiesce'. A value of 'quiesce' triggers the kernel to +run firmware activation from within the equivalent of the hibernation +'freeze' state where drivers and applications are notified to stop their +modifications of system memory. A value of 'live' attempts +firmware activation without this hibernation cycle. The +'ndbusX/firmware/activate' property will be elided completely if no +firmware activation capability is detected. + +Another property 'ndbusX/firmware/capability' indicates a value of +'live' or 'quiesce', where 'live' indicates that the firmware +does not require or inflict any quiesce period on the system to update +firmware. A capability value of 'quiesce' indicates that firmware does +expect and injects a quiet period for the memory controller, but 'live' +may still be written to 'ndbusX/firmware/activate' as an override to +assume the risk of racing firmware update with in-flight device and +application activity. The 'ndbusX/firmware/capability' property will be +elided completely if no firmware activation capability is detected. + +The libnvdimm memory-device / DIMM object, nmemX, implements +'nmemX/firmware/activate' and 'nmemX/firmware/result' attributes to +communicate the per-device firmware activation state. Similar to the +'ndbusX/firmware/activate' attribute, the 'nmemX/firmware/activate' +attribute indicates 'idle', 'armed', or 'busy'. The state transitions +from 'armed' to 'idle' when the system is prepared to activate firmware, +firmware staged + state set to armed, and 'ndbusX/firmware/activate' is +triggered. After that activation event the nmemX/firmware/result +attribute reflects the state of the last activation as one of: + +- none: + No runtime activation triggered since the last time the device was reset + +- success: + The last runtime activation completed successfully. + +- fail: + The last runtime activation failed for device-specific reasons. + +- not_staged: + The last runtime activation failed due to a sequencing error of the + firmware image not being staged. + +- need_reset: + Runtime firmware activation failed, but the firmware can still be + activated via the legacy method of power-cycling the system. + +[1]: https://docs.pmem.io/persistent-memory/ diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index fe9bd6febdd2..c21ba0602029 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -4,6 +4,7 @@ */ #include #include +#include #include #include #include @@ -389,8 +390,156 @@ static const struct attribute_group nvdimm_bus_attribute_group = { .attrs = nvdimm_bus_attributes, }; +static ssize_t capability_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + enum nvdimm_fwa_capability cap; + + if (!nd_desc->fw_ops) + return -EOPNOTSUPP; + + nvdimm_bus_lock(dev); + cap = nd_desc->fw_ops->capability(nd_desc); + nvdimm_bus_unlock(dev); + + switch (cap) { + case NVDIMM_FWA_CAP_QUIESCE: + return sprintf(buf, "quiesce\n"); + case NVDIMM_FWA_CAP_LIVE: + return sprintf(buf, "live\n"); + default: + return -EOPNOTSUPP; + } +} + +static DEVICE_ATTR_RO(capability); + +static ssize_t activate_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + enum nvdimm_fwa_capability cap; + enum nvdimm_fwa_state state; + + if (!nd_desc->fw_ops) + return -EOPNOTSUPP; + + nvdimm_bus_lock(dev); + cap = nd_desc->fw_ops->capability(nd_desc); + state = nd_desc->fw_ops->activate_state(nd_desc); + nvdimm_bus_unlock(dev); + + if (cap < NVDIMM_FWA_CAP_QUIESCE) + return -EOPNOTSUPP; + + switch (state) { + case NVDIMM_FWA_IDLE: + return sprintf(buf, "idle\n"); + case NVDIMM_FWA_BUSY: + return sprintf(buf, "busy\n"); + case NVDIMM_FWA_ARMED: + return sprintf(buf, "armed\n"); + case NVDIMM_FWA_ARM_OVERFLOW: + return sprintf(buf, "overflow\n"); + default: + return -ENXIO; + } +} + +static int exec_firmware_activate(void *data) +{ + struct nvdimm_bus_descriptor *nd_desc = data; + + return nd_desc->fw_ops->activate(nd_desc); +} + +static ssize_t activate_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + enum nvdimm_fwa_state state; + bool quiesce; + ssize_t rc; + + if (!nd_desc->fw_ops) + return -EOPNOTSUPP; + + if (sysfs_streq(buf, "live")) + quiesce = false; + else if (sysfs_streq(buf, "quiesce")) + quiesce = true; + else + return -EINVAL; + + nvdimm_bus_lock(dev); + state = nd_desc->fw_ops->activate_state(nd_desc); + + switch (state) { + case NVDIMM_FWA_BUSY: + rc = -EBUSY; + break; + case NVDIMM_FWA_ARMED: + case NVDIMM_FWA_ARM_OVERFLOW: + if (quiesce) + rc = hibernate_quiet_exec(exec_firmware_activate, nd_desc); + else + rc = nd_desc->fw_ops->activate(nd_desc); + break; + case NVDIMM_FWA_IDLE: + default: + rc = -ENXIO; + } + nvdimm_bus_unlock(dev); + + if (rc == 0) + rc = len; + return rc; +} + +static DEVICE_ATTR_ADMIN_RW(activate); + +static umode_t nvdimm_bus_firmware_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, typeof(*dev), kobj); + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + enum nvdimm_fwa_capability cap; + + /* + * Both 'activate' and 'capability' disappear when no ops + * detected, or a negative capability is indicated. + */ + if (!nd_desc->fw_ops) + return 0; + + nvdimm_bus_lock(dev); + cap = nd_desc->fw_ops->capability(nd_desc); + nvdimm_bus_unlock(dev); + + if (cap < NVDIMM_FWA_CAP_QUIESCE) + return 0; + + return a->mode; +} +static struct attribute *nvdimm_bus_firmware_attributes[] = { + &dev_attr_activate.attr, + &dev_attr_capability.attr, + NULL, +}; + +static const struct attribute_group nvdimm_bus_firmware_attribute_group = { + .name = "firmware", + .attrs = nvdimm_bus_firmware_attributes, + .is_visible = nvdimm_bus_firmware_visible, +}; + const struct attribute_group *nvdimm_bus_attribute_groups[] = { &nvdimm_bus_attribute_group, + &nvdimm_bus_firmware_attribute_group, NULL, }; diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index b7b77e8d9027..85b53a7f44f2 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -446,9 +446,124 @@ static const struct attribute_group nvdimm_attribute_group = { .is_visible = nvdimm_visible, }; +static ssize_t result_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + enum nvdimm_fwa_result result; + + if (!nvdimm->fw_ops) + return -EOPNOTSUPP; + + nvdimm_bus_lock(dev); + result = nvdimm->fw_ops->activate_result(nvdimm); + nvdimm_bus_unlock(dev); + + switch (result) { + case NVDIMM_FWA_RESULT_NONE: + return sprintf(buf, "none\n"); + case NVDIMM_FWA_RESULT_SUCCESS: + return sprintf(buf, "success\n"); + case NVDIMM_FWA_RESULT_FAIL: + return sprintf(buf, "fail\n"); + case NVDIMM_FWA_RESULT_NOTSTAGED: + return sprintf(buf, "not_staged\n"); + case NVDIMM_FWA_RESULT_NEEDRESET: + return sprintf(buf, "need_reset\n"); + default: + return -ENXIO; + } +} +static DEVICE_ATTR_ADMIN_RO(result); + +static ssize_t activate_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + enum nvdimm_fwa_state state; + + if (!nvdimm->fw_ops) + return -EOPNOTSUPP; + + nvdimm_bus_lock(dev); + state = nvdimm->fw_ops->activate_state(nvdimm); + nvdimm_bus_unlock(dev); + + switch (state) { + case NVDIMM_FWA_IDLE: + return sprintf(buf, "idle\n"); + case NVDIMM_FWA_BUSY: + return sprintf(buf, "busy\n"); + case NVDIMM_FWA_ARMED: + return sprintf(buf, "armed\n"); + default: + return -ENXIO; + } +} + +static ssize_t activate_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + enum nvdimm_fwa_trigger arg; + int rc; + + if (!nvdimm->fw_ops) + return -EOPNOTSUPP; + + if (sysfs_streq(buf, "arm")) + arg = NVDIMM_FWA_ARM; + else if (sysfs_streq(buf, "disarm")) + arg = NVDIMM_FWA_DISARM; + else + return -EINVAL; + + nvdimm_bus_lock(dev); + rc = nvdimm->fw_ops->arm(nvdimm, arg); + nvdimm_bus_unlock(dev); + + if (rc < 0) + return rc; + return len; +} +static DEVICE_ATTR_ADMIN_RW(activate); + +static struct attribute *nvdimm_firmware_attributes[] = { + &dev_attr_activate.attr, + &dev_attr_result.attr, +}; + +static umode_t nvdimm_firmware_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, typeof(*dev), kobj); + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + struct nvdimm *nvdimm = to_nvdimm(dev); + enum nvdimm_fwa_capability cap; + + if (!nd_desc->fw_ops) + return 0; + if (!nvdimm->fw_ops) + return 0; + + nvdimm_bus_lock(dev); + cap = nd_desc->fw_ops->capability(nd_desc); + nvdimm_bus_unlock(dev); + + if (cap < NVDIMM_FWA_CAP_QUIESCE) + return 0; + + return a->mode; +} + +static const struct attribute_group nvdimm_firmware_attribute_group = { + .name = "firmware", + .attrs = nvdimm_firmware_attributes, + .is_visible = nvdimm_firmware_visible, +}; + static const struct attribute_group *nvdimm_attribute_groups[] = { &nd_device_attribute_group, &nvdimm_attribute_group, + &nvdimm_firmware_attribute_group, NULL, }; diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index ddb9d97d9129..564faa36a3ca 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -45,6 +45,7 @@ struct nvdimm { struct kernfs_node *overwrite_state; } sec; struct delayed_work dwork; + const struct nvdimm_fw_ops *fw_ops; }; static inline unsigned long nvdimm_security_flags( diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index ad9898ece7d3..15dbcb718316 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -86,6 +86,7 @@ struct nvdimm_bus_descriptor { int (*flush_probe)(struct nvdimm_bus_descriptor *nd_desc); int (*clear_to_send)(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd, void *data); + const struct nvdimm_bus_fw_ops *fw_ops; }; struct nd_cmd_desc { @@ -200,6 +201,49 @@ struct nvdimm_security_ops { int (*query_overwrite)(struct nvdimm *nvdimm); }; +enum nvdimm_fwa_state { + NVDIMM_FWA_INVALID, + NVDIMM_FWA_IDLE, + NVDIMM_FWA_ARMED, + NVDIMM_FWA_BUSY, + NVDIMM_FWA_ARM_OVERFLOW, +}; + +enum nvdimm_fwa_trigger { + NVDIMM_FWA_ARM, + NVDIMM_FWA_DISARM, +}; + +enum nvdimm_fwa_capability { + NVDIMM_FWA_CAP_INVALID, + NVDIMM_FWA_CAP_NONE, + NVDIMM_FWA_CAP_QUIESCE, + NVDIMM_FWA_CAP_LIVE, +}; + +enum nvdimm_fwa_result { + NVDIMM_FWA_RESULT_INVALID, + NVDIMM_FWA_RESULT_NONE, + NVDIMM_FWA_RESULT_SUCCESS, + NVDIMM_FWA_RESULT_NOTSTAGED, + NVDIMM_FWA_RESULT_NEEDRESET, + NVDIMM_FWA_RESULT_FAIL, +}; + +struct nvdimm_bus_fw_ops { + enum nvdimm_fwa_state (*activate_state) + (struct nvdimm_bus_descriptor *nd_desc); + enum nvdimm_fwa_capability (*capability) + (struct nvdimm_bus_descriptor *nd_desc); + int (*activate)(struct nvdimm_bus_descriptor *nd_desc); +}; + +struct nvdimm_fw_ops { + enum nvdimm_fwa_state (*activate_state)(struct nvdimm *nvdimm); + enum nvdimm_fwa_result (*activate_result)(struct nvdimm *nvdimm); + int (*arm)(struct nvdimm *nvdimm, enum nvdimm_fwa_trigger arg); +}; + void badrange_init(struct badrange *badrange); int badrange_add(struct badrange *badrange, u64 addr, u64 length); void badrange_forget(struct badrange *badrange, phys_addr_t start, diff --git a/include/linux/suspend.h b/include/linux/suspend.h index b960098acfb0..cb9afad82a90 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -453,6 +453,8 @@ extern bool hibernation_available(void); asmlinkage int swsusp_save(void); extern struct pbe *restore_pblist; int pfn_is_nosave(unsigned long pfn); + +int hibernate_quiet_exec(int (*func)(void *data), void *data); #else /* CONFIG_HIBERNATION */ static inline void register_nosave_region(unsigned long b, unsigned long e) {} static inline void register_nosave_region_late(unsigned long b, unsigned long e) {} @@ -464,6 +466,10 @@ static inline void hibernation_set_ops(const struct platform_hibernation_ops *op static inline int hibernate(void) { return -ENOSYS; } static inline bool system_entering_hibernation(void) { return false; } static inline bool hibernation_available(void) { return false; } + +static inline int hibernate_quiet_exec(int (*func)(void *data), void *data) { + return -ENOTSUPP; +} #endif /* CONFIG_HIBERNATION */ #ifdef CONFIG_HIBERNATION_SNAPSHOT_DEV diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 02ec716a4927..e6fab3f09c98 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -795,6 +795,103 @@ int hibernate(void) return error; } +/** + * hibernate_quiet_exec - Execute a function with all devices frozen. + * @func: Function to execute. + * @data: Data pointer to pass to @func. + * + * Return the @func return value or an error code if it cannot be executed. + */ +int hibernate_quiet_exec(int (*func)(void *data), void *data) +{ + int error, nr_calls = 0; + + lock_system_sleep(); + + if (!hibernate_acquire()) { + error = -EBUSY; + goto unlock; + } + + pm_prepare_console(); + + error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); + if (error) { + nr_calls--; + goto exit; + } + + error = freeze_processes(); + if (error) + goto exit; + + lock_device_hotplug(); + + pm_suspend_clear_flags(); + + error = platform_begin(true); + if (error) + goto thaw; + + error = freeze_kernel_threads(); + if (error) + goto thaw; + + error = dpm_prepare(PMSG_FREEZE); + if (error) + goto dpm_complete; + + suspend_console(); + + error = dpm_suspend(PMSG_FREEZE); + if (error) + goto dpm_resume; + + error = dpm_suspend_end(PMSG_FREEZE); + if (error) + goto dpm_resume; + + error = platform_pre_snapshot(true); + if (error) + goto skip; + + error = func(data); + +skip: + platform_finish(true); + + dpm_resume_start(PMSG_THAW); + +dpm_resume: + dpm_resume(PMSG_THAW); + + resume_console(); + +dpm_complete: + dpm_complete(PMSG_THAW); + + thaw_kernel_threads(); + +thaw: + platform_end(true); + + unlock_device_hotplug(); + + thaw_processes(); + +exit: + __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL); + + pm_restore_console(); + + hibernate_release(); + +unlock: + unlock_system_sleep(); + + return error; +} +EXPORT_SYMBOL_GPL(hibernate_quiet_exec); /** * software_resume - Resume from a saved hibernation image. -- cgit v1.2.3 From 4fd5750af02ab7bba7c58a073060cc1da8a69173 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Jul 2020 23:49:18 +0200 Subject: sched,tracing: Convert to sched_set_fifo() One module user of sched_setscheduler() was overlooked and is obviously causing build failures. Convert ring_buffer_benchmark to use sched_set_fifo_low() when fifo==1 and sched_set_fifo() when fifo==2. This is a bit of an abuse, but it makes the thing 'work' again. Specifically, it enables all combinations that were previously possible: producer higher than consumer consumer higher than producer Fixes: 616d91b68cd5 ("sched: Remove sched_setscheduler*() EXPORTs") Reported-by: kernel test robot Reported-by: Stephen Rothwell Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Link: https://lkml.kernel.org/r/20200720214918.GM5523@worktop.programming.kicks-ass.net --- kernel/trace/ring_buffer_benchmark.c | 48 +++++++++++++++++------------------- 1 file changed, 23 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 8df0aa810950..78e576575b79 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -45,8 +45,8 @@ MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); static int producer_nice = MAX_NICE; static int consumer_nice = MAX_NICE; -static int producer_fifo = -1; -static int consumer_fifo = -1; +static int producer_fifo; +static int consumer_fifo; module_param(producer_nice, int, 0644); MODULE_PARM_DESC(producer_nice, "nice prio for producer"); @@ -55,10 +55,10 @@ module_param(consumer_nice, int, 0644); MODULE_PARM_DESC(consumer_nice, "nice prio for consumer"); module_param(producer_fifo, int, 0644); -MODULE_PARM_DESC(producer_fifo, "fifo prio for producer"); +MODULE_PARM_DESC(producer_fifo, "use fifo for producer: 0 - disabled, 1 - low prio, 2 - fifo"); module_param(consumer_fifo, int, 0644); -MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer"); +MODULE_PARM_DESC(consumer_fifo, "use fifo for consumer: 0 - disabled, 1 - low prio, 2 - fifo"); static int read_events; @@ -303,22 +303,22 @@ static void ring_buffer_producer(void) trace_printk("ERROR!\n"); if (!disable_reader) { - if (consumer_fifo < 0) + if (consumer_fifo) + trace_printk("Running Consumer at SCHED_FIFO %s\n", + consumer_fifo == 1 ? "low" : "high"); + else trace_printk("Running Consumer at nice: %d\n", consumer_nice); - else - trace_printk("Running Consumer at SCHED_FIFO %d\n", - consumer_fifo); } - if (producer_fifo < 0) + if (producer_fifo) + trace_printk("Running Producer at SCHED_FIFO %s\n", + producer_fifo == 1 ? "low" : "high"); + else trace_printk("Running Producer at nice: %d\n", producer_nice); - else - trace_printk("Running Producer at SCHED_FIFO %d\n", - producer_fifo); /* Let the user know that the test is running at low priority */ - if (producer_fifo < 0 && consumer_fifo < 0 && + if (!producer_fifo && !consumer_fifo && producer_nice == MAX_NICE && consumer_nice == MAX_NICE) trace_printk("WARNING!!! This test is running at lowest priority.\n"); @@ -455,21 +455,19 @@ static int __init ring_buffer_benchmark_init(void) * Run them as low-prio background tasks by default: */ if (!disable_reader) { - if (consumer_fifo >= 0) { - struct sched_param param = { - .sched_priority = consumer_fifo - }; - sched_setscheduler(consumer, SCHED_FIFO, ¶m); - } else + if (consumer_fifo >= 2) + sched_set_fifo(consumer); + else if (consumer_fifo == 1) + sched_set_fifo_low(consumer); + else set_user_nice(consumer, consumer_nice); } - if (producer_fifo >= 0) { - struct sched_param param = { - .sched_priority = producer_fifo - }; - sched_setscheduler(producer, SCHED_FIFO, ¶m); - } else + if (producer_fifo >= 2) + sched_set_fifo(producer); + else if (producer_fifo == 1) + sched_set_fifo_low(producer); + else set_user_nice(producer, producer_nice); return 0; -- cgit v1.2.3 From e65855a52b479f98674998cb23b21ef5a8144b04 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Thu, 16 Jul 2020 12:03:47 +0100 Subject: sched/uclamp: Fix a deadlock when enabling uclamp static key The following splat was caught when setting uclamp value of a task: BUG: sleeping function called from invalid context at ./include/linux/percpu-rwsem.h:49 cpus_read_lock+0x68/0x130 static_key_enable+0x1c/0x38 __sched_setscheduler+0x900/0xad8 Fix by ensuring we enable the key outside of the critical section in __sched_setscheduler() Fixes: 46609ce22703 ("sched/uclamp: Protect uclamp fast path code with static key") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200716110347.19553-4-qais.yousef@arm.com --- kernel/sched/core.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 678253450ebb..e44d83f3e0e6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1346,6 +1346,15 @@ static int uclamp_validate(struct task_struct *p, if (upper_bound > SCHED_CAPACITY_SCALE) return -EINVAL; + /* + * We have valid uclamp attributes; make sure uclamp is enabled. + * + * We need to do that here, because enabling static branches is a + * blocking operation which obviously cannot be done while holding + * scheduler locks. + */ + static_branch_enable(&sched_uclamp_used); + return 0; } @@ -1376,8 +1385,6 @@ static void __setscheduler_uclamp(struct task_struct *p, if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) return; - static_branch_enable(&sched_uclamp_used); - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], attr->sched_util_min, true); -- cgit v1.2.3 From 13685c4a08fca9dd76bf53bfcbadc044ab2a08cb Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Thu, 16 Jul 2020 12:03:45 +0100 Subject: sched/uclamp: Add a new sysctl to control RT default boost value RT tasks by default run at the highest capacity/performance level. When uclamp is selected this default behavior is retained by enforcing the requested uclamp.min (p->uclamp_req[UCLAMP_MIN]) of the RT tasks to be uclamp_none(UCLAMP_MAX), which is SCHED_CAPACITY_SCALE; the maximum value. This is also referred to as 'the default boost value of RT tasks'. See commit 1a00d999971c ("sched/uclamp: Set default clamps for RT tasks"). On battery powered devices, it is desired to control this default (currently hardcoded) behavior at runtime to reduce energy consumed by RT tasks. For example, a mobile device manufacturer where big.LITTLE architecture is dominant, the performance of the little cores varies across SoCs, and on high end ones the big cores could be too power hungry. Given the diversity of SoCs, the new knob allows manufactures to tune the best performance/power for RT tasks for the particular hardware they run on. They could opt to further tune the value when the user selects a different power saving mode or when the device is actively charging. The runtime aspect of it further helps in creating a single kernel image that can be run on multiple devices that require different tuning. Keep in mind that a lot of RT tasks in the system are created by the kernel. On Android for instance I can see over 50 RT tasks, only a handful of which created by the Android framework. To control the default behavior globally by system admins and device integrator, introduce the new sysctl_sched_uclamp_util_min_rt_default to change the default boost value of the RT tasks. I anticipate this to be mostly in the form of modifying the init script of a particular device. To avoid polluting the fast path with unnecessary code, the approach taken is to synchronously do the update by traversing all the existing tasks in the system. This could race with a concurrent fork(), which is dealt with by introducing sched_post_fork() function which will ensure the racy fork will get the right update applied. Tested on Juno-r2 in combination with the RT capacity awareness [1]. By default an RT task will go to the highest capacity CPU and run at the maximum frequency, which is particularly energy inefficient on high end mobile devices because the biggest core[s] are 'huge' and power hungry. With this patch the RT task can be controlled to run anywhere by default, and doesn't cause the frequency to be maximum all the time. Yet any task that really needs to be boosted can easily escape this default behavior by modifying its requested uclamp.min value (p->uclamp_req[UCLAMP_MIN]) via sched_setattr() syscall. [1] 804d402fb6f6: ("sched/rt: Make RT capacity-aware") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200716110347.19553-2-qais.yousef@arm.com --- include/linux/sched.h | 10 +++- include/linux/sched/sysctl.h | 1 + include/linux/sched/task.h | 1 + kernel/fork.c | 1 + kernel/sched/core.c | 119 ++++++++++++++++++++++++++++++++++++++++--- kernel/sysctl.c | 7 +++ 6 files changed, 131 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index adf0125190d4..a6bf77c34687 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -686,9 +686,15 @@ struct task_struct { struct sched_dl_entity dl; #ifdef CONFIG_UCLAMP_TASK - /* Clamp values requested for a scheduling entity */ + /* + * Clamp values requested for a scheduling entity. + * Must be updated with task_rq_lock() held. + */ struct uclamp_se uclamp_req[UCLAMP_CNT]; - /* Effective clamp values used for a scheduling entity */ + /* + * Effective clamp values used for a scheduling entity. + * Must be updated with task_rq_lock() held. + */ struct uclamp_se uclamp[UCLAMP_CNT]; #endif diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 24be30a40814..3c31ba88aca5 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -67,6 +67,7 @@ extern unsigned int sysctl_sched_dl_period_min; #ifdef CONFIG_UCLAMP_TASK extern unsigned int sysctl_sched_uclamp_util_min; extern unsigned int sysctl_sched_uclamp_util_max; +extern unsigned int sysctl_sched_uclamp_util_min_rt_default; #endif #ifdef CONFIG_CFS_BANDWIDTH diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 38359071236a..e7ddab095baf 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -55,6 +55,7 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); +extern void sched_post_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void); diff --git a/kernel/fork.c b/kernel/fork.c index efc5493203ae..e75c2e41f3d1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2304,6 +2304,7 @@ static __latent_entropy struct task_struct *copy_process( write_unlock_irq(&tasklist_lock); proc_fork_connector(p); + sched_post_fork(p); cgroup_post_fork(p, args); perf_event_fork(p); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e44d83f3e0e6..12e1f3a2cabc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -889,6 +889,23 @@ unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; /* Max allowed maximum utilization */ unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; +/* + * By default RT tasks run at the maximum performance point/capacity of the + * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to + * SCHED_CAPACITY_SCALE. + * + * This knob allows admins to change the default behavior when uclamp is being + * used. In battery powered devices, particularly, running at the maximum + * capacity and frequency will increase energy consumption and shorten the + * battery life. + * + * This knob only affects RT tasks that their uclamp_se->user_defined == false. + * + * This knob will not override the system default sched_util_clamp_min defined + * above. + */ +unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; + /* All clamps are required to be less or equal than these values */ static struct uclamp_se uclamp_default[UCLAMP_CNT]; @@ -991,6 +1008,64 @@ unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, return uclamp_idle_value(rq, clamp_id, clamp_value); } +static void __uclamp_update_util_min_rt_default(struct task_struct *p) +{ + unsigned int default_util_min; + struct uclamp_se *uc_se; + + lockdep_assert_held(&p->pi_lock); + + uc_se = &p->uclamp_req[UCLAMP_MIN]; + + /* Only sync if user didn't override the default */ + if (uc_se->user_defined) + return; + + default_util_min = sysctl_sched_uclamp_util_min_rt_default; + uclamp_se_set(uc_se, default_util_min, false); +} + +static void uclamp_update_util_min_rt_default(struct task_struct *p) +{ + struct rq_flags rf; + struct rq *rq; + + if (!rt_task(p)) + return; + + /* Protect updates to p->uclamp_* */ + rq = task_rq_lock(p, &rf); + __uclamp_update_util_min_rt_default(p); + task_rq_unlock(rq, p, &rf); +} + +static void uclamp_sync_util_min_rt_default(void) +{ + struct task_struct *g, *p; + + /* + * copy_process() sysctl_uclamp + * uclamp_min_rt = X; + * write_lock(&tasklist_lock) read_lock(&tasklist_lock) + * // link thread smp_mb__after_spinlock() + * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock); + * sched_post_fork() for_each_process_thread() + * __uclamp_sync_rt() __uclamp_sync_rt() + * + * Ensures that either sched_post_fork() will observe the new + * uclamp_min_rt or for_each_process_thread() will observe the new + * task. + */ + read_lock(&tasklist_lock); + smp_mb__after_spinlock(); + read_unlock(&tasklist_lock); + + rcu_read_lock(); + for_each_process_thread(g, p) + uclamp_update_util_min_rt_default(p); + rcu_read_unlock(); +} + static inline struct uclamp_se uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) { @@ -1278,12 +1353,13 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { bool update_root_tg = false; - int old_min, old_max; + int old_min, old_max, old_min_rt; int result; mutex_lock(&uclamp_mutex); old_min = sysctl_sched_uclamp_util_min; old_max = sysctl_sched_uclamp_util_max; + old_min_rt = sysctl_sched_uclamp_util_min_rt_default; result = proc_dointvec(table, write, buffer, lenp, ppos); if (result) @@ -1292,7 +1368,9 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, goto done; if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max || - sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) { + sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE || + sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) { + result = -EINVAL; goto undo; } @@ -1313,6 +1391,11 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, uclamp_update_root_tg(); } + if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) { + static_branch_enable(&sched_uclamp_used); + uclamp_sync_util_min_rt_default(); + } + /* * We update all RUNNABLE tasks only when task groups are in use. * Otherwise, keep it simple and do just a lazy update at each next @@ -1324,6 +1407,7 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, undo: sysctl_sched_uclamp_util_min = old_min; sysctl_sched_uclamp_util_max = old_max; + sysctl_sched_uclamp_util_min_rt_default = old_min_rt; done: mutex_unlock(&uclamp_mutex); @@ -1369,17 +1453,20 @@ static void __setscheduler_uclamp(struct task_struct *p, */ for_each_clamp_id(clamp_id) { struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; - unsigned int clamp_value = uclamp_none(clamp_id); /* Keep using defined clamps across class changes */ if (uc_se->user_defined) continue; - /* By default, RT tasks always get 100% boost */ + /* + * RT by default have a 100% boost value that could be modified + * at runtime. + */ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) - clamp_value = uclamp_none(UCLAMP_MAX); + __uclamp_update_util_min_rt_default(p); + else + uclamp_se_set(uc_se, uclamp_none(clamp_id), false); - uclamp_se_set(uc_se, clamp_value, false); } if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) @@ -1400,6 +1487,10 @@ static void uclamp_fork(struct task_struct *p) { enum uclamp_id clamp_id; + /* + * We don't need to hold task_rq_lock() when updating p->uclamp_* here + * as the task is still at its early fork stages. + */ for_each_clamp_id(clamp_id) p->uclamp[clamp_id].active = false; @@ -1412,6 +1503,11 @@ static void uclamp_fork(struct task_struct *p) } } +static void uclamp_post_fork(struct task_struct *p) +{ + uclamp_update_util_min_rt_default(p); +} + static void __init init_uclamp_rq(struct rq *rq) { enum uclamp_id clamp_id; @@ -1462,6 +1558,7 @@ static inline int uclamp_validate(struct task_struct *p, static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { } static inline void uclamp_fork(struct task_struct *p) { } +static inline void uclamp_post_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } #endif /* CONFIG_UCLAMP_TASK */ @@ -3205,6 +3302,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } +void sched_post_fork(struct task_struct *p) +{ + uclamp_post_fork(p); +} + unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) @@ -5724,6 +5826,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, kattr.sched_nice = task_nice(p); #ifdef CONFIG_UCLAMP_TASK + /* + * This could race with another potential updater, but this is fine + * because it'll correctly read the old or the new value. We don't need + * to guarantee who wins the race as long as it doesn't return garbage. + */ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4aea67d3d552..1b4d2dc270a5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1815,6 +1815,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sysctl_sched_uclamp_handler, }, + { + .procname = "sched_util_clamp_min_rt_default", + .data = &sysctl_sched_uclamp_util_min_rt_default, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, #endif #ifdef CONFIG_SCHED_AUTOGROUP { -- cgit v1.2.3 From f891f19736bdf404845f97d8038054be37160ea8 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 29 Jul 2020 17:09:19 +0530 Subject: kexec_file: Allow archs to handle special regions while locating memory hole Some architectures may have special memory regions, within the given memory range, which can't be used for the buffer in a kexec segment. Implement weak arch_kexec_locate_mem_hole() definition which arch code may override, to take care of special regions, while trying to locate a memory hole. Also, add the missing declarations for arch overridable functions and and drop the __weak descriptors in the declarations to avoid non-weak definitions from becoming weak. Signed-off-by: Hari Bathini Tested-by: Pingfan Liu Reviewed-by: Thiago Jung Bauermann Acked-by: Dave Young Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/159602273603.575379.17665852963340380839.stgit@hbathini --- include/linux/kexec.h | 29 ++++++++++++++++++----------- kernel/kexec_file.c | 16 ++++++++++++++-- 2 files changed, 32 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index ea67910ae6b7..9e93bef52968 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -183,17 +183,24 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, bool get_value); void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name); -int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, - unsigned long buf_len); -void * __weak arch_kexec_kernel_image_load(struct kimage *image); -int __weak arch_kexec_apply_relocations_add(struct purgatory_info *pi, - Elf_Shdr *section, - const Elf_Shdr *relsec, - const Elf_Shdr *symtab); -int __weak arch_kexec_apply_relocations(struct purgatory_info *pi, - Elf_Shdr *section, - const Elf_Shdr *relsec, - const Elf_Shdr *symtab); +/* Architectures may override the below functions */ +int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len); +void *arch_kexec_kernel_image_load(struct kimage *image); +int arch_kexec_apply_relocations_add(struct purgatory_info *pi, + Elf_Shdr *section, + const Elf_Shdr *relsec, + const Elf_Shdr *symtab); +int arch_kexec_apply_relocations(struct purgatory_info *pi, + Elf_Shdr *section, + const Elf_Shdr *relsec, + const Elf_Shdr *symtab); +int arch_kimage_file_post_load_cleanup(struct kimage *image); +#ifdef CONFIG_KEXEC_SIG +int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, + unsigned long buf_len); +#endif +int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf); extern int kexec_add_buffer(struct kexec_buf *kbuf); int kexec_locate_mem_hole(struct kexec_buf *kbuf); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index bb05fd52de85..eb42d2efa16a 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -657,6 +657,19 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) return ret == 1 ? 0 : -EADDRNOTAVAIL; } +/** + * arch_kexec_locate_mem_hole - Find free memory to place the segments. + * @kbuf: Parameters for the memory search. + * + * On success, kbuf->mem will have the start address of the memory region found. + * + * Return: 0 on success, negative errno on error. + */ +int __weak arch_kexec_locate_mem_hole(struct kexec_buf *kbuf) +{ + return kexec_locate_mem_hole(kbuf); +} + /** * kexec_add_buffer - place a buffer in a kexec segment * @kbuf: Buffer contents and memory parameters. @@ -669,7 +682,6 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) */ int kexec_add_buffer(struct kexec_buf *kbuf) { - struct kexec_segment *ksegment; int ret; @@ -697,7 +709,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf) kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE); /* Walk the RAM ranges and allocate a suitable range for the buffer */ - ret = kexec_locate_mem_hole(kbuf); + ret = arch_kexec_locate_mem_hole(kbuf); if (ret) return ret; -- cgit v1.2.3 From b75058614fdd3140074a640b514f6a0b4d485a2d Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Mon, 20 Jul 2020 17:55:19 +0200 Subject: sched: tasks: Use sequence counter with associated spinlock A sequence counter write side critical section must be protected by some form of locking to serialize writers. A plain seqcount_t does not contain the information of which lock must be held when entering a write side critical section. Use the new seqcount_spinlock_t data type, which allows to associate a spinlock with the sequence counter. This enables lockdep to verify that the spinlock used for writer serialization is held when the write side critical section is entered. If lockdep is disabled this lock association is compiled out and has neither storage size nor runtime overhead. Signed-off-by: Ahmed S. Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200720155530.1173732-14-a.darwish@linutronix.de --- include/linux/sched.h | 2 +- init/init_task.c | 3 ++- kernel/fork.c | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8d1de021b315..9a9d8263962d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1050,7 +1050,7 @@ struct task_struct { /* Protected by ->alloc_lock: */ nodemask_t mems_allowed; /* Seqence number to catch updates: */ - seqcount_t mems_allowed_seq; + seqcount_spinlock_t mems_allowed_seq; int cpuset_mem_spread_rotor; int cpuset_slab_spread_rotor; #endif diff --git a/init/init_task.c b/init/init_task.c index 15089d15010a..94fe3ba1bb60 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -154,7 +154,8 @@ struct task_struct init_task .trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list), #endif #ifdef CONFIG_CPUSETS - .mems_allowed_seq = SEQCNT_ZERO(init_task.mems_allowed_seq), + .mems_allowed_seq = SEQCNT_SPINLOCK_ZERO(init_task.mems_allowed_seq, + &init_task.alloc_lock), #endif #ifdef CONFIG_RT_MUTEXES .pi_waiters = RB_ROOT_CACHED, diff --git a/kernel/fork.c b/kernel/fork.c index 70d9d0a4de2a..fc72f09a61b2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2032,7 +2032,7 @@ static __latent_entropy struct task_struct *copy_process( #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; p->cpuset_slab_spread_rotor = NUMA_NO_NODE; - seqcount_init(&p->mems_allowed_seq); + seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock); #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; -- cgit v1.2.3 From 025e82bcbc34cd071390e72fd0b31593282f9425 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Mon, 20 Jul 2020 17:55:23 +0200 Subject: timekeeping: Use sequence counter with associated raw spinlock A sequence counter write side critical section must be protected by some form of locking to serialize writers. A plain seqcount_t does not contain the information of which lock must be held when entering a write side critical section. Use the new seqcount_raw_spinlock_t data type, which allows to associate a raw spinlock with the sequence counter. This enables lockdep to verify that the raw spinlock used for writer serialization is held when the write side critical section is entered. If lockdep is disabled this lock association is compiled out and has neither storage size nor runtime overhead. Signed-off-by: Ahmed S. Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200720155530.1173732-18-a.darwish@linutronix.de --- kernel/time/timekeeping.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d20d489841c8..05ecfd8a3314 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -39,18 +39,19 @@ enum timekeeping_adv_mode { TK_ADV_FREQ }; +static DEFINE_RAW_SPINLOCK(timekeeper_lock); + /* * The most important data for readout fits into a single 64 byte * cache line. */ static struct { - seqcount_t seq; + seqcount_raw_spinlock_t seq; struct timekeeper timekeeper; } tk_core ____cacheline_aligned = { - .seq = SEQCNT_ZERO(tk_core.seq), + .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock), }; -static DEFINE_RAW_SPINLOCK(timekeeper_lock); static struct timekeeper shadow_timekeeper; /** @@ -63,7 +64,7 @@ static struct timekeeper shadow_timekeeper; * See @update_fast_timekeeper() below. */ struct tk_fast { - seqcount_t seq; + seqcount_raw_spinlock_t seq; struct tk_read_base base[2]; }; @@ -80,11 +81,13 @@ static struct clocksource dummy_clock = { }; static struct tk_fast tk_fast_mono ____cacheline_aligned = { + .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_mono.seq, &timekeeper_lock), .base[0] = { .clock = &dummy_clock, }, .base[1] = { .clock = &dummy_clock, }, }; static struct tk_fast tk_fast_raw ____cacheline_aligned = { + .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_raw.seq, &timekeeper_lock), .base[0] = { .clock = &dummy_clock, }, .base[1] = { .clock = &dummy_clock, }, }; @@ -157,7 +160,7 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) * tk_clock_read - atomic clocksource read() helper * * This helper is necessary to use in the read paths because, while the - * seqlock ensures we don't return a bad value while structures are updated, + * seqcount ensures we don't return a bad value while structures are updated, * it doesn't protect from potential crashes. There is the possibility that * the tkr's clocksource may change between the read reference, and the * clock reference passed to the read function. This can cause crashes if @@ -222,10 +225,10 @@ static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr) unsigned int seq; /* - * Since we're called holding a seqlock, the data may shift + * Since we're called holding a seqcount, the data may shift * under us while we're doing the calculation. This can cause * false positives, since we'd note a problem but throw the - * results away. So nest another seqlock here to atomically + * results away. So nest another seqcount here to atomically * grab the points we are checking with. */ do { @@ -486,7 +489,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); * * To keep it NMI safe since we're accessing from tracing, we're not using a * separate timekeeper with updates to monotonic clock and boot offset - * protected with seqlocks. This has the following minor side effects: + * protected with seqcounts. This has the following minor side effects: * * (1) Its possible that a timestamp be taken after the boot offset is updated * but before the timekeeper is updated. If this happens, the new boot offset -- cgit v1.2.3 From af5a06b582ec3d7b0160b4faaa65f73d8dcf989f Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Mon, 20 Jul 2020 17:55:30 +0200 Subject: hrtimer: Use sequence counter with associated raw spinlock A sequence counter write side critical section must be protected by some form of locking to serialize writers. A plain seqcount_t does not contain the information of which lock must be held when entering a write side critical section. Use the new seqcount_raw_spinlock_t data type, which allows to associate a raw spinlock with the sequence counter. This enables lockdep to verify that the raw spinlock used for writer serialization is held when the write side critical section is entered. If lockdep is disabled this lock association is compiled out and has neither storage size nor runtime overhead. Signed-off-by: Ahmed S. Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200720155530.1173732-25-a.darwish@linutronix.de --- include/linux/hrtimer.h | 2 +- kernel/time/hrtimer.c | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 15c8ac313678..25993b86ac5c 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -159,7 +159,7 @@ struct hrtimer_clock_base { struct hrtimer_cpu_base *cpu_base; unsigned int index; clockid_t clockid; - seqcount_t seq; + seqcount_raw_spinlock_t seq; struct hrtimer *running; struct timerqueue_head active; ktime_t (*get_time)(void); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d89da1c7e005..c4038511d5c9 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -135,7 +135,11 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { * timer->base->cpu_base */ static struct hrtimer_cpu_base migration_cpu_base = { - .clock_base = { { .cpu_base = &migration_cpu_base, }, }, + .clock_base = { { + .cpu_base = &migration_cpu_base, + .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, + &migration_cpu_base.lock), + }, }, }; #define migration_base migration_cpu_base.clock_base[0] @@ -1998,8 +2002,11 @@ int hrtimers_prepare_cpu(unsigned int cpu) int i; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - cpu_base->clock_base[i].cpu_base = cpu_base; - timerqueue_init_head(&cpu_base->clock_base[i].active); + struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i]; + + clock_b->cpu_base = cpu_base; + seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); + timerqueue_init_head(&clock_b->active); } cpu_base->cpu = cpu; -- cgit v1.2.3 From b13fecb1c3a603c4b8e99b306fecf4f668c11b32 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 13 Jul 2020 15:01:26 -0700 Subject: treewide: Replace DECLARE_TASKLET() with DECLARE_TASKLET_OLD() This converts all the existing DECLARE_TASKLET() (and ...DISABLED) macros with DECLARE_TASKLET_OLD() in preparation for refactoring the tasklet callback type. All existing DECLARE_TASKLET() users had a "0" data argument, it has been removed here as well. Reviewed-by: Greg Kroah-Hartman Acked-by: Thomas Gleixner Signed-off-by: Kees Cook --- drivers/input/keyboard/omap-keypad.c | 2 +- drivers/input/serio/hil_mlc.c | 2 +- drivers/net/wan/farsync.c | 4 ++-- drivers/s390/crypto/ap_bus.c | 2 +- drivers/staging/most/dim2/dim2.c | 2 +- drivers/staging/octeon/ethernet-tx.c | 2 +- drivers/tty/vt/keyboard.c | 2 +- drivers/usb/gadget/udc/snps_udc_core.c | 2 +- drivers/usb/host/fhci-sched.c | 2 +- include/linux/interrupt.h | 15 ++++++++++----- kernel/backtracetest.c | 2 +- kernel/debug/debug_core.c | 2 +- kernel/irq/resend.c | 2 +- net/atm/pppoatm.c | 2 +- net/iucv/iucv.c | 2 +- sound/drivers/pcsp/pcsp_lib.c | 2 +- 16 files changed, 26 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/drivers/input/keyboard/omap-keypad.c b/drivers/input/keyboard/omap-keypad.c index 5fe7a5633e33..dbe836c7ff47 100644 --- a/drivers/input/keyboard/omap-keypad.c +++ b/drivers/input/keyboard/omap-keypad.c @@ -46,7 +46,7 @@ struct omap_kp { unsigned short keymap[]; }; -static DECLARE_TASKLET_DISABLED(kp_tasklet, omap_kp_tasklet, 0); +static DECLARE_TASKLET_DISABLED_OLD(kp_tasklet, omap_kp_tasklet); static unsigned int *row_gpios; static unsigned int *col_gpios; diff --git a/drivers/input/serio/hil_mlc.c b/drivers/input/serio/hil_mlc.c index e1423f7648d6..65f4e9d62a67 100644 --- a/drivers/input/serio/hil_mlc.c +++ b/drivers/input/serio/hil_mlc.c @@ -77,7 +77,7 @@ static struct timer_list hil_mlcs_kicker; static int hil_mlcs_probe; static void hil_mlcs_process(unsigned long unused); -static DECLARE_TASKLET_DISABLED(hil_mlcs_tasklet, hil_mlcs_process, 0); +static DECLARE_TASKLET_DISABLED_OLD(hil_mlcs_tasklet, hil_mlcs_process); /* #define HIL_MLC_DEBUG */ diff --git a/drivers/net/wan/farsync.c b/drivers/net/wan/farsync.c index 7916efce7188..f5198a391417 100644 --- a/drivers/net/wan/farsync.c +++ b/drivers/net/wan/farsync.c @@ -569,8 +569,8 @@ static void do_bottom_half_rx(struct fst_card_info *card); static void fst_process_tx_work_q(unsigned long work_q); static void fst_process_int_work_q(unsigned long work_q); -static DECLARE_TASKLET(fst_tx_task, fst_process_tx_work_q, 0); -static DECLARE_TASKLET(fst_int_task, fst_process_int_work_q, 0); +static DECLARE_TASKLET_OLD(fst_tx_task, fst_process_tx_work_q); +static DECLARE_TASKLET_OLD(fst_int_task, fst_process_int_work_q); static struct fst_card_info *fst_card_array[FST_MAX_CARDS]; static spinlock_t fst_work_q_lock; diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index e71ca4a719a5..2589ccd257e3 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -93,7 +93,7 @@ static DECLARE_WORK(ap_scan_work, ap_scan_bus); * Tasklet & timer for AP request polling and interrupts */ static void ap_tasklet_fn(unsigned long); -static DECLARE_TASKLET(ap_tasklet, ap_tasklet_fn, 0); +static DECLARE_TASKLET_OLD(ap_tasklet, ap_tasklet_fn); static DECLARE_WAIT_QUEUE_HEAD(ap_poll_wait); static struct task_struct *ap_poll_kthread; static DEFINE_MUTEX(ap_poll_thread_mutex); diff --git a/drivers/staging/most/dim2/dim2.c b/drivers/staging/most/dim2/dim2.c index 8e0f27e61652..509c8012d20b 100644 --- a/drivers/staging/most/dim2/dim2.c +++ b/drivers/staging/most/dim2/dim2.c @@ -46,7 +46,7 @@ MODULE_PARM_DESC(fcnt, "Num of frames per sub-buffer for sync channels as a powe static DEFINE_SPINLOCK(dim_lock); static void dim2_tasklet_fn(unsigned long data); -static DECLARE_TASKLET(dim2_tasklet, dim2_tasklet_fn, 0); +static DECLARE_TASKLET_OLD(dim2_tasklet, dim2_tasklet_fn); /** * struct hdm_channel - private structure to keep channel specific data diff --git a/drivers/staging/octeon/ethernet-tx.c b/drivers/staging/octeon/ethernet-tx.c index ab7dd8216006..9c71ad5af7b9 100644 --- a/drivers/staging/octeon/ethernet-tx.c +++ b/drivers/staging/octeon/ethernet-tx.c @@ -41,7 +41,7 @@ #endif static void cvm_oct_tx_do_cleanup(unsigned long arg); -static DECLARE_TASKLET(cvm_oct_tx_cleanup_tasklet, cvm_oct_tx_do_cleanup, 0); +static DECLARE_TASKLET_OLD(cvm_oct_tx_cleanup_tasklet, cvm_oct_tx_do_cleanup); /* Maximum number of SKBs to try to free per xmit packet. */ #define MAX_SKB_TO_FREE (MAX_OUT_QUEUE_DEPTH * 2) diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c index 568b2171f335..f80199984ee0 100644 --- a/drivers/tty/vt/keyboard.c +++ b/drivers/tty/vt/keyboard.c @@ -1236,7 +1236,7 @@ static void kbd_bh(unsigned long dummy) } } -DECLARE_TASKLET_DISABLED(keyboard_tasklet, kbd_bh, 0); +DECLARE_TASKLET_DISABLED_OLD(keyboard_tasklet, kbd_bh); #if defined(CONFIG_X86) || defined(CONFIG_IA64) || defined(CONFIG_ALPHA) ||\ defined(CONFIG_MIPS) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) ||\ diff --git a/drivers/usb/gadget/udc/snps_udc_core.c b/drivers/usb/gadget/udc/snps_udc_core.c index afdd28f332ce..e76f1a50b0fc 100644 --- a/drivers/usb/gadget/udc/snps_udc_core.c +++ b/drivers/usb/gadget/udc/snps_udc_core.c @@ -96,7 +96,7 @@ static int stop_pollstall_timer; static DECLARE_COMPLETION(on_pollstall_exit); /* tasklet for usb disconnect */ -static DECLARE_TASKLET(disconnect_tasklet, udc_tasklet_disconnect, 0); +static DECLARE_TASKLET_OLD(disconnect_tasklet, udc_tasklet_disconnect); /* endpoint names used for print */ static const char ep0_string[] = "ep0in"; diff --git a/drivers/usb/host/fhci-sched.c b/drivers/usb/host/fhci-sched.c index 3235d5307403..5c423f240a1f 100644 --- a/drivers/usb/host/fhci-sched.c +++ b/drivers/usb/host/fhci-sched.c @@ -677,7 +677,7 @@ static void process_done_list(unsigned long data) enable_irq(fhci_to_hcd(fhci)->irq); } -DECLARE_TASKLET(fhci_tasklet, process_done_list, 0); +DECLARE_TASKLET_OLD(fhci_tasklet, process_done_list); /* transfer complted callback */ u32 fhci_transfer_confirm_callback(struct fhci_hcd *fhci) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 5db970b6615a..b911196f03eb 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -612,12 +612,17 @@ struct tasklet_struct unsigned long data; }; -#define DECLARE_TASKLET(name, func, data) \ -struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(0), func, data } - -#define DECLARE_TASKLET_DISABLED(name, func, data) \ -struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data } +#define DECLARE_TASKLET_OLD(name, _func) \ +struct tasklet_struct name = { \ + .count = ATOMIC_INIT(0), \ + .func = _func, \ +} +#define DECLARE_TASKLET_DISABLED_OLD(name, _func) \ +struct tasklet_struct name = { \ + .count = ATOMIC_INIT(1), \ + .func = _func, \ +} enum { diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index a2a97fa3071b..370217dd7e39 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c @@ -29,7 +29,7 @@ static void backtrace_test_irq_callback(unsigned long data) complete(&backtrace_work); } -static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0); +static DECLARE_TASKLET_OLD(backtrace_tasklet, &backtrace_test_irq_callback); static void backtrace_test_irq(void) { diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 9e5934780f41..b16dbc1bf056 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -1068,7 +1068,7 @@ static void kgdb_tasklet_bpt(unsigned long ing) atomic_set(&kgdb_break_tasklet_var, 0); } -static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0); +static DECLARE_TASKLET_OLD(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt); void kgdb_schedule_breakpoint(void) { diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 27634f4022d0..c48ce19a257f 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -45,7 +45,7 @@ static void resend_irqs(unsigned long arg) } /* Tasklet to handle resend: */ -static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); +static DECLARE_TASKLET_OLD(resend_tasklet, resend_irqs); static int irq_sw_resend(struct irq_desc *desc) { diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c index 45d8e1d5d033..579b66da1d95 100644 --- a/net/atm/pppoatm.c +++ b/net/atm/pppoatm.c @@ -393,7 +393,7 @@ static int pppoatm_assign_vcc(struct atm_vcc *atmvcc, void __user *arg) * Each PPPoATM instance has its own tasklet - this is just a * prototypical one used to initialize them */ - static const DECLARE_TASKLET(tasklet_proto, pppoatm_wakeup_sender, 0); + static const DECLARE_TASKLET_OLD(tasklet_proto, pppoatm_wakeup_sender); if (copy_from_user(&be, arg, sizeof be)) return -EFAULT; if (be.encaps != PPPOATM_ENCAPS_AUTODETECT && diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 19250a0c85d3..cd2e468852e7 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -105,7 +105,7 @@ static LIST_HEAD(iucv_task_queue); * The tasklet for fast delivery of iucv interrupts. */ static void iucv_tasklet_fn(unsigned long); -static DECLARE_TASKLET(iucv_tasklet, iucv_tasklet_fn,0); +static DECLARE_TASKLET_OLD(iucv_tasklet, iucv_tasklet_fn); /* * Queue of interrupt buffers for delivery via a work queue diff --git a/sound/drivers/pcsp/pcsp_lib.c b/sound/drivers/pcsp/pcsp_lib.c index 05244b11ed5e..4e79293d7f11 100644 --- a/sound/drivers/pcsp/pcsp_lib.c +++ b/sound/drivers/pcsp/pcsp_lib.c @@ -36,7 +36,7 @@ static void pcsp_call_pcm_elapsed(unsigned long priv) } } -static DECLARE_TASKLET(pcsp_pcm_tasklet, pcsp_call_pcm_elapsed, 0); +static DECLARE_TASKLET_OLD(pcsp_pcm_tasklet, pcsp_call_pcm_elapsed); /* write the port and returns the next expire time in ns; * called at the trigger-start and in hrtimer callback -- cgit v1.2.3 From 12cc923f1ccc1df467e046b02a72c2b3b321b6a2 Mon Sep 17 00:00:00 2001 From: Romain Perier Date: Sun, 29 Sep 2019 18:30:13 +0200 Subject: tasklet: Introduce new initialization API Nowadays, modern kernel subsystems that use callbacks pass the data structure associated with a given callback as argument to the callback. The tasklet subsystem remains one which passes an arbitrary unsigned long to the callback function. This has several problems: - This keeps an extra field for storing the argument in each tasklet data structure, it bloats the tasklet_struct structure with a redundant .data field - No type checking can be performed on this argument. Instead of using container_of() like other callback subsystems, it forces callbacks to do explicit type cast of the unsigned long argument into the required object type. - Buffer overflows can overwrite the .func and the .data field, so an attacker can easily overwrite the function and its first argument to whatever it wants. Add a new tasklet initialization API, via DECLARE_TASKLET() and tasklet_setup(), which will replace the existing ones. This work is greatly inspired by the timer_struct conversion series, see commit e99e88a9d2b0 ("treewide: setup_timer() -> timer_setup()") To avoid problems with both -Wcast-function-type (which is enabled in the kernel via -Wextra is several subsystems), and with mismatched function prototypes when build with Control Flow Integrity enabled, this adds the "use_callback" member to let the tasklet caller choose which union member to call through. Once all old API uses are removed, this and the .data member will be removed as well. (On 64-bit this does not grow the struct size as the new member fills the hole after atomic_t, which is also "int" sized.) Signed-off-by: Romain Perier Co-developed-by: Allen Pais Signed-off-by: Allen Pais Reviewed-by: Greg Kroah-Hartman Acked-by: Thomas Gleixner Co-developed-by: Kees Cook Signed-off-by: Kees Cook --- include/linux/interrupt.h | 28 +++++++++++++++++++++++++++- kernel/softirq.c | 18 +++++++++++++++++- 2 files changed, 44 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index b911196f03eb..f9aee3538461 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -585,6 +585,9 @@ static inline struct task_struct *this_cpu_ksoftirqd(void) /* Tasklets --- multithreaded analogue of BHs. + This API is deprecated. Please consider using threaded IRQs instead: + https://lore.kernel.org/lkml/20200716081538.2sivhkj4hcyrusem@linutronix.de + Main feature differing them of generic softirqs: tasklet is running only on one CPU simultaneously. @@ -608,10 +611,31 @@ struct tasklet_struct struct tasklet_struct *next; unsigned long state; atomic_t count; - void (*func)(unsigned long); + bool use_callback; + union { + void (*func)(unsigned long data); + void (*callback)(struct tasklet_struct *t); + }; unsigned long data; }; +#define DECLARE_TASKLET(name, _callback) \ +struct tasklet_struct name = { \ + .count = ATOMIC_INIT(0), \ + .callback = _callback, \ + .use_callback = true, \ +} + +#define DECLARE_TASKLET_DISABLED(name, _callback) \ +struct tasklet_struct name = { \ + .count = ATOMIC_INIT(1), \ + .callback = _callback, \ + .use_callback = true, \ +} + +#define from_tasklet(var, callback_tasklet, tasklet_fieldname) \ + container_of(callback_tasklet, typeof(*var), tasklet_fieldname) + #define DECLARE_TASKLET_OLD(name, _func) \ struct tasklet_struct name = { \ .count = ATOMIC_INIT(0), \ @@ -691,6 +715,8 @@ extern void tasklet_kill(struct tasklet_struct *t); extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu); extern void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data); +extern void tasklet_setup(struct tasklet_struct *t, + void (*callback)(struct tasklet_struct *)); /* * Autoprobing for irqs: diff --git a/kernel/softirq.c b/kernel/softirq.c index c4201b7f42b1..292e7c2d2333 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -547,7 +547,10 @@ static void tasklet_action_common(struct softirq_action *a, if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) BUG(); - t->func(t->data); + if (t->use_callback) + t->callback(t); + else + t->func(t->data); tasklet_unlock(t); continue; } @@ -573,6 +576,18 @@ static __latent_entropy void tasklet_hi_action(struct softirq_action *a) tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); } +void tasklet_setup(struct tasklet_struct *t, + void (*callback)(struct tasklet_struct *)) +{ + t->next = NULL; + t->state = 0; + atomic_set(&t->count, 0); + t->callback = callback; + t->use_callback = true; + t->data = 0; +} +EXPORT_SYMBOL(tasklet_setup); + void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data) { @@ -580,6 +595,7 @@ void tasklet_init(struct tasklet_struct *t, t->state = 0; atomic_set(&t->count, 0); t->func = func; + t->use_callback = false; t->data = data; } EXPORT_SYMBOL(tasklet_init); -- cgit v1.2.3 From 4fc00b79b85d4c34bef06ad49f109ad7cd9e5d83 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 28 Jul 2020 15:18:01 -0700 Subject: bpf: Add missing newline characters in verifier error messages Newline characters are added in two verifier error messages, refactored in Commit afbf21dce668 ("bpf: Support readonly/readwrite buffers in verifier"). This way, they do not mix with messages afterwards. Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200728221801.1090349-1-yhs@fb.com --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 88bb25d08bf8..b6ccfce3bf4c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3069,7 +3069,7 @@ static int __check_buffer_access(struct bpf_verifier_env *env, { if (off < 0) { verbose(env, - "R%d invalid %s buffer access: off=%d, size=%d", + "R%d invalid %s buffer access: off=%d, size=%d\n", regno, buf_info, off, size); return -EACCES; } @@ -3078,7 +3078,7 @@ static int __check_buffer_access(struct bpf_verifier_env *env, tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, - "R%d invalid variable buffer offset: off=%d, var_off=%s", + "R%d invalid variable buffer offset: off=%d, var_off=%s\n", regno, off, tn_buf); return -EACCES; } -- cgit v1.2.3 From a9d0ba6772a0056457838022b624c3d2a17cee21 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Thu, 30 Jul 2020 16:23:17 +0800 Subject: tracing/hwlat: Drop the duplicate assignment in start_kthread() We have set 'current_mask' to '&save_cpumask' in its declaration, so there is no need to assign again. Link: https://lkml.kernel.org/r/20200730082318.42584-1-haokexin@gmail.com Signed-off-by: Kevin Hao Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index e2be7bb7ef7e..ddb528a6cd51 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -371,7 +371,6 @@ static int start_kthread(struct trace_array *tr) return 0; /* Just pick the first CPU on first iteration */ - current_mask = &save_cpumask; get_online_cpus(); cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); put_online_cpus(); -- cgit v1.2.3 From 96b4833b6827a62c295b149213c68b559514c929 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Thu, 30 Jul 2020 16:23:18 +0800 Subject: tracing/hwlat: Honor the tracing_cpumask In calculation of the cpu mask for the hwlat kernel thread, the wrong cpu mask is used instead of the tracing_cpumask, this causes the tracing/tracing_cpumask useless for hwlat tracer. Fixes it. Link: https://lkml.kernel.org/r/20200730082318.42584-2-haokexin@gmail.com Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: 0330f7aa8ee6 ("tracing: Have hwlat trace migrate across tracing_cpumask CPUs") Signed-off-by: Kevin Hao Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index ddb528a6cd51..17873e5d0353 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -283,6 +283,7 @@ static bool disable_migrate; static void move_to_next_cpu(void) { struct cpumask *current_mask = &save_cpumask; + struct trace_array *tr = hwlat_trace; int next_cpu; if (disable_migrate) @@ -296,7 +297,7 @@ static void move_to_next_cpu(void) goto disable; get_online_cpus(); - cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); + cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); next_cpu = cpumask_next(smp_processor_id(), current_mask); put_online_cpus(); @@ -372,7 +373,7 @@ static int start_kthread(struct trace_array *tr) /* Just pick the first CPU on first iteration */ get_online_cpus(); - cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); + cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); put_online_cpus(); next_cpu = cpumask_first(current_mask); -- cgit v1.2.3 From 8a224ffb3f52b0027f6b7279854c71a31c48fc97 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Wed, 29 Jul 2020 02:05:53 +0800 Subject: ftrace: Setup correct FTRACE_FL_REGS flags for module When module loaded and enabled, we will use __ftrace_replace_code for module if any ftrace_ops referenced it found. But we will get wrong ftrace_addr for module rec in ftrace_get_addr_new, because rec->flags has not been setup correctly. It can cause the callback function of a ftrace_ops has FTRACE_OPS_FL_SAVE_REGS to be called with pt_regs set to NULL. So setup correct FTRACE_FL_REGS flags for rec when we call referenced_filters to find ftrace_ops references it. Link: https://lkml.kernel.org/r/20200728180554.65203-1-zhouchengming@bytedance.com Cc: stable@vger.kernel.org Fixes: 8c4f3c3fa9681 ("ftrace: Check module functions being traced on reload") Signed-off-by: Chengming Zhou Signed-off-by: Muchun Song Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c141d347f71a..d052f856f1cf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6198,8 +6198,11 @@ static int referenced_filters(struct dyn_ftrace *rec) int cnt = 0; for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { - if (ops_references_rec(ops, rec)) - cnt++; + if (ops_references_rec(ops, rec)) { + cnt++; + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) + rec->flags |= FTRACE_FL_REGS; + } } return cnt; @@ -6378,8 +6381,8 @@ void ftrace_module_enable(struct module *mod) if (ftrace_start_up) cnt += referenced_filters(rec); - /* This clears FTRACE_FL_DISABLED */ - rec->flags = cnt; + rec->flags &= ~FTRACE_FL_DISABLED; + rec->flags += cnt; if (ftrace_start_up && cnt) { int failed = __ftrace_replace_code(rec, 1); -- cgit v1.2.3 From c5f51572a7fdff5be9e67c231de593155f394ab3 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Wed, 29 Jul 2020 02:05:54 +0800 Subject: ftrace: Do not let direct or IPMODIFY ftrace_ops be added to module and set trampolines When inserting a module, we find all ftrace_ops referencing it on the ftrace_ops_list. But FTRACE_OPS_FL_DIRECT and FTRACE_OPS_FL_IPMODIFY flags are special, and should not be set automatically. So warn and skip ftrace_ops that have these two flags set and adding new code. Also check if only one ftrace_ops references the module, in which case we can use a trampoline as an optimization. Link: https://lkml.kernel.org/r/20200728180554.65203-2-zhouchengming@bytedance.com Signed-off-by: Chengming Zhou Signed-off-by: Muchun Song Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d052f856f1cf..f433cb44300a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6199,9 +6199,17 @@ static int referenced_filters(struct dyn_ftrace *rec) for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { if (ops_references_rec(ops, rec)) { + if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_DIRECT)) + continue; + if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_IPMODIFY)) + continue; cnt++; if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) rec->flags |= FTRACE_FL_REGS; + if (cnt == 1 && ops->trampoline) + rec->flags |= FTRACE_FL_TRAMP; + else + rec->flags &= ~FTRACE_FL_TRAMP; } } -- cgit v1.2.3 From ee896ee8051a618ea4fe3d91c58d3e6ef9faf705 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Mon, 27 Jul 2020 11:28:40 +0200 Subject: tracing: Remove outdated comment in stack handling This comment describes the behaviour before commit 2a820bf74918 ("tracing: Use percpu stack trace buffer more intelligently"). Since that commit, interrupts and NMIs do use the per-cpu stacks so the comment is no longer correct. Remove it. (Note that the FTRACE_STACK_SIZE mentioned in the comment has never existed, it probably should have said FTRACE_STACK_ENTRIES.) Link: https://lkml.kernel.org/r/20200727092840.18659-1-vincent.whitchurch@axis.com Signed-off-by: Vincent Whitchurch Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4aab712f9567..dbcacdd56b02 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2930,12 +2930,6 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, skip++; #endif - /* - * Since events can happen in NMIs there's no safe way to - * use the per cpu ftrace_stacks. We reserve it and if an interrupt - * or NMI comes in, it will just have to use the default - * FTRACE_STACK_SIZE. - */ preempt_disable_notrace(); stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; -- cgit v1.2.3 From 0584df9c12f449124d0bfef9899e5365604ee7a9 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 29 Jul 2020 13:09:15 +0200 Subject: lockdep: Refactor IRQ trace events fields into struct Refactor the IRQ trace events fields, used for printing information about the IRQ trace events, into a separate struct 'irqtrace_events'. This improves readability by separating the information only used in reporting, as well as enables (simplified) storing/restoring of irqtrace_events snapshots. No functional change intended. Signed-off-by: Marco Elver Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200729110916.3920464-1-elver@google.com Signed-off-by: Ingo Molnar --- include/linux/irqflags.h | 13 +++++++++++ include/linux/sched.h | 11 ++------- kernel/fork.c | 16 +++++-------- kernel/locking/lockdep.c | 58 +++++++++++++++++++++++++----------------------- 4 files changed, 50 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 5811ee8a5cd8..bd5c55755447 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -33,6 +33,19 @@ #ifdef CONFIG_TRACE_IRQFLAGS +/* Per-task IRQ trace events information. */ +struct irqtrace_events { + unsigned int irq_events; + unsigned long hardirq_enable_ip; + unsigned long hardirq_disable_ip; + unsigned int hardirq_enable_event; + unsigned int hardirq_disable_event; + unsigned long softirq_disable_ip; + unsigned long softirq_enable_ip; + unsigned int softirq_disable_event; + unsigned int softirq_enable_event; +}; + DECLARE_PER_CPU(int, hardirqs_enabled); DECLARE_PER_CPU(int, hardirq_context); diff --git a/include/linux/sched.h b/include/linux/sched.h index 8d1de021b315..52e0fdd6a555 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -980,17 +981,9 @@ struct task_struct { #endif #ifdef CONFIG_TRACE_IRQFLAGS - unsigned int irq_events; + struct irqtrace_events irqtrace; unsigned int hardirq_threaded; - unsigned long hardirq_enable_ip; - unsigned long hardirq_disable_ip; - unsigned int hardirq_enable_event; - unsigned int hardirq_disable_event; u64 hardirq_chain_key; - unsigned long softirq_disable_ip; - unsigned long softirq_enable_ip; - unsigned int softirq_disable_event; - unsigned int softirq_enable_event; int softirqs_enabled; int softirq_context; int irq_config; diff --git a/kernel/fork.c b/kernel/fork.c index 70d9d0a4de2a..56a640799680 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2035,17 +2035,11 @@ static __latent_entropy struct task_struct *copy_process( seqcount_init(&p->mems_allowed_seq); #endif #ifdef CONFIG_TRACE_IRQFLAGS - p->irq_events = 0; - p->hardirq_enable_ip = 0; - p->hardirq_enable_event = 0; - p->hardirq_disable_ip = _THIS_IP_; - p->hardirq_disable_event = 0; - p->softirqs_enabled = 1; - p->softirq_enable_ip = _THIS_IP_; - p->softirq_enable_event = 0; - p->softirq_disable_ip = 0; - p->softirq_disable_event = 0; - p->softirq_context = 0; + memset(&p->irqtrace, 0, sizeof(p->irqtrace)); + p->irqtrace.hardirq_disable_ip = _THIS_IP_; + p->irqtrace.softirq_enable_ip = _THIS_IP_; + p->softirqs_enabled = 1; + p->softirq_context = 0; #endif p->pagefault_disabled = 0; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c9ea05edce25..7b5800374c40 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3484,19 +3484,21 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, void print_irqtrace_events(struct task_struct *curr) { - printk("irq event stamp: %u\n", curr->irq_events); + const struct irqtrace_events *trace = &curr->irqtrace; + + printk("irq event stamp: %u\n", trace->irq_events); printk("hardirqs last enabled at (%u): [<%px>] %pS\n", - curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip, - (void *)curr->hardirq_enable_ip); + trace->hardirq_enable_event, (void *)trace->hardirq_enable_ip, + (void *)trace->hardirq_enable_ip); printk("hardirqs last disabled at (%u): [<%px>] %pS\n", - curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip, - (void *)curr->hardirq_disable_ip); + trace->hardirq_disable_event, (void *)trace->hardirq_disable_ip, + (void *)trace->hardirq_disable_ip); printk("softirqs last enabled at (%u): [<%px>] %pS\n", - curr->softirq_enable_event, (void *)curr->softirq_enable_ip, - (void *)curr->softirq_enable_ip); + trace->softirq_enable_event, (void *)trace->softirq_enable_ip, + (void *)trace->softirq_enable_ip); printk("softirqs last disabled at (%u): [<%px>] %pS\n", - curr->softirq_disable_event, (void *)curr->softirq_disable_ip, - (void *)curr->softirq_disable_ip); + trace->softirq_disable_event, (void *)trace->softirq_disable_ip, + (void *)trace->softirq_disable_ip); } static int HARDIRQ_verbose(struct lock_class *class) @@ -3699,7 +3701,7 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_on_prepare); void noinstr lockdep_hardirqs_on(unsigned long ip) { - struct task_struct *curr = current; + struct irqtrace_events *trace = ¤t->irqtrace; if (unlikely(!debug_locks)) return; @@ -3752,8 +3754,8 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) skip_checks: /* we'll do an OFF -> ON transition: */ this_cpu_write(hardirqs_enabled, 1); - curr->hardirq_enable_ip = ip; - curr->hardirq_enable_event = ++curr->irq_events; + trace->hardirq_enable_ip = ip; + trace->hardirq_enable_event = ++trace->irq_events; debug_atomic_inc(hardirqs_on_events); } EXPORT_SYMBOL_GPL(lockdep_hardirqs_on); @@ -3763,8 +3765,6 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_on); */ void noinstr lockdep_hardirqs_off(unsigned long ip) { - struct task_struct *curr = current; - if (unlikely(!debug_locks)) return; @@ -3784,12 +3784,14 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) return; if (lockdep_hardirqs_enabled()) { + struct irqtrace_events *trace = ¤t->irqtrace; + /* * We have done an ON -> OFF transition: */ this_cpu_write(hardirqs_enabled, 0); - curr->hardirq_disable_ip = ip; - curr->hardirq_disable_event = ++curr->irq_events; + trace->hardirq_disable_ip = ip; + trace->hardirq_disable_event = ++trace->irq_events; debug_atomic_inc(hardirqs_off_events); } else { debug_atomic_inc(redundant_hardirqs_off); @@ -3802,7 +3804,7 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_off); */ void lockdep_softirqs_on(unsigned long ip) { - struct task_struct *curr = current; + struct irqtrace_events *trace = ¤t->irqtrace; if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -3814,7 +3816,7 @@ void lockdep_softirqs_on(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->softirqs_enabled) { + if (current->softirqs_enabled) { debug_atomic_inc(redundant_softirqs_on); return; } @@ -3823,9 +3825,9 @@ void lockdep_softirqs_on(unsigned long ip) /* * We'll do an OFF -> ON transition: */ - curr->softirqs_enabled = 1; - curr->softirq_enable_ip = ip; - curr->softirq_enable_event = ++curr->irq_events; + current->softirqs_enabled = 1; + trace->softirq_enable_ip = ip; + trace->softirq_enable_event = ++trace->irq_events; debug_atomic_inc(softirqs_on_events); /* * We are going to turn softirqs on, so set the @@ -3833,7 +3835,7 @@ void lockdep_softirqs_on(unsigned long ip) * enabled too: */ if (lockdep_hardirqs_enabled()) - mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); + mark_held_locks(current, LOCK_ENABLED_SOFTIRQ); lockdep_recursion_finish(); } @@ -3842,8 +3844,6 @@ void lockdep_softirqs_on(unsigned long ip) */ void lockdep_softirqs_off(unsigned long ip) { - struct task_struct *curr = current; - if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -3853,13 +3853,15 @@ void lockdep_softirqs_off(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->softirqs_enabled) { + if (current->softirqs_enabled) { + struct irqtrace_events *trace = ¤t->irqtrace; + /* * We have done an ON -> OFF transition: */ - curr->softirqs_enabled = 0; - curr->softirq_disable_ip = ip; - curr->softirq_disable_event = ++curr->irq_events; + current->softirqs_enabled = 0; + trace->softirq_disable_ip = ip; + trace->softirq_disable_event = ++trace->irq_events; debug_atomic_inc(softirqs_off_events); /* * Whoops, we wanted softirqs off, so why aren't they? -- cgit v1.2.3 From 92c209ac6d3d35783c16c8a717547183e6e11162 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 29 Jul 2020 13:09:16 +0200 Subject: kcsan: Improve IRQ state trace reporting To improve the general usefulness of the IRQ state trace events with KCSAN enabled, save and restore the trace information when entering and exiting the KCSAN runtime as well as when generating a KCSAN report. Without this, reporting the IRQ trace events (whether via a KCSAN report or outside of KCSAN via a lockdep report) is rather useless due to continuously being touched by KCSAN. This is because if KCSAN is enabled, every instrumented memory access causes changes to IRQ trace events (either by KCSAN disabling/enabling interrupts or taking report_lock when generating a report). Before "lockdep: Prepare for NMI IRQ state tracking", KCSAN avoided touching the IRQ trace events via raw_local_irq_save/restore() and lockdep_off/on(). Fixes: 248591f5d257 ("kcsan: Make KCSAN compatible with new IRQ state tracking") Signed-off-by: Marco Elver Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200729110916.3920464-2-elver@google.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ kernel/kcsan/core.c | 23 +++++++++++++++++++++++ kernel/kcsan/kcsan.h | 7 +++++++ kernel/kcsan/report.c | 3 +++ 4 files changed, 37 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 52e0fdd6a555..060e9214c8b5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1184,8 +1184,12 @@ struct task_struct { #ifdef CONFIG_KASAN unsigned int kasan_depth; #endif + #ifdef CONFIG_KCSAN struct kcsan_ctx kcsan_ctx; +#ifdef CONFIG_TRACE_IRQFLAGS + struct irqtrace_events kcsan_save_irqtrace; +#endif #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 732623c30359..0fe068192781 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -291,6 +291,20 @@ static inline unsigned int get_delay(void) 0); } +void kcsan_save_irqtrace(struct task_struct *task) +{ +#ifdef CONFIG_TRACE_IRQFLAGS + task->kcsan_save_irqtrace = task->irqtrace; +#endif +} + +void kcsan_restore_irqtrace(struct task_struct *task) +{ +#ifdef CONFIG_TRACE_IRQFLAGS + task->irqtrace = task->kcsan_save_irqtrace; +#endif +} + /* * Pull everything together: check_access() below contains the performance * critical operations; the fast-path (including check_access) functions should @@ -336,9 +350,11 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, flags = user_access_save(); if (consumed) { + kcsan_save_irqtrace(current); kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_MAYBE, KCSAN_REPORT_CONSUMED_WATCHPOINT, watchpoint - watchpoints); + kcsan_restore_irqtrace(current); } else { /* * The other thread may not print any diagnostics, as it has @@ -396,6 +412,12 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) goto out; } + /* + * Save and restore the IRQ state trace touched by KCSAN, since KCSAN's + * runtime is entered for every memory access, and potentially useful + * information is lost if dirtied by KCSAN. + */ + kcsan_save_irqtrace(current); if (!kcsan_interrupt_watcher) local_irq_save(irq_flags); @@ -539,6 +561,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) out_unlock: if (!kcsan_interrupt_watcher) local_irq_restore(irq_flags); + kcsan_restore_irqtrace(current); out: user_access_restore(ua_flags); } diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 763d6d08d94b..29480010dc30 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -9,6 +9,7 @@ #define _KERNEL_KCSAN_KCSAN_H #include +#include /* The number of adjacent watchpoints to check. */ #define KCSAN_CHECK_ADJACENT 1 @@ -22,6 +23,12 @@ extern unsigned int kcsan_udelay_interrupt; */ extern bool kcsan_enabled; +/* + * Save/restore IRQ flags state trace dirtied by KCSAN. + */ +void kcsan_save_irqtrace(struct task_struct *task); +void kcsan_restore_irqtrace(struct task_struct *task); + /* * Initialize debugfs file. */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 6b2fb1a6d8cd..9d07e175de0f 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -308,6 +308,9 @@ static void print_verbose_info(struct task_struct *task) if (!task) return; + /* Restore IRQ state trace for printing. */ + kcsan_restore_irqtrace(task); + pr_err("\n"); debug_show_held_locks(task); print_irqtrace_events(task); -- cgit v1.2.3 From f4470cdf108f00533e8079b19434e6cb48c17fa3 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 31 Jul 2020 20:20:14 +0100 Subject: sched: Document arch_scale_*_capacity() Rather that hide their purpose in some dark, damp corner of Documentation/, add some documentation to the default implementations. Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200731192016.7484-2-valentin.schneider@arm.com --- include/linux/sched/topology.h | 10 ++++++++++ kernel/sched/sched.h | 10 ++++++++++ 2 files changed, 20 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 764222d637b7..820511289857 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -217,6 +217,16 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu) #endif /* !CONFIG_SMP */ #ifndef arch_scale_cpu_capacity +/** + * arch_scale_cpu_capacity - get the capacity scale factor of a given CPU. + * @cpu: the CPU in question. + * + * Return: the CPU scale factor normalized against SCHED_CAPACITY_SCALE, i.e. + * + * max_perf(cpu) + * ----------------------------- * SCHED_CAPACITY_SCALE + * max(max_perf(c) : c \in CPUs) + */ static __always_inline unsigned long arch_scale_cpu_capacity(int cpu) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 296efd30d8c9..3fd283892761 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2049,6 +2049,16 @@ void arch_scale_freq_tick(void) #endif #ifndef arch_scale_freq_capacity +/** + * arch_scale_freq_capacity - get the frequency scale factor of a given CPU. + * @cpu: the CPU in question. + * + * Return: the frequency scale factor normalized against SCHED_CAPACITY_SCALE, i.e. + * + * f_curr + * ------ * SCHED_CAPACITY_SCALE + * f_max + */ static __always_inline unsigned long arch_scale_freq_capacity(int cpu) { -- cgit v1.2.3 From 7ef5264de773279b9f23b6cc8afb5addb30e970b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 30 Jul 2020 08:10:20 +0200 Subject: modules: mark ref_module static ref_module isn't used anywhere outside of module.c. Signed-off-by: Christoph Hellwig Signed-off-by: Jessica Yu --- include/linux/module.h | 1 - kernel/module.c | 6 ++---- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 2e6670860d27..f1fdbeef2153 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -657,7 +657,6 @@ static inline void __module_get(struct module *module) #define symbol_put_addr(p) do { } while (0) #endif /* CONFIG_MODULE_UNLOAD */ -int ref_module(struct module *a, struct module *b); /* This is a #define so the string doesn't get put in every .o file */ #define module_name(mod) \ diff --git a/kernel/module.c b/kernel/module.c index e8a198588f26..baae0e83d630 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -869,7 +869,7 @@ static int add_module_usage(struct module *a, struct module *b) } /* Module a uses b: caller needs module_mutex() */ -int ref_module(struct module *a, struct module *b) +static int ref_module(struct module *a, struct module *b) { int err; @@ -888,7 +888,6 @@ int ref_module(struct module *a, struct module *b) } return 0; } -EXPORT_SYMBOL_GPL(ref_module); /* Clear the unload stuff of the module. */ static void module_unload_free(struct module *mod) @@ -1169,11 +1168,10 @@ static inline void module_unload_free(struct module *mod) { } -int ref_module(struct module *a, struct module *b) +static int ref_module(struct module *a, struct module *b) { return strong_try_module_get(b); } -EXPORT_SYMBOL_GPL(ref_module); static inline int module_unload_init(struct module *mod) { -- cgit v1.2.3 From 773110470e2fa3839523384ae014f8a723c4d178 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 30 Jul 2020 08:10:21 +0200 Subject: modules: mark find_symbol static find_symbol is only used in module.c. Signed-off-by: Christoph Hellwig Signed-off-by: Jessica Yu --- include/linux/module.h | 11 ----------- kernel/module.c | 3 +-- 2 files changed, 1 insertion(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index f1fdbeef2153..90bdc362be36 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -590,17 +590,6 @@ struct symsearch { bool unused; }; -/* - * Search for an exported symbol by name. - * - * Must be called with module_mutex held or preemption disabled. - */ -const struct kernel_symbol *find_symbol(const char *name, - struct module **owner, - const s32 **crc, - bool gplok, - bool warn); - /* * Walk the exported symbol table * diff --git a/kernel/module.c b/kernel/module.c index baae0e83d630..0f95fb4b3e37 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -585,7 +585,7 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms, /* Find an exported symbol and return it, along with, (optional) crc and * (optional) module which owns it. Needs preempt disabled or module_mutex. */ -const struct kernel_symbol *find_symbol(const char *name, +static const struct kernel_symbol *find_symbol(const char *name, struct module **owner, const s32 **crc, bool gplok, @@ -608,7 +608,6 @@ const struct kernel_symbol *find_symbol(const char *name, pr_debug("Failed to find symbol %s\n", name); return NULL; } -EXPORT_SYMBOL_GPL(find_symbol); /* * Search for module by name: must hold module_mutex (or preempt disabled -- cgit v1.2.3 From a54e04914c211b5678602a46b3ede5d82ec1327d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 30 Jul 2020 08:10:22 +0200 Subject: modules: mark each_symbol_section static each_symbol_section is only used inside of module.c. Signed-off-by: Christoph Hellwig Signed-off-by: Jessica Yu --- include/linux/module.h | 9 --------- kernel/module.c | 3 +-- 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 90bdc362be36..b79219eed83c 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -590,15 +590,6 @@ struct symsearch { bool unused; }; -/* - * Walk the exported symbol table - * - * Must be called with module_mutex held or preemption disabled. - */ -bool each_symbol_section(bool (*fn)(const struct symsearch *arr, - struct module *owner, - void *data), void *data); - /* Returns 0 and fills in value, defined and namebuf, or -ERANGE if symnum out of range. */ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, diff --git a/kernel/module.c b/kernel/module.c index 0f95fb4b3e37..c2a099a27b68 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -422,7 +422,7 @@ static bool each_symbol_in_section(const struct symsearch *arr, } /* Returns true as soon as fn returns true, otherwise false. */ -bool each_symbol_section(bool (*fn)(const struct symsearch *arr, +static bool each_symbol_section(bool (*fn)(const struct symsearch *arr, struct module *owner, void *data), void *data) @@ -484,7 +484,6 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr, } return false; } -EXPORT_SYMBOL_GPL(each_symbol_section); struct find_symbol_arg { /* Input */ -- cgit v1.2.3 From 3fe1e56d0e68b623dd62d8d38265d2a052e7e185 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 30 Jul 2020 08:10:23 +0200 Subject: modules: unexport __module_text_address __module_text_address is only used by built-in code. Signed-off-by: Christoph Hellwig Signed-off-by: Jessica Yu --- kernel/module.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index c2a099a27b68..6ee1739e3150 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4503,7 +4503,6 @@ struct module *__module_text_address(unsigned long addr) } return mod; } -EXPORT_SYMBOL_GPL(__module_text_address); /* Don't grab lock, we're oopsing. */ void print_modules(void) -- cgit v1.2.3 From 34e64705ad415ed7a816e60ef62b42fe6d1729d9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 30 Jul 2020 08:10:24 +0200 Subject: modules: unexport __module_address __module_address is only used by built-in code. Signed-off-by: Christoph Hellwig Signed-off-by: Jessica Yu --- kernel/module.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 6ee1739e3150..e85d06158fbc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4464,7 +4464,6 @@ struct module *__module_address(unsigned long addr) } return mod; } -EXPORT_SYMBOL_GPL(__module_address); /* * is_module_text_address - is this address inside module code? -- cgit v1.2.3 From cd8732cdcc37d7077c4fa2c966b748c0662b607e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 30 Jul 2020 08:10:25 +0200 Subject: modules: rename the licence field in struct symsearch to license Use the same spelling variant as the rest of the file. Signed-off-by: Christoph Hellwig Signed-off-by: Jessica Yu --- include/linux/module.h | 2 +- kernel/module.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index b79219eed83c..be04ba2f881d 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -586,7 +586,7 @@ struct symsearch { NOT_GPL_ONLY, GPL_ONLY, WILL_BE_GPL_ONLY, - } licence; + } license; bool unused; }; diff --git a/kernel/module.c b/kernel/module.c index e85d06158fbc..62d817a0dca8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -504,9 +504,9 @@ static bool check_exported_symbol(const struct symsearch *syms, struct find_symbol_arg *fsa = data; if (!fsa->gplok) { - if (syms->licence == GPL_ONLY) + if (syms->license == GPL_ONLY) return false; - if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) { + if (syms->license == WILL_BE_GPL_ONLY && fsa->warn) { pr_warn("Symbol %s is being used by a non-GPL module, " "which will not be allowed in the future\n", fsa->name); -- cgit v1.2.3 From ef1dac6021cc8ec5de02ce31722bf26ac4ed5523 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 30 Jul 2020 08:10:26 +0200 Subject: modules: return licensing information from find_symbol Report the GPLONLY status through a new argument. Signed-off-by: Christoph Hellwig Signed-off-by: Jessica Yu --- include/linux/module.h | 2 +- kernel/module.c | 16 +++++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index be04ba2f881d..30b0f5fcdb3c 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -582,7 +582,7 @@ struct module *find_module(const char *name); struct symsearch { const struct kernel_symbol *start, *stop; const s32 *crcs; - enum { + enum mod_license { NOT_GPL_ONLY, GPL_ONLY, WILL_BE_GPL_ONLY, diff --git a/kernel/module.c b/kernel/module.c index 62d817a0dca8..656f5ff27088 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -495,6 +495,7 @@ struct find_symbol_arg { struct module *owner; const s32 *crc; const struct kernel_symbol *sym; + enum mod_license license; }; static bool check_exported_symbol(const struct symsearch *syms, @@ -528,6 +529,7 @@ static bool check_exported_symbol(const struct symsearch *syms, fsa->owner = owner; fsa->crc = symversion(syms->crcs, symnum); fsa->sym = &syms->start[symnum]; + fsa->license = syms->license; return true; } @@ -587,6 +589,7 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms, static const struct kernel_symbol *find_symbol(const char *name, struct module **owner, const s32 **crc, + enum mod_license *license, bool gplok, bool warn) { @@ -601,6 +604,8 @@ static const struct kernel_symbol *find_symbol(const char *name, *owner = fsa.owner; if (crc) *crc = fsa.crc; + if (license) + *license = fsa.license; return fsa.sym; } @@ -1074,7 +1079,7 @@ void __symbol_put(const char *symbol) struct module *owner; preempt_disable(); - if (!find_symbol(symbol, &owner, NULL, true, false)) + if (!find_symbol(symbol, &owner, NULL, NULL, true, false)) BUG(); module_put(owner); preempt_enable(); @@ -1352,7 +1357,7 @@ static inline int check_modstruct_version(const struct load_info *info, * locking is necessary -- use preempt_disable() to placate lockdep. */ preempt_disable(); - if (!find_symbol("module_layout", NULL, &crc, true, false)) { + if (!find_symbol("module_layout", NULL, &crc, NULL, true, false)) { preempt_enable(); BUG(); } @@ -1436,6 +1441,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, struct module *owner; const struct kernel_symbol *sym; const s32 *crc; + enum mod_license license; int err; /* @@ -1445,7 +1451,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, */ sched_annotate_sleep(); mutex_lock(&module_mutex); - sym = find_symbol(name, &owner, &crc, + sym = find_symbol(name, &owner, &crc, &license, !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); if (!sym) goto unlock; @@ -2213,7 +2219,7 @@ void *__symbol_get(const char *symbol) const struct kernel_symbol *sym; preempt_disable(); - sym = find_symbol(symbol, &owner, NULL, true, true); + sym = find_symbol(symbol, &owner, NULL, NULL, true, true); if (sym && strong_try_module_get(owner)) sym = NULL; preempt_enable(); @@ -2249,7 +2255,7 @@ static int verify_exported_symbols(struct module *mod) for (i = 0; i < ARRAY_SIZE(arr); i++) { for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { if (find_symbol(kernel_symbol_name(s), &owner, NULL, - true, false)) { + NULL, true, false)) { pr_err("%s: exports duplicate symbol %s" " (owned by %s)\n", mod->name, kernel_symbol_name(s), -- cgit v1.2.3 From 73b11c2ab072d5b0599d1e12cc126f55ee306daf Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 31 Jul 2020 11:28:26 -0700 Subject: bpf: Add support for forced LINK_DETACH command Add LINK_DETACH command to force-detach bpf_link without destroying it. It has the same behavior as auto-detaching of bpf_link due to cgroup dying for bpf_cgroup_link or net_device being destroyed for bpf_xdp_link. In such case, bpf_link is still a valid kernel object, but is defuncts and doesn't hold BPF program attached to corresponding BPF hook. This functionality allows users with enough access rights to manually force-detach attached bpf_link without killing respective owner process. This patch implements LINK_DETACH for cgroup, xdp, and netns links, mostly re-using existing link release handling code. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200731182830.286260-2-andriin@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 5 +++++ kernel/bpf/cgroup.c | 15 ++++++++++++++- kernel/bpf/net_namespace.c | 8 ++++++++ kernel/bpf/syscall.c | 26 ++++++++++++++++++++++++++ net/core/dev.c | 11 ++++++++++- 6 files changed, 64 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 40c5e206ecf2..cef4ef0d2b4e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -793,6 +793,7 @@ struct bpf_link { struct bpf_link_ops { void (*release)(struct bpf_link *link); void (*dealloc)(struct bpf_link *link); + int (*detach)(struct bpf_link *link); int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog); void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index eb5e0c38eb2c..b134e679e9db 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -117,6 +117,7 @@ enum bpf_cmd { BPF_LINK_GET_NEXT_ID, BPF_ENABLE_STATS, BPF_ITER_CREATE, + BPF_LINK_DETACH, }; enum bpf_map_type { @@ -634,6 +635,10 @@ union bpf_attr { __u32 old_prog_fd; } link_update; + struct { + __u32 link_fd; + } link_detach; + struct { /* struct used by BPF_ENABLE_STATS command */ __u32 type; } enable_stats; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 957cce1d5168..83ff127ef7ae 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -814,6 +814,7 @@ static void bpf_cgroup_link_release(struct bpf_link *link) { struct bpf_cgroup_link *cg_link = container_of(link, struct bpf_cgroup_link, link); + struct cgroup *cg; /* link might have been auto-detached by dying cgroup already, * in that case our work is done here @@ -832,8 +833,12 @@ static void bpf_cgroup_link_release(struct bpf_link *link) WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, cg_link->type)); + cg = cg_link->cgroup; + cg_link->cgroup = NULL; + mutex_unlock(&cgroup_mutex); - cgroup_put(cg_link->cgroup); + + cgroup_put(cg); } static void bpf_cgroup_link_dealloc(struct bpf_link *link) @@ -844,6 +849,13 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link) kfree(cg_link); } +static int bpf_cgroup_link_detach(struct bpf_link *link) +{ + bpf_cgroup_link_release(link); + + return 0; +} + static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { @@ -883,6 +895,7 @@ static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, static const struct bpf_link_ops bpf_cgroup_link_lops = { .release = bpf_cgroup_link_release, .dealloc = bpf_cgroup_link_dealloc, + .detach = bpf_cgroup_link_detach, .update_prog = cgroup_bpf_replace, .show_fdinfo = bpf_cgroup_link_show_fdinfo, .fill_link_info = bpf_cgroup_link_fill_link_info, diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 71405edd667c..542f275bf252 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -142,9 +142,16 @@ static void bpf_netns_link_release(struct bpf_link *link) bpf_prog_array_free(old_array); out_unlock: + net_link->net = NULL; mutex_unlock(&netns_bpf_mutex); } +static int bpf_netns_link_detach(struct bpf_link *link) +{ + bpf_netns_link_release(link); + return 0; +} + static void bpf_netns_link_dealloc(struct bpf_link *link) { struct bpf_netns_link *net_link = @@ -228,6 +235,7 @@ static void bpf_netns_link_show_fdinfo(const struct bpf_link *link, static const struct bpf_link_ops bpf_netns_link_ops = { .release = bpf_netns_link_release, .dealloc = bpf_netns_link_dealloc, + .detach = bpf_netns_link_detach, .update_prog = bpf_netns_link_update_prog, .fill_link_info = bpf_netns_link_fill_info, .show_fdinfo = bpf_netns_link_show_fdinfo, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cd3d599e9e90..2f343ce15747 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3991,6 +3991,29 @@ out_put_link: return ret; } +#define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd + +static int link_detach(union bpf_attr *attr) +{ + struct bpf_link *link; + int ret; + + if (CHECK_ATTR(BPF_LINK_DETACH)) + return -EINVAL; + + link = bpf_link_get_from_fd(attr->link_detach.link_fd); + if (IS_ERR(link)) + return PTR_ERR(link); + + if (link->ops->detach) + ret = link->ops->detach(link); + else + ret = -EOPNOTSUPP; + + bpf_link_put(link); + return ret; +} + static int bpf_link_inc_not_zero(struct bpf_link *link) { return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT; @@ -4240,6 +4263,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_ITER_CREATE: err = bpf_iter_create(&attr); break; + case BPF_LINK_DETACH: + err = link_detach(&attr); + break; default: err = -EINVAL; break; diff --git a/net/core/dev.c b/net/core/dev.c index a2a57988880a..c8b911b10187 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8979,12 +8979,20 @@ static void bpf_xdp_link_release(struct bpf_link *link) /* if racing with net_device's tear down, xdp_link->dev might be * already NULL, in which case link was already auto-detached */ - if (xdp_link->dev) + if (xdp_link->dev) { WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); + xdp_link->dev = NULL; + } rtnl_unlock(); } +static int bpf_xdp_link_detach(struct bpf_link *link) +{ + bpf_xdp_link_release(link); + return 0; +} + static void bpf_xdp_link_dealloc(struct bpf_link *link) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); @@ -9066,6 +9074,7 @@ out_unlock: static const struct bpf_link_ops bpf_xdp_link_lops = { .release = bpf_xdp_link_release, .dealloc = bpf_xdp_link_dealloc, + .detach = bpf_xdp_link_detach, .show_fdinfo = bpf_xdp_link_show_fdinfo, .fill_link_info = bpf_xdp_link_fill_link_info, .update_prog = bpf_xdp_link_update, -- cgit v1.2.3 From c6fe44d96fc1536af5b11cd859686453d1b7bfd1 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 23 Jul 2020 12:33:41 -0700 Subject: list: add "list_del_init_careful()" to go with "list_empty_careful()" That gives us ordering guarantees around the pair. Signed-off-by: Linus Torvalds --- include/linux/list.h | 20 +++++++++++++++++++- kernel/sched/wait.c | 2 +- mm/filemap.c | 7 +------ 3 files changed, 21 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/list.h b/include/linux/list.h index aff44d34f4e4..0d0d17a10d25 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -282,6 +282,24 @@ static inline int list_empty(const struct list_head *head) return READ_ONCE(head->next) == head; } +/** + * list_del_init_careful - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. + * + * This is the same as list_del_init(), except designed to be used + * together with list_empty_careful() in a way to guarantee ordering + * of other memory operations. + * + * Any memory operations done before a list_del_init_careful() are + * guaranteed to be visible after a list_empty_careful() test. + */ +static inline void list_del_init_careful(struct list_head *entry) +{ + __list_del_entry(entry); + entry->prev = entry; + smp_store_release(&entry->next, entry); +} + /** * list_empty_careful - tests whether a list is empty and not being modified * @head: the list to test @@ -297,7 +315,7 @@ static inline int list_empty(const struct list_head *head) */ static inline int list_empty_careful(const struct list_head *head) { - struct list_head *next = head->next; + struct list_head *next = smp_load_acquire(&head->next); return (next == head) && (next == head->prev); } diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index ba059fbfc53a..01f5d3020589 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -389,7 +389,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i int ret = default_wake_function(wq_entry, mode, sync, key); if (ret) - list_del_init(&wq_entry->entry); + list_del_init_careful(&wq_entry->entry); return ret; } diff --git a/mm/filemap.c b/mm/filemap.c index 8c3d3e233d37..991503bbf922 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1041,13 +1041,8 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, * since after list_del_init(&wait->entry) the wait entry * might be de-allocated and the process might even have * exited. - * - * We _really_ should have a "list_del_init_careful()" to - * properly pair with the unlocked "list_empty_careful()" - * in finish_wait(). */ - smp_mb(); - list_del_init(&wait->entry); + list_del_init_careful(&wait->entry); return ret; } -- cgit v1.2.3 From 0f69dae4d1069963fdcc16e63655927d83281006 Mon Sep 17 00:00:00 2001 From: Zhaoyang Huang Date: Fri, 31 Jul 2020 08:27:45 +0800 Subject: trace : Have tracing buffer info use kvzalloc instead of kzalloc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit High order memory stuff within trace could introduce OOM, use kvzalloc instead. Please find the bellowing for the call stack we run across in an android system. The scenario happens when traced_probes is woken up to get a large quantity of trace even if free memory is even higher than watermark_low.  traced_probes invoked oom-killer: gfp_mask=0x140c0c0(GFP_KERNEL|__GFP_COMP|__GFP_ZERO), nodemask=(null), order=2, oom_score_adj=-1 traced_probes cpuset=system-background mems_allowed=0 CPU: 3 PID: 588 Comm: traced_probes Tainted: G W O 4.14.181 #1 Hardware name: Generic DT based system (unwind_backtrace) from [] (show_stack+0x20/0x24) (show_stack) from [] (dump_stack+0xa8/0xec) (dump_stack) from [] (dump_header+0x9c/0x220) (dump_header) from [] (oom_kill_process+0xc0/0x5c4) (oom_kill_process) from [] (out_of_memory+0x220/0x310) (out_of_memory) from [] (__alloc_pages_nodemask+0xff8/0x13a4) (__alloc_pages_nodemask) from [] (kmalloc_order+0x30/0x48) (kmalloc_order) from [] (kmalloc_order_trace+0x30/0x118) (kmalloc_order_trace) from [] (tracing_buffers_open+0x50/0xfc) (tracing_buffers_open) from [] (do_dentry_open+0x278/0x34c) (do_dentry_open) from [] (vfs_open+0x50/0x70) (vfs_open) from [] (path_openat+0x5fc/0x169c) (path_openat) from [] (do_filp_open+0x94/0xf8) (do_filp_open) from [] (do_sys_open+0x168/0x26c) (do_sys_open) from [] (SyS_openat+0x34/0x38) (SyS_openat) from [] (ret_fast_syscall+0x0/0x28) Link: https://lkml.kernel.org/r/1596155265-32365-1-git-send-email-zhaoyang.huang@unisoc.com Signed-off-by: Zhaoyang Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dbcacdd56b02..06c0feae5ff9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7402,7 +7402,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) if (ret) return ret; - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = kvzalloc(sizeof(*info), GFP_KERNEL); if (!info) { trace_array_put(tr); return -ENOMEM; @@ -7528,7 +7528,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) if (info->spare) ring_buffer_free_read_page(iter->array_buffer->buffer, info->spare_cpu, info->spare); - kfree(info); + kvfree(info); mutex_unlock(&trace_types_lock); -- cgit v1.2.3 From c58b6b0372de0d4cd0536d6585addd1b36b151ae Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 24 Jul 2020 20:50:48 -0400 Subject: ftrace: Fix ftrace_trace_task return value I was attempting to use pid filtering with function_graph, but it wasn't allowing anything to make it through. Turns out ftrace_trace_task returns false if ftrace_ignore_pid is not-empty, which isn't correct anymore. We're now setting it to FTRACE_PID_IGNORE if we need to ignore that pid, otherwise it's set to the pid (which is weird considering the name) or to FTRACE_PID_TRACE. Fix the check to check for != FTRACE_PID_IGNORE. With this we can now use function_graph with pid filtering. Link: https://lkml.kernel.org/r/20200725005048.1790-1-josef@toxicpanda.com Fixes: 717e3f5ebc82 ("ftrace: Make function trace pid filtering a bit more exact") Signed-off-by: Josef Bacik Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 3 --- kernel/trace/trace.h | 7 ++++++- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f433cb44300a..4e3a5d79c078 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -139,9 +139,6 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops) #endif } -#define FTRACE_PID_IGNORE -1 -#define FTRACE_PID_TRACE -2 - static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f21607f87189..610d21355526 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1103,6 +1103,10 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER + +#define FTRACE_PID_IGNORE -1 +#define FTRACE_PID_TRACE -2 + struct ftrace_func_command { struct list_head list; char *name; @@ -1114,7 +1118,8 @@ struct ftrace_func_command { extern bool ftrace_filter_param __initdata; static inline int ftrace_trace_task(struct trace_array *tr) { - return !this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid); + return this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid) != + FTRACE_PID_IGNORE; } extern int ftrace_is_dead(void); int ftrace_create_function_files(struct trace_array *tr, -- cgit v1.2.3 From 0cb2f1372baa60af8456388a574af6133edd7d80 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 28 Jul 2020 14:45:36 +0800 Subject: kprobes: Fix NULL pointer dereference at kprobe_ftrace_handler We found a case of kernel panic on our server. The stack trace is as follows(omit some irrelevant information): BUG: kernel NULL pointer dereference, address: 0000000000000080 RIP: 0010:kprobe_ftrace_handler+0x5e/0xe0 RSP: 0018:ffffb512c6550998 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff8e9d16eea018 RCX: 0000000000000000 RDX: ffffffffbe1179c0 RSI: ffffffffc0535564 RDI: ffffffffc0534ec0 RBP: ffffffffc0534ec1 R08: ffff8e9d1bbb0f00 R09: 0000000000000004 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: ffff8e9d1f797060 R14: 000000000000bacc R15: ffff8e9ce13eca00 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000080 CR3: 00000008453d0005 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ftrace_ops_assist_func+0x56/0xe0 ftrace_call+0x5/0x34 tcpa_statistic_send+0x5/0x130 [ttcp_engine] The tcpa_statistic_send is the function being kprobed. After analysis, the root cause is that the fourth parameter regs of kprobe_ftrace_handler is NULL. Why regs is NULL? We use the crash tool to analyze the kdump. crash> dis tcpa_statistic_send -r : callq 0xffffffffbd8018c0 The tcpa_statistic_send calls ftrace_caller instead of ftrace_regs_caller. So it is reasonable that the fourth parameter regs of kprobe_ftrace_handler is NULL. In theory, we should call the ftrace_regs_caller instead of the ftrace_caller. After in-depth analysis, we found a reproducible path. Writing a simple kernel module which starts a periodic timer. The timer's handler is named 'kprobe_test_timer_handler'. The module name is kprobe_test.ko. 1) insmod kprobe_test.ko 2) bpftrace -e 'kretprobe:kprobe_test_timer_handler {}' 3) echo 0 > /proc/sys/kernel/ftrace_enabled 4) rmmod kprobe_test 5) stop step 2) kprobe 6) insmod kprobe_test.ko 7) bpftrace -e 'kretprobe:kprobe_test_timer_handler {}' We mark the kprobe as GONE but not disarm the kprobe in the step 4). The step 5) also do not disarm the kprobe when unregister kprobe. So we do not remove the ip from the filter. In this case, when the module loads again in the step 6), we will replace the code to ftrace_caller via the ftrace_module_enable(). When we register kprobe again, we will not replace ftrace_caller to ftrace_regs_caller because the ftrace is disabled in the step 3). So the step 7) will trigger kernel panic. Fix this problem by disarming the kprobe when the module is going away. Link: https://lkml.kernel.org/r/20200728064536.24405-1-songmuchun@bytedance.com Cc: stable@vger.kernel.org Fixes: ae6aa16fdc16 ("kprobes: introduce ftrace based optimization") Acked-by: Masami Hiramatsu Signed-off-by: Muchun Song Co-developed-by: Chengming Zhou Signed-off-by: Chengming Zhou Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 4a904cc56d68..07bf03fcf574 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2113,6 +2113,13 @@ static void kill_kprobe(struct kprobe *p) * the original probed function (which will be freed soon) any more. */ arch_remove_kprobe(p); + + /* + * The module is going away. We should disarm the kprobe which + * is using ftrace. + */ + if (kprobe_ftrace(p)) + disarm_kprobe_ftrace(p); } /* Disable one kprobe */ -- cgit v1.2.3 From 231621d0c570765d5cebd95c582f1c8df5c46028 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Fri, 24 Jul 2020 11:24:24 +0800 Subject: tracing/uprobe: Remove dead code in trace_uprobe_register() In the function trace_uprobe_register(), the statement "return 0;" out of switch case is dead code, remove it. Link: https://lkml.kernel.org/r/1595561064-29186-1-git-send-email-fanpeng@loongson.cn Signed-off-by: Peng Fan Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index fdd47f99b18f..f4286c9bdeb4 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1456,7 +1456,6 @@ trace_uprobe_register(struct trace_event_call *event, enum trace_reg type, default: return 0; } - return 0; } static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) -- cgit v1.2.3 From afcab636657421f7ebfa0783a91f90256bba0091 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 4 Aug 2020 20:00:02 -0400 Subject: tracing: Use trace_sched_process_free() instead of exit() for pid tracing On exit, if a process is preempted after the trace_sched_process_exit() tracepoint but before the process is done exiting, then when it gets scheduled in, the function tracers will not filter it properly against the function tracing pid filters. That is because the function tracing pid filters hooks to the sched_process_exit() tracepoint to remove the exiting task's pid from the filter list. Because the filtering happens at the sched_switch tracepoint, when the exiting task schedules back in to finish up the exit, it will no longer be in the function pid filtering tables. This was noticeable in the notrace self tests on a preemptable kernel, as the tests would fail as it exits and preempted after being taken off the notrace filter table and on scheduling back in it would not be in the notrace list, and then the ending of the exit function would trace. The test detected this and would fail. Cc: stable@vger.kernel.org Cc: Namhyung Kim Fixes: 1e10486ffee0a ("ftrace: Add 'function-fork' trace option") Fixes: c37775d57830a ("tracing: Add infrastructure to allow set_event_pid to follow children" Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 4 ++-- kernel/trace/trace_events.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4e3a5d79c078..76f2dd6fd414 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6985,12 +6985,12 @@ void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) if (enable) { register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork, tr); - register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit, + register_trace_sched_process_free(ftrace_pid_follow_sched_process_exit, tr); } else { unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork, tr); - unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit, + unregister_trace_sched_process_free(ftrace_pid_follow_sched_process_exit, tr); } } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f6f55682d3e2..a85effb2373b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -538,12 +538,12 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable) if (enable) { register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork, tr, INT_MIN); - register_trace_prio_sched_process_exit(event_filter_pid_sched_process_exit, + register_trace_prio_sched_process_free(event_filter_pid_sched_process_exit, tr, INT_MAX); } else { unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork, tr); - unregister_trace_sched_process_exit(event_filter_pid_sched_process_exit, + unregister_trace_sched_process_free(event_filter_pid_sched_process_exit, tr); } } -- cgit v1.2.3 From 262e6ae7081df304fc625cf368d5c2cbba2bb991 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 28 Jul 2020 23:33:33 +0200 Subject: modules: inherit TAINT_PROPRIETARY_MODULE If a TAINT_PROPRIETARY_MODULE exports symbol, inherit the taint flag for all modules importing these symbols, and don't allow loading symbols from TAINT_PROPRIETARY_MODULE modules if the module previously imported gplonly symbols. Add a anti-circumvention devices so people don't accidentally get themselves into trouble this way. Comment from Greg: "Ah, the proven-to-be-illegal "GPL Condom" defense :)" [jeyu: pr_info -> pr_err and pr_warn as per discussion] Link: http://lore.kernel.org/r/20200730162957.GA22469@lst.de Acked-by: Daniel Vetter Reviewed-by: Greg Kroah-Hartman Signed-off-by: Christoph Hellwig Signed-off-by: Jessica Yu --- include/linux/module.h | 1 + kernel/module.c | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 30b0f5fcdb3c..e30ed5fa33a7 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -389,6 +389,7 @@ struct module { unsigned int num_gpl_syms; const struct kernel_symbol *gpl_syms; const s32 *gpl_crcs; + bool using_gplonly_symbols; #ifdef CONFIG_UNUSED_SYMBOLS /* unused exported symbols. */ diff --git a/kernel/module.c b/kernel/module.c index 656f5ff27088..09bf5a652a47 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1431,6 +1431,24 @@ static int verify_namespace_is_imported(const struct load_info *info, return 0; } +static bool inherit_taint(struct module *mod, struct module *owner) +{ + if (!owner || !test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints)) + return true; + + if (mod->using_gplonly_symbols) { + pr_err("%s: module using GPL-only symbols uses symbols from proprietary module %s.\n", + mod->name, owner->name); + return false; + } + + if (!test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints)) { + pr_warn("%s: module uses symbols from proprietary module %s, inheriting taint.\n", + mod->name, owner->name); + set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints); + } + return true; +} /* Resolve a symbol for this module. I.e. if we find one, record usage. */ static const struct kernel_symbol *resolve_symbol(struct module *mod, @@ -1456,6 +1474,14 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, if (!sym) goto unlock; + if (license == GPL_ONLY) + mod->using_gplonly_symbols = true; + + if (!inherit_taint(mod, owner)) { + sym = NULL; + goto getname; + } + if (!check_version(info, name, mod, crc)) { sym = ERR_PTR(-EINVAL); goto getname; -- cgit v1.2.3 From a1bd06853ee478d37fae9435c5521e301de94c67 Mon Sep 17 00:00:00 2001 From: Phil Auld Date: Wed, 5 Aug 2020 16:31:38 -0400 Subject: sched: Fix use of count for nr_running tracepoint The count field is meant to tell if an update to nr_running is an add or a subtract. Make it do so by adding the missing minus sign. Fixes: 9d246053a691 ("sched: Add a tracepoint to track rq->nr_running") Signed-off-by: Phil Auld Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200805203138.1411-1-pauld@redhat.com --- kernel/sched/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3fd283892761..28709f6b0975 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1999,7 +1999,7 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) { rq->nr_running -= count; if (trace_sched_update_nr_running_tp_enabled()) { - call_trace_sched_update_nr_running(rq, count); + call_trace_sched_update_nr_running(rq, -count); } /* Check if we still need preemption */ -- cgit v1.2.3 From 19d0070a2792181f79df01277fe00b83b9f7eda7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 4 Aug 2020 17:01:23 +0200 Subject: timekeeping/vsyscall: Provide vdso_update_begin/end() Architectures can have the requirement to add additional architecture specific data to the VDSO data page which needs to be updated independent of the timekeeper updates. To protect these updates vs. concurrent readers and a conflicting update through timekeeping, provide helper functions to make such updates safe. vdso_update_begin() takes the timekeeper_lock to protect against a potential update from timekeeper code and increments the VDSO sequence count to signal data inconsistency to concurrent readers. vdso_update_end() makes the sequence count even again to signal data consistency and drops the timekeeper lock. [ Sven: Add interrupt disable handling to the functions ] Signed-off-by: Thomas Gleixner Signed-off-by: Sven Schnelle Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200804150124.41692-3-svens@linux.ibm.com --- include/vdso/vsyscall.h | 3 +++ kernel/time/timekeeping.c | 2 +- kernel/time/timekeeping_internal.h | 11 +++++++--- kernel/time/vsyscall.c | 41 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/vdso/vsyscall.h b/include/vdso/vsyscall.h index 2c6134e0c23d..b0fdc9c6bf43 100644 --- a/include/vdso/vsyscall.h +++ b/include/vdso/vsyscall.h @@ -6,6 +6,9 @@ #include +unsigned long vdso_update_begin(void); +void vdso_update_end(unsigned long flags); + #endif /* !__ASSEMBLY__ */ #endif /* __VDSO_VSYSCALL_H */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 63a632f9896c..4c7212f3c603 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -50,7 +50,7 @@ static struct { .seq = SEQCNT_ZERO(tk_core.seq), }; -static DEFINE_RAW_SPINLOCK(timekeeper_lock); +DEFINE_RAW_SPINLOCK(timekeeper_lock); static struct timekeeper shadow_timekeeper; /** diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index bcbb52db2256..4ca2787d1642 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -1,12 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _TIMEKEEPING_INTERNAL_H #define _TIMEKEEPING_INTERNAL_H -/* - * timekeeping debug functions - */ + #include +#include #include +/* + * timekeeping debug functions + */ #ifdef CONFIG_DEBUG_FS extern void tk_debug_account_sleep_time(const struct timespec64 *t); #else @@ -31,4 +33,7 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) } #endif +/* Semi public for serialization of non timekeeper VDSO updates. */ +extern raw_spinlock_t timekeeper_lock; + #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 54ce6eb2ca36..88e6b8ed6ca5 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -13,6 +13,8 @@ #include #include +#include "timekeeping_internal.h" + static inline void update_vdso_data(struct vdso_data *vdata, struct timekeeper *tk) { @@ -127,3 +129,42 @@ void update_vsyscall_tz(void) __arch_sync_vdso_data(vdata); } + +/** + * vdso_update_begin - Start of a VDSO update section + * + * Allows architecture code to safely update the architecture specific VDSO + * data. Disables interrupts, acquires timekeeper lock to serialize against + * concurrent updates from timekeeping and invalidates the VDSO data + * sequence counter to prevent concurrent readers from accessing + * inconsistent data. + * + * Returns: Saved interrupt flags which need to be handed in to + * vdso_update_end(). + */ +unsigned long vdso_update_begin(void) +{ + struct vdso_data *vdata = __arch_get_k_vdso_data(); + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + vdso_write_begin(vdata); + return flags; +} + +/** + * vdso_update_end - End of a VDSO update section + * @flags: Interrupt flags as returned from vdso_update_begin() + * + * Pairs with vdso_update_begin(). Marks vdso data consistent, invokes data + * synchronization if the architecture requires it, drops timekeeper lock + * and restores interrupt flags. + */ +void vdso_update_end(unsigned long flags) +{ + struct vdso_data *vdata = __arch_get_k_vdso_data(); + + vdso_write_end(vdata); + __arch_sync_vdso_data(vdata); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +} -- cgit v1.2.3 From 45fd22da97c6125d8d0d35bd1791e7c0c4175279 Mon Sep 17 00:00:00 2001 From: Alexey Budankov Date: Wed, 5 Aug 2020 10:56:56 +0300 Subject: perf/core: Take over CAP_SYS_PTRACE creds to CAP_PERFMON capability Open access to per-process monitoring for CAP_PERFMON only privileged processes [1]. Extend ptrace_may_access() check in perf_events subsystem with perfmon_capable() to simplify user experience and make monitoring more secure by reducing attack surface. [1] https://lore.kernel.org/lkml/7776fa40-6c65-2aa6-1322-eb3a01201000@linux.intel.com/ Signed-off-by: Alexey Budankov Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/6e8392ff-4732-0012-2949-e1587709f0f6@linux.intel.com --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 78e69e10482a..41e0cefb429b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11689,7 +11689,7 @@ SYSCALL_DEFINE5(perf_event_open, goto err_task; /* - * Reuse ptrace permission checks for now. + * Preserve ptrace permission check for backwards compatibility. * * We must hold exec_update_mutex across this and any potential * perf_install_in_context() call for this new event to @@ -11697,7 +11697,7 @@ SYSCALL_DEFINE5(perf_event_open, * perf_event_exit_task() that could imply). */ err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) + if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) goto err_cred; } -- cgit v1.2.3 From 10de795a5addd1962406796a6e13ba6cc0fc6bee Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 6 Aug 2020 01:20:46 +0800 Subject: kprobes: Fix compiler warning for !CONFIG_KPROBES_ON_FTRACE Fix compiler warning(as show below) for !CONFIG_KPROBES_ON_FTRACE. kernel/kprobes.c: In function 'kill_kprobe': kernel/kprobes.c:1116:33: warning: statement with no effect [-Wunused-value] 1116 | #define disarm_kprobe_ftrace(p) (-ENODEV) | ^ kernel/kprobes.c:2154:3: note: in expansion of macro 'disarm_kprobe_ftrace' 2154 | disarm_kprobe_ftrace(p); Link: https://lore.kernel.org/r/20200805142136.0331f7ea@canb.auug.org.au Link: https://lkml.kernel.org/r/20200805172046.19066-1-songmuchun@bytedance.com Reported-by: Stephen Rothwell Fixes: 0cb2f1372baa ("kprobes: Fix NULL pointer dereference at kprobe_ftrace_handler") Acked-by: Masami Hiramatsu Acked-by: John Fastabend Signed-off-by: Muchun Song Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 07bf03fcf574..66d14107968d 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1079,9 +1079,20 @@ static int disarm_kprobe_ftrace(struct kprobe *p) ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled); } #else /* !CONFIG_KPROBES_ON_FTRACE */ -#define prepare_kprobe(p) arch_prepare_kprobe(p) -#define arm_kprobe_ftrace(p) (-ENODEV) -#define disarm_kprobe_ftrace(p) (-ENODEV) +static inline int prepare_kprobe(struct kprobe *p) +{ + return arch_prepare_kprobe(p); +} + +static inline int arm_kprobe_ftrace(struct kprobe *p) +{ + return -ENODEV; +} + +static inline int disarm_kprobe_ftrace(struct kprobe *p) +{ + return -ENODEV; +} #endif /* Arm a kprobe with text_mutex */ -- cgit v1.2.3 From 820903c784a01bf6e143253418508da4f5790cff Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 30 Jul 2020 12:14:05 +0200 Subject: posix-cpu-timers: Split run_posix_cpu_timers() Split it up as a preparatory step to move the heavy lifting out of interrupt context. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Reviewed-by: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20200730102337.677439437@linutronix.de --- kernel/time/posix-cpu-timers.c | 43 +++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 165117996ea0..e5ad87320468 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1080,32 +1080,15 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) return false; } -/* - * This is called from the timer interrupt handler. The irq handler has - * already updated our counts. We need to check if any timers fire now. - * Interrupts are disabled. - */ -void run_posix_cpu_timers(void) +static void __run_posix_cpu_timers(struct task_struct *tsk) { - struct task_struct *tsk = current; struct k_itimer *timer, *next; unsigned long flags; LIST_HEAD(firing); - lockdep_assert_irqs_disabled(); - - /* - * The fast path checks that there are no expired thread or thread - * group timers. If that's so, just return. - */ - if (!fastpath_timer_check(tsk)) + if (!lock_task_sighand(tsk, &flags)) return; - lockdep_posixtimer_enter(); - if (!lock_task_sighand(tsk, &flags)) { - lockdep_posixtimer_exit(); - return; - } /* * Here we take off tsk->signal->cpu_timers[N] and * tsk->cpu_timers[N] all the timers that are firing, and @@ -1147,6 +1130,28 @@ void run_posix_cpu_timers(void) cpu_timer_fire(timer); spin_unlock(&timer->it_lock); } +} + +/* + * This is called from the timer interrupt handler. The irq handler has + * already updated our counts. We need to check if any timers fire now. + * Interrupts are disabled. + */ +void run_posix_cpu_timers(void) +{ + struct task_struct *tsk = current; + + lockdep_assert_irqs_disabled(); + + /* + * The fast path checks that there are no expired thread or thread + * group timers. If that's so, just return. + */ + if (!fastpath_timer_check(tsk)) + return; + + lockdep_posixtimer_enter(); + __run_posix_cpu_timers(tsk); lockdep_posixtimer_exit(); } -- cgit v1.2.3 From 1fb497dd003009be95ce67689ac800c446b7acc5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 30 Jul 2020 12:14:06 +0200 Subject: posix-cpu-timers: Provide mechanisms to defer timer handling to task_work Running posix CPU timers in hard interrupt context has a few downsides: - For PREEMPT_RT it cannot work as the expiry code needs to take sighand lock, which is a 'sleeping spinlock' in RT. The original RT approach of offloading the posix CPU timer handling into a high priority thread was clumsy and provided no real benefit in general. - For fine grained accounting it's just wrong to run this in context of the timer interrupt because that way a process specific CPU time is accounted to the timer interrupt. - Long running timer interrupts caused by a large amount of expiring timers which can be created and armed by unpriviledged user space. There is no hard requirement to expire them in interrupt context. If the signal is targeted at the task itself then it won't be delivered before the task returns to user space anyway. If the signal is targeted at a supervisor process then it might be slightly delayed, but posix CPU timers are inaccurate anyway due to the fact that they are tied to the tick. Provide infrastructure to schedule task work which allows splitting the posix CPU timer code into a quick check in interrupt context and a thread context expiry and signal delivery function. This has to be enabled by architectures as it requires that the architecture specific KVM implementation handles pending task work before exiting to guest mode. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Reviewed-by: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20200730102337.783470146@linutronix.de --- include/linux/posix-timers.h | 17 ++++ include/linux/sched.h | 4 + kernel/time/Kconfig | 9 ++ kernel/time/posix-cpu-timers.c | 185 ++++++++++++++++++++++++++++++++++++++--- kernel/time/timer.c | 1 + 5 files changed, 204 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index e3f0f8585da4..896c16d2c5fb 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -6,6 +6,7 @@ #include #include #include +#include struct kernel_siginfo; struct task_struct; @@ -125,6 +126,16 @@ struct posix_cputimers { unsigned int expiry_active; }; +/** + * posix_cputimers_work - Container for task work based posix CPU timer expiry + * @work: The task work to be scheduled + * @scheduled: @work has been scheduled already, no further processing + */ +struct posix_cputimers_work { + struct callback_head work; + unsigned int scheduled; +}; + static inline void posix_cputimers_init(struct posix_cputimers *pct) { memset(pct, 0, sizeof(*pct)); @@ -165,6 +176,12 @@ static inline void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) { } #endif +#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK +void posix_cputimers_init_work(void); +#else +static inline void posix_cputimers_init_work(void) { } +#endif + #define REQUEUE_PENDING 1 /** diff --git a/include/linux/sched.h b/include/linux/sched.h index 06ec60462af0..e9942ce07ef1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -889,6 +889,10 @@ struct task_struct { /* Empty if CONFIG_POSIX_CPUTIMERS=n */ struct posix_cputimers posix_cputimers; +#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK + struct posix_cputimers_work posix_cputimers_work; +#endif + /* Process credentials: */ /* Tracer's credentials at attach: */ diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index fcc42353f125..a09b1d61df6a 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -52,6 +52,15 @@ config GENERIC_CLOCKEVENTS_MIN_ADJUST config GENERIC_CMOS_UPDATE bool +# Select to handle posix CPU timers from task_work +# and not from the timer interrupt context +config HAVE_POSIX_CPU_TIMERS_TASK_WORK + bool + +config POSIX_CPU_TIMERS_TASK_WORK + bool + default y if POSIX_TIMERS && HAVE_POSIX_CPU_TIMERS_TASK_WORK + if GENERIC_CLOCKEVENTS menu "Timers subsystem" diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index e5ad87320468..a71758e34e45 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -377,6 +377,7 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) */ static int posix_cpu_timer_create(struct k_itimer *new_timer) { + static struct lock_class_key posix_cpu_timers_key; struct pid *pid; rcu_read_lock(); @@ -386,6 +387,17 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) return -EINVAL; } + /* + * If posix timer expiry is handled in task work context then + * timer::it_lock can be taken without disabling interrupts as all + * other locking happens in task context. This requires a seperate + * lock class key otherwise regular posix timer expiry would record + * the lock class being taken in interrupt context and generate a + * false positive warning. + */ + if (IS_ENABLED(CONFIG_POSIX_CPU_TIMERS_TASK_WORK)) + lockdep_set_class(&new_timer->it_lock, &posix_cpu_timers_key); + new_timer->kclock = &clock_posix_cpu; timerqueue_init(&new_timer->it.cpu.node); new_timer->it.cpu.pid = get_pid(pid); @@ -1080,26 +1092,163 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) return false; } -static void __run_posix_cpu_timers(struct task_struct *tsk) +static void handle_posix_cpu_timers(struct task_struct *tsk); + +#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK +static void posix_cpu_timers_work(struct callback_head *work) +{ + handle_posix_cpu_timers(current); +} + +/* + * Initialize posix CPU timers task work in init task. Out of line to + * keep the callback static and to avoid header recursion hell. + */ +void __init posix_cputimers_init_work(void) +{ + init_task_work(¤t->posix_cputimers_work.work, + posix_cpu_timers_work); +} + +/* + * Note: All operations on tsk->posix_cputimer_work.scheduled happen either + * in hard interrupt context or in task context with interrupts + * disabled. Aside of that the writer/reader interaction is always in the + * context of the current task, which means they are strict per CPU. + */ +static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk) +{ + return tsk->posix_cputimers_work.scheduled; +} + +static inline void __run_posix_cpu_timers(struct task_struct *tsk) +{ + if (WARN_ON_ONCE(tsk->posix_cputimers_work.scheduled)) + return; + + /* Schedule task work to actually expire the timers */ + tsk->posix_cputimers_work.scheduled = true; + task_work_add(tsk, &tsk->posix_cputimers_work.work, TWA_RESUME); +} + +static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk, + unsigned long start) +{ + bool ret = true; + + /* + * On !RT kernels interrupts are disabled while collecting expired + * timers, so no tick can happen and the fast path check can be + * reenabled without further checks. + */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + tsk->posix_cputimers_work.scheduled = false; + return true; + } + + /* + * On RT enabled kernels ticks can happen while the expired timers + * are collected under sighand lock. But any tick which observes + * the CPUTIMERS_WORK_SCHEDULED bit set, does not run the fastpath + * checks. So reenabling the tick work has do be done carefully: + * + * Disable interrupts and run the fast path check if jiffies have + * advanced since the collecting of expired timers started. If + * jiffies have not advanced or the fast path check did not find + * newly expired timers, reenable the fast path check in the timer + * interrupt. If there are newly expired timers, return false and + * let the collection loop repeat. + */ + local_irq_disable(); + if (start != jiffies && fastpath_timer_check(tsk)) + ret = false; + else + tsk->posix_cputimers_work.scheduled = false; + local_irq_enable(); + + return ret; +} +#else /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */ +static inline void __run_posix_cpu_timers(struct task_struct *tsk) +{ + lockdep_posixtimer_enter(); + handle_posix_cpu_timers(tsk); + lockdep_posixtimer_exit(); +} + +static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk) +{ + return false; +} + +static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk, + unsigned long start) +{ + return true; +} +#endif /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */ + +static void handle_posix_cpu_timers(struct task_struct *tsk) { struct k_itimer *timer, *next; - unsigned long flags; + unsigned long flags, start; LIST_HEAD(firing); if (!lock_task_sighand(tsk, &flags)) return; - /* - * Here we take off tsk->signal->cpu_timers[N] and - * tsk->cpu_timers[N] all the timers that are firing, and - * put them on the firing list. - */ - check_thread_timers(tsk, &firing); + do { + /* + * On RT locking sighand lock does not disable interrupts, + * so this needs to be careful vs. ticks. Store the current + * jiffies value. + */ + start = READ_ONCE(jiffies); + barrier(); - check_process_timers(tsk, &firing); + /* + * Here we take off tsk->signal->cpu_timers[N] and + * tsk->cpu_timers[N] all the timers that are firing, and + * put them on the firing list. + */ + check_thread_timers(tsk, &firing); + + check_process_timers(tsk, &firing); + + /* + * The above timer checks have updated the exipry cache and + * because nothing can have queued or modified timers after + * sighand lock was taken above it is guaranteed to be + * consistent. So the next timer interrupt fastpath check + * will find valid data. + * + * If timer expiry runs in the timer interrupt context then + * the loop is not relevant as timers will be directly + * expired in interrupt context. The stub function below + * returns always true which allows the compiler to + * optimize the loop out. + * + * If timer expiry is deferred to task work context then + * the following rules apply: + * + * - On !RT kernels no tick can have happened on this CPU + * after sighand lock was acquired because interrupts are + * disabled. So reenabling task work before dropping + * sighand lock and reenabling interrupts is race free. + * + * - On RT kernels ticks might have happened but the tick + * work ignored posix CPU timer handling because the + * CPUTIMERS_WORK_SCHEDULED bit is set. Reenabling work + * must be done very carefully including a check whether + * ticks have happened since the start of the timer + * expiry checks. posix_cpu_timers_enable_work() takes + * care of that and eventually lets the expiry checks + * run again. + */ + } while (!posix_cpu_timers_enable_work(tsk, start)); /* - * We must release these locks before taking any timer's lock. + * We must release sighand lock before taking any timer's lock. * There is a potential race with timer deletion here, as the * siglock now protects our private firing list. We have set * the firing flag in each timer, so that a deletion attempt @@ -1117,6 +1266,13 @@ static void __run_posix_cpu_timers(struct task_struct *tsk) list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) { int cpu_firing; + /* + * spin_lock() is sufficient here even independent of the + * expiry context. If expiry happens in hard interrupt + * context it's obvious. For task work context it's safe + * because all other operations on timer::it_lock happen in + * task context (syscall or exit). + */ spin_lock(&timer->it_lock); list_del_init(&timer->it.cpu.elist); cpu_firing = timer->it.cpu.firing; @@ -1143,6 +1299,13 @@ void run_posix_cpu_timers(void) lockdep_assert_irqs_disabled(); + /* + * If the actual expiry is deferred to task work context and the + * work is already scheduled there is no point to do anything here. + */ + if (posix_cpu_timers_work_scheduled(tsk)) + return; + /* * The fast path checks that there are no expired thread or thread * group timers. If that's so, just return. @@ -1150,9 +1313,7 @@ void run_posix_cpu_timers(void) if (!fastpath_timer_check(tsk)) return; - lockdep_posixtimer_enter(); __run_posix_cpu_timers(tsk); - lockdep_posixtimer_exit(); } /* diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ae5029f984a8..a16764b0116e 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -2017,6 +2017,7 @@ static void __init init_timer_cpus(void) void __init init_timers(void) { init_timer_cpus(); + posix_cputimers_init_work(); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } -- cgit v1.2.3 From a2e9a95d2190ef55bf0724ecdf8a466d393a86b6 Mon Sep 17 00:00:00 2001 From: Lianbo Jiang Date: Tue, 4 Aug 2020 12:49:32 +0800 Subject: kexec: Improve & fix crash_exclude_mem_range() to handle overlapping ranges The crash_exclude_mem_range() function can only handle one memory region a time. It will fail in the case in which the passed in area covers several memory regions. In this case, it will only exclude the first region, then return, but leave the later regions unsolved. E.g in a NEC system with two usable RAM regions inside the low 1M: ... BIOS-e820: [mem 0x0000000000000000-0x000000000003efff] usable BIOS-e820: [mem 0x000000000003f000-0x000000000003ffff] reserved BIOS-e820: [mem 0x0000000000040000-0x000000000009ffff] usable It will only exclude the memory region [0, 0x3efff], the memory region [0x40000, 0x9ffff] will still be added into /proc/vmcore, which may cause the following failure when dumping vmcore: ioremap on RAM at 0x0000000000040000 - 0x0000000000040fff WARNING: CPU: 0 PID: 665 at arch/x86/mm/ioremap.c:186 __ioremap_caller+0x2c7/0x2e0 ... RIP: 0010:__ioremap_caller+0x2c7/0x2e0 ... cp: error reading '/proc/vmcore': Cannot allocate memory kdump: saving vmcore failed In order to fix this bug, let's extend the crash_exclude_mem_range() to handle the overlapping ranges. [ mingo: Amended the changelog. ] Signed-off-by: Lianbo Jiang Signed-off-by: Ingo Molnar Acked-by: Dave Young Link: https://lore.kernel.org/r/20200804044933.1973-3-lijiang@redhat.com --- kernel/kexec_file.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 94661d2d13ad..97fa68267197 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -1157,24 +1157,26 @@ int crash_exclude_mem_range(struct crash_mem *mem, unsigned long long mstart, unsigned long long mend) { int i, j; - unsigned long long start, end; + unsigned long long start, end, p_start, p_end; struct crash_mem_range temp_range = {0, 0}; for (i = 0; i < mem->nr_ranges; i++) { start = mem->ranges[i].start; end = mem->ranges[i].end; + p_start = mstart; + p_end = mend; if (mstart > end || mend < start) continue; /* Truncate any area outside of range */ if (mstart < start) - mstart = start; + p_start = start; if (mend > end) - mend = end; + p_end = end; /* Found completely overlapping range */ - if (mstart == start && mend == end) { + if (p_start == start && p_end == end) { mem->ranges[i].start = 0; mem->ranges[i].end = 0; if (i < mem->nr_ranges - 1) { @@ -1185,20 +1187,29 @@ int crash_exclude_mem_range(struct crash_mem *mem, mem->ranges[j].end = mem->ranges[j+1].end; } + + /* + * Continue to check if there are another overlapping ranges + * from the current position because of shifting the above + * mem ranges. + */ + i--; + mem->nr_ranges--; + continue; } mem->nr_ranges--; return 0; } - if (mstart > start && mend < end) { + if (p_start > start && p_end < end) { /* Split original range */ - mem->ranges[i].end = mstart - 1; - temp_range.start = mend + 1; + mem->ranges[i].end = p_start - 1; + temp_range.start = p_end + 1; temp_range.end = end; - } else if (mstart != start) - mem->ranges[i].end = mstart - 1; + } else if (p_start != start) + mem->ranges[i].end = p_start - 1; else - mem->ranges[i].start = mend + 1; + mem->ranges[i].start = p_end + 1; break; } @@ -1243,7 +1254,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64). * I think this is required by tools like gdb. So same physical - * memory will be mapped in two elf headers. One will contain kernel + * memory will be mapped in two elf headers. One will contain kernel * text virtual addresses and other will have __va(physical) addresses. */ @@ -1270,7 +1281,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, ehdr->e_ehsize = sizeof(Elf64_Ehdr); ehdr->e_phentsize = sizeof(Elf64_Phdr); - /* Prepare one phdr of type PT_NOTE for each present cpu */ + /* Prepare one phdr of type PT_NOTE for each present CPU */ for_each_present_cpu(cpu) { phdr->p_type = PT_NOTE; notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu)); -- cgit v1.2.3 From 475f63ae63b5102ae6423d1712333929d04d6ecc Mon Sep 17 00:00:00 2001 From: Lianbo Jiang Date: Tue, 4 Aug 2020 12:49:33 +0800 Subject: kexec_file: Correctly output debugging information for the PT_LOAD ELF header Currently, when we enable the debugging switch to debug kexec_file, we always get the following incorrect results: kexec_file: Crash PT_LOAD elf header. phdr=00000000c988639b vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=51 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=000000003cca69a0 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=52 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=00000000c584cb9f vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=53 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=00000000cf85d57f vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=54 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=00000000a4a8f847 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=55 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=00000000272ec49f vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=56 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=00000000ea0b65de vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=57 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=000000001f5e490c vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=58 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=00000000dfe4109e vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=59 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=00000000480ed2b6 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=60 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=0000000080b65151 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=61 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=0000000024e31c5e vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=62 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=00000000332e0385 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=63 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=000000002754d5da vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=64 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=00000000783320dd vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=65 p_offset=0x0 kexec_file: Crash PT_LOAD elf header. phdr=0000000076fe5b64 vaddr=0x0, paddr=0x0, sz=0x0 e_phnum=66 p_offset=0x0 The reason is that kernel always prints the values of the next PT_LOAD instead of the current PT_LOAD. Change it to ensure that we can get the correct debugging information. [ mingo: Amended changelog, capitalized "ELF". ] Signed-off-by: Lianbo Jiang Signed-off-by: Ingo Molnar Acked-by: Dave Young Link: https://lore.kernel.org/r/20200804044933.1973-4-lijiang@redhat.com --- kernel/kexec_file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 97fa68267197..3f7867c1820f 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -1246,7 +1246,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, unsigned long long notes_addr; unsigned long mstart, mend; - /* extra phdr for vmcoreinfo elf note */ + /* extra phdr for vmcoreinfo ELF note */ nr_phdr = nr_cpus + 1; nr_phdr += mem->nr_ranges; @@ -1254,7 +1254,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64). * I think this is required by tools like gdb. So same physical - * memory will be mapped in two elf headers. One will contain kernel + * memory will be mapped in two ELF headers. One will contain kernel * text virtual addresses and other will have __va(physical) addresses. */ @@ -1323,10 +1323,10 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; phdr->p_align = 0; ehdr->e_phnum++; - phdr++; - pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", + pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, ehdr->e_phnum, phdr->p_offset); + phdr++; } *addr = buf; -- cgit v1.2.3 From 5e7b30205cef80f6bb922e61834437ca7bff5837 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 4 Aug 2020 22:50:56 -0700 Subject: bpf: Change uapi for bpf iterator map elements Commit a5cbe05a6673 ("bpf: Implement bpf iterator for map elements") added bpf iterator support for map elements. The map element bpf iterator requires info to identify a particular map. In the above commit, the attr->link_create.target_fd is used to carry map_fd and an enum bpf_iter_link_info is added to uapi to specify the target_fd actually representing a map_fd: enum bpf_iter_link_info { BPF_ITER_LINK_UNSPEC = 0, BPF_ITER_LINK_MAP_FD = 1, MAX_BPF_ITER_LINK_INFO, }; This is an extensible approach as we can grow enumerator for pid, cgroup_id, etc. and we can unionize target_fd for pid, cgroup_id, etc. But in the future, there are chances that more complex customization may happen, e.g., for tasks, it could be filtered based on both cgroup_id and user_id. This patch changed the uapi to have fields __aligned_u64 iter_info; __u32 iter_info_len; for additional iter_info for link_create. The iter_info is defined as union bpf_iter_link_info { struct { __u32 map_fd; } map; }; So future extension for additional customization will be easier. The bpf_iter_link_info will be passed to target callback to validate and generic bpf_iter framework does not need to deal it any more. Note that map_fd = 0 will be considered invalid and -EBADF will be returned to user space. Fixes: a5cbe05a6673 ("bpf: Implement bpf iterator for map elements") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200805055056.1457463-1-yhs@fb.com --- include/linux/bpf.h | 10 ++++---- include/uapi/linux/bpf.h | 15 ++++++------ kernel/bpf/bpf_iter.c | 58 +++++++++++++++++++++++------------------------ kernel/bpf/map_iter.c | 37 +++++++++++++++++++++++------- kernel/bpf/syscall.c | 2 +- net/core/bpf_sk_storage.c | 37 +++++++++++++++++++++++------- 6 files changed, 102 insertions(+), 57 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cef4ef0d2b4e..55f694b63164 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1214,15 +1214,17 @@ struct bpf_iter_aux_info { struct bpf_map *map; }; -typedef int (*bpf_iter_check_target_t)(struct bpf_prog *prog, - struct bpf_iter_aux_info *aux); +typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux); +typedef void (*bpf_iter_detach_target_t)(struct bpf_iter_aux_info *aux); #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; - bpf_iter_check_target_t check_target; + bpf_iter_attach_target_t attach_target; + bpf_iter_detach_target_t detach_target; u32 ctx_arg_info_size; - enum bpf_iter_link_info req_linfo; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; const struct bpf_iter_seq_info *seq_info; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b134e679e9db..0480f893facd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -81,6 +81,12 @@ struct bpf_cgroup_storage_key { __u32 attach_type; /* program attach type */ }; +union bpf_iter_link_info { + struct { + __u32 map_fd; + } map; +}; + /* BPF syscall commands, see bpf(2) man-page for details. */ enum bpf_cmd { BPF_MAP_CREATE, @@ -249,13 +255,6 @@ enum bpf_link_type { MAX_BPF_LINK_TYPE, }; -enum bpf_iter_link_info { - BPF_ITER_LINK_UNSPEC = 0, - BPF_ITER_LINK_MAP_FD = 1, - - MAX_BPF_ITER_LINK_INFO, -}; - /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command * * NONE(default): No further bpf programs allowed in the subtree. @@ -623,6 +622,8 @@ union bpf_attr { }; __u32 attach_type; /* attach type */ __u32 flags; /* extra flags */ + __aligned_u64 iter_info; /* extra bpf_iter_link_info */ + __u32 iter_info_len; /* iter_info length */ } link_create; struct { /* struct used by BPF_LINK_UPDATE command */ diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 363b9cafc2d8..b6715964b685 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -338,8 +338,8 @@ static void bpf_iter_link_release(struct bpf_link *link) struct bpf_iter_link *iter_link = container_of(link, struct bpf_iter_link, link); - if (iter_link->aux.map) - bpf_map_put_with_uref(iter_link->aux.map); + if (iter_link->tinfo->reg_info->detach_target) + iter_link->tinfo->reg_info->detach_target(&iter_link->aux); } static void bpf_iter_link_dealloc(struct bpf_link *link) @@ -390,15 +390,35 @@ bool bpf_link_is_iter(struct bpf_link *link) int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { + union bpf_iter_link_info __user *ulinfo; struct bpf_link_primer link_primer; struct bpf_iter_target_info *tinfo; - struct bpf_iter_aux_info aux = {}; + union bpf_iter_link_info linfo; struct bpf_iter_link *link; - u32 prog_btf_id, target_fd; + u32 prog_btf_id, linfo_len; bool existed = false; - struct bpf_map *map; int err; + if (attr->link_create.target_fd || attr->link_create.flags) + return -EINVAL; + + memset(&linfo, 0, sizeof(union bpf_iter_link_info)); + + ulinfo = u64_to_user_ptr(attr->link_create.iter_info); + linfo_len = attr->link_create.iter_info_len; + if (!ulinfo ^ !linfo_len) + return -EINVAL; + + if (ulinfo) { + err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo), + linfo_len); + if (err) + return err; + linfo_len = min_t(u32, linfo_len, sizeof(linfo)); + if (copy_from_user(&linfo, ulinfo, linfo_len)) + return -EFAULT; + } + prog_btf_id = prog->aux->attach_btf_id; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { @@ -411,13 +431,6 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) if (!existed) return -ENOENT; - /* Make sure user supplied flags are target expected. */ - target_fd = attr->link_create.target_fd; - if (attr->link_create.flags != tinfo->reg_info->req_linfo) - return -EINVAL; - if (!attr->link_create.flags && target_fd) - return -EINVAL; - link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); if (!link) return -ENOMEM; @@ -431,28 +444,15 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return err; } - if (tinfo->reg_info->req_linfo == BPF_ITER_LINK_MAP_FD) { - map = bpf_map_get_with_uref(target_fd); - if (IS_ERR(map)) { - err = PTR_ERR(map); - goto cleanup_link; - } - - aux.map = map; - err = tinfo->reg_info->check_target(prog, &aux); + if (tinfo->reg_info->attach_target) { + err = tinfo->reg_info->attach_target(prog, &linfo, &link->aux); if (err) { - bpf_map_put_with_uref(map); - goto cleanup_link; + bpf_link_cleanup(&link_primer); + return err; } - - link->aux.map = map; } return bpf_link_settle(&link_primer); - -cleanup_link: - bpf_link_cleanup(&link_primer); - return err; } static void init_seq_meta(struct bpf_iter_priv_data *priv_data, diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index fbe1f557cb88..af86048e5afd 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -98,12 +98,21 @@ static struct bpf_iter_reg bpf_map_reg_info = { .seq_info = &bpf_map_seq_info, }; -static int bpf_iter_check_map(struct bpf_prog *prog, - struct bpf_iter_aux_info *aux) +static int bpf_iter_attach_map(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) { u32 key_acc_size, value_acc_size, key_size, value_size; - struct bpf_map *map = aux->map; + struct bpf_map *map; bool is_percpu = false; + int err = -EINVAL; + + if (!linfo->map.map_fd) + return -EBADF; + + map = bpf_map_get_with_uref(linfo->map.map_fd); + if (IS_ERR(map)) + return PTR_ERR(map); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || @@ -112,7 +121,7 @@ static int bpf_iter_check_map(struct bpf_prog *prog, else if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) - return -EINVAL; + goto put_map; key_acc_size = prog->aux->max_rdonly_access; value_acc_size = prog->aux->max_rdwr_access; @@ -122,10 +131,22 @@ static int bpf_iter_check_map(struct bpf_prog *prog, else value_size = round_up(map->value_size, 8) * num_possible_cpus(); - if (key_acc_size > key_size || value_acc_size > value_size) - return -EACCES; + if (key_acc_size > key_size || value_acc_size > value_size) { + err = -EACCES; + goto put_map; + } + aux->map = map; return 0; + +put_map: + bpf_map_put_with_uref(map); + return err; +} + +static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux) +{ + bpf_map_put_with_uref(aux->map); } DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta, @@ -133,8 +154,8 @@ DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta, static const struct bpf_iter_reg bpf_map_elem_reg_info = { .target = "bpf_map_elem", - .check_target = bpf_iter_check_map, - .req_linfo = BPF_ITER_LINK_MAP_FD, + .attach_target = bpf_iter_attach_map, + .detach_target = bpf_iter_detach_map, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map_elem, key), diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2f343ce15747..86299a292214 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3883,7 +3883,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog * return -EINVAL; } -#define BPF_LINK_CREATE_LAST_FIELD link_create.flags +#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len static int link_create(union bpf_attr *attr) { enum bpf_prog_type ptype; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index d3377c90a291..b988f48153a4 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -1384,18 +1384,39 @@ static int bpf_iter_init_sk_storage_map(void *priv_data, return 0; } -static int bpf_iter_check_map(struct bpf_prog *prog, - struct bpf_iter_aux_info *aux) +static int bpf_iter_attach_map(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) { - struct bpf_map *map = aux->map; + struct bpf_map *map; + int err = -EINVAL; + + if (!linfo->map.map_fd) + return -EBADF; + + map = bpf_map_get_with_uref(linfo->map.map_fd); + if (IS_ERR(map)) + return PTR_ERR(map); if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) - return -EINVAL; + goto put_map; - if (prog->aux->max_rdonly_access > map->value_size) - return -EACCES; + if (prog->aux->max_rdonly_access > map->value_size) { + err = -EACCES; + goto put_map; + } + aux->map = map; return 0; + +put_map: + bpf_map_put_with_uref(map); + return err; +} + +static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux) +{ + bpf_map_put_with_uref(aux->map); } static const struct seq_operations bpf_sk_storage_map_seq_ops = { @@ -1414,8 +1435,8 @@ static const struct bpf_iter_seq_info iter_seq_info = { static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { .target = "bpf_sk_storage_map", - .check_target = bpf_iter_check_map, - .req_linfo = BPF_ITER_LINK_MAP_FD, + .attach_target = bpf_iter_attach_map, + .detach_target = bpf_iter_detach_map, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), -- cgit v1.2.3 From 0d360d64b01231cdb36e1936de889f308fd9317f Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 6 Aug 2020 11:26:12 -0700 Subject: bpf: Remove inline from bpf_do_trace_printk I get the following error during compilation on my side: kernel/trace/bpf_trace.c: In function 'bpf_do_trace_printk': kernel/trace/bpf_trace.c:386:34: error: function 'bpf_do_trace_printk' can never be inlined because it uses variable argument lists static inline __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...) ^ Fixes: ac5a72ea5c89 ("bpf: Use dedicated bpf_trace_printk event instead of trace_printk()") Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200806182612.1390883-1-sdf@google.com --- kernel/trace/bpf_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index cb91ef902cc4..a8d4f253ed77 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -383,7 +383,7 @@ static DEFINE_RAW_SPINLOCK(trace_printk_lock); #define BPF_TRACE_PRINTK_SIZE 1024 -static inline __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...) +static __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...) { static char buf[BPF_TRACE_PRINTK_SIZE]; unsigned long flags; -- cgit v1.2.3 From b8c1a3090741f349322ad855d2b66d6e9752a60d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 6 Aug 2020 20:31:41 -0700 Subject: bpf: Delete repeated words in comments Drop repeated words in kernel/bpf/: {has, the} Signed-off-by: Randy Dunlap Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200807033141.10437-1-rdunlap@infradead.org --- kernel/bpf/core.c | 2 +- kernel/bpf/verifier.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index bde93344164d..ed0b3578867c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1966,7 +1966,7 @@ void bpf_prog_array_delete_safe(struct bpf_prog_array *array, * @index: the index of the program to replace * * Skips over dummy programs, by not counting them, when calculating - * the the position of the program to replace. + * the position of the program to replace. * * Return: * * 0 - Success diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b6ccfce3bf4c..ef938f17b944 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8294,7 +8294,7 @@ static bool stacksafe(struct bpf_func_state *old, if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != cur->stack[spi].slot_type[i % BPF_REG_SIZE]) /* Ex: old explored (safe) state has STACK_SPILL in - * this stack slot, but current has has STACK_MISC -> + * this stack slot, but current has STACK_MISC -> * this verifier states are not equivalent, * return false to continue verification of this path */ -- cgit v1.2.3 From 11990a5bd7e558e9203c1070fc52fb6f0488e75b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 6 Aug 2020 14:15:23 -0700 Subject: module: Correctly truncate sysfs sections output The only-root-readable /sys/module/$module/sections/$section files did not truncate their output to the available buffer size. While most paths into the kernfs read handlers end up using PAGE_SIZE buffers, it's possible to get there through other paths (e.g. splice, sendfile). Actually limit the output to the "count" passed into the read function, and report it back correctly. *sigh* Reported-by: kernel test robot Link: https://lore.kernel.org/lkml/20200805002015.GE23458@shao2-debian Fixes: ed66f991bb19 ("module: Refactor section attr into bin attribute") Cc: stable@vger.kernel.org Reviewed-by: Greg Kroah-Hartman Acked-by: Jessica Yu Signed-off-by: Kees Cook --- kernel/module.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index aa183c9ac0a2..08c46084d8cc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1520,18 +1520,34 @@ struct module_sect_attrs { struct module_sect_attr attrs[]; }; +#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4)) static ssize_t module_sect_read(struct file *file, struct kobject *kobj, struct bin_attribute *battr, char *buf, loff_t pos, size_t count) { struct module_sect_attr *sattr = container_of(battr, struct module_sect_attr, battr); + char bounce[MODULE_SECT_READ_SIZE + 1]; + size_t wrote; if (pos != 0) return -EINVAL; - return sprintf(buf, "0x%px\n", - kallsyms_show_value(file->f_cred) ? (void *)sattr->address : NULL); + /* + * Since we're a binary read handler, we must account for the + * trailing NUL byte that sprintf will write: if "buf" is + * too small to hold the NUL, or the NUL is exactly the last + * byte, the read will look like it got truncated by one byte. + * Since there is no way to ask sprintf nicely to not write + * the NUL, we have to use a bounce buffer. + */ + wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n", + kallsyms_show_value(file->f_cred) + ? (void *)sattr->address : NULL); + count = min(count, wrote); + memcpy(buf, bounce, count); + + return count; } static void free_sect_attrs(struct module_sect_attrs *sect_attrs) @@ -1580,7 +1596,7 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info) goto out; sect_attrs->nsections++; sattr->battr.read = module_sect_read; - sattr->battr.size = 3 /* "0x", "\n" */ + (BITS_PER_LONG / 4); + sattr->battr.size = MODULE_SECT_READ_SIZE; sattr->battr.attr.mode = 0400; *(gattr++) = &(sattr++)->battr; } -- cgit v1.2.3 From 38cf307c1f2011d413750c5acb725456f47d9172 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Aug 2020 23:17:16 -0700 Subject: mm: fix kthread_use_mm() vs TLB invalidate For SMP systems using IPI based TLB invalidation, looking at current->active_mm is entirely reasonable. This then presents the following race condition: CPU0 CPU1 flush_tlb_mm(mm) use_mm(mm) tsk->active_mm = mm; if (tsk->active_mm == mm) // flush TLBs switch_mm(old_mm,mm,tsk); Where it is possible the IPI flushed the TLBs for @old_mm, not @mm, because the IPI lands before we actually switched. Avoid this by disabling IRQs across changing ->active_mm and switch_mm(). Of the (SMP) architectures that have IPI based TLB invalidate: Alpha - checks active_mm ARC - ASID specific IA64 - checks active_mm MIPS - ASID specific flush OpenRISC - shoots down world PARISC - shoots down world SH - ASID specific SPARC - ASID specific x86 - N/A xtensa - checks active_mm So at the very least Alpha, IA64 and Xtensa are suspect. On top of this, for scheduler consistency we need at least preemption disabled across changing tsk->mm and doing switch_mm(), which is currently provided by task_lock(), but that's not sufficient for PREEMPT_RT. [akpm@linux-foundation.org: add comment] Reported-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Cc: Nicholas Piggin Cc: Jens Axboe Cc: Kees Cook Cc: Jann Horn Cc: Will Deacon Cc: Christoph Hellwig Cc: Mathieu Desnoyers Cc: Link: http://lkml.kernel.org/r/20200721154106.GE10769@hirez.programming.kicks-ass.net Signed-off-by: Linus Torvalds --- kernel/kthread.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 1d9e2fdfd67a..1c8964feeb01 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1241,13 +1241,16 @@ void kthread_use_mm(struct mm_struct *mm) WARN_ON_ONCE(tsk->mm); task_lock(tsk); + /* Hold off tlb flush IPIs while switching mm's */ + local_irq_disable(); active_mm = tsk->active_mm; if (active_mm != mm) { mmgrab(mm); tsk->active_mm = mm; } tsk->mm = mm; - switch_mm(active_mm, mm, tsk); + switch_mm_irqs_off(active_mm, mm, tsk); + local_irq_enable(); task_unlock(tsk); #ifdef finish_arch_post_lock_switch finish_arch_post_lock_switch(); @@ -1276,9 +1279,11 @@ void kthread_unuse_mm(struct mm_struct *mm) task_lock(tsk); sync_mm_rss(mm); + local_irq_disable(); tsk->mm = NULL; /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); + local_irq_enable(); task_unlock(tsk); } EXPORT_SYMBOL_GPL(kthread_unuse_mm); -- cgit v1.2.3 From 4ca1085c9573ea08767521dabce62456e3fc2fd0 Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Thu, 6 Aug 2020 23:17:19 -0700 Subject: kthread: remove incorrect comment in kthread_create_on_cpu() Originally kthread_create_on_cpu() parked and woke up the new thread. However, since commit a65d40961dc7 ("kthread/smpboot: do not park in kthread_create_on_cpu()") this is no longer the case. This patch removes the comment that has been left behind and is now incorrect / stale. Fixes: a65d40961dc7 ("kthread/smpboot: do not park in kthread_create_on_cpu()") Signed-off-by: Ilias Stamatis Signed-off-by: Andrew Morton Reviewed-by: Petr Mladek Link: http://lkml.kernel.org/r/20200611135920.240551-1-stamatis.iliass@gmail.com Signed-off-by: Linus Torvalds --- kernel/kthread.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 1c8964feeb01..b2807e7be772 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -480,7 +480,6 @@ EXPORT_SYMBOL(kthread_bind); * to "name.*%u". Code fills in cpu number. * * Description: This helper function creates and names a kernel thread - * The thread will be woken and put into park mode. */ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), void *data, unsigned int cpu, -- cgit v1.2.3 From d42f3245c7e299e017213fa028c319316bcdb7f4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 6 Aug 2020 23:20:39 -0700 Subject: mm: memcg: convert vmstat slab counters to bytes In order to prepare for per-object slab memory accounting, convert NR_SLAB_RECLAIMABLE and NR_SLAB_UNRECLAIMABLE vmstat items to bytes. To make it obvious, rename them to NR_SLAB_RECLAIMABLE_B and NR_SLAB_UNRECLAIMABLE_B (similar to NR_KERNEL_STACK_KB). Internally global and per-node counters are stored in pages, however memcg and lruvec counters are stored in bytes. This scheme may look weird, but only for now. As soon as slab pages will be shared between multiple cgroups, global and node counters will reflect the total number of slab pages. However memcg and lruvec counters will be used for per-memcg slab memory tracking, which will take separate kernel objects in the account. Keeping global and node counters in pages helps to avoid additional overhead. The size of slab memory shouldn't exceed 4Gb on 32-bit machines, so it will fit into atomic_long_t we use for vmstats. Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Michal Hocko Cc: Tejun Heo Link: http://lkml.kernel.org/r/20200623174037.3951353-4-guro@fb.com Signed-off-by: Linus Torvalds --- drivers/base/node.c | 4 ++-- fs/proc/meminfo.c | 4 ++-- include/linux/mmzone.h | 16 +++++++++++++--- kernel/power/snapshot.c | 2 +- mm/memcontrol.c | 11 ++++------- mm/oom_kill.c | 2 +- mm/page_alloc.c | 8 ++++---- mm/slab.h | 15 ++++++++------- mm/slab_common.c | 4 ++-- mm/slob.c | 12 ++++++------ mm/slub.c | 8 ++++---- mm/vmscan.c | 3 ++- mm/workingset.c | 6 ++++-- 13 files changed, 53 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/drivers/base/node.c b/drivers/base/node.c index e21e31359297..0cf13e31603c 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -368,8 +368,8 @@ static ssize_t node_read_meminfo(struct device *dev, unsigned long sreclaimable, sunreclaimable; si_meminfo_node(&i, nid); - sreclaimable = node_page_state(pgdat, NR_SLAB_RECLAIMABLE); - sunreclaimable = node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE); + sreclaimable = node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B); + sunreclaimable = node_page_state_pages(pgdat, NR_SLAB_UNRECLAIMABLE_B); n = sprintf(buf, "Node %d MemTotal: %8lu kB\n" "Node %d MemFree: %8lu kB\n" diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index e9a6841fc25b..38ea95fd919a 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -52,8 +52,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) pages[lru] = global_node_page_state(NR_LRU_BASE + lru); available = si_mem_available(); - sreclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE); - sunreclaim = global_node_page_state(NR_SLAB_UNRECLAIMABLE); + sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B); + sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B); show_val_kb(m, "MemTotal: ", i.totalram); show_val_kb(m, "MemFree: ", i.freeram); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f16306e15b98..b79100edd228 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -174,8 +174,8 @@ enum node_stat_item { NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ NR_UNEVICTABLE, /* " " " " " */ - NR_SLAB_RECLAIMABLE, - NR_SLAB_UNRECLAIMABLE, + NR_SLAB_RECLAIMABLE_B, + NR_SLAB_UNRECLAIMABLE_B, NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ WORKINGSET_NODES, @@ -213,7 +213,17 @@ enum node_stat_item { */ static __always_inline bool vmstat_item_in_bytes(int idx) { - return false; + /* + * Global and per-node slab counters track slab pages. + * It's expected that changes are multiples of PAGE_SIZE. + * Internally values are stored in pages. + * + * Per-memcg and per-lruvec counters track memory, consumed + * by individual slab objects. These counters are actually + * byte-precise. + */ + return (idx == NR_SLAB_RECLAIMABLE_B || + idx == NR_SLAB_UNRECLAIMABLE_B); } /* diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index cef154261fe2..d25749bce7cf 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1663,7 +1663,7 @@ static unsigned long minimum_image_size(unsigned long saveable) { unsigned long size; - size = global_node_page_state(NR_SLAB_RECLAIMABLE) + size = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + global_node_page_state(NR_ACTIVE_ANON) + global_node_page_state(NR_INACTIVE_ANON) + global_node_page_state(NR_ACTIVE_FILE) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 61ae6658d59f..328b7e7bf9ab 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1391,9 +1391,8 @@ static char *memory_stat_format(struct mem_cgroup *memcg) (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * 1024); seq_buf_printf(&s, "slab %llu\n", - (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) + - memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) * - PAGE_SIZE); + (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) + + memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B))); seq_buf_printf(&s, "sock %llu\n", (u64)memcg_page_state(memcg, MEMCG_SOCK) * PAGE_SIZE); @@ -1423,11 +1422,9 @@ static char *memory_stat_format(struct mem_cgroup *memcg) PAGE_SIZE); seq_buf_printf(&s, "slab_reclaimable %llu\n", - (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) * - PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B)); seq_buf_printf(&s, "slab_unreclaimable %llu\n", - (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) * - PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B)); /* Accumulated memory events */ diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 6e94962893ee..d30ce75f23fb 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -184,7 +184,7 @@ static bool is_dump_unreclaim_slabs(void) global_node_page_state(NR_ISOLATED_FILE) + global_node_page_state(NR_UNEVICTABLE); - return (global_node_page_state(NR_SLAB_UNRECLAIMABLE) > nr_lru); + return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru); } /** diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 901a21f61d68..f9ad093814d2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5220,8 +5220,8 @@ long si_mem_available(void) * items that are in use, and cannot be freed. Cap this estimate at the * low watermark. */ - reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) + - global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); + reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); available += reclaimable - min(reclaimable / 2, wmark_low); if (available < 0) @@ -5364,8 +5364,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state(NR_UNEVICTABLE), global_node_page_state(NR_FILE_DIRTY), global_node_page_state(NR_WRITEBACK), - global_node_page_state(NR_SLAB_RECLAIMABLE), - global_node_page_state(NR_SLAB_UNRECLAIMABLE), + global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B), + global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B), global_node_page_state(NR_FILE_MAPPED), global_node_page_state(NR_SHMEM), global_zone_page_state(NR_PAGETABLE), diff --git a/mm/slab.h b/mm/slab.h index fceb4341ba91..09be3ca6fe87 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -273,7 +273,7 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); static inline int cache_vmstat_idx(struct kmem_cache *s) { return (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE; + NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B; } #ifdef CONFIG_SLUB_DEBUG @@ -390,7 +390,7 @@ static __always_inline int memcg_charge_slab(struct page *page, if (unlikely(!memcg || mem_cgroup_is_root(memcg))) { mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), - nr_pages); + nr_pages << PAGE_SHIFT); percpu_ref_get_many(&s->memcg_params.refcnt, nr_pages); return 0; } @@ -400,7 +400,7 @@ static __always_inline int memcg_charge_slab(struct page *page, goto out; lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page)); - mod_lruvec_state(lruvec, cache_vmstat_idx(s), nr_pages); + mod_lruvec_state(lruvec, cache_vmstat_idx(s), nr_pages << PAGE_SHIFT); /* transer try_charge() page references to kmem_cache */ percpu_ref_get_many(&s->memcg_params.refcnt, nr_pages); @@ -425,11 +425,12 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, memcg = READ_ONCE(s->memcg_params.memcg); if (likely(!mem_cgroup_is_root(memcg))) { lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page)); - mod_lruvec_state(lruvec, cache_vmstat_idx(s), -nr_pages); + mod_lruvec_state(lruvec, cache_vmstat_idx(s), + -(nr_pages << PAGE_SHIFT)); memcg_kmem_uncharge(memcg, nr_pages); } else { mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), - -nr_pages); + -(nr_pages << PAGE_SHIFT)); } rcu_read_unlock(); @@ -513,7 +514,7 @@ static __always_inline int charge_slab_page(struct page *page, { if (is_root_cache(s)) { mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), - 1 << order); + PAGE_SIZE << order); return 0; } @@ -525,7 +526,7 @@ static __always_inline void uncharge_slab_page(struct page *page, int order, { if (is_root_cache(s)) { mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), - -(1 << order)); + -(PAGE_SIZE << order)); return; } diff --git a/mm/slab_common.c b/mm/slab_common.c index 616ec8a0d91a..a73f168b1035 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1363,8 +1363,8 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) page = alloc_pages(flags, order); if (likely(page)) { ret = page_address(page); - mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE, - 1 << order); + mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B, + PAGE_SIZE << order); } ret = kasan_kmalloc_large(ret, size, flags); /* As ret might get tagged, call kmemleak hook after KASAN. */ diff --git a/mm/slob.c b/mm/slob.c index ac2aecfbc7a8..7cc9805c8091 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -202,8 +202,8 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) if (!page) return NULL; - mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE, - 1 << order); + mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B, + PAGE_SIZE << order); return page_address(page); } @@ -214,8 +214,8 @@ static void slob_free_pages(void *b, int order) if (current->reclaim_state) current->reclaim_state->reclaimed_slab += 1 << order; - mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE, - -(1 << order)); + mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B, + -(PAGE_SIZE << order)); __free_pages(sp, order); } @@ -552,8 +552,8 @@ void kfree(const void *block) slob_free(m, *m + align); } else { unsigned int order = compound_order(sp); - mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE, - -(1 << order)); + mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B, + -(PAGE_SIZE << order)); __free_pages(sp, order); } diff --git a/mm/slub.c b/mm/slub.c index ae39eb392396..2d73d677f7ac 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3991,8 +3991,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) page = alloc_pages_node(node, flags, order); if (page) { ptr = page_address(page); - mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE, - 1 << order); + mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B, + PAGE_SIZE << order); } return kmalloc_large_node_hook(ptr, size, flags); @@ -4123,8 +4123,8 @@ void kfree(const void *x) BUG_ON(!PageCompound(page)); kfree_hook(object); - mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE, - -(1 << order)); + mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B, + -(PAGE_SIZE << order)); __free_pages(page, order); return; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 749d239c62b2..2ac43664aba4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4222,7 +4222,8 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) * unmapped file backed pages. */ if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages && - node_page_state(pgdat, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) + node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <= + pgdat->min_slab_pages) return NODE_RECLAIM_FULL; /* diff --git a/mm/workingset.c b/mm/workingset.c index 50b7937bab32..b199726924dd 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -486,8 +486,10 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) pages += lruvec_page_state_local(lruvec, NR_LRU_BASE + i); - pages += lruvec_page_state_local(lruvec, NR_SLAB_RECLAIMABLE); - pages += lruvec_page_state_local(lruvec, NR_SLAB_UNRECLAIMABLE); + pages += lruvec_page_state_local( + lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT; + pages += lruvec_page_state_local( + lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT; } else #endif pages = node_present_pages(sc->nid); -- cgit v1.2.3 From 991e7673859ed41e7ba83c8c4e57afe8cfebe314 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 6 Aug 2020 23:21:37 -0700 Subject: mm: memcontrol: account kernel stack per node Currently the kernel stack is being accounted per-zone. There is no need to do that. In addition due to being per-zone, memcg has to keep a separate MEMCG_KERNEL_STACK_KB. Make the stat per-node and deprecate MEMCG_KERNEL_STACK_KB as memcg_stat_item is an extension of node_stat_item. In addition localize the kernel stack stats updates to account_kernel_stack(). Signed-off-by: Shakeel Butt Signed-off-by: Andrew Morton Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Link: http://lkml.kernel.org/r/20200630161539.1759185-1-shakeelb@google.com Signed-off-by: Linus Torvalds --- drivers/base/node.c | 4 ++-- fs/proc/meminfo.c | 4 ++-- include/linux/memcontrol.h | 21 +++++++++++++++++-- include/linux/mmzone.h | 8 ++++---- kernel/fork.c | 51 +++++++++++++--------------------------------- kernel/scs.c | 2 +- mm/memcontrol.c | 2 +- mm/page_alloc.c | 16 +++++++-------- mm/vmstat.c | 8 ++++---- 9 files changed, 55 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/drivers/base/node.c b/drivers/base/node.c index 0cf13e31603c..508b80f6329b 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -440,9 +440,9 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(pgdat, NR_FILE_MAPPED)), nid, K(node_page_state(pgdat, NR_ANON_MAPPED)), nid, K(i.sharedram), - nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK_KB), + nid, node_page_state(pgdat, NR_KERNEL_STACK_KB), #ifdef CONFIG_SHADOW_CALL_STACK - nid, sum_zone_node_page_state(nid, NR_KERNEL_SCS_KB), + nid, node_page_state(pgdat, NR_KERNEL_SCS_KB), #endif nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)), nid, 0UL, diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 38ea95fd919a..2a4c58f70fb9 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -101,10 +101,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "SReclaimable: ", sreclaimable); show_val_kb(m, "SUnreclaim: ", sunreclaim); seq_printf(m, "KernelStack: %8lu kB\n", - global_zone_page_state(NR_KERNEL_STACK_KB)); + global_node_page_state(NR_KERNEL_STACK_KB)); #ifdef CONFIG_SHADOW_CALL_STACK seq_printf(m, "ShadowCallStack:%8lu kB\n", - global_zone_page_state(NR_KERNEL_SCS_KB)); + global_node_page_state(NR_KERNEL_SCS_KB)); #endif show_val_kb(m, "PageTables: ", global_zone_page_state(NR_PAGETABLE)); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5a8b62d075e6..624400c27eba 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -32,8 +32,6 @@ struct kmem_cache; enum memcg_stat_item { MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, MEMCG_SOCK, - /* XXX: why are these zone and not node counters? */ - MEMCG_KERNEL_STACK_KB, MEMCG_NR_STAT, }; @@ -729,8 +727,19 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val); + void mod_memcg_obj_state(void *p, int idx, int val); +static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx, + int val) +{ + unsigned long flags; + + local_irq_save(flags); + __mod_lruvec_slab_state(p, idx, val); + local_irq_restore(flags); +} + static inline void mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { @@ -1151,6 +1160,14 @@ static inline void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, __mod_node_page_state(page_pgdat(page), idx, val); } +static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx, + int val) +{ + struct page *page = virt_to_head_page(p); + + mod_node_page_state(page_pgdat(page), idx, val); +} + static inline void mod_memcg_obj_state(void *p, int idx, int val) { } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b79100edd228..a3bd54139a30 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -155,10 +155,6 @@ enum zone_stat_item { NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ NR_PAGETABLE, /* used for pagetables */ - NR_KERNEL_STACK_KB, /* measured in KiB */ -#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) - NR_KERNEL_SCS_KB, /* measured in KiB */ -#endif /* Second 128 byte cacheline */ NR_BOUNCE, #if IS_ENABLED(CONFIG_ZSMALLOC) @@ -203,6 +199,10 @@ enum node_stat_item { NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */ NR_FOLL_PIN_ACQUIRED, /* via: pin_user_page(), gup flag: FOLL_PIN */ NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */ + NR_KERNEL_STACK_KB, /* measured in KiB */ +#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) + NR_KERNEL_SCS_KB, /* measured in KiB */ +#endif NR_VM_NODE_STAT_ITEMS }; diff --git a/kernel/fork.c b/kernel/fork.c index 76d3f3387554..c7b4ce9d2647 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -276,13 +276,8 @@ static inline void free_thread_stack(struct task_struct *tsk) if (vm) { int i; - for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { - mod_memcg_page_state(vm->pages[i], - MEMCG_KERNEL_STACK_KB, - -(int)(PAGE_SIZE / 1024)); - + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) memcg_kmem_uncharge_page(vm->pages[i], 0); - } for (i = 0; i < NR_CACHED_STACKS; i++) { if (this_cpu_cmpxchg(cached_stacks[i], @@ -382,31 +377,14 @@ static void account_kernel_stack(struct task_struct *tsk, int account) void *stack = task_stack_page(tsk); struct vm_struct *vm = task_stack_vm_area(tsk); - BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); - - if (vm) { - int i; - - BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); - for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { - mod_zone_page_state(page_zone(vm->pages[i]), - NR_KERNEL_STACK_KB, - PAGE_SIZE / 1024 * account); - } - } else { - /* - * All stack pages are in the same zone and belong to the - * same memcg. - */ - struct page *first_page = virt_to_page(stack); - - mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, - THREAD_SIZE / 1024 * account); - - mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); - } + /* All stack pages are in the same node. */ + if (vm) + mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); + else + mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } static int memcg_charge_kernel_stack(struct task_struct *tsk) @@ -415,24 +393,23 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk) struct vm_struct *vm = task_stack_vm_area(tsk); int ret; + BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); + if (vm) { int i; + BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { /* * If memcg_kmem_charge_page() fails, page->mem_cgroup - * pointer is NULL, and both memcg_kmem_uncharge_page() - * and mod_memcg_page_state() in free_thread_stack() - * will ignore this page. So it's safe. + * pointer is NULL, and memcg_kmem_uncharge_page() in + * free_thread_stack() will ignore this page. */ ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); if (ret) return ret; - - mod_memcg_page_state(vm->pages[i], - MEMCG_KERNEL_STACK_KB, - PAGE_SIZE / 1024); } } #endif diff --git a/kernel/scs.c b/kernel/scs.c index 5d4d9bbdec36..4ff4a7ba0094 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -17,7 +17,7 @@ static void __scs_account(void *s, int account) { struct page *scs_page = virt_to_page(s); - mod_zone_page_state(page_zone(scs_page), NR_KERNEL_SCS_KB, + mod_node_page_state(page_pgdat(scs_page), NR_KERNEL_SCS_KB, account * (SCS_SIZE / SZ_1K)); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 473f9b91d51f..a3e963366769 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1485,7 +1485,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg) (u64)memcg_page_state(memcg, NR_FILE_PAGES) * PAGE_SIZE); seq_buf_printf(&s, "kernel_stack %llu\n", - (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * + (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) * 1024); seq_buf_printf(&s, "slab %llu\n", (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) + diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f9ad093814d2..8d5d8526c2f3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5396,6 +5396,10 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " anon_thp: %lukB" #endif " writeback_tmp:%lukB" + " kernel_stack:%lukB" +#ifdef CONFIG_SHADOW_CALL_STACK + " shadow_call_stack:%lukB" +#endif " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -5417,6 +5421,10 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), + node_page_state(pgdat, NR_KERNEL_STACK_KB), +#ifdef CONFIG_SHADOW_CALL_STACK + node_page_state(pgdat, NR_KERNEL_SCS_KB), +#endif pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? "yes" : "no"); } @@ -5448,10 +5456,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " present:%lukB" " managed:%lukB" " mlocked:%lukB" - " kernel_stack:%lukB" -#ifdef CONFIG_SHADOW_CALL_STACK - " shadow_call_stack:%lukB" -#endif " pagetables:%lukB" " bounce:%lukB" " free_pcp:%lukB" @@ -5473,10 +5477,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone->present_pages), K(zone_managed_pages(zone)), K(zone_page_state(zone, NR_MLOCK)), - zone_page_state(zone, NR_KERNEL_STACK_KB), -#ifdef CONFIG_SHADOW_CALL_STACK - zone_page_state(zone, NR_KERNEL_SCS_KB), -#endif K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), diff --git a/mm/vmstat.c b/mm/vmstat.c index b171a76bfe83..2b866cbab11d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1140,10 +1140,6 @@ const char * const vmstat_text[] = { "nr_zone_write_pending", "nr_mlock", "nr_page_table_pages", - "nr_kernel_stack", -#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) - "nr_shadow_call_stack", -#endif "nr_bounce", #if IS_ENABLED(CONFIG_ZSMALLOC) "nr_zspages", @@ -1194,6 +1190,10 @@ const char * const vmstat_text[] = { "nr_kernel_misc_reclaimable", "nr_foll_pin_acquired", "nr_foll_pin_released", + "nr_kernel_stack", +#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) + "nr_shadow_call_stack", +#endif /* enum writeback_stat_item counters */ "nr_dirty_threshold", -- cgit v1.2.3 From 56f3547bfa4d361148aa748ccb86073bc57f5e6c Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 6 Aug 2020 23:23:15 -0700 Subject: mm: adjust vm_committed_as_batch according to vm overcommit policy When checking a performance change for will-it-scale scalability mmap test [1], we found very high lock contention for spinlock of percpu counter 'vm_committed_as': 94.14% 0.35% [kernel.kallsyms] [k] _raw_spin_lock_irqsave 48.21% _raw_spin_lock_irqsave;percpu_counter_add_batch;__vm_enough_memory;mmap_region;do_mmap; 45.91% _raw_spin_lock_irqsave;percpu_counter_add_batch;__do_munmap; Actually this heavy lock contention is not always necessary. The 'vm_committed_as' needs to be very precise when the strict OVERCOMMIT_NEVER policy is set, which requires a rather small batch number for the percpu counter. So keep 'batch' number unchanged for strict OVERCOMMIT_NEVER policy, and lift it to 64X for OVERCOMMIT_ALWAYS and OVERCOMMIT_GUESS policies. Also add a sysctl handler to adjust it when the policy is reconfigured. Benchmark with the same testcase in [1] shows 53% improvement on a 8C/16T desktop, and 2097%(20X) on a 4S/72C/144T server. We tested with test platforms in 0day (server, desktop and laptop), and 80%+ platforms shows improvements with that test. And whether it shows improvements depends on if the test mmap size is bigger than the batch number computed. And if the lift is 16X, 1/3 of the platforms will show improvements, though it should help the mmap/unmap usage generally, as Michal Hocko mentioned: : I believe that there are non-synthetic worklaods which would benefit from : a larger batch. E.g. large in memory databases which do large mmaps : during startups from multiple threads. [1] https://lore.kernel.org/lkml/20200305062138.GI5972@shao2-debian/ Signed-off-by: Feng Tang Signed-off-by: Andrew Morton Acked-by: Michal Hocko Cc: Matthew Wilcox (Oracle) Cc: Johannes Weiner Cc: Mel Gorman Cc: Qian Cai Cc: Kees Cook Cc: Andi Kleen Cc: Tim Chen Cc: Dave Hansen Cc: Huang Ying Cc: Christoph Lameter Cc: Dennis Zhou Cc: Haiyang Zhang Cc: kernel test robot Cc: "K. Y. Srinivasan" Cc: Tejun Heo Link: http://lkml.kernel.org/r/1589611660-89854-4-git-send-email-feng.tang@intel.com Link: http://lkml.kernel.org/r/1592725000-73486-4-git-send-email-feng.tang@intel.com Link: http://lkml.kernel.org/r/1594389708-60781-5-git-send-email-feng.tang@intel.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ include/linux/mman.h | 4 ++++ kernel/sysctl.c | 2 +- mm/mm_init.c | 22 ++++++++++++++++------ mm/util.c | 41 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 64 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2830f1c0fdc3..1c3470550395 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -206,6 +206,8 @@ int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *, loff_t *); +int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) diff --git a/include/linux/mman.h b/include/linux/mman.h index 4b08e9c9c538..6f34c33075f9 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -57,8 +57,12 @@ extern struct percpu_counter vm_committed_as; #ifdef CONFIG_SMP extern s32 vm_committed_as_batch; +extern void mm_compute_batch(int overcommit_policy); #else #define vm_committed_as_batch 0 +static inline void mm_compute_batch(int overcommit_policy) +{ +} #endif unsigned long vm_memory_committed(void); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1b4d2dc270a5..f785de3caac0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2671,7 +2671,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_overcommit_memory, .maxlen = sizeof(sysctl_overcommit_memory), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = overcommit_policy_handler, .extra1 = SYSCTL_ZERO, .extra2 = &two, }, diff --git a/mm/mm_init.c b/mm/mm_init.c index 435e5f794b3b..b06a30fbedff 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "internal.h" #ifdef CONFIG_DEBUG_MEMORY_INIT @@ -144,14 +145,23 @@ EXPORT_SYMBOL_GPL(mm_kobj); #ifdef CONFIG_SMP s32 vm_committed_as_batch = 32; -static void __meminit mm_compute_batch(void) +void mm_compute_batch(int overcommit_policy) { u64 memsized_batch; s32 nr = num_present_cpus(); s32 batch = max_t(s32, nr*2, 32); - - /* batch size set to 0.4% of (total memory/#cpus), or max int32 */ - memsized_batch = min_t(u64, (totalram_pages()/nr)/256, 0x7fffffff); + unsigned long ram_pages = totalram_pages(); + + /* + * For policy OVERCOMMIT_NEVER, set batch size to 0.4% of + * (total memory/#cpus), and lift it to 25% for other policies + * to easy the possible lock contention for percpu_counter + * vm_committed_as, while the max limit is INT_MAX + */ + if (overcommit_policy == OVERCOMMIT_NEVER) + memsized_batch = min_t(u64, ram_pages/nr/256, INT_MAX); + else + memsized_batch = min_t(u64, ram_pages/nr/4, INT_MAX); vm_committed_as_batch = max_t(s32, memsized_batch, batch); } @@ -162,7 +172,7 @@ static int __meminit mm_compute_batch_notifier(struct notifier_block *self, switch (action) { case MEM_ONLINE: case MEM_OFFLINE: - mm_compute_batch(); + mm_compute_batch(sysctl_overcommit_memory); default: break; } @@ -176,7 +186,7 @@ static struct notifier_block compute_batch_nb __meminitdata = { static int __init mm_compute_batch_init(void) { - mm_compute_batch(); + mm_compute_batch(sysctl_overcommit_memory); register_hotmemory_notifier(&compute_batch_nb); return 0; diff --git a/mm/util.c b/mm/util.c index 1c9d097d7e48..8d6280c05238 100644 --- a/mm/util.c +++ b/mm/util.c @@ -746,6 +746,47 @@ int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer, return ret; } +static void sync_overcommit_as(struct work_struct *dummy) +{ + percpu_counter_sync(&vm_committed_as); +} + +int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int new_policy; + int ret; + + /* + * The deviation of sync_overcommit_as could be big with loose policy + * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to + * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply + * with the strict "NEVER", and to avoid possible race condtion (even + * though user usually won't too frequently do the switching to policy + * OVERCOMMIT_NEVER), the switch is done in the following order: + * 1. changing the batch + * 2. sync percpu count on each CPU + * 3. switch the policy + */ + if (write) { + t = *table; + t.data = &new_policy; + ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (ret) + return ret; + + mm_compute_batch(new_policy); + if (new_policy == OVERCOMMIT_NEVER) + schedule_on_each_cpu(sync_overcommit_as); + sysctl_overcommit_memory = new_policy; + } else { + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + } + + return ret; +} + int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { -- cgit v1.2.3 From 26e760c9a7c8ec31fa1a6bfbbce3f63f189ccef0 Mon Sep 17 00:00:00 2001 From: Walter Wu Date: Thu, 6 Aug 2020 23:24:35 -0700 Subject: rcu: kasan: record and print call_rcu() call stack Patch series "kasan: memorize and print call_rcu stack", v8. This patchset improves KASAN reports by making them to have call_rcu() call stack information. It is useful for programmers to solve use-after-free or double-free memory issue. The KASAN report was as follows(cleaned up slightly): BUG: KASAN: use-after-free in kasan_rcu_reclaim+0x58/0x60 Freed by task 0: kasan_save_stack+0x24/0x50 kasan_set_track+0x24/0x38 kasan_set_free_info+0x18/0x20 __kasan_slab_free+0x10c/0x170 kasan_slab_free+0x10/0x18 kfree+0x98/0x270 kasan_rcu_reclaim+0x1c/0x60 Last call_rcu(): kasan_save_stack+0x24/0x50 kasan_record_aux_stack+0xbc/0xd0 call_rcu+0x8c/0x580 kasan_rcu_uaf+0xf4/0xf8 Generic KASAN will record the last two call_rcu() call stacks and print up to 2 call_rcu() call stacks in KASAN report. it is only suitable for generic KASAN. This feature considers the size of struct kasan_alloc_meta and kasan_free_meta, we try to optimize the structure layout and size, lets it get better memory consumption. [1]https://bugzilla.kernel.org/show_bug.cgi?id=198437 [2]https://groups.google.com/forum/#!searchin/kasan-dev/better$20stack$20traces$20for$20rcu%7Csort:date/kasan-dev/KQsjT_88hDE/7rNUZprRBgAJ This patch (of 4): This feature will record the last two call_rcu() call stacks and prints up to 2 call_rcu() call stacks in KASAN report. When call_rcu() is called, we store the call_rcu() call stack into slub alloc meta-data, so that the KASAN report can print rcu stack. [1]https://bugzilla.kernel.org/show_bug.cgi?id=198437 [2]https://groups.google.com/forum/#!searchin/kasan-dev/better$20stack$20traces$20for$20rcu%7Csort:date/kasan-dev/KQsjT_88hDE/7rNUZprRBgAJ [walter-zh.wu@mediatek.com: build fix] Link: http://lkml.kernel.org/r/20200710162401.23816-1-walter-zh.wu@mediatek.com Suggested-by: Dmitry Vyukov Signed-off-by: Walter Wu Signed-off-by: Andrew Morton Tested-by: Dmitry Vyukov Reviewed-by: Dmitry Vyukov Reviewed-by: Andrey Konovalov Acked-by: Paul E. McKenney Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Josh Triplett Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Jonathan Corbet Cc: Matthias Brugger Link: http://lkml.kernel.org/r/20200710162123.23713-1-walter-zh.wu@mediatek.com Link: http://lkml.kernel.org/r/20200601050847.1096-1-walter-zh.wu@mediatek.com Link: http://lkml.kernel.org/r/20200601050927.1153-1-walter-zh.wu@mediatek.com Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 2 ++ kernel/rcu/tree.c | 2 ++ mm/kasan/common.c | 4 ++-- mm/kasan/generic.c | 21 +++++++++++++++++++++ mm/kasan/kasan.h | 9 +++++++++ mm/kasan/report.c | 28 +++++++++++++++++++++++----- 6 files changed, 59 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 82522e996c76..18452e35e7b2 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -174,11 +174,13 @@ static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); +void kasan_record_aux_stack(void *ptr); #else /* CONFIG_KASAN_GENERIC */ static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} +static inline void kasan_record_aux_stack(void *ptr) {} #endif /* CONFIG_KASAN_GENERIC */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ac7198ed3197..8ce77d9ac716 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -59,6 +59,7 @@ #include #include #include +#include #include "../time/tick-internal.h" #include "tree.h" @@ -2890,6 +2891,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) head->func = func; head->next = NULL; local_irq_save(flags); + kasan_record_aux_stack(head); rdp = this_cpu_ptr(&rcu_data); /* Add the callback to our list. */ diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 757d4074fe28..ad24666f50e4 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -40,7 +40,7 @@ #include "kasan.h" #include "../slab.h" -static inline depot_stack_handle_t save_stack(gfp_t flags) +depot_stack_handle_t kasan_save_stack(gfp_t flags) { unsigned long entries[KASAN_STACK_DEPTH]; unsigned int nr_entries; @@ -53,7 +53,7 @@ static inline depot_stack_handle_t save_stack(gfp_t flags) static inline void set_track(struct kasan_track *track, gfp_t flags) { track->pid = current->pid; - track->stack = save_stack(flags); + track->stack = kasan_save_stack(flags); } void kasan_enable_current(void) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 098a7dbaced6..d70586393b04 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -324,3 +324,24 @@ DEFINE_ASAN_SET_SHADOW(f2); DEFINE_ASAN_SET_SHADOW(f3); DEFINE_ASAN_SET_SHADOW(f5); DEFINE_ASAN_SET_SHADOW(f8); + +void kasan_record_aux_stack(void *addr) +{ + struct page *page = kasan_addr_to_page(addr); + struct kmem_cache *cache; + struct kasan_alloc_meta *alloc_info; + void *object; + + if (!(page && PageSlab(page))) + return; + + cache = page->slab_cache; + object = nearest_obj(cache, page, addr); + alloc_info = get_alloc_info(cache, object); + + /* + * record the last two call_rcu() call stacks. + */ + alloc_info->aux_stack[1] = alloc_info->aux_stack[0]; + alloc_info->aux_stack[0] = kasan_save_stack(GFP_NOWAIT); +} diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index cfade6413528..f89a195e336a 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -104,6 +104,13 @@ struct kasan_track { struct kasan_alloc_meta { struct kasan_track alloc_track; +#ifdef CONFIG_KASAN_GENERIC + /* + * call_rcu() call stack is stored into struct kasan_alloc_meta. + * The free stack is stored into struct kasan_free_meta. + */ + depot_stack_handle_t aux_stack[2]; +#endif struct kasan_track free_track[KASAN_NR_FREE_STACKS]; #ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY u8 free_pointer_tag[KASAN_NR_FREE_STACKS]; @@ -159,6 +166,8 @@ void kasan_report_invalid_free(void *object, unsigned long ip); struct page *kasan_addr_to_page(const void *addr); +depot_stack_handle_t kasan_save_stack(gfp_t flags); + #if defined(CONFIG_KASAN_GENERIC) && \ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 51ec45407a0b..445a9d56eb13 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -106,15 +106,20 @@ static void end_report(unsigned long *flags) kasan_enable_current(); } +static void print_stack(depot_stack_handle_t stack) +{ + unsigned long *entries; + unsigned int nr_entries; + + nr_entries = stack_depot_fetch(stack, &entries); + stack_trace_print(entries, nr_entries, 0); +} + static void print_track(struct kasan_track *track, const char *prefix) { pr_err("%s by task %u:\n", prefix, track->pid); if (track->stack) { - unsigned long *entries; - unsigned int nr_entries; - - nr_entries = stack_depot_fetch(track->stack, &entries); - stack_trace_print(entries, nr_entries, 0); + print_stack(track->stack); } else { pr_err("(stack is not available)\n"); } @@ -193,6 +198,19 @@ static void describe_object(struct kmem_cache *cache, void *object, free_track = kasan_get_free_track(cache, object, tag); print_track(free_track, "Freed"); pr_err("\n"); + +#ifdef CONFIG_KASAN_GENERIC + if (alloc_info->aux_stack[0]) { + pr_err("Last call_rcu():\n"); + print_stack(alloc_info->aux_stack[0]); + pr_err("\n"); + } + if (alloc_info->aux_stack[1]) { + pr_err("Second to last call_rcu():\n"); + print_stack(alloc_info->aux_stack[1]); + pr_err("\n"); + } +#endif } describe_object_addr(cache, object, addr); -- cgit v1.2.3 From 8dcc1d34661d58a7889fb06517c8738d1412d1bc Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 6 Aug 2020 23:24:57 -0700 Subject: kasan: don't tag stacks allocated with pagealloc Patch series "kasan: support stack instrumentation for tag-based mode", v2. This patch (of 5): Prepare Software Tag-Based KASAN for stack tagging support. With Tag-Based KASAN when kernel stacks are allocated via pagealloc (which happens when CONFIG_VMAP_STACK is not enabled), they get tagged. KASAN instrumentation doesn't expect the sp register to be tagged, and this leads to false-positive reports. Fix by resetting the tag of kernel stack pointers after allocation. Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Marco Elver Cc: Walter Wu Cc: Elena Petrova Cc: Vincenzo Frascino Cc: Catalin Marinas Cc: Ard Biesheuvel Link: http://lkml.kernel.org/r/cover.1596199677.git.andreyknvl@google.com Link: http://lkml.kernel.org/r/cover.1596544734.git.andreyknvl@google.com Link: http://lkml.kernel.org/r/12d8c678869268dd0884b01271ab592f30792abf.1596544734.git.andreyknvl@google.com Link: http://lkml.kernel.org/r/01c678b877755bcf29009176592402cdf6f2cb15.1596199677.git.andreyknvl@google.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=203497 Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index c7b4ce9d2647..35e9894d394c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -261,7 +261,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) THREAD_SIZE_ORDER); if (likely(page)) { - tsk->stack = page_address(page); + tsk->stack = kasan_reset_tag(page_address(page)); return tsk->stack; } return NULL; @@ -302,6 +302,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, { unsigned long *stack; stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); + stack = kasan_reset_tag(stack); tsk->stack = stack; return stack; } -- cgit v1.2.3 From 38ce2a9e33db61a3041840310077072d6210ead4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 6 Aug 2020 12:46:49 -0400 Subject: tracing: Add trace_array_init_printk() to initialize instance trace_printk() buffers As trace_array_printk() used with not global instances will not add noise to the main buffer, they are OK to have in the kernel (unlike trace_printk()). This require the subsystem to create their own tracing instance, and the trace_array_printk() only writes into those instances. Add trace_array_init_printk() to initialize the trace_printk() buffers without printing out the WARNING message. Reported-by: Sean Paul Reviewed-by: Sean Paul Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace.h | 1 + kernel/trace/trace.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'kernel') diff --git a/include/linux/trace.h b/include/linux/trace.h index 7fd86d3c691f..36d255d66f88 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -29,6 +29,7 @@ struct trace_array; void trace_printk_init_buffers(void); int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...); +int trace_array_init_printk(struct trace_array *tr); void trace_array_put(struct trace_array *tr); struct trace_array *trace_array_get_by_name(const char *name); int trace_array_destroy(struct trace_array *tr); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 06c0feae5ff9..c5f822736261 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3129,6 +3129,9 @@ static int alloc_percpu_trace_buffer(void) { struct trace_buffer_struct *buffers; + if (trace_percpu_buffer) + return 0; + buffers = alloc_percpu(struct trace_buffer_struct); if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer")) return -ENOMEM; @@ -3331,6 +3334,26 @@ int trace_array_vprintk(struct trace_array *tr, return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args); } +/** + * trace_array_printk - Print a message to a specific instance + * @tr: The instance trace_array descriptor + * @ip: The instruction pointer that this is called from. + * @fmt: The format to print (printf format) + * + * If a subsystem sets up its own instance, they have the right to + * printk strings into their tracing instance buffer using this + * function. Note, this function will not write into the top level + * buffer (use trace_printk() for that), as writing into the top level + * buffer should only have events that can be individually disabled. + * trace_printk() is only used for debugging a kernel, and should not + * be ever encorporated in normal use. + * + * trace_array_printk() can be used, as it will not add noise to the + * top level tracing buffer. + * + * Note, trace_array_init_printk() must be called on @tr before this + * can be used. + */ __printf(3, 0) int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...) @@ -3355,6 +3378,27 @@ int trace_array_printk(struct trace_array *tr, } EXPORT_SYMBOL_GPL(trace_array_printk); +/** + * trace_array_init_printk - Initialize buffers for trace_array_printk() + * @tr: The trace array to initialize the buffers for + * + * As trace_array_printk() only writes into instances, they are OK to + * have in the kernel (unlike trace_printk()). This needs to be called + * before trace_array_printk() can be used on a trace_array. + */ +int trace_array_init_printk(struct trace_array *tr) +{ + if (!tr) + return -ENOENT; + + /* This is only allowed for created instances */ + if (tr == &global_trace) + return -EINVAL; + + return alloc_percpu_trace_buffer(); +} +EXPORT_SYMBOL_GPL(trace_array_init_printk); + __printf(3, 4) int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...) -- cgit v1.2.3 From 15d5761ad31dfb194ebe76554e6af0437eb20424 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 7 Jul 2020 18:21:16 +0900 Subject: kbuild: introduce ccflags-remove-y and asflags-remove-y CFLAGS_REMOVE_.o filters out flags when compiling a particular object, but there is no convenient way to do that for every object in a directory. Add ccflags-remove-y and asflags-remove-y to make it easily. Use ccflags-remove-y to clean up some Makefiles. The add/remove order works as follows: [1] KBUILD_CFLAGS specifies compiler flags used globally [2] ccflags-y adds compiler flags for all objects in the current Makefile [3] ccflags-remove-y removes compiler flags for all objects in the current Makefile (New feature) [4] CFLAGS_ adds compiler flags per file. [5] CFLAGS_REMOVE_ removes compiler flags per file. Having [3] before [4] allows us to remove flags from most (but not all) objects in the current Makefile. For example, kernel/trace/Makefile removes $(CC_FLAGS_FTRACE) from all objects in the directory, then adds it back to trace_selftest_dynamic.o and CFLAGS_trace_kprobe_selftest.o The same applies to lib/livepatch/Makefile. Please note ccflags-remove-y has no effect to the sub-directories. In contrast, the previous notation got rid of compiler flags also from all the sub-directories. The following are not affected because they have no sub-directories: arch/arm/boot/compressed/ arch/powerpc/xmon/ arch/sh/ kernel/trace/ However, lib/ has several sub-directories. To keep the behavior, I added ccflags-remove-y to all Makefiles in subdirectories of lib/, except the following: lib/vdso/Makefile - Kbuild does not descend into this Makefile lib/raid/test/Makefile - This is not used for the kernel build I think commit 2464a609ded0 ("ftrace: do not trace library functions") excluded too much. In the next commit, I will remove ccflags-remove-y from the sub-directories of lib/. Suggested-by: Sami Tolvanen Signed-off-by: Masahiro Yamada Acked-by: Steven Rostedt (VMware) Acked-by: Michael Ellerman (powerpc) Acked-by: Brendan Higgins (KUnit) Tested-by: Anders Roxell --- Documentation/kbuild/makefiles.rst | 14 ++++++++++++++ arch/arm/boot/compressed/Makefile | 6 +----- arch/powerpc/xmon/Makefile | 3 +-- arch/sh/boot/compressed/Makefile | 5 +---- kernel/trace/Makefile | 4 ++-- lib/842/Makefile | 3 +++ lib/Makefile | 5 +---- lib/crypto/Makefile | 2 ++ lib/dim/Makefile | 2 ++ lib/fonts/Makefile | 2 ++ lib/kunit/Makefile | 3 +++ lib/livepatch/Makefile | 2 ++ lib/lz4/Makefile | 1 + lib/lzo/Makefile | 2 ++ lib/math/Makefile | 2 ++ lib/mpi/Makefile | 2 ++ lib/raid6/Makefile | 3 +++ lib/reed_solomon/Makefile | 2 ++ lib/xz/Makefile | 3 +++ lib/zlib_deflate/Makefile | 2 ++ lib/zlib_dfltcc/Makefile | 2 ++ lib/zlib_inflate/Makefile | 2 ++ lib/zstd/Makefile | 1 + scripts/Makefile.lib | 14 ++++++++------ 24 files changed, 64 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/Documentation/kbuild/makefiles.rst b/Documentation/kbuild/makefiles.rst index 6515ebc12b6f..14d8e7d23c04 100644 --- a/Documentation/kbuild/makefiles.rst +++ b/Documentation/kbuild/makefiles.rst @@ -368,6 +368,14 @@ more details, with real examples. subdir-ccflags-y := -Werror + ccflags-remove-y, asflags-remove-y + These flags are used to remove particular flags for the compiler, + assembler invocations. + + Example:: + + ccflags-remove-$(CONFIG_MCOUNT) += -pg + CFLAGS_$@, AFLAGS_$@ CFLAGS_$@ and AFLAGS_$@ only apply to commands in current kbuild makefile. @@ -375,6 +383,9 @@ more details, with real examples. $(CFLAGS_$@) specifies per-file options for $(CC). The $@ part has a literal value which specifies the file that it is for. + CFLAGS_$@ has the higher priority than ccflags-remove-y; CFLAGS_$@ + can re-add compiler flags that were removed by ccflags-remove-y. + Example:: # drivers/scsi/Makefile @@ -387,6 +398,9 @@ more details, with real examples. $(AFLAGS_$@) is a similar feature for source files in assembly languages. + AFLAGS_$@ has the higher priority than asflags-remove-y; AFLAGS_$@ + can re-add assembler flags that were removed by asflags-remove-y. + Example:: # arch/arm/kernel/Makefile diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index cb7a56c6723c..b1147b7f2c8d 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -102,13 +102,9 @@ clean-files += piggy_data lib1funcs.S ashldi3.S bswapsdi2.S hyp-stub.S KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING -ifeq ($(CONFIG_FUNCTION_TRACER),y) -ORIG_CFLAGS := $(KBUILD_CFLAGS) -KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS)) -endif - ccflags-y := -fpic $(call cc-option,-mno-single-pic-base,) -fno-builtin \ -I$(obj) $(DISABLE_ARM_SSP_PER_TASK_PLUGIN) +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += -pg asflags-y := -DZIMAGE # Supply kernel BSS size to the decompressor via a linker symbol. diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile index 89c76ca35640..eb25d7554ffd 100644 --- a/arch/powerpc/xmon/Makefile +++ b/arch/powerpc/xmon/Makefile @@ -7,8 +7,7 @@ UBSAN_SANITIZE := n KASAN_SANITIZE := n # Disable ftrace for the entire directory -ORIG_CFLAGS := $(KBUILD_CFLAGS) -KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS)) +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) ifdef CONFIG_CC_IS_CLANG # clang stores addresses on the stack causing the frame size to blow diff --git a/arch/sh/boot/compressed/Makefile b/arch/sh/boot/compressed/Makefile index ad0e2403e56f..589d2d8a573d 100644 --- a/arch/sh/boot/compressed/Makefile +++ b/arch/sh/boot/compressed/Makefile @@ -28,10 +28,7 @@ IMAGE_OFFSET := $(shell /bin/bash -c 'printf "0x%08x" \ $(CONFIG_BOOT_LINK_OFFSET)]') endif -ifeq ($(CONFIG_MCOUNT),y) -ORIG_CFLAGS := $(KBUILD_CFLAGS) -KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS)) -endif +ccflags-remove-$(CONFIG_MCOUNT) += -pg LDFLAGS_vmlinux := --oformat $(ld-bfd) -Ttext $(IMAGE_OFFSET) -e startup \ -T $(obj)/../../kernel/vmlinux.lds diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 6575bb0a0434..7492844a8b1b 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -2,9 +2,9 @@ # Do not instrument the tracer itself: +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) + ifdef CONFIG_FUNCTION_TRACER -ORIG_CFLAGS := $(KBUILD_CFLAGS) -KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS)) # Avoid recursion due to instrumentation. KCSAN_SANITIZE := n diff --git a/lib/842/Makefile b/lib/842/Makefile index 6f7aad269288..b815e824ae37 100644 --- a/lib/842/Makefile +++ b/lib/842/Makefile @@ -1,3 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only + +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) + obj-$(CONFIG_842_COMPRESS) += 842_compress.o obj-$(CONFIG_842_DECOMPRESS) += 842_decompress.o diff --git a/lib/Makefile b/lib/Makefile index 0cda70649f1c..233f6e644eeb 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -3,10 +3,7 @@ # Makefile for some libs needed in the kernel. # -ifdef CONFIG_FUNCTION_TRACER -ORIG_CFLAGS := $(KBUILD_CFLAGS) -KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS)) -endif +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) # These files are disabled because they produce lots of non-interesting and/or # flaky coverage that is not a function of syscall inputs. For example, diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 3a435629d9ce..b557ef0b07c2 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -1,5 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) + # chacha is used by the /dev/random driver which is always builtin obj-y += chacha.o obj-$(CONFIG_CRYPTO_LIB_CHACHA_GENERIC) += libchacha.o diff --git a/lib/dim/Makefile b/lib/dim/Makefile index 1d6858a108cb..97fc3e89d34e 100644 --- a/lib/dim/Makefile +++ b/lib/dim/Makefile @@ -2,6 +2,8 @@ # DIM Dynamic Interrupt Moderation library # +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) + obj-$(CONFIG_DIMLIB) += dim.o dim-y := dim.o net_dim.o rdma_dim.o diff --git a/lib/fonts/Makefile b/lib/fonts/Makefile index ed95070860de..f951750c179e 100644 --- a/lib/fonts/Makefile +++ b/lib/fonts/Makefile @@ -1,6 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 # Font handling +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) + font-objs := fonts.o font-objs-$(CONFIG_FONT_SUN8x16) += font_sun8x16.o diff --git a/lib/kunit/Makefile b/lib/kunit/Makefile index 724b94311ca3..8c847557ab24 100644 --- a/lib/kunit/Makefile +++ b/lib/kunit/Makefile @@ -1,3 +1,6 @@ + +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) + obj-$(CONFIG_KUNIT) += kunit.o kunit-objs += test.o \ diff --git a/lib/livepatch/Makefile b/lib/livepatch/Makefile index 295b94bff370..9abdf615b088 100644 --- a/lib/livepatch/Makefile +++ b/lib/livepatch/Makefile @@ -2,6 +2,8 @@ # # Makefile for livepatch test code. +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) + obj-$(CONFIG_TEST_LIVEPATCH) += test_klp_atomic_replace.o \ test_klp_callbacks_demo.o \ test_klp_callbacks_demo2.o \ diff --git a/lib/lz4/Makefile b/lib/lz4/Makefile index 5b42242afaa2..53da4cab7015 100644 --- a/lib/lz4/Makefile +++ b/lib/lz4/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only ccflags-y += -O3 +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) obj-$(CONFIG_LZ4_COMPRESS) += lz4_compress.o obj-$(CONFIG_LZ4HC_COMPRESS) += lz4hc_compress.o diff --git a/lib/lzo/Makefile b/lib/lzo/Makefile index 2f58fafbbddd..9565a555275b 100644 --- a/lib/lzo/Makefile +++ b/lib/lzo/Makefile @@ -1,4 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) + lzo_compress-objs := lzo1x_compress.o lzo_decompress-objs := lzo1x_decompress_safe.o diff --git a/lib/math/Makefile b/lib/math/Makefile index be6909e943bd..49aa50e28185 100644 --- a/lib/math/Makefile +++ b/lib/math/Makefile @@ -1,4 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) + obj-y += div64.o gcd.o lcm.o int_pow.o int_sqrt.o reciprocal_div.o obj-$(CONFIG_CORDIC) += cordic.o diff --git a/lib/mpi/Makefile b/lib/mpi/Makefile index d5874a7f5ff9..df7883521619 100644 --- a/lib/mpi/Makefile +++ b/lib/mpi/Makefile @@ -3,6 +3,8 @@ # MPI multiprecision maths library (from gpg) # +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) + obj-$(CONFIG_MPILIB) = mpi.o mpi-y = \ diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index b4c0df6d706d..3482d6ae3f3b 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -1,4 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 + +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) + obj-$(CONFIG_RAID6_PQ) += raid6_pq.o raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ diff --git a/lib/reed_solomon/Makefile b/lib/reed_solomon/Makefile index 5d4fa68f26cb..a5c9defdac7f 100644 --- a/lib/reed_solomon/Makefile +++ b/lib/reed_solomon/Makefile @@ -3,5 +3,7 @@ # This is a modified version of reed solomon lib, # +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) + obj-$(CONFIG_REED_SOLOMON) += reed_solomon.o obj-$(CONFIG_REED_SOLOMON_TEST) += test_rslib.o diff --git a/lib/xz/Makefile b/lib/xz/Makefile index fa6af814a8d1..fae9b6c7c389 100644 --- a/lib/xz/Makefile +++ b/lib/xz/Makefile @@ -1,4 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only + +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) + obj-$(CONFIG_XZ_DEC) += xz_dec.o xz_dec-y := xz_dec_syms.o xz_dec_stream.o xz_dec_lzma2.o xz_dec-$(CONFIG_XZ_DEC_BCJ) += xz_dec_bcj.o diff --git a/lib/zlib_deflate/Makefile b/lib/zlib_deflate/Makefile index 2622e03c0b94..1fcefe73536f 100644 --- a/lib/zlib_deflate/Makefile +++ b/lib/zlib_deflate/Makefile @@ -7,6 +7,8 @@ # decompression code. # +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) + obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate.o zlib_deflate-objs := deflate.o deftree.o deflate_syms.o diff --git a/lib/zlib_dfltcc/Makefile b/lib/zlib_dfltcc/Makefile index 8e4d5afbbb10..7a8067f6e772 100644 --- a/lib/zlib_dfltcc/Makefile +++ b/lib/zlib_dfltcc/Makefile @@ -6,6 +6,8 @@ # This is the code for s390 zlib hardware support. # +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) + obj-$(CONFIG_ZLIB_DFLTCC) += zlib_dfltcc.o zlib_dfltcc-objs := dfltcc.o dfltcc_deflate.o dfltcc_inflate.o dfltcc_syms.o diff --git a/lib/zlib_inflate/Makefile b/lib/zlib_inflate/Makefile index 27327d3e9f54..a451e96f9845 100644 --- a/lib/zlib_inflate/Makefile +++ b/lib/zlib_inflate/Makefile @@ -14,6 +14,8 @@ # uncompression can be done without blocking on allocation). # +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) + obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate.o zlib_inflate-objs := inffast.o inflate.o infutil.o \ diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile index f5d778e7e5c7..01be908a2d94 100644 --- a/lib/zstd/Makefile +++ b/lib/zstd/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_ZSTD_COMPRESS) += zstd_compress.o obj-$(CONFIG_ZSTD_DECOMPRESS) += zstd_decompress.o ccflags-y += -O3 +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) zstd_compress-y := fse_compress.o huf_compress.o compress.o \ entropy_common.o fse_decompress.o zstd_common.o diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 8fa9aa2c9fca..5cfd377778b4 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -111,12 +111,14 @@ basename_flags = -DKBUILD_BASENAME=$(call name-fix,$(basetarget)) modname_flags = -DKBUILD_MODNAME=$(call name-fix,$(modname)) modfile_flags = -DKBUILD_MODFILE=$(call stringify,$(modfile)) -orig_c_flags = $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) \ - $(ccflags-y) $(CFLAGS_$(target-stem).o) -_c_flags = $(filter-out $(CFLAGS_REMOVE_$(target-stem).o), $(orig_c_flags)) -orig_a_flags = $(KBUILD_CPPFLAGS) $(KBUILD_AFLAGS) \ - $(asflags-y) $(AFLAGS_$(target-stem).o) -_a_flags = $(filter-out $(AFLAGS_REMOVE_$(target-stem).o), $(orig_a_flags)) +_c_flags = $(filter-out $(CFLAGS_REMOVE_$(target-stem).o), \ + $(filter-out $(ccflags-remove-y), \ + $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(ccflags-y)) \ + $(CFLAGS_$(target-stem).o)) +_a_flags = $(filter-out $(AFLAGS_REMOVE_$(target-stem).o), \ + $(filter-out $(asflags-remove-y), \ + $(KBUILD_CPPFLAGS) $(KBUILD_AFLAGS) $(asflags-y)) \ + $(AFLAGS_$(target-stem).o)) _cpp_flags = $(KBUILD_CPPFLAGS) $(cppflags-y) $(CPPFLAGS_$(target-stem).lds) # -- cgit v1.2.3 From b0294f30256bb6023b2044fd607855123863d98f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 6 Aug 2020 20:32:48 -0700 Subject: time: Delete repeated words in comments Drop repeated words in kernel/time/. {when, one, into} Signed-off-by: Randy Dunlap Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/r/20200807033248.8452-1-rdunlap@infradead.org --- kernel/time/alarmtimer.c | 2 +- kernel/time/sched_clock.c | 2 +- kernel/time/timekeeping.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 2ffb466af77e..ca223a89530a 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -192,7 +192,7 @@ static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm) * When a alarm timer fires, this runs through the timerqueue to * see which alarms expired, and runs those. If there are more alarm * timers queued for the future, we set the hrtimer to fire when - * when the next future alarm timer expires. + * the next future alarm timer expires. */ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) { diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 0deaf4b79fb4..1c03eec6ca9b 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -229,7 +229,7 @@ void __init generic_sched_clock_init(void) { /* * If no sched_clock() function has been provided at that point, - * make it the final one one. + * make it the final one. */ if (cd.actual_read_sched_clock == jiffy_sched_clock_read) sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4c7212f3c603..35cd10f1143b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2001,7 +2001,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) * logarithmic_accumulation - shifted accumulation of cycles * * This functions accumulates a shifted interval of cycles into - * into a shifted interval nanoseconds. Allows for O(log) accumulation + * a shifted interval nanoseconds. Allows for O(log) accumulation * loop. * * Returns the unconsumed cycles. -- cgit v1.2.3 From e27b1636e9337d1a1d174b191e53d0f86421a822 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 11 Aug 2020 11:00:01 -0700 Subject: genirq/PM: Always unlock IRQ descriptor in rearm_wake_irq() rearm_wake_irq() does not unlock the irq descriptor if the interrupt is not suspended or if wakeup is not enabled on it. Restucture the exit conditions so the unlock is always ensured. Fixes: 3a79bc63d9075 ("PCI: irq: Introduce rearm_wake_irq()") Signed-off-by: Guenter Roeck Signed-off-by: Thomas Gleixner Acked-by: Rafael J. Wysocki Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200811180001.80203-1-linux@roeck-us.net --- kernel/irq/pm.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 8f557fa1f4fe..c6c7e187ae74 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -185,14 +185,18 @@ void rearm_wake_irq(unsigned int irq) unsigned long flags; struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); - if (!desc || !(desc->istate & IRQS_SUSPENDED) || - !irqd_is_wakeup_set(&desc->irq_data)) + if (!desc) return; + if (!(desc->istate & IRQS_SUSPENDED) || + !irqd_is_wakeup_set(&desc->irq_data)) + goto unlock; + desc->istate &= ~IRQS_SUSPENDED; irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); __enable_irq(desc); +unlock: irq_put_desc_busunlock(desc, flags); } -- cgit v1.2.3 From b518154e59aab3ad0780a169c5cc84bd4ee4357e Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 11 Aug 2020 18:30:40 -0700 Subject: mm/vmscan: protect the workingset on anonymous LRU In current implementation, newly created or swap-in anonymous page is started on active list. Growing active list results in rebalancing active/inactive list so old pages on active list are demoted to inactive list. Hence, the page on active list isn't protected at all. Following is an example of this situation. Assume that 50 hot pages on active list. Numbers denote the number of pages on active/inactive list (active | inactive). 1. 50 hot pages on active list 50(h) | 0 2. workload: 50 newly created (used-once) pages 50(uo) | 50(h) 3. workload: another 50 newly created (used-once) pages 50(uo) | 50(uo), swap-out 50(h) This patch tries to fix this issue. Like as file LRU, newly created or swap-in anonymous pages will be inserted to the inactive list. They are promoted to active list if enough reference happens. This simple modification changes the above example as following. 1. 50 hot pages on active list 50(h) | 0 2. workload: 50 newly created (used-once) pages 50(h) | 50(uo) 3. workload: another 50 newly created (used-once) pages 50(h) | 50(uo), swap-out 50(uo) As you can see, hot pages on active list would be protected. Note that, this implementation has a drawback that the page cannot be promoted and will be swapped-out if re-access interval is greater than the size of inactive list but less than the size of total(active+inactive). To solve this potential issue, following patch will apply workingset detection similar to the one that's already applied to file LRU. Signed-off-by: Joonsoo Kim Signed-off-by: Andrew Morton Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Link: http://lkml.kernel.org/r/1595490560-15117-3-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- kernel/events/uprobes.c | 2 +- mm/huge_memory.c | 2 +- mm/khugepaged.c | 2 +- mm/memory.c | 9 ++++----- mm/migrate.c | 2 +- mm/swap.c | 13 +++++++------ mm/swapfile.c | 2 +- mm/userfaultfd.c | 2 +- mm/vmscan.c | 4 +--- 10 files changed, 19 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/swap.h b/include/linux/swap.h index 7eb59bc552a5..51ec9cdb92c0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -352,7 +352,7 @@ extern void deactivate_page(struct page *page); extern void mark_page_lazyfree(struct page *page); extern void swap_setup(void); -extern void lru_cache_add_active_or_unevictable(struct page *page, +extern void lru_cache_add_inactive_or_unevictable(struct page *page, struct vm_area_struct *vma); /* linux/mm/vmscan.c */ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 25de10c904e6..49047d479c57 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -184,7 +184,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { get_page(new_page); page_add_new_anon_rmap(new_page, vma, addr, false); - lru_cache_add_active_or_unevictable(new_page, vma); + lru_cache_add_inactive_or_unevictable(new_page, vma); } else /* no new page, just dec_mm_counter for old_page */ dec_mm_counter(mm, MM_ANONPAGES); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 206f52b36ffb..863c495776d7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -640,7 +640,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); page_add_new_anon_rmap(page, vma, haddr, true); - lru_cache_add_active_or_unevictable(page, vma); + lru_cache_add_inactive_or_unevictable(page, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b52bd46ad146..15a9af791014 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1173,7 +1173,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address, true); - lru_cache_add_active_or_unevictable(new_page, vma); + lru_cache_add_inactive_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); diff --git a/mm/memory.c b/mm/memory.c index c39a13b09602..6fe8b5b22c57 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2715,7 +2715,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) */ ptep_clear_flush_notify(vma, vmf->address, vmf->pte); page_add_new_anon_rmap(new_page, vma, vmf->address, false); - lru_cache_add_active_or_unevictable(new_page, vma); + lru_cache_add_inactive_or_unevictable(new_page, vma); /* * We call the notify macro here because, when using secondary * mmu page tables (such as kvm shadow page tables), we want the @@ -3266,10 +3266,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* ksm created a completely new copy */ if (unlikely(page != swapcache && swapcache)) { page_add_new_anon_rmap(page, vma, vmf->address, false); - lru_cache_add_active_or_unevictable(page, vma); + lru_cache_add_inactive_or_unevictable(page, vma); } else { do_page_add_anon_rmap(page, vma, vmf->address, exclusive); - activate_page(page); } swap_free(entry); @@ -3414,7 +3413,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); - lru_cache_add_active_or_unevictable(page, vma); + lru_cache_add_inactive_or_unevictable(page, vma); setpte: set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); @@ -3672,7 +3671,7 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); - lru_cache_add_active_or_unevictable(page, vma); + lru_cache_add_inactive_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, false); diff --git a/mm/migrate.c b/mm/migrate.c index d179657f8685..819e55130134 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2830,7 +2830,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, inc_mm_counter(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, addr, false); if (!is_zone_device_page(page)) - lru_cache_add_active_or_unevictable(page, vma); + lru_cache_add_inactive_or_unevictable(page, vma); get_page(page); if (flush) { diff --git a/mm/swap.c b/mm/swap.c index de257c0a89b1..9285e60c7d6e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -476,23 +476,24 @@ void lru_cache_add(struct page *page) EXPORT_SYMBOL(lru_cache_add); /** - * lru_cache_add_active_or_unevictable + * lru_cache_add_inactive_or_unevictable * @page: the page to be added to LRU * @vma: vma in which page is mapped for determining reclaimability * - * Place @page on the active or unevictable LRU list, depending on its + * Place @page on the inactive or unevictable LRU list, depending on its * evictability. Note that if the page is not evictable, it goes * directly back onto it's zone's unevictable list, it does NOT use a * per cpu pagevec. */ -void lru_cache_add_active_or_unevictable(struct page *page, +void lru_cache_add_inactive_or_unevictable(struct page *page, struct vm_area_struct *vma) { + bool unevictable; + VM_BUG_ON_PAGE(PageLRU(page), page); - if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) - SetPageActive(page); - else if (!TestSetPageMlocked(page)) { + unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED; + if (unlikely(unevictable) && !TestSetPageMlocked(page)) { /* * We use the irq-unsafe __mod_zone_page_stat because this * counter is not modified from interrupt context, and the pte diff --git a/mm/swapfile.c b/mm/swapfile.c index 6c26916e95fd..82183432fdd0 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1915,7 +1915,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, page_add_anon_rmap(page, vma, addr, false); } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, addr, false); - lru_cache_add_active_or_unevictable(page, vma); + lru_cache_add_inactive_or_unevictable(page, vma); } swap_free(entry); /* diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index b80419320c7d..9a3d451402d7 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -123,7 +123,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, inc_mm_counter(dst_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, dst_vma, dst_addr, false); - lru_cache_add_active_or_unevictable(page, dst_vma); + lru_cache_add_inactive_or_unevictable(page, dst_vma); set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); diff --git a/mm/vmscan.c b/mm/vmscan.c index e34fc04b7045..783cd7fdc61a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -998,8 +998,6 @@ static enum page_references page_check_references(struct page *page, return PAGEREF_RECLAIM; if (referenced_ptes) { - if (PageSwapBacked(page)) - return PAGEREF_ACTIVATE; /* * All mapped pages start out with page table * references from the instantiating fault, so we need @@ -1022,7 +1020,7 @@ static enum page_references page_check_references(struct page *page, /* * Activate file-backed executable pages after first usage. */ - if (vm_flags & VM_EXEC) + if ((vm_flags & VM_EXEC) && !PageSwapBacked(page)) return PAGEREF_ACTIVATE; return PAGEREF_KEEP; -- cgit v1.2.3 From facdaa917c4d5a376d09d25865f5a863f906234a Mon Sep 17 00:00:00 2001 From: Nitin Gupta Date: Tue, 11 Aug 2020 18:31:00 -0700 Subject: mm: proactive compaction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For some applications, we need to allocate almost all memory as hugepages. However, on a running system, higher-order allocations can fail if the memory is fragmented. Linux kernel currently does on-demand compaction as we request more hugepages, but this style of compaction incurs very high latency. Experiments with one-time full memory compaction (followed by hugepage allocations) show that kernel is able to restore a highly fragmented memory state to a fairly compacted memory state within <1 sec for a 32G system. Such data suggests that a more proactive compaction can help us allocate a large fraction of memory as hugepages keeping allocation latencies low. For a more proactive compaction, the approach taken here is to define a new sysctl called 'vm.compaction_proactiveness' which dictates bounds for external fragmentation which kcompactd tries to maintain. The tunable takes a value in range [0, 100], with a default of 20. Note that a previous version of this patch [1] was found to introduce too many tunables (per-order extfrag{low, high}), but this one reduces them to just one sysctl. Also, the new tunable is an opaque value instead of asking for specific bounds of "external fragmentation", which would have been difficult to estimate. The internal interpretation of this opaque value allows for future fine-tuning. Currently, we use a simple translation from this tunable to [low, high] "fragmentation score" thresholds (low=100-proactiveness, high=low+10%). The score for a node is defined as weighted mean of per-zone external fragmentation. A zone's present_pages determines its weight. To periodically check per-node score, we reuse per-node kcompactd threads, which are woken up every 500 milliseconds to check the same. If a node's score exceeds its high threshold (as derived from user-provided proactiveness value), proactive compaction is started until its score reaches its low threshold value. By default, proactiveness is set to 20, which implies threshold values of low=80 and high=90. This patch is largely based on ideas from Michal Hocko [2]. See also the LWN article [3]. Performance data ================ System: x64_64, 1T RAM, 80 CPU threads. Kernel: 5.6.0-rc3 + this patch echo madvise | sudo tee /sys/kernel/mm/transparent_hugepage/enabled echo madvise | sudo tee /sys/kernel/mm/transparent_hugepage/defrag Before starting the driver, the system was fragmented from a userspace program that allocates all memory and then for each 2M aligned section, frees 3/4 of base pages using munmap. The workload is mainly anonymous userspace pages, which are easy to move around. I intentionally avoided unmovable pages in this test to see how much latency we incur when hugepage allocations hit direct compaction. 1. Kernel hugepage allocation latencies With the system in such a fragmented state, a kernel driver then allocates as many hugepages as possible and measures allocation latency: (all latency values are in microseconds) - With vanilla 5.6.0-rc3 percentile latency –––––––––– ––––––– 5 7894 10 9496 25 12561 30 15295 40 18244 50 21229 60 27556 75 30147 80 31047 90 32859 95 33799 Total 2M hugepages allocated = 383859 (749G worth of hugepages out of 762G total free => 98% of free memory could be allocated as hugepages) - With 5.6.0-rc3 + this patch, with proactiveness=20 sysctl -w vm.compaction_proactiveness=20 percentile latency –––––––––– ––––––– 5 2 10 2 25 3 30 3 40 3 50 4 60 4 75 4 80 4 90 5 95 429 Total 2M hugepages allocated = 384105 (750G worth of hugepages out of 762G total free => 98% of free memory could be allocated as hugepages) 2. JAVA heap allocation In this test, we first fragment memory using the same method as for (1). Then, we start a Java process with a heap size set to 700G and request the heap to be allocated with THP hugepages. We also set THP to madvise to allow hugepage backing of this heap. /usr/bin/time java -Xms700G -Xmx700G -XX:+UseTransparentHugePages -XX:+AlwaysPreTouch The above command allocates 700G of Java heap using hugepages. - With vanilla 5.6.0-rc3 17.39user 1666.48system 27:37.89elapsed - With 5.6.0-rc3 + this patch, with proactiveness=20 8.35user 194.58system 3:19.62elapsed Elapsed time remains around 3:15, as proactiveness is further increased. Note that proactive compaction happens throughout the runtime of these workloads. The situation of one-time compaction, sufficient to supply hugepages for following allocation stream, can probably happen for more extreme proactiveness values, like 80 or 90. In the above Java workload, proactiveness is set to 20. The test starts with a node's score of 80 or higher, depending on the delay between the fragmentation step and starting the benchmark, which gives more-or-less time for the initial round of compaction. As t he benchmark consumes hugepages, node's score quickly rises above the high threshold (90) and proactive compaction starts again, which brings down the score to the low threshold level (80). Repeat. bpftrace also confirms proactive compaction running 20+ times during the runtime of this Java benchmark. kcompactd threads consume 100% of one of the CPUs while it tries to bring a node's score within thresholds. Backoff behavior ================ Above workloads produce a memory state which is easy to compact. However, if memory is filled with unmovable pages, proactive compaction should essentially back off. To test this aspect: - Created a kernel driver that allocates almost all memory as hugepages followed by freeing first 3/4 of each hugepage. - Set proactiveness=40 - Note that proactive_compact_node() is deferred maximum number of times with HPAGE_FRAG_CHECK_INTERVAL_MSEC of wait between each check (=> ~30 seconds between retries). [1] https://patchwork.kernel.org/patch/11098289/ [2] https://lore.kernel.org/linux-mm/20161230131412.GI13301@dhcp22.suse.cz/ [3] https://lwn.net/Articles/817905/ Signed-off-by: Nitin Gupta Signed-off-by: Andrew Morton Tested-by: Oleksandr Natalenko Reviewed-by: Vlastimil Babka Reviewed-by: Khalid Aziz Reviewed-by: Oleksandr Natalenko Cc: Vlastimil Babka Cc: Khalid Aziz Cc: Michal Hocko Cc: Mel Gorman Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Joonsoo Kim Cc: David Rientjes Cc: Nitin Gupta Cc: Oleksandr Natalenko Link: http://lkml.kernel.org/r/20200616204527.19185-1-nigupta@nvidia.com Signed-off-by: Linus Torvalds --- Documentation/admin-guide/sysctl/vm.rst | 15 +++ include/linux/compaction.h | 2 + kernel/sysctl.c | 9 ++ mm/compaction.c | 183 +++++++++++++++++++++++++++++++- mm/internal.h | 1 + mm/vmstat.c | 18 ++++ 6 files changed, 223 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index d997cc3c26d0..4b9d2e8e9142 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -119,6 +119,21 @@ all zones are compacted such that free memory is available in contiguous blocks where possible. This can be important for example in the allocation of huge pages although processes will also directly compact memory as required. +compaction_proactiveness +======================== + +This tunable takes a value in the range [0, 100] with a default value of +20. This tunable determines how aggressively compaction is done in the +background. Setting it to 0 disables proactive compaction. + +Note that compaction has a non-trivial system-wide impact as pages +belonging to different processes are moved around, which could also lead +to latency spikes in unsuspecting applications. The kernel employs +various heuristics to avoid wasting CPU cycles if it detects that +proactive compaction is not being effective. + +Be careful when setting it to extreme values like 100, as that may +cause excessive background compaction activity. compact_unevictable_allowed =========================== diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 6fa0eea3f530..7a242d46454e 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -85,11 +85,13 @@ static inline unsigned long compact_gap(unsigned int order) #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; +extern int sysctl_compaction_proactiveness; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos); extern int sysctl_extfrag_threshold; extern int sysctl_compact_unevictable_allowed; +extern int extfrag_for_order(struct zone *zone, unsigned int order); extern int fragmentation_index(struct zone *zone, unsigned int order); extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f785de3caac0..ab72e52f8a7b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2851,6 +2851,15 @@ static struct ctl_table vm_table[] = { .mode = 0200, .proc_handler = sysctl_compaction_handler, }, + { + .procname = "compaction_proactiveness", + .data = &sysctl_compaction_proactiveness, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, { .procname = "extfrag_threshold", .data = &sysctl_extfrag_threshold, diff --git a/mm/compaction.c b/mm/compaction.c index 86375605faa9..544a98811c82 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -50,6 +50,24 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order) #define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order) +/* + * Fragmentation score check interval for proactive compaction purposes. + */ +static const int HPAGE_FRAG_CHECK_INTERVAL_MSEC = 500; + +/* + * Page order with-respect-to which proactive compaction + * calculates external fragmentation, which is used as + * the "fragmentation score" of a node/zone. + */ +#if defined CONFIG_TRANSPARENT_HUGEPAGE +#define COMPACTION_HPAGE_ORDER HPAGE_PMD_ORDER +#elif defined HUGETLB_PAGE_ORDER +#define COMPACTION_HPAGE_ORDER HUGETLB_PAGE_ORDER +#else +#define COMPACTION_HPAGE_ORDER (PMD_SHIFT - PAGE_SHIFT) +#endif + static unsigned long release_freepages(struct list_head *freelist) { struct page *page, *next; @@ -1857,6 +1875,76 @@ static inline bool is_via_compact_memory(int order) return order == -1; } +static bool kswapd_is_running(pg_data_t *pgdat) +{ + return pgdat->kswapd && (pgdat->kswapd->state == TASK_RUNNING); +} + +/* + * A zone's fragmentation score is the external fragmentation wrt to the + * COMPACTION_HPAGE_ORDER scaled by the zone's size. It returns a value + * in the range [0, 100]. + * + * The scaling factor ensures that proactive compaction focuses on larger + * zones like ZONE_NORMAL, rather than smaller, specialized zones like + * ZONE_DMA32. For smaller zones, the score value remains close to zero, + * and thus never exceeds the high threshold for proactive compaction. + */ +static int fragmentation_score_zone(struct zone *zone) +{ + unsigned long score; + + score = zone->present_pages * + extfrag_for_order(zone, COMPACTION_HPAGE_ORDER); + return div64_ul(score, zone->zone_pgdat->node_present_pages + 1); +} + +/* + * The per-node proactive (background) compaction process is started by its + * corresponding kcompactd thread when the node's fragmentation score + * exceeds the high threshold. The compaction process remains active till + * the node's score falls below the low threshold, or one of the back-off + * conditions is met. + */ +static int fragmentation_score_node(pg_data_t *pgdat) +{ + unsigned long score = 0; + int zoneid; + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + struct zone *zone; + + zone = &pgdat->node_zones[zoneid]; + score += fragmentation_score_zone(zone); + } + + return score; +} + +static int fragmentation_score_wmark(pg_data_t *pgdat, bool low) +{ + int wmark_low; + + /* + * Cap the low watermak to avoid excessive compaction + * activity in case a user sets the proactivess tunable + * close to 100 (maximum). + */ + wmark_low = max(100 - sysctl_compaction_proactiveness, 5); + return low ? wmark_low : min(wmark_low + 10, 100); +} + +static bool should_proactive_compact_node(pg_data_t *pgdat) +{ + int wmark_high; + + if (!sysctl_compaction_proactiveness || kswapd_is_running(pgdat)) + return false; + + wmark_high = fragmentation_score_wmark(pgdat, false); + return fragmentation_score_node(pgdat) > wmark_high; +} + static enum compact_result __compact_finished(struct compact_control *cc) { unsigned int order; @@ -1883,6 +1971,25 @@ static enum compact_result __compact_finished(struct compact_control *cc) return COMPACT_PARTIAL_SKIPPED; } + if (cc->proactive_compaction) { + int score, wmark_low; + pg_data_t *pgdat; + + pgdat = cc->zone->zone_pgdat; + if (kswapd_is_running(pgdat)) + return COMPACT_PARTIAL_SKIPPED; + + score = fragmentation_score_zone(cc->zone); + wmark_low = fragmentation_score_wmark(pgdat, true); + + if (score > wmark_low) + ret = COMPACT_CONTINUE; + else + ret = COMPACT_SUCCESS; + + goto out; + } + if (is_via_compact_memory(cc->order)) return COMPACT_CONTINUE; @@ -1941,6 +2048,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) } } +out: if (cc->contended || fatal_signal_pending(current)) ret = COMPACT_CONTENDED; @@ -2421,6 +2529,41 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, return rc; } +/* + * Compact all zones within a node till each zone's fragmentation score + * reaches within proactive compaction thresholds (as determined by the + * proactiveness tunable). + * + * It is possible that the function returns before reaching score targets + * due to various back-off conditions, such as, contention on per-node or + * per-zone locks. + */ +static void proactive_compact_node(pg_data_t *pgdat) +{ + int zoneid; + struct zone *zone; + struct compact_control cc = { + .order = -1, + .mode = MIGRATE_SYNC_LIGHT, + .ignore_skip_hint = true, + .whole_zone = true, + .gfp_mask = GFP_KERNEL, + .proactive_compaction = true, + }; + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + cc.zone = zone; + + compact_zone(&cc, NULL); + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + } +} /* Compact all zones within a node */ static void compact_node(int nid) @@ -2467,6 +2610,13 @@ static void compact_nodes(void) /* The written value is actually unused, all memory is compacted */ int sysctl_compact_memory; +/* + * Tunable for proactive compaction. It determines how + * aggressively the kernel should compact memory in the + * background. It takes values in the range [0, 100]. + */ +int __read_mostly sysctl_compaction_proactiveness = 20; + /* * This is the entry point for compacting all nodes via * /proc/sys/vm/compact_memory @@ -2646,6 +2796,7 @@ static int kcompactd(void *p) { pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; + unsigned int proactive_defer = 0; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); @@ -2661,12 +2812,34 @@ static int kcompactd(void *p) unsigned long pflags; trace_mm_compaction_kcompactd_sleep(pgdat->node_id); - wait_event_freezable(pgdat->kcompactd_wait, - kcompactd_work_requested(pgdat)); + if (wait_event_freezable_timeout(pgdat->kcompactd_wait, + kcompactd_work_requested(pgdat), + msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC))) { + + psi_memstall_enter(&pflags); + kcompactd_do_work(pgdat); + psi_memstall_leave(&pflags); + continue; + } - psi_memstall_enter(&pflags); - kcompactd_do_work(pgdat); - psi_memstall_leave(&pflags); + /* kcompactd wait timeout */ + if (should_proactive_compact_node(pgdat)) { + unsigned int prev_score, score; + + if (proactive_defer) { + proactive_defer--; + continue; + } + prev_score = fragmentation_score_node(pgdat); + proactive_compact_node(pgdat); + score = fragmentation_score_node(pgdat); + /* + * Defer proactive compaction if the fragmentation + * score did not go down i.e. no progress made. + */ + proactive_defer = score < prev_score ? + 0 : 1 << COMPACT_MAX_DEFER_SHIFT; + } } return 0; diff --git a/mm/internal.h b/mm/internal.h index 9886db20d94f..42cf0b610847 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -239,6 +239,7 @@ struct compact_control { bool no_set_skip_hint; /* Don't mark blocks for skipping */ bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... */ + bool proactive_compaction; /* kcompactd proactive compaction */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock or sched contention */ bool rescan; /* Rescanning the same pageblock */ diff --git a/mm/vmstat.c b/mm/vmstat.c index fef03463a0cf..f183aa37994e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1096,6 +1096,24 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total); } +/* + * Calculates external fragmentation within a zone wrt the given order. + * It is defined as the percentage of pages found in blocks of size + * less than 1 << order. It returns values in range [0, 100]. + */ +int extfrag_for_order(struct zone *zone, unsigned int order) +{ + struct contig_page_info info; + + fill_contig_page_info(zone, order, &info); + if (info.free_pages == 0) + return 0; + + return div_u64((info.free_pages - + (info.free_blocks_suitable << order)) * 100, + info.free_pages); +} + /* Same as __fragmentation index but allocs contig_page_info on stack */ int fragmentation_index(struct zone *zone, unsigned int order) { -- cgit v1.2.3 From d34c0a7599ea8c301bc471dfa1eb2bf2db6752d1 Mon Sep 17 00:00:00 2001 From: Nitin Gupta Date: Tue, 11 Aug 2020 18:31:07 -0700 Subject: mm: use unsigned types for fragmentation score Proactive compaction uses per-node/zone "fragmentation score" which is always in range [0, 100], so use unsigned type of these scores as well as for related constants. Signed-off-by: Nitin Gupta Signed-off-by: Andrew Morton Reviewed-by: Baoquan He Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Vlastimil Babka Cc: Joonsoo Kim Link: http://lkml.kernel.org/r/20200618010319.13159-1-nigupta@nvidia.com Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 4 ++-- kernel/sysctl.c | 2 +- mm/compaction.c | 18 +++++++++--------- mm/vmstat.c | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 7a242d46454e..25a521d299c1 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -85,13 +85,13 @@ static inline unsigned long compact_gap(unsigned int order) #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; -extern int sysctl_compaction_proactiveness; +extern unsigned int sysctl_compaction_proactiveness; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos); extern int sysctl_extfrag_threshold; extern int sysctl_compact_unevictable_allowed; -extern int extfrag_for_order(struct zone *zone, unsigned int order); +extern unsigned int extfrag_for_order(struct zone *zone, unsigned int order); extern int fragmentation_index(struct zone *zone, unsigned int order); extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ab72e52f8a7b..287862f91717 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2854,7 +2854,7 @@ static struct ctl_table vm_table[] = { { .procname = "compaction_proactiveness", .data = &sysctl_compaction_proactiveness, - .maxlen = sizeof(int), + .maxlen = sizeof(sysctl_compaction_proactiveness), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, diff --git a/mm/compaction.c b/mm/compaction.c index ed8ea1511634..b7d433f1706a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -53,7 +53,7 @@ static inline void count_compact_events(enum vm_event_item item, long delta) /* * Fragmentation score check interval for proactive compaction purposes. */ -static const int HPAGE_FRAG_CHECK_INTERVAL_MSEC = 500; +static const unsigned int HPAGE_FRAG_CHECK_INTERVAL_MSEC = 500; /* * Page order with-respect-to which proactive compaction @@ -1890,7 +1890,7 @@ static bool kswapd_is_running(pg_data_t *pgdat) * ZONE_DMA32. For smaller zones, the score value remains close to zero, * and thus never exceeds the high threshold for proactive compaction. */ -static int fragmentation_score_zone(struct zone *zone) +static unsigned int fragmentation_score_zone(struct zone *zone) { unsigned long score; @@ -1906,9 +1906,9 @@ static int fragmentation_score_zone(struct zone *zone) * the node's score falls below the low threshold, or one of the back-off * conditions is met. */ -static int fragmentation_score_node(pg_data_t *pgdat) +static unsigned int fragmentation_score_node(pg_data_t *pgdat) { - unsigned long score = 0; + unsigned int score = 0; int zoneid; for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { @@ -1921,17 +1921,17 @@ static int fragmentation_score_node(pg_data_t *pgdat) return score; } -static int fragmentation_score_wmark(pg_data_t *pgdat, bool low) +static unsigned int fragmentation_score_wmark(pg_data_t *pgdat, bool low) { - int wmark_low; + unsigned int wmark_low; /* * Cap the low watermak to avoid excessive compaction * activity in case a user sets the proactivess tunable * close to 100 (maximum). */ - wmark_low = max(100 - sysctl_compaction_proactiveness, 5); - return low ? wmark_low : min(wmark_low + 10, 100); + wmark_low = max(100U - sysctl_compaction_proactiveness, 5U); + return low ? wmark_low : min(wmark_low + 10, 100U); } static bool should_proactive_compact_node(pg_data_t *pgdat) @@ -2615,7 +2615,7 @@ int sysctl_compact_memory; * aggressively the kernel should compact memory in the * background. It takes values in the range [0, 100]. */ -int __read_mostly sysctl_compaction_proactiveness = 20; +unsigned int __read_mostly sysctl_compaction_proactiveness = 20; /* * This is the entry point for compacting all nodes via diff --git a/mm/vmstat.c b/mm/vmstat.c index f183aa37994e..3bb034c99887 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1101,7 +1101,7 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in * It is defined as the percentage of pages found in blocks of size * less than 1 << order. It returns values in range [0, 100]. */ -int extfrag_for_order(struct zone *zone, unsigned int order) +unsigned int extfrag_for_order(struct zone *zone, unsigned int order) { struct contig_page_info info; -- cgit v1.2.3 From 3d13f313ce4c34c524ccc37986fe77172f601ff3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 11 Aug 2020 18:33:47 -0700 Subject: uaccess: add force_uaccess_{begin,end} helpers Add helpers to wrap the get_fs/set_fs magic for undoing any damange done by set_fs(KERNEL_DS). There is no real functional benefit, but this documents the intent of these calls better, and will allow stubbing the functions out easily for kernels builds that do not allow address space overrides in the future. [hch@lst.de: drop two incorrect hunks, fix a commit log typo] Link: http://lkml.kernel.org/r/20200714105505.935079-6-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Acked-by: Mark Rutland Acked-by: Greentime Hu Acked-by: Geert Uytterhoeven Cc: Nick Hu Cc: Vincent Chen Cc: Paul Walmsley Cc: Palmer Dabbelt Link: http://lkml.kernel.org/r/20200710135706.537715-6-hch@lst.de Signed-off-by: Linus Torvalds --- arch/arm64/kernel/sdei.c | 2 +- arch/m68k/include/asm/tlbflush.h | 6 +++--- arch/mips/kernel/unaligned.c | 27 +++++++++++++-------------- arch/nds32/mm/alignment.c | 7 +++---- arch/sh/kernel/traps_32.c | 12 +++++------- drivers/firmware/arm_sdei.c | 5 ++--- include/linux/uaccess.h | 18 ++++++++++++++++++ kernel/events/callchain.c | 5 ++--- kernel/events/core.c | 5 ++--- kernel/kthread.c | 5 ++--- kernel/stacktrace.c | 5 ++--- mm/maccess.c | 22 ++++++++++------------ 12 files changed, 63 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index dab88260b137..7689f2031c0c 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -180,7 +180,7 @@ static __kprobes unsigned long _sdei_handler(struct pt_regs *regs, /* * We didn't take an exception to get here, set PAN. UAO will be cleared - * by sdei_event_handler()s set_fs(USER_DS) call. + * by sdei_event_handler()s force_uaccess_begin() call. */ __uaccess_enable_hw_pan(); diff --git a/arch/m68k/include/asm/tlbflush.h b/arch/m68k/include/asm/tlbflush.h index 191e75a6bb24..5337bc2c262f 100644 --- a/arch/m68k/include/asm/tlbflush.h +++ b/arch/m68k/include/asm/tlbflush.h @@ -85,10 +85,10 @@ static inline void flush_tlb_mm(struct mm_struct *mm) static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) { if (vma->vm_mm == current->active_mm) { - mm_segment_t old_fs = get_fs(); - set_fs(USER_DS); + mm_segment_t old_fs = force_uaccess_begin(); + __flush_tlb_one(addr); - set_fs(old_fs); + force_uaccess_end(old_fs); } } diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c index 0adce604fa44..126a5f3f4e4c 100644 --- a/arch/mips/kernel/unaligned.c +++ b/arch/mips/kernel/unaligned.c @@ -191,17 +191,16 @@ static void emulate_load_store_insn(struct pt_regs *regs, * memory, so we need to "switch" the address limit to * user space, so that address check can work properly. */ - seg = get_fs(); - set_fs(USER_DS); + seg = force_uaccess_begin(); switch (insn.spec3_format.func) { case lhe_op: if (!access_ok(addr, 2)) { - set_fs(seg); + force_uaccess_end(seg); goto sigbus; } LoadHWE(addr, value, res); if (res) { - set_fs(seg); + force_uaccess_end(seg); goto fault; } compute_return_epc(regs); @@ -209,12 +208,12 @@ static void emulate_load_store_insn(struct pt_regs *regs, break; case lwe_op: if (!access_ok(addr, 4)) { - set_fs(seg); + force_uaccess_end(seg); goto sigbus; } LoadWE(addr, value, res); if (res) { - set_fs(seg); + force_uaccess_end(seg); goto fault; } compute_return_epc(regs); @@ -222,12 +221,12 @@ static void emulate_load_store_insn(struct pt_regs *regs, break; case lhue_op: if (!access_ok(addr, 2)) { - set_fs(seg); + force_uaccess_end(seg); goto sigbus; } LoadHWUE(addr, value, res); if (res) { - set_fs(seg); + force_uaccess_end(seg); goto fault; } compute_return_epc(regs); @@ -235,35 +234,35 @@ static void emulate_load_store_insn(struct pt_regs *regs, break; case she_op: if (!access_ok(addr, 2)) { - set_fs(seg); + force_uaccess_end(seg); goto sigbus; } compute_return_epc(regs); value = regs->regs[insn.spec3_format.rt]; StoreHWE(addr, value, res); if (res) { - set_fs(seg); + force_uaccess_end(seg); goto fault; } break; case swe_op: if (!access_ok(addr, 4)) { - set_fs(seg); + force_uaccess_end(seg); goto sigbus; } compute_return_epc(regs); value = regs->regs[insn.spec3_format.rt]; StoreWE(addr, value, res); if (res) { - set_fs(seg); + force_uaccess_end(seg); goto fault; } break; default: - set_fs(seg); + force_uaccess_end(seg); goto sigill; } - set_fs(seg); + force_uaccess_end(seg); } #endif break; diff --git a/arch/nds32/mm/alignment.c b/arch/nds32/mm/alignment.c index c8b9061a2ee3..1eb7ded6992b 100644 --- a/arch/nds32/mm/alignment.c +++ b/arch/nds32/mm/alignment.c @@ -512,7 +512,7 @@ int do_unaligned_access(unsigned long addr, struct pt_regs *regs) { unsigned long inst; int ret = -EFAULT; - mm_segment_t seg = get_fs(); + mm_segment_t seg; inst = get_inst(regs->ipc); @@ -520,13 +520,12 @@ int do_unaligned_access(unsigned long addr, struct pt_regs *regs) "Faulting addr: 0x%08lx, pc: 0x%08lx [inst: 0x%08lx ]\n", addr, regs->ipc, inst); - set_fs(USER_DS); - + seg = force_uaccess_begin(); if (inst & NDS32_16BIT_INSTRUCTION) ret = do_16((inst >> 16) & 0xffff, regs); else ret = do_32(inst, regs); - set_fs(seg); + force_uaccess_end(seg); return ret; } diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c index 058c6181bb30..b62ad0ba2395 100644 --- a/arch/sh/kernel/traps_32.c +++ b/arch/sh/kernel/traps_32.c @@ -482,8 +482,6 @@ asmlinkage void do_address_error(struct pt_regs *regs, error_code = lookup_exception_vector(); #endif - oldfs = get_fs(); - if (user_mode(regs)) { int si_code = BUS_ADRERR; unsigned int user_action; @@ -491,13 +489,13 @@ asmlinkage void do_address_error(struct pt_regs *regs, local_irq_enable(); inc_unaligned_user_access(); - set_fs(USER_DS); + oldfs = force_uaccess_begin(); if (copy_from_user(&instruction, (insn_size_t *)(regs->pc & ~1), sizeof(instruction))) { - set_fs(oldfs); + force_uaccess_end(oldfs); goto uspace_segv; } - set_fs(oldfs); + force_uaccess_end(oldfs); /* shout about userspace fixups */ unaligned_fixups_notify(current, instruction, regs); @@ -520,11 +518,11 @@ fixup: goto uspace_segv; } - set_fs(USER_DS); + oldfs = force_uaccess_begin(); tmp = handle_unaligned_access(instruction, regs, &user_mem_access, 0, address); - set_fs(oldfs); + force_uaccess_end(oldfs); if (tmp == 0) return; /* sorted */ diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c index e7e36aab2386..b4b9ce97f415 100644 --- a/drivers/firmware/arm_sdei.c +++ b/drivers/firmware/arm_sdei.c @@ -1136,15 +1136,14 @@ int sdei_event_handler(struct pt_regs *regs, * access kernel memory. * Do the same here because this doesn't come via the same entry code. */ - orig_addr_limit = get_fs(); - set_fs(USER_DS); + orig_addr_limit = force_uaccess_begin(); err = arg->callback(event_num, regs, arg->callback_arg); if (err) pr_err_ratelimited("event %u on CPU %u failed with error: %d\n", event_num, smp_processor_id(), err); - set_fs(orig_addr_limit); + force_uaccess_end(orig_addr_limit); return err; } diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 5c62d0c6f15b..94b285411659 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -8,6 +8,24 @@ #include +/* + * Force the uaccess routines to be wired up for actual userspace access, + * overriding any possible set_fs(KERNEL_DS) still lingering around. Undone + * using force_uaccess_end below. + */ +static inline mm_segment_t force_uaccess_begin(void) +{ + mm_segment_t fs = get_fs(); + + set_fs(USER_DS); + return fs; +} + +static inline void force_uaccess_end(mm_segment_t oldfs) +{ + set_fs(oldfs); +} + /* * Architectures should provide two primitives (raw_copy_{to,from}_user()) * and get rid of their private instances of copy_{to,from}_user() and diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c6ce894e4ce9..58cbe357fb2b 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -217,10 +217,9 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); - fs = get_fs(); - set_fs(USER_DS); + fs = force_uaccess_begin(); perf_callchain_user(&ctx, regs); - set_fs(fs); + force_uaccess_end(fs); } } diff --git a/kernel/events/core.c b/kernel/events/core.c index d1f0a7e5b182..6961333ebad5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6453,10 +6453,9 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, /* Data. */ sp = perf_user_stack_pointer(regs); - fs = get_fs(); - set_fs(USER_DS); + fs = force_uaccess_begin(); rem = __output_copy_user(handle, (void *) sp, dump_size); - set_fs(fs); + force_uaccess_end(fs); dyn_size = dump_size - rem; perf_output_skip(handle, rem); diff --git a/kernel/kthread.c b/kernel/kthread.c index b2807e7be772..3edaa380dc7b 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1258,8 +1258,7 @@ void kthread_use_mm(struct mm_struct *mm) if (active_mm != mm) mmdrop(active_mm); - to_kthread(tsk)->oldfs = get_fs(); - set_fs(USER_DS); + to_kthread(tsk)->oldfs = force_uaccess_begin(); } EXPORT_SYMBOL_GPL(kthread_use_mm); @@ -1274,7 +1273,7 @@ void kthread_unuse_mm(struct mm_struct *mm) WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); - set_fs(to_kthread(tsk)->oldfs); + force_uaccess_end(to_kthread(tsk)->oldfs); task_lock(tsk); sync_mm_rss(mm); diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 2af66e449aa6..946f44a9e86a 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -233,10 +233,9 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) if (current->flags & PF_KTHREAD) return 0; - fs = get_fs(); - set_fs(USER_DS); + fs = force_uaccess_begin(); arch_stack_walk_user(consume_entry, &c, task_pt_regs(current)); - set_fs(fs); + force_uaccess_end(fs); return c.len; } diff --git a/mm/maccess.c b/mm/maccess.c index f98ff91e32c6..3bd70405f2d8 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -205,15 +205,14 @@ long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count) long copy_from_user_nofault(void *dst, const void __user *src, size_t size) { long ret = -EFAULT; - mm_segment_t old_fs = get_fs(); + mm_segment_t old_fs = force_uaccess_begin(); - set_fs(USER_DS); if (access_ok(src, size)) { pagefault_disable(); ret = __copy_from_user_inatomic(dst, src, size); pagefault_enable(); } - set_fs(old_fs); + force_uaccess_end(old_fs); if (ret) return -EFAULT; @@ -233,15 +232,14 @@ EXPORT_SYMBOL_GPL(copy_from_user_nofault); long copy_to_user_nofault(void __user *dst, const void *src, size_t size) { long ret = -EFAULT; - mm_segment_t old_fs = get_fs(); + mm_segment_t old_fs = force_uaccess_begin(); - set_fs(USER_DS); if (access_ok(dst, size)) { pagefault_disable(); ret = __copy_to_user_inatomic(dst, src, size); pagefault_enable(); } - set_fs(old_fs); + force_uaccess_end(old_fs); if (ret) return -EFAULT; @@ -270,17 +268,17 @@ EXPORT_SYMBOL_GPL(copy_to_user_nofault); long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count) { - mm_segment_t old_fs = get_fs(); + mm_segment_t old_fs; long ret; if (unlikely(count <= 0)) return 0; - set_fs(USER_DS); + old_fs = force_uaccess_begin(); pagefault_disable(); ret = strncpy_from_user(dst, unsafe_addr, count); pagefault_enable(); - set_fs(old_fs); + force_uaccess_end(old_fs); if (ret >= count) { ret = count; @@ -310,14 +308,14 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, */ long strnlen_user_nofault(const void __user *unsafe_addr, long count) { - mm_segment_t old_fs = get_fs(); + mm_segment_t old_fs; int ret; - set_fs(USER_DS); + old_fs = force_uaccess_begin(); pagefault_disable(); ret = strnlen_user(unsafe_addr, count); pagefault_enable(); - set_fs(old_fs); + force_uaccess_end(old_fs); return ret; } -- cgit v1.2.3 From fe81417596fa8b6577fedb7e206ff3e4c7015c13 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 11 Aug 2020 18:33:50 -0700 Subject: exec: use force_uaccess_begin during exec and exit Both exec and exit want to ensure that the uaccess routines actually do access user pointers. Use the newly added force_uaccess_begin helper instead of an open coded set_fs for that to prepare for kernel builds where set_fs() does not exist. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Cc: Nick Hu Cc: Greentime Hu Cc: Vincent Chen Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Geert Uytterhoeven Link: http://lkml.kernel.org/r/20200710135706.537715-7-hch@lst.de Signed-off-by: Linus Torvalds --- fs/exec.c | 7 ++++++- kernel/exit.c | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 3698252719a3..29ef78ae9f50 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1402,7 +1402,12 @@ int begin_new_exec(struct linux_binprm * bprm) if (retval) goto out_unlock; - set_fs(USER_DS); + /* + * Ensure that the uaccess routines can actually operate on userspace + * pointers: + */ + force_uaccess_begin(); + me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); diff --git a/kernel/exit.c b/kernel/exit.c index e731c414e024..c2d2961576f2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -732,7 +732,7 @@ void __noreturn do_exit(long code) * mm_release()->clear_child_tid() from writing to a user-controlled * kernel address. */ - set_fs(USER_DS); + force_uaccess_begin(); if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", -- cgit v1.2.3 From 8043fc147a97ec2eefc582487f344f2cbe86d12e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 11 Aug 2020 18:34:10 -0700 Subject: kernel: add a kernel_wait helper Add a helper that waits for a pid and stores the status in the passed in kernel pointer. Use it to fix the usage of kernel_wait4 in call_usermodehelper_exec_sync that only happens to work due to the implicit set_fs(KERNEL_DS) for kernel threads. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Acked-by: "Eric W. Biederman" Cc: Luis Chamberlain Link: http://lkml.kernel.org/r/20200721130449.5008-1-hch@lst.de Signed-off-by: Linus Torvalds --- include/linux/sched/task.h | 1 + kernel/exit.c | 16 ++++++++++++++++ kernel/umh.c | 29 ++++------------------------- 3 files changed, 21 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ae3060f0b0c9..a98965007eef 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -88,6 +88,7 @@ struct task_struct *fork_idle(int); struct mm_struct *copy_init_mm(void); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); +int kernel_wait(pid_t pid, int *stat); extern void free_task(struct task_struct *tsk); diff --git a/kernel/exit.c b/kernel/exit.c index c2d2961576f2..733e80f334e7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1626,6 +1626,22 @@ long kernel_wait4(pid_t upid, int __user *stat_addr, int options, return ret; } +int kernel_wait(pid_t pid, int *stat) +{ + struct wait_opts wo = { + .wo_type = PIDTYPE_PID, + .wo_pid = find_get_pid(pid), + .wo_flags = WEXITED, + }; + int ret; + + ret = do_wait(&wo); + if (ret > 0 && wo.wo_stat) + *stat = wo.wo_stat; + put_pid(wo.wo_pid); + return ret; +} + SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, int, options, struct rusage __user *, ru) { diff --git a/kernel/umh.c b/kernel/umh.c index a25433f9cd9a..fcf3ee803630 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -119,37 +119,16 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) { pid_t pid; - /* If SIGCLD is ignored kernel_wait4 won't populate the status. */ + /* If SIGCLD is ignored do_wait won't populate the status. */ kernel_sigaction(SIGCHLD, SIG_DFL); pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); - if (pid < 0) { + if (pid < 0) sub_info->retval = pid; - } else { - int ret = -ECHILD; - /* - * Normally it is bogus to call wait4() from in-kernel because - * wait4() wants to write the exit code to a userspace address. - * But call_usermodehelper_exec_sync() always runs as kernel - * thread (workqueue) and put_user() to a kernel address works - * OK for kernel threads, due to their having an mm_segment_t - * which spans the entire address space. - * - * Thus the __user pointer cast is valid here. - */ - kernel_wait4(pid, (int __user *)&ret, 0, NULL); - - /* - * If ret is 0, either call_usermodehelper_exec_async failed and - * the real error code is already in sub_info->retval or - * sub_info->retval is 0 anyway, so don't mess with it then. - */ - if (ret) - sub_info->retval = ret; - } + else + kernel_wait(pid, &sub_info->retval); /* Restore default kernel sig handler */ kernel_sigaction(SIGCHLD, SIG_IGN); - umh_complete(sub_info); } -- cgit v1.2.3 From 6f9e148c218641ad876845abdea4b0597468c55e Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 11 Aug 2020 18:36:12 -0700 Subject: kmod: remove redundant "be an" in the comment There exists redundant "be an" in the comment, remove it. Signed-off-by: Tiezhu Yang Signed-off-by: Luis Chamberlain Signed-off-by: Andrew Morton Acked-by: Luis Chamberlain Cc: Alexei Starovoitov Cc: Al Viro Cc: Christian Brauner Cc: Chuck Lever Cc: David Howells Cc: David S. Miller Cc: Greg Kroah-Hartman Cc: Jakub Kicinski Cc: James Morris Cc: Jarkko Sakkinen Cc: J. Bruce Fields Cc: Jens Axboe Cc: Josh Triplett Cc: Kees Cook Cc: Lars Ellenberg Cc: Nikolay Aleksandrov Cc: Philipp Reisner Cc: Roopa Prabhu Cc: "Serge E. Hallyn" Cc: Sergei Trofimovich Cc: Sergey Kvachonok Cc: Shuah Khan Cc: Tony Vroon Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20200610154923.27510-3-mcgrof@kernel.org Signed-off-by: Linus Torvalds --- kernel/kmod.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 37c3c4b97b8e..3cd075ce2a1e 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -36,9 +36,8 @@ * * If you need less than 50 threads would mean we're dealing with systems * smaller than 3200 pages. This assumes you are capable of having ~13M memory, - * and this would only be an be an upper limit, after which the OOM killer - * would take effect. Systems like these are very unlikely if modules are - * enabled. + * and this would only be an upper limit, after which the OOM killer would take + * effect. Systems like these are very unlikely if modules are enabled. */ #define MAX_KMOD_CONCURRENT 50 static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); -- cgit v1.2.3 From 0935288c6e008c0682ab6171fb5605dcc049e7bd Mon Sep 17 00:00:00 2001 From: Vijay Balakrishna Date: Tue, 11 Aug 2020 18:36:33 -0700 Subject: kdump: append kernel build-id string to VMCOREINFO Make kernel GNU build-id available in VMCOREINFO. Having build-id in VMCOREINFO facilitates presenting appropriate kernel namelist image with debug information file to kernel crash dump analysis tools. Currently VMCOREINFO lacks uniquely identifiable key for crash analysis automation. Regarding if this patch is necessary or matching of linux_banner and OSRELEASE in VMCOREINFO employed by crash(8) meets the need -- IMO, build-id approach more foolproof, in most instances it is a cryptographic hash generated using internal code/ELF bits unlike kernel version string upon which linux_banner is based that is external to the code. I feel each is intended for a different purpose. Also OSRELEASE is not suitable when two different kernel builds from same version with different features enabled. Currently for most linux (and non-linux) systems build-id can be extracted using standard methods for file types such as user mode crash dumps, shared libraries, loadable kernel modules etc., This is an exception for linux kernel dump. Having build-id in VMCOREINFO brings some uniformity for automation tools. Tyler said: : I think this is a nice improvement over today's linux_banner approach for : correlating vmlinux to a kernel dump. : : The elf notes parsing in this patch lines up with what is described in in : the "Notes (Nhdr)" section of the elf(5) man page. : : BUILD_ID_MAX is sufficient to hold a sha1 build-id, which is the default : build-id type today in GNU ld(2). It is also sufficient to hold the : "fast" build-id, which is the default build-id type today in LLVM lld(2). Signed-off-by: Vijay Balakrishna Signed-off-by: Andrew Morton Reviewed-by: Tyler Hicks Acked-by: Baoquan He Cc: Dave Young Cc: Vivek Goyal Link: http://lkml.kernel.org/r/1591849672-34104-1-git-send-email-vijayb@linux.microsoft.com Signed-off-by: Linus Torvalds --- include/linux/crash_core.h | 6 ++++++ kernel/crash_core.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) (limited to 'kernel') diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 525510a9f965..6594dbc34a37 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -38,6 +38,8 @@ phys_addr_t paddr_vmcoreinfo_note(void); #define VMCOREINFO_OSRELEASE(value) \ vmcoreinfo_append_str("OSRELEASE=%s\n", value) +#define VMCOREINFO_BUILD_ID(value) \ + vmcoreinfo_append_str("BUILD-ID=%s\n", value) #define VMCOREINFO_PAGESIZE(value) \ vmcoreinfo_append_str("PAGESIZE=%ld\n", value) #define VMCOREINFO_SYMBOL(name) \ @@ -64,6 +66,10 @@ extern unsigned char *vmcoreinfo_data; extern size_t vmcoreinfo_size; extern u32 *vmcoreinfo_note; +/* raw contents of kernel .notes section */ +extern const void __start_notes __weak; +extern const void __stop_notes __weak; + Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len); void final_note(Elf_Word *buf); diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 18175687133a..106e4500fd53 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -11,6 +11,8 @@ #include #include +#include + /* vmcoreinfo stuff */ unsigned char *vmcoreinfo_data; size_t vmcoreinfo_size; @@ -376,6 +378,53 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void) } EXPORT_SYMBOL(paddr_vmcoreinfo_note); +#define NOTES_SIZE (&__stop_notes - &__start_notes) +#define BUILD_ID_MAX SHA1_DIGEST_SIZE +#define NT_GNU_BUILD_ID 3 + +struct elf_note_section { + struct elf_note n_hdr; + u8 n_data[]; +}; + +/* + * Add build ID from .notes section as generated by the GNU ld(1) + * or LLVM lld(1) --build-id option. + */ +static void add_build_id_vmcoreinfo(void) +{ + char build_id[BUILD_ID_MAX * 2 + 1]; + int n_remain = NOTES_SIZE; + + while (n_remain >= sizeof(struct elf_note)) { + const struct elf_note_section *note_sec = + &__start_notes + NOTES_SIZE - n_remain; + const u32 n_namesz = note_sec->n_hdr.n_namesz; + + if (note_sec->n_hdr.n_type == NT_GNU_BUILD_ID && + n_namesz != 0 && + !strcmp((char *)¬e_sec->n_data[0], "GNU")) { + if (note_sec->n_hdr.n_descsz <= BUILD_ID_MAX) { + const u32 n_descsz = note_sec->n_hdr.n_descsz; + const u8 *s = ¬e_sec->n_data[n_namesz]; + + s = PTR_ALIGN(s, 4); + bin2hex(build_id, s, n_descsz); + build_id[2 * n_descsz] = '\0'; + VMCOREINFO_BUILD_ID(build_id); + return; + } + pr_warn("Build ID is too large to include in vmcoreinfo: %u > %u\n", + note_sec->n_hdr.n_descsz, + BUILD_ID_MAX); + return; + } + n_remain -= sizeof(struct elf_note) + + ALIGN(note_sec->n_hdr.n_namesz, 4) + + ALIGN(note_sec->n_hdr.n_descsz, 4); + } +} + static int __init crash_save_vmcoreinfo_init(void) { vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); @@ -394,6 +443,7 @@ static int __init crash_save_vmcoreinfo_init(void) } VMCOREINFO_OSRELEASE(init_uts_ns.name.release); + add_build_id_vmcoreinfo(); VMCOREINFO_PAGESIZE(PAGE_SIZE); VMCOREINFO_SYMBOL(init_uts_ns); -- cgit v1.2.3 From 79076e1241bb3bf02d0aac7d39120d8161fe07b1 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 11 Aug 2020 18:36:46 -0700 Subject: kernel/panic.c: make oops_may_print() return bool The return value of oops_may_print() is true or false, so change its type to reflect that. Signed-off-by: Tiezhu Yang Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Cc: Xuefeng Li Link: http://lkml.kernel.org/r/1591103358-32087-1-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 2 +- kernel/panic.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 92517ae53e7c..211005163682 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -322,7 +322,7 @@ void nmi_panic(struct pt_regs *regs, const char *msg); extern void oops_enter(void); extern void oops_exit(void); void print_oops_end_marker(void); -extern int oops_may_print(void); +extern bool oops_may_print(void); void do_exit(long error_code) __noreturn; void complete_and_exit(struct completion *, long) __noreturn; diff --git a/kernel/panic.c b/kernel/panic.c index e2157ca387c8..93ba061d1c79 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -505,7 +505,7 @@ static void do_oops_enter_exit(void) * Return true if the calling CPU is allowed to print oops-related info. * This is a bit racy.. */ -int oops_may_print(void) +bool oops_may_print(void) { return pause_on_oops_flag == 0; } -- cgit v1.2.3 From 63037f74725ddd8a767ed2ad0369e60a3bf1f2ce Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Tue, 11 Aug 2020 18:36:53 -0700 Subject: panic: make print_oops_end_marker() static Since print_oops_end_marker() is not used externally, also remove it in kernel.h at the same time. Signed-off-by: Yue Hu Signed-off-by: Andrew Morton Cc: Kees Cook Link: http://lkml.kernel.org/r/20200724011516.12756-1-zbestahu@gmail.com Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 1 - kernel/panic.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 211005163682..500def620d8f 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -321,7 +321,6 @@ void panic(const char *fmt, ...) __noreturn __cold; void nmi_panic(struct pt_regs *regs, const char *msg); extern void oops_enter(void); extern void oops_exit(void); -void print_oops_end_marker(void); extern bool oops_may_print(void); void do_exit(long error_code) __noreturn; void complete_and_exit(struct completion *, long) __noreturn; diff --git a/kernel/panic.c b/kernel/panic.c index 93ba061d1c79..aef8872ba843 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -551,7 +551,7 @@ static int init_oops_id(void) } late_initcall(init_oops_id); -void print_oops_end_marker(void) +static void print_oops_end_marker(void) { init_oops_id(); pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); -- cgit v1.2.3 From 31a1b9878c0667945b078c92bce907552f57f2b6 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 11 Aug 2020 18:36:56 -0700 Subject: kcov: unconditionally add -fno-stack-protector to compiler options Unconditionally add -fno-stack-protector to KCOV's compiler options, as all supported compilers support the option. This saves a compiler invocation to determine if the option is supported. Because Clang does not support -fno-conserve-stack, and -fno-stack-protector was wrapped in the same cc-option, we were missing -fno-stack-protector with Clang. Unconditionally adding this option fixes this for Clang. Suggested-by: Nick Desaulniers Signed-off-by: Marco Elver Signed-off-by: Andrew Morton Reviewed-by: Nick Desaulniers Reviewed-by: Andrey Konovalov Cc: Dmitry Vyukov Cc: Alexander Potapenko Link: http://lkml.kernel.org/r/20200615184302.7591-1-elver@google.com Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 5350fd292910..b3da548691c9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -36,7 +36,7 @@ KCOV_INSTRUMENT_stacktrace.o := n KCOV_INSTRUMENT_kcov.o := n KASAN_SANITIZE_kcov.o := n KCSAN_SANITIZE_kcov.o := n -CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) +CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector # cond_syscall is currently not LTO compatible CFLAGS_sys_ni.o = $(DISABLE_LTO) -- cgit v1.2.3 From fed79d057d0842d2e381632b783af30745dd7938 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 11 Aug 2020 18:36:59 -0700 Subject: kcov: make some symbols static Fix sparse build warnings: kernel/kcov.c:99:1: warning: symbol '__pcpu_scope_kcov_percpu_data' was not declared. Should it be static? kernel/kcov.c:778:6: warning: symbol 'kcov_remote_softirq_start' was not declared. Should it be static? kernel/kcov.c:795:6: warning: symbol 'kcov_remote_softirq_stop' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Signed-off-by: Andrew Morton Reviewed-by: Andrey Konovalov Link: http://lkml.kernel.org/r/20200702115501.73077-1-weiyongjun1@huawei.com Signed-off-by: Linus Torvalds --- kernel/kcov.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 6afae0bcbac4..6b8368be89c8 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -96,7 +96,7 @@ struct kcov_percpu_data { int saved_sequence; }; -DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data); +static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data); /* Must be called with kcov_remote_lock locked. */ static struct kcov_remote *kcov_remote_find(u64 handle) @@ -775,7 +775,7 @@ static inline bool kcov_mode_enabled(unsigned int mode) return (mode & ~KCOV_IN_CTXSW) != KCOV_MODE_DISABLED; } -void kcov_remote_softirq_start(struct task_struct *t) +static void kcov_remote_softirq_start(struct task_struct *t) { struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); unsigned int mode; @@ -792,7 +792,7 @@ void kcov_remote_softirq_start(struct task_struct *t) } } -void kcov_remote_softirq_stop(struct task_struct *t) +static void kcov_remote_softirq_stop(struct task_struct *t) { struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); -- cgit v1.2.3 From 64019a2e467a288a16b65ab55ddcbf58c1b00187 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 11 Aug 2020 18:39:01 -0700 Subject: mm/gup: remove task_struct pointer for all gup code After the cleanup of page fault accounting, gup does not need to pass task_struct around any more. Remove that parameter in the whole gup stack. Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Reviewed-by: John Hubbard Link: http://lkml.kernel.org/r/20200707225021.200906-26-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/arc/kernel/process.c | 2 +- arch/s390/kvm/interrupt.c | 2 +- arch/s390/kvm/kvm-s390.c | 2 +- arch/s390/kvm/priv.c | 8 +-- arch/s390/mm/gmap.c | 4 +- drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 2 +- drivers/infiniband/core/umem_odp.c | 2 +- drivers/vfio/vfio_iommu_type1.c | 4 +- fs/exec.c | 2 +- include/linux/mm.h | 9 ++- kernel/events/uprobes.c | 6 +- kernel/futex.c | 2 +- mm/gup.c | 101 ++++++++++++---------------- mm/memory.c | 2 +- mm/process_vm_access.c | 2 +- security/tomoyo/domain.c | 2 +- virt/kvm/async_pf.c | 2 +- virt/kvm/kvm_main.c | 2 +- 18 files changed, 69 insertions(+), 87 deletions(-) (limited to 'kernel') diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index e12c80d71b78..efeba1fe7252 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -91,7 +91,7 @@ fault: goto fail; mmap_read_lock(current->mm); - ret = fixup_user_fault(current, current->mm, (unsigned long) uaddr, + ret = fixup_user_fault(current->mm, (unsigned long) uaddr, FAULT_FLAG_WRITE, NULL); mmap_read_unlock(current->mm); diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 1608fd99bbee..2f177298c663 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2768,7 +2768,7 @@ static struct page *get_map_page(struct kvm *kvm, u64 uaddr) struct page *page = NULL; mmap_read_lock(kvm->mm); - get_user_pages_remote(NULL, kvm->mm, uaddr, 1, FOLL_WRITE, + get_user_pages_remote(kvm->mm, uaddr, 1, FOLL_WRITE, &page, NULL, NULL); mmap_read_unlock(kvm->mm); return page; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 66da278a67fb..6b74b92c1a58 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1892,7 +1892,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) r = set_guest_storage_key(current->mm, hva, keys[i], 0); if (r) { - r = fixup_user_fault(current, current->mm, hva, + r = fixup_user_fault(current->mm, hva, FAULT_FLAG_WRITE, &unlocked); if (r) break; diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 2f721a923b54..cd74989ce0b0 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -273,7 +273,7 @@ retry: rc = get_guest_storage_key(current->mm, vmaddr, &key); if (rc) { - rc = fixup_user_fault(current, current->mm, vmaddr, + rc = fixup_user_fault(current->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); if (!rc) { mmap_read_unlock(current->mm); @@ -319,7 +319,7 @@ retry: mmap_read_lock(current->mm); rc = reset_guest_reference_bit(current->mm, vmaddr); if (rc < 0) { - rc = fixup_user_fault(current, current->mm, vmaddr, + rc = fixup_user_fault(current->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); if (!rc) { mmap_read_unlock(current->mm); @@ -390,7 +390,7 @@ static int handle_sske(struct kvm_vcpu *vcpu) m3 & SSKE_MC); if (rc < 0) { - rc = fixup_user_fault(current, current->mm, vmaddr, + rc = fixup_user_fault(current->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); rc = !rc ? -EAGAIN : rc; } @@ -1094,7 +1094,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) rc = cond_set_guest_storage_key(current->mm, vmaddr, key, NULL, nq, mr, mc); if (rc < 0) { - rc = fixup_user_fault(current, current->mm, vmaddr, + rc = fixup_user_fault(current->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); rc = !rc ? -EAGAIN : rc; } diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 190357ff86b3..8747487c50a8 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -649,7 +649,7 @@ retry: rc = vmaddr; goto out_up; } - if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags, + if (fixup_user_fault(gmap->mm, vmaddr, fault_flags, &unlocked)) { rc = -EFAULT; goto out_up; @@ -879,7 +879,7 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, BUG_ON(gmap_is_shadow(gmap)); fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0; - if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked)) + if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked)) return -EFAULT; if (unlocked) /* lost mmap_lock, caller has to retry __gmap_translate */ diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c index e946032b13e4..2c2bf24140c9 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c @@ -469,7 +469,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) locked = 1; } ret = pin_user_pages_remote - (work->task, mm, + (mm, obj->userptr.ptr + pinned * PAGE_SIZE, npages - pinned, flags, diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 5e32f61a2fe4..cc6b4befde7c 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -439,7 +439,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, * complex (and doesn't gain us much performance in most use * cases). */ - npages = get_user_pages_remote(owning_process, owning_mm, + npages = get_user_pages_remote(owning_mm, user_virt, gup_num_pages, flags, local_page_list, NULL, NULL); mmap_read_unlock(owning_mm); diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 5e556ac9102a..9d41105bfd01 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -425,7 +425,7 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, if (ret) { bool unlocked = false; - ret = fixup_user_fault(NULL, mm, vaddr, + ret = fixup_user_fault(mm, vaddr, FAULT_FLAG_REMOTE | (write_fault ? FAULT_FLAG_WRITE : 0), &unlocked); @@ -453,7 +453,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, flags |= FOLL_WRITE; mmap_read_lock(mm); - ret = pin_user_pages_remote(NULL, mm, vaddr, 1, flags | FOLL_LONGTERM, + ret = pin_user_pages_remote(mm, vaddr, 1, flags | FOLL_LONGTERM, page, NULL, NULL); if (ret == 1) { *pfn = page_to_pfn(page[0]); diff --git a/fs/exec.c b/fs/exec.c index a57d9785832b..a91003e28eaa 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -217,7 +217,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * We are doing an exec(). 'current' is the process * doing the exec and bprm->mm is the new process's mm. */ - ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, + ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags, &page, NULL, NULL); if (ret <= 0) return NULL; diff --git a/include/linux/mm.h b/include/linux/mm.h index ec0ffb423769..e7602a3bcef1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1661,7 +1661,7 @@ int invalidate_inode_page(struct page *page); extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs); -extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, +extern int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); void unmap_mapping_pages(struct address_space *mapping, @@ -1677,8 +1677,7 @@ static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, BUG(); return VM_FAULT_SIGBUS; } -static inline int fixup_user_fault(struct task_struct *tsk, - struct mm_struct *mm, unsigned long address, +static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked) { /* should never happen if there's no MMU */ @@ -1704,11 +1703,11 @@ extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); -long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, +long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked); -long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, +long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 49047d479c57..649fd53dc9ad 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -376,7 +376,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) if (!vaddr || !d) return -EINVAL; - ret = get_user_pages_remote(NULL, mm, vaddr, 1, + ret = get_user_pages_remote(mm, vaddr, 1, FOLL_WRITE, &page, &vma, NULL); if (unlikely(ret <= 0)) { /* @@ -477,7 +477,7 @@ retry: if (is_register) gup_flags |= FOLL_SPLIT_PMD; /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(NULL, mm, vaddr, 1, gup_flags, + ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &old_page, &vma, NULL); if (ret <= 0) return ret; @@ -2029,7 +2029,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * but we treat this as a 'remote' access since it is * essentially a kernel access to the memory. */ - result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, + result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, NULL, NULL); if (result < 0) return result; diff --git a/kernel/futex.c b/kernel/futex.c index 83404124b77b..61e8153e6c76 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -678,7 +678,7 @@ static int fault_in_user_writeable(u32 __user *uaddr) int ret; mmap_read_lock(mm); - ret = fixup_user_fault(current, mm, (unsigned long)uaddr, + ret = fixup_user_fault(mm, (unsigned long)uaddr, FAULT_FLAG_WRITE, NULL); mmap_read_unlock(mm); diff --git a/mm/gup.c b/mm/gup.c index d5d44c68fa19..39e58df6925d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -859,7 +859,7 @@ unmap: * does not include FOLL_NOWAIT, the mmap_lock may be released. If it * is, *@locked will be set to 0 and -EBUSY returned. */ -static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, +static int faultin_page(struct vm_area_struct *vma, unsigned long address, unsigned int *flags, int *locked) { unsigned int fault_flags = 0; @@ -962,7 +962,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) /** * __get_user_pages() - pin user pages in memory - * @tsk: task_struct of target task * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin @@ -1021,7 +1020,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * instead of __get_user_pages. __get_user_pages should be used only if * you need some special @gup_flags. */ -static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, +static long __get_user_pages(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) @@ -1103,8 +1102,7 @@ retry: page = follow_page_mask(vma, start, foll_flags, &ctx); if (!page) { - ret = faultin_page(tsk, vma, start, &foll_flags, - locked); + ret = faultin_page(vma, start, &foll_flags, locked); switch (ret) { case 0: goto retry; @@ -1178,8 +1176,6 @@ static bool vma_permits_fault(struct vm_area_struct *vma, /** * fixup_user_fault() - manually resolve a user page fault - * @tsk: the task_struct to use for page fault accounting, or - * NULL if faults are not to be recorded. * @mm: mm_struct of target mm * @address: user address * @fault_flags:flags to pass down to handle_mm_fault() @@ -1207,7 +1203,7 @@ static bool vma_permits_fault(struct vm_area_struct *vma, * This function will not return with an unlocked mmap_lock. So it has not the * same semantics wrt the @mm->mmap_lock as does filemap_fault(). */ -int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, +int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked) { @@ -1256,8 +1252,7 @@ EXPORT_SYMBOL_GPL(fixup_user_fault); * Please note that this function, unlike __get_user_pages will not * return 0 for nr_pages > 0 without FOLL_NOWAIT */ -static __always_inline long __get_user_pages_locked(struct task_struct *tsk, - struct mm_struct *mm, +static __always_inline long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, @@ -1290,7 +1285,7 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, pages_done = 0; lock_dropped = false; for (;;) { - ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages, + ret = __get_user_pages(mm, start, nr_pages, flags, pages, vmas, locked); if (!locked) /* VM_FAULT_RETRY couldn't trigger, bypass */ @@ -1350,7 +1345,7 @@ retry: } *locked = 1; - ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED, + ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED, pages, NULL, locked); if (!*locked) { /* Continue to retry until we succeeded */ @@ -1437,7 +1432,7 @@ long populate_vma_page_range(struct vm_area_struct *vma, * We made sure addr is within a VMA, so the following will * not result in a stack expansion that recurses back here. */ - return __get_user_pages(current, mm, start, nr_pages, gup_flags, + return __get_user_pages(mm, start, nr_pages, gup_flags, NULL, NULL, locked); } @@ -1521,7 +1516,7 @@ struct page *get_dump_page(unsigned long addr) struct vm_area_struct *vma; struct page *page; - if (__get_user_pages(current, current->mm, addr, 1, + if (__get_user_pages(current->mm, addr, 1, FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, NULL) < 1) return NULL; @@ -1530,8 +1525,7 @@ struct page *get_dump_page(unsigned long addr) } #endif /* CONFIG_ELF_CORE */ #else /* CONFIG_MMU */ -static long __get_user_pages_locked(struct task_struct *tsk, - struct mm_struct *mm, unsigned long start, +static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, struct vm_area_struct **vmas, int *locked, unsigned int foll_flags) @@ -1596,8 +1590,7 @@ static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) } #ifdef CONFIG_CMA -static long check_and_migrate_cma_pages(struct task_struct *tsk, - struct mm_struct *mm, +static long check_and_migrate_cma_pages(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, @@ -1675,7 +1668,7 @@ check_again: * again migrating any new CMA pages which we failed to isolate * earlier. */ - ret = __get_user_pages_locked(tsk, mm, start, nr_pages, + ret = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL, gup_flags); @@ -1689,8 +1682,7 @@ check_again: return ret; } #else -static long check_and_migrate_cma_pages(struct task_struct *tsk, - struct mm_struct *mm, +static long check_and_migrate_cma_pages(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, @@ -1705,8 +1697,7 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk, * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which * allows us to process the FOLL_LONGTERM flag. */ -static long __gup_longterm_locked(struct task_struct *tsk, - struct mm_struct *mm, +static long __gup_longterm_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, @@ -1731,7 +1722,7 @@ static long __gup_longterm_locked(struct task_struct *tsk, flags = memalloc_nocma_save(); } - rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages, + rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas_tmp, NULL, gup_flags); if (gup_flags & FOLL_LONGTERM) { @@ -1745,7 +1736,7 @@ static long __gup_longterm_locked(struct task_struct *tsk, goto out; } - rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages, + rc = check_and_migrate_cma_pages(mm, start, rc, pages, vmas_tmp, gup_flags); out: memalloc_nocma_restore(flags); @@ -1756,22 +1747,20 @@ out: return rc; } #else /* !CONFIG_FS_DAX && !CONFIG_CMA */ -static __always_inline long __gup_longterm_locked(struct task_struct *tsk, - struct mm_struct *mm, +static __always_inline long __gup_longterm_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, struct vm_area_struct **vmas, unsigned int flags) { - return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, + return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL, flags); } #endif /* CONFIG_FS_DAX || CONFIG_CMA */ #ifdef CONFIG_MMU -static long __get_user_pages_remote(struct task_struct *tsk, - struct mm_struct *mm, +static long __get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) @@ -1790,20 +1779,18 @@ static long __get_user_pages_remote(struct task_struct *tsk, * This will check the vmas (even if our vmas arg is NULL) * and return -ENOTSUPP if DAX isn't allowed in this case: */ - return __gup_longterm_locked(tsk, mm, start, nr_pages, pages, + return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } - return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, + return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, locked, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } /** * get_user_pages_remote() - pin user pages in memory - * @tsk: the task_struct to use for page fault accounting, or - * NULL if faults are not to be recorded. * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin @@ -1862,7 +1849,7 @@ static long __get_user_pages_remote(struct task_struct *tsk, * should use get_user_pages_remote because it cannot pass * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. */ -long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, +long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) @@ -1874,13 +1861,13 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) return -EINVAL; - return __get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags, + return __get_user_pages_remote(mm, start, nr_pages, gup_flags, pages, vmas, locked); } EXPORT_SYMBOL(get_user_pages_remote); #else /* CONFIG_MMU */ -long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, +long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) @@ -1888,8 +1875,7 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, return 0; } -static long __get_user_pages_remote(struct task_struct *tsk, - struct mm_struct *mm, +static long __get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) @@ -1909,11 +1895,10 @@ static long __get_user_pages_remote(struct task_struct *tsk, * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. * - * This is the same as get_user_pages_remote(), just with a - * less-flexible calling convention where we assume that the task - * and mm being operated on are the current task's and don't allow - * passing of a locked parameter. We also obviously don't pass - * FOLL_REMOTE in here. + * This is the same as get_user_pages_remote(), just with a less-flexible + * calling convention where we assume that the mm being operated on belongs to + * the current task, and doesn't allow passing of a locked parameter. We also + * obviously don't pass FOLL_REMOTE in here. */ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, @@ -1926,7 +1911,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) return -EINVAL; - return __gup_longterm_locked(current, current->mm, start, nr_pages, + return __gup_longterm_locked(current->mm, start, nr_pages, pages, vmas, gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages); @@ -1936,7 +1921,7 @@ EXPORT_SYMBOL(get_user_pages); * * mmap_read_lock(mm); * do_something() - * get_user_pages(tsk, mm, ..., pages, NULL); + * get_user_pages(mm, ..., pages, NULL); * mmap_read_unlock(mm); * * to: @@ -1944,7 +1929,7 @@ EXPORT_SYMBOL(get_user_pages); * int locked = 1; * mmap_read_lock(mm); * do_something() - * get_user_pages_locked(tsk, mm, ..., pages, &locked); + * get_user_pages_locked(mm, ..., pages, &locked); * if (locked) * mmap_read_unlock(mm); * @@ -1982,7 +1967,7 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) return -EINVAL; - return __get_user_pages_locked(current, current->mm, start, nr_pages, + return __get_user_pages_locked(current->mm, start, nr_pages, pages, NULL, locked, gup_flags | FOLL_TOUCH); } @@ -1992,12 +1977,12 @@ EXPORT_SYMBOL(get_user_pages_locked); * get_user_pages_unlocked() is suitable to replace the form: * * mmap_read_lock(mm); - * get_user_pages(tsk, mm, ..., pages, NULL); + * get_user_pages(mm, ..., pages, NULL); * mmap_read_unlock(mm); * * with: * - * get_user_pages_unlocked(tsk, mm, ..., pages); + * get_user_pages_unlocked(mm, ..., pages); * * It is functionally equivalent to get_user_pages_fast so * get_user_pages_fast should be used instead if specific gup_flags @@ -2020,7 +2005,7 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, return -EINVAL; mmap_read_lock(mm); - ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL, + ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL, &locked, gup_flags | FOLL_TOUCH); if (locked) mmap_read_unlock(mm); @@ -2665,7 +2650,7 @@ static int __gup_longterm_unlocked(unsigned long start, int nr_pages, */ if (gup_flags & FOLL_LONGTERM) { mmap_read_lock(current->mm); - ret = __gup_longterm_locked(current, current->mm, + ret = __gup_longterm_locked(current->mm, start, nr_pages, pages, NULL, gup_flags); mmap_read_unlock(current->mm); @@ -2908,10 +2893,8 @@ int pin_user_pages_fast_only(unsigned long start, int nr_pages, EXPORT_SYMBOL_GPL(pin_user_pages_fast_only); /** - * pin_user_pages_remote() - pin pages of a remote process (task != current) + * pin_user_pages_remote() - pin pages of a remote process * - * @tsk: the task_struct to use for page fault accounting, or - * NULL if faults are not to be recorded. * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin @@ -2932,7 +2915,7 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast_only); * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. */ -long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, +long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) @@ -2942,7 +2925,7 @@ long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, return -EINVAL; gup_flags |= FOLL_PIN; - return __get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags, + return __get_user_pages_remote(mm, start, nr_pages, gup_flags, pages, vmas, locked); } EXPORT_SYMBOL(pin_user_pages_remote); @@ -2974,7 +2957,7 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, return -EINVAL; gup_flags |= FOLL_PIN; - return __gup_longterm_locked(current, current->mm, start, nr_pages, + return __gup_longterm_locked(current->mm, start, nr_pages, pages, vmas, gup_flags); } EXPORT_SYMBOL(pin_user_pages); @@ -3019,7 +3002,7 @@ long pin_user_pages_locked(unsigned long start, unsigned long nr_pages, return -EINVAL; gup_flags |= FOLL_PIN; - return __get_user_pages_locked(current, current->mm, start, nr_pages, + return __get_user_pages_locked(current->mm, start, nr_pages, pages, NULL, locked, gup_flags | FOLL_TOUCH); } diff --git a/mm/memory.c b/mm/memory.c index 2b7f0e00f312..228efaca75d3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4742,7 +4742,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, void *maddr; struct page *page = NULL; - ret = get_user_pages_remote(tsk, mm, addr, 1, + ret = get_user_pages_remote(mm, addr, 1, gup_flags, &page, &vma, NULL); if (ret <= 0) { #ifndef CONFIG_HAVE_IOREMAP_PROT diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index cc85ce81914a..29c052099aff 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -105,7 +105,7 @@ static int process_vm_rw_single_vec(unsigned long addr, * current/current->mm */ mmap_read_lock(mm); - pinned_pages = pin_user_pages_remote(task, mm, pa, pinned_pages, + pinned_pages = pin_user_pages_remote(mm, pa, pinned_pages, flags, process_pages, NULL, &locked); if (locked) diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index 53b3e1f5f227..dc4ecc0b2038 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c @@ -914,7 +914,7 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, * (represented by bprm). 'current' is the process doing * the execve(). */ - if (get_user_pages_remote(current, bprm->mm, pos, 1, + if (get_user_pages_remote(bprm->mm, pos, 1, FOLL_FORCE, &page, NULL, NULL) <= 0) return false; #else diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 390f758d5a27..dd777688d14a 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -61,7 +61,7 @@ static void async_pf_execute(struct work_struct *work) * access remotely. */ mmap_read_lock(mm); - get_user_pages_remote(NULL, mm, addr, 1, FOLL_WRITE, NULL, NULL, + get_user_pages_remote(mm, addr, 1, FOLL_WRITE, NULL, NULL, &locked); if (locked) mmap_read_unlock(mm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2c2c0254c2d8..737666db02de 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1893,7 +1893,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, * not call the fault handler, so do it here. */ bool unlocked = false; - r = fixup_user_fault(current, current->mm, addr, + r = fixup_user_fault(current->mm, addr, (write_fault ? FAULT_FLAG_WRITE : 0), &unlocked); if (unlocked) -- cgit v1.2.3 From f107cee94ba4d2c7357fde59a1d84346c73d4958 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 11 Aug 2020 11:00:12 -0700 Subject: genirq: Unlock irq descriptor after errors In irq_set_irqchip_state(), the irq descriptor is not unlocked after an error is encountered. While that should never happen in practice, a buggy driver may trigger it. This would result in a lockup, so fix it. Fixes: 1d0326f352bb ("genirq: Check irq_data_get_irq_chip() return value before use") Signed-off-by: Guenter Roeck Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200811180012.80269-1-linux@roeck-us.net --- kernel/irq/manage.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d55ba625d426..52ac5391dcc6 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2731,8 +2731,10 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, do { chip = irq_data_get_irq_chip(data); - if (WARN_ON_ONCE(!chip)) - return -ENODEV; + if (WARN_ON_ONCE(!chip)) { + err = -ENODEV; + goto out_unlock; + } if (chip->irq_set_irqchip_state) break; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY @@ -2745,6 +2747,7 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, if (data) err = chip->irq_set_irqchip_state(data, which, val); +out_unlock: irq_put_desc_busunlock(desc, flags); return err; } -- cgit v1.2.3 From ebf0d100df0731901c16632f78d78d35f4123bc4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 13 Aug 2020 09:01:38 -0600 Subject: task_work: only grab task signal lock when needed If JOBCTL_TASK_WORK is already set on the targeted task, then we need not go through {lock,unlock}_task_sighand() to set it again and queue a signal wakeup. This is safe as we're checking it _after_ adding the new task_work with cmpxchg(). The ordering is as follows: task_work_add() get_signal() -------------------------------------------------------------- STORE(task->task_works, new_work); STORE(task->jobctl); mb(); mb(); LOAD(task->jobctl); LOAD(task->task_works); This speeds up TWA_SIGNAL handling quite a bit, which is important now that io_uring is relying on it for all task_work deliveries. Cc: Peter Zijlstra Cc: Jann Horn Acked-by: Oleg Nesterov Signed-off-by: Jens Axboe --- kernel/signal.c | 16 +++++++++++++++- kernel/task_work.c | 8 +++++++- 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 6f16f7c5d375..42b67d2cea37 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2541,7 +2541,21 @@ bool get_signal(struct ksignal *ksig) relock: spin_lock_irq(&sighand->siglock); - current->jobctl &= ~JOBCTL_TASK_WORK; + /* + * Make sure we can safely read ->jobctl() in task_work add. As Oleg + * states: + * + * It pairs with mb (implied by cmpxchg) before READ_ONCE. So we + * roughly have + * + * task_work_add: get_signal: + * STORE(task->task_works, new_work); STORE(task->jobctl); + * mb(); mb(); + * LOAD(task->jobctl); LOAD(task->task_works); + * + * and we can rely on STORE-MB-LOAD [ in task_work_add]. + */ + smp_store_mb(current->jobctl, current->jobctl & ~JOBCTL_TASK_WORK); if (unlikely(current->task_works)) { spin_unlock_irq(&sighand->siglock); task_work_run(); diff --git a/kernel/task_work.c b/kernel/task_work.c index 5c0848ca1287..613b2d634af8 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -42,7 +42,13 @@ task_work_add(struct task_struct *task, struct callback_head *work, int notify) set_notify_resume(task); break; case TWA_SIGNAL: - if (lock_task_sighand(task, &flags)) { + /* + * Only grab the sighand lock if we don't already have some + * task_work pending. This pairs with the smp_store_mb() + * in get_signal(), see comment there. + */ + if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) && + lock_task_sighand(task, &flags)) { task->jobctl |= JOBCTL_TASK_WORK; signal_wake_up(task, 0); unlock_task_sighand(task, &flags); -- cgit v1.2.3 From 405fa8ac89e7aaa87282df659e525992f2639e76 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 13 Aug 2020 08:21:17 -0400 Subject: futex: Convert to use the preferred 'fallthrough' macro Signed-off-by: Miaohe Lin Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200813122117.51173-1-linmiaohe@huawei.com --- kernel/futex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 61e8153e6c76..a5876694a60e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3744,12 +3744,12 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, switch (cmd) { case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; - /* fall through */ + fallthrough; case FUTEX_WAIT_BITSET: return futex_wait(uaddr, flags, val, timeout, val3); case FUTEX_WAKE: val3 = FUTEX_BITSET_MATCH_ANY; - /* fall through */ + fallthrough; case FUTEX_WAKE_BITSET: return futex_wake(uaddr, flags, val, val3); case FUTEX_REQUEUE: -- cgit v1.2.3 From cc172ff301d8079e941a6eb31758951a6d764084 Mon Sep 17 00:00:00 2001 From: Libing Zhou Date: Fri, 14 Aug 2020 11:02:36 +0800 Subject: sched/debug: Fix the alignment of the show-state debug output Current sysrq(t) output task fields name are not aligned with actual task fields value, e.g.: kernel: sysrq: Show State kernel: task PC stack pid father kernel: systemd S12456 1 0 0x00000000 kernel: Call Trace: kernel: ? __schedule+0x240/0x740 To make it more readable, print fields name together with task fields value in the same line, with fixed width: kernel: sysrq: Show State kernel: task:systemd state:S stack:12920 pid: 1 ppid: 0 flags:0x00000000 kernel: Call Trace: kernel: __schedule+0x282/0x620 Signed-off-by: Libing Zhou Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200814030236.37835-1-libing.zhou@nokia-sbell.com --- kernel/sched/core.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4a0e7b449b88..09fd62568ba9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6387,10 +6387,10 @@ void sched_show_task(struct task_struct *p) if (!try_get_task_stack(p)) return; - printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); + pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p)); if (p->state == TASK_RUNNING) - printk(KERN_CONT " running task "); + pr_cont(" running task "); #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif @@ -6399,8 +6399,8 @@ void sched_show_task(struct task_struct *p) if (pid_alive(p)) ppid = task_pid_nr(rcu_dereference(p->real_parent)); rcu_read_unlock(); - printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, - task_pid_nr(p), ppid, + pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", + free, task_pid_nr(p), ppid, (unsigned long)task_thread_info(p)->flags); print_worker_info(KERN_INFO, p); @@ -6435,13 +6435,6 @@ void show_state_filter(unsigned long state_filter) { struct task_struct *g, *p; -#if BITS_PER_LONG == 32 - printk(KERN_INFO - " task PC stack pid father\n"); -#else - printk(KERN_INFO - " task PC stack pid father\n"); -#endif rcu_read_lock(); for_each_process_thread(g, p) { /* -- cgit v1.2.3 From a85ffd59bd362d6c2e456e7f523b091830cd5454 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 12 Aug 2020 20:17:00 -0700 Subject: dma-debug: fix debug_dma_assert_idle(), use rcu_read_lock() Since commit 2a9127fcf229 ("mm: rewrite wait_on_page_bit_common() logic") improved unlock_page(), it has become more noticeable how cow_user_page() in a kernel with CONFIG_DMA_API_DEBUG=y can create and suffer from heavy contention on DMA debug's radix_lock in debug_dma_assert_idle(). It is only doing a lookup: use rcu_read_lock() and rcu_read_unlock() instead; though that does require the static ents[] to be moved onstack... ...but, hold on, isn't that radix_tree_gang_lookup() and loop doing quite the wrong thing: searching CACHELINES_PER_PAGE entries for an exact match with the first cacheline of the page in question? radix_tree_gang_lookup() is the right tool for the job, but we need nothing more than to check the first entry it can find, reporting if that falls anywhere within the page. (Is RCU safe here? As safe as using the spinlock was. The entries are never freed, so don't need to be freed by RCU. They may be reused, and there is a faint chance of a race, with an offending entry reused while printing its error info; but the spinlock did not prevent that either, and I agree that it's not worth worrying about. ] [ Side noe: this patch is a clear improvement to the status quo, but the next patch will be removing this debug function entirely. But just in case we decide we want to resurrect the debugging code some day, I'm first applying this improvement patch so that it doesn't get lost - Linus ] Fixes: 3b7a6418c749 ("dma debug: account for cachelines and read-only mappings in overlap tracking") Signed-off-by: Hugh Dickins Acked-by: Dan Williams Acked-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- kernel/dma/debug.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index f7f807fb6ebb..6f4f4b9d2d03 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -565,11 +565,8 @@ static void active_cacheline_remove(struct dma_debug_entry *entry) */ void debug_dma_assert_idle(struct page *page) { - static struct dma_debug_entry *ents[CACHELINES_PER_PAGE]; - struct dma_debug_entry *entry = NULL; - void **results = (void **) &ents; - unsigned int nents, i; - unsigned long flags; + struct dma_debug_entry *entry; + unsigned long pfn; phys_addr_t cln; if (dma_debug_disabled()) @@ -578,20 +575,14 @@ void debug_dma_assert_idle(struct page *page) if (!page) return; - cln = (phys_addr_t) page_to_pfn(page) << CACHELINE_PER_PAGE_SHIFT; - spin_lock_irqsave(&radix_lock, flags); - nents = radix_tree_gang_lookup(&dma_active_cacheline, results, cln, - CACHELINES_PER_PAGE); - for (i = 0; i < nents; i++) { - phys_addr_t ent_cln = to_cacheline_number(ents[i]); + pfn = page_to_pfn(page); + cln = (phys_addr_t) pfn << CACHELINE_PER_PAGE_SHIFT; - if (ent_cln == cln) { - entry = ents[i]; - break; - } else if (ent_cln >= cln + CACHELINES_PER_PAGE) - break; - } - spin_unlock_irqrestore(&radix_lock, flags); + rcu_read_lock(); + if (!radix_tree_gang_lookup(&dma_active_cacheline, (void **) &entry, + cln, 1) || entry->pfn != pfn) + entry = NULL; + rcu_read_unlock(); if (!entry) return; -- cgit v1.2.3 From 5848dc5b1b76d83599dcec1b39f188a7cbdca7e2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 14 Aug 2020 15:22:43 -0700 Subject: dma-debug: remove debug_dma_assert_idle() function This remoes the code from the COW path to call debug_dma_assert_idle(), which was added many years ago. Google shows that it hasn't caught anything in the 6+ years we've had it apart from a false positive, and Hugh just noticed how it had a very unfortunate spinlock serialization in the COW path. He fixed that issue the previous commit (a85ffd59bd36: "dma-debug: fix debug_dma_assert_idle(), use rcu_read_lock()"), but let's see if anybody even notices when we remove this function entirely. NOTE! We keep the dma tracking infrastructure that was added by the commit that introduced it. Partly to make it easier to resurrect this debug code if we ever deside to, and partly because that tracking by pfn and offset looks quite reasonable. The problem with this debug code was simply that it was expensive and didn't seem worth it, not that it was wrong per se. Acked-by: Dan Williams Acked-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- include/linux/dma-debug.h | 6 ------ kernel/dma/Kconfig | 5 ----- kernel/dma/debug.c | 46 +--------------------------------------------- mm/memory.c | 2 -- 4 files changed, 1 insertion(+), 58 deletions(-) (limited to 'kernel') diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h index 4208f94d93f7..7b3b04ba60f3 100644 --- a/include/linux/dma-debug.h +++ b/include/linux/dma-debug.h @@ -67,8 +67,6 @@ extern void debug_dma_sync_sg_for_device(struct device *dev, extern void debug_dma_dump_mappings(struct device *dev); -extern void debug_dma_assert_idle(struct page *page); - #else /* CONFIG_DMA_API_DEBUG */ static inline void dma_debug_add_bus(struct bus_type *bus) @@ -157,10 +155,6 @@ static inline void debug_dma_dump_mappings(struct device *dev) { } -static inline void debug_dma_assert_idle(struct page *page) -{ -} - #endif /* CONFIG_DMA_API_DEBUG */ #endif /* __DMA_DEBUG_H */ diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index f4770fcfa62b..5732b2b3ef17 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -186,11 +186,6 @@ config DMA_API_DEBUG drivers like double-freeing of DMA mappings or freeing mappings that were never allocated. - This also attempts to catch cases where a page owned by DMA is - accessed by the cpu in a way that could cause data corruption. For - example, this enables cow_user_page() to check that the source page is - not undergoing DMA. - This option causes a performance degradation. Use only if you want to debug device drivers and dma interactions. diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 6f4f4b9d2d03..8e9f7b301c6d 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -448,9 +448,6 @@ void debug_dma_dump_mappings(struct device *dev) * dma_active_cacheline entry to track per event. dma_map_sg(), on the * other hand, consumes a single dma_debug_entry, but inserts 'nents' * entries into the tree. - * - * At any time debug_dma_assert_idle() can be called to trigger a - * warning if any cachelines in the given page are in the active set. */ static RADIX_TREE(dma_active_cacheline, GFP_NOWAIT); static DEFINE_SPINLOCK(radix_lock); @@ -497,10 +494,7 @@ static void active_cacheline_inc_overlap(phys_addr_t cln) overlap = active_cacheline_set_overlap(cln, ++overlap); /* If we overflowed the overlap counter then we're potentially - * leaking dma-mappings. Otherwise, if maps and unmaps are - * balanced then this overflow may cause false negatives in - * debug_dma_assert_idle() as the cacheline may be marked idle - * prematurely. + * leaking dma-mappings. */ WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP, pr_fmt("exceeded %d overlapping mappings of cacheline %pa\n"), @@ -555,44 +549,6 @@ static void active_cacheline_remove(struct dma_debug_entry *entry) spin_unlock_irqrestore(&radix_lock, flags); } -/** - * debug_dma_assert_idle() - assert that a page is not undergoing dma - * @page: page to lookup in the dma_active_cacheline tree - * - * Place a call to this routine in cases where the cpu touching the page - * before the dma completes (page is dma_unmapped) will lead to data - * corruption. - */ -void debug_dma_assert_idle(struct page *page) -{ - struct dma_debug_entry *entry; - unsigned long pfn; - phys_addr_t cln; - - if (dma_debug_disabled()) - return; - - if (!page) - return; - - pfn = page_to_pfn(page); - cln = (phys_addr_t) pfn << CACHELINE_PER_PAGE_SHIFT; - - rcu_read_lock(); - if (!radix_tree_gang_lookup(&dma_active_cacheline, (void **) &entry, - cln, 1) || entry->pfn != pfn) - entry = NULL; - rcu_read_unlock(); - - if (!entry) - return; - - cln = to_cacheline_number(entry); - err_printk(entry->dev, entry, - "cpu touching an active dma mapped cacheline [cln=%pa]\n", - &cln); -} - /* * Wrapper function for adding an entry to the hash. * This function takes care of locking itself. diff --git a/mm/memory.c b/mm/memory.c index 228efaca75d3..d3c3bbd65a7e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2411,8 +2411,6 @@ static inline bool cow_user_page(struct page *dst, struct page *src, struct mm_struct *mm = vma->vm_mm; unsigned long addr = vmf->address; - debug_dma_assert_idle(src); - if (likely(src)) { copy_user_highpage(dst, src, addr, vma); return true; -- cgit v1.2.3 From 846f9e1fb9b9a2c4eecefa2fdbfb51e975a7d532 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Jul 2020 14:18:54 +0200 Subject: dma-mapping: consolidate the NO_DMA definition in kernel/dma/Kconfig Have a single definition that architetures can select. Signed-off-by: Christoph Hellwig Signed-off-by: Rich Felker --- arch/m68k/Kconfig | 4 +--- arch/m68k/Kconfig.machine | 1 + arch/um/Kconfig | 4 +--- kernel/dma/Kconfig | 3 +++ 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 6ad6cdac74b3..8e488369a7e5 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -17,6 +17,7 @@ config M68K select HAVE_COPY_THREAD_TLS select GENERIC_IRQ_SHOW select GENERIC_ATOMIC64 + select NO_DMA if !MMU && !COLDFIRE select HAVE_UID16 select VIRT_TO_BUS select ARCH_HAVE_NMI_SAFE_CMPXCHG if RMW_INSNS @@ -60,9 +61,6 @@ config TIME_LOW_RES config NO_IOPORT_MAP def_bool y -config NO_DMA - def_bool (MMU && SUN3) || (!MMU && !COLDFIRE) - config ZONE_DMA bool default y diff --git a/arch/m68k/Kconfig.machine b/arch/m68k/Kconfig.machine index a82651d58af4..17e8c3a292d7 100644 --- a/arch/m68k/Kconfig.machine +++ b/arch/m68k/Kconfig.machine @@ -126,6 +126,7 @@ config SUN3 depends on MMU depends on !MMU_MOTOROLA select MMU_SUN3 if MMU + select NO_DMA select M68020 help This option enables support for the Sun 3 series of workstations diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 9318dc6d1a0c..32c1d1945033 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -15,6 +15,7 @@ config UML select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_BUGVERBOSE select HAVE_COPY_THREAD_TLS + select NO_DMA select GENERIC_IRQ_SHOW select GENERIC_CPU_DEVICES select GENERIC_CLOCKEVENTS @@ -168,9 +169,6 @@ config MMAPPER This driver allows a host file to be used as emulated IO memory inside UML. -config NO_DMA - def_bool y - config PGTABLE_LEVELS int default 3 if 3_LEVEL_PGTABLES diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 1da3f44f2565..57533d07676f 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -1,5 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only +config NO_DMA + bool + config HAS_DMA bool depends on !NO_DMA -- cgit v1.2.3 From 88db0aa2421666d2f73486d15b239a4521983d55 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 14 Aug 2020 17:31:07 -0700 Subject: all arch: remove system call sys_sysctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 61a47c1ad3a4dc ("sysctl: Remove the sysctl system call"), sys_sysctl is actually unavailable: any input can only return an error. We have been warning about people using the sysctl system call for years and believe there are no more users. Even if there are users of this interface if they have not complained or fixed their code by now they probably are not going to, so there is no point in warning them any longer. So completely remove sys_sysctl on all architectures. [nixiaoming@huawei.com: s390: fix build error for sys_call_table_emu] Link: http://lkml.kernel.org/r/20200618141426.16884-1-nixiaoming@huawei.com Signed-off-by: Xiaoming Ni Signed-off-by: Andrew Morton Acked-by: Will Deacon [arm/arm64] Acked-by: "Eric W. Biederman" Cc: Aleksa Sarai Cc: Alexander Shishkin Cc: Al Viro Cc: Andi Kleen Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Bin Meng Cc: Borislav Petkov Cc: Brian Gerst Cc: Catalin Marinas Cc: chenzefeng Cc: Christian Borntraeger Cc: Christian Brauner Cc: Chris Zankel Cc: David Howells Cc: David S. Miller Cc: Diego Elio Pettenò Cc: Dmitry Vyukov Cc: Dominik Brodowski Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Iurii Zaikin Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: Jiri Olsa Cc: Kars de Jong Cc: Kees Cook Cc: Krzysztof Kozlowski Cc: Luis Chamberlain Cc: Marco Elver Cc: Mark Rutland Cc: Martin K. Petersen Cc: Masahiro Yamada Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Miklos Szeredi Cc: Minchan Kim Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Nick Piggin Cc: Oleg Nesterov Cc: Olof Johansson Cc: Paul Burton Cc: "Paul E. McKenney" Cc: Paul Mackerras Cc: Peter Zijlstra (Intel) Cc: Randy Dunlap Cc: Ravi Bangoria Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sami Tolvanen Cc: Sargun Dhillon Cc: Stephen Rothwell Cc: Sudeep Holla Cc: Sven Schnelle Cc: Thiago Jung Bauermann Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Tony Luck Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Yoshinori Sato Cc: Zhou Yanjie Link: http://lkml.kernel.org/r/20200616030734.87257-1-nixiaoming@huawei.com Signed-off-by: Linus Torvalds --- arch/alpha/kernel/syscalls/syscall.tbl | 2 +- arch/arm/configs/am200epdkit_defconfig | 1 - arch/arm/tools/syscall.tbl | 2 +- arch/arm64/include/asm/unistd32.h | 4 +- arch/ia64/kernel/syscalls/syscall.tbl | 2 +- arch/m68k/kernel/syscalls/syscall.tbl | 2 +- arch/microblaze/kernel/syscalls/syscall.tbl | 2 +- arch/mips/configs/cu1000-neo_defconfig | 1 - arch/mips/kernel/syscalls/syscall_n32.tbl | 2 +- arch/mips/kernel/syscalls/syscall_n64.tbl | 2 +- arch/mips/kernel/syscalls/syscall_o32.tbl | 2 +- arch/parisc/kernel/syscalls/syscall.tbl | 2 +- arch/powerpc/kernel/syscalls/syscall.tbl | 2 +- arch/s390/kernel/syscalls/syscall.tbl | 2 +- arch/sh/configs/dreamcast_defconfig | 1 - arch/sh/configs/espt_defconfig | 1 - arch/sh/configs/hp6xx_defconfig | 1 - arch/sh/configs/landisk_defconfig | 1 - arch/sh/configs/lboxre2_defconfig | 1 - arch/sh/configs/microdev_defconfig | 1 - arch/sh/configs/migor_defconfig | 1 - arch/sh/configs/r7780mp_defconfig | 1 - arch/sh/configs/r7785rp_defconfig | 1 - arch/sh/configs/rts7751r2d1_defconfig | 1 - arch/sh/configs/rts7751r2dplus_defconfig | 1 - arch/sh/configs/se7206_defconfig | 1 - arch/sh/configs/se7343_defconfig | 1 - arch/sh/configs/se7619_defconfig | 1 - arch/sh/configs/se7705_defconfig | 1 - arch/sh/configs/se7750_defconfig | 1 - arch/sh/configs/se7751_defconfig | 1 - arch/sh/configs/secureedge5410_defconfig | 1 - arch/sh/configs/sh03_defconfig | 1 - arch/sh/configs/sh7710voipgw_defconfig | 1 - arch/sh/configs/sh7757lcr_defconfig | 1 - arch/sh/configs/sh7763rdp_defconfig | 1 - arch/sh/configs/shmin_defconfig | 1 - arch/sh/configs/titan_defconfig | 1 - arch/sh/kernel/syscalls/syscall.tbl | 2 +- arch/sparc/kernel/syscalls/syscall.tbl | 2 +- arch/x86/entry/syscalls/syscall_32.tbl | 2 +- arch/x86/entry/syscalls/syscall_64.tbl | 2 +- arch/xtensa/kernel/syscalls/syscall.tbl | 2 +- include/linux/compat.h | 1 - include/linux/syscalls.h | 2 - include/linux/sysctl.h | 6 +- kernel/Makefile | 2 +- kernel/sys_ni.c | 1 - kernel/sysctl_binary.c | 171 --------------------- tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 2 +- tools/perf/arch/s390/entry/syscalls/syscall.tbl | 2 +- tools/perf/arch/x86/entry/syscalls/syscall_64.tbl | 2 +- 52 files changed, 24 insertions(+), 227 deletions(-) delete mode 100644 kernel/sysctl_binary.c (limited to 'kernel') diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index a28fb211881d..ec8bed9e7b75 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -249,7 +249,7 @@ 316 common mlockall sys_mlockall 317 common munlockall sys_munlockall 318 common sysinfo sys_sysinfo -319 common _sysctl sys_sysctl +319 common _sysctl sys_ni_syscall # 320 was sys_idle 321 common oldumount sys_oldumount 322 common swapon sys_swapon diff --git a/arch/arm/configs/am200epdkit_defconfig b/arch/arm/configs/am200epdkit_defconfig index f56ac394caf1..4e49d6cb2f62 100644 --- a/arch/arm/configs/am200epdkit_defconfig +++ b/arch/arm/configs/am200epdkit_defconfig @@ -3,7 +3,6 @@ CONFIG_LOCALVERSION="gum" CONFIG_SYSVIPC=y CONFIG_SYSFS_DEPRECATED_V2=y CONFIG_EXPERT=y -# CONFIG_SYSCTL_SYSCALL is not set # CONFIG_EPOLL is not set # CONFIG_SHMEM is not set # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 7e8ee4adf269..171077cbf419 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -162,7 +162,7 @@ 146 common writev sys_writev 147 common getsid sys_getsid 148 common fdatasync sys_fdatasync -149 common _sysctl sys_sysctl +149 common _sysctl sys_ni_syscall 150 common mlock sys_mlock 151 common munlock sys_munlock 152 common mlockall sys_mlockall diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 17e81bd9a2d3..734860ac7cf9 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -308,8 +308,8 @@ __SYSCALL(__NR_writev, compat_sys_writev) __SYSCALL(__NR_getsid, sys_getsid) #define __NR_fdatasync 148 __SYSCALL(__NR_fdatasync, sys_fdatasync) -#define __NR__sysctl 149 -__SYSCALL(__NR__sysctl, compat_sys_sysctl) + /* 149 was sys_sysctl */ +__SYSCALL(149, sys_ni_syscall) #define __NR_mlock 150 __SYSCALL(__NR_mlock, sys_mlock) #define __NR_munlock 151 diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index ced9c83e47c9..f52a41f4c340 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -135,7 +135,7 @@ 123 common writev sys_writev 124 common pread64 sys_pread64 125 common pwrite64 sys_pwrite64 -126 common _sysctl sys_sysctl +126 common _sysctl sys_ni_syscall 127 common mmap sys_mmap 128 common munmap sys_munmap 129 common mlock sys_mlock diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 1a4822de7292..81fc799d8392 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -156,7 +156,7 @@ 146 common writev sys_writev 147 common getsid sys_getsid 148 common fdatasync sys_fdatasync -149 common _sysctl sys_sysctl +149 common _sysctl sys_ni_syscall 150 common mlock sys_mlock 151 common munlock sys_munlock 152 common mlockall sys_mlockall diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index a3f4be8e7238..b4e263916f41 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -156,7 +156,7 @@ 146 common writev sys_writev 147 common getsid sys_getsid 148 common fdatasync sys_fdatasync -149 common _sysctl sys_sysctl +149 common _sysctl sys_ni_syscall 150 common mlock sys_mlock 151 common munlock sys_munlock 152 common mlockall sys_mlockall diff --git a/arch/mips/configs/cu1000-neo_defconfig b/arch/mips/configs/cu1000-neo_defconfig index 6b471cdb16cf..e924c817f73d 100644 --- a/arch/mips/configs/cu1000-neo_defconfig +++ b/arch/mips/configs/cu1000-neo_defconfig @@ -17,7 +17,6 @@ CONFIG_CGROUP_CPUACCT=y CONFIG_NAMESPACES=y CONFIG_USER_NS=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y -CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 6b4ee92e3aed..f9df9edb67a4 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -159,7 +159,7 @@ 149 n32 munlockall sys_munlockall 150 n32 vhangup sys_vhangup 151 n32 pivot_root sys_pivot_root -152 n32 _sysctl compat_sys_sysctl +152 n32 _sysctl sys_ni_syscall 153 n32 prctl sys_prctl 154 n32 adjtimex sys_adjtimex_time32 155 n32 setrlimit compat_sys_setrlimit diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 391acbf425a0..557f9954a2b9 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -159,7 +159,7 @@ 149 n64 munlockall sys_munlockall 150 n64 vhangup sys_vhangup 151 n64 pivot_root sys_pivot_root -152 n64 _sysctl sys_sysctl +152 n64 _sysctl sys_ni_syscall 153 n64 prctl sys_prctl 154 n64 adjtimex sys_adjtimex 155 n64 setrlimit sys_setrlimit diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 5727c5187508..195b43cf27c8 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -164,7 +164,7 @@ 150 o32 unused150 sys_ni_syscall 151 o32 getsid sys_getsid 152 o32 fdatasync sys_fdatasync -153 o32 _sysctl sys_sysctl compat_sys_sysctl +153 o32 _sysctl sys_ni_syscall 154 o32 mlock sys_mlock 155 o32 munlock sys_munlock 156 o32 mlockall sys_mlockall diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 292baabefade..def64d221cd4 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -163,7 +163,7 @@ 146 common writev sys_writev compat_sys_writev 147 common getsid sys_getsid 148 common fdatasync sys_fdatasync -149 common _sysctl sys_sysctl compat_sys_sysctl +149 common _sysctl sys_ni_syscall 150 common mlock sys_mlock 151 common munlock sys_munlock 152 common mlockall sys_mlockall diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index be9f74546068..c2d737ff2e7b 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -197,7 +197,7 @@ 146 common writev sys_writev compat_sys_writev 147 common getsid sys_getsid 148 common fdatasync sys_fdatasync -149 nospu _sysctl sys_sysctl compat_sys_sysctl +149 nospu _sysctl sys_ni_syscall 150 common mlock sys_mlock 151 common munlock sys_munlock 152 common mlockall sys_mlockall diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index f1fda4375526..10456bc936fb 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -138,7 +138,7 @@ 146 common writev sys_writev compat_sys_writev 147 common getsid sys_getsid sys_getsid 148 common fdatasync sys_fdatasync sys_fdatasync -149 common _sysctl sys_sysctl compat_sys_sysctl +149 common _sysctl - - 150 common mlock sys_mlock sys_mlock 151 common munlock sys_munlock sys_munlock 152 common mlockall sys_mlockall sys_mlockall diff --git a/arch/sh/configs/dreamcast_defconfig b/arch/sh/configs/dreamcast_defconfig index ae067e0b15e3..6a82c7b8ff32 100644 --- a/arch/sh/configs/dreamcast_defconfig +++ b/arch/sh/configs/dreamcast_defconfig @@ -1,7 +1,6 @@ CONFIG_SYSVIPC=y CONFIG_BSD_PROCESS_ACCT=y CONFIG_LOG_BUF_SHIFT=14 -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_SLAB=y CONFIG_PROFILING=y CONFIG_MODULES=y diff --git a/arch/sh/configs/espt_defconfig b/arch/sh/configs/espt_defconfig index a5b865a75d22..9a988c347e9d 100644 --- a/arch/sh/configs/espt_defconfig +++ b/arch/sh/configs/espt_defconfig @@ -5,7 +5,6 @@ CONFIG_LOG_BUF_SHIFT=14 CONFIG_NAMESPACES=y CONFIG_UTS_NS=y CONFIG_IPC_NS=y -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_SLAB=y CONFIG_PROFILING=y CONFIG_OPROFILE=y diff --git a/arch/sh/configs/hp6xx_defconfig b/arch/sh/configs/hp6xx_defconfig index a92db6694ce2..70e6605d7f7e 100644 --- a/arch/sh/configs/hp6xx_defconfig +++ b/arch/sh/configs/hp6xx_defconfig @@ -3,7 +3,6 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=14 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_SLAB=y # CONFIG_BLK_DEV_BSG is not set CONFIG_CPU_SUBTYPE_SH7709=y diff --git a/arch/sh/configs/landisk_defconfig b/arch/sh/configs/landisk_defconfig index 567af752b1bb..ba6ec042606f 100644 --- a/arch/sh/configs/landisk_defconfig +++ b/arch/sh/configs/landisk_defconfig @@ -1,6 +1,5 @@ CONFIG_SYSVIPC=y CONFIG_LOG_BUF_SHIFT=14 -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS_EXTRA_PASS=y CONFIG_SLAB=y CONFIG_MODULES=y diff --git a/arch/sh/configs/lboxre2_defconfig b/arch/sh/configs/lboxre2_defconfig index 10f6d371ce2c..05e4ac6fed5f 100644 --- a/arch/sh/configs/lboxre2_defconfig +++ b/arch/sh/configs/lboxre2_defconfig @@ -1,6 +1,5 @@ CONFIG_SYSVIPC=y CONFIG_LOG_BUF_SHIFT=14 -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS_EXTRA_PASS=y CONFIG_SLAB=y CONFIG_MODULES=y diff --git a/arch/sh/configs/microdev_defconfig b/arch/sh/configs/microdev_defconfig index ed84d1303acf..c65667d00313 100644 --- a/arch/sh/configs/microdev_defconfig +++ b/arch/sh/configs/microdev_defconfig @@ -2,7 +2,6 @@ CONFIG_BSD_PROCESS_ACCT=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_SLAB=y # CONFIG_BLK_DEV_BSG is not set CONFIG_CPU_SUBTYPE_SH4_202=y diff --git a/arch/sh/configs/migor_defconfig b/arch/sh/configs/migor_defconfig index 37e9521a99e5..a24cf8cd2cea 100644 --- a/arch/sh/configs/migor_defconfig +++ b/arch/sh/configs/migor_defconfig @@ -4,7 +4,6 @@ CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_SLAB=y CONFIG_PROFILING=y CONFIG_OPROFILE=y diff --git a/arch/sh/configs/r7780mp_defconfig b/arch/sh/configs/r7780mp_defconfig index c97ec60cff27..e922659fdadb 100644 --- a/arch/sh/configs/r7780mp_defconfig +++ b/arch/sh/configs/r7780mp_defconfig @@ -3,7 +3,6 @@ CONFIG_BSD_PROCESS_ACCT=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=14 -# CONFIG_SYSCTL_SYSCALL is not set # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set CONFIG_SLAB=y diff --git a/arch/sh/configs/r7785rp_defconfig b/arch/sh/configs/r7785rp_defconfig index 55fce65eb454..5978866358ec 100644 --- a/arch/sh/configs/r7785rp_defconfig +++ b/arch/sh/configs/r7785rp_defconfig @@ -7,7 +7,6 @@ CONFIG_RCU_TRACE=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=14 -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_SLAB=y CONFIG_PROFILING=y CONFIG_OPROFILE=y diff --git a/arch/sh/configs/rts7751r2d1_defconfig b/arch/sh/configs/rts7751r2d1_defconfig index 6a3cfe08295f..fc9c22152b08 100644 --- a/arch/sh/configs/rts7751r2d1_defconfig +++ b/arch/sh/configs/rts7751r2d1_defconfig @@ -1,7 +1,6 @@ CONFIG_SYSVIPC=y CONFIG_LOG_BUF_SHIFT=14 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_SLAB=y CONFIG_PROFILING=y CONFIG_OPROFILE=y diff --git a/arch/sh/configs/rts7751r2dplus_defconfig b/arch/sh/configs/rts7751r2dplus_defconfig index 2b3d7d280672..ff3fd6787fd6 100644 --- a/arch/sh/configs/rts7751r2dplus_defconfig +++ b/arch/sh/configs/rts7751r2dplus_defconfig @@ -1,7 +1,6 @@ CONFIG_SYSVIPC=y CONFIG_LOG_BUF_SHIFT=14 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_SLAB=y CONFIG_PROFILING=y CONFIG_OPROFILE=y diff --git a/arch/sh/configs/se7206_defconfig b/arch/sh/configs/se7206_defconfig index 21a43f14ffac..ff5bb4489922 100644 --- a/arch/sh/configs/se7206_defconfig +++ b/arch/sh/configs/se7206_defconfig @@ -18,7 +18,6 @@ CONFIG_USER_NS=y CONFIG_PID_NS=y CONFIG_BLK_DEV_INITRD=y # CONFIG_UID16 is not set -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS_ALL=y # CONFIG_ELF_CORE is not set # CONFIG_COMPAT_BRK is not set diff --git a/arch/sh/configs/se7343_defconfig b/arch/sh/configs/se7343_defconfig index 4e794e719a28..5d6c19338ebf 100644 --- a/arch/sh/configs/se7343_defconfig +++ b/arch/sh/configs/se7343_defconfig @@ -2,7 +2,6 @@ CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_LOG_BUF_SHIFT=14 -# CONFIG_SYSCTL_SYSCALL is not set # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set # CONFIG_SHMEM is not set diff --git a/arch/sh/configs/se7619_defconfig b/arch/sh/configs/se7619_defconfig index 3264415a5931..71a672c30716 100644 --- a/arch/sh/configs/se7619_defconfig +++ b/arch/sh/configs/se7619_defconfig @@ -1,7 +1,6 @@ # CONFIG_LOCALVERSION_AUTO is not set CONFIG_LOG_BUF_SHIFT=14 # CONFIG_UID16 is not set -# CONFIG_SYSCTL_SYSCALL is not set # CONFIG_KALLSYMS is not set # CONFIG_HOTPLUG is not set # CONFIG_ELF_CORE is not set diff --git a/arch/sh/configs/se7705_defconfig b/arch/sh/configs/se7705_defconfig index 4496b94b7d88..ed00a6eeadf5 100644 --- a/arch/sh/configs/se7705_defconfig +++ b/arch/sh/configs/se7705_defconfig @@ -2,7 +2,6 @@ CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -# CONFIG_SYSCTL_SYSCALL is not set # CONFIG_KALLSYMS is not set # CONFIG_HOTPLUG is not set CONFIG_SLAB=y diff --git a/arch/sh/configs/se7750_defconfig b/arch/sh/configs/se7750_defconfig index b23f67542728..3f1c13799d79 100644 --- a/arch/sh/configs/se7750_defconfig +++ b/arch/sh/configs/se7750_defconfig @@ -5,7 +5,6 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=14 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -# CONFIG_SYSCTL_SYSCALL is not set # CONFIG_HOTPLUG is not set CONFIG_SLAB=y CONFIG_MODULES=y diff --git a/arch/sh/configs/se7751_defconfig b/arch/sh/configs/se7751_defconfig index 162343683937..4a024065bb75 100644 --- a/arch/sh/configs/se7751_defconfig +++ b/arch/sh/configs/se7751_defconfig @@ -3,7 +3,6 @@ CONFIG_BSD_PROCESS_ACCT=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -# CONFIG_SYSCTL_SYSCALL is not set # CONFIG_HOTPLUG is not set CONFIG_SLAB=y CONFIG_MODULES=y diff --git a/arch/sh/configs/secureedge5410_defconfig b/arch/sh/configs/secureedge5410_defconfig index 360592d63a2f..8422599cfb04 100644 --- a/arch/sh/configs/secureedge5410_defconfig +++ b/arch/sh/configs/secureedge5410_defconfig @@ -1,7 +1,6 @@ # CONFIG_SWAP is not set CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y -# CONFIG_SYSCTL_SYSCALL is not set # CONFIG_HOTPLUG is not set CONFIG_SLAB=y # CONFIG_BLK_DEV_BSG is not set diff --git a/arch/sh/configs/sh03_defconfig b/arch/sh/configs/sh03_defconfig index 87db9a84b5ec..f0073ed39947 100644 --- a/arch/sh/configs/sh03_defconfig +++ b/arch/sh/configs/sh03_defconfig @@ -3,7 +3,6 @@ CONFIG_POSIX_MQUEUE=y CONFIG_BSD_PROCESS_ACCT=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_SLAB=y CONFIG_PROFILING=y CONFIG_OPROFILE=m diff --git a/arch/sh/configs/sh7710voipgw_defconfig b/arch/sh/configs/sh7710voipgw_defconfig index 08426913c0e3..0d814770b07f 100644 --- a/arch/sh/configs/sh7710voipgw_defconfig +++ b/arch/sh/configs/sh7710voipgw_defconfig @@ -2,7 +2,6 @@ CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_LOG_BUF_SHIFT=14 -# CONFIG_SYSCTL_SYSCALL is not set # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set # CONFIG_SHMEM is not set diff --git a/arch/sh/configs/sh7757lcr_defconfig b/arch/sh/configs/sh7757lcr_defconfig index d0933a9b9799..a2700ab165af 100644 --- a/arch/sh/configs/sh7757lcr_defconfig +++ b/arch/sh/configs/sh7757lcr_defconfig @@ -8,7 +8,6 @@ CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS_ALL=y CONFIG_SLAB=y CONFIG_MODULES=y diff --git a/arch/sh/configs/sh7763rdp_defconfig b/arch/sh/configs/sh7763rdp_defconfig index d0a0aa74cecf..26c5fd02c87a 100644 --- a/arch/sh/configs/sh7763rdp_defconfig +++ b/arch/sh/configs/sh7763rdp_defconfig @@ -5,7 +5,6 @@ CONFIG_LOG_BUF_SHIFT=14 CONFIG_NAMESPACES=y CONFIG_UTS_NS=y CONFIG_IPC_NS=y -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_SLAB=y CONFIG_PROFILING=y CONFIG_OPROFILE=y diff --git a/arch/sh/configs/shmin_defconfig b/arch/sh/configs/shmin_defconfig index a27b129b93c5..c0b6f40d01cc 100644 --- a/arch/sh/configs/shmin_defconfig +++ b/arch/sh/configs/shmin_defconfig @@ -1,7 +1,6 @@ # CONFIG_SWAP is not set CONFIG_LOG_BUF_SHIFT=14 # CONFIG_UID16 is not set -# CONFIG_SYSCTL_SYSCALL is not set # CONFIG_KALLSYMS is not set # CONFIG_HOTPLUG is not set # CONFIG_BUG is not set diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig index 4ec961ace688..ba887f1351be 100644 --- a/arch/sh/configs/titan_defconfig +++ b/arch/sh/configs/titan_defconfig @@ -6,7 +6,6 @@ CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=16 CONFIG_BLK_DEV_INITRD=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -# CONFIG_SYSCTL_SYSCALL is not set CONFIG_SLAB=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 96848db9659e..ae0a00beea5f 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -156,7 +156,7 @@ 146 common writev sys_writev 147 common getsid sys_getsid 148 common fdatasync sys_fdatasync -149 common _sysctl sys_sysctl +149 common _sysctl sys_ni_syscall 150 common mlock sys_mlock 151 common munlock sys_munlock 152 common mlockall sys_mlockall diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 46024e80ee86..4af114e84f20 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -300,7 +300,7 @@ 249 64 nanosleep sys_nanosleep 250 32 mremap sys_mremap 250 64 mremap sys_64_mremap -251 common _sysctl sys_sysctl compat_sys_sysctl +251 common _sysctl sys_ni_syscall 252 common getsid sys_getsid 253 common fdatasync sys_fdatasync 254 32 nfsservctl sys_ni_syscall sys_nis_syscall diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index e31a75262c9c..9d1102873666 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -160,7 +160,7 @@ 146 i386 writev sys_writev compat_sys_writev 147 i386 getsid sys_getsid 148 i386 fdatasync sys_fdatasync -149 i386 _sysctl sys_sysctl compat_sys_sysctl +149 i386 _sysctl sys_ni_syscall 150 i386 mlock sys_mlock 151 i386 munlock sys_munlock 152 i386 mlockall sys_mlockall diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 9d82078c949a..f30d6ae9a688 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -164,7 +164,7 @@ 153 common vhangup sys_vhangup 154 common modify_ldt sys_modify_ldt 155 common pivot_root sys_pivot_root -156 64 _sysctl sys_sysctl +156 64 _sysctl sys_ni_syscall 157 common prctl sys_prctl 158 common arch_prctl sys_arch_prctl 159 common adjtimex sys_adjtimex diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index d216ccba42f7..6276e3c2d3fc 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -222,7 +222,7 @@ 204 common quotactl sys_quotactl # 205 was old nfsservctl 205 common nfsservctl sys_ni_syscall -206 common _sysctl sys_sysctl +206 common _sysctl sys_ni_syscall 207 common bdflush sys_bdflush 208 common uname sys_newuname 209 common sysinfo sys_sysinfo diff --git a/include/linux/compat.h b/include/linux/compat.h index c4255d8a4a8a..d38c4d7e83bd 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -851,7 +851,6 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u32); asmlinkage long compat_sys_recv(int fd, void __user *buf, compat_size_t len, unsigned flags); -asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args); /* obsolete: fs/readdir.c */ asmlinkage long compat_sys_old_readdir(unsigned int fd, diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index dc2b827c81e5..75ac7f8ae93c 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -47,7 +47,6 @@ struct stat64; struct statfs; struct statfs64; struct statx; -struct __sysctl_args; struct sysinfo; struct timespec; struct __kernel_old_timeval; @@ -1117,7 +1116,6 @@ asmlinkage long sys_send(int, void __user *, size_t, unsigned); asmlinkage long sys_bdflush(int func, long data); asmlinkage long sys_oldumount(char __user *name); asmlinkage long sys_uselib(const char __user *library); -asmlinkage long sys_sysctl(struct __sysctl_args __user *args); asmlinkage long sys_sysfs(int option, unsigned long arg1, unsigned long arg2); asmlinkage long sys_fork(void); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 50bb7f383a1b..51298a4f4623 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -74,15 +74,13 @@ int proc_do_static_key(struct ctl_table *table, int write, void *buffer, * sysctl names can be mirrored automatically under /proc/sys. The * procname supplied controls /proc naming. * - * The table's mode will be honoured both for sys_sysctl(2) and - * proc-fs access. + * The table's mode will be honoured for proc-fs access. * * Leaf nodes in the sysctl tree will be represented by a single file * under /proc; non-leaf nodes will be represented by directories. A * null procname disables /proc mirroring at this node. * - * sysctl(2) can automatically manage read and write requests through - * the sysctl table. The data and maxlen fields of the ctl_table + * The data and maxlen fields of the ctl_table * struct enable minimal validation of the values being written to be * performed, and the mode field allows minimal authentication. * diff --git a/kernel/Makefile b/kernel/Makefile index b3da548691c9..9a20016d4900 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,7 +5,7 @@ obj-y = fork.o exec_domain.o panic.o \ cpu.o exit.o softirq.o resource.o \ - sysctl.o sysctl_binary.o capability.o ptrace.o user.o \ + sysctl.o capability.o ptrace.o user.o \ signal.o sys.o umh.o workqueue.o pid.o task_work.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 3b69a560a7ac..4d59775ea79c 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -364,7 +364,6 @@ COND_SYSCALL(socketcall); COND_SYSCALL_COMPAT(socketcall); /* compat syscalls for arm64, x86, ... */ -COND_SYSCALL_COMPAT(sysctl); COND_SYSCALL_COMPAT(fanotify_mark); /* x86 */ diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c deleted file mode 100644 index 7d550cc76a3b..000000000000 --- a/kernel/sysctl_binary.c +++ /dev/null @@ -1,171 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include "../fs/xfs/xfs_sysctl.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static ssize_t binary_sysctl(const int *name, int nlen, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -static void deprecated_sysctl_warning(const int *name, int nlen) -{ - int i; - - /* - * CTL_KERN/KERN_VERSION is used by older glibc and cannot - * ever go away. - */ - if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION) - return; - - if (printk_ratelimit()) { - printk(KERN_INFO - "warning: process `%s' used the deprecated sysctl " - "system call with ", current->comm); - for (i = 0; i < nlen; i++) - printk(KERN_CONT "%d.", name[i]); - printk(KERN_CONT "\n"); - } - return; -} - -#define WARN_ONCE_HASH_BITS 8 -#define WARN_ONCE_HASH_SIZE (1<nlen. */ - if (nlen < 0 || nlen > CTL_MAXNAME) - return -ENOTDIR; - /* Read in the sysctl name for simplicity */ - for (i = 0; i < nlen; i++) - if (get_user(name[i], args_name + i)) - return -EFAULT; - - warn_on_bintable(name, nlen); - - return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen); -} - -SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) -{ - struct __sysctl_args tmp; - size_t oldlen = 0; - ssize_t result; - - if (copy_from_user(&tmp, args, sizeof(tmp))) - return -EFAULT; - - if (tmp.oldval && !tmp.oldlenp) - return -EFAULT; - - if (tmp.oldlenp && get_user(oldlen, tmp.oldlenp)) - return -EFAULT; - - result = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, oldlen, - tmp.newval, tmp.newlen); - - if (result >= 0) { - oldlen = result; - result = 0; - } - - if (tmp.oldlenp && put_user(oldlen, tmp.oldlenp)) - return -EFAULT; - - return result; -} - - -#ifdef CONFIG_COMPAT - -struct compat_sysctl_args { - compat_uptr_t name; - int nlen; - compat_uptr_t oldval; - compat_uptr_t oldlenp; - compat_uptr_t newval; - compat_size_t newlen; - compat_ulong_t __unused[4]; -}; - -COMPAT_SYSCALL_DEFINE1(sysctl, struct compat_sysctl_args __user *, args) -{ - struct compat_sysctl_args tmp; - compat_size_t __user *compat_oldlenp; - size_t oldlen = 0; - ssize_t result; - - if (copy_from_user(&tmp, args, sizeof(tmp))) - return -EFAULT; - - if (tmp.oldval && !tmp.oldlenp) - return -EFAULT; - - compat_oldlenp = compat_ptr(tmp.oldlenp); - if (compat_oldlenp && get_user(oldlen, compat_oldlenp)) - return -EFAULT; - - result = do_sysctl(compat_ptr(tmp.name), tmp.nlen, - compat_ptr(tmp.oldval), oldlen, - compat_ptr(tmp.newval), tmp.newlen); - - if (result >= 0) { - oldlen = result; - result = 0; - } - - if (compat_oldlenp && put_user(oldlen, compat_oldlenp)) - return -EFAULT; - - return result; -} - -#endif /* CONFIG_COMPAT */ diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index b190f2eb2611..3ca6fe057a0b 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -193,7 +193,7 @@ 146 common writev sys_writev compat_sys_writev 147 common getsid sys_getsid 148 common fdatasync sys_fdatasync -149 nospu _sysctl sys_sysctl compat_sys_sysctl +149 nospu _sysctl sys_ni_syscall 150 common mlock sys_mlock 151 common munlock sys_munlock 152 common mlockall sys_mlockall diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index 56ae24b6e4be..6a0bbea225db 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -138,7 +138,7 @@ 146 common writev sys_writev compat_sys_writev 147 common getsid sys_getsid sys_getsid 148 common fdatasync sys_fdatasync sys_fdatasync -149 common _sysctl sys_sysctl compat_sys_sysctl +149 common _sysctl - - 150 common mlock sys_mlock compat_sys_mlock 151 common munlock sys_munlock compat_sys_munlock 152 common mlockall sys_mlockall sys_mlockall diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index 9d82078c949a..f30d6ae9a688 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -164,7 +164,7 @@ 153 common vhangup sys_vhangup 154 common modify_ldt sys_modify_ldt 155 common pivot_root sys_pivot_root -156 64 _sysctl sys_sysctl +156 64 _sysctl sys_ni_syscall 157 common prctl sys_prctl 158 common arch_prctl sys_arch_prctl 159 common adjtimex sys_adjtimex -- cgit v1.2.3