From c92a7a52243871eb10e0ea2260685def47fb5094 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 12 Oct 2022 18:20:14 -0500 Subject: bpf: Allow bpf_user_ringbuf_drain() callbacks to return 1 The bpf_user_ringbuf_drain() helper function allows a BPF program to specify a callback that is invoked when draining entries from a BPF_MAP_TYPE_USER_RINGBUF ring buffer map. The API is meant to allow the callback to return 0 if it wants to continue draining samples, and 1 if it's done draining. Unfortunately, bpf_user_ringbuf_drain() landed shortly after commit 1bfe26fb0827 ("bpf: Add verifier support for custom callback return range"), which changed the default behavior of callbacks to only support returning 0. This patch corrects that oversight by allowing bpf_user_ringbuf_drain() callbacks to return 0 or 1. A follow-on patch will update the user_ringbuf selftests to also return 1 from a bpf_user_ringbuf_drain() callback to prevent this from regressing in the future. Fixes: 205715673844 ("bpf: Add bpf_user_ringbuf_drain() helper") Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20221012232015.1510043-2-void@manifault.com --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6f6d2d511c06..9ab7188d8f68 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6946,6 +6946,7 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; + callee->callback_ret_range = tnum_range(0, 1); return 0; } -- cgit v1.2.3 From ca6c21327c6af02b7eec31ce4b9a740a18c6c13f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Oct 2022 15:00:39 +0200 Subject: perf: Fix missing SIGTRAPs Marco reported: Due to the implementation of how SIGTRAP are delivered if perf_event_attr::sigtrap is set, we've noticed 3 issues: 1. Missing SIGTRAP due to a race with event_sched_out() (more details below). 2. Hardware PMU events being disabled due to returning 1 from perf_event_overflow(). The only way to re-enable the event is for user space to first "properly" disable the event and then re-enable it. 3. The inability to automatically disable an event after a specified number of overflows via PERF_EVENT_IOC_REFRESH. The worst of the 3 issues is problem (1), which occurs when a pending_disable is "consumed" by a racing event_sched_out(), observed as follows: CPU0 | CPU1 --------------------------------+--------------------------- __perf_event_overflow() | perf_event_disable_inatomic() | pending_disable = CPU0 | ... | _perf_event_enable() | event_function_call() | task_function_call() | /* sends IPI to CPU0 */ | ... __perf_event_enable() +--------------------------- ctx_resched() task_ctx_sched_out() ctx_sched_out() group_sched_out() event_sched_out() pending_disable = -1 perf_pending_event() perf_pending_event_disable() /* Fails to send SIGTRAP because no pending_disable! */ In the above case, not only is that particular SIGTRAP missed, but also all future SIGTRAPs because 'event_limit' is not reset back to 1. To fix, rework pending delivery of SIGTRAP via IRQ-work by introduction of a separate 'pending_sigtrap', no longer using 'event_limit' and 'pending_disable' for its delivery. Additionally; and different to Marco's proposed patch: - recognise that pending_disable effectively duplicates oncpu for the case where it is set. As such, change the irq_work handler to use ->oncpu to target the event and use pending_* as boolean toggles. - observe that SIGTRAP targets the ctx->task, so the context switch optimization that carries contexts between tasks is invalid. If the irq_work were delayed enough to hit after a context switch the SIGTRAP would be delivered to the wrong task. - observe that if the event gets scheduled out (rotation/migration/context-switch/...) the irq-work would be insufficient to deliver the SIGTRAP when the event gets scheduled back in (the irq-work might still be pending on the old CPU). Therefore have event_sched_out() convert the pending sigtrap into a task_work which will deliver the signal at return_to_user. Fixes: 97ba62b27867 ("perf: Add support for SIGTRAP on perf events") Reported-by: Dmitry Vyukov Debugged-by: Dmitry Vyukov Reported-by: Marco Elver Debugged-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Marco Elver Tested-by: Marco Elver --- include/linux/perf_event.h | 19 ++++-- kernel/events/core.c | 151 +++++++++++++++++++++++++++++++++----------- kernel/events/ring_buffer.c | 2 +- 3 files changed, 129 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 853f64b6c8c2..0031f7b4d9ab 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -756,11 +756,14 @@ struct perf_event { struct fasync_struct *fasync; /* delayed work for NMIs and such */ - int pending_wakeup; - int pending_kill; - int pending_disable; + unsigned int pending_wakeup; + unsigned int pending_kill; + unsigned int pending_disable; + unsigned int pending_sigtrap; unsigned long pending_addr; /* SIGTRAP */ - struct irq_work pending; + struct irq_work pending_irq; + struct callback_head pending_task; + unsigned int pending_work; atomic_t event_limit; @@ -877,6 +880,14 @@ struct perf_event_context { #endif void *task_ctx_data; /* pmu specific data */ struct rcu_head rcu_head; + + /* + * Sum (event->pending_sigtrap + event->pending_work) + * + * The SIGTRAP is targeted at ctx->task, as such it won't do changing + * that until the signal is delivered. + */ + local_t nr_pending; }; /* diff --git a/kernel/events/core.c b/kernel/events/core.c index aefc1e08e015..01933db7629c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -54,6 +54,7 @@ #include #include #include +#include #include "internal.h" @@ -2276,11 +2277,26 @@ event_sched_out(struct perf_event *event, event->pmu->del(event, 0); event->oncpu = -1; - if (READ_ONCE(event->pending_disable) >= 0) { - WRITE_ONCE(event->pending_disable, -1); + if (event->pending_disable) { + event->pending_disable = 0; perf_cgroup_event_disable(event, ctx); state = PERF_EVENT_STATE_OFF; } + + if (event->pending_sigtrap) { + bool dec = true; + + event->pending_sigtrap = 0; + if (state != PERF_EVENT_STATE_OFF && + !event->pending_work) { + event->pending_work = 1; + dec = false; + task_work_add(current, &event->pending_task, TWA_RESUME); + } + if (dec) + local_dec(&event->ctx->nr_pending); + } + perf_event_set_state(event, state); if (!is_software_event(event)) @@ -2432,7 +2448,7 @@ static void __perf_event_disable(struct perf_event *event, * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). * - * When called from perf_pending_event it's OK because event->ctx + * When called from perf_pending_irq it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. */ @@ -2471,9 +2487,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable); void perf_event_disable_inatomic(struct perf_event *event) { - WRITE_ONCE(event->pending_disable, smp_processor_id()); - /* can fail, see perf_pending_event_disable() */ - irq_work_queue(&event->pending); + event->pending_disable = 1; + irq_work_queue(&event->pending_irq); } #define MAX_INTERRUPTS (~0ULL) @@ -3428,11 +3443,23 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { + perf_pmu_disable(pmu); + + /* PMIs are disabled; ctx->nr_pending is stable. */ + if (local_read(&ctx->nr_pending) || + local_read(&next_ctx->nr_pending)) { + /* + * Must not swap out ctx when there's pending + * events that rely on the ctx->task relation. + */ + raw_spin_unlock(&next_ctx->lock); + rcu_read_unlock(); + goto inside_switch; + } + WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); - perf_pmu_disable(pmu); - if (cpuctx->sched_cb_usage && pmu->sched_task) pmu->sched_task(ctx, false); @@ -3473,6 +3500,7 @@ unlock: raw_spin_lock(&ctx->lock); perf_pmu_disable(pmu); +inside_switch: if (cpuctx->sched_cb_usage && pmu->sched_task) pmu->sched_task(ctx, false); task_ctx_sched_out(cpuctx, ctx, EVENT_ALL); @@ -4939,7 +4967,7 @@ static void perf_addr_filters_splice(struct perf_event *event, static void _free_event(struct perf_event *event) { - irq_work_sync(&event->pending); + irq_work_sync(&event->pending_irq); unaccount_event(event); @@ -6439,7 +6467,8 @@ static void perf_sigtrap(struct perf_event *event) return; /* - * perf_pending_event() can race with the task exiting. + * Both perf_pending_task() and perf_pending_irq() can race with the + * task exiting. */ if (current->flags & PF_EXITING) return; @@ -6448,23 +6477,33 @@ static void perf_sigtrap(struct perf_event *event) event->attr.type, event->attr.sig_data); } -static void perf_pending_event_disable(struct perf_event *event) +/* + * Deliver the pending work in-event-context or follow the context. + */ +static void __perf_pending_irq(struct perf_event *event) { - int cpu = READ_ONCE(event->pending_disable); + int cpu = READ_ONCE(event->oncpu); + /* + * If the event isn't running; we done. event_sched_out() will have + * taken care of things. + */ if (cpu < 0) return; + /* + * Yay, we hit home and are in the context of the event. + */ if (cpu == smp_processor_id()) { - WRITE_ONCE(event->pending_disable, -1); - - if (event->attr.sigtrap) { + if (event->pending_sigtrap) { + event->pending_sigtrap = 0; perf_sigtrap(event); - atomic_set_release(&event->event_limit, 1); /* rearm event */ - return; + local_dec(&event->ctx->nr_pending); + } + if (event->pending_disable) { + event->pending_disable = 0; + perf_event_disable_local(event); } - - perf_event_disable_local(event); return; } @@ -6484,35 +6523,62 @@ static void perf_pending_event_disable(struct perf_event *event) * irq_work_queue(); // FAILS * * irq_work_run() - * perf_pending_event() + * perf_pending_irq() * * But the event runs on CPU-B and wants disabling there. */ - irq_work_queue_on(&event->pending, cpu); + irq_work_queue_on(&event->pending_irq, cpu); } -static void perf_pending_event(struct irq_work *entry) +static void perf_pending_irq(struct irq_work *entry) { - struct perf_event *event = container_of(entry, struct perf_event, pending); + struct perf_event *event = container_of(entry, struct perf_event, pending_irq); int rctx; - rctx = perf_swevent_get_recursion_context(); /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. */ + rctx = perf_swevent_get_recursion_context(); - perf_pending_event_disable(event); - + /* + * The wakeup isn't bound to the context of the event -- it can happen + * irrespective of where the event is. + */ if (event->pending_wakeup) { event->pending_wakeup = 0; perf_event_wakeup(event); } + __perf_pending_irq(event); + if (rctx >= 0) perf_swevent_put_recursion_context(rctx); } +static void perf_pending_task(struct callback_head *head) +{ + struct perf_event *event = container_of(head, struct perf_event, pending_task); + int rctx; + + /* + * If we 'fail' here, that's OK, it means recursion is already disabled + * and we won't recurse 'further'. + */ + preempt_disable_notrace(); + rctx = perf_swevent_get_recursion_context(); + + if (event->pending_work) { + event->pending_work = 0; + perf_sigtrap(event); + local_dec(&event->ctx->nr_pending); + } + + if (rctx >= 0) + perf_swevent_put_recursion_context(rctx); + preempt_enable_notrace(); +} + #ifdef CONFIG_GUEST_PERF_EVENTS struct perf_guest_info_callbacks __rcu *perf_guest_cbs; @@ -9212,8 +9278,8 @@ int perf_event_account_interrupt(struct perf_event *event) */ static int __perf_event_overflow(struct perf_event *event, - int throttle, struct perf_sample_data *data, - struct pt_regs *regs) + int throttle, struct perf_sample_data *data, + struct pt_regs *regs) { int events = atomic_read(&event->event_limit); int ret = 0; @@ -9236,24 +9302,36 @@ static int __perf_event_overflow(struct perf_event *event, if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; - event->pending_addr = data->addr; - perf_event_disable_inatomic(event); } + if (event->attr.sigtrap) { + /* + * Should not be able to return to user space without processing + * pending_sigtrap (kernel events can overflow multiple times). + */ + WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel); + if (!event->pending_sigtrap) { + event->pending_sigtrap = 1; + local_inc(&event->ctx->nr_pending); + } + event->pending_addr = data->addr; + irq_work_queue(&event->pending_irq); + } + READ_ONCE(event->overflow_handler)(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; - irq_work_queue(&event->pending); + irq_work_queue(&event->pending_irq); } return ret; } int perf_event_overflow(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) + struct perf_sample_data *data, + struct pt_regs *regs) { return __perf_event_overflow(event, 1, data, regs); } @@ -11570,8 +11648,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, init_waitqueue_head(&event->waitq); - event->pending_disable = -1; - init_irq_work(&event->pending, perf_pending_event); + init_irq_work(&event->pending_irq, perf_pending_irq); + init_task_work(&event->pending_task, perf_pending_task); mutex_init(&event->mmap_mutex); raw_spin_lock_init(&event->addr_filters.lock); @@ -11593,9 +11671,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (parent_event) event->event_caps = parent_event->event_caps; - if (event->attr.sigtrap) - atomic_set(&event->event_limit, 1); - if (task) { event->attach_state = PERF_ATTACH_TASK; /* diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 726132039c38..273a0fe7910a 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -22,7 +22,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle) atomic_set(&handle->rb->poll, EPOLLIN); handle->event->pending_wakeup = 1; - irq_work_queue(&handle->event->pending); + irq_work_queue(&handle->event->pending_irq); } /* -- cgit v1.2.3 From 21da7472a040420f2dc624ffec70291a72c5d6a6 Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Fri, 7 Oct 2022 10:13:27 +0200 Subject: bpf: Fix sample_flags for bpf_perf_event_output * Raw data is also filled by bpf_perf_event_output. * Add sample_flags to indicate raw data. * This eliminates the segfaults as shown below: Run ./samples/bpf/trace_output BUG pid 9 cookie 1001000000004 sized 4 BUG pid 9 cookie 1001000000004 sized 4 BUG pid 9 cookie 1001000000004 sized 4 Segmentation fault (core dumped) Fixes: 838d9bb62d13 ("perf: Use sample_flags for raw_data") Signed-off-by: Sumanth Korikkar Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lkml.kernel.org/r/20221007081327.1047552-1-sumanthk@linux.ibm.com --- kernel/trace/bpf_trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 49fb9ec8366d..1ed08967fb97 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -687,6 +687,7 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, perf_sample_data_init(sd, 0, 0); sd->raw = &raw; + sd->sample_flags |= PERF_SAMPLE_RAW; err = __bpf_perf_event_output(regs, map, flags, sd); @@ -745,6 +746,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, perf_fetch_caller_regs(regs); perf_sample_data_init(sd, 0, 0); sd->raw = &raw; + sd->sample_flags |= PERF_SAMPLE_RAW; ret = __bpf_perf_event_output(regs, map, flags, sd); out: -- cgit v1.2.3 From e705968dd687574b6ca3ebe772683d5642759132 Mon Sep 17 00:00:00 2001 From: Lin Shengwang Date: Sat, 8 Oct 2022 10:27:09 +0800 Subject: sched/core: Fix comparison in sched_group_cookie_match() In commit 97886d9dcd86 ("sched: Migration changes for core scheduling"), sched_group_cookie_match() was added to help determine if a cookie matches the core state. However, while it iterates the SMT group, it fails to actually use the RQ for each of the CPUs iterated, use cpu_rq(cpu) instead of rq to fix things. Fixes: 97886d9dcd86 ("sched: Migration changes for core scheduling") Signed-off-by: Lin Shengwang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221008022709.642-1-linshengwang1@huawei.com --- kernel/sched/sched.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1644242ecd11..0d0851127369 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1182,6 +1182,14 @@ static inline bool is_migration_disabled(struct task_struct *p) #endif } +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); + +#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) +#define this_rq() this_cpu_ptr(&runqueues) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#define raw_rq() raw_cpu_ptr(&runqueues) + struct sched_group; #ifdef CONFIG_SCHED_CORE static inline struct cpumask *sched_group_span(struct sched_group *sg); @@ -1269,7 +1277,7 @@ static inline bool sched_group_cookie_match(struct rq *rq, return true; for_each_cpu_and(cpu, sched_group_span(group), p->cpus_ptr) { - if (sched_core_cookie_match(rq, p)) + if (sched_core_cookie_match(cpu_rq(cpu), p)) return true; } return false; @@ -1384,14 +1392,6 @@ static inline void update_idle_core(struct rq *rq) static inline void update_idle_core(struct rq *rq) { } #endif -DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); - -#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -#define this_rq() this_cpu_ptr(&runqueues) -#define task_rq(p) cpu_rq(task_cpu(p)) -#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -#define raw_rq() raw_cpu_ptr(&runqueues) - #ifdef CONFIG_FAIR_GROUP_SCHED static inline struct task_struct *task_of(struct sched_entity *se) { -- cgit v1.2.3 From 8e5bad7dccec2014f24497b57d8a8ee0b752c290 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 7 Oct 2022 17:07:58 -0700 Subject: sched: Introduce struct balance_callback to avoid CFI mismatches Introduce distinct struct balance_callback instead of performing function pointer casting which will trip CFI. Avoids warnings as found by Clang's future -Wcast-function-type-strict option: In file included from kernel/sched/core.c:84: kernel/sched/sched.h:1755:15: warning: cast from 'void (*)(struct rq *)' to 'void (*)(struct callback_head *)' converts to incompatible function type [-Wcast-function-type-strict] head->func = (void (*)(struct callback_head *))func; ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ No binary differences result from this change. This patch is a cleanup based on Brad Spengler/PaX Team's modifications to sched code in their last public patch of grsecurity/PaX based on my understanding of the code. Changes or omissions from the original code are mine and don't reflect the original grsecurity/PaX code. Reported-by: Sami Tolvanen Signed-off-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Nathan Chancellor Link: https://github.com/ClangBuiltLinux/linux/issues/1724 Link: https://lkml.kernel.org/r/20221008000758.2957718-1-keescook@chromium.org --- kernel/sched/core.c | 24 ++++++++++++------------ kernel/sched/deadline.c | 4 ++-- kernel/sched/rt.c | 4 ++-- kernel/sched/sched.h | 14 ++++++++++---- 4 files changed, 26 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5800b0623ff3..cb2aa2b54c7a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4823,10 +4823,10 @@ static inline void finish_task(struct task_struct *prev) #ifdef CONFIG_SMP -static void do_balance_callbacks(struct rq *rq, struct callback_head *head) +static void do_balance_callbacks(struct rq *rq, struct balance_callback *head) { void (*func)(struct rq *rq); - struct callback_head *next; + struct balance_callback *next; lockdep_assert_rq_held(rq); @@ -4853,15 +4853,15 @@ static void balance_push(struct rq *rq); * This abuse is tolerated because it places all the unlikely/odd cases behind * a single test, namely: rq->balance_callback == NULL. */ -struct callback_head balance_push_callback = { +struct balance_callback balance_push_callback = { .next = NULL, - .func = (void (*)(struct callback_head *))balance_push, + .func = balance_push, }; -static inline struct callback_head * +static inline struct balance_callback * __splice_balance_callbacks(struct rq *rq, bool split) { - struct callback_head *head = rq->balance_callback; + struct balance_callback *head = rq->balance_callback; if (likely(!head)) return NULL; @@ -4883,7 +4883,7 @@ __splice_balance_callbacks(struct rq *rq, bool split) return head; } -static inline struct callback_head *splice_balance_callbacks(struct rq *rq) +static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) { return __splice_balance_callbacks(rq, true); } @@ -4893,7 +4893,7 @@ static void __balance_callbacks(struct rq *rq) do_balance_callbacks(rq, __splice_balance_callbacks(rq, false)); } -static inline void balance_callbacks(struct rq *rq, struct callback_head *head) +static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) { unsigned long flags; @@ -4910,12 +4910,12 @@ static inline void __balance_callbacks(struct rq *rq) { } -static inline struct callback_head *splice_balance_callbacks(struct rq *rq) +static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) { return NULL; } -static inline void balance_callbacks(struct rq *rq, struct callback_head *head) +static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) { } @@ -6188,7 +6188,7 @@ static void sched_core_balance(struct rq *rq) preempt_enable(); } -static DEFINE_PER_CPU(struct callback_head, core_balance_head); +static DEFINE_PER_CPU(struct balance_callback, core_balance_head); static void queue_core_balance(struct rq *rq) { @@ -7419,7 +7419,7 @@ static int __sched_setscheduler(struct task_struct *p, int oldpolicy = -1, policy = attr->sched_policy; int retval, oldprio, newprio, queued, running; const struct sched_class *prev_class; - struct callback_head *head; + struct balance_callback *head; struct rq_flags rf; int reset_on_fork; int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 86dea6a05267..9ae8f41e3372 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -644,8 +644,8 @@ static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) return rq->online && dl_task(prev); } -static DEFINE_PER_CPU(struct callback_head, dl_push_head); -static DEFINE_PER_CPU(struct callback_head, dl_pull_head); +static DEFINE_PER_CPU(struct balance_callback, dl_push_head); +static DEFINE_PER_CPU(struct balance_callback, dl_pull_head); static void push_dl_tasks(struct rq *); static void pull_dl_task(struct rq *); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index d869bcf898cc..ed2a47e4ddae 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -410,8 +410,8 @@ static inline int has_pushable_tasks(struct rq *rq) return !plist_head_empty(&rq->rt.pushable_tasks); } -static DEFINE_PER_CPU(struct callback_head, rt_push_head); -static DEFINE_PER_CPU(struct callback_head, rt_pull_head); +static DEFINE_PER_CPU(struct balance_callback, rt_push_head); +static DEFINE_PER_CPU(struct balance_callback, rt_pull_head); static void push_rt_tasks(struct rq *); static void pull_rt_task(struct rq *); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0d0851127369..a4a20046e586 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -938,6 +938,12 @@ struct uclamp_rq { DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); #endif /* CONFIG_UCLAMP_TASK */ +struct rq; +struct balance_callback { + struct balance_callback *next; + void (*func)(struct rq *rq); +}; + /* * This is the main, per-CPU runqueue data structure. * @@ -1036,7 +1042,7 @@ struct rq { unsigned long cpu_capacity; unsigned long cpu_capacity_orig; - struct callback_head *balance_callback; + struct balance_callback *balance_callback; unsigned char nohz_idle_balance; unsigned char idle_balance; @@ -1544,7 +1550,7 @@ struct rq_flags { #endif }; -extern struct callback_head balance_push_callback; +extern struct balance_callback balance_push_callback; /* * Lockdep annotation that avoids accidental unlocks; it's like a @@ -1724,7 +1730,7 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) static inline void queue_balance_callback(struct rq *rq, - struct callback_head *head, + struct balance_callback *head, void (*func)(struct rq *rq)) { lockdep_assert_rq_held(rq); @@ -1737,7 +1743,7 @@ queue_balance_callback(struct rq *rq, if (unlikely(head->next || rq->balance_callback == &balance_push_callback)) return; - head->func = (void (*)(struct callback_head *))func; + head->func = func; head->next = rq->balance_callback; rq->balance_callback = head; } -- cgit v1.2.3 From ea68376c8bed5cd156900852aada20c3a0874d17 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 14 Oct 2022 17:24:44 -0700 Subject: bpf: prevent decl_tag from being referenced in func_proto Syzkaller was able to hit the following issue: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 3609 at kernel/bpf/btf.c:1946 btf_type_id_size+0x2d5/0x9d0 kernel/bpf/btf.c:1946 Modules linked in: CPU: 0 PID: 3609 Comm: syz-executor361 Not tainted 6.0.0-syzkaller-02734-g0326074ff465 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/22/2022 RIP: 0010:btf_type_id_size+0x2d5/0x9d0 kernel/bpf/btf.c:1946 Code: ef e8 7f 8e e4 ff 41 83 ff 0b 77 28 f6 44 24 10 18 75 3f e8 6d 91 e4 ff 44 89 fe bf 0e 00 00 00 e8 20 8e e4 ff e8 5b 91 e4 ff <0f> 0b 45 31 f6 e9 98 02 00 00 41 83 ff 12 74 18 e8 46 91 e4 ff 44 RSP: 0018:ffffc90003cefb40 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000000000002 RCX: 0000000000000000 RDX: ffff8880259c0000 RSI: ffffffff81968415 RDI: 0000000000000005 RBP: ffff88801270ca00 R08: 0000000000000005 R09: 000000000000000e R10: 0000000000000011 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000011 R14: ffff888026ee6424 R15: 0000000000000011 FS: 000055555641b300(0000) GS:ffff8880b9a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000f2e258 CR3: 000000007110e000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: btf_func_proto_check kernel/bpf/btf.c:4447 [inline] btf_check_all_types kernel/bpf/btf.c:4723 [inline] btf_parse_type_sec kernel/bpf/btf.c:4752 [inline] btf_parse kernel/bpf/btf.c:5026 [inline] btf_new_fd+0x1926/0x1e70 kernel/bpf/btf.c:6892 bpf_btf_load kernel/bpf/syscall.c:4324 [inline] __sys_bpf+0xb7d/0x4cf0 kernel/bpf/syscall.c:5010 __do_sys_bpf kernel/bpf/syscall.c:5069 [inline] __se_sys_bpf kernel/bpf/syscall.c:5067 [inline] __x64_sys_bpf+0x75/0xb0 kernel/bpf/syscall.c:5067 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f0fbae41c69 Code: 28 c3 e8 2a 14 00 00 66 2e 0f 1f 84 00 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 c0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffc8aeb6228 EFLAGS: 00000246 ORIG_RAX: 0000000000000141 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f0fbae41c69 RDX: 0000000000000020 RSI: 0000000020000140 RDI: 0000000000000012 RBP: 00007f0fbae05e10 R08: 0000000000000000 R09: 0000000000000000 R10: 00000000ffffffff R11: 0000000000000246 R12: 00007f0fbae05ea0 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Looks like it tries to create a func_proto which return type is decl_tag. For the details, see Martin's spot on analysis in [0]. 0: https://lore.kernel.org/bpf/CAKH8qBuQDLva_hHxxBuZzyAcYNO4ejhovz6TQeVSk8HY-2SO6g@mail.gmail.com/T/#mea6524b3fcd6298347432226e81b1e6155efc62c Cc: Yonghong Song Cc: Martin KaFai Lau Fixes: bd16dee66ae4 ("bpf: Add BTF_KIND_DECL_TAG typedef support") Reported-by: syzbot+d8bd751aef7c6b39a344@syzkaller.appspotmail.com Signed-off-by: Stanislav Fomichev Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20221015002444.2680969-2-sdf@google.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/btf.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index eba603cec2c5..35c07afac924 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4436,6 +4436,11 @@ static int btf_func_proto_check(struct btf_verifier_env *env, return -EINVAL; } + if (btf_type_is_resolve_source_only(ret_type)) { + btf_verifier_log_type(env, t, "Invalid return type"); + return -EINVAL; + } + if (btf_type_needs_resolve(ret_type) && !env_type_is_resolved(env, ret_type_id)) { err = btf_resolve(env, ret_type, ret_type_id); -- cgit v1.2.3 From 60a9bb9048f9e95029df10a9bc346f6b066c593c Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Wed, 19 Oct 2022 11:36:00 +0800 Subject: blktrace: introduce 'blk_trace_{start,stop}' helper Introduce 'blk_trace_{start,stop}' helper. No functional changed. Signed-off-by: Ye Bin Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221019033602.752383-2-yebin@huaweicloud.com Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 74 ++++++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7f5eb295fe19..50b6f241b5f7 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -346,6 +346,37 @@ static void put_probe_ref(void) mutex_unlock(&blk_probe_mutex); } +static int blk_trace_start(struct blk_trace *bt) +{ + if (bt->trace_state != Blktrace_setup && + bt->trace_state != Blktrace_stopped) + return -EINVAL; + + blktrace_seq++; + smp_mb(); + bt->trace_state = Blktrace_running; + raw_spin_lock_irq(&running_trace_lock); + list_add(&bt->running_list, &running_trace_list); + raw_spin_unlock_irq(&running_trace_lock); + trace_note_time(bt); + + return 0; +} + +static int blk_trace_stop(struct blk_trace *bt) +{ + if (bt->trace_state != Blktrace_running) + return -EINVAL; + + bt->trace_state = Blktrace_stopped; + raw_spin_lock_irq(&running_trace_lock); + list_del_init(&bt->running_list); + raw_spin_unlock_irq(&running_trace_lock); + relay_flush(bt->rchan); + + return 0; +} + static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt) { synchronize_rcu(); @@ -658,7 +689,6 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, static int __blk_trace_startstop(struct request_queue *q, int start) { - int ret; struct blk_trace *bt; bt = rcu_dereference_protected(q->blk_trace, @@ -666,36 +696,10 @@ static int __blk_trace_startstop(struct request_queue *q, int start) if (bt == NULL) return -EINVAL; - /* - * For starting a trace, we can transition from a setup or stopped - * trace. For stopping a trace, the state must be running - */ - ret = -EINVAL; - if (start) { - if (bt->trace_state == Blktrace_setup || - bt->trace_state == Blktrace_stopped) { - blktrace_seq++; - smp_mb(); - bt->trace_state = Blktrace_running; - raw_spin_lock_irq(&running_trace_lock); - list_add(&bt->running_list, &running_trace_list); - raw_spin_unlock_irq(&running_trace_lock); - - trace_note_time(bt); - ret = 0; - } - } else { - if (bt->trace_state == Blktrace_running) { - bt->trace_state = Blktrace_stopped; - raw_spin_lock_irq(&running_trace_lock); - list_del_init(&bt->running_list); - raw_spin_unlock_irq(&running_trace_lock); - relay_flush(bt->rchan); - ret = 0; - } - } - - return ret; + if (start) + return blk_trace_start(bt); + else + return blk_trace_stop(bt); } int blk_trace_startstop(struct request_queue *q, int start) @@ -1614,13 +1618,7 @@ static int blk_trace_remove_queue(struct request_queue *q) if (bt == NULL) return -EINVAL; - if (bt->trace_state == Blktrace_running) { - bt->trace_state = Blktrace_stopped; - raw_spin_lock_irq(&running_trace_lock); - list_del_init(&bt->running_list); - raw_spin_unlock_irq(&running_trace_lock); - relay_flush(bt->rchan); - } + blk_trace_stop(bt); put_probe_ref(); synchronize_rcu(); -- cgit v1.2.3 From dcd1a59c62dc49da75539213611156d6db50ab5d Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Wed, 19 Oct 2022 11:36:01 +0800 Subject: blktrace: fix possible memleak in '__blk_trace_remove' When test as follows: step1: ioctl(sda, BLKTRACESETUP, &arg) step2: ioctl(sda, BLKTRACESTART, NULL) step3: ioctl(sda, BLKTRACETEARDOWN, NULL) step4: ioctl(sda, BLKTRACESETUP, &arg) Got issue as follows: debugfs: File 'dropped' in directory 'sda' already present! debugfs: File 'msg' in directory 'sda' already present! debugfs: File 'trace0' in directory 'sda' already present! And also find syzkaller report issue like "KASAN: use-after-free Read in relay_switch_subbuf" "https://syzkaller.appspot.com/bug?id=13849f0d9b1b818b087341691be6cc3ac6a6bfb7" If remove block trace without stop(BLKTRACESTOP) block trace, '__blk_trace_remove' will just set 'q->blk_trace' with NULL. However, debugfs file isn't removed, so will report file already present when call BLKTRACESETUP. static int __blk_trace_remove(struct request_queue *q) { struct blk_trace *bt; bt = rcu_replace_pointer(q->blk_trace, NULL, lockdep_is_held(&q->debugfs_mutex)); if (!bt) return -EINVAL; if (bt->trace_state != Blktrace_running) blk_trace_cleanup(q, bt); return 0; } If do test as follows: step1: ioctl(sda, BLKTRACESETUP, &arg) step2: ioctl(sda, BLKTRACESTART, NULL) step3: ioctl(sda, BLKTRACETEARDOWN, NULL) step4: remove sda There will remove debugfs directory which will remove recursively all file under directory. >> blk_release_queue >> debugfs_remove_recursive(q->debugfs_dir) So all files which created in 'do_blk_trace_setup' are removed, and 'dentry->d_inode' is NULL. But 'q->blk_trace' is still in 'running_trace_lock', 'trace_note_tsk' will traverse 'running_trace_lock' all nodes. >>trace_note_tsk >> trace_note >> relay_reserve >> relay_switch_subbuf >> d_inode(buf->dentry)->i_size To solve above issues, reference commit '5afedf670caf', call 'blk_trace_cleanup' unconditionally in '__blk_trace_remove' and first stop block trace in 'blk_trace_cleanup'. Signed-off-by: Ye Bin Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221019033602.752383-3-yebin@huaweicloud.com Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 50b6f241b5f7..e17bba027a2c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -379,6 +379,7 @@ static int blk_trace_stop(struct blk_trace *bt) static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt) { + blk_trace_stop(bt); synchronize_rcu(); blk_trace_free(q, bt); put_probe_ref(); @@ -393,8 +394,7 @@ static int __blk_trace_remove(struct request_queue *q) if (!bt) return -EINVAL; - if (bt->trace_state != Blktrace_running) - blk_trace_cleanup(q, bt); + blk_trace_cleanup(q, bt); return 0; } -- cgit v1.2.3 From 2db96217e7e515071726ca4ec791742c4202a1b2 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Wed, 19 Oct 2022 11:36:02 +0800 Subject: blktrace: remove unnessary stop block trace in 'blk_trace_shutdown' As previous commit, 'blk_trace_cleanup' will stop block trace if block trace's state is 'Blktrace_running'. So remove unnessary stop block trace in 'blk_trace_shutdown'. Signed-off-by: Ye Bin Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221019033602.752383-4-yebin@huaweicloud.com Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e17bba027a2c..a995ea1ef849 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -776,10 +776,8 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) void blk_trace_shutdown(struct request_queue *q) { if (rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->debugfs_mutex))) { - __blk_trace_startstop(q, 0); + lockdep_is_held(&q->debugfs_mutex))) __blk_trace_remove(q); - } } #ifdef CONFIG_BLK_CGROUP -- cgit v1.2.3 From 31d8aaa87fcef1be5932f3813ea369e21bd3b11d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 20 Oct 2022 10:58:14 -0700 Subject: rcu: Keep synchronize_rcu() from enabling irqs in early boot Making polled RCU grace periods account for expedited grace periods required acquiring the leaf rcu_node structure's lock during early boot, but after rcu_init() was called. This lock is irq-disabled, but the code incorrectly assumes that irqs are always disabled when invoking synchronize_rcu(). The exception is early boot before the scheduler has started, which means that upon return from synchronize_rcu(), irqs will be incorrectly enabled. This commit fixes this bug by using irqsave/irqrestore locking primitives. Fixes: bf95b2bc3e42 ("rcu: Switch polled grace-period APIs to ->gp_seq_polled") Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6bb8e72bc815..93416afebd59 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1403,30 +1403,32 @@ static void rcu_poll_gp_seq_end(unsigned long *snap) // where caller does not hold the root rcu_node structure's lock. static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap) { + unsigned long flags; struct rcu_node *rnp = rcu_get_root(); if (rcu_init_invoked()) { lockdep_assert_irqs_enabled(); - raw_spin_lock_irq_rcu_node(rnp); + raw_spin_lock_irqsave_rcu_node(rnp, flags); } rcu_poll_gp_seq_start(snap); if (rcu_init_invoked()) - raw_spin_unlock_irq_rcu_node(rnp); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } // Make the polled API aware of the end of a grace period, but where // caller does not hold the root rcu_node structure's lock. static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap) { + unsigned long flags; struct rcu_node *rnp = rcu_get_root(); if (rcu_init_invoked()) { lockdep_assert_irqs_enabled(); - raw_spin_lock_irq_rcu_node(rnp); + raw_spin_lock_irqsave_rcu_node(rnp, flags); } rcu_poll_gp_seq_end(snap); if (rcu_init_invoked()) - raw_spin_unlock_irq_rcu_node(rnp); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* -- cgit v1.2.3 From dbe69b29988465b011f198f2797b1c2b6980b50e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 18 Oct 2022 09:59:34 +0200 Subject: bpf: Fix dispatcher patchable function entry to 5 bytes nop The patchable_function_entry(5) might output 5 single nop instructions (depends on toolchain), which will clash with bpf_arch_text_poke check for 5 bytes nop instruction. Adding early init call for dispatcher that checks and change the patchable entry into expected 5 nop instruction if needed. There's no need to take text_mutex, because we are using it in early init call which is called at pre-smp time. Fixes: ceea991a019c ("bpf: Move bpf_dispatcher function out of ftrace locations") Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20221018075934.574415-1-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 13 +++++++++++++ include/linux/bpf.h | 14 +++++++++++++- kernel/bpf/dispatcher.c | 6 ++++++ 3 files changed, 32 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 99620428ad78..00127abd89ee 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -388,6 +389,18 @@ out: return ret; } +int __init bpf_arch_init_dispatcher_early(void *ip) +{ + const u8 *nop_insn = x86_nops[5]; + + if (is_endbr(*(u32 *)ip)) + ip += ENDBR_INSN_SIZE; + + if (memcmp(ip, nop_insn, X86_PATCH_SIZE)) + text_poke_early(ip, nop_insn, X86_PATCH_SIZE); + return 0; +} + int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *old_addr, void *new_addr) { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9e7d46d16032..0566705c1d4e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -27,6 +27,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -970,6 +971,8 @@ struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info); void bpf_trampoline_put(struct bpf_trampoline *tr); int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs); +int __init bpf_arch_init_dispatcher_early(void *ip); + #define BPF_DISPATCHER_INIT(_name) { \ .mutex = __MUTEX_INITIALIZER(_name.mutex), \ .func = &_name##_func, \ @@ -983,6 +986,13 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func }, \ } +#define BPF_DISPATCHER_INIT_CALL(_name) \ + static int __init _name##_init(void) \ + { \ + return bpf_arch_init_dispatcher_early(_name##_func); \ + } \ + early_initcall(_name##_init) + #ifdef CONFIG_X86_64 #define BPF_DISPATCHER_ATTRIBUTES __attribute__((patchable_function_entry(5))) #else @@ -1000,7 +1010,9 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func } \ EXPORT_SYMBOL(bpf_dispatcher_##name##_func); \ struct bpf_dispatcher bpf_dispatcher_##name = \ - BPF_DISPATCHER_INIT(bpf_dispatcher_##name); + BPF_DISPATCHER_INIT(bpf_dispatcher_##name); \ + BPF_DISPATCHER_INIT_CALL(bpf_dispatcher_##name); + #define DECLARE_BPF_DISPATCHER(name) \ unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index fa64b80b8bca..04f0a045dcaa 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -4,6 +4,7 @@ #include #include #include +#include /* The BPF dispatcher is a multiway branch code generator. The * dispatcher is a mechanism to avoid the performance penalty of an @@ -90,6 +91,11 @@ int __weak arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int n return -ENOTSUPP; } +int __weak __init bpf_arch_init_dispatcher_early(void *ip) +{ + return -ENOTSUPP; +} + static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image, void *buf) { s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0]; -- cgit v1.2.3 From 977ef30a7d888eeb52fb6908f99080f33e5309a8 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Thu, 13 Oct 2022 09:40:59 +0200 Subject: gcov: support GCC 12.1 and newer compilers Starting with GCC 12.1, the created .gcda format can't be read by gcov tool. There are 2 significant changes to the .gcda file format that need to be supported: a) [gcov: Use system IO buffering] (23eb66d1d46a34cb28c4acbdf8a1deb80a7c5a05) changed that all sizes in the format are in bytes and not in words (4B) b) [gcov: make profile merging smarter] (72e0c742bd01f8e7e6dcca64042b9ad7e75979de) add a new checksum to the file header. Tested with GCC 7.5, 10.4, 12.2 and the current master. Link: https://lkml.kernel.org/r/624bda92-f307-30e9-9aaa-8cc678b2dfb2@suse.cz Signed-off-by: Martin Liska Tested-by: Peter Oberparleiter Reviewed-by: Peter Oberparleiter Cc: Signed-off-by: Andrew Morton --- kernel/gcov/gcc_4_7.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 460c12b7dfea..7971e989e425 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -30,6 +30,13 @@ #define GCOV_TAG_FUNCTION_LENGTH 3 +/* Since GCC 12.1 sizes are in BYTES and not in WORDS (4B). */ +#if (__GNUC__ >= 12) +#define GCOV_UNIT_SIZE 4 +#else +#define GCOV_UNIT_SIZE 1 +#endif + static struct gcov_info *gcov_info_head; /** @@ -383,12 +390,18 @@ size_t convert_to_gcda(char *buffer, struct gcov_info *info) pos += store_gcov_u32(buffer, pos, info->version); pos += store_gcov_u32(buffer, pos, info->stamp); +#if (__GNUC__ >= 12) + /* Use zero as checksum of the compilation unit. */ + pos += store_gcov_u32(buffer, pos, 0); +#endif + for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) { fi_ptr = info->functions[fi_idx]; /* Function record. */ pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION); - pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION_LENGTH); + pos += store_gcov_u32(buffer, pos, + GCOV_TAG_FUNCTION_LENGTH * GCOV_UNIT_SIZE); pos += store_gcov_u32(buffer, pos, fi_ptr->ident); pos += store_gcov_u32(buffer, pos, fi_ptr->lineno_checksum); pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); @@ -402,7 +415,8 @@ size_t convert_to_gcda(char *buffer, struct gcov_info *info) /* Counter record. */ pos += store_gcov_u32(buffer, pos, GCOV_TAG_FOR_COUNTER(ct_idx)); - pos += store_gcov_u32(buffer, pos, ci_ptr->num * 2); + pos += store_gcov_u32(buffer, pos, + ci_ptr->num * 2 * GCOV_UNIT_SIZE); for (cv_idx = 0; cv_idx < ci_ptr->num; cv_idx++) { pos += store_gcov_u64(buffer, pos, -- cgit v1.2.3 From 3d05818707bb2cf133ccdcd29f2d5585b5bc1298 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 21 Oct 2022 19:49:12 +0800 Subject: bpf: Wait for busy refill_work when destroying bpf memory allocator A busy irq work is an unfinished irq work and it can be either in the pending state or in the running state. When destroying bpf memory allocator, refill_work may be busy for PREEMPT_RT kernel in which irq work is invoked in a per-CPU RT-kthread. It is also possible for kernel with arch_irq_work_has_interrupt() being false (e.g. 1-cpu arm32 host or mips) and irq work is inovked in timer interrupt. The busy refill_work leads to various issues. The obvious one is that there will be concurrent operations on free_by_rcu and free_list between irq work and memory draining. Another one is call_rcu_in_progress will not be reliable for the checking of pending RCU callback because do_call_rcu() may have not been invoked by irq work yet. The other is there will be use-after-free if irq work is freed before the callback of irq work is invoked as shown below: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor instruction fetch in kernel mode #PF: error_code(0x0010) - not-present page PGD 12ab94067 P4D 12ab94067 PUD 1796b4067 PMD 0 Oops: 0010 [#1] PREEMPT_RT SMP CPU: 5 PID: 64 Comm: irq_work/5 Not tainted 6.0.0-rt11+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) RIP: 0010:0x0 Code: Unable to access opcode bytes at 0xffffffffffffffd6. RSP: 0018:ffffadc080293e78 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffffcdc07fb6a388 RCX: ffffa05000a2e000 RDX: ffffa05000a2e000 RSI: ffffffff96cc9827 RDI: ffffcdc07fb6a388 ...... Call Trace: irq_work_single+0x24/0x60 irq_work_run_list+0x24/0x30 run_irq_workd+0x23/0x30 smpboot_thread_fn+0x203/0x300 kthread+0x126/0x150 ret_from_fork+0x1f/0x30 Considering the ease of concurrency handling, no overhead for irq_work_sync() under non-PREEMPT_RT kernel and has-irq-work-interrupt kernel and the short wait time used for irq_work_sync() under PREEMPT_RT (When running two test_maps on PREEMPT_RT kernel and 72-cpus host, the max wait time is about 8ms and the 99th percentile is 10us), just using irq_work_sync() to wait for busy refill_work to complete before memory draining and memory freeing. Fixes: 7c8199e24fa0 ("bpf: Introduce any context BPF specific memory allocator.") Acked-by: Stanislav Fomichev Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20221021114913.60508-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 5f83be1d2018..1cb24323826f 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -493,6 +493,16 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) rcu_in_progress = 0; for_each_possible_cpu(cpu) { c = per_cpu_ptr(ma->cache, cpu); + /* + * refill_work may be unfinished for PREEMPT_RT kernel + * in which irq work is invoked in a per-CPU RT thread. + * It is also possible for kernel with + * arch_irq_work_has_interrupt() being false and irq + * work is invoked in timer interrupt. So waiting for + * the completion of irq work to ease the handling of + * concurrency. + */ + irq_work_sync(&c->refill_work); drain_mem_cache(c); rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } @@ -507,6 +517,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) cc = per_cpu_ptr(ma->caches, cpu); for (i = 0; i < NUM_CACHES; i++) { c = &cc->cache[i]; + irq_work_sync(&c->refill_work); drain_mem_cache(c); rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } -- cgit v1.2.3 From fa4447cb73b2bfe7175f1b7ffdc70580fcfcb991 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 21 Oct 2022 19:49:13 +0800 Subject: bpf: Use __llist_del_all() whenever possbile during memory draining Except for waiting_for_gp list, there are no concurrent operations on free_by_rcu, free_llist and free_llist_extra lists, so use __llist_del_all() instead of llist_del_all(). waiting_for_gp list can be deleted by RCU callback concurrently, so still use llist_del_all(). Acked-by: Stanislav Fomichev Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20221021114913.60508-3-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 1cb24323826f..4901fa1048cd 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -418,14 +418,17 @@ static void drain_mem_cache(struct bpf_mem_cache *c) /* No progs are using this bpf_mem_cache, but htab_map_free() called * bpf_mem_cache_free() for all remaining elements and they can be in * free_by_rcu or in waiting_for_gp lists, so drain those lists now. + * + * Except for waiting_for_gp list, there are no concurrent operations + * on these lists, so it is safe to use __llist_del_all(). */ llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu)) free_one(c, llnode); llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp)) free_one(c, llnode); - llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist)) + llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist)) free_one(c, llnode); - llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra)) + llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist_extra)) free_one(c, llnode); } -- cgit v1.2.3 From 52826d3b2d1d8e1180a84bef7d72596d6a024a38 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 23 Oct 2022 12:01:01 -0700 Subject: kernel/utsname_sysctl.c: Fix hostname polling Commit bfca3dd3d068 ("kernel/utsname_sysctl.c: print kernel arch") added a new entry to the uts_kern_table[] array, but didn't update the UTS_PROC_xyz enumerators of older entries, breaking anything that used them. Which is admittedly not many cases: it's really just the two uses of uts_proc_notify() in kernel/sys.c. But apparently journald-systemd actually uses this to detect hostname changes. Reported-by: Torsten Hilbrich Fixes: bfca3dd3d068 ("kernel/utsname_sysctl.c: print kernel arch") Link: https://lore.kernel.org/lkml/0c2b92a6-0f25-9538-178f-eee3b06da23f@secunet.com/ Link: https://linux-regtracking.leemhuis.info/regzbot/regression/0c2b92a6-0f25-9538-178f-eee3b06da23f@secunet.com/ Cc: Petr Vorel Signed-off-by: Linus Torvalds --- include/linux/utsname.h | 1 + kernel/utsname_sysctl.c | 1 + 2 files changed, 2 insertions(+) (limited to 'kernel') diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 2b1737c9b244..bf7613ba412b 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -10,6 +10,7 @@ #include enum uts_proc { + UTS_PROC_ARCH, UTS_PROC_OSTYPE, UTS_PROC_OSRELEASE, UTS_PROC_VERSION, diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 064072c16e3d..f50398cb790d 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -74,6 +74,7 @@ static int proc_do_uts_string(struct ctl_table *table, int write, static DEFINE_CTL_TABLE_POLL(hostname_poll); static DEFINE_CTL_TABLE_POLL(domainname_poll); +// Note: update 'enum uts_proc' to match any changes to this table static struct ctl_table uts_kern_table[] = { { .procname = "arch", -- cgit v1.2.3