From 22b6d14992b733e9421a475f4d43df24629737ab Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 4 Jun 2021 12:37:44 -0700 Subject: scftorture: Avoid false-positive warnings in scftorture_invoker() If the call to set_cpus_allowed_ptr() in scftorture_invoker() fails, a later WARN_ONCE() complains. But with the advent of 570a752b7a9b ("lib/smp_processor_id: Use is_percpu_thread() instead of nr_cpus_allowed"), this complaint can be drowned out by complaints from smp_processor_id(). The rationale for this change is that scftorture's kthreads are not marked with PF_NO_SETAFFINITY, which means that a system administrator could change affinity at any time. However, scftorture is a torture test, and the system administrator might well have a valid test-the-test reason for changing affinity. This commit therefore changes to raw_smp_processor_id() in order to avoid the noise, and also adds a WARN_ON_ONCE() to the call to set_cpus_allowed_ptr() in order to directly detect immediate failure. There is no WARN_ON_ONCE() within the test loop, allowing human-reflex-based affinity resetting, if desired. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 2377cbb32474..29e8fc5d91a7 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -405,15 +405,15 @@ static int scftorture_invoker(void *arg) VERBOSE_SCFTORTOUT("scftorture_invoker %d: task started", scfp->cpu); cpu = scfp->cpu % nr_cpu_ids; - set_cpus_allowed_ptr(current, cpumask_of(cpu)); + WARN_ON_ONCE(set_cpus_allowed_ptr(current, cpumask_of(cpu))); set_user_nice(current, MAX_NICE); if (holdoff) schedule_timeout_interruptible(holdoff * HZ); - VERBOSE_SCFTORTOUT("scftorture_invoker %d: Waiting for all SCF torturers from cpu %d", scfp->cpu, smp_processor_id()); + VERBOSE_SCFTORTOUT("scftorture_invoker %d: Waiting for all SCF torturers from cpu %d", scfp->cpu, raw_smp_processor_id()); // Make sure that the CPU is affinitized appropriately during testing. - curcpu = smp_processor_id(); + curcpu = raw_smp_processor_id(); WARN_ONCE(curcpu != scfp->cpu % nr_cpu_ids, "%s: Wanted CPU %d, running on %d, nr_cpu_ids = %d\n", __func__, scfp->cpu, curcpu, nr_cpu_ids); -- cgit v1.2.3 From 05bc276cf243d90b9f1eb6ae2962f41eeb53a741 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Jun 2021 09:24:43 -0700 Subject: refscale: Avoid false-positive warnings in ref_scale_reader() If the call to set_cpus_allowed_ptr() in ref_scale_reader() fails, a later WARN_ONCE() complains. But with the advent of 570a752b7a9b ("lib/smp_processor_id: Use is_percpu_thread() instead of nr_cpus_allowed"), this complaint can be drowned out by complaints from smp_processor_id(). The rationale for this change is that refscale's kthreads are not marked with PF_NO_SETAFFINITY, which means that a system administrator could change affinity at any time. However, refscale is a performance/stress test, and the system administrator might well have a valid test-the-test reason for changing affinity. This commit therefore changes to raw_smp_processor_id() in order to avoid the noise, and also adds a WARN_ON_ONCE() to the call to set_cpus_allowed_ptr() in order to directly detect immediate failure. There is no WARN_ON_ONCE() within the test loop, allowing human-reflex-based affinity resetting, if desired. Signed-off-by: Paul E. McKenney --- kernel/rcu/refscale.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 313d4547cbc7..d998a76fb542 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -487,13 +487,13 @@ ref_scale_reader(void *arg) s64 duration; VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: task started", me); - set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + WARN_ON_ONCE(set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids))); set_user_nice(current, MAX_NICE); atomic_inc(&n_init); if (holdoff) schedule_timeout_interruptible(holdoff * HZ); repeat: - VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); + VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, raw_smp_processor_id()); // Wait for signal that this reader can start. wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) || @@ -503,7 +503,7 @@ repeat: goto end; // Make sure that the CPU is affinitized appropriately during testing. - WARN_ON_ONCE(smp_processor_id() != me); + WARN_ON_ONCE(raw_smp_processor_id() != me); WRITE_ONCE(rt->start_reader, 0); if (!atomic_dec_return(&n_started)) -- cgit v1.2.3 From 1d10bf55d85d34eb73dd8263635f43fd72135d2d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 25 May 2021 10:12:45 -0700 Subject: rcu-tasks: Don't delete holdouts within trc_inspect_reader() As Yanfei pointed out, although invoking trc_del_holdout() is safe from the viewpoint of the integrity of the holdout list itself, the put_task_struct() invoked by trc_del_holdout() can result in use-after-free errors due to later accesses to this task_struct structure by the RCU Tasks Trace grace-period kthread. This commit therefore removes this call to trc_del_holdout() from trc_inspect_reader() in favor of the grace-period thread's existing call to trc_del_holdout(), thus eliminating that particular class of use-after-free errors. Reported-by: "Xu, Yanfei" Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 03a118d1c003..3d5cb6cb8a6d 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -953,10 +953,9 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) in_qs = likely(!t->trc_reader_nesting); } - // Mark as checked. Because this is called from the grace-period - // kthread, also remove the task from the holdout list. + // Mark as checked so that the grace-period kthread will + // remove it from the holdout list. t->trc_reader_checked = true; - trc_del_holdout(t); if (in_qs) return true; // Already in quiescent state, done!!! -- cgit v1.2.3 From a9ab9cce9367a2cc02a3c7eb57a004dc0b8f380d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 25 May 2021 11:28:40 -0700 Subject: rcu-tasks: Don't delete holdouts within trc_wait_for_one_reader() Invoking trc_del_holdout() from within trc_wait_for_one_reader() is only a performance optimization because the RCU Tasks Trace grace-period kthread will eventually do this within check_all_holdout_tasks_trace(). But it is not a particularly important performance optimization because it only applies to the grace-period kthread, of which there is but one. This commit therefore removes this invocation of trc_del_holdout() in favor of the one in check_all_holdout_tasks_trace() in the grace-period kthread. Reported-by: "Xu, Yanfei" Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 3d5cb6cb8a6d..8536c55df514 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -982,7 +982,6 @@ static void trc_wait_for_one_reader(struct task_struct *t, // The current task had better be in a quiescent state. if (t == current) { t->trc_reader_checked = true; - trc_del_holdout(t); WARN_ON_ONCE(t->trc_reader_nesting); return; } -- cgit v1.2.3 From 2a2ed5618a0e8a890d948b88b368c0459f35136c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 4 Jul 2021 13:59:35 -0700 Subject: rcu: Fix pr_info() formats and values in show_rcu_gp_kthreads() This commit changes from "%lx" to "%x" and from "0x1ffffL" to "0x1ffff" to match the change in type between the old field ->state (unsigned long) and the new field ->__state (unsigned int). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 3f937b20814f..6c76988cc019 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -795,9 +795,9 @@ void show_rcu_gp_kthreads(void) jr = j - data_race(rcu_state.gp_req_activity); js = j - data_race(rcu_state.gp_start); jw = j - data_race(rcu_state.gp_wake_time); - pr_info("%s: wait state: %s(%d) ->state: %#lx ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n", + pr_info("%s: wait state: %s(%d) ->state: %#x ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n", rcu_state.name, gp_state_getname(rcu_state.gp_state), - rcu_state.gp_state, t ? t->__state : 0x1ffffL, t ? t->rt_priority : 0xffU, + rcu_state.gp_state, t ? t->__state : 0x1ffff, t ? t->rt_priority : 0xffU, js, ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq), (long)data_race(rcu_state.gp_seq), (long)data_race(rcu_get_root()->gp_seq_needed), -- cgit v1.2.3 From 1adee589cd6da2ead7f1b5dd82419eac59a2e2b0 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 12 Jul 2021 11:03:35 -0500 Subject: kernel: debug: Fix unreachable code in gdb_serial_stub() Fix the following warning: kernel/debug/gdbstub.c:1049:4: warning: fallthrough annotation in unreachable code [-Wimplicit-fallthrough] fallthrough; ^ include/linux/compiler_attributes.h:210:41: note: expanded from macro 'fallthrough' # define fallthrough __attribute__((__fallthrough__) by placing the fallthrough; statement inside ifdeffery. Reported-by: kernel test robot Signed-off-by: Gustavo A. R. Silva --- kernel/debug/gdbstub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 8372897402f4..b6f28fad4307 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1045,8 +1045,8 @@ int gdb_serial_stub(struct kgdb_state *ks) gdb_cmd_detachkill(ks); return DBG_PASS_EVENT; } -#endif fallthrough; +#endif case 'C': /* Exception passing */ tmp = gdb_cmd_exception_pass(ks); if (tmp > 0) -- cgit v1.2.3 From 5dd0a6b8582ffbfa88351949d50eccd5b6694ade Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 12 Jul 2021 22:57:35 +0200 Subject: bpf: Fix tail_call_reachable rejection for interpreter when jit failed During testing of f263a81451c1 ("bpf: Track subprog poke descriptors correctly and fix use-after-free") under various failure conditions, for example, when jit_subprogs() fails and tries to clean up the program to be run under the interpreter, we ran into the following freeze: [...] #127/8 tailcall_bpf2bpf_3:FAIL [...] [ 92.041251] BUG: KASAN: slab-out-of-bounds in ___bpf_prog_run+0x1b9d/0x2e20 [ 92.042408] Read of size 8 at addr ffff88800da67f68 by task test_progs/682 [ 92.043707] [ 92.044030] CPU: 1 PID: 682 Comm: test_progs Tainted: G O 5.13.0-53301-ge6c08cb33a30-dirty #87 [ 92.045542] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1 04/01/2014 [ 92.046785] Call Trace: [ 92.047171] ? __bpf_prog_run_args64+0xc0/0xc0 [ 92.047773] ? __bpf_prog_run_args32+0x8b/0xb0 [ 92.048389] ? __bpf_prog_run_args64+0xc0/0xc0 [ 92.049019] ? ktime_get+0x117/0x130 [...] // few hundred [similar] lines more [ 92.659025] ? ktime_get+0x117/0x130 [ 92.659845] ? __bpf_prog_run_args64+0xc0/0xc0 [ 92.660738] ? __bpf_prog_run_args32+0x8b/0xb0 [ 92.661528] ? __bpf_prog_run_args64+0xc0/0xc0 [ 92.662378] ? print_usage_bug+0x50/0x50 [ 92.663221] ? print_usage_bug+0x50/0x50 [ 92.664077] ? bpf_ksym_find+0x9c/0xe0 [ 92.664887] ? ktime_get+0x117/0x130 [ 92.665624] ? kernel_text_address+0xf5/0x100 [ 92.666529] ? __kernel_text_address+0xe/0x30 [ 92.667725] ? unwind_get_return_address+0x2f/0x50 [ 92.668854] ? ___bpf_prog_run+0x15d4/0x2e20 [ 92.670185] ? ktime_get+0x117/0x130 [ 92.671130] ? __bpf_prog_run_args64+0xc0/0xc0 [ 92.672020] ? __bpf_prog_run_args32+0x8b/0xb0 [ 92.672860] ? __bpf_prog_run_args64+0xc0/0xc0 [ 92.675159] ? ktime_get+0x117/0x130 [ 92.677074] ? lock_is_held_type+0xd5/0x130 [ 92.678662] ? ___bpf_prog_run+0x15d4/0x2e20 [ 92.680046] ? ktime_get+0x117/0x130 [ 92.681285] ? __bpf_prog_run32+0x6b/0x90 [ 92.682601] ? __bpf_prog_run64+0x90/0x90 [ 92.683636] ? lock_downgrade+0x370/0x370 [ 92.684647] ? mark_held_locks+0x44/0x90 [ 92.685652] ? ktime_get+0x117/0x130 [ 92.686752] ? lockdep_hardirqs_on+0x79/0x100 [ 92.688004] ? ktime_get+0x117/0x130 [ 92.688573] ? __cant_migrate+0x2b/0x80 [ 92.689192] ? bpf_test_run+0x2f4/0x510 [ 92.689869] ? bpf_test_timer_continue+0x1c0/0x1c0 [ 92.690856] ? rcu_read_lock_bh_held+0x90/0x90 [ 92.691506] ? __kasan_slab_alloc+0x61/0x80 [ 92.692128] ? eth_type_trans+0x128/0x240 [ 92.692737] ? __build_skb+0x46/0x50 [ 92.693252] ? bpf_prog_test_run_skb+0x65e/0xc50 [ 92.693954] ? bpf_prog_test_run_raw_tp+0x2d0/0x2d0 [ 92.694639] ? __fget_light+0xa1/0x100 [ 92.695162] ? bpf_prog_inc+0x23/0x30 [ 92.695685] ? __sys_bpf+0xb40/0x2c80 [ 92.696324] ? bpf_link_get_from_fd+0x90/0x90 [ 92.697150] ? mark_held_locks+0x24/0x90 [ 92.698007] ? lockdep_hardirqs_on_prepare+0x124/0x220 [ 92.699045] ? finish_task_switch+0xe6/0x370 [ 92.700072] ? lockdep_hardirqs_on+0x79/0x100 [ 92.701233] ? finish_task_switch+0x11d/0x370 [ 92.702264] ? __switch_to+0x2c0/0x740 [ 92.703148] ? mark_held_locks+0x24/0x90 [ 92.704155] ? __x64_sys_bpf+0x45/0x50 [ 92.705146] ? do_syscall_64+0x35/0x80 [ 92.706953] ? entry_SYSCALL_64_after_hwframe+0x44/0xae [...] Turns out that the program rejection from e411901c0b77 ("bpf: allow for tailcalls in BPF subprograms for x64 JIT") is buggy since env->prog->aux->tail_call_reachable is never true. Commit ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT") added a tracker into check_max_stack_depth() which propagates the tail_call_reachable condition throughout the subprograms. This info is then assigned to the subprogram's func[i]->aux->tail_call_reachable. However, in the case of the rejection check upon JIT failure, env->prog->aux->tail_call_reachable is used. func[0]->aux->tail_call_reachable which represents the main program's information did not propagate this to the outer env->prog->aux, though. Add this propagation into check_max_stack_depth() where it needs to belong so that the check can be done reliably. Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT") Fixes: e411901c0b77 ("bpf: allow for tailcalls in BPF subprograms for x64 JIT") Co-developed-by: John Fastabend Signed-off-by: Daniel Borkmann Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Acked-by: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/618c34e3163ad1a36b1e82377576a6081e182f25.1626123173.git.daniel@iogearbox.net --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 42a4063de7cd..9de3c9c3267c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3677,6 +3677,8 @@ continue_func: if (tail_call_reachable) for (j = 0; j < frame; j++) subprog[ret_prog[j]].tail_call_reachable = true; + if (subprog[0].tail_call_reachable) + env->prog->aux->tail_call_reachable = true; /* end of for() loop means the last insn of the 'subprog' * was reached. Doesn't matter whether it was JA or EXIT -- cgit v1.2.3 From 704adfb5a9978462cd861f170201ae2b5e3d3a80 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 15 Jul 2021 00:02:06 -0400 Subject: tracing: Do not reference char * as a string in histograms The histogram logic was allowing events with char * pointers to be used as normal strings. But it was easy to crash the kernel with: # echo 'hist:keys=filename' > events/syscalls/sys_enter_openat/trigger And open some files, and boom! BUG: unable to handle page fault for address: 00007f2ced0c3280 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 1173fa067 P4D 1173fa067 PUD 1171b6067 PMD 1171dd067 PTE 0 Oops: 0000 [#1] PREEMPT SMP CPU: 6 PID: 1810 Comm: cat Not tainted 5.13.0-rc5-test+ #61 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016 RIP: 0010:strlen+0x0/0x20 Code: f6 82 80 2a 0b a9 20 74 11 0f b6 50 01 48 83 c0 01 f6 82 80 2a 0b a9 20 75 ef c3 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 <80> 3f 00 74 10 48 89 f8 48 83 c0 01 80 38 00 75 f7 48 29 f8 c3 RSP: 0018:ffffbdbf81567b50 EFLAGS: 00010246 RAX: 0000000000000003 RBX: ffff93815cdb3800 RCX: ffff9382401a22d0 RDX: 0000000000000100 RSI: 0000000000000000 RDI: 00007f2ced0c3280 RBP: 0000000000000100 R08: ffff9382409ff074 R09: ffffbdbf81567c98 R10: ffff9382409ff074 R11: 0000000000000000 R12: ffff9382409ff074 R13: 0000000000000001 R14: ffff93815a744f00 R15: 00007f2ced0c3280 FS: 00007f2ced0f8580(0000) GS:ffff93825a800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f2ced0c3280 CR3: 0000000107069005 CR4: 00000000001706e0 Call Trace: event_hist_trigger+0x463/0x5f0 ? find_held_lock+0x32/0x90 ? sched_clock_cpu+0xe/0xd0 ? lock_release+0x155/0x440 ? kernel_init_free_pages+0x6d/0x90 ? preempt_count_sub+0x9b/0xd0 ? kernel_init_free_pages+0x6d/0x90 ? get_page_from_freelist+0x12c4/0x1680 ? __rb_reserve_next+0xe5/0x460 ? ring_buffer_lock_reserve+0x12a/0x3f0 event_triggers_call+0x52/0xe0 ftrace_syscall_enter+0x264/0x2c0 syscall_trace_enter.constprop.0+0x1ee/0x210 do_syscall_64+0x1c/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae Where it triggered a fault on strlen(key) where key was the filename. The reason is that filename is a char * to user space, and the histogram code just blindly dereferenced it, with obvious bad results. I originally tried to use strncpy_from_user/kernel_nofault() but found that there's other places that its dereferenced and not worth the effort. Just do not allow "char *" to act like strings. Link: https://lkml.kernel.org/r/20210715000206.025df9d2@rorschach.local.home Cc: Ingo Molnar Cc: Andrew Morton Cc: Masami Hiramatsu Cc: Tzvetomir Stoyanov Cc: stable@vger.kernel.org Acked-by: Namhyung Kim Acked-by: Tom Zanussi Fixes: 79e577cbce4c4 ("tracing: Support string type key properly") Fixes: 5967bd5c4239 ("tracing: Let filter_assign_type() detect FILTER_PTR_STRING") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0207aeed31e6..16a9dfc9fffc 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1689,7 +1689,9 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (WARN_ON_ONCE(!field)) goto out; - if (is_string_field(field)) { + /* Pointers to strings are just pointers and dangerous to dereference */ + if (is_string_field(field) && + (field->filter_type != FILTER_PTR_STRING)) { flags |= HIST_FIELD_FL_STRING; hist_field->size = MAX_FILTER_STR_VAL; @@ -4495,8 +4497,6 @@ static inline void add_to_key(char *compound_key, void *key, field = key_field->field; if (field->filter_type == FILTER_DYN_STRING) size = *(u32 *)(rec + field->offset) >> 16; - else if (field->filter_type == FILTER_PTR_STRING) - size = strlen(key); else if (field->filter_type == FILTER_STATIC_STRING) size = field->size; -- cgit v1.2.3