From d3bd7413e0ca40b60cf60d4003246d067cafdeda Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 6 Jan 2019 00:54:37 +0100 Subject: bpf: fix sanitation of alu op with pointer / scalar type from different paths While 979d63d50c0c ("bpf: prevent out of bounds speculation on pointer arithmetic") took care of rejecting alu op on pointer when e.g. pointer came from two different map values with different map properties such as value size, Jann reported that a case was not covered yet when a given alu op is used in both "ptr_reg += reg" and "numeric_reg += reg" from different branches where we would incorrectly try to sanitize based on the pointer's limit. Catch this corner case and reject the program instead. Fixes: 979d63d50c0c ("bpf: prevent out of bounds speculation on pointer arithmetic") Reported-by: Jann Horn Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 61 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f6bc62a9ee8e..56674a7c3778 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3103,6 +3103,40 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, } } +static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, + const struct bpf_insn *insn) +{ + return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K; +} + +static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, + u32 alu_state, u32 alu_limit) +{ + /* If we arrived here from different branches with different + * state or limits to sanitize, then this won't work. + */ + if (aux->alu_state && + (aux->alu_state != alu_state || + aux->alu_limit != alu_limit)) + return -EACCES; + + /* Corresponding fixup done in fixup_bpf_calls(). */ + aux->alu_state = alu_state; + aux->alu_limit = alu_limit; + return 0; +} + +static int sanitize_val_alu(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + struct bpf_insn_aux_data *aux = cur_aux(env); + + if (can_skip_alu_sanitation(env, insn)) + return 0; + + return update_alu_sanitation_state(aux, BPF_ALU_NON_POINTER, 0); +} + static int sanitize_ptr_alu(struct bpf_verifier_env *env, struct bpf_insn *insn, const struct bpf_reg_state *ptr_reg, @@ -3117,7 +3151,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, struct bpf_reg_state tmp; bool ret; - if (env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K) + if (can_skip_alu_sanitation(env, insn)) return 0; /* We already marked aux for masking from non-speculative @@ -3133,19 +3167,8 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg)) return 0; - - /* If we arrived here from different branches with different - * limits to sanitize, then this won't work. - */ - if (aux->alu_state && - (aux->alu_state != alu_state || - aux->alu_limit != alu_limit)) + if (update_alu_sanitation_state(aux, alu_state, alu_limit)) return -EACCES; - - /* Corresponding fixup done in fixup_bpf_calls(). */ - aux->alu_state = alu_state; - aux->alu_limit = alu_limit; - do_sim: /* Simulate and find potential out-of-bounds access under * speculative execution from truncation as a result of @@ -3418,6 +3441,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, s64 smin_val, smax_val; u64 umin_val, umax_val; u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; + u32 dst = insn->dst_reg; + int ret; if (insn_bitness == 32) { /* Relevant for 32-bit RSH: Information can propagate towards @@ -3452,6 +3477,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, switch (opcode) { case BPF_ADD: + ret = sanitize_val_alu(env, insn); + if (ret < 0) { + verbose(env, "R%d tried to add from different pointers or scalars\n", dst); + return ret; + } if (signed_add_overflows(dst_reg->smin_value, smin_val) || signed_add_overflows(dst_reg->smax_value, smax_val)) { dst_reg->smin_value = S64_MIN; @@ -3471,6 +3501,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off); break; case BPF_SUB: + ret = sanitize_val_alu(env, insn); + if (ret < 0) { + verbose(env, "R%d tried to sub from different pointers or scalars\n", dst); + return ret; + } if (signed_sub_overflows(dst_reg->smin_value, smax_val) || signed_sub_overflows(dst_reg->smax_value, smin_val)) { /* Overflow possible, we know nothing */ -- cgit v1.2.3 From 7b55851367136b1efd84d98fea81ba57a98304cf Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Tue, 8 Jan 2019 13:58:52 +0100 Subject: fork: record start_time late This changes the fork(2) syscall to record the process start_time after initializing the basic task structure but still before making the new process visible to user-space. Technically, we could record the start_time anytime during fork(2). But this might lead to scenarios where a start_time is recorded long before a process becomes visible to user-space. For instance, with userfaultfd(2) and TLS, user-space can delay the execution of fork(2) for an indefinite amount of time (and will, if this causes network access, or similar). By recording the start_time late, it much closer reflects the point in time where the process becomes live and can be observed by other processes. Lastly, this makes it much harder for user-space to predict and control the start_time they get assigned. Previously, user-space could fork a process and stall it in copy_thread_tls() before its pid is allocated, but after its start_time is recorded. This can be misused to later-on cycle through PIDs and resume the stalled fork(2) yielding a process that has the same pid and start_time as a process that existed before. This can be used to circumvent security systems that identify processes by their pid+start_time combination. Even though user-space was always aware that start_time recording is flaky (but several projects are known to still rely on start_time-based identification), changing the start_time to be recorded late will help mitigate existing attacks and make it much harder for user-space to control the start_time a process gets assigned. Reported-by: Jann Horn Signed-off-by: Tom Gundersen Signed-off-by: David Herrmann Signed-off-by: Linus Torvalds --- kernel/fork.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a60459947f18..7f49be94eba9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1833,8 +1833,6 @@ static __latent_entropy struct task_struct *copy_process( posix_cpu_timers_init(p); - p->start_time = ktime_get_ns(); - p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; audit_set_context(p, NULL); cgroup_fork(p); @@ -2000,6 +1998,17 @@ static __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_free_pid; + /* + * From this point on we must avoid any synchronous user-space + * communication until we take the tasklist-lock. In particular, we do + * not want user-space to be able to predict the process start-time by + * stalling fork(2) after we recorded the start_time but before it is + * visible to the system. + */ + + p->start_time = ktime_get_ns(); + p->real_start_time = ktime_get_boot_ns(); + /* * Make it visible to the rest of the system, but dont wake it up yet. * Need tasklist lock for parent etc handling! -- cgit v1.2.3 From ba4a45746c362b665e245c50b870615f02f34781 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 8 Jan 2019 15:22:57 -0800 Subject: fork, memcg: fix cached_stacks case Commit 5eed6f1dff87 ("fork,memcg: fix crash in free_thread_stack on memcg charge fail") fixes a crash caused due to failed memcg charge of the kernel stack. However the fix misses the cached_stacks case which this patch fixes. So, the same crash can happen if the memcg charge of a cached stack is failed. Link: http://lkml.kernel.org/r/20190102180145.57406-1-shakeelb@google.com Fixes: 5eed6f1dff87 ("fork,memcg: fix crash in free_thread_stack on memcg charge fail") Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Acked-by: Rik van Riel Cc: Rik van Riel Cc: Roman Gushchin Cc: Johannes Weiner Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a60459947f18..5ad60d47f7e7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -217,6 +217,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) memset(s->addr, 0, THREAD_SIZE); tsk->stack_vm_area = s; + tsk->stack = s->addr; return s->addr; } -- cgit v1.2.3 From beaf3d1901f4ea46fbd5c9d857227d99751de469 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 8 Jan 2019 14:20:44 -0800 Subject: bpf: fix panic in stack_map_get_build_id() on i386 and arm32 As Naresh reported, test_stacktrace_build_id() causes panic on i386 and arm32 systems. This is caused by page_address() returns NULL in certain cases. This patch fixes this error by using kmap_atomic/kunmap_atomic instead of page_address. Fixes: 615755a77b24 (" bpf: extend stackmap to save binary_build_id+offset instead of address") Reported-by: Naresh Kamboju Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/stackmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 90daf285de03..d9e2483669d0 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -260,7 +260,7 @@ static int stack_map_get_build_id(struct vm_area_struct *vma, return -EFAULT; /* page not mapped */ ret = -EINVAL; - page_addr = page_address(page); + page_addr = kmap_atomic(page); ehdr = (Elf32_Ehdr *)page_addr; /* compare magic x7f "ELF" */ @@ -276,6 +276,7 @@ static int stack_map_get_build_id(struct vm_area_struct *vma, else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) ret = stack_map_get_build_id_64(page_addr, build_id); out: + kunmap_atomic(page_addr); put_page(page); return ret; } -- cgit v1.2.3 From 17e3ac812541f73224299d8958ddb420c2d5bbd8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 10 Jan 2019 11:14:00 -0800 Subject: bpf: fix bpffs bitfield pretty print Commit 9d5f9f701b18 ("bpf: btf: fix struct/union/fwd types with kind_flag") introduced kind_flag and used bitfield_size in the btf_member to directly pretty print member values. The commit contained a bug where the incorrect parameters could be passed to function btf_bitfield_seq_show(). The bits_offset parameter in the function expects a value less than 8. Instead, the member offset in the structure is passed. The below is btf_bitfield_seq_show() func signature: void btf_bitfield_seq_show(void *data, u8 bits_offset, u8 nr_bits, struct seq_file *m) both bits_offset and nr_bits are u8 type. If the bitfield member offset is greater than 256, incorrect value will be printed. This patch fixed the issue by calculating correct proper data offset and bits_offset similar to non kind_flag case. Fixes: 9d5f9f701b18 ("bpf: btf: fix struct/union/fwd types with kind_flag") Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 715f9fcf4712..a2f53642592b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1219,8 +1219,6 @@ static void btf_bitfield_seq_show(void *data, u8 bits_offset, u8 nr_copy_bits; u64 print_num; - data += BITS_ROUNDDOWN_BYTES(bits_offset); - bits_offset = BITS_PER_BYTE_MASKED(bits_offset); nr_copy_bits = nr_bits + bits_offset; nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); @@ -1255,7 +1253,9 @@ static void btf_int_bits_seq_show(const struct btf *btf, * BTF_INT_OFFSET() cannot exceed 64 bits. */ total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); - btf_bitfield_seq_show(data, total_bits_offset, nr_bits, m); + data += BITS_ROUNDDOWN_BYTES(total_bits_offset); + bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); + btf_bitfield_seq_show(data, bits_offset, nr_bits, m); } static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, @@ -2001,12 +2001,12 @@ static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, member_offset = btf_member_bit_offset(t, member); bitfield_size = btf_member_bitfield_size(t, member); + bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); + bits8_offset = BITS_PER_BYTE_MASKED(member_offset); if (bitfield_size) { - btf_bitfield_seq_show(data, member_offset, + btf_bitfield_seq_show(data + bytes_offset, bits8_offset, bitfield_size, m); } else { - bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); - bits8_offset = BITS_PER_BYTE_MASKED(member_offset); ops = btf_type_ops(member_type); ops->seq_show(btf, member_type, member->type, data + bytes_offset, bits8_offset, m); -- cgit v1.2.3 From 73ab1cb2de9e3efe7f818d5453de271e5371df1d Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 9 Jan 2019 02:23:56 +0900 Subject: umh: add exit routine for UMH process A UMH process which is created by the fork_usermode_blob() such as bpfilter needs to release members of the umh_info when process is terminated. But the do_exit() does not release members of the umh_info. hence module which uses UMH needs own code to detect whether UMH process is terminated or not. But this implementation needs extra code for checking the status of UMH process. it eventually makes the code more complex. The new PF_UMH flag is added and it is used to identify UMH processes. The exit_umh() does not release members of the umh_info. Hence umh_info->cleanup callback should release both members of the umh_info and the private data. Suggested-by: David S. Miller Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- include/linux/sched.h | 9 +++++++++ include/linux/umh.h | 2 ++ kernel/exit.c | 1 + kernel/umh.c | 33 +++++++++++++++++++++++++++++++-- 4 files changed, 43 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 89541d248893..e35e35b9fc48 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1406,6 +1406,7 @@ extern struct pid *cad_pid; #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */ +#define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ @@ -1904,6 +1905,14 @@ static inline void rseq_execve(struct task_struct *t) #endif +void __exit_umh(struct task_struct *tsk); + +static inline void exit_umh(struct task_struct *tsk) +{ + if (unlikely(tsk->flags & PF_UMH)) + __exit_umh(tsk); +} + #ifdef CONFIG_DEBUG_RSEQ void rseq_syscall(struct pt_regs *regs); diff --git a/include/linux/umh.h b/include/linux/umh.h index 235f51b62c71..0c08de356d0d 100644 --- a/include/linux/umh.h +++ b/include/linux/umh.h @@ -47,6 +47,8 @@ struct umh_info { const char *cmdline; struct file *pipe_to_umh; struct file *pipe_from_umh; + struct list_head list; + void (*cleanup)(struct umh_info *info); pid_t pid; }; int fork_usermode_blob(void *data, size_t len, struct umh_info *info); diff --git a/kernel/exit.c b/kernel/exit.c index 8a01b671dc1f..dad70419195c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -866,6 +866,7 @@ void __noreturn do_exit(long code) exit_task_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); + exit_umh(tsk); /* * Flush inherited counters to the parent - before the parent diff --git a/kernel/umh.c b/kernel/umh.c index 0baa672e023c..d937cbad903a 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -37,6 +37,8 @@ static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; static DEFINE_SPINLOCK(umh_sysctl_lock); static DECLARE_RWSEM(umhelper_sem); +static LIST_HEAD(umh_list); +static DEFINE_MUTEX(umh_list_lock); static void call_usermodehelper_freeinfo(struct subprocess_info *info) { @@ -100,10 +102,12 @@ static int call_usermodehelper_exec_async(void *data) commit_creds(new); sub_info->pid = task_pid_nr(current); - if (sub_info->file) + if (sub_info->file) { retval = do_execve_file(sub_info->file, sub_info->argv, sub_info->envp); - else + if (!retval) + current->flags |= PF_UMH; + } else retval = do_execve(getname_kernel(sub_info->path), (const char __user *const __user *)sub_info->argv, (const char __user *const __user *)sub_info->envp); @@ -517,6 +521,11 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info) goto out; err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); + if (!err) { + mutex_lock(&umh_list_lock); + list_add(&info->list, &umh_list); + mutex_unlock(&umh_list_lock); + } out: fput(file); return err; @@ -679,6 +688,26 @@ static int proc_cap_handler(struct ctl_table *table, int write, return 0; } +void __exit_umh(struct task_struct *tsk) +{ + struct umh_info *info; + pid_t pid = tsk->pid; + + mutex_lock(&umh_list_lock); + list_for_each_entry(info, &umh_list, list) { + if (info->pid == pid) { + list_del(&info->list); + mutex_unlock(&umh_list_lock); + goto out; + } + } + mutex_unlock(&umh_list_lock); + return; +out: + if (info->cleanup) + info->cleanup(info); +} + struct ctl_table usermodehelper_table[] = { { .procname = "bset", -- cgit v1.2.3 From b7285b425318331c2de4af2a784a18e6dccef484 Mon Sep 17 00:00:00 2001 From: Jonathan Neuschäfer Date: Sat, 12 Jan 2019 18:14:30 +0100 Subject: kernel/sys.c: Clarify that UNAME26 does not generate unique versions anymore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit UNAME26 is a mechanism to report Linux's version as 2.6.x, for compatibility with old/broken software. Due to the way it is implemented, it would have to be updated after 5.0, to keep the resulting versions unique. Linus Torvalds argued: "Do we actually need this? I'd rather let it bitrot, and just let it return random versions. It will just start again at 2.4.60, won't it? Anybody who uses UNAME26 for a 5.x kernel might as well think it's still 4.x. The user space is so old that it can't possibly care about differences between 4.x and 5.x, can it? The only thing that matters is that it shows "2.4.", which it will do regardless" Signed-off-by: Jonathan Neuschäfer Signed-off-by: Linus Torvalds --- kernel/sys.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index a48cbf1414b8..f7eb62eceb24 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1207,7 +1207,8 @@ DECLARE_RWSEM(uts_sem); /* * Work around broken programs that cannot handle "Linux 3.0". * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 - * And we map 4.x to 2.6.60+x, so 4.0 would be 2.6.60. + * And we map 4.x and later versions to 2.6.60+x, so 4.0/5.0/6.0/... would be + * 2.6.60. */ static int override_release(char __user *release, size_t len) { -- cgit v1.2.3 From bddda606ec76550dd63592e32a6e87e7d32583f7 Mon Sep 17 00:00:00 2001 From: Srinivas Ramana Date: Thu, 20 Dec 2018 19:05:57 +0530 Subject: genirq: Make sure the initial affinity is not empty If all CPUs in the irq_default_affinity mask are offline when an interrupt is initialized then irq_setup_affinity() can set an empty affinity mask for a newly allocated interrupt. Fix this by falling back to cpu_online_mask in case the resulting affinity mask is zero. Signed-off-by: Srinivas Ramana Signed-off-by: Thomas Gleixner Cc: linux-arm-msm@vger.kernel.org Link: https://lkml.kernel.org/r/1545312957-8504-1-git-send-email-sramana@codeaurora.org --- kernel/irq/manage.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a4888ce4667a..84b54a17b95d 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -393,6 +393,9 @@ int irq_setup_affinity(struct irq_desc *desc) } cpumask_and(&mask, cpu_online_mask, set); + if (cpumask_empty(&mask)) + cpumask_copy(&mask, cpu_online_mask); + if (node != NUMA_NO_NODE) { const struct cpumask *nodemask = cpumask_of_node(node); -- cgit v1.2.3 From 93ad0fc088c5b4631f796c995bdd27a082ef33a6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 11 Jan 2019 14:33:16 +0100 Subject: posix-cpu-timers: Unbreak timer rearming The recent commit which prevented a division by 0 issue in the alarm timer code broke posix CPU timers as an unwanted side effect. The reason is that the common rearm code checks for timer->it_interval being 0 now. What went unnoticed is that the posix cpu timer setup does not initialize timer->it_interval as it stores the interval in CPU timer specific storage. The reason for the separate storage is historical as the posix CPU timers always had a 64bit nanoseconds representation internally while timer->it_interval is type ktime_t which used to be a modified timespec representation on 32bit machines. Instead of reverting the offending commit and fixing the alarmtimer issue in the alarmtimer code, store the interval in timer->it_interval at CPU timer setup time so the common code check works. This also repairs the existing inconistency of the posix CPU timer code which kept a single shot timer armed despite of the interval being 0. The separate storage can be removed in mainline, but that needs to be a separate commit as the current one has to be backported to stable kernels. Fixes: 0e334db6bb4b ("posix-timers: Fix division by zero bug") Reported-by: H.J. Lu Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20190111133500.840117406@linutronix.de --- kernel/time/posix-cpu-timers.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 8f0644af40be..80f955210861 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -685,6 +685,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, * set up the signal and overrun bookkeeping. */ timer->it.cpu.incr = timespec64_to_ns(&new->it_interval); + timer->it_interval = ns_to_ktime(timer->it.cpu.incr); /* * This acts as a modification timestamp for the timer, -- cgit v1.2.3 From 8b05a3a7503c2a982c9c462eae96cfbd59506783 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 11 Jan 2019 07:01:13 +0100 Subject: tracing/kprobes: Fix NULL pointer dereference in trace_kprobe_create() It is possible to trigger a NULL pointer dereference by writing an incorrectly formatted string to krpobe_events (trying to create a kretprobe omitting the symbol). Example: echo "r:event_1 " >> /sys/kernel/debug/tracing/kprobe_events That triggers this: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 #PF error: [normal kernel read fault] PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 6 PID: 1757 Comm: bash Not tainted 5.0.0-rc1+ #125 Hardware name: Dell Inc. XPS 13 9370/0F6P3V, BIOS 1.5.1 08/09/2018 RIP: 0010:kstrtoull+0x2/0x20 Code: 28 00 00 00 75 17 48 83 c4 18 5b 41 5c 5d c3 b8 ea ff ff ff eb e1 b8 de ff ff ff eb da e8 d6 36 bb ff 66 0f 1f 44 00 00 31 c0 <80> 3f 2b 55 48 89 e5 0f 94 c0 48 01 c7 e8 5c ff ff ff 5d c3 66 2e RSP: 0018:ffffb5d482e57cb8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000001 RCX: ffffffff82b12720 RDX: ffffb5d482e57cf8 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffffb5d482e57d70 R08: ffffa0c05e5a7080 R09: ffffa0c05e003980 R10: 0000000000000000 R11: 0000000040000000 R12: ffffa0c04fe87b08 R13: 0000000000000001 R14: 000000000000000b R15: ffffa0c058d749e1 FS: 00007f137c7f7740(0000) GS:ffffa0c05e580000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000497d46004 CR4: 00000000003606e0 Call Trace: ? trace_kprobe_create+0xb6/0x840 ? _cond_resched+0x19/0x40 ? _cond_resched+0x19/0x40 ? __kmalloc+0x62/0x210 ? argv_split+0x8f/0x140 ? trace_kprobe_create+0x840/0x840 ? trace_kprobe_create+0x840/0x840 create_or_delete_trace_kprobe+0x11/0x30 trace_run_command+0x50/0x90 trace_parse_run_command+0xc1/0x160 probes_write+0x10/0x20 __vfs_write+0x3a/0x1b0 ? apparmor_file_permission+0x1a/0x20 ? security_file_permission+0x31/0xf0 ? _cond_resched+0x19/0x40 vfs_write+0xb1/0x1a0 ksys_write+0x55/0xc0 __x64_sys_write+0x1a/0x20 do_syscall_64+0x5a/0x120 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fix by doing the proper argument checks in trace_kprobe_create(). Cc: Ingo Molnar Link: https://lore.kernel.org/lkml/20190111095108.b79a2ee026185cbd62365977@kernel.org Link: http://lkml.kernel.org/r/20190111060113.GA22841@xps-13 Fixes: 6212dd29683e ("tracing/kprobes: Use dyn_event framework for kprobe events") Acked-by: Masami Hiramatsu Signed-off-by: Andrea Righi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5c19b8c41c7e..d5fb09ebba8b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -607,11 +607,17 @@ static int trace_kprobe_create(int argc, const char *argv[]) char buf[MAX_EVENT_NAME_LEN]; unsigned int flags = TPARG_FL_KERNEL; - /* argc must be >= 1 */ - if (argv[0][0] == 'r') { + switch (argv[0][0]) { + case 'r': is_return = true; flags |= TPARG_FL_RETURN; - } else if (argv[0][0] != 'p' || argc < 2) + break; + case 'p': + break; + default: + return -ECANCELED; + } + if (argc < 2) return -ECANCELED; event = strchr(&argv[0][1], ':'); -- cgit v1.2.3 From a811dc61559e0c8003f1086c2a4dc8e4d5ae4cb8 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Sat, 12 Jan 2019 11:24:20 -0700 Subject: seccomp: fix UAF in user-trap code On the failure path, we do an fput() of the listener fd if the filter fails to install (e.g. because of a TSYNC race that's lost, or if the thread is killed, etc.). fput() doesn't actually release the fd, it just ads it to a work queue. Then the thread proceeds to free the filter, even though the listener struct file has a reference to it. To fix this, on the failure path let's set the private data to null, so we know in ->release() to ignore the filter. Reported-by: syzbot+981c26489b2d1c6316ba@syzkaller.appspotmail.com Fixes: 6a21cc50f0c7 ("seccomp: add a return code to trap to userspace") Signed-off-by: Tycho Andersen Acked-by: Kees Cook Signed-off-by: James Morris --- kernel/seccomp.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d7f538847b84..e815781ed751 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -976,6 +976,9 @@ static int seccomp_notify_release(struct inode *inode, struct file *file) struct seccomp_filter *filter = file->private_data; struct seccomp_knotif *knotif; + if (!filter) + return 0; + mutex_lock(&filter->notify_lock); /* @@ -1300,6 +1303,7 @@ out: out_put_fd: if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) { if (ret < 0) { + listener_f->private_data = NULL; fput(listener_f); put_unused_fd(listener); } else { -- cgit v1.2.3 From 227a76b64718888c1687cc237463aa000ae6fb2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Jan 2019 21:14:08 +0100 Subject: swiotlb: clear io_tlb_start and io_tlb_end in swiotlb_exit Otherwise is_swiotlb_buffer will return false positives when we first initialize a swiotlb buffer, but then free it because we have an IOMMU available. Fixes: 55897af63091 ("dma-direct: merge swiotlb_dma_ops into the dma_direct code") Reported-by: Sibren Vasse Signed-off-by: Christoph Hellwig Tested-by: Sibren Vasse Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index d6361776dc5c..1fb6fd68b9c7 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -378,6 +378,8 @@ void __init swiotlb_exit(void) memblock_free_late(io_tlb_start, PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); } + io_tlb_start = 0; + io_tlb_end = 0; io_tlb_nslabs = 0; max_segment = 0; } -- cgit v1.2.3 From ea6eb5e7d15e1838de335609994b4546e2abcaaf Mon Sep 17 00:00:00 2001 From: Andreas Ziegler Date: Thu, 17 Jan 2019 14:30:23 +0100 Subject: tracing: uprobes: Fix typo in pr_fmt string The subsystem-specific message prefix for uprobes was also "trace_kprobe: " instead of "trace_uprobe: " as described in the original commit message. Link: http://lkml.kernel.org/r/20190117133023.19292-1-andreas.ziegler@fau.de Cc: Ingo Molnar Cc: stable@vger.kernel.org Acked-by: Masami Hiramatsu Fixes: 7257634135c24 ("tracing/probe: Show subsystem name in messages") Signed-off-by: Andreas Ziegler Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index e335576b9411..19a1a8e19062 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -5,7 +5,7 @@ * Copyright (C) IBM Corporation, 2010-2012 * Author: Srikar Dronamraju */ -#define pr_fmt(fmt) "trace_kprobe: " fmt +#define pr_fmt(fmt) "trace_uprobe: " fmt #include #include -- cgit v1.2.3 From 0b698005a9d11c0e91141ec11a2c4918a129f703 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 16 Jan 2019 14:03:15 -0800 Subject: bpf: don't assume build-id length is always 20 bytes Build-id length is not fixed to 20, it can be (`man ld` /--build-id): * 128-bit (uuid) * 160-bit (sha1) * any length specified in ld --build-id=0xhexstring To fix the issue of missing BPF_STACK_BUILD_ID_VALID for shorter build-ids, assume that build-id is somewhere in the range of 1 .. 20. Set the remaining bytes to zero. v2: * don't introduce new "len = min(BPF_BUILD_ID_SIZE, nhdr->n_descsz)", we already know that nhdr->n_descsz <= BPF_BUILD_ID_SIZE if we enter this 'if' condition Fixes: 615755a77b24 ("bpf: extend stackmap to save binary_build_id+offset instead of address") Acked-by: Song Liu Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- kernel/bpf/stackmap.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index d9e2483669d0..f9df545e92f6 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -180,11 +180,14 @@ static inline int stack_map_parse_build_id(void *page_addr, if (nhdr->n_type == BPF_BUILD_ID && nhdr->n_namesz == sizeof("GNU") && - nhdr->n_descsz == BPF_BUILD_ID_SIZE) { + nhdr->n_descsz > 0 && + nhdr->n_descsz <= BPF_BUILD_ID_SIZE) { memcpy(build_id, note_start + note_offs + ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), - BPF_BUILD_ID_SIZE); + nhdr->n_descsz); + memset(build_id + nhdr->n_descsz, 0, + BPF_BUILD_ID_SIZE - nhdr->n_descsz); return 0; } new_offs = note_offs + sizeof(Elf32_Nhdr) + -- cgit v1.2.3 From 4af396ae4836c4ecab61e975b8e61270c551894d Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 16 Jan 2019 14:03:16 -0800 Subject: bpf: zero out build_id for BPF_STACK_BUILD_ID_IP When returning BPF_STACK_BUILD_ID_IP from stack_map_get_build_id_offset, make sure that build_id field is empty. Since we are using percpu free list, there is a possibility that we might reuse some previous bpf_stack_build_id with non-zero build_id. Fixes: 615755a77b24 ("bpf: extend stackmap to save binary_build_id+offset instead of address") Acked-by: Song Liu Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- kernel/bpf/stackmap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index f9df545e92f6..d43b14535827 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -314,6 +314,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; + memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE); } return; } @@ -324,6 +325,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, /* per entry fall back to ips */ id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; + memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE); continue; } id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] -- cgit v1.2.3 From 583c53185399cea5c51195064564d1c9ddc70cf3 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Wed, 16 Jan 2019 20:29:40 +0100 Subject: bpf: Make function btf_name_offset_valid static Initially in commit 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") the function 'btf_name_offset_valid' was introduced as static function it was later on changed to a non-static one, and then finally in commit 23127b33ec80 ("bpf: Create a new btf_name_by_offset() for non type name use case") the function prototype was removed. Revert back to original implementation and make the function static. Remove warning triggered with W=1: kernel/bpf/btf.c:470:6: warning: no previous prototype for 'btf_name_offset_valid' [-Wmissing-prototypes] Fixes: 23127b33ec80 ("bpf: Create a new btf_name_by_offset() for non type name use case") Signed-off-by: Mathieu Malaterre Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a2f53642592b..befe570be5ba 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -467,7 +467,7 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) return kind_ops[BTF_INFO_KIND(t->info)]; } -bool btf_name_offset_valid(const struct btf *btf, u32 offset) +static bool btf_name_offset_valid(const struct btf *btf, u32 offset) { return BTF_STR_OFFSET_VALID(offset) && offset < btf->hdr.str_len; -- cgit v1.2.3 From c8dc79806e7f6cb6b0952aae1ce626c39905ad7e Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Wed, 16 Jan 2019 20:35:29 +0100 Subject: bpf: Annotate implicit fall through in cgroup_dev_func_proto There is a plan to build the kernel with -Wimplicit-fallthrough and this place in the code produced a warnings (W=1). This commit removes the following warning: kernel/bpf/cgroup.c:719:6: warning: this statement may fall through [-Wimplicit-fallthrough=] Signed-off-by: Mathieu Malaterre Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9425c2fb872f..ab612fe9862f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -718,6 +718,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); + /* fall through */ default: return NULL; } -- cgit v1.2.3 From 0722069a5374b904ec1a67f91249f90e1cfae259 Mon Sep 17 00:00:00 2001 From: Andreas Ziegler Date: Wed, 16 Jan 2019 15:16:29 +0100 Subject: tracing/uprobes: Fix output for multiple string arguments When printing multiple uprobe arguments as strings the output for the earlier arguments would also include all later string arguments. This is best explained in an example: Consider adding a uprobe to a function receiving two strings as parameters which is at offset 0xa0 in strlib.so and we want to print both parameters when the uprobe is hit (on x86_64): $ echo 'p:func /lib/strlib.so:0xa0 +0(%di):string +0(%si):string' > \ /sys/kernel/debug/tracing/uprobe_events When the function is called as func("foo", "bar") and we hit the probe, the trace file shows a line like the following: [...] func: (0x7f7e683706a0) arg1="foobar" arg2="bar" Note the extra "bar" printed as part of arg1. This behaviour stacks up for additional string arguments. The strings are stored in a dynamically growing part of the uprobe buffer by fetch_store_string() after copying them from userspace via strncpy_from_user(). The return value of strncpy_from_user() is then directly used as the required size for the string. However, this does not take the terminating null byte into account as the documentation for strncpy_from_user() cleary states that it "[...] returns the length of the string (not including the trailing NUL)" even though the null byte will be copied to the destination. Therefore, subsequent calls to fetch_store_string() will overwrite the terminating null byte of the most recently fetched string with the first character of the current string, leading to the "accumulation" of strings in earlier arguments in the output. Fix this by incrementing the return value of strncpy_from_user() by one if we did not hit the maximum buffer size. Link: http://lkml.kernel.org/r/20190116141629.5752-1-andreas.ziegler@fau.de Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: 5baaa59ef09e ("tracing/probes: Implement 'memory' fetch method for uprobes") Acked-by: Masami Hiramatsu Signed-off-by: Andreas Ziegler Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 19a1a8e19062..9bde07c06362 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -160,6 +160,13 @@ fetch_store_string(unsigned long addr, void *dest, void *base) if (ret >= 0) { if (ret == maxlen) dst[ret - 1] = '\0'; + else + /* + * Include the terminating null byte. In this case it + * was copied by strncpy_from_user but not accounted + * for in ret. + */ + ret++; *(u32 *)dest = make_data_loc(ret, (void *)dst - base); } -- cgit v1.2.3 From 12fee4cd5be2c4a73cc13d7ad76eb2d2feda8a71 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Thu, 17 Jan 2019 11:00:09 +0800 Subject: genirq/irqdesc: Fix double increment in alloc_descs() The recent rework of alloc_descs() introduced a double increment of the loop counter. As a consequence only every second affinity mask is validated. Remove it. [ tglx: Massaged changelog ] Fixes: c410abbbacb9 ("genirq/affinity: Add is_managed to struct irq_affinity_desc") Signed-off-by: Huacai Chen Signed-off-by: Thomas Gleixner Cc: Fuxin Zhang Cc: Zhangjin Wu Cc: Huacai Chen Cc: Dou Liyang Link: https://lkml.kernel.org/r/1547694009-16261-1-git-send-email-chenhc@lemote.com --- kernel/irq/irqdesc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index ee062b7939d3..ef8ad36cadcf 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -457,7 +457,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, /* Validate affinity mask(s) */ if (affinity) { - for (i = 0; i < cnt; i++, i++) { + for (i = 0; i < cnt; i++) { if (cpumask_empty(&affinity[i].mask)) return -EINVAL; } -- cgit v1.2.3 From 1a51c5da5acc6c188c917ba572eebac5f8793432 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 10 Jan 2019 17:17:16 -0800 Subject: perf core: Fix perf_proc_update_handler() bug The perf_proc_update_handler() handles /proc/sys/kernel/perf_event_max_sample_rate syctl variable. When the PMU IRQ handler timing monitoring is disabled, i.e, when /proc/sys/kernel/perf_cpu_time_max_percent is equal to 0 or 100, then no modification to sysctl_perf_event_sample_rate is allowed to prevent possible hang from wrong values. The problem is that the test to prevent modification is made after the sysctl variable is modified in perf_proc_update_handler(). You get an error: $ echo 10001 >/proc/sys/kernel/perf_event_max_sample_rate echo: write error: invalid argument But the value is still modified causing all sorts of inconsistencies: $ cat /proc/sys/kernel/perf_event_max_sample_rate 10001 This patch fixes the problem by moving the parsing of the value after the test. Committer testing: # echo 100 > /proc/sys/kernel/perf_cpu_time_max_percent # echo 10001 > /proc/sys/kernel/perf_event_max_sample_rate -bash: echo: write error: Invalid argument # cat /proc/sys/kernel/perf_event_max_sample_rate 10001 # Signed-off-by: Stephane Eranian Reviewed-by: Andi Kleen Reviewed-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Kan Liang Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1547169436-6266-1-git-send-email-eranian@google.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3cd13a30f732..e5ede6918050 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -436,18 +436,18 @@ int perf_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - - if (ret || !write) - return ret; - + int ret; + int perf_cpu = sysctl_perf_cpu_time_max_percent; /* * If throttling is disabled don't allow the write: */ - if (sysctl_perf_cpu_time_max_percent == 100 || - sysctl_perf_cpu_time_max_percent == 0) + if (write && (perf_cpu == 100 || perf_cpu == 0)) return -EINVAL; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; update_perf_cpu_limits(); -- cgit v1.2.3 From 9d5564ddcf2a0f5ba3fa1c3a1f8a1b59ad309553 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 17 Jan 2019 16:34:45 +0100 Subject: bpf: fix inner map masking to prevent oob under speculation During review I noticed that inner meta map setup for map in map is buggy in that it does not propagate all needed data from the reference map which the verifier is later accessing. In particular one such case is index masking to prevent out of bounds access under speculative execution due to missing the map's unpriv_array/index_mask field propagation. Fix this such that the verifier is generating the correct code for inlined lookups in case of unpriviledged use. Before patch (test_verifier's 'map in map access' dump): # bpftool prog dump xla id 3 0: (62) *(u32 *)(r10 -4) = 0 1: (bf) r2 = r10 2: (07) r2 += -4 3: (18) r1 = map[id:4] 5: (07) r1 += 272 | 6: (61) r0 = *(u32 *)(r2 +0) | 7: (35) if r0 >= 0x1 goto pc+6 | Inlined map in map lookup 8: (54) (u32) r0 &= (u32) 0 | with index masking for 9: (67) r0 <<= 3 | map->unpriv_array. 10: (0f) r0 += r1 | 11: (79) r0 = *(u64 *)(r0 +0) | 12: (15) if r0 == 0x0 goto pc+1 | 13: (05) goto pc+1 | 14: (b7) r0 = 0 | 15: (15) if r0 == 0x0 goto pc+11 16: (62) *(u32 *)(r10 -4) = 0 17: (bf) r2 = r10 18: (07) r2 += -4 19: (bf) r1 = r0 20: (07) r1 += 272 | 21: (61) r0 = *(u32 *)(r2 +0) | Index masking missing (!) 22: (35) if r0 >= 0x1 goto pc+3 | for inner map despite 23: (67) r0 <<= 3 | map->unpriv_array set. 24: (0f) r0 += r1 | 25: (05) goto pc+1 | 26: (b7) r0 = 0 | 27: (b7) r0 = 0 28: (95) exit After patch: # bpftool prog dump xla id 1 0: (62) *(u32 *)(r10 -4) = 0 1: (bf) r2 = r10 2: (07) r2 += -4 3: (18) r1 = map[id:2] 5: (07) r1 += 272 | 6: (61) r0 = *(u32 *)(r2 +0) | 7: (35) if r0 >= 0x1 goto pc+6 | Same inlined map in map lookup 8: (54) (u32) r0 &= (u32) 0 | with index masking due to 9: (67) r0 <<= 3 | map->unpriv_array. 10: (0f) r0 += r1 | 11: (79) r0 = *(u64 *)(r0 +0) | 12: (15) if r0 == 0x0 goto pc+1 | 13: (05) goto pc+1 | 14: (b7) r0 = 0 | 15: (15) if r0 == 0x0 goto pc+12 16: (62) *(u32 *)(r10 -4) = 0 17: (bf) r2 = r10 18: (07) r2 += -4 19: (bf) r1 = r0 20: (07) r1 += 272 | 21: (61) r0 = *(u32 *)(r2 +0) | 22: (35) if r0 >= 0x1 goto pc+4 | Now fixed inlined inner map 23: (54) (u32) r0 &= (u32) 0 | lookup with proper index masking 24: (67) r0 <<= 3 | for map->unpriv_array. 25: (0f) r0 += r1 | 26: (05) goto pc+1 | 27: (b7) r0 = 0 | 28: (b7) r0 = 0 29: (95) exit Fixes: b2157399cc98 ("bpf: prevent out-of-bounds speculation") Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/map_in_map.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 99d243e1ad6e..52378d3e34b3 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -12,6 +12,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) { struct bpf_map *inner_map, *inner_map_meta; + u32 inner_map_meta_size; struct fd f; f = fdget(inner_map_ufd); @@ -36,7 +37,12 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) return ERR_PTR(-EINVAL); } - inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER); + inner_map_meta_size = sizeof(*inner_map_meta); + /* In some cases verifier needs to access beyond just base map. */ + if (inner_map->ops == &array_map_ops) + inner_map_meta_size = sizeof(struct bpf_array); + + inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER); if (!inner_map_meta) { fdput(f); return ERR_PTR(-ENOMEM); @@ -46,9 +52,16 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->key_size = inner_map->key_size; inner_map_meta->value_size = inner_map->value_size; inner_map_meta->map_flags = inner_map->map_flags; - inner_map_meta->ops = inner_map->ops; inner_map_meta->max_entries = inner_map->max_entries; + /* Misc members not needed in bpf_map_meta_equal() check. */ + inner_map_meta->ops = inner_map->ops; + if (inner_map->ops == &array_map_ops) { + inner_map_meta->unpriv_array = inner_map->unpriv_array; + container_of(inner_map_meta, struct bpf_array, map)->index_mask = + container_of(inner_map, struct bpf_array, map)->index_mask; + } + fdput(f); return inner_map_meta; } -- cgit v1.2.3 From 6dc080eeb2ba01973bfff0d79844d7a59e12542e Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Fri, 30 Nov 2018 20:40:56 +0530 Subject: sched/wait: Fix rcuwait_wake_up() ordering For some peculiar reason rcuwait_wake_up() has the right barrier in the comment, but not in the code. This mistake has been observed to cause a deadlock in the following situation: P1 P2 percpu_up_read() percpu_down_write() rcu_sync_is_idle() // false rcu_sync_enter() ... __percpu_up_read() [S] ,- __this_cpu_dec(*sem->read_count) | smp_rmb(); [L] | task = rcu_dereference(w->task) // NULL | | [S] w->task = current | smp_mb(); | [L] readers_active_check() // fail `-> Where the smp_rmb() (obviously) fails to constrain the store. [ peterz: Added changelog. ] Signed-off-by: Prateek Sood Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andrea Parri Acked-by: Davidlohr Bueso Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 8f95c90ceb54 ("sched/wait, RCU: Introduce rcuwait machinery") Link: https://lkml.kernel.org/r/1543590656-7157-1-git-send-email-prsood@codeaurora.org Signed-off-by: Ingo Molnar --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 284f2fe9a293..3fb7be001964 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -307,7 +307,7 @@ void rcuwait_wake_up(struct rcuwait *w) * MB (A) MB (B) * [L] cond [L] tsk */ - smp_rmb(); /* (B) */ + smp_mb(); /* (B) */ /* * Avoid using task_rcu_dereference() magic as long as we are careful, -- cgit v1.2.3 From e6018c0f5c996e61639adce6a0697391a2861916 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 17 Dec 2018 10:14:53 +0100 Subject: sched/wake_q: Document wake_q_add() The only guarantee provided by wake_q_add() is that a wakeup will happen after it, it does _NOT_ guarantee the wakeup will be delayed until the matching wake_up_q(). If wake_q_add() fails the cmpxchg() a concurrent wakeup is pending and that can happen at any time after the cmpxchg(). This means we should not rely on the wakeup happening at wake_q_up(), but should be ready for wake_q_add() to issue the wakeup. The delay; if provided (most likely); should only result in more efficient behaviour. Reported-by: Yongji Xie Signed-off-by: Peter Zijlstra (Intel) Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Signed-off-by: Ingo Molnar --- include/linux/sched/wake_q.h | 6 +++++- kernel/sched/core.c | 12 ++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h index 10b19a192b2d..545f37138057 100644 --- a/include/linux/sched/wake_q.h +++ b/include/linux/sched/wake_q.h @@ -24,9 +24,13 @@ * called near the end of a function. Otherwise, the list can be * re-initialized for later re-use by wake_q_init(). * - * Note that this can cause spurious wakeups. schedule() callers + * NOTE that this can cause spurious wakeups. schedule() callers * must ensure the call is done inside a loop, confirming that the * wakeup condition has in fact occurred. + * + * NOTE that there is no guarantee the wakeup will happen any later than the + * wake_q_add() location. Therefore task must be ready to be woken at the + * location of the wake_q_add(). */ #include diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a674c7db2f29..cc814933f7d6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -396,6 +396,18 @@ static bool set_nr_if_polling(struct task_struct *p) #endif #endif +/** + * wake_q_add() - queue a wakeup for 'later' waking. + * @head: the wake_q_head to add @task to + * @task: the task to queue for 'later' wakeup + * + * Queue a task for later wakeup, most likely by the wake_up_q() call in the + * same context, _HOWEVER_ this is not guaranteed, the wakeup can come + * instantly. + * + * This function must be used as-if it were wake_up_process(); IOW the task + * must be ready to be woken at this location. + */ void wake_q_add(struct wake_q_head *head, struct task_struct *task) { struct wake_q_node *node = &task->wake_q; -- cgit v1.2.3 From 4c4e3731564c8945ac5ac90fc2a1e1f21cb79c92 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 17 Dec 2018 10:14:53 +0100 Subject: sched/wake_q: Fix wakeup ordering for wake_q Notable cmpxchg() does not provide ordering when it fails, however wake_q_add() requires ordering in this specific case too. Without this it would be possible for the concurrent wakeup to not observe our prior state. Andrea Parri provided: C wake_up_q-wake_q_add { int next = 0; int y = 0; } P0(int *next, int *y) { int r0; /* in wake_up_q() */ WRITE_ONCE(*next, 1); /* node->next = NULL */ smp_mb(); /* implied by wake_up_process() */ r0 = READ_ONCE(*y); } P1(int *next, int *y) { int r1; /* in wake_q_add() */ WRITE_ONCE(*y, 1); /* wake_cond = true */ smp_mb__before_atomic(); r1 = cmpxchg_relaxed(next, 1, 2); } exists (0:r0=0 /\ 1:r1=0) This "exists" clause cannot be satisfied according to the LKMM: Test wake_up_q-wake_q_add Allowed States 3 0:r0=0; 1:r1=1; 0:r0=1; 1:r1=0; 0:r0=1; 1:r1=1; No Witnesses Positive: 0 Negative: 3 Condition exists (0:r0=0 /\ 1:r1=0) Observation wake_up_q-wake_q_add Never 0 3 Reported-by: Yongji Xie Signed-off-by: Peter Zijlstra (Intel) Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cc814933f7d6..d8d76a65cfdd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -417,10 +417,11 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) * its already queued (either by us or someone else) and will get the * wakeup due to that. * - * This cmpxchg() executes a full barrier, which pairs with the full - * barrier executed by the wakeup in wake_up_q(). + * In order to ensure that a pending wakeup will observe our pending + * state, even in the failed case, an explicit smp_mb() must be used. */ - if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) + smp_mb__before_atomic(); + if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)) return; get_task_struct(task); -- cgit v1.2.3 From b061c38bef43406df8e73c5be06cbfacad5ee6ad Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 29 Nov 2018 14:44:49 +0100 Subject: futex: Fix (possible) missed wakeup We must not rely on wake_q_add() to delay the wakeup; in particular commit: 1d0dcb3ad9d3 ("futex: Implement lockless wakeups") moved wake_q_add() before smp_store_release(&q->lock_ptr, NULL), which could result in futex_wait() waking before observing ->lock_ptr == NULL and going back to sleep again. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 1d0dcb3ad9d3 ("futex: Implement lockless wakeups") Signed-off-by: Ingo Molnar --- kernel/futex.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index be3bff2315ff..fdd312da0992 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1452,11 +1452,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) return; - /* - * Queue the task for later wakeup for after we've released - * the hb->lock. wake_q_add() grabs reference to p. - */ - wake_q_add(wake_q, p); + get_task_struct(p); __unqueue_futex(q); /* * The waiting task can free the futex_q as soon as q->lock_ptr = NULL @@ -1466,6 +1462,13 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) * plist_del in __unqueue_futex(). */ smp_store_release(&q->lock_ptr, NULL); + + /* + * Queue the task for later wakeup for after we've released + * the hb->lock. wake_q_add() grabs reference to p. + */ + wake_q_add(wake_q, p); + put_task_struct(p); } /* -- cgit v1.2.3 From e158488be27b157802753a59b336142dc0eb0380 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Thu, 29 Nov 2018 20:50:30 +0800 Subject: locking/rwsem: Fix (possible) missed wakeup Because wake_q_add() can imply an immediate wakeup (cmpxchg failure case), we must not rely on the wakeup being delayed. However, commit: e38513905eea ("locking/rwsem: Rework zeroing reader waiter->task") relies on exactly that behaviour in that the wakeup must not happen until after we clear waiter->task. [ peterz: Added changelog. ] Signed-off-by: Xie Yongji Signed-off-by: Zhang Yu Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: e38513905eea ("locking/rwsem: Rework zeroing reader waiter->task") Link: https://lkml.kernel.org/r/1543495830-2644-1-git-send-email-xieyongji@baidu.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 09b180063ee1..50d9af615dc4 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -198,15 +198,22 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, woken++; tsk = waiter->task; - wake_q_add(wake_q, tsk); + get_task_struct(tsk); list_del(&waiter->list); /* - * Ensure that the last operation is setting the reader + * Ensure calling get_task_struct() before setting the reader * waiter to nil such that rwsem_down_read_failed() cannot * race with do_exit() by always holding a reference count * to the task to wakeup. */ smp_store_release(&waiter->task, NULL); + /* + * Ensure issuing the wakeup (either by us or someone else) + * after setting the reader waiter to nil. + */ + wake_q_add(wake_q, tsk); + /* wake_q_add() already take the task ref */ + put_task_struct(tsk); } adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; -- cgit v1.2.3 From 34d66caf251df91ff27b24a3a786810d29989eca Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Thu, 17 Jan 2019 02:10:59 -0800 Subject: x86/speculation: Remove redundant arch_smt_update() invocation With commit a74cfffb03b7 ("x86/speculation: Rework SMT state change"), arch_smt_update() is invoked from each individual CPU hotplug function. Therefore the extra arch_smt_update() call in the sysfs SMT control is redundant. Fixes: a74cfffb03b7 ("x86/speculation: Rework SMT state change") Signed-off-by: Zhenzhong Duan Signed-off-by: Thomas Gleixner Cc: Cc: Cc: Cc: Cc: Cc: Link: https://lkml.kernel.org/r/e2e064f2-e8ef-42ca-bf4f-76b612964752@default --- kernel/cpu.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 91d5c38eb7e5..c0c7f64573ed 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2090,10 +2090,8 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) */ cpuhp_offline_cpu_device(cpu); } - if (!ret) { + if (!ret) cpu_smt_control = ctrlval; - arch_smt_update(); - } cpu_maps_update_done(); return ret; } @@ -2104,7 +2102,6 @@ static int cpuhp_smt_enable(void) cpu_maps_update_begin(); cpu_smt_control = CPU_SMT_ENABLED; - arch_smt_update(); for_each_present_cpu(cpu) { /* Skip online CPUs and CPUs on offline nodes */ if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) -- cgit v1.2.3 From 81f5c6f5db37bf2360b64c304b27b8f499b48367 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 29 Jan 2019 16:38:16 -0800 Subject: bpf: btf: allow typedef func_proto Current implementation does not allow typedef func_proto. But it is actually allowed. -bash-4.4$ cat t.c typedef int (f) (int); f *g; -bash-4.4$ clang -O2 -g -c -target bpf t.c -Xclang -target-feature -Xclang +dwarfris -bash-4.4$ pahole -JV t.o File t.o: [1] PTR (anon) type_id=2 [2] TYPEDEF f type_id=3 [3] FUNC_PROTO (anon) return=4 args=(4 (anon)) [4] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED -bash-4.4$ This patch related btf verifier to allow such (typedef func_proto) patterns. Fixes: 2667a2626f4d ("bpf: btf: Add BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO") Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index befe570be5ba..c57bd10340ed 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1459,7 +1459,8 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, /* "typedef void new_void", "const void"...etc */ if (!btf_type_is_void(next_type) && - !btf_type_is_fwd(next_type)) { + !btf_type_is_fwd(next_type) && + !btf_type_is_func_proto(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } -- cgit v1.2.3 From b284909abad48b07d3071a9fc9b5692b3e64914b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 30 Jan 2019 07:13:58 -0600 Subject: cpu/hotplug: Fix "SMT disabled by BIOS" detection for KVM With the following commit: 73d5e2b47264 ("cpu/hotplug: detect SMT disabled by BIOS") ... the hotplug code attempted to detect when SMT was disabled by BIOS, in which case it reported SMT as permanently disabled. However, that code broke a virt hotplug scenario, where the guest is booted with only primary CPU threads, and a sibling is brought online later. The problem is that there doesn't seem to be a way to reliably distinguish between the HW "SMT disabled by BIOS" case and the virt "sibling not yet brought online" case. So the above-mentioned commit was a bit misguided, as it permanently disabled SMT for both cases, preventing future virt sibling hotplugs. Going back and reviewing the original problems which were attempted to be solved by that commit, when SMT was disabled in BIOS: 1) /sys/devices/system/cpu/smt/control showed "on" instead of "notsupported"; and 2) vmx_vm_init() was incorrectly showing the L1TF_MSG_SMT warning. I'd propose that we instead consider #1 above to not actually be a problem. Because, at least in the virt case, it's possible that SMT wasn't disabled by BIOS and a sibling thread could be brought online later. So it makes sense to just always default the smt control to "on" to allow for that possibility (assuming cpuid indicates that the CPU supports SMT). The real problem is #2, which has a simple fix: change vmx_vm_init() to query the actual current SMT state -- i.e., whether any siblings are currently online -- instead of looking at the SMT "control" sysfs value. So fix it by: a) reverting the original "fix" and its followup fix: 73d5e2b47264 ("cpu/hotplug: detect SMT disabled by BIOS") bc2d8d262cba ("cpu/hotplug: Fix SMT supported evaluation") and b) changing vmx_vm_init() to query the actual current SMT state -- instead of the sysfs control value -- to determine whether the L1TF warning is needed. This also requires the 'sched_smt_present' variable to exported, instead of 'cpu_smt_control'. Fixes: 73d5e2b47264 ("cpu/hotplug: detect SMT disabled by BIOS") Reported-by: Igor Mammedov Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Cc: Joe Mario Cc: Jiri Kosina Cc: Peter Zijlstra Cc: kvm@vger.kernel.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/e3a85d585da28cc333ecbc1e78ee9216e6da9396.1548794349.git.jpoimboe@redhat.com --- arch/x86/kernel/cpu/bugs.c | 2 +- arch/x86/kvm/vmx/vmx.c | 3 ++- include/linux/cpu.h | 2 -- kernel/cpu.c | 33 ++++----------------------------- kernel/sched/fair.c | 1 + kernel/smp.c | 2 -- 6 files changed, 8 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1de0f4170178..01874d54f4fd 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -71,7 +71,7 @@ void __init check_bugs(void) * identify_boot_cpu() initialized SMT support information, let the * core code know. */ - cpu_smt_check_topology_early(); + cpu_smt_check_topology(); if (!IS_ENABLED(CONFIG_SMP)) { pr_info("CPU: "); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4341175339f3..95d618045001 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -6823,7 +6824,7 @@ static int vmx_vm_init(struct kvm *kvm) * Warn upon starting the first VM in a potentially * insecure environment. */ - if (cpu_smt_control == CPU_SMT_ENABLED) + if (sched_smt_active()) pr_warn_once(L1TF_MSG_SMT); if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) pr_warn_once(L1TF_MSG_L1D); diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 218df7f4d3e1..5041357d0297 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -180,12 +180,10 @@ enum cpuhp_smt_control { #if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT) extern enum cpuhp_smt_control cpu_smt_control; extern void cpu_smt_disable(bool force); -extern void cpu_smt_check_topology_early(void); extern void cpu_smt_check_topology(void); #else # define cpu_smt_control (CPU_SMT_ENABLED) static inline void cpu_smt_disable(bool force) { } -static inline void cpu_smt_check_topology_early(void) { } static inline void cpu_smt_check_topology(void) { } #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index c0c7f64573ed..d1c6d152da89 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -376,9 +376,6 @@ void __weak arch_smt_update(void) { } #ifdef CONFIG_HOTPLUG_SMT enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; -EXPORT_SYMBOL_GPL(cpu_smt_control); - -static bool cpu_smt_available __read_mostly; void __init cpu_smt_disable(bool force) { @@ -397,25 +394,11 @@ void __init cpu_smt_disable(bool force) /* * The decision whether SMT is supported can only be done after the full - * CPU identification. Called from architecture code before non boot CPUs - * are brought up. - */ -void __init cpu_smt_check_topology_early(void) -{ - if (!topology_smt_supported()) - cpu_smt_control = CPU_SMT_NOT_SUPPORTED; -} - -/* - * If SMT was disabled by BIOS, detect it here, after the CPUs have been - * brought online. This ensures the smt/l1tf sysfs entries are consistent - * with reality. cpu_smt_available is set to true during the bringup of non - * boot CPUs when a SMT sibling is detected. Note, this may overwrite - * cpu_smt_control's previous setting. + * CPU identification. Called from architecture code. */ void __init cpu_smt_check_topology(void) { - if (!cpu_smt_available) + if (!topology_smt_supported()) cpu_smt_control = CPU_SMT_NOT_SUPPORTED; } @@ -428,18 +411,10 @@ early_param("nosmt", smt_cmdline_disable); static inline bool cpu_smt_allowed(unsigned int cpu) { - if (topology_is_primary_thread(cpu)) + if (cpu_smt_control == CPU_SMT_ENABLED) return true; - /* - * If the CPU is not a 'primary' thread and the booted_once bit is - * set then the processor has SMT support. Store this information - * for the late check of SMT support in cpu_smt_check_topology(). - */ - if (per_cpu(cpuhp_state, cpu).booted_once) - cpu_smt_available = true; - - if (cpu_smt_control == CPU_SMT_ENABLED) + if (topology_is_primary_thread(cpu)) return true; /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 50aa2aba69bd..310d0637fe4b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5980,6 +5980,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p #ifdef CONFIG_SCHED_SMT DEFINE_STATIC_KEY_FALSE(sched_smt_present); +EXPORT_SYMBOL_GPL(sched_smt_present); static inline void set_idle_cores(int cpu, int val) { diff --git a/kernel/smp.c b/kernel/smp.c index 163c451af42e..f4cf1b0bb3b8 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -584,8 +584,6 @@ void __init smp_init(void) num_nodes, (num_nodes > 1 ? "s" : ""), num_cpus, (num_cpus > 1 ? "s" : "")); - /* Final decision about SMT support */ - cpu_smt_check_topology(); /* Any cleanup work */ smp_cpus_done(setup_max_cpus); } -- cgit v1.2.3 From 2c1cf00eeacb784781cf1c9896b8af001246d339 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 31 Jan 2019 13:57:58 +0100 Subject: relay: check return of create_buf_file() properly If create_buf_file() returns an error, don't try to reference it later as a valid dentry pointer. This problem was exposed when debugfs started to return errors instead of just NULL for some calls when they do not succeed properly. Also, the check for WARN_ON(dentry) was just wrong :) Reported-by: Kees Cook Reported-and-tested-by: syzbot+16c3a70e1e9b29346c43@syzkaller.appspotmail.com Reported-by: Tetsuo Handa Cc: Andrew Morton Cc: David Rientjes Fixes: ff9fb72bc077 ("debugfs: return error values, not NULL") Signed-off-by: Greg Kroah-Hartman --- kernel/relay.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 04f248644e06..9e0f52375487 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -428,6 +428,8 @@ static struct dentry *relay_create_buf_file(struct rchan *chan, dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR, buf, &chan->is_global); + if (IS_ERR(dentry)) + dentry = NULL; kfree(tmpname); @@ -461,7 +463,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) dentry = chan->cb->create_buf_file(NULL, NULL, S_IRUSR, buf, &chan->is_global); - if (WARN_ON(dentry)) + if (IS_ERR_OR_NULL(dentry)) goto free_buf; } -- cgit v1.2.3 From 6cab5e90ab2bd323c9f3811b6c70a4687df51e27 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 28 Jan 2019 18:43:34 -0800 Subject: bpf: run bpf programs with preemption disabled Disabled preemption is necessary for proper access to per-cpu maps from BPF programs. But the sender side of socket filters didn't have preemption disabled: unix_dgram_sendmsg->sk_filter->sk_filter_trim_cap->bpf_prog_run_save_cb->BPF_PROG_RUN and a combination of af_packet with tun device didn't disable either: tpacket_snd->packet_direct_xmit->packet_pick_tx_queue->ndo_select_queue-> tun_select_queue->tun_ebpf_select_queue->bpf_prog_run_clear_cb->BPF_PROG_RUN Disable preemption before executing BPF programs (both classic and extended). Reported-by: Jann Horn Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 21 ++++++++++++++++++--- kernel/bpf/cgroup.c | 2 +- 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index ad106d845b22..e532fcc6e4b5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -591,8 +591,8 @@ static inline u8 *bpf_skb_cb(struct sk_buff *skb) return qdisc_skb_cb(skb)->data; } -static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog, - struct sk_buff *skb) +static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog, + struct sk_buff *skb) { u8 *cb_data = bpf_skb_cb(skb); u8 cb_saved[BPF_SKB_CB_LEN]; @@ -611,15 +611,30 @@ static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog, return res; } +static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog, + struct sk_buff *skb) +{ + u32 res; + + preempt_disable(); + res = __bpf_prog_run_save_cb(prog, skb); + preempt_enable(); + return res; +} + static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog, struct sk_buff *skb) { u8 *cb_data = bpf_skb_cb(skb); + u32 res; if (unlikely(prog->cb_access)) memset(cb_data, 0, BPF_SKB_CB_LEN); - return BPF_PROG_RUN(prog, skb); + preempt_disable(); + res = BPF_PROG_RUN(prog, skb); + preempt_enable(); + return res; } static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ab612fe9862f..d17d05570a3f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -572,7 +572,7 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, bpf_compute_and_save_data_end(skb, &saved_data_end); ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, - bpf_prog_run_save_cb); + __bpf_prog_run_save_cb); bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); skb->sk = save_sk; -- cgit v1.2.3 From a89fac57b5d080771efd4d71feaae19877cf68f0 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Jan 2019 18:12:43 -0800 Subject: bpf: fix lockdep false positive in percpu_freelist Lockdep warns about false positive: [ 12.492084] 00000000e6b28347 (&head->lock){+...}, at: pcpu_freelist_push+0x2a/0x40 [ 12.492696] but this lock was taken by another, HARDIRQ-safe lock in the past: [ 12.493275] (&rq->lock){-.-.} [ 12.493276] [ 12.493276] [ 12.493276] and interrupts could create inverse lock ordering between them. [ 12.493276] [ 12.494435] [ 12.494435] other info that might help us debug this: [ 12.494979] Possible interrupt unsafe locking scenario: [ 12.494979] [ 12.495518] CPU0 CPU1 [ 12.495879] ---- ---- [ 12.496243] lock(&head->lock); [ 12.496502] local_irq_disable(); [ 12.496969] lock(&rq->lock); [ 12.497431] lock(&head->lock); [ 12.497890] [ 12.498104] lock(&rq->lock); [ 12.498368] [ 12.498368] *** DEADLOCK *** [ 12.498368] [ 12.498837] 1 lock held by dd/276: [ 12.499110] #0: 00000000c58cb2ee (rcu_read_lock){....}, at: trace_call_bpf+0x5e/0x240 [ 12.499747] [ 12.499747] the shortest dependencies between 2nd lock and 1st lock: [ 12.500389] -> (&rq->lock){-.-.} { [ 12.500669] IN-HARDIRQ-W at: [ 12.500934] _raw_spin_lock+0x2f/0x40 [ 12.501373] scheduler_tick+0x4c/0xf0 [ 12.501812] update_process_times+0x40/0x50 [ 12.502294] tick_periodic+0x27/0xb0 [ 12.502723] tick_handle_periodic+0x1f/0x60 [ 12.503203] timer_interrupt+0x11/0x20 [ 12.503651] __handle_irq_event_percpu+0x43/0x2c0 [ 12.504167] handle_irq_event_percpu+0x20/0x50 [ 12.504674] handle_irq_event+0x37/0x60 [ 12.505139] handle_level_irq+0xa7/0x120 [ 12.505601] handle_irq+0xa1/0x150 [ 12.506018] do_IRQ+0x77/0x140 [ 12.506411] ret_from_intr+0x0/0x1d [ 12.506834] _raw_spin_unlock_irqrestore+0x53/0x60 [ 12.507362] __setup_irq+0x481/0x730 [ 12.507789] setup_irq+0x49/0x80 [ 12.508195] hpet_time_init+0x21/0x32 [ 12.508644] x86_late_time_init+0xb/0x16 [ 12.509106] start_kernel+0x390/0x42a [ 12.509554] secondary_startup_64+0xa4/0xb0 [ 12.510034] IN-SOFTIRQ-W at: [ 12.510305] _raw_spin_lock+0x2f/0x40 [ 12.510772] try_to_wake_up+0x1c7/0x4e0 [ 12.511220] swake_up_locked+0x20/0x40 [ 12.511657] swake_up_one+0x1a/0x30 [ 12.512070] rcu_process_callbacks+0xc5/0x650 [ 12.512553] __do_softirq+0xe6/0x47b [ 12.512978] irq_exit+0xc3/0xd0 [ 12.513372] smp_apic_timer_interrupt+0xa9/0x250 [ 12.513876] apic_timer_interrupt+0xf/0x20 [ 12.514343] default_idle+0x1c/0x170 [ 12.514765] do_idle+0x199/0x240 [ 12.515159] cpu_startup_entry+0x19/0x20 [ 12.515614] start_kernel+0x422/0x42a [ 12.516045] secondary_startup_64+0xa4/0xb0 [ 12.516521] INITIAL USE at: [ 12.516774] _raw_spin_lock_irqsave+0x38/0x50 [ 12.517258] rq_attach_root+0x16/0xd0 [ 12.517685] sched_init+0x2f2/0x3eb [ 12.518096] start_kernel+0x1fb/0x42a [ 12.518525] secondary_startup_64+0xa4/0xb0 [ 12.518986] } [ 12.519132] ... key at: [] __key.71384+0x0/0x8 [ 12.519649] ... acquired at: [ 12.519892] pcpu_freelist_pop+0x7b/0xd0 [ 12.520221] bpf_get_stackid+0x1d2/0x4d0 [ 12.520563] ___bpf_prog_run+0x8b4/0x11a0 [ 12.520887] [ 12.521008] -> (&head->lock){+...} { [ 12.521292] HARDIRQ-ON-W at: [ 12.521539] _raw_spin_lock+0x2f/0x40 [ 12.521950] pcpu_freelist_push+0x2a/0x40 [ 12.522396] bpf_get_stackid+0x494/0x4d0 [ 12.522828] ___bpf_prog_run+0x8b4/0x11a0 [ 12.523296] INITIAL USE at: [ 12.523537] _raw_spin_lock+0x2f/0x40 [ 12.523944] pcpu_freelist_populate+0xc0/0x120 [ 12.524417] htab_map_alloc+0x405/0x500 [ 12.524835] __do_sys_bpf+0x1a3/0x1a90 [ 12.525253] do_syscall_64+0x4a/0x180 [ 12.525659] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 12.526167] } [ 12.526311] ... key at: [] __key.13130+0x0/0x8 [ 12.526812] ... acquired at: [ 12.527047] __lock_acquire+0x521/0x1350 [ 12.527371] lock_acquire+0x98/0x190 [ 12.527680] _raw_spin_lock+0x2f/0x40 [ 12.527994] pcpu_freelist_push+0x2a/0x40 [ 12.528325] bpf_get_stackid+0x494/0x4d0 [ 12.528645] ___bpf_prog_run+0x8b4/0x11a0 [ 12.528970] [ 12.529092] [ 12.529092] stack backtrace: [ 12.529444] CPU: 0 PID: 276 Comm: dd Not tainted 5.0.0-rc3-00018-g2fa53f892422 #475 [ 12.530043] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 [ 12.530750] Call Trace: [ 12.530948] dump_stack+0x5f/0x8b [ 12.531248] check_usage_backwards+0x10c/0x120 [ 12.531598] ? ___bpf_prog_run+0x8b4/0x11a0 [ 12.531935] ? mark_lock+0x382/0x560 [ 12.532229] mark_lock+0x382/0x560 [ 12.532496] ? print_shortest_lock_dependencies+0x180/0x180 [ 12.532928] __lock_acquire+0x521/0x1350 [ 12.533271] ? find_get_entry+0x17f/0x2e0 [ 12.533586] ? find_get_entry+0x19c/0x2e0 [ 12.533902] ? lock_acquire+0x98/0x190 [ 12.534196] lock_acquire+0x98/0x190 [ 12.534482] ? pcpu_freelist_push+0x2a/0x40 [ 12.534810] _raw_spin_lock+0x2f/0x40 [ 12.535099] ? pcpu_freelist_push+0x2a/0x40 [ 12.535432] pcpu_freelist_push+0x2a/0x40 [ 12.535750] bpf_get_stackid+0x494/0x4d0 [ 12.536062] ___bpf_prog_run+0x8b4/0x11a0 It has been explained that is a false positive here: https://lkml.org/lkml/2018/7/25/756 Recap: - stackmap uses pcpu_freelist - The lock in pcpu_freelist is a percpu lock - stackmap is only used by tracing bpf_prog - A tracing bpf_prog cannot be run if another bpf_prog has already been running (ensured by the percpu bpf_prog_active counter). Eric pointed out that this lockdep splats stops other legit lockdep splats in selftests/bpf/test_progs.c. Fix this by calling local_irq_save/restore for stackmap. Another false positive had also been worked around by calling local_irq_save in commit 89ad2fa3f043 ("bpf: fix lockdep splat"). That commit added unnecessary irq_save/restore to fast path of bpf hash map. irqs are already disabled at that point, since htab is holding per bucket spin_lock with irqsave. Let's reduce overhead for htab by introducing __pcpu_freelist_push/pop function w/o irqsave and convert pcpu_freelist_push/pop to irqsave to be used elsewhere (right now only in stackmap). It stops lockdep false positive in stackmap with a bit of acceptable overhead. Fixes: 557c0c6e7df8 ("bpf: convert stackmap to pre-allocation") Reported-by: Naresh Kamboju Reported-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/hashtab.c | 4 ++-- kernel/bpf/percpu_freelist.c | 41 +++++++++++++++++++++++++++++------------ kernel/bpf/percpu_freelist.h | 4 ++++ 3 files changed, 35 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 4b7c76765d9d..f9274114c88d 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -686,7 +686,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) } if (htab_is_prealloc(htab)) { - pcpu_freelist_push(&htab->freelist, &l->fnode); + __pcpu_freelist_push(&htab->freelist, &l->fnode); } else { atomic_dec(&htab->count); l->htab = htab; @@ -748,7 +748,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, } else { struct pcpu_freelist_node *l; - l = pcpu_freelist_pop(&htab->freelist); + l = __pcpu_freelist_pop(&htab->freelist); if (!l) return ERR_PTR(-E2BIG); l_new = container_of(l, struct htab_elem, fnode); diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 673fa6fe2d73..0c1b4ba9e90e 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -28,8 +28,8 @@ void pcpu_freelist_destroy(struct pcpu_freelist *s) free_percpu(s->freelist); } -static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head, - struct pcpu_freelist_node *node) +static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, + struct pcpu_freelist_node *node) { raw_spin_lock(&head->lock); node->next = head->first; @@ -37,12 +37,22 @@ static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head, raw_spin_unlock(&head->lock); } -void pcpu_freelist_push(struct pcpu_freelist *s, +void __pcpu_freelist_push(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist); - __pcpu_freelist_push(head, node); + ___pcpu_freelist_push(head, node); +} + +void pcpu_freelist_push(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) +{ + unsigned long flags; + + local_irq_save(flags); + __pcpu_freelist_push(s, node); + local_irq_restore(flags); } void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, @@ -63,7 +73,7 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, for_each_possible_cpu(cpu) { again: head = per_cpu_ptr(s->freelist, cpu); - __pcpu_freelist_push(head, buf); + ___pcpu_freelist_push(head, buf); i++; buf += elem_size; if (i == nr_elems) @@ -74,14 +84,12 @@ again: local_irq_restore(flags); } -struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) +struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; - unsigned long flags; int orig_cpu, cpu; - local_irq_save(flags); orig_cpu = cpu = raw_smp_processor_id(); while (1) { head = per_cpu_ptr(s->freelist, cpu); @@ -89,16 +97,25 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) node = head->first; if (node) { head->first = node->next; - raw_spin_unlock_irqrestore(&head->lock, flags); + raw_spin_unlock(&head->lock); return node; } raw_spin_unlock(&head->lock); cpu = cpumask_next(cpu, cpu_possible_mask); if (cpu >= nr_cpu_ids) cpu = 0; - if (cpu == orig_cpu) { - local_irq_restore(flags); + if (cpu == orig_cpu) return NULL; - } } } + +struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) +{ + struct pcpu_freelist_node *ret; + unsigned long flags; + + local_irq_save(flags); + ret = __pcpu_freelist_pop(s); + local_irq_restore(flags); + return ret; +} diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h index 3049aae8ea1e..c3960118e617 100644 --- a/kernel/bpf/percpu_freelist.h +++ b/kernel/bpf/percpu_freelist.h @@ -22,8 +22,12 @@ struct pcpu_freelist_node { struct pcpu_freelist_node *next; }; +/* pcpu_freelist_* do spin_lock_irqsave. */ void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *); struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *); +/* __pcpu_freelist_* do spin_lock only. caller must disable irqs. */ +void __pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *); +struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *); void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, u32 nr_elems); int pcpu_freelist_init(struct pcpu_freelist *); -- cgit v1.2.3 From e16ec34039c701594d55d08a5aa49ee3e1abc821 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Jan 2019 18:12:44 -0800 Subject: bpf: fix potential deadlock in bpf_prog_register Lockdep found a potential deadlock between cpu_hotplug_lock, bpf_event_mutex, and cpuctx_mutex: [ 13.007000] WARNING: possible circular locking dependency detected [ 13.007587] 5.0.0-rc3-00018-g2fa53f892422-dirty #477 Not tainted [ 13.008124] ------------------------------------------------------ [ 13.008624] test_progs/246 is trying to acquire lock: [ 13.009030] 0000000094160d1d (tracepoints_mutex){+.+.}, at: tracepoint_probe_register_prio+0x2d/0x300 [ 13.009770] [ 13.009770] but task is already holding lock: [ 13.010239] 00000000d663ef86 (bpf_event_mutex){+.+.}, at: bpf_probe_register+0x1d/0x60 [ 13.010877] [ 13.010877] which lock already depends on the new lock. [ 13.010877] [ 13.011532] [ 13.011532] the existing dependency chain (in reverse order) is: [ 13.012129] [ 13.012129] -> #4 (bpf_event_mutex){+.+.}: [ 13.012582] perf_event_query_prog_array+0x9b/0x130 [ 13.013016] _perf_ioctl+0x3aa/0x830 [ 13.013354] perf_ioctl+0x2e/0x50 [ 13.013668] do_vfs_ioctl+0x8f/0x6a0 [ 13.014003] ksys_ioctl+0x70/0x80 [ 13.014320] __x64_sys_ioctl+0x16/0x20 [ 13.014668] do_syscall_64+0x4a/0x180 [ 13.015007] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 13.015469] [ 13.015469] -> #3 (&cpuctx_mutex){+.+.}: [ 13.015910] perf_event_init_cpu+0x5a/0x90 [ 13.016291] perf_event_init+0x1b2/0x1de [ 13.016654] start_kernel+0x2b8/0x42a [ 13.016995] secondary_startup_64+0xa4/0xb0 [ 13.017382] [ 13.017382] -> #2 (pmus_lock){+.+.}: [ 13.017794] perf_event_init_cpu+0x21/0x90 [ 13.018172] cpuhp_invoke_callback+0xb3/0x960 [ 13.018573] _cpu_up+0xa7/0x140 [ 13.018871] do_cpu_up+0xa4/0xc0 [ 13.019178] smp_init+0xcd/0xd2 [ 13.019483] kernel_init_freeable+0x123/0x24f [ 13.019878] kernel_init+0xa/0x110 [ 13.020201] ret_from_fork+0x24/0x30 [ 13.020541] [ 13.020541] -> #1 (cpu_hotplug_lock.rw_sem){++++}: [ 13.021051] static_key_slow_inc+0xe/0x20 [ 13.021424] tracepoint_probe_register_prio+0x28c/0x300 [ 13.021891] perf_trace_event_init+0x11f/0x250 [ 13.022297] perf_trace_init+0x6b/0xa0 [ 13.022644] perf_tp_event_init+0x25/0x40 [ 13.023011] perf_try_init_event+0x6b/0x90 [ 13.023386] perf_event_alloc+0x9a8/0xc40 [ 13.023754] __do_sys_perf_event_open+0x1dd/0xd30 [ 13.024173] do_syscall_64+0x4a/0x180 [ 13.024519] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 13.024968] [ 13.024968] -> #0 (tracepoints_mutex){+.+.}: [ 13.025434] __mutex_lock+0x86/0x970 [ 13.025764] tracepoint_probe_register_prio+0x2d/0x300 [ 13.026215] bpf_probe_register+0x40/0x60 [ 13.026584] bpf_raw_tracepoint_open.isra.34+0xa4/0x130 [ 13.027042] __do_sys_bpf+0x94f/0x1a90 [ 13.027389] do_syscall_64+0x4a/0x180 [ 13.027727] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 13.028171] [ 13.028171] other info that might help us debug this: [ 13.028171] [ 13.028807] Chain exists of: [ 13.028807] tracepoints_mutex --> &cpuctx_mutex --> bpf_event_mutex [ 13.028807] [ 13.029666] Possible unsafe locking scenario: [ 13.029666] [ 13.030140] CPU0 CPU1 [ 13.030510] ---- ---- [ 13.030875] lock(bpf_event_mutex); [ 13.031166] lock(&cpuctx_mutex); [ 13.031645] lock(bpf_event_mutex); [ 13.032135] lock(tracepoints_mutex); [ 13.032441] [ 13.032441] *** DEADLOCK *** [ 13.032441] [ 13.032911] 1 lock held by test_progs/246: [ 13.033239] #0: 00000000d663ef86 (bpf_event_mutex){+.+.}, at: bpf_probe_register+0x1d/0x60 [ 13.033909] [ 13.033909] stack backtrace: [ 13.034258] CPU: 1 PID: 246 Comm: test_progs Not tainted 5.0.0-rc3-00018-g2fa53f892422-dirty #477 [ 13.034964] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 [ 13.035657] Call Trace: [ 13.035859] dump_stack+0x5f/0x8b [ 13.036130] print_circular_bug.isra.37+0x1ce/0x1db [ 13.036526] __lock_acquire+0x1158/0x1350 [ 13.036852] ? lock_acquire+0x98/0x190 [ 13.037154] lock_acquire+0x98/0x190 [ 13.037447] ? tracepoint_probe_register_prio+0x2d/0x300 [ 13.037876] __mutex_lock+0x86/0x970 [ 13.038167] ? tracepoint_probe_register_prio+0x2d/0x300 [ 13.038600] ? tracepoint_probe_register_prio+0x2d/0x300 [ 13.039028] ? __mutex_lock+0x86/0x970 [ 13.039337] ? __mutex_lock+0x24a/0x970 [ 13.039649] ? bpf_probe_register+0x1d/0x60 [ 13.039992] ? __bpf_trace_sched_wake_idle_without_ipi+0x10/0x10 [ 13.040478] ? tracepoint_probe_register_prio+0x2d/0x300 [ 13.040906] tracepoint_probe_register_prio+0x2d/0x300 [ 13.041325] bpf_probe_register+0x40/0x60 [ 13.041649] bpf_raw_tracepoint_open.isra.34+0xa4/0x130 [ 13.042068] ? __might_fault+0x3e/0x90 [ 13.042374] __do_sys_bpf+0x94f/0x1a90 [ 13.042678] do_syscall_64+0x4a/0x180 [ 13.042975] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 13.043382] RIP: 0033:0x7f23b10a07f9 [ 13.045155] RSP: 002b:00007ffdef42fdd8 EFLAGS: 00000202 ORIG_RAX: 0000000000000141 [ 13.045759] RAX: ffffffffffffffda RBX: 00007ffdef42ff70 RCX: 00007f23b10a07f9 [ 13.046326] RDX: 0000000000000070 RSI: 00007ffdef42fe10 RDI: 0000000000000011 [ 13.046893] RBP: 00007ffdef42fdf0 R08: 0000000000000038 R09: 00007ffdef42fe10 [ 13.047462] R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000000 [ 13.048029] R13: 0000000000000016 R14: 00007f23b1db4690 R15: 0000000000000000 Since tracepoints_mutex will be taken in tracepoint_probe_register/unregister() there is no need to take bpf_event_mutex too. bpf_event_mutex is protecting modifications to prog array used in kprobe/perf bpf progs. bpf_raw_tracepoints don't need to take this mutex. Fixes: c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT") Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/trace/bpf_trace.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 8b068adb9da1..f1a86a0d881d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1204,22 +1204,12 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog * int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) { - int err; - - mutex_lock(&bpf_event_mutex); - err = __bpf_probe_register(btp, prog); - mutex_unlock(&bpf_event_mutex); - return err; + return __bpf_probe_register(btp, prog); } int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) { - int err; - - mutex_lock(&bpf_event_mutex); - err = tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog); - mutex_unlock(&bpf_event_mutex); - return err; + return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog); } int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, -- cgit v1.2.3 From 7c4cd051add3d00bbff008a133c936c515eaa8fe Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 30 Jan 2019 18:12:45 -0800 Subject: bpf: Fix syscall's stackmap lookup potential deadlock The map_lookup_elem used to not acquiring spinlock in order to optimize the reader. It was true until commit 557c0c6e7df8 ("bpf: convert stackmap to pre-allocation") The syscall's map_lookup_elem(stackmap) calls bpf_stackmap_copy(). bpf_stackmap_copy() may find the elem no longer needed after the copy is done. If that is the case, pcpu_freelist_push() saves this elem for reuse later. This push requires a spinlock. If a tracing bpf_prog got run in the middle of the syscall's map_lookup_elem(stackmap) and this tracing bpf_prog is calling bpf_get_stackid(stackmap) which also requires the same pcpu_freelist's spinlock, it may end up with a dead lock situation as reported by Eric Dumazet in https://patchwork.ozlabs.org/patch/1030266/ The situation is the same as the syscall's map_update_elem() which needs to acquire the pcpu_freelist's spinlock and could race with tracing bpf_prog. Hence, this patch fixes it by protecting bpf_stackmap_copy() with this_cpu_inc(bpf_prog_active) to prevent tracing bpf_prog from running. A later syscall's map_lookup_elem commit f1a2e44a3aec ("bpf: add queue and stack maps") also acquires a spinlock and races with tracing bpf_prog similarly. Hence, this patch is forward looking and protects the majority of the map lookups. bpf_map_offload_lookup_elem() is the exception since it is for network bpf_prog only (i.e. never called by tracing bpf_prog). Fixes: 557c0c6e7df8 ("bpf: convert stackmap to pre-allocation") Reported-by: Eric Dumazet Acked-by: Alexei Starovoitov Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b155cd17c1bd..8577bb7f8be6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -713,8 +713,13 @@ static int map_lookup_elem(union bpf_attr *attr) if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_lookup_elem(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + goto done; + } + + preempt_disable(); + this_cpu_inc(bpf_prog_active); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); @@ -744,7 +749,10 @@ static int map_lookup_elem(union bpf_attr *attr) } rcu_read_unlock(); } + this_cpu_dec(bpf_prog_active); + preempt_enable(); +done: if (err) goto free_value; -- cgit v1.2.3 From 8fb335e078378c8426fabeed1ebee1fbf915690c Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 1 Feb 2019 14:20:24 -0800 Subject: kernel/exit.c: release ptraced tasks before zap_pid_ns_processes Currently, exit_ptrace() adds all ptraced tasks in a dead list, then zap_pid_ns_processes() waits on all tasks in a current pidns, and only then are tasks from the dead list released. zap_pid_ns_processes() can get stuck on waiting tasks from the dead list. In this case, we will have one unkillable process with one or more dead children. Thanks to Oleg for the advice to release tasks in find_child_reaper(). Link: http://lkml.kernel.org/r/20190110175200.12442-1-avagin@gmail.com Fixes: 7c8bd2322c7f ("exit: ptrace: shift "reap dead" code from exit_ptrace() to forget_original_parent()") Signed-off-by: Andrei Vagin Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 3fb7be001964..2639a30a8aa5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -558,12 +558,14 @@ static struct task_struct *find_alive_thread(struct task_struct *p) return NULL; } -static struct task_struct *find_child_reaper(struct task_struct *father) +static struct task_struct *find_child_reaper(struct task_struct *father, + struct list_head *dead) __releases(&tasklist_lock) __acquires(&tasklist_lock) { struct pid_namespace *pid_ns = task_active_pid_ns(father); struct task_struct *reaper = pid_ns->child_reaper; + struct task_struct *p, *n; if (likely(reaper != father)) return reaper; @@ -579,6 +581,12 @@ static struct task_struct *find_child_reaper(struct task_struct *father) panic("Attempted to kill init! exitcode=0x%08x\n", father->signal->group_exit_code ?: father->exit_code); } + + list_for_each_entry_safe(p, n, dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } + zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); @@ -668,7 +676,7 @@ static void forget_original_parent(struct task_struct *father, exit_ptrace(father, dead); /* Can drop and reacquire tasklist_lock */ - reaper = find_child_reaper(father); + reaper = find_child_reaper(father, dead); if (list_empty(&father->children)) return; -- cgit v1.2.3 From 1b69ac6b40ebd85eed73e4dbccde2a36961ab990 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 1 Feb 2019 14:20:42 -0800 Subject: psi: fix aggregation idle shut-off psi has provisions to shut off the periodic aggregation worker when there is a period of no task activity - and thus no data that needs aggregating. However, while developing psi monitoring, Suren noticed that the aggregation clock currently won't stay shut off for good. Debugging this revealed a flaw in the idle design: an aggregation run will see no task activity and decide to go to sleep; shortly thereafter, the kworker thread that executed the aggregation will go idle and cause a scheduling change, during which the psi callback will kick the !pending worker again. This will ping-pong forever, and is equivalent to having no shut-off logic at all (but with more code!) Fix this by exempting aggregation workers from psi's clock waking logic when the state change is them going to sleep. To do this, tag workers with the last work function they executed, and if in psi we see a worker going to sleep after aggregating psi data, we will not reschedule the aggregation work item. What if the worker is also executing other items before or after? Any psi state times that were incurred by work items preceding the aggregation work will have been collected from the per-cpu buckets during the aggregation itself. If there are work items following the aggregation work, the worker's last_func tag will be overwritten and the aggregator will be kept alive to process this genuine new activity. If the aggregation work is the last thing the worker does, and we decide to go idle, the brief period of non-idle time incurred between the aggregation run and the kworker's dequeue will be stranded in the per-cpu buckets until the clock is woken by later activity. But that should not be a problem. The buckets can hold 4s worth of time, and future activity will wake the clock with a 2s delay, giving us 2s worth of data we can leave behind when disabling aggregation. If it takes a worker more than two seconds to go idle after it finishes its last work item, we likely have bigger problems in the system, and won't notice one sample that was averaged with a bogus per-CPU weight. Link: http://lkml.kernel.org/r/20190116193501.1910-1-hannes@cmpxchg.org Fixes: eb414681d5a0 ("psi: pressure stall information for CPU, memory, and IO") Signed-off-by: Johannes Weiner Reported-by: Suren Baghdasaryan Acked-by: Tejun Heo Cc: Peter Zijlstra Cc: Lai Jiangshan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 21 +++++++++++++++++---- kernel/workqueue.c | 23 +++++++++++++++++++++++ kernel/workqueue_internal.h | 6 +++++- 3 files changed, 45 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index fe24de3fbc93..c3484785b179 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -124,6 +124,7 @@ * sampling of the aggregate task states would be. */ +#include "../workqueue_internal.h" #include #include #include @@ -480,9 +481,6 @@ static void psi_group_change(struct psi_group *group, int cpu, groupc->tasks[t]++; write_seqcount_end(&groupc->seq); - - if (!delayed_work_pending(&group->clock_work)) - schedule_delayed_work(&group->clock_work, PSI_FREQ); } static struct psi_group *iterate_groups(struct task_struct *task, void **iter) @@ -513,6 +511,7 @@ void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); struct psi_group *group; + bool wake_clock = true; void *iter = NULL; if (!task->pid) @@ -530,8 +529,22 @@ void psi_task_change(struct task_struct *task, int clear, int set) task->psi_flags &= ~clear; task->psi_flags |= set; - while ((group = iterate_groups(task, &iter))) + /* + * Periodic aggregation shuts off if there is a period of no + * task changes, so we wake it back up if necessary. However, + * don't do this if the task change is the aggregation worker + * itself going to sleep, or we'll ping-pong forever. + */ + if (unlikely((clear & TSK_RUNNING) && + (task->flags & PF_WQ_WORKER) && + wq_worker_last_func(task) == psi_update_work)) + wake_clock = false; + + while ((group = iterate_groups(task, &iter))) { psi_group_change(group, cpu, clear, set); + if (wake_clock && !delayed_work_pending(&group->clock_work)) + schedule_delayed_work(&group->clock_work, PSI_FREQ); + } } void psi_memstall_tick(struct task_struct *task, int cpu) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 392be4b252f6..fc5d23d752a5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -909,6 +909,26 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task) return to_wakeup ? to_wakeup->task : NULL; } +/** + * wq_worker_last_func - retrieve worker's last work function + * + * Determine the last function a worker executed. This is called from + * the scheduler to get a worker's last known identity. + * + * CONTEXT: + * spin_lock_irq(rq->lock) + * + * Return: + * The last work function %current executed as a worker, NULL if it + * hasn't executed any work yet. + */ +work_func_t wq_worker_last_func(struct task_struct *task) +{ + struct worker *worker = kthread_data(task); + + return worker->last_func; +} + /** * worker_set_flags - set worker flags and adjust nr_running accordingly * @worker: self @@ -2184,6 +2204,9 @@ __acquires(&pool->lock) if (unlikely(cpu_intensive)) worker_clr_flags(worker, WORKER_CPU_INTENSIVE); + /* tag the worker for identification in schedule() */ + worker->last_func = worker->current_func; + /* we're done with it, release */ hash_del(&worker->hentry); worker->current_work = NULL; diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 66fbb5a9e633..cb68b03ca89a 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -53,6 +53,9 @@ struct worker { /* used only by rescuers to point to the target workqueue */ struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ + + /* used by the scheduler to determine a worker's last known identity */ + work_func_t last_func; }; /** @@ -67,9 +70,10 @@ static inline struct worker *current_wq_worker(void) /* * Scheduler hooks for concurrency managed workqueue. Only to be used from - * sched/core.c and workqueue.c. + * sched/ and workqueue.c. */ void wq_worker_waking_up(struct task_struct *task, int cpu); struct task_struct *wq_worker_sleeping(struct task_struct *task); +work_func_t wq_worker_last_func(struct task_struct *task); #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ -- cgit v1.2.3 From 9dff0aa95a324e262ffb03f425d00e4751f3294e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 10 Jan 2019 14:27:45 +0000 Subject: perf/core: Don't WARN() for impossible ring-buffer sizes The perf tool uses /proc/sys/kernel/perf_event_mlock_kb to determine how large its ringbuffer mmap should be. This can be configured to arbitrary values, which can be larger than the maximum possible allocation from kmalloc. When this is configured to a suitably large value (e.g. thanks to the perf fuzzer), attempting to use perf record triggers a WARN_ON_ONCE() in __alloc_pages_nodemask(): WARNING: CPU: 2 PID: 5666 at mm/page_alloc.c:4511 __alloc_pages_nodemask+0x3f8/0xbc8 Let's avoid this by checking that the requested allocation is possible before calling kzalloc. Reported-by: Julien Thierry Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Julien Thierry Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Link: https://lkml.kernel.org/r/20190110142745.25495-1-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 4a9937076331..309ef5a64af5 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -734,6 +734,9 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) size = sizeof(struct ring_buffer); size += nr_pages * sizeof(void *); + if (order_base_2(size) >= MAX_ORDER) + goto fail; + rb = kzalloc(size, GFP_KERNEL); if (!rb) goto fail; -- cgit v1.2.3 From a692933a87691681e880feb708081681ff32400a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 5 Feb 2019 07:19:11 -0600 Subject: signal: Always attempt to allocate siginfo for SIGSTOP Since 2.5.34 the code has had the potential to not allocate siginfo for SIGSTOP signals. Except for ptrace this is perfectly fine as only ptrace can use PTRACE_PEEK_SIGINFO and see what the contents of the delivered siginfo are. Users of PTRACE_PEEK_SIGINFO that care about the contents siginfo for SIGSTOP are rare, but they do exist. A seccomp self test has cared and lldb cares. Jack Andersen writes: > The patch titled > `signal: Never allocate siginfo for SIGKILL or SIGSTOP` > created a regression for users of PTRACE_GETSIGINFO needing to > discern signals that were raised via the tgkill syscall. > > A notable user of this tgkill+ptrace combination is lldb while > debugging a multithreaded program. Without the ability to detect a > SIGSTOP originating from tgkill, lldb does not have a way to > synchronize on a per-thread basis and falls back to SIGSTOP-ing the > entire process. Everyone affected by this please note. The kernel can still fail to allocate a siginfo structure. The allocation is with GFP_KERNEL and is best effort only. If memory is tight when the signal allocation comes in this will fail to allocate a siginfo. So I strongly recommend looking at more robust solutions for synchronizing with a single thread such as PTRACE_INTERRUPT. Or if that does not work persuading your friendly local kernel developer to build the interface you need. Reported-by: Tycho Andersen Reported-by: Kees Cook Reported-by: Jack Andersen Acked-by: Linus Torvalds Reviewed-by: Christian Brauner Cc: stable@vger.kernel.org Fixes: f149b3155744 ("signal: Never allocate siginfo for SIGKILL or SIGSTOP") Fixes: 6dfc88977e42 ("[PATCH] shared thread signals") History Tree: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e1d7ad8e6ab1..9ca8e5278c8e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1057,10 +1057,9 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc result = TRACE_SIGNAL_DELIVERED; /* - * Skip useless siginfo allocation for SIGKILL SIGSTOP, - * and kernel threads. + * Skip useless siginfo allocation for SIGKILL and kernel threads. */ - if (sig_kernel_only(sig) || (t->flags & PF_KTHREAD)) + if ((sig == SIGKILL) || (t->flags & PF_KTHREAD)) goto out_set; /* -- cgit v1.2.3 From 35634ffa1751b6efd8cf75010b509dcb0263e29b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 6 Feb 2019 18:39:40 -0600 Subject: signal: Always notice exiting tasks Recently syzkaller was able to create unkillablle processes by creating a timer that is delivered as a thread local signal on SIGHUP, and receiving SIGHUP SA_NODEFERER. Ultimately causing a loop failing to deliver SIGHUP but always trying. Upon examination it turns out part of the problem is actually most of the solution. Since 2.5 signal delivery has found all fatal signals, marked the signal group for death, and queued SIGKILL in every threads thread queue relying on signal->group_exit_code to preserve the information of which was the actual fatal signal. The conversion of all fatal signals to SIGKILL results in the synchronous signal heuristic in next_signal kicking in and preferring SIGHUP to SIGKILL. Which is especially problematic as all fatal signals have already been transformed into SIGKILL. Instead of dequeueing signals and depending upon SIGKILL to be the first signal dequeued, first test if the signal group has already been marked for death. This guarantees that nothing in the signal queue can prevent a process that needs to exit from exiting. Cc: stable@vger.kernel.org Tested-by: Dmitry Vyukov Reported-by: Dmitry Vyukov Ref: ebf5ebe31d2c ("[PATCH] signal-fixes-2.5.59-A4") History Tree: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 9ca8e5278c8e..5424cb0006bc 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2393,6 +2393,11 @@ relock: goto relock; } + /* Has this task already been marked for death? */ + ksig->info.si_signo = signr = SIGKILL; + if (signal_group_exit(signal)) + goto fatal; + for (;;) { struct k_sigaction *ka; @@ -2488,6 +2493,7 @@ relock: continue; } + fatal: spin_unlock_irq(&sighand->siglock); /* -- cgit v1.2.3 From 7146db3317c67b517258cb5e1b08af387da0618b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 6 Feb 2019 17:51:47 -0600 Subject: signal: Better detection of synchronous signals Recently syzkaller was able to create unkillablle processes by creating a timer that is delivered as a thread local signal on SIGHUP, and receiving SIGHUP SA_NODEFERER. Ultimately causing a loop failing to deliver SIGHUP but always trying. When the stack overflows delivery of SIGHUP fails and force_sigsegv is called. Unfortunately because SIGSEGV is numerically higher than SIGHUP next_signal tries again to deliver a SIGHUP. From a quality of implementation standpoint attempting to deliver the timer SIGHUP signal is wrong. We should attempt to deliver the synchronous SIGSEGV signal we just forced. We can make that happening in a fairly straight forward manner by instead of just looking at the signal number we also look at the si_code. In particular for exceptions (aka synchronous signals) the si_code is always greater than 0. That still has the potential to pick up a number of asynchronous signals as in a few cases the same si_codes that are used for synchronous signals are also used for asynchronous signals, and SI_KERNEL is also included in the list of possible si_codes. Still the heuristic is much better and timer signals are definitely excluded. Which is enough to prevent all known ways for someone sending a process signals fast enough to cause unexpected and arguably incorrect behavior. Cc: stable@vger.kernel.org Fixes: a27341cd5fcb ("Prioritize synchronous signals over 'normal' signals") Tested-by: Dmitry Vyukov Reported-by: Dmitry Vyukov Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 5424cb0006bc..99fa8ff06fd9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -688,6 +688,48 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *in } EXPORT_SYMBOL_GPL(dequeue_signal); +static int dequeue_synchronous_signal(kernel_siginfo_t *info) +{ + struct task_struct *tsk = current; + struct sigpending *pending = &tsk->pending; + struct sigqueue *q, *sync = NULL; + + /* + * Might a synchronous signal be in the queue? + */ + if (!((pending->signal.sig[0] & ~tsk->blocked.sig[0]) & SYNCHRONOUS_MASK)) + return 0; + + /* + * Return the first synchronous signal in the queue. + */ + list_for_each_entry(q, &pending->list, list) { + /* Synchronous signals have a postive si_code */ + if ((q->info.si_code > SI_USER) && + (sigmask(q->info.si_signo) & SYNCHRONOUS_MASK)) { + sync = q; + goto next; + } + } + return 0; +next: + /* + * Check if there is another siginfo for the same signal. + */ + list_for_each_entry_continue(q, &pending->list, list) { + if (q->info.si_signo == sync->info.si_signo) + goto still_pending; + } + + sigdelset(&pending->signal, sync->info.si_signo); + recalc_sigpending(); +still_pending: + list_del_init(&sync->list); + copy_siginfo(info, &sync->info); + __sigqueue_free(sync); + return info->si_signo; +} + /* * Tell a process that it has a new active signal.. * @@ -2411,7 +2453,15 @@ relock: goto relock; } - signr = dequeue_signal(current, ¤t->blocked, &ksig->info); + /* + * Signals generated by the execution of an instruction + * need to be delivered before any other pending signals + * so that the instruction pointer in the signal stack + * frame points to the faulting instruction. + */ + signr = dequeue_synchronous_signal(&ksig->info); + if (!signr) + signr = dequeue_signal(current, ¤t->blocked, &ksig->info); if (!signr) break; /* will return 0 */ -- cgit v1.2.3 From 6f568ebe2afefdc33a6fb06ef20a94f8b96455f1 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 6 Feb 2019 10:56:02 -0800 Subject: futex: Fix barrier comment The current comment for the barrier that guarantees that waiter increment is always before taking the hb spinlock (barrier (A)) needs to be fixed as it is misplaced. This is obviously referring to hb_waiters_inc, which is a full barrier. Reported-by: Peter Zijlstra Signed-off-by: Davidlohr Bueso Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190206185602.949-1-dave@stgolabs.net --- kernel/futex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index fdd312da0992..5ec2473a3497 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2221,11 +2221,11 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) * decrement the counter at queue_unlock() when some error has * occurred and we don't end up adding the task to the list. */ - hb_waiters_inc(hb); + hb_waiters_inc(hb); /* implies smp_mb(); (A) */ q->lock_ptr = &hb->lock; - spin_lock(&hb->lock); /* implies smp_mb(); (A) */ + spin_lock(&hb->lock); return hb; } -- cgit v1.2.3 From 1a1fb985f2e2b85ec0d3dc2e519ee48389ec2434 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 29 Jan 2019 23:15:12 +0100 Subject: futex: Handle early deadlock return correctly commit 56222b212e8e ("futex: Drop hb->lock before enqueueing on the rtmutex") changed the locking rules in the futex code so that the hash bucket lock is not longer held while the waiter is enqueued into the rtmutex wait list. This made the lock and the unlock path symmetric, but unfortunately the possible early exit from __rt_mutex_proxy_start() due to a detected deadlock was not updated accordingly. That allows a concurrent unlocker to observe inconsitent state which triggers the warning in the unlock path. futex_lock_pi() futex_unlock_pi() lock(hb->lock) queue(hb_waiter) lock(hb->lock) lock(rtmutex->wait_lock) unlock(hb->lock) // acquired hb->lock hb_waiter = futex_top_waiter() lock(rtmutex->wait_lock) __rt_mutex_proxy_start() ---> fail remove(rtmutex_waiter); ---> returns -EDEADLOCK unlock(rtmutex->wait_lock) // acquired wait_lock wake_futex_pi() rt_mutex_next_owner() --> returns NULL --> WARN lock(hb->lock) unqueue(hb_waiter) The problem is caused by the remove(rtmutex_waiter) in the failure case of __rt_mutex_proxy_start() as this lets the unlocker observe a waiter in the hash bucket but no waiter on the rtmutex, i.e. inconsistent state. The original commit handles this correctly for the other early return cases (timeout, signal) by delaying the removal of the rtmutex waiter until the returning task reacquired the hash bucket lock. Treat the failure case of __rt_mutex_proxy_start() in the same way and let the existing cleanup code handle the eventual handover of the rtmutex gracefully. The regular rt_mutex_proxy_start() gains the rtmutex waiter removal for the failure case, so that the other callsites are still operating correctly. Add proper comments to the code so all these details are fully documented. Thanks to Peter for helping with the analysis and writing the really valuable code comments. Fixes: 56222b212e8e ("futex: Drop hb->lock before enqueueing on the rtmutex") Reported-by: Heiko Carstens Co-developed-by: Peter Zijlstra Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Tested-by: Heiko Carstens Cc: Martin Schwidefsky Cc: linux-s390@vger.kernel.org Cc: Stefan Liebler Cc: Sebastian Sewior Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1901292311410.1950@nanos.tec.linutronix.de --- kernel/futex.c | 28 ++++++++++++++++++---------- kernel/locking/rtmutex.c | 37 ++++++++++++++++++++++++++++++++----- 2 files changed, 50 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 5ec2473a3497..a0514e01c3eb 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2861,35 +2861,39 @@ retry_private: * and BUG when futex_unlock_pi() interleaves with this. * * Therefore acquire wait_lock while holding hb->lock, but drop the - * latter before calling rt_mutex_start_proxy_lock(). This still fully - * serializes against futex_unlock_pi() as that does the exact same - * lock handoff sequence. + * latter before calling __rt_mutex_start_proxy_lock(). This + * interleaves with futex_unlock_pi() -- which does a similar lock + * handoff -- such that the latter can observe the futex_q::pi_state + * before __rt_mutex_start_proxy_lock() is done. */ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); spin_unlock(q.lock_ptr); + /* + * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter + * such that futex_unlock_pi() is guaranteed to observe the waiter when + * it sees the futex_q::pi_state. + */ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); if (ret) { if (ret == 1) ret = 0; - - spin_lock(q.lock_ptr); - goto no_block; + goto cleanup; } - if (unlikely(to)) hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); +cleanup: spin_lock(q.lock_ptr); /* - * If we failed to acquire the lock (signal/timeout), we must + * If we failed to acquire the lock (deadlock/signal/timeout), we must * first acquire the hb->lock before removing the lock from the - * rt_mutex waitqueue, such that we can keep the hb and rt_mutex - * wait lists consistent. + * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait + * lists consistent. * * In particular; it is important that futex_unlock_pi() can not * observe this inconsistency. @@ -3013,6 +3017,10 @@ retry: * there is no point where we hold neither; and therefore * wake_futex_pi() must observe a state consistent with what we * observed. + * + * In particular; this forces __rt_mutex_start_proxy() to + * complete such that we're guaranteed to observe the + * rt_waiter. Also see the WARN in wake_futex_pi(). */ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 581edcc63c26..978d63a8261c 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1726,12 +1726,33 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, rt_mutex_set_owner(lock, NULL); } +/** + * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take + * @waiter: the pre-initialized rt_mutex_waiter + * @task: the task to prepare + * + * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock + * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. + * + * NOTE: does _NOT_ remove the @waiter on failure; must either call + * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this. + * + * Returns: + * 0 - task blocked on lock + * 1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for PI-futex support. + */ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task) { int ret; + lockdep_assert_held(&lock->wait_lock); + if (try_to_take_rt_mutex(lock, task, NULL)) return 1; @@ -1749,9 +1770,6 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, ret = 0; } - if (unlikely(ret)) - remove_waiter(lock, waiter); - debug_rt_mutex_print_deadlock(waiter); return ret; @@ -1763,12 +1781,18 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, * @waiter: the pre-initialized rt_mutex_waiter * @task: the task to prepare * + * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock + * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. + * + * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter + * on failure. + * * Returns: * 0 - task blocked on lock * 1 - acquired the lock for task, caller should wake it up * <0 - error * - * Special API call for FUTEX_REQUEUE_PI support. + * Special API call for PI-futex support. */ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, @@ -1778,6 +1802,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, raw_spin_lock_irq(&lock->wait_lock); ret = __rt_mutex_start_proxy_lock(lock, waiter, task); + if (unlikely(ret)) + remove_waiter(lock, waiter); raw_spin_unlock_irq(&lock->wait_lock); return ret; @@ -1845,7 +1871,8 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, * @lock: the rt_mutex we were woken on * @waiter: the pre-initialized rt_mutex_waiter * - * Attempt to clean up after a failed rt_mutex_wait_proxy_lock(). + * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or + * rt_mutex_wait_proxy_lock(). * * Unless we acquired the lock; we're still enqueued on the wait-list and can * in fact still be granted ownership until we're removed. Therefore we can -- cgit v1.2.3 From 81ec3f3c4c4d78f2d3b6689c9816bfbdf7417dbb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 4 Feb 2019 13:35:32 +0100 Subject: perf/x86: Add check_period PMU callback Vince (and later on Ravi) reported crashes in the BTS code during fuzzing with the following backtrace: general protection fault: 0000 [#1] SMP PTI ... RIP: 0010:perf_prepare_sample+0x8f/0x510 ... Call Trace: ? intel_pmu_drain_bts_buffer+0x194/0x230 intel_pmu_drain_bts_buffer+0x160/0x230 ? tick_nohz_irq_exit+0x31/0x40 ? smp_call_function_single_interrupt+0x48/0xe0 ? call_function_single_interrupt+0xf/0x20 ? call_function_single_interrupt+0xa/0x20 ? x86_schedule_events+0x1a0/0x2f0 ? x86_pmu_commit_txn+0xb4/0x100 ? find_busiest_group+0x47/0x5d0 ? perf_event_set_state.part.42+0x12/0x50 ? perf_mux_hrtimer_restart+0x40/0xb0 intel_pmu_disable_event+0xae/0x100 ? intel_pmu_disable_event+0xae/0x100 x86_pmu_stop+0x7a/0xb0 x86_pmu_del+0x57/0x120 event_sched_out.isra.101+0x83/0x180 group_sched_out.part.103+0x57/0xe0 ctx_sched_out+0x188/0x240 ctx_resched+0xa8/0xd0 __perf_event_enable+0x193/0x1e0 event_function+0x8e/0xc0 remote_function+0x41/0x50 flush_smp_call_function_queue+0x68/0x100 generic_smp_call_function_single_interrupt+0x13/0x30 smp_call_function_single_interrupt+0x3e/0xe0 call_function_single_interrupt+0xf/0x20 The reason is that while event init code does several checks for BTS events and prevents several unwanted config bits for BTS event (like precise_ip), the PERF_EVENT_IOC_PERIOD allows to create BTS event without those checks being done. Following sequence will cause the crash: If we create an 'almost' BTS event with precise_ip and callchains, and it into a BTS event it will crash the perf_prepare_sample() function because precise_ip events are expected to come in with callchain data initialized, but that's not the case for intel_pmu_drain_bts_buffer() caller. Adding a check_period callback to be called before the period is changed via PERF_EVENT_IOC_PERIOD. It will deny the change if the event would become BTS. Plus adding also the limit_period check as well. Reported-by: Vince Weaver Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Naveen N. Rao Cc: Ravi Bangoria Cc: Stephane Eranian Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20190204123532.GA4794@krava Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 14 ++++++++++++++ arch/x86/events/intel/core.c | 9 +++++++++ arch/x86/events/perf_event.h | 16 ++++++++++++++-- include/linux/perf_event.h | 5 +++++ kernel/events/core.c | 16 ++++++++++++++++ 5 files changed, 58 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 374a19712e20..b684f0294f35 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2278,6 +2278,19 @@ void perf_check_microcode(void) x86_pmu.check_microcode(); } +static int x86_pmu_check_period(struct perf_event *event, u64 value) +{ + if (x86_pmu.check_period && x86_pmu.check_period(event, value)) + return -EINVAL; + + if (value && x86_pmu.limit_period) { + if (x86_pmu.limit_period(event, value) > value) + return -EINVAL; + } + + return 0; +} + static struct pmu pmu = { .pmu_enable = x86_pmu_enable, .pmu_disable = x86_pmu_disable, @@ -2302,6 +2315,7 @@ static struct pmu pmu = { .event_idx = x86_pmu_event_idx, .sched_task = x86_pmu_sched_task, .task_ctx_size = sizeof(struct x86_perf_task_context), + .check_period = x86_pmu_check_period, }; void arch_perf_update_userpage(struct perf_event *event, diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index daafb893449b..730978dff63f 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3587,6 +3587,11 @@ static void intel_pmu_sched_task(struct perf_event_context *ctx, intel_pmu_lbr_sched_task(ctx, sched_in); } +static int intel_pmu_check_period(struct perf_event *event, u64 value) +{ + return intel_pmu_has_bts_period(event, value) ? -EINVAL : 0; +} + PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); PMU_FORMAT_ATTR(ldlat, "config1:0-15"); @@ -3667,6 +3672,8 @@ static __initconst const struct x86_pmu core_pmu = { .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, .cpu_dead = intel_pmu_cpu_dead, + + .check_period = intel_pmu_check_period, }; static struct attribute *intel_pmu_attrs[]; @@ -3711,6 +3718,8 @@ static __initconst const struct x86_pmu intel_pmu = { .guest_get_msrs = intel_guest_get_msrs, .sched_task = intel_pmu_sched_task, + + .check_period = intel_pmu_check_period, }; static __init void intel_clovertown_quirk(void) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 78d7b7031bfc..d46fd6754d92 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -646,6 +646,11 @@ struct x86_pmu { * Intel host/guest support (KVM) */ struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); + + /* + * Check period value for PERF_EVENT_IOC_PERIOD ioctl. + */ + int (*check_period) (struct perf_event *event, u64 period); }; struct x86_perf_task_context { @@ -857,7 +862,7 @@ static inline int amd_pmu_init(void) #ifdef CONFIG_CPU_SUP_INTEL -static inline bool intel_pmu_has_bts(struct perf_event *event) +static inline bool intel_pmu_has_bts_period(struct perf_event *event, u64 period) { struct hw_perf_event *hwc = &event->hw; unsigned int hw_event, bts_event; @@ -868,7 +873,14 @@ static inline bool intel_pmu_has_bts(struct perf_event *event) hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); - return hw_event == bts_event && hwc->sample_period == 1; + return hw_event == bts_event && period == 1; +} + +static inline bool intel_pmu_has_bts(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + return intel_pmu_has_bts_period(event, hwc->sample_period); } int intel_pmu_save_and_restart(struct perf_event *event); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1d5c551a5add..e1a051724f7e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -447,6 +447,11 @@ struct pmu { * Filter events for PMU-specific reasons. */ int (*filter_match) (struct perf_event *event); /* optional */ + + /* + * Check period value for PERF_EVENT_IOC_PERIOD ioctl. + */ + int (*check_period) (struct perf_event *event, u64 value); /* optional */ }; enum perf_addr_filter_action_t { diff --git a/kernel/events/core.c b/kernel/events/core.c index e5ede6918050..26d6edab051a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4963,6 +4963,11 @@ static void __perf_event_period(struct perf_event *event, } } +static int perf_event_check_period(struct perf_event *event, u64 value) +{ + return event->pmu->check_period(event, value); +} + static int perf_event_period(struct perf_event *event, u64 __user *arg) { u64 value; @@ -4979,6 +4984,9 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) if (event->attr.freq && value > sysctl_perf_event_sample_rate) return -EINVAL; + if (perf_event_check_period(event, value)) + return -EINVAL; + event_function_call(event, __perf_event_period, &value); return 0; @@ -9391,6 +9399,11 @@ static int perf_pmu_nop_int(struct pmu *pmu) return 0; } +static int perf_event_nop_int(struct perf_event *event, u64 value) +{ + return 0; +} + static DEFINE_PER_CPU(unsigned int, nop_txn_flags); static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) @@ -9691,6 +9704,9 @@ got_cpu_context: pmu->pmu_disable = perf_pmu_nop_void; } + if (!pmu->check_period) + pmu->check_period = perf_event_nop_int; + if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; -- cgit v1.2.3 From f6675872db57305fa957021efc788f9983ed3b67 Mon Sep 17 00:00:00 2001 From: Andreas Ziegler Date: Wed, 6 Feb 2019 20:00:13 +0100 Subject: tracing: probeevent: Correctly update remaining space in dynamic area Commit 9178412ddf5a ("tracing: probeevent: Return consumed bytes of dynamic area") improved the string fetching mechanism by returning the number of required bytes after copying the argument to the dynamic area. However, this return value is now only used to increment the pointer inside the dynamic area but misses updating the 'maxlen' variable which indicates the remaining space in the dynamic area. This means that fetch_store_string() always reads the *total* size of the dynamic area from the data_loc pointer instead of the *remaining* size (and passes it along to strncpy_from_{user,unsafe}) even if we're already about to copy data into the middle of the dynamic area. Link: http://lkml.kernel.org/r/20190206190013.16405-1-andreas.ziegler@fau.de Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: 9178412ddf5a ("tracing: probeevent: Return consumed bytes of dynamic area") Acked-by: Masami Hiramatsu Signed-off-by: Andreas Ziegler Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe_tmpl.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 5c56afc17cf8..4737bb8c07a3 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -180,10 +180,12 @@ store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs, if (unlikely(arg->dynamic)) *dl = make_data_loc(maxlen, dyndata - base); ret = process_fetch_insn(arg->code, regs, dl, base); - if (unlikely(ret < 0 && arg->dynamic)) + if (unlikely(ret < 0 && arg->dynamic)) { *dl = make_data_loc(0, dyndata - base); - else + } else { dyndata += ret; + maxlen -= ret; + } } } -- cgit v1.2.3 From 528871b456026e6127d95b1b2bd8e3a003dc1614 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 13 Feb 2019 07:57:02 +0100 Subject: perf/core: Fix impossible ring-buffer sizes warning The following commit: 9dff0aa95a32 ("perf/core: Don't WARN() for impossible ring-buffer sizes") results in perf recording failures with larger mmap areas: root@skl:/tmp# perf record -g -a failed to mmap with 12 (Cannot allocate memory) The root cause is that the following condition is buggy: if (order_base_2(size) >= MAX_ORDER) goto fail; The problem is that @size is in bytes and MAX_ORDER is in pages, so the right test is: if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER) goto fail; Fix it. Reported-by: "Jin, Yao" Bisected-by: Borislav Petkov Analyzed-by: Peter Zijlstra Cc: Julien Thierry Cc: Mark Rutland Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Greg Kroah-Hartman Cc: Fixes: 9dff0aa95a32 ("perf/core: Don't WARN() for impossible ring-buffer sizes") Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 309ef5a64af5..5ab4fe3b1dcc 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -734,7 +734,7 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) size = sizeof(struct ring_buffer); size += nr_pages * sizeof(void *); - if (order_base_2(size) >= MAX_ORDER) + if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER) goto fail; rb = kzalloc(size, GFP_KERNEL); -- cgit v1.2.3 From cf43a757fd49442bc38f76088b70c2299eed2c2f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 11 Feb 2019 23:27:42 -0600 Subject: signal: Restore the stop PTRACE_EVENT_EXIT In the middle of do_exit() there is there is a call "ptrace_event(PTRACE_EVENT_EXIT, code);" That call places the process in TACKED_TRACED aka "(TASK_WAKEKILL | __TASK_TRACED)" and waits for for the debugger to release the task or SIGKILL to be delivered. Skipping past dequeue_signal when we know a fatal signal has already been delivered resulted in SIGKILL remaining pending and TIF_SIGPENDING remaining set. This in turn caused the scheduler to not sleep in PTACE_EVENT_EXIT as it figured a fatal signal was pending. This also caused ptrace_freeze_traced in ptrace_check_attach to fail because it left a per thread SIGKILL pending which is what fatal_signal_pending tests for. This difference in signal state caused strace to report strace: Exit of unknown pid NNNNN ignored Therefore update the signal handling state like dequeue_signal would when removing a per thread SIGKILL, by removing SIGKILL from the per thread signal mask and clearing TIF_SIGPENDING. Acked-by: Oleg Nesterov Reported-by: Oleg Nesterov Reported-by: Ivan Delalande Cc: stable@vger.kernel.org Fixes: 35634ffa1751 ("signal: Always notice exiting tasks") Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 99fa8ff06fd9..57b7771e20d7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2436,9 +2436,12 @@ relock: } /* Has this task already been marked for death? */ - ksig->info.si_signo = signr = SIGKILL; - if (signal_group_exit(signal)) + if (signal_group_exit(signal)) { + ksig->info.si_signo = signr = SIGKILL; + sigdelset(¤t->pending.signal, SIGKILL); + recalc_sigpending(); goto fatal; + } for (;;) { struct k_sigaction *ka; -- cgit v1.2.3