From 07fd5b6cdf3cc30bfde8fe0f644771688be04447 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 13 Jun 2022 12:19:50 -1000 Subject: cgroup: Use separate src/dst nodes when preloading css_sets for migration Each cset (css_set) is pinned by its tasks. When we're moving tasks around across csets for a migration, we need to hold the source and destination csets to ensure that they don't go away while we're moving tasks about. This is done by linking cset->mg_preload_node on either the mgctx->preloaded_src_csets or mgctx->preloaded_dst_csets list. Using the same cset->mg_preload_node for both the src and dst lists was deemed okay as a cset can't be both the source and destination at the same time. Unfortunately, this overloading becomes problematic when multiple tasks are involved in a migration and some of them are identity noop migrations while others are actually moving across cgroups. For example, this can happen with the following sequence on cgroup1: #1> mkdir -p /sys/fs/cgroup/misc/a/b #2> echo $$ > /sys/fs/cgroup/misc/a/cgroup.procs #3> RUN_A_COMMAND_WHICH_CREATES_MULTIPLE_THREADS & #4> PID=$! #5> echo $PID > /sys/fs/cgroup/misc/a/b/tasks #6> echo $PID > /sys/fs/cgroup/misc/a/cgroup.procs the process including the group leader back into a. In this final migration, non-leader threads would be doing identity migration while the group leader is doing an actual one. After #3, let's say the whole process was in cset A, and that after #4, the leader moves to cset B. Then, during #6, the following happens: 1. cgroup_migrate_add_src() is called on B for the leader. 2. cgroup_migrate_add_src() is called on A for the other threads. 3. cgroup_migrate_prepare_dst() is called. It scans the src list. 4. It notices that B wants to migrate to A, so it tries to A to the dst list but realizes that its ->mg_preload_node is already busy. 5. and then it notices A wants to migrate to A as it's an identity migration, it culls it by list_del_init()'ing its ->mg_preload_node and putting references accordingly. 6. The rest of migration takes place with B on the src list but nothing on the dst list. This means that A isn't held while migration is in progress. If all tasks leave A before the migration finishes and the incoming task pins it, the cset will be destroyed leading to use-after-free. This is caused by overloading cset->mg_preload_node for both src and dst preload lists. We wanted to exclude the cset from the src list but ended up inadvertently excluding it from the dst list too. This patch fixes the issue by separating out cset->mg_preload_node into ->mg_src_preload_node and ->mg_dst_preload_node, so that the src and dst preloadings don't interfere with each other. Signed-off-by: Tejun Heo Reported-by: Mukesh Ojha Reported-by: shisiyuan Link: http://lkml.kernel.org/r/1654187688-27411-1-git-send-email-shisiyuan@xiaomi.com Link: https://www.spinics.net/lists/cgroups/msg33313.html Fixes: f817de98513d ("cgroup: prepare migration path for unified hierarchy") Cc: stable@vger.kernel.org # v3.16+ --- kernel/cgroup/cgroup.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1779ccddb734..13c8e91d7862 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -765,7 +765,8 @@ struct css_set init_css_set = { .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), - .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), + .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node), + .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), /* @@ -1240,7 +1241,8 @@ static struct css_set *find_css_set(struct css_set *old_cset, INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); INIT_LIST_HEAD(&cset->cgrp_links); - INIT_LIST_HEAD(&cset->mg_preload_node); + INIT_LIST_HEAD(&cset->mg_src_preload_node); + INIT_LIST_HEAD(&cset->mg_dst_preload_node); INIT_LIST_HEAD(&cset->mg_node); /* Copy the set of subsystem state objects generated in @@ -2597,21 +2599,27 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp) */ void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) { - LIST_HEAD(preloaded); struct css_set *cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded); - list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded); + list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets, + mg_src_preload_node) { + cset->mg_src_cgrp = NULL; + cset->mg_dst_cgrp = NULL; + cset->mg_dst_cset = NULL; + list_del_init(&cset->mg_src_preload_node); + put_css_set_locked(cset); + } - list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) { + list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets, + mg_dst_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; - list_del_init(&cset->mg_preload_node); + list_del_init(&cset->mg_dst_preload_node); put_css_set_locked(cset); } @@ -2651,7 +2659,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset, if (src_cset->dead) return; - if (!list_empty(&src_cset->mg_preload_node)) + if (!list_empty(&src_cset->mg_src_preload_node)) return; src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); @@ -2664,7 +2672,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset, src_cset->mg_src_cgrp = src_cgrp; src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); - list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets); + list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets); } /** @@ -2689,7 +2697,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) /* look up the dst cset for each src cset and link it to src */ list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, - mg_preload_node) { + mg_src_preload_node) { struct css_set *dst_cset; struct cgroup_subsys *ss; int ssid; @@ -2708,7 +2716,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) if (src_cset == dst_cset) { src_cset->mg_src_cgrp = NULL; src_cset->mg_dst_cgrp = NULL; - list_del_init(&src_cset->mg_preload_node); + list_del_init(&src_cset->mg_src_preload_node); put_css_set(src_cset); put_css_set(dst_cset); continue; @@ -2716,8 +2724,8 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) src_cset->mg_dst_cset = dst_cset; - if (list_empty(&dst_cset->mg_preload_node)) - list_add_tail(&dst_cset->mg_preload_node, + if (list_empty(&dst_cset->mg_dst_preload_node)) + list_add_tail(&dst_cset->mg_dst_preload_node, &mgctx->preloaded_dst_csets); else put_css_set(dst_cset); @@ -2963,7 +2971,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) goto out_finish; spin_lock_irq(&css_set_lock); - list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) { + list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, + mg_src_preload_node) { struct task_struct *task, *ntask; /* all tasks in src_csets need to be migrated */ -- cgit v1.2.3 From 35adf9a4e55e0b0a9d5e313e65ad83681dc32e9a Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 1 Jul 2022 12:44:03 +0300 Subject: modules: Fix corruption of /proc/kallsyms The commit 91fb02f31505 ("module: Move kallsyms support into a separate file") changed from using strlcpy() to using strscpy() which created a buffer overflow. That happened because: 1) an incorrect value was passed as the buffer length 2) strscpy() (unlike strlcpy()) may copy beyond the length of the input string when copying word-by-word. The assumption was that because it was already known that the strings being copied would fit in the space available, it was not necessary to correctly set the buffer length. strscpy() breaks that assumption because although it will not touch bytes beyond the given buffer length it may write bytes beyond the input string length when writing word-by-word. The result of the buffer overflow is to corrupt the symbol type information that follows. e.g. $ sudo cat -v /proc/kallsyms | grep '\^' | head ffffffffc0615000 ^@ rfcomm_session_get [rfcomm] ffffffffc061c060 ^@ session_list [rfcomm] ffffffffc06150d0 ^@ rfcomm_send_frame [rfcomm] ffffffffc0615130 ^@ rfcomm_make_uih [rfcomm] ffffffffc07ed58d ^@ bnep_exit [bnep] ffffffffc07ec000 ^@ bnep_rx_control [bnep] ffffffffc07ec1a0 ^@ bnep_session [bnep] ffffffffc07e7000 ^@ input_leds_event [input_leds] ffffffffc07e9000 ^@ input_leds_handler [input_leds] ffffffffc07e7010 ^@ input_leds_disconnect [input_leds] Notably, the null bytes (represented above by ^@) can confuse tools. Fix by correcting the buffer length. Fixes: 91fb02f31505 ("module: Move kallsyms support into a separate file") Signed-off-by: Adrian Hunter Signed-off-by: Luis Chamberlain --- kernel/module/kallsyms.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index 3e11523bc6f6..18c23545b984 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -137,6 +137,7 @@ void layout_symtab(struct module *mod, struct load_info *info) info->symoffs = ALIGN(mod->data_layout.size, symsect->sh_addralign ?: 1); info->stroffs = mod->data_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); mod->data_layout.size += strtab_size; + /* Note add_kallsyms() computes strtab_size as core_typeoffs - stroffs */ info->core_typeoffs = mod->data_layout.size; mod->data_layout.size += ndst * sizeof(char); mod->data_layout.size = strict_align(mod->data_layout.size); @@ -169,6 +170,7 @@ void add_kallsyms(struct module *mod, const struct load_info *info) Elf_Sym *dst; char *s; Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; + unsigned long strtab_size; /* Set up to point into init section. */ mod->kallsyms = (void __rcu *)mod->init_layout.base + @@ -190,19 +192,26 @@ void add_kallsyms(struct module *mod, const struct load_info *info) mod->core_kallsyms.symtab = dst = mod->data_layout.base + info->symoffs; mod->core_kallsyms.strtab = s = mod->data_layout.base + info->stroffs; mod->core_kallsyms.typetab = mod->data_layout.base + info->core_typeoffs; + strtab_size = info->core_typeoffs - info->stroffs; src = rcu_dereference_sched(mod->kallsyms)->symtab; for (ndst = i = 0; i < rcu_dereference_sched(mod->kallsyms)->num_symtab; i++) { rcu_dereference_sched(mod->kallsyms)->typetab[i] = elf_type(src + i, info); if (i == 0 || is_livepatch_module(mod) || is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum, info->index.pcpu)) { + ssize_t ret; + mod->core_kallsyms.typetab[ndst] = rcu_dereference_sched(mod->kallsyms)->typetab[i]; dst[ndst] = src[i]; dst[ndst++].st_name = s - mod->core_kallsyms.strtab; - s += strscpy(s, - &rcu_dereference_sched(mod->kallsyms)->strtab[src[i].st_name], - KSYM_NAME_LEN) + 1; + ret = strscpy(s, + &rcu_dereference_sched(mod->kallsyms)->strtab[src[i].st_name], + strtab_size); + if (ret < 0) + break; + s += ret + 1; + strtab_size -= ret + 1; } } preempt_enable(); -- cgit v1.2.3 From cfa94c538be621a0ba645adfa9ead005b5fa02f6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sun, 12 Jun 2022 17:21:56 +0200 Subject: module: Fix selfAssignment cppcheck warning cppcheck reports the following warnings: kernel/module/main.c:1455:26: warning: Redundant assignment of 'mod->core_layout.size' to itself. [selfAssignment] mod->core_layout.size = strict_align(mod->core_layout.size); ^ kernel/module/main.c:1489:26: warning: Redundant assignment of 'mod->init_layout.size' to itself. [selfAssignment] mod->init_layout.size = strict_align(mod->init_layout.size); ^ kernel/module/main.c:1493:26: warning: Redundant assignment of 'mod->init_layout.size' to itself. [selfAssignment] mod->init_layout.size = strict_align(mod->init_layout.size); ^ kernel/module/main.c:1504:26: warning: Redundant assignment of 'mod->init_layout.size' to itself. [selfAssignment] mod->init_layout.size = strict_align(mod->init_layout.size); ^ kernel/module/main.c:1459:26: warning: Redundant assignment of 'mod->data_layout.size' to itself. [selfAssignment] mod->data_layout.size = strict_align(mod->data_layout.size); ^ kernel/module/main.c:1463:26: warning: Redundant assignment of 'mod->data_layout.size' to itself. [selfAssignment] mod->data_layout.size = strict_align(mod->data_layout.size); ^ kernel/module/main.c:1467:26: warning: Redundant assignment of 'mod->data_layout.size' to itself. [selfAssignment] mod->data_layout.size = strict_align(mod->data_layout.size); ^ This is due to strict_align() being a no-op when CONFIG_STRICT_MODULE_RWX is not selected. Transform strict_align() macro into an inline function. It will allow type checking and avoid the selfAssignment warning. Reported-by: kernel test robot Signed-off-by: Christophe Leroy Signed-off-by: Luis Chamberlain --- kernel/module/internal.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/module/internal.h b/kernel/module/internal.h index bc5507ab8450..ec104c2950c3 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -11,6 +11,7 @@ #include #include #include +#include #ifndef ARCH_SHF_SMALL #define ARCH_SHF_SMALL 0 @@ -30,11 +31,13 @@ * to ensure complete separation of code and data, but * only when CONFIG_STRICT_MODULE_RWX=y */ -#ifdef CONFIG_STRICT_MODULE_RWX -# define strict_align(X) PAGE_ALIGN(X) -#else -# define strict_align(X) (X) -#endif +static inline unsigned int strict_align(unsigned int size) +{ + if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + return PAGE_ALIGN(size); + else + return size; +} extern struct mutex module_mutex; extern struct list_head modules; -- cgit v1.2.3 From f963ef123900ac534aeb6141642e5351989ac14c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sun, 12 Jun 2022 17:33:20 +0200 Subject: module: Fix "warning: variable 'exit' set but not used" When CONFIG_MODULE_UNLOAD is not selected, 'exit' is set but never used. It is not possible to replace the #ifdef CONFIG_MODULE_UNLOAD by IS_ENABLED(CONFIG_MODULE_UNLOAD) because mod->exit doesn't exist when CONFIG_MODULE_UNLOAD is not selected. And because of the rcu_read_lock_sched() section it is not easy to regroup everything in a single #ifdef. Let's regroup partially and add missing #ifdef to completely opt out the use of 'exit' when CONFIG_MODULE_UNLOAD is not selected. Reported-by: kernel test robot Signed-off-by: Christophe Leroy Signed-off-by: Luis Chamberlain --- kernel/module/main.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index fed58d30725d..0548151dd933 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2939,24 +2939,25 @@ static void cfi_init(struct module *mod) { #ifdef CONFIG_CFI_CLANG initcall_t *init; +#ifdef CONFIG_MODULE_UNLOAD exitcall_t *exit; +#endif rcu_read_lock_sched(); mod->cfi_check = (cfi_check_fn) find_kallsyms_symbol_value(mod, "__cfi_check"); init = (initcall_t *) find_kallsyms_symbol_value(mod, "__cfi_jt_init_module"); - exit = (exitcall_t *) - find_kallsyms_symbol_value(mod, "__cfi_jt_cleanup_module"); - rcu_read_unlock_sched(); - /* Fix init/exit functions to point to the CFI jump table */ if (init) mod->init = *init; #ifdef CONFIG_MODULE_UNLOAD + exit = (exitcall_t *) + find_kallsyms_symbol_value(mod, "__cfi_jt_cleanup_module"); if (exit) mod->exit = *exit; #endif + rcu_read_unlock_sched(); cfi_module_add(mod, mod_tree.addr_min); #endif -- cgit v1.2.3 From 0326195f523a549e0a9d7fd44c70b26fd7265090 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Jul 2022 12:39:00 +0000 Subject: bpf: Make sure mac_header was set before using it Classic BPF has a way to load bytes starting from the mac header. Some skbs do not have a mac header, and skb_mac_header() in this case is returning a pointer that 65535 bytes after skb->head. Existing range check in bpf_internal_load_pointer_neg_helper() was properly kicking and no illegal access was happening. New sanity check in skb_mac_header() is firing, so we need to avoid it. WARNING: CPU: 1 PID: 28990 at include/linux/skbuff.h:2785 skb_mac_header include/linux/skbuff.h:2785 [inline] WARNING: CPU: 1 PID: 28990 at include/linux/skbuff.h:2785 bpf_internal_load_pointer_neg_helper+0x1b1/0x1c0 kernel/bpf/core.c:74 Modules linked in: CPU: 1 PID: 28990 Comm: syz-executor.0 Not tainted 5.19.0-rc4-syzkaller-00865-g4874fb9484be #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 06/29/2022 RIP: 0010:skb_mac_header include/linux/skbuff.h:2785 [inline] RIP: 0010:bpf_internal_load_pointer_neg_helper+0x1b1/0x1c0 kernel/bpf/core.c:74 Code: ff ff 45 31 f6 e9 5a ff ff ff e8 aa 27 40 00 e9 3b ff ff ff e8 90 27 40 00 e9 df fe ff ff e8 86 27 40 00 eb 9e e8 2f 2c f3 ff <0f> 0b eb b1 e8 96 27 40 00 e9 79 fe ff ff 90 41 57 41 56 41 55 41 RSP: 0018:ffffc9000309f668 EFLAGS: 00010216 RAX: 0000000000000118 RBX: ffffffffffeff00c RCX: ffffc9000e417000 RDX: 0000000000040000 RSI: ffffffff81873f21 RDI: 0000000000000003 RBP: ffff8880842878c0 R08: 0000000000000003 R09: 000000000000ffff R10: 000000000000ffff R11: 0000000000000001 R12: 0000000000000004 R13: ffff88803ac56c00 R14: 000000000000ffff R15: dffffc0000000000 FS: 00007f5c88a16700(0000) GS:ffff8880b9b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fdaa9f6c058 CR3: 000000003a82c000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ____bpf_skb_load_helper_32 net/core/filter.c:276 [inline] bpf_skb_load_helper_32+0x191/0x220 net/core/filter.c:264 Fixes: f9aefd6b2aa3 ("net: warn if mac header was not set") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220707123900.945305-1-edumazet@google.com --- kernel/bpf/core.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5f6f3f829b36..e7961508a47d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -68,11 +68,13 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns { u8 *ptr = NULL; - if (k >= SKF_NET_OFF) + if (k >= SKF_NET_OFF) { ptr = skb_network_header(skb) + k - SKF_NET_OFF; - else if (k >= SKF_LL_OFF) + } else if (k >= SKF_LL_OFF) { + if (unlikely(!skb_mac_header_was_set(skb))) + return NULL; ptr = skb_mac_header(skb) + k - SKF_LL_OFF; - + } if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) return ptr; -- cgit v1.2.3 From f8d3da4ef8faf027261e06b7864583930dd7c7b9 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 6 Jul 2022 16:25:47 -0700 Subject: bpf: Add flags arg to bpf_dynptr_read and bpf_dynptr_write APIs Commit 13bbbfbea759 ("bpf: Add bpf_dynptr_read and bpf_dynptr_write") added the bpf_dynptr_write() and bpf_dynptr_read() APIs. However, it will be needed for some dynptr types to pass in flags as well (e.g. when writing to a skb, the user may like to invalidate the hash or recompute the checksum). This patch adds a "u64 flags" arg to the bpf_dynptr_read() and bpf_dynptr_write() APIs before their UAPI signature freezes where we then cannot change them anymore with a 5.19.x released kernel. Fixes: 13bbbfbea759 ("bpf: Add bpf_dynptr_read and bpf_dynptr_write") Signed-off-by: Joanne Koong Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20220706232547.4016651-1-joannelkoong@gmail.com --- include/uapi/linux/bpf.h | 11 +++++++---- kernel/bpf/helpers.c | 12 ++++++++---- tools/include/uapi/linux/bpf.h | 11 +++++++---- tools/testing/selftests/bpf/progs/dynptr_fail.c | 10 +++++----- tools/testing/selftests/bpf/progs/dynptr_success.c | 4 ++-- 5 files changed, 29 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f4009dbdf62d..ef78e0e1a754 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5222,22 +5222,25 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset) + * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. + * *flags* is currently unused. * Return * 0 on success, -E2BIG if *offset* + *len* exceeds the length - * of *src*'s data, -EINVAL if *src* is an invalid dynptr. + * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if + * *flags* is not 0. * - * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len) + * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. + * *flags* is currently unused. * Return * 0 on success, -E2BIG if *offset* + *len* exceeds the length * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* - * is a read-only dynptr. + * is a read-only dynptr or if *flags* is not 0. * * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len) * Description diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 225806a02efb..bb1254f07667 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1497,11 +1497,12 @@ const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, }; -BPF_CALL_4(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, u32, offset) +BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, + u32, offset, u64, flags) { int err; - if (!src->data) + if (!src->data || flags) return -EINVAL; err = bpf_dynptr_check_off_len(src, offset, len); @@ -1521,13 +1522,15 @@ const struct bpf_func_proto bpf_dynptr_read_proto = { .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_DYNPTR, .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len) +BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, + u32, len, u64, flags) { int err; - if (!dst->data || bpf_dynptr_is_rdonly(dst)) + if (!dst->data || flags || bpf_dynptr_is_rdonly(dst)) return -EINVAL; err = bpf_dynptr_check_off_len(dst, offset, len); @@ -1547,6 +1550,7 @@ const struct bpf_func_proto bpf_dynptr_write_proto = { .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, + .arg5_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f4009dbdf62d..ef78e0e1a754 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5222,22 +5222,25 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset) + * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. + * *flags* is currently unused. * Return * 0 on success, -E2BIG if *offset* + *len* exceeds the length - * of *src*'s data, -EINVAL if *src* is an invalid dynptr. + * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if + * *flags* is not 0. * - * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len) + * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. + * *flags* is currently unused. * Return * 0 on success, -E2BIG if *offset* + *len* exceeds the length * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* - * is a read-only dynptr. + * is a read-only dynptr or if *flags* is not 0. * * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len) * Description diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index d811cff73597..0a26c243e6e9 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -140,12 +140,12 @@ int use_after_invalid(void *ctx) bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(read_data), 0, &ptr); - bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0); + bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0); bpf_ringbuf_submit_dynptr(&ptr, 0); /* this should fail */ - bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0); + bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0); return 0; } @@ -338,7 +338,7 @@ int invalid_helper2(void *ctx) get_map_val_dynptr(&ptr); /* this should fail */ - bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 8, 0); + bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 8, 0, 0); return 0; } @@ -377,7 +377,7 @@ int invalid_write2(void *ctx) memcpy((void *)&ptr + 8, &x, sizeof(x)); /* this should fail */ - bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0); + bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0); bpf_ringbuf_submit_dynptr(&ptr, 0); @@ -473,7 +473,7 @@ int invalid_read2(void *ctx) get_map_val_dynptr(&ptr); /* this should fail */ - bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 1, 0); + bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 1, 0, 0); return 0; } diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c index d67be48df4b2..a3a6103c8569 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_success.c +++ b/tools/testing/selftests/bpf/progs/dynptr_success.c @@ -43,10 +43,10 @@ int test_read_write(void *ctx) bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(write_data), 0, &ptr); /* Write data into the dynptr */ - err = err ?: bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data)); + err = bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data), 0); /* Read the data that was written into the dynptr */ - err = err ?: bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0); + err = err ?: bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0); /* Ensure the data we read matches the data we wrote */ for (i = 0; i < sizeof(read_data); i++) { -- cgit v1.2.3 From 1f1be04b4d48a2475ea1aab46a99221bfc5c0968 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 6 Jul 2022 16:39:52 -0700 Subject: sysctl: Fix data races in proc_dointvec(). A sysctl variable is accessed concurrently, and there is always a chance of data-race. So, all readers and writers need some basic protection to avoid load/store-tearing. This patch changes proc_dointvec() to use READ_ONCE() and WRITE_ONCE() internally to fix data-races on the sysctl side. For now, proc_dointvec() itself is tolerant to a data-race, but we still need to add annotations on the other subsystem's side. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- kernel/sysctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e52b6e372c60..c8a05655ae60 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -446,14 +446,14 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, if (*negp) { if (*lvalp > (unsigned long) INT_MAX + 1) return -EINVAL; - *valp = -*lvalp; + WRITE_ONCE(*valp, -*lvalp); } else { if (*lvalp > (unsigned long) INT_MAX) return -EINVAL; - *valp = *lvalp; + WRITE_ONCE(*valp, *lvalp); } } else { - int val = *valp; + int val = READ_ONCE(*valp); if (val < 0) { *negp = true; *lvalp = -(unsigned long)val; -- cgit v1.2.3 From 4762b532ec9539755aab61445d5da6e1926ccb99 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 6 Jul 2022 16:39:53 -0700 Subject: sysctl: Fix data races in proc_douintvec(). A sysctl variable is accessed concurrently, and there is always a chance of data-race. So, all readers and writers need some basic protection to avoid load/store-tearing. This patch changes proc_douintvec() to use READ_ONCE() and WRITE_ONCE() internally to fix data-races on the sysctl side. For now, proc_douintvec() itself is tolerant to a data-race, but we still need to add annotations on the other subsystem's side. Fixes: e7d316a02f68 ("sysctl: handle error writing UINT_MAX to u32 fields") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c8a05655ae60..2ab8c2a37e8f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -472,9 +472,9 @@ static int do_proc_douintvec_conv(unsigned long *lvalp, if (write) { if (*lvalp > UINT_MAX) return -EINVAL; - *valp = *lvalp; + WRITE_ONCE(*valp, *lvalp); } else { - unsigned int val = *valp; + unsigned int val = READ_ONCE(*valp); *lvalp = (unsigned long)val; } return 0; -- cgit v1.2.3 From f613d86d014b6375a4085901de39406598121e35 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 6 Jul 2022 16:39:54 -0700 Subject: sysctl: Fix data races in proc_dointvec_minmax(). A sysctl variable is accessed concurrently, and there is always a chance of data-race. So, all readers and writers need some basic protection to avoid load/store-tearing. This patch changes proc_dointvec_minmax() to use READ_ONCE() and WRITE_ONCE() internally to fix data-races on the sysctl side. For now, proc_dointvec_minmax() itself is tolerant to a data-race, but we still need to add annotations on the other subsystem's side. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2ab8c2a37e8f..4d87832367f2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -857,7 +857,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, if ((param->min && *param->min > tmp) || (param->max && *param->max < tmp)) return -EINVAL; - *valp = tmp; + WRITE_ONCE(*valp, tmp); } return 0; -- cgit v1.2.3 From 2d3b559df3ed39258737789aae2ae7973d205bc1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 6 Jul 2022 16:39:55 -0700 Subject: sysctl: Fix data races in proc_douintvec_minmax(). A sysctl variable is accessed concurrently, and there is always a chance of data-race. So, all readers and writers need some basic protection to avoid load/store-tearing. This patch changes proc_douintvec_minmax() to use READ_ONCE() and WRITE_ONCE() internally to fix data-races on the sysctl side. For now, proc_douintvec_minmax() itself is tolerant to a data-race, but we still need to add annotations on the other subsystem's side. Fixes: 61d9b56a8920 ("sysctl: add unsigned int range support") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4d87832367f2..379721a03d41 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -923,7 +923,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, (param->max && *param->max < tmp)) return -ERANGE; - *valp = tmp; + WRITE_ONCE(*valp, tmp); } return 0; -- cgit v1.2.3 From c31bcc8fb89fc2812663900589c6325ba35d9a65 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 6 Jul 2022 16:39:56 -0700 Subject: sysctl: Fix data races in proc_doulongvec_minmax(). A sysctl variable is accessed concurrently, and there is always a chance of data-race. So, all readers and writers need some basic protection to avoid load/store-tearing. This patch changes proc_doulongvec_minmax() to use READ_ONCE() and WRITE_ONCE() internally to fix data-races on the sysctl side. For now, proc_doulongvec_minmax() itself is tolerant to a data-race, but we still need to add annotations on the other subsystem's side. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 379721a03d41..8c55ba01f41b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1090,9 +1090,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, err = -EINVAL; break; } - *i = val; + WRITE_ONCE(*i, val); } else { - val = convdiv * (*i) / convmul; + val = convdiv * READ_ONCE(*i) / convmul; if (!first) proc_put_char(&buffer, &left, '\t'); proc_put_long(&buffer, &left, val, false); -- cgit v1.2.3 From e877820877663fbae8cb9582ea597a7230b94df3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 6 Jul 2022 16:39:57 -0700 Subject: sysctl: Fix data races in proc_dointvec_jiffies(). A sysctl variable is accessed concurrently, and there is always a chance of data-race. So, all readers and writers need some basic protection to avoid load/store-tearing. This patch changes proc_dointvec_jiffies() to use READ_ONCE() and WRITE_ONCE() internally to fix data-races on the sysctl side. For now, proc_dointvec_jiffies() itself is tolerant to a data-race, but we still need to add annotations on the other subsystem's side. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- kernel/sysctl.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8c55ba01f41b..bf9383d17e1b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1173,9 +1173,12 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, if (write) { if (*lvalp > INT_MAX / HZ) return 1; - *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); + if (*negp) + WRITE_ONCE(*valp, -*lvalp * HZ); + else + WRITE_ONCE(*valp, *lvalp * HZ); } else { - int val = *valp; + int val = READ_ONCE(*valp); unsigned long lval; if (val < 0) { *negp = true; -- cgit v1.2.3 From de2a34771f5123270bc3842535ac91673116dd03 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Wed, 6 Jul 2022 12:16:25 +0200 Subject: ptrace: fix clearing of JOBCTL_TRACED in ptrace_unfreeze_traced() CI reported the following splat while running the strace testsuite: WARNING: CPU: 1 PID: 3570031 at kernel/ptrace.c:272 ptrace_check_attach+0x12e/0x178 CPU: 1 PID: 3570031 Comm: strace Tainted: G OE 5.19.0-20220624.rc3.git0.ee819a77d4e7.300.fc36.s390x #1 Hardware name: IBM 3906 M04 704 (z/VM 7.1.0) Call Trace: [<00000000ab4b645a>] ptrace_check_attach+0x132/0x178 ([<00000000ab4b6450>] ptrace_check_attach+0x128/0x178) [<00000000ab4b6cde>] __s390x_sys_ptrace+0x86/0x160 [<00000000ac03fcec>] __do_syscall+0x1d4/0x200 [<00000000ac04e312>] system_call+0x82/0xb0 Last Breaking-Event-Address: [<00000000ab4ea3c8>] wait_task_inactive+0x98/0x190 This is because JOBCTL_TRACED is set, but the task is not in TASK_TRACED state. Caused by ptrace_unfreeze_traced() which does: task->jobctl &= ~TASK_TRACED but it should be: task->jobctl &= ~JOBCTL_TRACED Fixes: 31cae1eaae4f ("sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state") Signed-off-by: Sven Schnelle Tested-by: Alexander Gordeev Acked-by: Oleg Nesterov Acked-by: Peter Zijlstra Cc: Eric Biederman Cc: Steven Rostedt Cc: Kees Cook Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 156a99283b11..1893d909e45c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -222,7 +222,7 @@ static void ptrace_unfreeze_traced(struct task_struct *task) if (lock_task_sighand(task, &flags)) { task->jobctl &= ~JOBCTL_PTRACE_FROZEN; if (__fatal_signal_pending(task)) { - task->jobctl &= ~TASK_TRACED; + task->jobctl &= ~JOBCTL_TRACED; wake_up_state(task, __TASK_TRACED); } unlock_task_sighand(task, &flags); -- cgit v1.2.3 From d5b36a4dbd06c5e8e36ca8ccc552f679069e2946 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 11 Jul 2022 18:16:25 +0200 Subject: fix race between exit_itimers() and /proc/pid/timers As Chris explains, the comment above exit_itimers() is not correct, we can race with proc_timers_seq_ops. Change exit_itimers() to clear signal->posix_timers with ->siglock held. Cc: Reported-by: chris@accessvector.net Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- include/linux/sched/task.h | 2 +- kernel/exit.c | 2 +- kernel/time/posix-timers.c | 19 ++++++++++++++----- 4 files changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 0989fb8472a1..778123259e42 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1301,7 +1301,7 @@ int begin_new_exec(struct linux_binprm * bprm) bprm->mm = NULL; #ifdef CONFIG_POSIX_TIMERS - exit_itimers(me->signal); + exit_itimers(me); flush_itimer_signals(); #endif diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 505aaf9fe477..81cab4b01edc 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -85,7 +85,7 @@ static inline void exit_thread(struct task_struct *tsk) extern __noreturn void do_group_exit(int); extern void exit_files(struct task_struct *); -extern void exit_itimers(struct signal_struct *); +extern void exit_itimers(struct task_struct *); extern pid_t kernel_clone(struct kernel_clone_args *kargs); struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node); diff --git a/kernel/exit.c b/kernel/exit.c index f072959fcab7..64c938ce36fe 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -766,7 +766,7 @@ void __noreturn do_exit(long code) #ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); - exit_itimers(tsk->signal); + exit_itimers(tsk); #endif if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 1cd10b102c51..5dead89308b7 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1051,15 +1051,24 @@ retry_delete: } /* - * This is called by do_exit or de_thread, only when there are no more - * references to the shared signal_struct. + * This is called by do_exit or de_thread, only when nobody else can + * modify the signal->posix_timers list. Yet we need sighand->siglock + * to prevent the race with /proc/pid/timers. */ -void exit_itimers(struct signal_struct *sig) +void exit_itimers(struct task_struct *tsk) { + struct list_head timers; struct k_itimer *tmr; - while (!list_empty(&sig->posix_timers)) { - tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); + if (list_empty(&tsk->signal->posix_timers)) + return; + + spin_lock_irq(&tsk->sighand->siglock); + list_replace_init(&tsk->signal->posix_timers, &timers); + spin_unlock_irq(&tsk->sighand->siglock); + + while (!list_empty(&timers)) { + tmr = list_first_entry(&timers, struct k_itimer, list); itimer_delete(tmr); } } -- cgit v1.2.3 From e69a66147d49506062cd837f3b230ee3e98102ab Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Mon, 11 Jul 2022 18:17:19 +0100 Subject: module: kallsyms: Ensure preemption in add_kallsyms() with PREEMPT_RT The commit 08126db5ff73 ("module: kallsyms: Fix suspicious rcu usage") under PREEMPT_RT=y, disabling preemption introduced an unbounded latency since the loop is not fixed. This change caused a regression since previously preemption was not disabled and we would dereference RCU-protected pointers explicitly. That being said, these pointers cannot change. Before kallsyms-specific data is prepared/or set-up, we ensure that the unformed module is known to be unique i.e. does not already exist (see load_module()). Therefore, we can fix this by using the common and more appropriate RCU flavour as this section of code can be safely preempted. Reported-by: Steven Rostedt Fixes: 08126db5ff73 ("module: kallsyms: Fix suspicious rcu usage") Signed-off-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- kernel/module/kallsyms.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index 18c23545b984..77e75bead569 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -176,14 +176,14 @@ void add_kallsyms(struct module *mod, const struct load_info *info) mod->kallsyms = (void __rcu *)mod->init_layout.base + info->mod_kallsyms_init_off; - preempt_disable(); + rcu_read_lock(); /* The following is safe since this pointer cannot change */ - rcu_dereference_sched(mod->kallsyms)->symtab = (void *)symsec->sh_addr; - rcu_dereference_sched(mod->kallsyms)->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + rcu_dereference(mod->kallsyms)->symtab = (void *)symsec->sh_addr; + rcu_dereference(mod->kallsyms)->num_symtab = symsec->sh_size / sizeof(Elf_Sym); /* Make sure we get permanent strtab: don't use info->strtab. */ - rcu_dereference_sched(mod->kallsyms)->strtab = + rcu_dereference(mod->kallsyms)->strtab = (void *)info->sechdrs[info->index.str].sh_addr; - rcu_dereference_sched(mod->kallsyms)->typetab = mod->init_layout.base + info->init_typeoffs; + rcu_dereference(mod->kallsyms)->typetab = mod->init_layout.base + info->init_typeoffs; /* * Now populate the cut down core kallsyms for after init @@ -193,20 +193,20 @@ void add_kallsyms(struct module *mod, const struct load_info *info) mod->core_kallsyms.strtab = s = mod->data_layout.base + info->stroffs; mod->core_kallsyms.typetab = mod->data_layout.base + info->core_typeoffs; strtab_size = info->core_typeoffs - info->stroffs; - src = rcu_dereference_sched(mod->kallsyms)->symtab; - for (ndst = i = 0; i < rcu_dereference_sched(mod->kallsyms)->num_symtab; i++) { - rcu_dereference_sched(mod->kallsyms)->typetab[i] = elf_type(src + i, info); + src = rcu_dereference(mod->kallsyms)->symtab; + for (ndst = i = 0; i < rcu_dereference(mod->kallsyms)->num_symtab; i++) { + rcu_dereference(mod->kallsyms)->typetab[i] = elf_type(src + i, info); if (i == 0 || is_livepatch_module(mod) || is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum, info->index.pcpu)) { ssize_t ret; mod->core_kallsyms.typetab[ndst] = - rcu_dereference_sched(mod->kallsyms)->typetab[i]; + rcu_dereference(mod->kallsyms)->typetab[i]; dst[ndst] = src[i]; dst[ndst++].st_name = s - mod->core_kallsyms.strtab; ret = strscpy(s, - &rcu_dereference_sched(mod->kallsyms)->strtab[src[i].st_name], + &rcu_dereference(mod->kallsyms)->strtab[src[i].st_name], strtab_size); if (ret < 0) break; @@ -214,7 +214,7 @@ void add_kallsyms(struct module *mod, const struct load_info *info) strtab_size -= ret + 1; } } - preempt_enable(); + rcu_read_unlock(); mod->core_kallsyms.num_symtab = ndst; } -- cgit v1.2.3 From 7edc3945bdce9c39198a10d6129377a5c53559c2 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Mon, 11 Jul 2022 09:47:31 +0800 Subject: tracing/histograms: Fix memory leak problem This reverts commit 46bbe5c671e06f070428b9be142cc4ee5cedebac. As commit 46bbe5c671e0 ("tracing: fix double free") said, the "double free" problem reported by clang static analyzer is: > In parse_var_defs() if there is a problem allocating > var_defs.expr, the earlier var_defs.name is freed. > This free is duplicated by free_var_defs() which frees > the rest of the list. However, if there is a problem allocating N-th var_defs.expr: + in parse_var_defs(), the freed 'earlier var_defs.name' is actually the N-th var_defs.name; + then in free_var_defs(), the names from 0th to (N-1)-th are freed; IF ALLOCATING PROBLEM HAPPENED HERE!!! -+ \ | 0th 1th (N-1)-th N-th V +-------------+-------------+-----+-------------+----------- var_defs: | name | expr | name | expr | ... | name | expr | name | /// +-------------+-------------+-----+-------------+----------- These two frees don't act on same name, so there was no "double free" problem before. Conversely, after that commit, we get a "memory leak" problem because the above "N-th var_defs.name" is not freed. If enable CONFIG_DEBUG_KMEMLEAK and inject a fault at where the N-th var_defs.expr allocated, then execute on shell like: $ echo 'hist:key=call_site:val=$v1,$v2:v1=bytes_req,v2=bytes_alloc' > \ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger Then kmemleak reports: unreferenced object 0xffff8fb100ef3518 (size 8): comm "bash", pid 196, jiffies 4295681690 (age 28.538s) hex dump (first 8 bytes): 76 31 00 00 b1 8f ff ff v1...... backtrace: [<0000000038fe4895>] kstrdup+0x2d/0x60 [<00000000c99c049a>] event_hist_trigger_parse+0x206f/0x20e0 [<00000000ae70d2cc>] trigger_process_regex+0xc0/0x110 [<0000000066737a4c>] event_trigger_write+0x75/0xd0 [<000000007341e40c>] vfs_write+0xbb/0x2a0 [<0000000087fde4c2>] ksys_write+0x59/0xd0 [<00000000581e9cdf>] do_syscall_64+0x3a/0x80 [<00000000cf3b065c>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 Link: https://lkml.kernel.org/r/20220711014731.69520-1-zhengyejian1@huawei.com Cc: stable@vger.kernel.org Fixes: 46bbe5c671e0 ("tracing: fix double free") Reported-by: Hulk Robot Suggested-by: Steven Rostedt Reviewed-by: Tom Zanussi Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 48e82e141d54..e87a46794079 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -4430,6 +4430,8 @@ static int parse_var_defs(struct hist_trigger_data *hist_data) s = kstrdup(field_str, GFP_KERNEL); if (!s) { + kfree(hist_data->attrs->var_defs.name[n_vars]); + hist_data->attrs->var_defs.name[n_vars] = NULL; ret = -ENOMEM; goto free; } -- cgit v1.2.3 From 495fcec8648cdfb483b5b9ab310f3839f07cb3b8 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 8 Jul 2022 17:09:52 -0700 Subject: tracing: Fix sleeping while atomic in kdb ftdump If you drop into kdb and type "ftdump" you'll get a sleeping while atomic warning from memory allocation in trace_find_next_entry(). This appears to have been caused by commit ff895103a84a ("tracing: Save off entry when peeking at next entry"), which added the allocation in that path. The problematic commit was already fixed by commit 8e99cf91b99b ("tracing: Do not allocate buffer in trace_find_next_entry() in atomic") but that fix missed the kdb case. The fix here is easy: just move the assignment of the static buffer to the place where it should have been to begin with: trace_init_global_iter(). That function is called in two places, once is right before the assignment of the static buffer added by the previous fix and once is in kdb. Note that it appears that there's a second static buffer that we need to assign that was added in commit efbbdaa22bb7 ("tracing: Show real address for trace event arguments"), so we'll move that too. Link: https://lkml.kernel.org/r/20220708170919.1.I75844e5038d9425add2ad853a608cb44bb39df40@changeid Fixes: ff895103a84a ("tracing: Save off entry when peeking at next entry") Fixes: efbbdaa22bb7 ("tracing: Show real address for trace event arguments") Signed-off-by: Douglas Anderson Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a8cfac0611bc..b8dd54627075 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9864,6 +9864,12 @@ void trace_init_global_iter(struct trace_iterator *iter) /* Output in nanoseconds only if we are using a clock in nanoseconds. */ if (trace_clocks[iter->tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; + + /* Can not use kmalloc for iter.temp and iter.fmt */ + iter->temp = static_temp_buf; + iter->temp_size = STATIC_TEMP_BUF_SIZE; + iter->fmt = static_fmt_buf; + iter->fmt_size = STATIC_FMT_BUF_SIZE; } void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) @@ -9896,11 +9902,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) /* Simulate the iterator */ trace_init_global_iter(&iter); - /* Can not use kmalloc for iter.temp and iter.fmt */ - iter.temp = static_temp_buf; - iter.temp_size = STATIC_TEMP_BUF_SIZE; - iter.fmt = static_fmt_buf; - iter.fmt_size = STATIC_FMT_BUF_SIZE; for_each_tracing_cpu(cpu) { atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); -- cgit v1.2.3 From 0a6d7d45414a77876e8e9a77e454af754cea3a60 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 6 Jul 2022 16:12:31 -0400 Subject: ftrace: Be more specific about arch impact when function tracer is enabled It was brought up that on ARMv7, that because the FUNCTION_TRACER does not use nops to keep function tracing disabled because of the use of a link register, it does have some performance impact. The start of functions when -pg is used to compile the kernel is: push {lr} bl 8010e7c0 <__gnu_mcount_nc> When function tracing is tuned off, it becomes: push {lr} add sp, sp, #4 Which just puts the stack back to its normal location. But these two instructions at the start of every function does incur some overhead. Be more honest in the Kconfig FUNCTION_TRACER description and specify that the overhead being in the noise was x86 specific, but other architectures may vary. Link: https://lore.kernel.org/all/20220705105416.GE5208@pengutronix.de/ Link: https://lkml.kernel.org/r/20220706161231.085a83da@gandalf.local.home Reported-by: Sascha Hauer Acked-by: Sascha Hauer Signed-off-by: Steven Rostedt (Google) --- kernel/trace/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index debbbb083286..ccd6a5ade3e9 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -194,7 +194,8 @@ config FUNCTION_TRACER sequence is then dynamically patched into a tracer call when tracing is enabled by the administrator. If it's runtime disabled (the bootup default), then the overhead of the instructions is very - small and not measurable even in micro-benchmarks. + small and not measurable even in micro-benchmarks (at least on + x86, but may have impact on other architectures). config FUNCTION_GRAPH_TRACER bool "Kernel Function Graph Tracer" -- cgit v1.2.3 From 7dee5d7747a69aa2be41f04c6a7ecfe3ac8cdf18 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 11 Jul 2022 17:15:19 -0700 Subject: sysctl: Fix data-races in proc_dou8vec_minmax(). A sysctl variable is accessed concurrently, and there is always a chance of data-race. So, all readers and writers need some basic protection to avoid load/store-tearing. This patch changes proc_dou8vec_minmax() to use READ_ONCE() and WRITE_ONCE() internally to fix data-races on the sysctl side. For now, proc_dou8vec_minmax() itself is tolerant to a data-race, but we still need to add annotations on the other subsystem's side. Fixes: cb9444130662 ("sysctl: add proc_dou8vec_minmax()") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bf9383d17e1b..b016d68da08a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1007,13 +1007,13 @@ int proc_dou8vec_minmax(struct ctl_table *table, int write, tmp.maxlen = sizeof(val); tmp.data = &val; - val = *data; + val = READ_ONCE(*data); res = do_proc_douintvec(&tmp, write, buffer, lenp, ppos, do_proc_douintvec_minmax_conv, ¶m); if (res) return res; if (write) - *data = val; + WRITE_ONCE(*data, val); return 0; } EXPORT_SYMBOL_GPL(proc_dou8vec_minmax); -- cgit v1.2.3 From 7d1025e559782b58824b36cb8ad547a69f2e4b31 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 11 Jul 2022 17:15:20 -0700 Subject: sysctl: Fix data-races in proc_dointvec_ms_jiffies(). A sysctl variable is accessed concurrently, and there is always a chance of data-race. So, all readers and writers need some basic protection to avoid load/store-tearing. This patch changes proc_dointvec_ms_jiffies() to use READ_ONCE() and WRITE_ONCE() internally to fix data-races on the sysctl side. For now, proc_dointvec_ms_jiffies() itself is tolerant to a data-race, but we still need to add annotations on the other subsystem's side. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- kernel/sysctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b016d68da08a..d99bc3945445 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1224,9 +1224,9 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, if (jif > INT_MAX) return 1; - *valp = (int)jif; + WRITE_ONCE(*valp, (int)jif); } else { - int val = *valp; + int val = READ_ONCE(*valp); unsigned long lval; if (val < 0) { *negp = true; @@ -1294,8 +1294,8 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, * @ppos: the current position in the file * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in 1/1000 seconds, and + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/1000 seconds, and * are converted into jiffies. * * Returns 0 on success. -- cgit v1.2.3 From af16df54b89dee72df253abc5e7b5e8a6d16c11c Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Wed, 13 Jul 2022 15:21:11 +0800 Subject: ima: force signature verification when CONFIG_KEXEC_SIG is configured Currently, an unsigned kernel could be kexec'ed when IMA arch specific policy is configured unless lockdown is enabled. Enforce kernel signature verification check in the kexec_file_load syscall when IMA arch specific policy is configured. Fixes: 99d5cadfde2b ("kexec_file: split KEXEC_VERIFY_SIG into KEXEC_SIG and KEXEC_SIG_FORCE") Reported-and-suggested-by: Mimi Zohar Signed-off-by: Coiby Xu Signed-off-by: Mimi Zohar --- include/linux/kexec.h | 6 ++++++ kernel/kexec_file.c | 11 ++++++++++- security/integrity/ima/ima_efi.c | 2 ++ 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index ce6536f1d269..475683cd67f1 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -452,6 +452,12 @@ static inline int kexec_crash_loaded(void) { return 0; } #define kexec_in_progress false #endif /* CONFIG_KEXEC_CORE */ +#ifdef CONFIG_KEXEC_SIG +void set_kexec_sig_enforced(void); +#else +static inline void set_kexec_sig_enforced(void) {} +#endif + #endif /* !defined(__ASSEBMLY__) */ #endif /* LINUX_KEXEC_H */ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 145321a5e798..f9261c07b048 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -29,6 +29,15 @@ #include #include "kexec_internal.h" +#ifdef CONFIG_KEXEC_SIG +static bool sig_enforce = IS_ENABLED(CONFIG_KEXEC_SIG_FORCE); + +void set_kexec_sig_enforced(void) +{ + sig_enforce = true; +} +#endif + static int kexec_calculate_store_digests(struct kimage *image); /* @@ -159,7 +168,7 @@ kimage_validate_signature(struct kimage *image) image->kernel_buf_len); if (ret) { - if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) { + if (sig_enforce) { pr_notice("Enforced kernel signature verification failed (%d).\n", ret); return ret; } diff --git a/security/integrity/ima/ima_efi.c b/security/integrity/ima/ima_efi.c index 71786d01946f..9db66fe310d4 100644 --- a/security/integrity/ima/ima_efi.c +++ b/security/integrity/ima/ima_efi.c @@ -67,6 +67,8 @@ const char * const *arch_get_ima_policy(void) if (IS_ENABLED(CONFIG_IMA_ARCH_POLICY) && arch_ima_get_secureboot()) { if (IS_ENABLED(CONFIG_MODULE_SIG)) set_module_sig_enforced(); + if (IS_ENABLED(CONFIG_KEXEC_SIG)) + set_kexec_sig_enforced(); return sb_arch_rules; } return NULL; -- cgit v1.2.3