From df0734702a7cbba49d6765bd5ba069340bf9c5db Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 2 Nov 2018 10:16:15 -0700 Subject: bpf: show real jited prog address in /proc/kallsyms Currently, /proc/kallsyms shows page address of jited bpf program. The main reason here is to not expose randomized start address. However, This is not ideal for detailed profiling (find hot instructions from stack traces). This patch replaces the page address with real prog start address. This change is OK because these addresses are still protected by sysctl kptr_restrict (see kallsyms_show_value()), and only programs loaded by root are added to kallsyms (see bpf_prog_kallsyms_add()). Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6377225b2082..1a796e0799ec 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -553,7 +553,6 @@ bool is_bpf_text_address(unsigned long addr) int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { - unsigned long symbol_start, symbol_end; struct bpf_prog_aux *aux; unsigned int it = 0; int ret = -ERANGE; @@ -566,10 +565,9 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, if (it++ != symnum) continue; - bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); bpf_get_prog_name(aux->prog, sym); - *value = symbol_start; + *value = (unsigned long)aux->prog->bpf_func; *type = BPF_SYM_ELF_TYPE; ret = 0; -- cgit v1.2.3 From de57e99ceb65d0d7775cc14a8ba5931d7de1d708 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 2 Nov 2018 10:16:16 -0700 Subject: bpf: show real jited address in bpf_prog_info->jited_ksyms Currently, jited_ksyms in bpf_prog_info shows page addresses of jited bpf program. The main reason here is to not expose randomized start address. However, this is not ideal for detailed profiling (find hot instructions from stack traces). This patch replaces the page address with real prog start address. This change is OK because bpf_prog_get_info_by_fd() is only available to root. Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ccb93277aae2..34a9eef5992c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2172,7 +2172,6 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, user_ksyms = u64_to_user_ptr(info.jited_ksyms); for (i = 0; i < ulen; i++) { ksym_addr = (ulong) prog->aux->func[i]->bpf_func; - ksym_addr &= PAGE_MASK; if (put_user((u64) ksym_addr, &user_ksyms[i])) return -EFAULT; } -- cgit v1.2.3 From ff1889fc531f582f902175c0acc80321af540b24 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 2 Nov 2018 10:16:17 -0700 Subject: bpf: show main program address and length in bpf_prog_info Currently, when there is no subprog (prog->aux->func_cnt == 0), bpf_prog_info does not return any jited_ksyms or jited_func_lens. This patch adds main program address (prog->bpf_func) and main program length (prog->jited_len) to bpf_prog_info. 
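The following is an illustrative user-space sketch, not part of the patches above: one way to consume the nr_jited_ksyms/jited_ksyms fields these commits touch, using the usual two-pass BPF_OBJ_GET_INFO_BY_FD query. It assumes a valid program fd, CAP_SYS_ADMIN, and v4.18+ UAPI headers that already define these fields.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int prog_get_info(int fd, struct bpf_prog_info *info, __u32 len)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.info.bpf_fd = fd;
	attr.info.info_len = len;
	attr.info.info = (__u64)(unsigned long)info;
	return syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
}

int dump_jited_ksyms(int prog_fd)
{
	struct bpf_prog_info info = {};
	__u64 *ksyms;
	__u32 i, n;

	/* first pass: learn how many symbols the kernel reports; with the
	 * third patch above this is 1 even when there are no subprogs */
	if (prog_get_info(prog_fd, &info, sizeof(info)))
		return -1;
	n = info.nr_jited_ksyms;
	if (!n)		/* unprivileged callers see zero here */
		return 0;

	ksyms = calloc(n, sizeof(*ksyms));
	if (!ksyms)
		return -1;

	/* second pass: fetch the real (no longer page-masked) start addresses */
	memset(&info, 0, sizeof(info));
	info.nr_jited_ksyms = n;
	info.jited_ksyms = (__u64)(unsigned long)ksyms;
	if (prog_get_info(prog_fd, &info, sizeof(info))) {
		free(ksyms);
		return -1;
	}
	for (i = 0; i < n; i++)
		printf("func %u starts at 0x%llx\n", i,
		       (unsigned long long)ksyms[i]);
	free(ksyms);
	return 0;
}
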
Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 34a9eef5992c..9418174c276c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2158,11 +2158,11 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } ulen = info.nr_jited_ksyms; - info.nr_jited_ksyms = prog->aux->func_cnt; + info.nr_jited_ksyms = prog->aux->func_cnt ? : 1; if (info.nr_jited_ksyms && ulen) { if (bpf_dump_raw_ok()) { + unsigned long ksym_addr; u64 __user *user_ksyms; - ulong ksym_addr; u32 i; /* copy the address of the kernel symbol @@ -2170,9 +2170,17 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, */ ulen = min_t(u32, info.nr_jited_ksyms, ulen); user_ksyms = u64_to_user_ptr(info.jited_ksyms); - for (i = 0; i < ulen; i++) { - ksym_addr = (ulong) prog->aux->func[i]->bpf_func; - if (put_user((u64) ksym_addr, &user_ksyms[i])) + if (prog->aux->func_cnt) { + for (i = 0; i < ulen; i++) { + ksym_addr = (unsigned long) + prog->aux->func[i]->bpf_func; + if (put_user((u64) ksym_addr, + &user_ksyms[i])) + return -EFAULT; + } + } else { + ksym_addr = (unsigned long) prog->bpf_func; + if (put_user((u64) ksym_addr, &user_ksyms[0])) return -EFAULT; } } else { @@ -2181,7 +2189,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } ulen = info.nr_jited_func_lens; - info.nr_jited_func_lens = prog->aux->func_cnt; + info.nr_jited_func_lens = prog->aux->func_cnt ? : 1; if (info.nr_jited_func_lens && ulen) { if (bpf_dump_raw_ok()) { u32 __user *user_lens; @@ -2190,9 +2198,16 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, /* copy the JITed image lengths for each function */ ulen = min_t(u32, info.nr_jited_func_lens, ulen); user_lens = u64_to_user_ptr(info.jited_func_lens); - for (i = 0; i < ulen; i++) { - func_len = prog->aux->func[i]->jited_len; - if (put_user(func_len, &user_lens[i])) + if (prog->aux->func_cnt) { + for (i = 0; i < ulen; i++) { + func_len = + prog->aux->func[i]->jited_len; + if (put_user(func_len, &user_lens[i])) + return -EFAULT; + } + } else { + func_len = prog->jited_len; + if (put_user(func_len, &user_lens[0])) return -EFAULT; } } else { -- cgit v1.2.3 From 28c2fae726bf5003cd209b0d5910a642af98316f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 2 Nov 2018 11:35:46 +0100 Subject: bpf: fix bpf_prog_get_info_by_fd to return 0 func_lens for unpriv While dbecd7388476 ("bpf: get kernel symbol addresses via syscall") zeroed info.nr_jited_ksyms in bpf_prog_get_info_by_fd() for queries from unprivileged users, commit 815581c11cc2 ("bpf: get JITed image lengths of functions via syscall") forgot about doing so and therefore returns the #elems of the user set up buffer which is incorrect. It also needs to indicate a info.nr_jited_func_lens of zero. 
Fixes: 815581c11cc2 ("bpf: get JITed image lengths of functions via syscall") Signed-off-by: Daniel Borkmann Cc: Sandipan Das Cc: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9418174c276c..cf5040fd5434 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2078,6 +2078,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.jited_prog_len = 0; info.xlated_prog_len = 0; info.nr_jited_ksyms = 0; + info.nr_jited_func_lens = 0; goto done; } -- cgit v1.2.3 From 40fa3780bac2b654edf23f6b13f4e2dd550aea10 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 23 Oct 2018 14:37:31 +0100 Subject: sched/core: Take the hotplug lock in sched_init_smp() When running on linux-next (8c60c36d0b8c ("Add linux-next specific files for 20181019")) + CONFIG_PROVE_LOCKING=y on a big.LITTLE system (e.g. Juno or HiKey960), we get the following report: [ 0.748225] Call trace: [ 0.750685] lockdep_assert_cpus_held+0x30/0x40 [ 0.755236] static_key_enable_cpuslocked+0x20/0xc8 [ 0.760137] build_sched_domains+0x1034/0x1108 [ 0.764601] sched_init_domains+0x68/0x90 [ 0.768628] sched_init_smp+0x30/0x80 [ 0.772309] kernel_init_freeable+0x278/0x51c [ 0.776685] kernel_init+0x10/0x108 [ 0.780190] ret_from_fork+0x10/0x18 The static_key in question is 'sched_asym_cpucapacity' introduced by commit: df054e8445a4 ("sched/topology: Add static_key for asymmetric CPU capacity optimizations") In this particular case, we enable it because smp_prepare_cpus() will end up fetching the capacity-dmips-mhz entry from the devicetree, so we already have some asymmetry detected when entering sched_init_smp(). This didn't get detected in tip/sched/core because we were missing: commit cb538267ea1e ("jump_label/lockdep: Assert we hold the hotplug lock for _cpuslocked() operations") Calls to build_sched_domains() post sched_init_smp() will hold the hotplug lock, it just so happens that this very first call is a special case. As stated by a comment in sched_init_smp(), "There's no userspace yet to cause hotplug operations" so this is a harmless warning. However, to both respect the semantics of underlying callees and make lockdep happy, take the hotplug lock in sched_init_smp(). This also satisfies the comment atop sched_init_domains() that says "Callers must hold the hotplug lock". Reported-by: Sudeep Holla Tested-by: Sudeep Holla Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar.Eggemann@arm.com Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: morten.rasmussen@arm.com Cc: quentin.perret@arm.com Link: http://lkml.kernel.org/r/1540301851-3048-1-git-send-email-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fd2fce8a001b..02a20ef196a6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5859,11 +5859,14 @@ void __init sched_init_smp(void) /* * There's no userspace yet to cause hotplug operations; hence all the * CPU masks are stable and all blatant races in the below code cannot - * happen. + * happen. The hotplug lock is nevertheless taken to satisfy lockdep, + * but there won't be any contention on it. 
*/ + cpus_read_lock(); mutex_lock(&sched_domains_mutex); sched_init_domains(cpu_active_mask); mutex_unlock(&sched_domains_mutex); + cpus_read_unlock(); /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) -- cgit v1.2.3 From e1ff516a56ad56c476b47795d3811eef79d25fbe Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Mon, 5 Nov 2018 08:50:13 +0800 Subject: sched/fair: Fix a comment in task_numa_fault() Duplicated 'case it'. Signed-off-by: Yi Wang Reviewed-by: Xi Xu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: zhong.weidong@zte.com.cn Link: http://lkml.kernel.org/r/1541379013-11352-1-git-send-email-wang.yi59@zte.com.cn Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ee271bb661cc..3648d0300fdf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2400,8 +2400,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) local = 1; /* - * Retry task to preferred node migration periodically, in case it - * case it previously failed, or the scheduler moved us. + * Retry to migrate task to preferred node periodically, in case it + * previously failed, or the scheduler moved us. */ if (time_after(jiffies, p->numa_migrate_retry)) { task_numa_placement(p); -- cgit v1.2.3 From f75d651587f719a813ebbbfeee570e6570731d55 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 4 Nov 2018 18:40:14 -0800 Subject: resource/docs: Fix new kernel-doc warnings The first group of warnings is caused by a "/**" kernel-doc notation marker but the function comments are not in kernel-doc format. Also add another error return value here. ../kernel/resource.c:337: warning: Function parameter or member 'start' not described in 'find_next_iomem_res' ../kernel/resource.c:337: warning: Function parameter or member 'end' not described in 'find_next_iomem_res' ../kernel/resource.c:337: warning: Function parameter or member 'flags' not described in 'find_next_iomem_res' ../kernel/resource.c:337: warning: Function parameter or member 'desc' not described in 'find_next_iomem_res' ../kernel/resource.c:337: warning: Function parameter or member 'first_lvl' not described in 'find_next_iomem_res' ../kernel/resource.c:337: warning: Function parameter or member 'res' not described in 'find_next_iomem_res' Add the missing function parameter documentation for the other warnings: ../kernel/resource.c:409: warning: Function parameter or member 'arg' not described in 'walk_iomem_res_desc' ../kernel/resource.c:409: warning: Function parameter or member 'func' not described in 'walk_iomem_res_desc' Signed-off-by: Randy Dunlap Cc: Andrew Morton Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: b69c2e20f6e4 ("resource: Clean it up a bit") Link: http://lkml.kernel.org/r/dda2e4d8-bedd-3167-20fe-8c7d2d35b354@infradead.org Signed-off-by: Ingo Molnar --- kernel/resource.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index b3a3a1fc499e..17bcb189d530 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -318,14 +318,14 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); -/** +/* * Finds the lowest iomem resource that covers part of [start..end]. The * caller must specify start, end, flags, and desc (which may be * IORES_DESC_NONE). 
* * If a resource is found, returns 0 and *res is overwritten with the part * of the resource that's within [start..end]; if none is found, returns - * -1. + * -1. Returns -EINVAL for other invalid parameters. * * This function walks the whole tree and not just first level children * unless @first_lvl is true. @@ -390,7 +390,9 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, } /** - * Walks through iomem resources and calls func() with matching resource + * walk_iomem_res_desc - walk through iomem resources + * + * Walks through iomem resources and calls @func() with matching resource * ranges. This walks through whole tree and not just first level children. * All the memory ranges which overlap start,end and also match flags and * desc are valid candidates. @@ -399,6 +401,8 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, * @flags: I/O resource flags * @start: start addr * @end: end addr + * @arg: function argument for the callback @func + * @func: callback function that is called for each qualifying resource area * * NOTE: For a new descriptor search, define a new IORES_DESC in * and set it in 'desc' of a target resource entry. -- cgit v1.2.3 From ee474b81fe5aa5dc0faae920bf66240fbf55f891 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 1 Nov 2018 23:29:28 +0900 Subject: tracing/kprobes: Fix strpbrk() argument order Fix strpbrk()'s argument order, it must pass acceptable string in 2nd argument. Note that this can cause a kernel panic where it recovers backup character to code->data. Link: http://lkml.kernel.org/r/154108256792.2604.1816052586385217811.stgit@devbox Fixes: a6682814f371 ("tracing/kprobes: Allow kprobe-events to record module symbol") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 3ef15a6683c0..bd30e9398d2a 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -535,7 +535,7 @@ int traceprobe_update_arg(struct probe_arg *arg) if (code[1].op != FETCH_OP_IMM) return -EINVAL; - tmp = strpbrk("+-", code->data); + tmp = strpbrk(code->data, "+-"); if (tmp) c = *tmp; ret = traceprobe_split_symbol_offset(code->data, -- cgit v1.2.3 From f26621e60b35369bca9228bc936dc723b3e421af Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 5 Nov 2018 10:33:07 +0100 Subject: resource/docs: Complete kernel-doc style function documentation Add the missing kernel-doc style function parameters documentation. Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: linux-tip-commits@vger.kernel.org Cc: rdunlap@infradead.org Fixes: b69c2e20f6e4 ("resource: Clean it up a bit") Link: http://lkml.kernel.org/r/20181105093307.GA12445@zn.tnic Signed-off-by: Ingo Molnar --- kernel/resource.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 17bcb189d530..b0fbf685c77a 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -318,17 +318,24 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); -/* - * Finds the lowest iomem resource that covers part of [start..end]. The - * caller must specify start, end, flags, and desc (which may be +/** + * Finds the lowest iomem resource that covers part of [@start..@end]. 
The + * caller must specify @start, @end, @flags, and @desc (which may be * IORES_DESC_NONE). * - * If a resource is found, returns 0 and *res is overwritten with the part - * of the resource that's within [start..end]; if none is found, returns - * -1. Returns -EINVAL for other invalid parameters. + * If a resource is found, returns 0 and @*res is overwritten with the part + * of the resource that's within [@start..@end]; if none is found, returns + * -1 or -EINVAL for other invalid parameters. * * This function walks the whole tree and not just first level children * unless @first_lvl is true. + * + * @start: start address of the resource searched for + * @end: end address of same resource + * @flags: flags which the resource must have + * @desc: descriptor the resource must have + * @first_lvl: walk only the first level children, if set + * @res: return ptr, if resource found */ static int find_next_iomem_res(resource_size_t start, resource_size_t end, unsigned long flags, unsigned long desc, @@ -390,9 +397,7 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, } /** - * walk_iomem_res_desc - walk through iomem resources - * - * Walks through iomem resources and calls @func() with matching resource + * Walks through iomem resources and calls func() with matching resource * ranges. This walks through whole tree and not just first level children. * All the memory ranges which overlap start,end and also match flags and * desc are valid candidates. -- cgit v1.2.3 From d2f007dbe7e4c9583eea6eb04d60001e85c6f1bd Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 5 Nov 2018 20:55:09 +0100 Subject: userns: also map extents in the reverse map to kernel IDs The current logic first clones the extent array and sorts both copies, then maps the lower IDs of the forward mapping into the lower namespace, but doesn't map the lower IDs of the reverse mapping. This means that code in a nested user namespace with >5 extents will see incorrect IDs. It also breaks some access checks, like inode_owner_or_capable() and privileged_wrt_inode_uidgid(), so a process can incorrectly appear to be capable relative to an inode. To fix it, we have to make sure that the "lower_first" members of extents in both arrays are translated; and we have to make sure that the reverse map is sorted *after* the translation (since otherwise the translation can break the sorting). This is CVE-2018-18955. Fixes: 6397fac4915a ("userns: bump idmap limits to 340") Cc: stable@vger.kernel.org Signed-off-by: Jann Horn Tested-by: Eric W. Biederman Reviewed-by: Eric W. Biederman Signed-off-by: Eric W. Biederman --- kernel/user_namespace.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index e5222b5fb4fe..923414a246e9 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -974,10 +974,6 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (!new_idmap_permitted(file, ns, cap_setid, &new_map)) goto out; - ret = sort_idmaps(&new_map); - if (ret < 0) - goto out; - ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space. @@ -1004,6 +1000,14 @@ static ssize_t map_write(struct file *file, const char __user *buf, e->lower_first = lower_first; } + /* + * If we want to use binary search for lookup, this clones the extent + * array and sorts both copies. 
+ */ + ret = sort_idmaps(&new_map); + if (ret < 0) + goto out; + /* Install the map */ if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { memcpy(map->extent, new_map.extent, -- cgit v1.2.3 From e6a2d72c10405b30ddba5af2e44a9d3d925a56d3 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 7 Nov 2018 12:10:32 +0100 Subject: posix-cpu-timers: Remove useless call to check_dl_overrun() check_dl_overrun() is used to send a SIGXCPU to users that asked to be informed when a SCHED_DEADLINE runtime overruns occur. The function is called by check_thread_timers() already, so the call in check_process_timers() is redundant/wrong (even though harmless). Remove it. Fixes: 34be39305a77 ("sched/deadline: Implement "runtime overrun signal" support") Signed-off-by: Juri Lelli Signed-off-by: Thomas Gleixner Reviewed-by: Daniel Bristot de Oliveira Reviewed-by: Steven Rostedt (VMware) Cc: linux-rt-users@vger.kernel.org Cc: mtk.manpages@gmail.com Cc: Mathieu Poirier Cc: Peter Zijlstra Cc: Luca Abeni Cc: Claudio Scordino Link: https://lkml.kernel.org/r/20181107111032.32291-1-juri.lelli@redhat.com --- kernel/time/posix-cpu-timers.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index ce32cf741b25..8f0644af40be 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -917,9 +917,6 @@ static void check_process_timers(struct task_struct *tsk, struct task_cputime cputime; unsigned long soft; - if (dl_task(tsk)) - check_dl_overrun(tsk); - /* * If cputimer is not running, then there are no active * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU). -- cgit v1.2.3 From c469933e772132aad040bd6a2adc8edf9ad6f825 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Mon, 5 Nov 2018 14:53:58 +0000 Subject: sched/fair: Fix cpu_util_wake() for 'execl' type workloads A ~10% regression has been reported for UnixBench's execl throughput test by Aaron Lu and Ye Xiaolong: https://lkml.org/lkml/2018/10/30/765 That test is pretty simple, it does a "recursive" execve() syscall on the same binary. Starting from the syscall, this sequence is possible: do_execve() do_execveat_common() __do_execve_file() sched_exec() select_task_rq_fair() <==| Task already enqueued find_idlest_cpu() find_idlest_group() capacity_spare_wake() <==| Functions not called from cpu_util_wake() | the wakeup path which means we can end up calling cpu_util_wake() not only from the "wakeup path", as its name would suggest. Indeed, the task doing an execve() syscall is already enqueued on the CPU we want to get the cpu_util_wake() for. The estimated utilization for a CPU computed in cpu_util_wake() was written under the assumption that function can be called only from the wakeup path. If instead the task is already enqueued, we end up with a utilization which does not remove the current task's contribution from the estimated utilization of the CPU. This will wrongly assume a reduced spare capacity on the current CPU and increase the chances to migrate the task on execve. The regression is tracked down to: commit d519329f72a6 ("sched/fair: Update util_est only on util_avg updates") because in that patch we turn on by default the UTIL_EST sched feature. However, the real issue is introduced by: commit f9be3e5961c5 ("sched/fair: Use util_est in LB and WU paths") Let's fix this by ensuring to always discount the task estimated utilization from the CPU's estimated utilization when the task is also the current one. 
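For illustration only (plain C with made-up names, not the kernel implementation), the discounting rule the fix enforces can be sketched as:

/* Remove the task's contribution from util_avg and, when the task is the
 * current one (the execl case) or still queued, from util_est as well,
 * then return the max of the two, as cpu_util_without() does. */
static unsigned int cpu_util_without_sketch(unsigned int cpu_util_avg,
					    unsigned int cpu_util_est,
					    unsigned int task_util,
					    unsigned int task_util_est,
					    int task_is_current_or_queued)
{
	unsigned int util = cpu_util_avg;

	util -= task_util < util ? task_util : util;

	if (task_is_current_or_queued)
		cpu_util_est -= task_util_est < cpu_util_est ?
				task_util_est : cpu_util_est;

	return util > cpu_util_est ? util : cpu_util_est;
}
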
The same benchmark of the bug report, executed on a dual socket 40 CPUs Intel(R) Xeon(R) CPU E5-2690 v2 @ 3.00GHz machine, reports these "Execl Throughput" figures (higher the better): mainline : 48136.5 lps mainline+fix : 55376.5 lps which correspond to a 15% speedup. Moreover, since {cpu_util,capacity_spare}_wake() are not really only used from the wakeup path, let's remove this ambiguity by using a better matching name: {cpu_util,capacity_spare}_without(). Since we are at that, let's also improve the existing documentation. Reported-by: Aaron Lu Reported-by: Ye Xiaolong Tested-by: Aaron Lu Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Quentin Perret Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Fixes: f9be3e5961c5 (sched/fair: Use util_est in LB and WU paths) Link: https://lore.kernel.org/lkml/20181025093100.GB13236@e110439-lin/ Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 62 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3648d0300fdf..ac855b2f4774 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5674,11 +5674,11 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, return target; } -static unsigned long cpu_util_wake(int cpu, struct task_struct *p); +static unsigned long cpu_util_without(int cpu, struct task_struct *p); -static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) +static unsigned long capacity_spare_without(int cpu, struct task_struct *p) { - return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0); + return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0); } /* @@ -5738,7 +5738,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); - spare_cap = capacity_spare_wake(i, p); + spare_cap = capacity_spare_without(i, p); if (spare_cap > max_spare_cap) max_spare_cap = spare_cap; @@ -5889,8 +5889,8 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p return prev_cpu; /* - * We need task's util for capacity_spare_wake, sync it up to prev_cpu's - * last_update_time. + * We need task's util for capacity_spare_without, sync it up to + * prev_cpu's last_update_time. */ if (!(sd_flag & SD_BALANCE_FORK)) sync_entity_load_avg(&p->se); @@ -6216,10 +6216,19 @@ static inline unsigned long cpu_util(int cpu) } /* - * cpu_util_wake: Compute CPU utilization with any contributions from - * the waking task p removed. + * cpu_util_without: compute cpu utilization without any contributions from *p + * @cpu: the CPU which utilization is requested + * @p: the task which utilization should be discounted + * + * The utilization of a CPU is defined by the utilization of tasks currently + * enqueued on that CPU as well as tasks which are currently sleeping after an + * execution on that CPU. + * + * This method returns the utilization of the specified CPU by discounting the + * utilization of the specified task, whenever the task is currently + * contributing to the CPU utilization. 
*/ -static unsigned long cpu_util_wake(int cpu, struct task_struct *p) +static unsigned long cpu_util_without(int cpu, struct task_struct *p) { struct cfs_rq *cfs_rq; unsigned int util; @@ -6231,7 +6240,7 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p) cfs_rq = &cpu_rq(cpu)->cfs; util = READ_ONCE(cfs_rq->avg.util_avg); - /* Discount task's blocked util from CPU's util */ + /* Discount task's util from CPU's util */ util -= min_t(unsigned int, util, task_util(p)); /* @@ -6240,14 +6249,14 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p) * a) if *p is the only task sleeping on this CPU, then: * cpu_util (== task_util) > util_est (== 0) * and thus we return: - * cpu_util_wake = (cpu_util - task_util) = 0 + * cpu_util_without = (cpu_util - task_util) = 0 * * b) if other tasks are SLEEPING on this CPU, which is now exiting * IDLE, then: * cpu_util >= task_util * cpu_util > util_est (== 0) * and thus we discount *p's blocked utilization to return: - * cpu_util_wake = (cpu_util - task_util) >= 0 + * cpu_util_without = (cpu_util - task_util) >= 0 * * c) if other tasks are RUNNABLE on that CPU and * util_est > cpu_util @@ -6260,8 +6269,33 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p) * covered by the following code when estimated utilization is * enabled. */ - if (sched_feat(UTIL_EST)) - util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); + if (sched_feat(UTIL_EST)) { + unsigned int estimated = + READ_ONCE(cfs_rq->avg.util_est.enqueued); + + /* + * Despite the following checks we still have a small window + * for a possible race, when an execl's select_task_rq_fair() + * races with LB's detach_task(): + * + * detach_task() + * p->on_rq = TASK_ON_RQ_MIGRATING; + * ---------------------------------- A + * deactivate_task() \ + * dequeue_task() + RaceTime + * util_est_dequeue() / + * ---------------------------------- B + * + * The additional check on "current == p" it's required to + * properly fix the execl regression and it helps in further + * reducing the chances for the above race. + */ + if (unlikely(task_on_rq_queued(p) || current == p)) { + estimated -= min_t(unsigned int, estimated, + (_task_util_est(p) | UTIL_AVG_UNCHANGED)); + } + util = max(util, estimated); + } /* * Utilization (estimated) can exceed the CPU capacity, thus let's -- cgit v1.2.3 From dded2e159208a9edc21dd5c5f583afa28d378d39 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 27 Sep 2018 17:17:49 +0000 Subject: kdb: use correct pointer when 'btc' calls 'btt' On a powerpc 8xx, 'btc' fails as follows: Entering kdb (current=0x(ptrval), pid 282) due to Keyboard Entry kdb> btc btc: cpu status: Currently on cpu 0 Available cpus: 0 kdb_getarea: Bad address 0x0 when booting the kernel with 'debug_boot_weak_hash', it fails as well Entering kdb (current=0xba99ad80, pid 284) due to Keyboard Entry kdb> btc btc: cpu status: Currently on cpu 0 Available cpus: 0 kdb_getarea: Bad address 0xba99ad80 On other platforms, Oopses have been observed too, see https://github.com/linuxppc/linux/issues/139 This is due to btc calling 'btt' with %p pointer as an argument. 
This patch replaces %p by %px to get the real pointer value as expected by 'btt' Fixes: ad67b74d2469 ("printk: hash addresses printed with %p") Cc: Signed-off-by: Christophe Leroy Reviewed-by: Daniel Thompson Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_bt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 6ad4a9fcbd6f..7921ae4fca8d 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -179,14 +179,14 @@ kdb_bt(int argc, const char **argv) kdb_printf("no process for cpu %ld\n", cpu); return 0; } - sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu)); + sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu)); kdb_parse(buf); return 0; } kdb_printf("btc: cpu status: "); kdb_parse("cpu\n"); for_each_online_cpu(cpu) { - sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu)); + sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu)); kdb_parse(buf); touch_nmi_watchdog(); } -- cgit v1.2.3 From 568fb6f42ac6851320adaea25f8f1b94de14e40a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 27 Sep 2018 17:17:57 +0000 Subject: kdb: print real address of pointers instead of hashed addresses Since commit ad67b74d2469 ("printk: hash addresses printed with %p"), all pointers printed with %p are printed with hashed addresses instead of real addresses in order to avoid leaking addresses in dmesg and syslog. But this applies to kdb too, with is unfortunate: Entering kdb (current=0x(ptrval), pid 329) due to Keyboard Entry kdb> ps 15 sleeping system daemon (state M) processes suppressed, use 'ps A' to see all. Task Addr Pid Parent [*] cpu State Thread Command 0x(ptrval) 329 328 1 0 R 0x(ptrval) *sh 0x(ptrval) 1 0 0 0 S 0x(ptrval) init 0x(ptrval) 3 2 0 0 D 0x(ptrval) rcu_gp 0x(ptrval) 4 2 0 0 D 0x(ptrval) rcu_par_gp 0x(ptrval) 5 2 0 0 D 0x(ptrval) kworker/0:0 0x(ptrval) 6 2 0 0 D 0x(ptrval) kworker/0:0H 0x(ptrval) 7 2 0 0 D 0x(ptrval) kworker/u2:0 0x(ptrval) 8 2 0 0 D 0x(ptrval) mm_percpu_wq 0x(ptrval) 10 2 0 0 D 0x(ptrval) rcu_preempt The whole purpose of kdb is to debug, and for debugging real addresses need to be known. In addition, data displayed by kdb doesn't go into dmesg. This patch replaces all %p by %px in kdb in order to display real addresses. Fixes: ad67b74d2469 ("printk: hash addresses printed with %p") Cc: Signed-off-by: Christophe Leroy Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_main.c | 14 +++++++------- kernel/debug/kdb/kdb_support.c | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index bb4fe4e1a601..959242084b40 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1192,7 +1192,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, if (reason == KDB_REASON_DEBUG) { /* special case below */ } else { - kdb_printf("\nEntering kdb (current=0x%p, pid %d) ", + kdb_printf("\nEntering kdb (current=0x%px, pid %d) ", kdb_current, kdb_current ? 
kdb_current->pid : 0); #if defined(CONFIG_SMP) kdb_printf("on processor %d ", raw_smp_processor_id()); @@ -1208,7 +1208,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, */ switch (db_result) { case KDB_DB_BPT: - kdb_printf("\nEntering kdb (0x%p, pid %d) ", + kdb_printf("\nEntering kdb (0x%px, pid %d) ", kdb_current, kdb_current->pid); #if defined(CONFIG_SMP) kdb_printf("on processor %d ", raw_smp_processor_id()); @@ -2048,7 +2048,7 @@ static int kdb_lsmod(int argc, const char **argv) if (mod->state == MODULE_STATE_UNFORMED) continue; - kdb_printf("%-20s%8u 0x%p ", mod->name, + kdb_printf("%-20s%8u 0x%px ", mod->name, mod->core_layout.size, (void *)mod); #ifdef CONFIG_MODULE_UNLOAD kdb_printf("%4d ", module_refcount(mod)); @@ -2059,7 +2059,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf(" (Loading)"); else kdb_printf(" (Live)"); - kdb_printf(" 0x%p", mod->core_layout.base); + kdb_printf(" 0x%px", mod->core_layout.base); #ifdef CONFIG_MODULE_UNLOAD { @@ -2341,7 +2341,7 @@ void kdb_ps1(const struct task_struct *p) return; cpu = kdb_process_cpu(p); - kdb_printf("0x%p %8d %8d %d %4d %c 0x%p %c%s\n", + kdb_printf("0x%px %8d %8d %d %4d %c 0x%px %c%s\n", (void *)p, p->pid, p->parent->pid, kdb_task_has_cpu(p), kdb_process_cpu(p), kdb_task_state_char(p), @@ -2354,7 +2354,7 @@ void kdb_ps1(const struct task_struct *p) } else { if (KDB_TSK(cpu) != p) kdb_printf(" Error: does not match running " - "process table (0x%p)\n", KDB_TSK(cpu)); + "process table (0x%px)\n", KDB_TSK(cpu)); } } } @@ -2687,7 +2687,7 @@ int kdb_register_flags(char *cmd, for_each_kdbcmd(kp, i) { if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { kdb_printf("Duplicate kdb command registered: " - "%s, func %p help %s\n", cmd, func, help); + "%s, func %px help %s\n", cmd, func, help); return 1; } } diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 990b3cc526c8..987eb73284d2 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -40,7 +40,7 @@ int kdbgetsymval(const char *symname, kdb_symtab_t *symtab) { if (KDB_DEBUG(AR)) - kdb_printf("kdbgetsymval: symname=%s, symtab=%p\n", symname, + kdb_printf("kdbgetsymval: symname=%s, symtab=%px\n", symname, symtab); memset(symtab, 0, sizeof(*symtab)); symtab->sym_start = kallsyms_lookup_name(symname); @@ -88,7 +88,7 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) char *knt1 = NULL; if (KDB_DEBUG(AR)) - kdb_printf("kdbnearsym: addr=0x%lx, symtab=%p\n", addr, symtab); + kdb_printf("kdbnearsym: addr=0x%lx, symtab=%px\n", addr, symtab); memset(symtab, 0, sizeof(*symtab)); if (addr < 4096) @@ -149,7 +149,7 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) symtab->mod_name = "kernel"; if (KDB_DEBUG(AR)) kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, " - "symtab->mod_name=%p, symtab->sym_name=%p (%s)\n", ret, + "symtab->mod_name=%px, symtab->sym_name=%px (%s)\n", ret, symtab->sym_start, symtab->mod_name, symtab->sym_name, symtab->sym_name); @@ -887,13 +887,13 @@ void debug_kusage(void) __func__, dah_first); if (dah_first) { h_used = (struct debug_alloc_header *)debug_alloc_pool; - kdb_printf("%s: h_used %p size %d\n", __func__, h_used, + kdb_printf("%s: h_used %px size %d\n", __func__, h_used, h_used->size); } do { h_used = (struct debug_alloc_header *) ((char *)h_free + dah_overhead + h_free->size); - kdb_printf("%s: h_used %p size %d caller %p\n", + kdb_printf("%s: h_used %px size %d caller %px\n", __func__, h_used, h_used->size, h_used->caller); 
h_free = (struct debug_alloc_header *) (debug_alloc_pool + h_free->next); @@ -902,7 +902,7 @@ void debug_kusage(void) ((char *)h_free + dah_overhead + h_free->size); if ((char *)h_used - debug_alloc_pool != sizeof(debug_alloc_pool_aligned)) - kdb_printf("%s: h_used %p size %d caller %p\n", + kdb_printf("%s: h_used %px size %d caller %px\n", __func__, h_used, h_used->size, h_used->caller); out: spin_unlock(&dap_lock); -- cgit v1.2.3 From c2b94c72d93d0929f48157eef128c4f9d2e603ce Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Thu, 20 Sep 2018 08:59:14 -0400 Subject: kdb: Use strscpy with destination buffer size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc 8.1.0 warns with: kernel/debug/kdb/kdb_support.c: In function ‘kallsyms_symbol_next’: kernel/debug/kdb/kdb_support.c:239:4: warning: ‘strncpy’ specified bound depends on the length of the source argument [-Wstringop-overflow=] strncpy(prefix_name, name, strlen(name)+1); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ kernel/debug/kdb/kdb_support.c:239:31: note: length computed here Use strscpy() with the destination buffer size, and use ellipses when displaying truncated symbols. v2: Use strscpy() Signed-off-by: Prarit Bhargava Cc: Jonathan Toppins Cc: Jason Wessel Cc: Daniel Thompson Cc: kgdb-bugreport@lists.sourceforge.net Reviewed-by: Daniel Thompson Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_io.c | 15 +++++++++------ kernel/debug/kdb/kdb_private.h | 2 +- kernel/debug/kdb/kdb_support.c | 10 +++++----- 3 files changed, 15 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index ed5d34925ad0..6a4b41484afe 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -216,7 +216,7 @@ static char *kdb_read(char *buffer, size_t bufsize) int count; int i; int diag, dtab_count; - int key; + int key, buf_size, ret; diag = kdbgetintenv("DTABCOUNT", &dtab_count); @@ -336,9 +336,8 @@ poll_again: else p_tmp = tmpbuffer; len = strlen(p_tmp); - count = kallsyms_symbol_complete(p_tmp, - sizeof(tmpbuffer) - - (p_tmp - tmpbuffer)); + buf_size = sizeof(tmpbuffer) - (p_tmp - tmpbuffer); + count = kallsyms_symbol_complete(p_tmp, buf_size); if (tab == 2 && count > 0) { kdb_printf("\n%d symbols are found.", count); if (count > dtab_count) { @@ -350,9 +349,13 @@ poll_again: } kdb_printf("\n"); for (i = 0; i < count; i++) { - if (WARN_ON(!kallsyms_symbol_next(p_tmp, i))) + ret = kallsyms_symbol_next(p_tmp, i, buf_size); + if (WARN_ON(!ret)) break; - kdb_printf("%s ", p_tmp); + if (ret != -E2BIG) + kdb_printf("%s ", p_tmp); + else + kdb_printf("%s... ", p_tmp); *(p_tmp + len) = '\0'; } if (i >= dtab_count) diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 1e5a502ba4a7..2118d8258b7c 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -83,7 +83,7 @@ typedef struct __ksymtab { unsigned long sym_start; unsigned long sym_end; } kdb_symtab_t; -extern int kallsyms_symbol_next(char *prefix_name, int flag); +extern int kallsyms_symbol_next(char *prefix_name, int flag, int buf_size); extern int kallsyms_symbol_complete(char *prefix_name, int max_len); /* Exported Symbols for kernel loadable modules to use. 
*/ diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 987eb73284d2..b14b0925c184 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -221,11 +221,13 @@ int kallsyms_symbol_complete(char *prefix_name, int max_len) * Parameters: * prefix_name prefix of a symbol name to lookup * flag 0 means search from the head, 1 means continue search. + * buf_size maximum length that can be written to prefix_name + * buffer * Returns: * 1 if a symbol matches the given prefix. * 0 if no string found */ -int kallsyms_symbol_next(char *prefix_name, int flag) +int kallsyms_symbol_next(char *prefix_name, int flag, int buf_size) { int prefix_len = strlen(prefix_name); static loff_t pos; @@ -235,10 +237,8 @@ int kallsyms_symbol_next(char *prefix_name, int flag) pos = 0; while ((name = kdb_walk_kallsyms(&pos))) { - if (strncmp(name, prefix_name, prefix_len) == 0) { - strncpy(prefix_name, name, strlen(name)+1); - return 1; - } + if (!strncmp(name, prefix_name, prefix_len)) + return strscpy(prefix_name, name, buf_size); } return 0; } -- cgit v1.2.3 From 9eb62f0e1bc70ebc9b15837a0c4e8f12a7b910cb Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 16 Aug 2018 09:01:41 -0500 Subject: kdb: kdb_main: refactor code in kdb_md_line Replace the whole switch statement with a for loop. This makes the code clearer and easy to read. This also addresses the following Coverity warnings: Addresses-Coverity-ID: 115090 ("Missing break in switch") Addresses-Coverity-ID: 115091 ("Missing break in switch") Addresses-Coverity-ID: 114700 ("Missing break in switch") Suggested-by: Daniel Thompson Signed-off-by: Gustavo A. R. Silva Reviewed-by: Daniel Thompson [daniel.thompson@linaro.org: Tiny grammar change in description] Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_main.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 959242084b40..d72b32c66f7d 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1493,6 +1493,7 @@ static void kdb_md_line(const char *fmtstr, unsigned long addr, char cbuf[32]; char *c = cbuf; int i; + int j; unsigned long word; memset(cbuf, '\0', sizeof(cbuf)); @@ -1538,25 +1539,9 @@ static void kdb_md_line(const char *fmtstr, unsigned long addr, wc.word = word; #define printable_char(c) \ ({unsigned char __c = c; isascii(__c) && isprint(__c) ? __c : '.'; }) - switch (bytesperword) { - case 8: + for (j = 0; j < bytesperword; j++) *c++ = printable_char(*cp++); - *c++ = printable_char(*cp++); - *c++ = printable_char(*cp++); - *c++ = printable_char(*cp++); - addr += 4; - case 4: - *c++ = printable_char(*cp++); - *c++ = printable_char(*cp++); - addr += 2; - case 2: - *c++ = printable_char(*cp++); - addr++; - case 1: - *c++ = printable_char(*cp++); - addr++; - break; - } + addr += bytesperword; #undef printable_char } } -- cgit v1.2.3 From 01cb37351bafc1b44b962842926210115e231f0a Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sat, 4 Aug 2018 23:18:25 -0500 Subject: kdb: kdb_keyboard: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Notice that in this particular case, I replaced the code comments with a proper "fall through" annotation, which is what GCC is expecting to find. Signed-off-by: Gustavo A. R. 
Silva Reviewed-by: Daniel Thompson Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_keyboard.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c index 118527aa60ea..750497b0003a 100644 --- a/kernel/debug/kdb/kdb_keyboard.c +++ b/kernel/debug/kdb/kdb_keyboard.c @@ -173,11 +173,11 @@ int kdb_get_kbd_char(void) case KT_LATIN: if (isprint(keychar)) break; /* printable characters */ - /* drop through */ + /* fall through */ case KT_SPEC: if (keychar == K_ENTER) break; - /* drop through */ + /* fall through */ default: return -1; /* ignore unprintables */ } -- cgit v1.2.3 From 646558ff1643467d3b941b47f519867cbca462c3 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sat, 4 Aug 2018 21:48:44 -0500 Subject: kdb: kdb_support: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Notice that in this particular case, I replaced the code comments with a proper "fall through" annotation, which is what GCC is expecting to find. Signed-off-by: Gustavo A. R. Silva Reviewed-by: Daniel Thompson Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_support.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index b14b0925c184..50bf9b119bad 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -432,7 +432,7 @@ int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size) *word = w8; break; } - /* drop through */ + /* fall through */ default: diag = KDB_BADWIDTH; kdb_printf("kdb_getphysword: bad width %ld\n", (long) size); @@ -481,7 +481,7 @@ int kdb_getword(unsigned long *word, unsigned long addr, size_t size) *word = w8; break; } - /* drop through */ + /* fall through */ default: diag = KDB_BADWIDTH; kdb_printf("kdb_getword: bad width %ld\n", (long) size); @@ -525,7 +525,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size) diag = kdb_putarea(addr, w8); break; } - /* drop through */ + /* fall through */ default: diag = KDB_BADWIDTH; kdb_printf("kdb_putword: bad width %ld\n", (long) size); -- cgit v1.2.3 From afd594240806acc138cf696c09f2f4829d55d02f Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Fri, 16 Nov 2018 12:00:07 +0000 Subject: bpf: fix off-by-one error in adjust_subprog_starts When patching in a new sequence for the first insn of a subprog, the start of that subprog does not change (it's the first insn of the sequence), so adjust_subprog_starts should check start <= off (rather than < off). Also added a test to test_verifier.c (it's essentially the syz reproducer). Fixes: cc8b0b92a169 ("bpf: introduce function calls (function boundaries)") Reported-by: syzbot+4fc427c7af994b0948be@syzkaller.appspotmail.com Signed-off-by: Edward Cree Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- tools/testing/selftests/bpf/test_verifier.c | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1971ca325fb4..6dd419550aba 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5650,7 +5650,7 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len return; /* NOTE: fake 'exit' subprog should be updated as well. 
*/ for (i = 0; i <= env->subprog_cnt; i++) { - if (env->subprog_info[i].start < off) + if (env->subprog_info[i].start <= off) continue; env->subprog_info[i].start += len - 1; } diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 6f61df62f690..550b7e46bf4a 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -13896,6 +13896,25 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = ACCEPT, }, + { + "calls: ctx read at start of subprog", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 5), + BPF_JMP_REG(BPF_JSGT, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_B, BPF_REG_9, BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, + .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .result_unpriv = REJECT, + .result = ACCEPT, + }, }; static int probe_filter_length(const struct bpf_insn *fp) -- cgit v1.2.3 From 569a933b03f3c48b392fe67c0086b3a6b9306b5a Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 14 Nov 2018 10:00:34 -0800 Subject: bpf: allocate local storage buffers using GFP_ATOMIC Naresh reported an issue with the non-atomic memory allocation of cgroup local storage buffers: [ 73.047526] BUG: sleeping function called from invalid context at /srv/oe/build/tmp-rpb-glibc/work-shared/intel-corei7-64/kernel-source/mm/slab.h:421 [ 73.060915] in_atomic(): 1, irqs_disabled(): 0, pid: 3157, name: test_cgroup_sto [ 73.068342] INFO: lockdep is turned off. [ 73.072293] CPU: 2 PID: 3157 Comm: test_cgroup_sto Not tainted 4.20.0-rc2-next-20181113 #1 [ 73.080548] Hardware name: Supermicro SYS-5019S-ML/X11SSH-F, BIOS 2.0b 07/27/2017 [ 73.088018] Call Trace: [ 73.090463] dump_stack+0x70/0xa5 [ 73.093783] ___might_sleep+0x152/0x240 [ 73.097619] __might_sleep+0x4a/0x80 [ 73.101191] __kmalloc_node+0x1cf/0x2f0 [ 73.105031] ? cgroup_storage_update_elem+0x46/0x90 [ 73.109909] cgroup_storage_update_elem+0x46/0x90 cgroup_storage_update_elem() (as well as other update map update callbacks) is called with disabled preemption, so GFP_ATOMIC allocation should be used: e.g. alloc_htab_elem() in hashtab.c. Reported-by: Naresh Kamboju Tested-by: Naresh Kamboju Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/local_storage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index c97a8f968638..bed9d48a7ae9 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -139,7 +139,8 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, return -ENOENT; new = kmalloc_node(sizeof(struct bpf_storage_buffer) + - map->value_size, __GFP_ZERO | GFP_USER, + map->value_size, + __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, map->numa_node); if (!new) return -ENOMEM; -- cgit v1.2.3 From 8fcb2312d1e3300e81aa871aad00d4c038cfc184 Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Fri, 16 Nov 2018 15:08:00 -0800 Subject: kernel/sched/psi.c: simplify cgroup_move_task() The existing code triggered an invalid warning about 'rq' possibly being used uninitialized. 
Instead of doing the silly warning suppression by initializa it to NULL, refactor the code to bail out early instead. Warning was: kernel/sched/psi.c: In function `cgroup_move_task': kernel/sched/psi.c:639:13: warning: `rq' may be used uninitialized in this function [-Wmaybe-uninitialized] Link: http://lkml.kernel.org/r/20181103183339.8669-1-olof@lixom.net Fixes: 2ce7135adc9ad ("psi: cgroup support") Signed-off-by: Olof Johansson Reviewed-by: Andrew Morton Acked-by: Johannes Weiner Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 7cdecfc010af..3d7355d7c3e3 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -633,38 +633,39 @@ void psi_cgroup_free(struct cgroup *cgroup) */ void cgroup_move_task(struct task_struct *task, struct css_set *to) { - bool move_psi = !psi_disabled; unsigned int task_flags = 0; struct rq_flags rf; struct rq *rq; - if (move_psi) { - rq = task_rq_lock(task, &rf); + if (psi_disabled) { + /* + * Lame to do this here, but the scheduler cannot be locked + * from the outside, so we move cgroups from inside sched/. + */ + rcu_assign_pointer(task->cgroups, to); + return; + } - if (task_on_rq_queued(task)) - task_flags = TSK_RUNNING; - else if (task->in_iowait) - task_flags = TSK_IOWAIT; + rq = task_rq_lock(task, &rf); - if (task->flags & PF_MEMSTALL) - task_flags |= TSK_MEMSTALL; + if (task_on_rq_queued(task)) + task_flags = TSK_RUNNING; + else if (task->in_iowait) + task_flags = TSK_IOWAIT; - if (task_flags) - psi_task_change(task, task_flags, 0); - } + if (task->flags & PF_MEMSTALL) + task_flags |= TSK_MEMSTALL; - /* - * Lame to do this here, but the scheduler cannot be locked - * from the outside, so we move cgroups from inside sched/. - */ + if (task_flags) + psi_task_change(task, task_flags, 0); + + /* See comment above */ rcu_assign_pointer(task->cgroups, to); - if (move_psi) { - if (task_flags) - psi_task_change(task, 0, task_flags); + if (task_flags) + psi_task_change(task, 0, task_flags); - task_rq_unlock(rq, task, &rf); - } + task_rq_unlock(rq, task, &rf); } #endif /* CONFIG_CGROUPS */ -- cgit v1.2.3 From cb216b84d6ea24fa10f1e7aac35de77246841041 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 21 Nov 2018 16:00:51 +0000 Subject: swiotlb: Skip cache maintenance on map error If swiotlb_bounce_page() failed, calling arch_sync_dma_for_device() may lead to such delights as performing cache maintenance on whatever address phys_to_virt(SWIOTLB_MAP_ERROR) looks like, which is typically outside the kernel memory map and goes about as well as expected. Don't do that. 
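As a hedged driver-side sketch (illustrative only, not taken from this patch), the rule the fix protects is the usual one: check a streaming mapping for failure before handing the address to the device or to any sync/cache-maintenance helper.

#include <linux/dma-mapping.h>
#include <linux/errno.h>

/* dev, page, len and dir are assumed to come from the surrounding driver */
static int map_one_page(struct device *dev, struct page *page, size_t len,
			enum dma_data_direction dir, dma_addr_t *out)
{
	*out = dma_map_page(dev, page, 0, len, dir);

	/* on failure (e.g. no bounce buffer left) the returned cookie is
	 * invalid; bail out instead of syncing or programming it */
	if (dma_mapping_error(dev, *out))
		return -ENOMEM;

	return 0;
}
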
Fixes: a4a4330db46a ("swiotlb: add support for non-coherent DMA") Tested-by: John Stultz Signed-off-by: Robin Murphy Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 5731daa09a32..045930e32c0e 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -679,7 +679,8 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, } if (!dev_is_dma_coherent(dev) && - (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0) + (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0 && + dev_addr != DIRECT_MAPPING_ERROR) arch_sync_dma_for_device(dev, phys, size, dir); return dev_addr; -- cgit v1.2.3 From 813961de3ee6474dd5703e883471fd941d6c8f69 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 22 Nov 2018 10:49:56 -0800 Subject: bpf: fix integer overflow in queue_stack_map Fix the following issues: - allow queue_stack_map for root only - fix u32 max_entries overflow - disallow value_size == 0 Fixes: f1a2e44a3aec ("bpf: add queue and stack maps") Reported-by: Wei Wu Signed-off-by: Alexei Starovoitov Cc: Mauricio Vasquez B Signed-off-by: Daniel Borkmann --- kernel/bpf/queue_stack_maps.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 8bbd72d3a121..b384ea9f3254 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "percpu_freelist.h" #define QUEUE_STACK_CREATE_FLAG_MASK \ @@ -45,8 +46,12 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) /* Called from syscall */ static int queue_stack_map_alloc_check(union bpf_attr *attr) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 0 || + attr->value_size == 0 || attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK) return -EINVAL; @@ -63,15 +68,10 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) { int ret, numa_node = bpf_map_attr_numa_node(attr); struct bpf_queue_stack *qs; - u32 size, value_size; - u64 queue_size, cost; - - size = attr->max_entries + 1; - value_size = attr->value_size; - - queue_size = sizeof(*qs) + (u64) value_size * size; + u64 size, queue_size, cost; - cost = queue_size; + size = (u64) attr->max_entries + 1; + cost = queue_size = sizeof(*qs) + size * attr->value_size; if (cost >= U32_MAX - PAGE_SIZE) return ERR_PTR(-E2BIG); -- cgit v1.2.3 From 09d3f015d1e1b4fee7e9bbdcf54201d239393391 Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Thu, 22 Nov 2018 17:10:31 +0100 Subject: uprobes: Fix handle_swbp() vs. unregister() + register() race once more Commit: 142b18ddc8143 ("uprobes: Fix handle_swbp() vs unregister() + register() race") added the UPROBE_COPY_INSN flag, and corresponding smp_wmb() and smp_rmb() memory barriers, to ensure that handle_swbp() uses fully-initialized uprobes only. However, the smp_rmb() is mis-placed: this barrier should be placed after handle_swbp() has tested for the flag, thus guaranteeing that (program-order) subsequent loads from the uprobe can see the initial stores performed by prepare_uprobe(). Move the smp_rmb() accordingly. Also amend the comments associated to the two memory barriers to indicate their actual locations. 
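A hedged user-space analogue of the ordering pattern involved (C11 acquire/release standing in for the smp_wmb()/smp_rmb() pair; this is not the uprobes code itself):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int payload;		/* plays the role of uprobe->arch */
static atomic_int ready;	/* plays the role of UPROBE_COPY_INSN */

static void *writer(void *arg)
{
	(void)arg;
	payload = 42;		/* initialize everything first ... */
	/* ... then publish; release ~ smp_wmb() before setting the flag */
	atomic_store_explicit(&ready, 1, memory_order_release);
	return NULL;
}

static void *reader(void *arg)
{
	(void)arg;
	/* acquire ~ testing the flag and only then ordering later loads,
	 * i.e. the read barrier must sit after the flag test, not before it */
	while (!atomic_load_explicit(&ready, memory_order_acquire))
		;
	printf("payload = %d\n", payload);	/* guaranteed to be 42 */
	return NULL;
}

int main(void)
{
	pthread_t w, r;

	pthread_create(&r, NULL, reader, NULL);
	pthread_create(&w, NULL, writer, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	return 0;
}
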
Signed-off-by: Andrea Parri Acked-by: Oleg Nesterov Cc: Alexander Shishkin Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: stable@kernel.org Fixes: 142b18ddc8143 ("uprobes: Fix handle_swbp() vs unregister() + register() race") Link: http://lkml.kernel.org/r/20181122161031.15179-1-andrea.parri@amarulasolutions.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 96d4bee83489..322e97bbb437 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -829,7 +829,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, BUG_ON((uprobe->offset & ~PAGE_MASK) + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); - smp_wmb(); /* pairs with rmb() in find_active_uprobe() */ + smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */ set_bit(UPROBE_COPY_INSN, &uprobe->flags); out: @@ -2178,10 +2178,18 @@ static void handle_swbp(struct pt_regs *regs) * After we hit the bp, _unregister + _register can install the * new and not-yet-analyzed uprobe at the same address, restart. */ - smp_rmb(); /* pairs with wmb() in install_breakpoint() */ if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) goto out; + /* + * Pairs with the smp_wmb() in prepare_uprobe(). + * + * Guarantees that if we see the UPROBE_COPY_INSN bit set, then + * we must also see the stores to &uprobe->arch performed by the + * prepare_uprobe() call. + */ + smp_rmb(); + /* Tracing handlers use ->utask to communicate with fetch methods */ if (!get_utask()) goto out; -- cgit v1.2.3 From 1efb6ee3edea57f57f9fb05dba8dcb3f7333f61f Mon Sep 17 00:00:00 2001 From: Martynas Pumputis Date: Fri, 23 Nov 2018 17:43:26 +0100 Subject: bpf: fix check of allowed specifiers in bpf_trace_printk A format string consisting of "%p" or "%s" followed by an invalid specifier (e.g. "%p%\n" or "%s%") could pass the check which would make format_decode (lib/vsprintf.c) to warn. Fixes: 9c959c863f82 ("tracing: Allow BPF programs to call bpf_trace_printk()") Reported-by: syzbot+1ec5c5ec949c4adaa0c4@syzkaller.appspotmail.com Signed-off-by: Martynas Pumputis Signed-off-by: Daniel Borkmann --- kernel/trace/bpf_trace.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 08fcfe440c63..9864a35c8bb5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -196,11 +196,13 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, i++; } else if (fmt[i] == 'p' || fmt[i] == 's') { mod[fmt_cnt]++; - i++; - if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) + /* disallow any further format extensions */ + if (fmt[i + 1] != 0 && + !isspace(fmt[i + 1]) && + !ispunct(fmt[i + 1])) return -EINVAL; fmt_cnt++; - if (fmt[i - 1] == 's') { + if (fmt[i] == 's') { if (str_seen) /* allow only one '%s' per fmt string */ return -EINVAL; -- cgit v1.2.3 From 8114865ff82e200b383e46821c25cb0625b842b5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 18 Nov 2018 17:10:15 -0500 Subject: function_graph: Create function_graph_enter() to consolidate architecture code Currently all the architectures do basically the same thing in preparing the function graph tracer on entry to a function. 
This code can be pulled into a generic location and then this will allow the function graph tracer to be fixed, as well as extended. Create a new function graph helper function_graph_enter() that will call the hook function (ftrace_graph_entry) and the shadow stack operation (ftrace_push_return_trace), and remove the need of the architecture code to manage the shadow stack. This is needed to prepare for a fix of a design bug on how the curr_ret_stack is used. Cc: stable@kernel.org Fixes: 03274a3ffb449 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 3 +++ kernel/trace/trace_functions_graph.c | 16 ++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index a397907e8d72..5717e8f81c59 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -779,6 +779,9 @@ extern void return_to_handler(void); extern int ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, unsigned long frame_pointer, unsigned long *retp); +extern int +function_graph_enter(unsigned long ret, unsigned long func, + unsigned long frame_pointer, unsigned long *retp); unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, unsigned long ret, unsigned long *retp); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 169b3c44ee97..28f2602435d0 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -182,6 +182,22 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, return 0; } +int function_graph_enter(unsigned long ret, unsigned long func, + unsigned long frame_pointer, unsigned long *retp) +{ + struct ftrace_graph_ent trace; + + trace.func = func; + trace.depth = current->curr_ret_stack + 1; + + /* Only trace if the calling function expects to */ + if (!ftrace_graph_entry(&trace)) + return -EBUSY; + + return ftrace_push_return_trace(ret, func, &trace.depth, + frame_pointer, retp); +} + /* Retrieve a function return address to the trace stack on thread info.*/ static void ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, -- cgit v1.2.3 From e2c95a61656d29ceaac97b6a975c8a1f26e26f15 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 26 Nov 2018 14:05:38 +0100 Subject: bpf, ppc64: generalize fetching subprog into bpf_jit_get_func_addr Make fetching of the BPF call address from ppc64 JIT generic. ppc64 was using a slightly different variant rather than through the insns' imm field encoding as the target address would not fit into that space. Therefore, the target subprog number was encoded into the insns' offset and fetched through fp->aux->func[off]->bpf_func instead. Given there are other JITs with this issue and the mechanism of fetching the address is JIT-generic, move it into the core as a helper instead. On the JIT side, we get information on whether the retrieved address is a fixed one, that is, not changing through JIT passes, or a dynamic one. For the former, JITs can optimize their imm emission because this doesn't change jump offsets throughout JIT process. 
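Schematically, a JIT's BPF_JMP | BPF_CALL handling with the new helper ends up with the shape below (a sketch only; emit_call_fixed()/emit_call_rel() stand in for whatever call-emission routines a given JIT already has):

bool func_addr_fixed;
u64 func_addr;
int ret;

ret = bpf_jit_get_func_addr(fp, &insn[i], extra_pass,
                            &func_addr, &func_addr_fixed);
if (ret < 0)
        return ret;
if (func_addr_fixed)
        emit_call_fixed(image, ctx, func_addr); /* helper call: address is
                                                   stable across JIT passes  */
else
        emit_call_rel(image, ctx, func_addr);   /* subprog call: address only
                                                   settles in the extra pass */

The ppc64 hunks below are the first conversion to this pattern.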
Signed-off-by: Daniel Borkmann Reviewed-by: Sandipan Das Tested-by: Sandipan Das Signed-off-by: Alexei Starovoitov --- arch/powerpc/net/bpf_jit_comp64.c | 57 ++++++++++++++++++++++++++------------- include/linux/filter.h | 4 +++ kernel/bpf/core.c | 34 +++++++++++++++++++++++ 3 files changed, 76 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 50b129785aee..17482f5de3e2 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -166,7 +166,33 @@ static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx) PPC_BLR(); } -static void bpf_jit_emit_func_call(u32 *image, struct codegen_context *ctx, u64 func) +static void bpf_jit_emit_func_call_hlp(u32 *image, struct codegen_context *ctx, + u64 func) +{ +#ifdef PPC64_ELF_ABI_v1 + /* func points to the function descriptor */ + PPC_LI64(b2p[TMP_REG_2], func); + /* Load actual entry point from function descriptor */ + PPC_BPF_LL(b2p[TMP_REG_1], b2p[TMP_REG_2], 0); + /* ... and move it to LR */ + PPC_MTLR(b2p[TMP_REG_1]); + /* + * Load TOC from function descriptor at offset 8. + * We can clobber r2 since we get called through a + * function pointer (so caller will save/restore r2) + * and since we don't use a TOC ourself. + */ + PPC_BPF_LL(2, b2p[TMP_REG_2], 8); +#else + /* We can clobber r12 */ + PPC_FUNC_ADDR(12, func); + PPC_MTLR(12); +#endif + PPC_BLRL(); +} + +static void bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, + u64 func) { unsigned int i, ctx_idx = ctx->idx; @@ -273,7 +299,7 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, { const struct bpf_insn *insn = fp->insnsi; int flen = fp->len; - int i; + int i, ret; /* Start of epilogue code - will only be valid 2nd pass onwards */ u32 exit_addr = addrs[flen]; @@ -284,8 +310,9 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 src_reg = b2p[insn[i].src_reg]; s16 off = insn[i].off; s32 imm = insn[i].imm; + bool func_addr_fixed; + u64 func_addr; u64 imm64; - u8 *func; u32 true_cond; u32 tmp_idx; @@ -711,23 +738,15 @@ emit_clear: case BPF_JMP | BPF_CALL: ctx->seen |= SEEN_FUNC; - /* bpf function call */ - if (insn[i].src_reg == BPF_PSEUDO_CALL) - if (!extra_pass) - func = NULL; - else if (fp->aux->func && off < fp->aux->func_cnt) - /* use the subprog id from the off - * field to lookup the callee address - */ - func = (u8 *) fp->aux->func[off]->bpf_func; - else - return -EINVAL; - /* kernel helper call */ - else - func = (u8 *) __bpf_call_base + imm; - - bpf_jit_emit_func_call(image, ctx, (u64)func); + ret = bpf_jit_get_func_addr(fp, &insn[i], extra_pass, + &func_addr, &func_addr_fixed); + if (ret < 0) + return ret; + if (func_addr_fixed) + bpf_jit_emit_func_call_hlp(image, ctx, func_addr); + else + bpf_jit_emit_func_call_rel(image, ctx, func_addr); /* move return value from r3 to BPF_REG_0 */ PPC_MR(b2p[BPF_REG_0], 3); break; diff --git a/include/linux/filter.h b/include/linux/filter.h index de629b706d1d..448dcc448f1f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -866,6 +866,10 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr); void bpf_jit_free(struct bpf_prog *fp); +int bpf_jit_get_func_addr(const struct bpf_prog *prog, + const struct bpf_insn *insn, bool extra_pass, + u64 *func_addr, bool *func_addr_fixed); + struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp); void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other); diff --git 
a/kernel/bpf/core.c b/kernel/bpf/core.c index 1a796e0799ec..b1a3545d0ec8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -672,6 +672,40 @@ void __weak bpf_jit_free(struct bpf_prog *fp) bpf_prog_unlock_free(fp); } +int bpf_jit_get_func_addr(const struct bpf_prog *prog, + const struct bpf_insn *insn, bool extra_pass, + u64 *func_addr, bool *func_addr_fixed) +{ + s16 off = insn->off; + s32 imm = insn->imm; + u8 *addr; + + *func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL; + if (!*func_addr_fixed) { + /* Place-holder address till the last pass has collected + * all addresses for JITed subprograms in which case we + * can pick them up from prog->aux. + */ + if (!extra_pass) + addr = NULL; + else if (prog->aux->func && + off >= 0 && off < prog->aux->func_cnt) + addr = (u8 *)prog->aux->func[off]->bpf_func; + else + return -EINVAL; + } else { + /* Address of a BPF helper call. Since part of the core + * kernel, it's always at a fixed location. __bpf_call_base + * and the helper with imm relative to it are both in core + * kernel. + */ + addr = (u8 *)__bpf_call_base + imm; + } + + *func_addr = (unsigned long)addr; + return 0; +} + static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, struct bpf_insn *to_buff) -- cgit v1.2.3 From d125f3f866df88da5a85df00291f88f0baa89f7c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 19 Nov 2018 07:40:39 -0500 Subject: function_graph: Make ftrace_push_return_trace() static As all architectures now call function_graph_enter() to do the entry work, no architecture should ever call ftrace_push_return_trace(). Make it static. This is needed to prepare for a fix of a design bug on how the curr_ret_stack is used. Cc: stable@kernel.org Fixes: 03274a3ffb449 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 3 --- kernel/trace/trace_functions_graph.c | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 5717e8f81c59..dd16e8218db3 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -776,9 +776,6 @@ struct ftrace_ret_stack { */ extern void return_to_handler(void); -extern int -ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, - unsigned long frame_pointer, unsigned long *retp); extern int function_graph_enter(unsigned long ret, unsigned long func, unsigned long frame_pointer, unsigned long *retp); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 28f2602435d0..88ca787a1cdc 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -118,7 +118,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, struct trace_seq *s, u32 flags); /* Add a function return address to the trace stack on thread info.*/ -int +static int ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, unsigned long frame_pointer, unsigned long *retp) { -- cgit v1.2.3 From 39eb456dacb543de90d3bc6a8e0ac5cf51ac475e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 19 Nov 2018 08:07:12 -0500 Subject: function_graph: Use new curr_ret_depth to manage depth instead of curr_ret_stack Currently, the depth of the ret_stack is determined by curr_ret_stack index. 
The issue is that there's a race between setting of the curr_ret_stack and calling of the callback attached to the return of the function. Commit 03274a3ffb44 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") moved the calling of the callback to after the setting of the curr_ret_stack, even stating that it was safe to do so, when in fact, it was the reason there was a barrier() there (yes, I should have commented that barrier()). Not only does the curr_ret_stack keep track of the current call graph depth, it also keeps the ret_stack content from being overwritten by new data. The function profiler, uses the "subtime" variable of ret_stack structure and by moving the curr_ret_stack, it allows for interrupts to use the same structure it was using, corrupting the data, and breaking the profiler. To fix this, there needs to be two variables to handle the call stack depth and the pointer to where the ret_stack is being used, as they need to change at two different locations. Cc: stable@kernel.org Fixes: 03274a3ffb449 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/sched.h | 1 + kernel/trace/ftrace.c | 3 +++ kernel/trace/trace_functions_graph.c | 21 +++++++++++++-------- 3 files changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index a51c13c2b1a0..d6183a55e8eb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1116,6 +1116,7 @@ struct task_struct { #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack: */ int curr_ret_stack; + int curr_ret_depth; /* Stack of return addresses for return function tracing: */ struct ftrace_ret_stack *ret_stack; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f536f601bd46..48513954713c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6814,6 +6814,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); t->curr_ret_stack = -1; + t->curr_ret_depth = -1; /* Make sure the tasks see the -1 first: */ smp_wmb(); t->ret_stack = ret_stack_list[start++]; @@ -7038,6 +7039,7 @@ graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) { t->curr_ret_stack = -1; + t->curr_ret_depth = -1; /* * The idle task has no parent, it either has its own * stack or no stack at all. 
@@ -7068,6 +7070,7 @@ void ftrace_graph_init_task(struct task_struct *t) /* Make sure we do not use the parent ret_stack */ t->ret_stack = NULL; t->curr_ret_stack = -1; + t->curr_ret_depth = -1; if (ftrace_graph_active) { struct ftrace_ret_stack *ret_stack; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 88ca787a1cdc..02d4081a7f5a 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -119,7 +119,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, /* Add a function return address to the trace stack on thread info.*/ static int -ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, +ftrace_push_return_trace(unsigned long ret, unsigned long func, unsigned long frame_pointer, unsigned long *retp) { unsigned long long calltime; @@ -177,8 +177,6 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, #ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR current->ret_stack[index].retp = retp; #endif - *depth = current->curr_ret_stack; - return 0; } @@ -188,14 +186,20 @@ int function_graph_enter(unsigned long ret, unsigned long func, struct ftrace_graph_ent trace; trace.func = func; - trace.depth = current->curr_ret_stack + 1; + trace.depth = ++current->curr_ret_depth; /* Only trace if the calling function expects to */ if (!ftrace_graph_entry(&trace)) - return -EBUSY; + goto out; - return ftrace_push_return_trace(ret, func, &trace.depth, - frame_pointer, retp); + if (ftrace_push_return_trace(ret, func, + frame_pointer, retp)) + goto out; + + return 0; + out: + current->curr_ret_depth--; + return -EBUSY; } /* Retrieve a function return address to the trace stack on thread info.*/ @@ -257,7 +261,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, trace->func = current->ret_stack[index].func; trace->calltime = current->ret_stack[index].calltime; trace->overrun = atomic_read(¤t->trace_overrun); - trace->depth = index; + trace->depth = current->curr_ret_depth; } /* @@ -273,6 +277,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) trace.rettime = trace_clock_local(); barrier(); current->curr_ret_stack--; + current->curr_ret_depth--; /* * The curr_ret_stack can be less than -1 only if it was * filtered out and it's about to return from the function. -- cgit v1.2.3 From 552701dd0fa7c3d448142e87210590ba424694a0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 19 Nov 2018 15:18:40 -0500 Subject: function_graph: Move return callback before update of curr_ret_stack In the past, curr_ret_stack had two functions. One was to denote the depth of the call graph, the other is to keep track of where on the ret_stack the data is used. Although they may be slightly related, there are two cases where they need to be used differently. The one case is that it keeps the ret_stack data from being corrupted by an interrupt coming in and overwriting the data still in use. The other is just to know where the depth of the stack currently is. The function profiler uses the ret_stack to save a "subtime" variable that is part of the data on the ret_stack. If curr_ret_stack is modified too early, then this variable can be corrupted. The "max_depth" option, when set to 1, will record the first functions going into the kernel. To see all top functions (when dealing with timings), the depth variable needs to be lowered before calling the return hook. 
But by lowering the curr_ret_stack, it makes the data on the ret_stack still being used by the return hook susceptible to being overwritten. Now that there's two variables to handle both cases (curr_ret_depth), we can move them to the locations where they can handle both cases. Cc: stable@kernel.org Fixes: 03274a3ffb449 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_functions_graph.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 02d4081a7f5a..4f0d72ae6362 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -261,7 +261,13 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, trace->func = current->ret_stack[index].func; trace->calltime = current->ret_stack[index].calltime; trace->overrun = atomic_read(¤t->trace_overrun); - trace->depth = current->curr_ret_depth; + trace->depth = current->curr_ret_depth--; + /* + * We still want to trace interrupts coming in if + * max_depth is set to 1. Make sure the decrement is + * seen before ftrace_graph_return. + */ + barrier(); } /* @@ -275,9 +281,14 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) ftrace_pop_return_trace(&trace, &ret, frame_pointer); trace.rettime = trace_clock_local(); + ftrace_graph_return(&trace); + /* + * The ftrace_graph_return() may still access the current + * ret_stack structure, we need to make sure the update of + * curr_ret_stack is after that. + */ barrier(); current->curr_ret_stack--; - current->curr_ret_depth--; /* * The curr_ret_stack can be less than -1 only if it was * filtered out and it's about to return from the function. @@ -288,13 +299,6 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } - /* - * The trace should run after decrementing the ret counter - * in case an interrupt were to come in. We don't want to - * lose the interrupt if max_depth is set. - */ - ftrace_graph_return(&trace); - if (unlikely(!ret)) { ftrace_graph_stop(); WARN_ON(1); -- cgit v1.2.3 From 7c6ea35ef50810aa12ab26f21cb858d980881576 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 20 Nov 2018 12:40:25 -0500 Subject: function_graph: Reverse the order of pushing the ret_stack and the callback The function graph profiler uses the ret_stack to store the "subtime" and reuse it by nested functions and also on the return. But the current logic has the profiler callback called before the ret_stack is updated, and it is just modifying the ret_stack that will later be allocated (it's just lucky that the "subtime" is not touched when it is allocated). This could also cause a crash if we are at the end of the ret_stack when this happens. By reversing the order of the allocating the ret_stack and then calling the callbacks attached to a function being traced, the ret_stack entry is no longer used before it is allocated. 
Cc: stable@kernel.org Fixes: 03274a3ffb449 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_functions_graph.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4f0d72ae6362..2561460d7baf 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -188,15 +188,17 @@ int function_graph_enter(unsigned long ret, unsigned long func, trace.func = func; trace.depth = ++current->curr_ret_depth; - /* Only trace if the calling function expects to */ - if (!ftrace_graph_entry(&trace)) - goto out; - if (ftrace_push_return_trace(ret, func, frame_pointer, retp)) goto out; + /* Only trace if the calling function expects to */ + if (!ftrace_graph_entry(&trace)) + goto out_ret; + return 0; + out_ret: + current->curr_ret_stack--; out: current->curr_ret_depth--; return -EBUSY; -- cgit v1.2.3 From b1b35f2e218a5b57d03bbc3b0667d5064570dc60 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 20 Nov 2018 12:51:07 -0500 Subject: function_graph: Have profiler use curr_ret_stack and not depth The profiler uses trace->depth to find its entry on the ret_stack, but the depth may not match the actual location of where its entry is (if an interrupt were to preempt the processing of the profiler for another function, the depth and the curr_ret_stack will be different). Have it use the curr_ret_stack as the index to find its ret_stack entry instead of using the depth variable, as that is no longer guaranteed to be the same. Cc: stable@kernel.org Fixes: 03274a3ffb449 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 48513954713c..77734451cb05 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -817,7 +817,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip, #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int profile_graph_entry(struct ftrace_graph_ent *trace) { - int index = trace->depth; + int index = current->curr_ret_stack; function_profile_call(trace->func, 0, NULL, NULL); @@ -852,7 +852,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) if (!fgraph_graph_time) { int index; - index = trace->depth; + index = current->curr_ret_stack; /* Append this call time to the parent time to subtract */ if (index) -- cgit v1.2.3 From c5511d03ec090980732e929c318a7a6374b5550e Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Sun, 25 Nov 2018 19:33:36 +0100 Subject: sched/smt: Make sched_smt_present track topology Currently the 'sched_smt_present' static key is enabled when SMT topology is observed at CPU bringup, but it is never disabled. However there is demand to also disable the key when the topology changes such that there is no SMT present anymore. Implement this by making the key count the number of cores that have SMT enabled. In particular, the SMT topology bits are set before interrupts are enabled and similarly, are cleared after interrupts are disabled for the last time and the CPU dies.
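The counting works because static keys can double as reference counts; roughly (plain static_branch_inc()/static_branch_dec() shown for illustration, the patch uses the *_cpuslocked variants since the hotplug lock is already held at these call sites):

/* the key itself is declared elsewhere with DEFINE_STATIC_KEY_FALSE() */
static_branch_inc(&sched_smt_present);  /* 0 -> 1: branch gets patched in  */
static_branch_inc(&sched_smt_present);  /* 1 -> 2: only the count changes  */
static_branch_dec(&sched_smt_present);  /* 2 -> 1: branch stays enabled    */
static_branch_dec(&sched_smt_present);  /* 1 -> 0: branch gets patched out */

Testing cpumask_weight(cpu_smt_mask(cpu)) == 2 on both the up and the down path means exactly one increment or decrement per core as its second sibling comes or goes, so the count tracks the number of cores currently running with SMT.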
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185004.246110444@linutronix.de --- kernel/sched/core.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 091e089063be..6fedf3a98581 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5738,15 +5738,10 @@ int sched_cpu_activate(unsigned int cpu) #ifdef CONFIG_SCHED_SMT /* - * The sched_smt_present static key needs to be evaluated on every - * hotplug event because at boot time SMT might be disabled when - * the number of booted CPUs is limited. - * - * If then later a sibling gets hotplugged, then the key would stay - * off and SMT scheduling would never be functional. + * When going up, increment the number of cores with SMT present. */ - if (cpumask_weight(cpu_smt_mask(cpu)) > 1) - static_branch_enable_cpuslocked(&sched_smt_present); + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) + static_branch_inc_cpuslocked(&sched_smt_present); #endif set_cpu_active(cpu, true); @@ -5790,6 +5785,14 @@ int sched_cpu_deactivate(unsigned int cpu) */ synchronize_rcu_mult(call_rcu, call_rcu_sched); +#ifdef CONFIG_SCHED_SMT + /* + * When going down, decrement the number of cores with SMT present. + */ + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) + static_branch_dec_cpuslocked(&sched_smt_present); +#endif + if (!sched_smp_initialized) return 0; -- cgit v1.2.3 From 321a874a7ef85655e93b3206d0f36b4a6097f948 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:38 +0100 Subject: sched/smt: Expose sched_smt_present static key Make the scheduler's 'sched_smt_present' static key globally available, so it can be used in the x86 speculation control code. Provide a query function and a stub for the CONFIG_SMP=n case.
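A minimal, purely illustrative consumer of the new interface is shown below; the function and helper names are placeholders, the real user is arch_smt_update() in the x86 speculation rework that follows:

#include <linux/sched/smt.h>

static void recheck_smt_mitigation(void)
{
        if (sched_smt_active())
                enable_smt_mitigation();        /* placeholder */
        else
                disable_smt_mitigation();       /* placeholder */
}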
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185004.430168326@linutronix.de --- include/linux/sched/smt.h | 18 ++++++++++++++++++ kernel/sched/sched.h | 4 +--- 2 files changed, 19 insertions(+), 3 deletions(-) create mode 100644 include/linux/sched/smt.h (limited to 'kernel') diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h new file mode 100644 index 000000000000..c9e0be514110 --- /dev/null +++ b/include/linux/sched/smt.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_SMT_H +#define _LINUX_SCHED_SMT_H + +#include + +#ifdef CONFIG_SCHED_SMT +extern struct static_key_false sched_smt_present; + +static __always_inline bool sched_smt_active(void) +{ + return static_branch_likely(&sched_smt_present); +} +#else +static inline bool sched_smt_active(void) { return false; } +#endif + +#endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 618577fc9aa8..4e524ab589c9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -936,9 +937,6 @@ static inline int cpu_of(struct rq *rq) #ifdef CONFIG_SCHED_SMT - -extern struct static_key_false sched_smt_present; - extern void __update_idle_core(struct rq *rq); static inline void update_idle_core(struct rq *rq) -- cgit v1.2.3 From a74cfffb03b73d41e08f84c2e5c87dec0ce3db9f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:39 +0100 Subject: x86/speculation: Rework SMT state change arch_smt_update() is only called when the sysfs SMT control knob is changed. This means that when SMT is enabled in the sysfs control knob the system is considered to have SMT active even if all siblings are offline. To allow finegrained control of the speculation mitigations, the actual SMT state is more interesting than the fact that siblings could be enabled. Rework the code, so arch_smt_update() is invoked from each individual CPU hotplug function, and simplify the update function while at it. 
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185004.521974984@linutronix.de --- arch/x86/kernel/cpu/bugs.c | 11 +++++------ include/linux/sched/smt.h | 2 ++ kernel/cpu.c | 15 +++++++++------ 3 files changed, 16 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index a723af0c4400..5625b323ff32 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -344,16 +345,14 @@ void arch_smt_update(void) return; mutex_lock(&spec_ctrl_mutex); - mask = x86_spec_ctrl_base; - if (cpu_smt_control == CPU_SMT_ENABLED) + + mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP; + if (sched_smt_active()) mask |= SPEC_CTRL_STIBP; - else - mask &= ~SPEC_CTRL_STIBP; if (mask != x86_spec_ctrl_base) { pr_info("Spectre v2 cross-process SMT mitigation: %s STIBP\n", - cpu_smt_control == CPU_SMT_ENABLED ? - "Enabling" : "Disabling"); + mask & SPEC_CTRL_STIBP ? "Enabling" : "Disabling"); x86_spec_ctrl_base = mask; on_each_cpu(update_stibp_msr, NULL, 1); } diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h index c9e0be514110..59d3736c454c 100644 --- a/include/linux/sched/smt.h +++ b/include/linux/sched/smt.h @@ -15,4 +15,6 @@ static __always_inline bool sched_smt_active(void) static inline bool sched_smt_active(void) { return false; } #endif +void arch_smt_update(void); + #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index 3c7f3b4c453c..91d5c38eb7e5 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -367,6 +368,12 @@ static void lockdep_release_cpus_lock(void) #endif /* CONFIG_HOTPLUG_CPU */ +/* + * Architectures that need SMT-specific errata handling during SMT hotplug + * should override this. + */ +void __weak arch_smt_update(void) { } + #ifdef CONFIG_HOTPLUG_SMT enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; EXPORT_SYMBOL_GPL(cpu_smt_control); @@ -1011,6 +1018,7 @@ out: * concurrent CPU hotplug via cpu_add_remove_lock. */ lockup_detector_cleanup(); + arch_smt_update(); return ret; } @@ -1139,6 +1147,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) ret = cpuhp_up_callbacks(cpu, st, target); out: cpus_write_unlock(); + arch_smt_update(); return ret; } @@ -2055,12 +2064,6 @@ static void cpuhp_online_cpu_device(unsigned int cpu) kobject_uevent(&dev->kobj, KOBJ_ONLINE); } -/* - * Architectures that need SMT-specific errata handling during SMT hotplug - * should override this. - */ -void __weak arch_smt_update(void) { }; - static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { int cpu, ret = 0; -- cgit v1.2.3 From 46f7ecb1e7359f183f5bbd1e08b90e10e52164f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:50 +0100 Subject: ptrace: Remove unused ptrace_may_access_sched() and MODE_IBRS The IBPB control code in x86 removed the usage. Remove the functionality which was introduced for this. 
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185005.559149393@linutronix.de --- include/linux/ptrace.h | 17 ----------------- kernel/ptrace.c | 10 ---------- 2 files changed, 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 6c2ffed907f5..de20ede2c5c8 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -64,15 +64,12 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_NOAUDIT 0x04 #define PTRACE_MODE_FSCREDS 0x08 #define PTRACE_MODE_REALCREDS 0x10 -#define PTRACE_MODE_SCHED 0x20 -#define PTRACE_MODE_IBPB 0x40 /* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */ #define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS) #define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS) #define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS) #define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS) -#define PTRACE_MODE_SPEC_IBPB (PTRACE_MODE_ATTACH_REALCREDS | PTRACE_MODE_IBPB) /** * ptrace_may_access - check whether the caller is permitted to access @@ -90,20 +87,6 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); */ extern bool ptrace_may_access(struct task_struct *task, unsigned int mode); -/** - * ptrace_may_access - check whether the caller is permitted to access - * a target task. - * @task: target task - * @mode: selects type of access and caller credentials - * - * Returns true on success, false on denial. - * - * Similar to ptrace_may_access(). Only to be called from context switch - * code. Does not call into audit and the regular LSM hooks due to locking - * constraints. 
- */ -extern bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode); - static inline int ptrace_reparented(struct task_struct *child) { return !same_thread_group(child->real_parent, child->parent); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 80b34dffdfb9..c2cee9db5204 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -261,9 +261,6 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) { - if (mode & PTRACE_MODE_SCHED) - return false; - if (mode & PTRACE_MODE_NOAUDIT) return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE); else @@ -331,16 +328,9 @@ ok: !ptrace_has_cap(mm->user_ns, mode))) return -EPERM; - if (mode & PTRACE_MODE_SCHED) - return 0; return security_ptrace_access_check(task, mode); } -bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode) -{ - return __ptrace_may_access(task, mode | PTRACE_MODE_SCHED); -} - bool ptrace_may_access(struct task_struct *task, unsigned int mode) { int err; -- cgit v1.2.3 From 5cf99a0f3161bc3ae2391269d134d6bf7e26f00e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 29 Nov 2018 08:50:27 -0500 Subject: tracing/fgraph: Fix set_graph_function from showing interrupts The tracefs file set_graph_function is used to only function graph functions that are listed in that file (or all functions if the file is empty). The way this is implemented is that the function graph tracer looks at every function, and if the current depth is zero and the function matches something in the file then it will trace that function. When other functions are called, the depth will be greater than zero (because the original function will be at depth zero), and all functions will be traced where the depth is greater than zero. The issue is that when a function is first entered, and the handler that checks this logic is called, the depth is set to zero. If an interrupt comes in and a function in the interrupt handler is traced, its depth will be greater than zero and it will automatically be traced, even if the original function was not. But because the logic only looks at depth it may trace interrupts when it should not be. The recent design change of the function graph tracer to fix other bugs caused the depth to be zero while the function graph callback handler is being called for a longer time, widening the race of this happening. This bug was actually there for a longer time, but because the race window was so small it seldom happened. The Fixes tag below is for the commit that widen the race window, because that commit belongs to a series that will also help fix the original bug. Cc: stable@kernel.org Fixes: 39eb456dacb5 ("function_graph: Use new curr_ret_depth to manage depth instead of curr_ret_stack") Reported-by: Joe Lawrence Tested-by: Joe Lawrence Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 57 ++++++++++++++++++++++++++++++++++-- kernel/trace/trace_functions_graph.c | 4 +++ kernel/trace/trace_irqsoff.c | 2 ++ kernel/trace/trace_sched_wakeup.c | 2 ++ 4 files changed, 62 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3b8c0e24ab30..447bd96ee658 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -512,12 +512,44 @@ enum { * can only be modified by current, we can reuse trace_recursion. 
*/ TRACE_IRQ_BIT, + + /* Set if the function is in the set_graph_function file */ + TRACE_GRAPH_BIT, + + /* + * In the very unlikely case that an interrupt came in + * at a start of graph tracing, and we want to trace + * the function in that interrupt, the depth can be greater + * than zero, because of the preempted start of a previous + * trace. In an even more unlikely case, depth could be 2 + * if a softirq interrupted the start of graph tracing, + * followed by an interrupt preempting a start of graph + * tracing in the softirq, and depth can even be 3 + * if an NMI came in at the start of an interrupt function + * that preempted a softirq start of a function that + * preempted normal context!!!! Luckily, it can't be + * greater than 3, so the next two bits are a mask + * of what the depth is when we set TRACE_GRAPH_BIT + */ + + TRACE_GRAPH_DEPTH_START_BIT, + TRACE_GRAPH_DEPTH_END_BIT, }; #define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) #define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) #define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) +#define trace_recursion_depth() \ + (((current)->trace_recursion >> TRACE_GRAPH_DEPTH_START_BIT) & 3) +#define trace_recursion_set_depth(depth) \ + do { \ + current->trace_recursion &= \ + ~(3 << TRACE_GRAPH_DEPTH_START_BIT); \ + current->trace_recursion |= \ + ((depth) & 3) << TRACE_GRAPH_DEPTH_START_BIT; \ + } while (0) + #define TRACE_CONTEXT_BITS 4 #define TRACE_FTRACE_START TRACE_FTRACE_BIT @@ -843,8 +875,9 @@ extern void __trace_graph_return(struct trace_array *tr, extern struct ftrace_hash *ftrace_graph_hash; extern struct ftrace_hash *ftrace_graph_notrace_hash; -static inline int ftrace_graph_addr(unsigned long addr) +static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace) { + unsigned long addr = trace->func; int ret = 0; preempt_disable_notrace(); @@ -855,6 +888,14 @@ static inline int ftrace_graph_addr(unsigned long addr) } if (ftrace_lookup_ip(ftrace_graph_hash, addr)) { + + /* + * This needs to be cleared on the return functions + * when the depth is zero. + */ + trace_recursion_set(TRACE_GRAPH_BIT); + trace_recursion_set_depth(trace->depth); + /* * If no irqs are to be traced, but a set_graph_function * is set, and called by an interrupt handler, we still @@ -872,6 +913,13 @@ out: return ret; } +static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace) +{ + if (trace_recursion_test(TRACE_GRAPH_BIT) && + trace->depth == trace_recursion_depth()) + trace_recursion_clear(TRACE_GRAPH_BIT); +} + static inline int ftrace_graph_notrace_addr(unsigned long addr) { int ret = 0; @@ -885,7 +933,7 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr) return ret; } #else -static inline int ftrace_graph_addr(unsigned long addr) +static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace) { return 1; } @@ -894,6 +942,8 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr) { return 0; } +static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace) +{ } #endif /* CONFIG_DYNAMIC_FTRACE */ extern unsigned int fgraph_max_depth; @@ -901,7 +951,8 @@ extern unsigned int fgraph_max_depth; static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace) { /* trace it when it is-nested-in or is a function enabled. 
*/ - return !(trace->depth || ftrace_graph_addr(trace->func)) || + return !(trace_recursion_test(TRACE_GRAPH_BIT) || + ftrace_graph_addr(trace)) || (trace->depth < 0) || (fgraph_max_depth && trace->depth >= fgraph_max_depth); } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 2561460d7baf..086af4f5c3e8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -509,6 +509,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace) int cpu; int pc; + ftrace_graph_addr_finish(trace); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->trace_buffer.data, cpu); @@ -532,6 +534,8 @@ void set_graph_array(struct trace_array *tr) static void trace_graph_thresh_return(struct ftrace_graph_ret *trace) { + ftrace_graph_addr_finish(trace); + if (tracing_thresh && (trace->rettime - trace->calltime < tracing_thresh)) return; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index b7357f9f82a3..98ea6d28df15 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -208,6 +208,8 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace) unsigned long flags; int pc; + ftrace_graph_addr_finish(trace); + if (!func_prolog_dec(tr, &data, &flags)) return; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index a86b303e6c67..7d04b9890755 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -270,6 +270,8 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace) unsigned long flags; int pc; + ftrace_graph_addr_finish(trace); + if (!func_prolog_preempt_disable(tr, &data, &pc)) return; -- cgit v1.2.3 From ef1a8409348966f0b25ff97a170d6d0367710ea9 Mon Sep 17 00:00:00 2001 From: Alexander Popov Date: Tue, 13 Nov 2018 00:08:48 +0300 Subject: stackleak: Disable function tracing and kprobes for stackleak_erase() The stackleak_erase() function is called on the trampoline stack at the end of syscall. This stack is not big enough for ftrace and kprobes operations, e.g. it can be exhausted if we use kprobe_events for stackleak_erase(). So let's disable function tracing and kprobes of stackleak_erase(). 
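The combination being applied (sketched generically here; the actual two-line change is in the diff that follows) is the usual way to keep both ftrace and kprobes away from a function that runs in a fragile context:

#include <linux/kprobes.h>

/* illustrative name; 'notrace' keeps ftrace from instrumenting the function */
asmlinkage void notrace fragile_stack_func(void)
{
        /* runs on a stack too small to absorb tracer or kprobe overhead */
}
/* ... and NOKPROBE_SYMBOL() blacklists it for kprobes */
NOKPROBE_SYMBOL(fragile_stack_func);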
Reported-by: kernel test robot Fixes: 10e9ae9fabaf ("gcc-plugins: Add STACKLEAK plugin for tracking the kernel stack") Signed-off-by: Alexander Popov Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Masami Hiramatsu Signed-off-by: Kees Cook --- kernel/stackleak.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stackleak.c b/kernel/stackleak.c index e42892926244..08cb57eed389 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -11,6 +11,7 @@ */ #include +#include #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE #include @@ -47,7 +48,7 @@ int stack_erasing_sysctl(struct ctl_table *table, int write, #define skip_erasing() false #endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */ -asmlinkage void stackleak_erase(void) +asmlinkage void notrace stackleak_erase(void) { /* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */ unsigned long kstack_ptr = current->lowest_stack; @@ -101,6 +102,7 @@ asmlinkage void stackleak_erase(void) /* Reset the 'lowest_stack' value for the next syscall */ current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64; } +NOKPROBE_SYMBOL(stackleak_erase); void __used stackleak_track_stack(void) { -- cgit v1.2.3 From e0c274472d5d27f277af722e017525e0b33784cd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 30 Nov 2018 14:09:58 -0800 Subject: psi: make disabling/enabling easier for vendor kernels Mel Gorman reports a hackbench regression with psi that would prohibit shipping the suse kernel with it default-enabled, but he'd still like users to be able to opt in at little to no cost to others. With the current combination of CONFIG_PSI and the psi_disabled bool set from the commandline, this is a challenge. Do the following things to make it easier: 1. Add a config option CONFIG_PSI_DEFAULT_DISABLED that allows distros to enable CONFIG_PSI in their kernel but leave the feature disabled unless a user requests it at boot-time. To avoid double negatives, rename psi_disabled= to psi=. 2. Make psi_disabled a static branch to eliminate any branch costs when the feature is disabled. 
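Distilled, the opt-in mechanism pairs a Kconfig-chosen default with a boot-time override and a static branch on the hot paths. A rough sketch with illustrative names (the real code is in the kernel/sched/psi.c hunks further below):

static bool feature_enable = !IS_ENABLED(CONFIG_FEATURE_DEFAULT_DISABLED);
DEFINE_STATIC_KEY_FALSE(feature_disabled);

static int __init setup_feature(char *str)
{
        /* "feature=0" / "feature=1" on the kernel command line */
        return kstrtobool(str, &feature_enable) == 0;
}
__setup("feature=", setup_feature);

void __init feature_init(void)
{
        if (!feature_enable) {
                static_branch_enable(&feature_disabled);
                return;
        }
        /* ... normal initialisation ... */
}

void feature_hook(void)
{
        /* hot path: the check costs a patched jump/nop, not a memory load */
        if (static_branch_likely(&feature_disabled))
                return;
        /* ... accounting work ... */
}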
In terms of numbers before and after this patch, Mel says: : The following is a comparision using CONFIG_PSI=n as a baseline against : your patch and a vanilla kernel : : 4.20.0-rc4 4.20.0-rc4 4.20.0-rc4 : kconfigdisable-v1r1 vanilla psidisable-v1r1 : Amean 1 1.3100 ( 0.00%) 1.3923 ( -6.28%) 1.3427 ( -2.49%) : Amean 3 3.8860 ( 0.00%) 4.1230 * -6.10%* 3.8860 ( -0.00%) : Amean 5 6.8847 ( 0.00%) 8.0390 * -16.77%* 6.7727 ( 1.63%) : Amean 7 9.9310 ( 0.00%) 10.8367 * -9.12%* 9.9910 ( -0.60%) : Amean 12 16.6577 ( 0.00%) 18.2363 * -9.48%* 17.1083 ( -2.71%) : Amean 18 26.5133 ( 0.00%) 27.8833 * -5.17%* 25.7663 ( 2.82%) : Amean 24 34.3003 ( 0.00%) 34.6830 ( -1.12%) 32.0450 ( 6.58%) : Amean 30 40.0063 ( 0.00%) 40.5800 ( -1.43%) 41.5087 ( -3.76%) : Amean 32 40.1407 ( 0.00%) 41.2273 ( -2.71%) 39.9417 ( 0.50%) : : It's showing that the vanilla kernel takes a hit (as the bisection : indicated it would) and that disabling PSI by default is reasonably : close in terms of performance for this particular workload on this : particular machine so; Link: http://lkml.kernel.org/r/20181127165329.GA29728@cmpxchg.org Signed-off-by: Johannes Weiner Tested-by: Mel Gorman Reported-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 4 ++++ include/linux/psi.h | 3 ++- init/Kconfig | 9 ++++++++ kernel/sched/psi.c | 30 +++++++++++++++++-------- kernel/sched/stats.h | 8 +++---- 5 files changed, 40 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 675170c36078..5d6ba930d4f4 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3505,6 +3505,10 @@ before loading. See Documentation/blockdev/ramdisk.txt. + psi= [KNL] Enable or disable pressure stall information + tracking. + Format: + psmouse.proto= [HW,MOUSE] Highest PS2 mouse protocol extension to probe for; one of (bare|imps|exps|lifebook|any). psmouse.rate= [HW,MOUSE] Set desired mouse report rate, in reports diff --git a/include/linux/psi.h b/include/linux/psi.h index 8e0725aac0aa..7006008d5b72 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -1,6 +1,7 @@ #ifndef _LINUX_PSI_H #define _LINUX_PSI_H +#include #include #include @@ -9,7 +10,7 @@ struct css_set; #ifdef CONFIG_PSI -extern bool psi_disabled; +extern struct static_key_false psi_disabled; void psi_init(void); diff --git a/init/Kconfig b/init/Kconfig index a4112e95724a..cf5b5a0dcbc2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -509,6 +509,15 @@ config PSI Say N if unsure. +config PSI_DEFAULT_DISABLED + bool "Require boot parameter to enable pressure stall information tracking" + default n + depends on PSI + help + If set, pressure stall information tracking will be disabled + per default but can be enabled through passing psi_enable=1 + on the kernel commandline during boot. 
+ endmenu # "CPU/Task time and stats accounting" config CPU_ISOLATION diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 3d7355d7c3e3..fe24de3fbc93 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -136,8 +136,18 @@ static int psi_bug __read_mostly; -bool psi_disabled __read_mostly; -core_param(psi_disabled, psi_disabled, bool, 0644); +DEFINE_STATIC_KEY_FALSE(psi_disabled); + +#ifdef CONFIG_PSI_DEFAULT_DISABLED +bool psi_enable; +#else +bool psi_enable = true; +#endif +static int __init setup_psi(char *str) +{ + return kstrtobool(str, &psi_enable) == 0; +} +__setup("psi=", setup_psi); /* Running averages - we need to be higher-res than loadavg */ #define PSI_FREQ (2*HZ+1) /* 2 sec intervals */ @@ -169,8 +179,10 @@ static void group_init(struct psi_group *group) void __init psi_init(void) { - if (psi_disabled) + if (!psi_enable) { + static_branch_enable(&psi_disabled); return; + } psi_period = jiffies_to_nsecs(PSI_FREQ); group_init(&psi_system); @@ -549,7 +561,7 @@ void psi_memstall_enter(unsigned long *flags) struct rq_flags rf; struct rq *rq; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; *flags = current->flags & PF_MEMSTALL; @@ -579,7 +591,7 @@ void psi_memstall_leave(unsigned long *flags) struct rq_flags rf; struct rq *rq; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; if (*flags) @@ -600,7 +612,7 @@ void psi_memstall_leave(unsigned long *flags) #ifdef CONFIG_CGROUPS int psi_cgroup_alloc(struct cgroup *cgroup) { - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return 0; cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); @@ -612,7 +624,7 @@ int psi_cgroup_alloc(struct cgroup *cgroup) void psi_cgroup_free(struct cgroup *cgroup) { - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; cancel_delayed_work_sync(&cgroup->psi.clock_work); @@ -637,7 +649,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) struct rq_flags rf; struct rq *rq; - if (psi_disabled) { + if (static_branch_likely(&psi_disabled)) { /* * Lame to do this here, but the scheduler cannot be locked * from the outside, so we move cgroups from inside sched/. @@ -673,7 +685,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { int full; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; update_stats(group); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 4904c4677000..aa0de240fb41 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -66,7 +66,7 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup) { int clear = 0, set = TSK_RUNNING; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; if (!wakeup || p->sched_psi_wake_requeue) { @@ -86,7 +86,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep) { int clear = TSK_RUNNING, set = 0; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; if (!sleep) { @@ -102,7 +102,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep) static inline void psi_ttwu_dequeue(struct task_struct *p) { - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; /* * Is the task being migrated during a wakeup? 
Make sure to @@ -128,7 +128,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) static inline void psi_task_tick(struct rq *rq) { - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; if (unlikely(rq->curr->flags & PF_MEMSTALL)) -- cgit v1.2.3 From 903e8ff86753e6f327bb92166a0665e4ecb8e2e7 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Fri, 30 Nov 2018 14:10:05 -0800 Subject: kernel/kcov.c: mark funcs in __sanitizer_cov_trace_pc() as notrace Since __sanitizer_cov_trace_pc() is marked as notrace, function calls in __sanitizer_cov_trace_pc() shouldn't be traced either. ftrace_graph_caller() gets called for each function that isn't marked 'notrace', like canonicalize_ip(). This is the call trace from a run: [ 139.644550] ftrace_graph_caller+0x1c/0x24 [ 139.648352] canonicalize_ip+0x18/0x28 [ 139.652313] __sanitizer_cov_trace_pc+0x14/0x58 [ 139.656184] sched_clock+0x34/0x1e8 [ 139.659759] trace_clock_local+0x40/0x88 [ 139.663722] ftrace_push_return_trace+0x8c/0x1f0 [ 139.667767] prepare_ftrace_return+0xa8/0x100 [ 139.671709] ftrace_graph_caller+0x1c/0x24 Rework so that check_kcov_mode() and canonicalize_ip(), which are called from __sanitizer_cov_trace_pc(), are also marked as notrace. Link: http://lkml.kernel.org/r/20181128081239.18317-1-anders.roxell@linaro.org Signed-off-by: Arnd Bergmann Signed-off-by: Anders Roxell Co-developed-by: Arnd Bergmann Acked-by: Steven Rostedt (VMware) Cc: Dmitry Vyukov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 3ebd09efe72a..97959d7b77e2 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -56,7 +56,7 @@ struct kcov { struct task_struct *t; }; -static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) +static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) { unsigned int mode; @@ -78,7 +78,7 @@ static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) return mode == needed_mode; } -static unsigned long canonicalize_ip(unsigned long ip) +static notrace unsigned long canonicalize_ip(unsigned long ip) { #ifdef CONFIG_RANDOMIZE_BASE ip -= kaslr_offset(); -- cgit v1.2.3
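The underlying rule, shown schematically with illustrative names: notrace does not propagate to callees, so every helper invoked from an instrumentation hook needs its own annotation:

/* without its own 'notrace', this helper would still be entered through
 * ftrace_graph_caller() even though its caller is never traced
 */
static notrace unsigned long helper(unsigned long ip)
{
        return ip;
}

void notrace instrumentation_hook(unsigned long ip)
{
        helper(ip);
}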