From 3a38bb98d9abdc3856f26b5ed4332803065cd7cf Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 10 Apr 2018 09:37:32 -0700 Subject: bpf/tracing: fix a deadlock in perf_event_detach_bpf_prog syzbot reported a possible deadlock in perf_event_detach_bpf_prog. The error details: ====================================================== WARNING: possible circular locking dependency detected 4.16.0-rc7+ #3 Not tainted ------------------------------------------------------ syz-executor7/24531 is trying to acquire lock: (bpf_event_mutex){+.+.}, at: [<000000008a849b07>] perf_event_detach_bpf_prog+0x92/0x3d0 kernel/trace/bpf_trace.c:854 but task is already holding lock: (&mm->mmap_sem){++++}, at: [<0000000038768f87>] vm_mmap_pgoff+0x198/0x280 mm/util.c:353 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&mm->mmap_sem){++++}: __might_fault+0x13a/0x1d0 mm/memory.c:4571 _copy_to_user+0x2c/0xc0 lib/usercopy.c:25 copy_to_user include/linux/uaccess.h:155 [inline] bpf_prog_array_copy_info+0xf2/0x1c0 kernel/bpf/core.c:1694 perf_event_query_prog_array+0x1c7/0x2c0 kernel/trace/bpf_trace.c:891 _perf_ioctl kernel/events/core.c:4750 [inline] perf_ioctl+0x3e1/0x1480 kernel/events/core.c:4770 vfs_ioctl fs/ioctl.c:46 [inline] do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:686 SYSC_ioctl fs/ioctl.c:701 [inline] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:692 do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x42/0xb7 -> #0 (bpf_event_mutex){+.+.}: lock_acquire+0x1d5/0x580 kernel/locking/lockdep.c:3920 __mutex_lock_common kernel/locking/mutex.c:756 [inline] __mutex_lock+0x16f/0x1a80 kernel/locking/mutex.c:893 mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:908 perf_event_detach_bpf_prog+0x92/0x3d0 kernel/trace/bpf_trace.c:854 perf_event_free_bpf_prog kernel/events/core.c:8147 [inline] _free_event+0xbdb/0x10f0 kernel/events/core.c:4116 put_event+0x24/0x30 kernel/events/core.c:4204 perf_mmap_close+0x60d/0x1010 kernel/events/core.c:5172 remove_vma+0xb4/0x1b0 mm/mmap.c:172 remove_vma_list mm/mmap.c:2490 [inline] do_munmap+0x82a/0xdf0 mm/mmap.c:2731 mmap_region+0x59e/0x15a0 mm/mmap.c:1646 do_mmap+0x6c0/0xe00 mm/mmap.c:1483 do_mmap_pgoff include/linux/mm.h:2223 [inline] vm_mmap_pgoff+0x1de/0x280 mm/util.c:355 SYSC_mmap_pgoff mm/mmap.c:1533 [inline] SyS_mmap_pgoff+0x462/0x5f0 mm/mmap.c:1491 SYSC_mmap arch/x86/kernel/sys_x86_64.c:100 [inline] SyS_mmap+0x16/0x20 arch/x86/kernel/sys_x86_64.c:91 do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x42/0xb7 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&mm->mmap_sem); lock(bpf_event_mutex); lock(&mm->mmap_sem); lock(bpf_event_mutex); *** DEADLOCK *** ====================================================== The bug is introduced by Commit f371b304f12e ("bpf/tracing: allow user space to query prog array on the same tp") where copy_to_user, which requires mm->mmap_sem, is called inside bpf_event_mutex lock. At the same time, during perf_event file descriptor close, mm->mmap_sem is held first and then subsequent perf_event_detach_bpf_prog needs bpf_event_mutex lock. Such a senario caused a deadlock. As suggested by Daniel, moving copy_to_user out of the bpf_event_mutex lock should fix the problem. Fixes: f371b304f12e ("bpf/tracing: allow user space to query prog array on the same tp") Reported-by: syzbot+dc5ca0e4c9bfafaf2bae@syzkaller.appspotmail.com Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 45 +++++++++++++++++++++++++++++---------------- kernel/trace/bpf_trace.c | 25 +++++++++++++++++++++---- 2 files changed, 50 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d315b393abdd..ba03ec39efb3 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1572,13 +1572,32 @@ int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) return cnt; } +static bool bpf_prog_array_copy_core(struct bpf_prog **prog, + u32 *prog_ids, + u32 request_cnt) +{ + int i = 0; + + for (; *prog; prog++) { + if (*prog == &dummy_bpf_prog.prog) + continue; + prog_ids[i] = (*prog)->aux->id; + if (++i == request_cnt) { + prog++; + break; + } + } + + return !!(*prog); +} + int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, __u32 __user *prog_ids, u32 cnt) { struct bpf_prog **prog; unsigned long err = 0; - u32 i = 0, *ids; bool nospc; + u32 *ids; /* users of this function are doing: * cnt = bpf_prog_array_length(); @@ -1595,16 +1614,7 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, return -ENOMEM; rcu_read_lock(); prog = rcu_dereference(progs)->progs; - for (; *prog; prog++) { - if (*prog == &dummy_bpf_prog.prog) - continue; - ids[i] = (*prog)->aux->id; - if (++i == cnt) { - prog++; - break; - } - } - nospc = !!(*prog); + nospc = bpf_prog_array_copy_core(prog, ids, cnt); rcu_read_unlock(); err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); kfree(ids); @@ -1683,22 +1693,25 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, } int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, - __u32 __user *prog_ids, u32 request_cnt, - __u32 __user *prog_cnt) + u32 *prog_ids, u32 request_cnt, + u32 *prog_cnt) { + struct bpf_prog **prog; u32 cnt = 0; if (array) cnt = bpf_prog_array_length(array); - if (copy_to_user(prog_cnt, &cnt, sizeof(cnt))) - return -EFAULT; + *prog_cnt = cnt; /* return early if user requested only program count or nothing to copy */ if (!request_cnt || !cnt) return 0; - return bpf_prog_array_copy_to_user(array, prog_ids, request_cnt); + /* this function is called under trace/bpf_trace.c: bpf_event_mutex */ + prog = rcu_dereference_check(array, 1)->progs; + return bpf_prog_array_copy_core(prog, prog_ids, request_cnt) ? -ENOSPC + : 0; } static void bpf_prog_free_deferred(struct work_struct *work) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d88e96d4e12c..56ba0f2a01db 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -977,6 +977,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) { struct perf_event_query_bpf __user *uquery = info; struct perf_event_query_bpf query = {}; + u32 *ids, prog_cnt, ids_len; int ret; if (!capable(CAP_SYS_ADMIN)) @@ -985,16 +986,32 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) return -EINVAL; if (copy_from_user(&query, uquery, sizeof(query))) return -EFAULT; - if (query.ids_len > BPF_TRACE_MAX_PROGS) + + ids_len = query.ids_len; + if (ids_len > BPF_TRACE_MAX_PROGS) return -E2BIG; + ids = kcalloc(ids_len, sizeof(u32), GFP_USER | __GFP_NOWARN); + if (!ids) + return -ENOMEM; + /* + * The above kcalloc returns ZERO_SIZE_PTR when ids_len = 0, which + * is required when user only wants to check for uquery->prog_cnt. + * There is no need to check for it since the case is handled + * gracefully in bpf_prog_array_copy_info. + */ mutex_lock(&bpf_event_mutex); ret = bpf_prog_array_copy_info(event->tp_event->prog_array, - uquery->ids, - query.ids_len, - &uquery->prog_cnt); + ids, + ids_len, + &prog_cnt); mutex_unlock(&bpf_event_mutex); + if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) || + copy_to_user(uquery->ids, ids, ids_len * sizeof(u32))) + ret = -EFAULT; + + kfree(ids); return ret; } -- cgit v1.2.3 From 5c8dad48e4f53d6fd0a7e4f95d7c1c983374de88 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 13 Apr 2018 11:55:13 -0700 Subject: trace_kprobe: Remove warning message "Could not insert probe at..." This warning message is not very helpful, as the return value should already show information about the error. Also, this message will spam dmesg if the user space does testing in a loop, like: for x in {0..5} do echo p:xx xx+$x >> /sys/kernel/debug/tracing/kprobe_events done Reported-by: Vince Weaver Signed-off-by: Song Liu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-team@fb.com Link: http://lkml.kernel.org/r/20180413185513.3626052-1-songliubraving@fb.com Signed-off-by: Ingo Molnar --- kernel/trace/trace_kprobe.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1cd3fb4d70f8..02aed76e0978 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -512,8 +512,6 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) if (ret == 0) tk->tp.flags |= TP_FLAG_REGISTERED; else { - pr_warn("Could not insert probe at %s+%lu: %d\n", - trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret); if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) { pr_warn("This probe might be able to register after target module is loaded. Continue.\n"); ret = 0; -- cgit v1.2.3 From 101592b4904ecf6b8ed2a4784d41d180319d95a1 Mon Sep 17 00:00:00 2001 From: Alexey Budankov Date: Mon, 9 Apr 2018 10:25:32 +0300 Subject: perf/core: Store context switch out type in PERF_RECORD_SWITCH[_CPU_WIDE] Store preempting context switch out event into Perf trace as a part of PERF_RECORD_SWITCH[_CPU_WIDE] record. Percentage of preempting and non-preempting context switches help understanding the nature of workloads (CPU or IO bound) that are running on a machine; The event is treated as preemption one when task->state value of the thread being switched out is TASK_RUNNING. Event type encoding is implemented using PERF_RECORD_MISC_SWITCH_OUT_PREEMPT bit; Signed-off-by: Alexey Budankov Acked-by: Peter Zijlstra Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/9ff84e83-a0ca-dd82-a6d0-cb951689be74@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- include/uapi/linux/perf_event.h | 18 +++++++++++++++--- kernel/events/core.c | 4 ++++ tools/include/uapi/linux/perf_event.h | 18 +++++++++++++++--- 3 files changed, 34 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 912b85b52344..b8e288a1f740 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -650,11 +650,23 @@ struct perf_event_mmap_page { #define PERF_RECORD_MISC_COMM_EXEC (1 << 13) #define PERF_RECORD_MISC_SWITCH_OUT (1 << 13) /* - * Indicates that the content of PERF_SAMPLE_IP points to - * the actual instruction that triggered the event. See also - * perf_event_attr::precise_ip. + * These PERF_RECORD_MISC_* flags below are safely reused + * for the following events: + * + * PERF_RECORD_MISC_EXACT_IP - PERF_RECORD_SAMPLE of precise events + * PERF_RECORD_MISC_SWITCH_OUT_PREEMPT - PERF_RECORD_SWITCH* events + * + * + * PERF_RECORD_MISC_EXACT_IP: + * Indicates that the content of PERF_SAMPLE_IP points to + * the actual instruction that triggered the event. See also + * perf_event_attr::precise_ip. + * + * PERF_RECORD_MISC_SWITCH_OUT_PREEMPT: + * Indicates that thread was preempted in TASK_RUNNING state. */ #define PERF_RECORD_MISC_EXACT_IP (1 << 14) +#define PERF_RECORD_MISC_SWITCH_OUT_PREEMPT (1 << 14) /* * Reserve the last bit to indicate some extended misc field */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 2d5fe26551f8..1bae80aaabfb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7587,6 +7587,10 @@ static void perf_event_switch(struct task_struct *task, }, }; + if (!sched_in && task->state == TASK_RUNNING) + switch_event.event_id.header.misc |= + PERF_RECORD_MISC_SWITCH_OUT_PREEMPT; + perf_iterate_sb(perf_event_switch_output, &switch_event, NULL); diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 912b85b52344..b8e288a1f740 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -650,11 +650,23 @@ struct perf_event_mmap_page { #define PERF_RECORD_MISC_COMM_EXEC (1 << 13) #define PERF_RECORD_MISC_SWITCH_OUT (1 << 13) /* - * Indicates that the content of PERF_SAMPLE_IP points to - * the actual instruction that triggered the event. See also - * perf_event_attr::precise_ip. + * These PERF_RECORD_MISC_* flags below are safely reused + * for the following events: + * + * PERF_RECORD_MISC_EXACT_IP - PERF_RECORD_SAMPLE of precise events + * PERF_RECORD_MISC_SWITCH_OUT_PREEMPT - PERF_RECORD_SWITCH* events + * + * + * PERF_RECORD_MISC_EXACT_IP: + * Indicates that the content of PERF_SAMPLE_IP points to + * the actual instruction that triggered the event. See also + * perf_event_attr::precise_ip. + * + * PERF_RECORD_MISC_SWITCH_OUT_PREEMPT: + * Indicates that thread was preempted in TASK_RUNNING state. */ #define PERF_RECORD_MISC_EXACT_IP (1 << 14) +#define PERF_RECORD_MISC_SWITCH_OUT_PREEMPT (1 << 14) /* * Reserve the last bit to indicate some extended misc field */ -- cgit v1.2.3 From 78b562fbfa2cf0a9fcb23c3154756b690f4905c1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 15 Apr 2018 11:23:50 +0200 Subject: perf: Return proper values for user stack errors Return immediately when we find issue in the user stack checks. The error value could get overwritten by following check for PERF_SAMPLE_REGS_INTR. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: H. Peter Anvin Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: syzkaller-bugs@googlegroups.com Cc: x86@kernel.org Fixes: 60e2364e60e8 ("perf: Add ability to sample machine state on interrupt") Link: http://lkml.kernel.org/r/20180415092352.12403-1-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1bae80aaabfb..67612ce359ad 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10209,9 +10209,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, * __u16 sample size limit. */ if (attr->sample_stack_user >= USHRT_MAX) - ret = -EINVAL; + return -EINVAL; else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64))) - ret = -EINVAL; + return -EINVAL; } if (!attr->sample_max_stack) -- cgit v1.2.3 From 5af44ca53d019de47efe6dbc4003dd518e5197ed Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 15 Apr 2018 11:23:51 +0200 Subject: perf: Fix sample_max_stack maximum check The syzbot hit KASAN bug in perf_callchain_store having the entry stored behind the allocated bounds [1]. We miss the sample_max_stack check for the initial event that allocates callchain buffers. This missing check allows to create an event with sample_max_stack value bigger than the global sysctl maximum: # sysctl -a | grep perf_event_max_stack kernel.perf_event_max_stack = 127 # perf record -vv -C 1 -e cycles/max-stack=256/ kill ... perf_event_attr: size 112 ... sample_max_stack 256 ------------------------------------------------------------ sys_perf_event_open: pid -1 cpu 1 group_fd -1 flags 0x8 = 4 Note the '-C 1', which forces perf record to create just single event. Otherwise it opens event for every cpu, then the sample_max_stack check fails on the second event and all's fine. The fix is to run the sample_max_stack check also for the first event with callchains. [1] https://marc.info/?l=linux-kernel&m=152352732920874&w=2 Reported-by: syzbot+7c449856228b63ac951e@syzkaller.appspotmail.com Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: H. Peter Anvin Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: syzkaller-bugs@googlegroups.com Cc: x86@kernel.org Fixes: 97c79a38cd45 ("perf core: Per event callchain limit") Link: http://lkml.kernel.org/r/20180415092352.12403-2-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/callchain.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 772a43fea825..73cc26e321de 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -119,19 +119,22 @@ int get_callchain_buffers(int event_max_stack) goto exit; } + /* + * If requesting per event more than the global cap, + * return a different error to help userspace figure + * this out. + * + * And also do it here so that we have &callchain_mutex held. + */ + if (event_max_stack > sysctl_perf_event_max_stack) { + err = -EOVERFLOW; + goto exit; + } + if (count > 1) { /* If the allocation failed, give up */ if (!callchain_cpus_entries) err = -ENOMEM; - /* - * If requesting per event more than the global cap, - * return a different error to help userspace figure - * this out. - * - * And also do it here so that we have &callchain_mutex held. - */ - if (event_max_stack > sysctl_perf_event_max_stack) - err = -EOVERFLOW; goto exit; } -- cgit v1.2.3 From bfb3d7b8b906b66551424d7636182126e1d134c8 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 15 Apr 2018 11:23:52 +0200 Subject: perf: Remove superfluous allocation error check If the get_callchain_buffers fails to allocate the buffer it will decrease the nr_callchain_events right away. There's no point of checking the allocation error for nr_callchain_events > 1. Removing that check. Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Andi Kleen Cc: H. Peter Anvin Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: syzkaller-bugs@googlegroups.com Cc: x86@kernel.org Link: http://lkml.kernel.org/r/20180415092352.12403-3-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/callchain.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 73cc26e321de..c187aa3df3c8 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -131,14 +131,8 @@ int get_callchain_buffers(int event_max_stack) goto exit; } - if (count > 1) { - /* If the allocation failed, give up */ - if (!callchain_cpus_entries) - err = -ENOMEM; - goto exit; - } - - err = alloc_callchain_buffers(); + if (count == 1) + err = alloc_callchain_buffers(); exit: if (err) atomic_dec(&nr_callchain_events); -- cgit v1.2.3 From 4450dc0ae2c18a0ac6dce560215c7a1fa12122b5 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 5 Apr 2018 17:26:58 +0200 Subject: clockevents: Fix kernel messages split across multiple lines Convert the clockevents driver from old-style printk() to pr_info() and pr_cont(), to fix split kernel messages like below: Clockevents: could not switch to one-shot mode: dummy_timer is not functional. Signed-off-by: Geert Uytterhoeven Signed-off-by: Thomas Gleixner Cc: Frederic Weisbecker Link: https://lkml.kernel.org/r/1522942018-14471-1-git-send-email-geert%2Brenesas@glider.be --- kernel/time/tick-oneshot.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index c1f518e7aa80..6fe615d57ebb 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -82,16 +82,15 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || !tick_device_is_functional(dev)) { - printk(KERN_INFO "Clockevents: " - "could not switch to one-shot mode:"); + pr_info("Clockevents: could not switch to one-shot mode:"); if (!dev) { - printk(" no tick device\n"); + pr_cont(" no tick device\n"); } else { if (!tick_device_is_functional(dev)) - printk(" %s is not functional.\n", dev->name); + pr_cont(" %s is not functional.\n", dev->name); else - printk(" %s does not support one-shot mode.\n", - dev->name); + pr_cont(" %s does not support one-shot mode.\n", + dev->name); } return -EINVAL; } -- cgit v1.2.3 From e142aa09ed88be98395dde7acb96fb2263566b68 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 13 Apr 2018 13:27:58 +0800 Subject: timekeeping: Remove __current_kernel_time() The __current_kernel_time() function based on 'struct timespec' is no longer recommended for new code, and the only user of this function has been replaced by commit 6909e29fdefb ("kdb: use __ktime_get_real_seconds instead of __current_kernel_time"). Remove the obsolete interface. Signed-off-by: Baolin Wang Signed-off-by: Thomas Gleixner Cc: arnd@arndb.de Cc: sboyd@kernel.org Cc: broonie@kernel.org Cc: john.stultz@linaro.org Link: https://lkml.kernel.org/r/1a9dbea7ee2cda7efe9ed330874075cf17fdbff6.1523596316.git.baolin.wang@linaro.org --- include/linux/timekeeping32.h | 3 --- kernel/time/timekeeping.c | 7 ------- 2 files changed, 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/timekeeping32.h b/include/linux/timekeeping32.h index af4114d5dc17..3616b4becb59 100644 --- a/include/linux/timekeeping32.h +++ b/include/linux/timekeeping32.h @@ -9,9 +9,6 @@ extern void do_gettimeofday(struct timeval *tv); unsigned long get_seconds(void); -/* does not take xtime_lock */ -struct timespec __current_kernel_time(void); - static inline struct timespec current_kernel_time(void) { struct timespec64 now = current_kernel_time64(); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ca90219a1e73..dcf7f20fcd12 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2139,13 +2139,6 @@ unsigned long get_seconds(void) } EXPORT_SYMBOL(get_seconds); -struct timespec __current_kernel_time(void) -{ - struct timekeeper *tk = &tk_core.timekeeper; - - return timespec64_to_timespec(tk_xtime(tk)); -} - struct timespec64 current_kernel_time64(void) { struct timekeeper *tk = &tk_core.timekeeper; -- cgit v1.2.3 From c3bca5d450b620dd3d36e14b5e1f43639fd47d6b Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 17 Apr 2018 14:57:42 -0700 Subject: posix-cpu-timers: Ensure set_process_cpu_timer is always evaluated Commit a9445e47d897 ("posix-cpu-timers: Make set_process_cpu_timer() more robust") moved the check into the 'if' statement. Unfortunately, it did so on the right side of an && which means that it may get short circuited and never evaluated. This is easily reproduced with: $ cat loop.c void main() { struct rlimit res; /* set the CPU time limit */ getrlimit(RLIMIT_CPU,&res); res.rlim_cur = 2; res.rlim_max = 2; setrlimit(RLIMIT_CPU,&res); while (1); } Which will hang forever instead of being killed. Fix this by pulling the evaluation out of the if statement but checking the return value instead. Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1568337 Fixes: a9445e47d897 ("posix-cpu-timers: Make set_process_cpu_timer() more robust") Signed-off-by: Laura Abbott Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Cc: "Max R . P . Grossmann" Cc: John Stultz Link: https://lkml.kernel.org/r/20180417215742.2521-1-labbott@redhat.com --- kernel/time/posix-cpu-timers.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 2541bd89f20e..5a6251ac6f7a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1205,10 +1205,12 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, u64 *newval, u64 *oldval) { u64 now; + int ret; WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); + ret = cpu_timer_sample_group(clock_idx, tsk, &now); - if (oldval && cpu_timer_sample_group(clock_idx, tsk, &now) != -EINVAL) { + if (oldval && ret != -EINVAL) { /* * We are setting itimer. The *oldval is absolute and we update * it to be relative, *newval argument is relative and we update -- cgit v1.2.3 From 6ab690aa439803347743c0d899ac422774fdd5e7 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 20 Apr 2018 18:16:30 +0200 Subject: bpf: sockmap remove dead check Remove dead code that bails on `attr->value_size > KMALLOC_MAX_SIZE` - the previous check already bails on `attr->value_size != 4`. Signed-off-by: Jann Horn Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 8dd9210d7db7..a3b21385e947 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1442,9 +1442,6 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); - if (attr->value_size > KMALLOC_MAX_SIZE) - return ERR_PTR(-E2BIG); - err = bpf_tcp_ulp_register(); if (err && err != -EEXIST) return ERR_PTR(err); -- cgit v1.2.3 From e01e80634ecdde1dd113ac43b3adad21b47f3957 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 20 Apr 2018 14:55:31 -0700 Subject: fork: unconditionally clear stack on fork One of the classes of kernel stack content leaks[1] is exposing the contents of prior heap or stack contents when a new process stack is allocated. Normally, those stacks are not zeroed, and the old contents remain in place. In the face of stack content exposure flaws, those contents can leak to userspace. Fixing this will make the kernel no longer vulnerable to these flaws, as the stack will be wiped each time a stack is assigned to a new process. There's not a meaningful change in runtime performance; it almost looks like it provides a benefit. Performing back-to-back kernel builds before: Run times: 157.86 157.09 158.90 160.94 160.80 Mean: 159.12 Std Dev: 1.54 and after: Run times: 159.31 157.34 156.71 158.15 160.81 Mean: 158.46 Std Dev: 1.46 Instead of making this a build or runtime config, Andy Lutomirski recommended this just be enabled by default. [1] A noisy search for many kinds of stack content leaks can be seen here: https://cve.mitre.org/cgi-bin/cvekey.cgi?keyword=linux+kernel+stack+leak I did some more with perf and cycle counts on running 100,000 execs of /bin/true. before: Cycles: 218858861551 218853036130 214727610969 227656844122 224980542841 Mean: 221015379122.60 Std Dev: 4662486552.47 after: Cycles: 213868945060 213119275204 211820169456 224426673259 225489986348 Mean: 217745009865.40 Std Dev: 5935559279.99 It continues to look like it's faster, though the deviation is rather wide, but I'm not sure what I could do that would be less noisy. I'm open to ideas! Link: http://lkml.kernel.org/r/20180221021659.GA37073@beast Signed-off-by: Kees Cook Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Andy Lutomirski Cc: Laura Abbott Cc: Rasmus Villemoes Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/thread_info.h | 6 +----- kernel/fork.c | 3 +-- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 34f053a150a9..cf2862bd134a 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -43,11 +43,7 @@ enum { #define THREAD_ALIGN THREAD_SIZE #endif -#if IS_ENABLED(CONFIG_DEBUG_STACK_USAGE) || IS_ENABLED(CONFIG_DEBUG_KMEMLEAK) -# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) -#else -# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT) -#endif +#define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) /* * flag set/clear/test wrappers diff --git a/kernel/fork.c b/kernel/fork.c index 242c8c93d285..a5d21c42acfc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -216,10 +216,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) if (!s) continue; -#ifdef CONFIG_DEBUG_KMEMLEAK /* Clear stale pointers from reused stack. */ memset(s->addr, 0, THREAD_SIZE); -#endif + tsk->stack_vm_area = s; return s->addr; } -- cgit v1.2.3