diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/inode.c | 7 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 24 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 76 | ||||
-rw-r--r-- | kernel/cgroup.c | 7 | ||||
-rw-r--r-- | kernel/cpuset.c | 4 | ||||
-rw-r--r-- | kernel/events/core.c | 55 | ||||
-rw-r--r-- | kernel/kcov.c | 3 | ||||
-rw-r--r-- | kernel/kexec_core.c | 7 | ||||
-rw-r--r-- | kernel/locking/lockdep.c | 37 | ||||
-rw-r--r-- | kernel/locking/lockdep_proc.c | 2 | ||||
-rw-r--r-- | kernel/workqueue.c | 29 |
11 files changed, 188 insertions, 63 deletions
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index f2ece3c174a5..8f94ca1860cf 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -31,10 +31,10 @@ static void *bpf_any_get(void *raw, enum bpf_type type) { switch (type) { case BPF_TYPE_PROG: - atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt); + raw = bpf_prog_inc(raw); break; case BPF_TYPE_MAP: - bpf_map_inc(raw, true); + raw = bpf_map_inc(raw, true); break; default: WARN_ON_ONCE(1); @@ -297,7 +297,8 @@ static void *bpf_obj_do_get(const struct filename *pathname, goto out; raw = bpf_any_get(inode->i_private, *type); - touch_atime(&path); + if (!IS_ERR(raw)) + touch_atime(&path); path_put(&path); return raw; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index adc5e4bd74f8..cf5e9f7ad13a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -218,11 +218,18 @@ struct bpf_map *__bpf_map_get(struct fd f) return f.file->private_data; } -void bpf_map_inc(struct bpf_map *map, bool uref) +/* prog's and map's refcnt limit */ +#define BPF_MAX_REFCNT 32768 + +struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref) { - atomic_inc(&map->refcnt); + if (atomic_inc_return(&map->refcnt) > BPF_MAX_REFCNT) { + atomic_dec(&map->refcnt); + return ERR_PTR(-EBUSY); + } if (uref) atomic_inc(&map->usercnt); + return map; } struct bpf_map *bpf_map_get_with_uref(u32 ufd) @@ -234,7 +241,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) if (IS_ERR(map)) return map; - bpf_map_inc(map, true); + map = bpf_map_inc(map, true); fdput(f); return map; @@ -658,6 +665,15 @@ static struct bpf_prog *__bpf_prog_get(struct fd f) return f.file->private_data; } +struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +{ + if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) { + atomic_dec(&prog->aux->refcnt); + return ERR_PTR(-EBUSY); + } + return prog; +} + /* called by sockets/tracing/seccomp before attaching program to an event * pairs with bpf_prog_put() */ @@ -670,7 +686,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) if (IS_ERR(prog)) return prog; - atomic_inc(&prog->aux->refcnt); + prog = bpf_prog_inc(prog); fdput(f); return prog; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 56f18068b52b..63554b6d4e25 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -249,16 +249,6 @@ static const char * const reg_type_str[] = { [CONST_IMM] = "imm", }; -static const struct { - int map_type; - int func_id; -} func_limit[] = { - {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, - {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, - {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output}, - {BPF_MAP_TYPE_STACK_TRACE, BPF_FUNC_get_stackid}, -}; - static void print_verifier_state(struct verifier_env *env) { enum bpf_reg_type t; @@ -943,27 +933,52 @@ static int check_func_arg(struct verifier_env *env, u32 regno, static int check_map_func_compatibility(struct bpf_map *map, int func_id) { - bool bool_map, bool_func; - int i; - if (!map) return 0; - for (i = 0; i < ARRAY_SIZE(func_limit); i++) { - bool_map = (map->map_type == func_limit[i].map_type); - bool_func = (func_id == func_limit[i].func_id); - /* only when map & func pair match it can continue. - * don't allow any other map type to be passed into - * the special func; - */ - if (bool_func && bool_map != bool_func) { - verbose("cannot pass map_type %d into func %d\n", - map->map_type, func_id); - return -EINVAL; - } + /* We need a two way check, first is from map perspective ... */ + switch (map->map_type) { + case BPF_MAP_TYPE_PROG_ARRAY: + if (func_id != BPF_FUNC_tail_call) + goto error; + break; + case BPF_MAP_TYPE_PERF_EVENT_ARRAY: + if (func_id != BPF_FUNC_perf_event_read && + func_id != BPF_FUNC_perf_event_output) + goto error; + break; + case BPF_MAP_TYPE_STACK_TRACE: + if (func_id != BPF_FUNC_get_stackid) + goto error; + break; + default: + break; + } + + /* ... and second from the function itself. */ + switch (func_id) { + case BPF_FUNC_tail_call: + if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) + goto error; + break; + case BPF_FUNC_perf_event_read: + case BPF_FUNC_perf_event_output: + if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) + goto error; + break; + case BPF_FUNC_get_stackid: + if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) + goto error; + break; + default: + break; } return 0; +error: + verbose("cannot pass map_type %d into func %d\n", + map->map_type, func_id); + return -EINVAL; } static int check_raw_mode(const struct bpf_func_proto *fn) @@ -2111,15 +2126,18 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) return -E2BIG; } - /* remember this map */ - env->used_maps[env->used_map_cnt++] = map; - /* hold the map. If the program is rejected by verifier, * the map will be released by release_maps() or it * will be used by the valid program until it's unloaded * and all maps are released in free_bpf_prog_info() */ - bpf_map_inc(map, false); + map = bpf_map_inc(map, false); + if (IS_ERR(map)) { + fdput(f); + return PTR_ERR(map); + } + env->used_maps[env->used_map_cnt++] = map; + fdput(f); next_insn: insn++; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 671dc05c0b0f..909a7d31ffd3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2825,9 +2825,10 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool threadgroup) { struct task_struct *tsk; + struct cgroup_subsys *ss; struct cgroup *cgrp; pid_t pid; - int ret; + int ssid, ret; if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return -EINVAL; @@ -2875,8 +2876,10 @@ out_unlock_rcu: rcu_read_unlock(); out_unlock_threadgroup: percpu_up_write(&cgroup_threadgroup_rwsem); + for_each_subsys(ss, ssid) + if (ss->post_attach) + ss->post_attach(); cgroup_kn_unlock(of->kn); - cpuset_post_attach_flush(); return ret ?: nbytes; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 00ab5c2b7c5b..1902956baba1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -58,7 +58,6 @@ #include <asm/uaccess.h> #include <linux/atomic.h> #include <linux/mutex.h> -#include <linux/workqueue.h> #include <linux/cgroup.h> #include <linux/wait.h> @@ -1016,7 +1015,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, } } -void cpuset_post_attach_flush(void) +static void cpuset_post_attach(void) { flush_workqueue(cpuset_migrate_mm_wq); } @@ -2087,6 +2086,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, + .post_attach = cpuset_post_attach, .bind = cpuset_bind, .legacy_cftypes = files, .early_init = true, diff --git a/kernel/events/core.c b/kernel/events/core.c index 9eb23dc27462..0bdc6e7d4908 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -412,7 +412,8 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, if (ret || !write) return ret; - if (sysctl_perf_cpu_time_max_percent == 100) { + if (sysctl_perf_cpu_time_max_percent == 100 || + sysctl_perf_cpu_time_max_percent == 0) { printk(KERN_WARNING "perf: Dynamic interrupt throttling disabled, can hang your system!\n"); WRITE_ONCE(perf_sample_allowed_ns, 0); @@ -1105,6 +1106,7 @@ static void put_ctx(struct perf_event_context *ctx) * function. * * Lock order: + * cred_guard_mutex * task_struct::perf_event_mutex * perf_event_context::mutex * perf_event::child_mutex; @@ -3420,7 +3422,6 @@ static struct task_struct * find_lively_task_by_vpid(pid_t vpid) { struct task_struct *task; - int err; rcu_read_lock(); if (!vpid) @@ -3434,16 +3435,7 @@ find_lively_task_by_vpid(pid_t vpid) if (!task) return ERR_PTR(-ESRCH); - /* Reuse ptrace permission checks for now. */ - err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) - goto errout; - return task; -errout: - put_task_struct(task); - return ERR_PTR(err); - } /* @@ -8446,6 +8438,24 @@ SYSCALL_DEFINE5(perf_event_open, get_online_cpus(); + if (task) { + err = mutex_lock_interruptible(&task->signal->cred_guard_mutex); + if (err) + goto err_cpus; + + /* + * Reuse ptrace permission checks for now. + * + * We must hold cred_guard_mutex across this and any potential + * perf_install_in_context() call for this new event to + * serialize against exec() altering our credentials (and the + * perf_event_exit_task() that could imply). + */ + err = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) + goto err_cred; + } + if (flags & PERF_FLAG_PID_CGROUP) cgroup_fd = pid; @@ -8453,7 +8463,7 @@ SYSCALL_DEFINE5(perf_event_open, NULL, NULL, cgroup_fd); if (IS_ERR(event)) { err = PTR_ERR(event); - goto err_cpus; + goto err_cred; } if (is_sampling_event(event)) { @@ -8512,11 +8522,6 @@ SYSCALL_DEFINE5(perf_event_open, goto err_context; } - if (task) { - put_task_struct(task); - task = NULL; - } - /* * Look up the group leader (we will attach this event to it): */ @@ -8614,6 +8619,11 @@ SYSCALL_DEFINE5(perf_event_open, WARN_ON_ONCE(ctx->parent_ctx); + /* + * This is the point on no return; we cannot fail hereafter. This is + * where we start modifying current state. + */ + if (move_group) { /* * See perf_event_ctx_lock() for comments on the details @@ -8685,6 +8695,11 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(&gctx->mutex); mutex_unlock(&ctx->mutex); + if (task) { + mutex_unlock(&task->signal->cred_guard_mutex); + put_task_struct(task); + } + put_online_cpus(); mutex_lock(¤t->perf_event_mutex); @@ -8717,6 +8732,9 @@ err_alloc: */ if (!event_file) free_event(event); +err_cred: + if (task) + mutex_unlock(&task->signal->cred_guard_mutex); err_cpus: put_online_cpus(); err_task: @@ -9001,6 +9019,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) /* * When a child task exits, feed back event values to parent events. + * + * Can be called with cred_guard_mutex held when called from + * install_exec_creds(). */ void perf_event_exit_task(struct task_struct *child) { diff --git a/kernel/kcov.c b/kernel/kcov.c index 3efbee0834a8..a02f2dddd1d7 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -1,5 +1,6 @@ #define pr_fmt(fmt) "kcov: " fmt +#define DISABLE_BRANCH_PROFILING #include <linux/compiler.h> #include <linux/types.h> #include <linux/file.h> @@ -43,7 +44,7 @@ struct kcov { * Entry point from instrumented code. * This is called once per basic-block/edge. */ -void __sanitizer_cov_trace_pc(void) +void notrace __sanitizer_cov_trace_pc(void) { struct task_struct *t; enum kcov_mode mode; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 8d34308ea449..1391d3ee3b86 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1415,6 +1415,9 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(page, lru); VMCOREINFO_OFFSET(page, _mapcount); VMCOREINFO_OFFSET(page, private); + VMCOREINFO_OFFSET(page, compound_dtor); + VMCOREINFO_OFFSET(page, compound_order); + VMCOREINFO_OFFSET(page, compound_head); VMCOREINFO_OFFSET(pglist_data, node_zones); VMCOREINFO_OFFSET(pglist_data, nr_zones); #ifdef CONFIG_FLAT_NODE_MEM_MAP @@ -1447,8 +1450,8 @@ static int __init crash_save_vmcoreinfo_init(void) #ifdef CONFIG_X86 VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); #endif -#ifdef CONFIG_HUGETLBFS - VMCOREINFO_SYMBOL(free_huge_page); +#ifdef CONFIG_HUGETLB_PAGE + VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); #endif arch_crash_save_vmcoreinfo(); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index ed9410936a22..78c1c0ee6dc1 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2176,15 +2176,37 @@ cache_hit: chain->irq_context = hlock->irq_context; i = get_first_held_lock(curr, hlock); chain->depth = curr->lockdep_depth + 1 - i; + + BUILD_BUG_ON((1UL << 24) <= ARRAY_SIZE(chain_hlocks)); + BUILD_BUG_ON((1UL << 6) <= ARRAY_SIZE(curr->held_locks)); + BUILD_BUG_ON((1UL << 8*sizeof(chain_hlocks[0])) <= ARRAY_SIZE(lock_classes)); + if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { chain->base = nr_chain_hlocks; - nr_chain_hlocks += chain->depth; for (j = 0; j < chain->depth - 1; j++, i++) { int lock_id = curr->held_locks[i].class_idx - 1; chain_hlocks[chain->base + j] = lock_id; } chain_hlocks[chain->base + j] = class - lock_classes; } + + if (nr_chain_hlocks < MAX_LOCKDEP_CHAIN_HLOCKS) + nr_chain_hlocks += chain->depth; + +#ifdef CONFIG_DEBUG_LOCKDEP + /* + * Important for check_no_collision(). + */ + if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) { + if (debug_locks_off_graph_unlock()) + return 0; + + print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!"); + dump_stack(); + return 0; + } +#endif + hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); inc_chains(); @@ -2932,6 +2954,11 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) return 1; } +static inline unsigned int task_irq_context(struct task_struct *task) +{ + return 2 * !!task->hardirq_context + !!task->softirq_context; +} + static int separate_irq_context(struct task_struct *curr, struct held_lock *hlock) { @@ -2940,8 +2967,6 @@ static int separate_irq_context(struct task_struct *curr, /* * Keep track of points where we cross into an interrupt context: */ - hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) + - curr->softirq_context; if (depth) { struct held_lock *prev_hlock; @@ -2973,6 +2998,11 @@ static inline int mark_irqflags(struct task_struct *curr, return 1; } +static inline unsigned int task_irq_context(struct task_struct *task) +{ + return 0; +} + static inline int separate_irq_context(struct task_struct *curr, struct held_lock *hlock) { @@ -3241,6 +3271,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->acquire_ip = ip; hlock->instance = lock; hlock->nest_lock = nest_lock; + hlock->irq_context = task_irq_context(curr); hlock->trylock = trylock; hlock->read = read; hlock->check = check; diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index dbb61a302548..a0f61effad25 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -141,6 +141,8 @@ static int lc_show(struct seq_file *m, void *v) int i; if (v == SEQ_START_TOKEN) { + if (nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS) + seq_printf(m, "(buggered) "); seq_printf(m, "all lock chains:\n"); return 0; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2232ae3e3ad6..3bfdff06eea7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -666,6 +666,35 @@ static void set_work_pool_and_clear_pending(struct work_struct *work, */ smp_wmb(); set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0); + /* + * The following mb guarantees that previous clear of a PENDING bit + * will not be reordered with any speculative LOADS or STORES from + * work->current_func, which is executed afterwards. This possible + * reordering can lead to a missed execution on attempt to qeueue + * the same @work. E.g. consider this case: + * + * CPU#0 CPU#1 + * ---------------------------- -------------------------------- + * + * 1 STORE event_indicated + * 2 queue_work_on() { + * 3 test_and_set_bit(PENDING) + * 4 } set_..._and_clear_pending() { + * 5 set_work_data() # clear bit + * 6 smp_mb() + * 7 work->current_func() { + * 8 LOAD event_indicated + * } + * + * Without an explicit full barrier speculative LOAD on line 8 can + * be executed before CPU#0 does STORE on line 1. If that happens, + * CPU#0 observes the PENDING bit is still set and new execution of + * a @work is not queued in a hope, that CPU#1 will eventually + * finish the queued @work. Meanwhile CPU#1 does not see + * event_indicated is set, because speculative LOAD was executed + * before actual STORE. + */ + smp_mb(); } static void clear_work_data(struct work_struct *work) |