summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/inode.c7
-rw-r--r--kernel/bpf/syscall.c24
-rw-r--r--kernel/bpf/verifier.c76
-rw-r--r--kernel/cgroup.c7
-rw-r--r--kernel/cpuset.c4
-rw-r--r--kernel/events/core.c55
-rw-r--r--kernel/kcov.c3
-rw-r--r--kernel/kexec_core.c7
-rw-r--r--kernel/locking/lockdep.c37
-rw-r--r--kernel/locking/lockdep_proc.c2
-rw-r--r--kernel/workqueue.c29
11 files changed, 188 insertions, 63 deletions
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index f2ece3c174a5..8f94ca1860cf 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -31,10 +31,10 @@ static void *bpf_any_get(void *raw, enum bpf_type type)
{
switch (type) {
case BPF_TYPE_PROG:
- atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt);
+ raw = bpf_prog_inc(raw);
break;
case BPF_TYPE_MAP:
- bpf_map_inc(raw, true);
+ raw = bpf_map_inc(raw, true);
break;
default:
WARN_ON_ONCE(1);
@@ -297,7 +297,8 @@ static void *bpf_obj_do_get(const struct filename *pathname,
goto out;
raw = bpf_any_get(inode->i_private, *type);
- touch_atime(&path);
+ if (!IS_ERR(raw))
+ touch_atime(&path);
path_put(&path);
return raw;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index adc5e4bd74f8..cf5e9f7ad13a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -218,11 +218,18 @@ struct bpf_map *__bpf_map_get(struct fd f)
return f.file->private_data;
}
-void bpf_map_inc(struct bpf_map *map, bool uref)
+/* prog's and map's refcnt limit */
+#define BPF_MAX_REFCNT 32768
+
+struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref)
{
- atomic_inc(&map->refcnt);
+ if (atomic_inc_return(&map->refcnt) > BPF_MAX_REFCNT) {
+ atomic_dec(&map->refcnt);
+ return ERR_PTR(-EBUSY);
+ }
if (uref)
atomic_inc(&map->usercnt);
+ return map;
}
struct bpf_map *bpf_map_get_with_uref(u32 ufd)
@@ -234,7 +241,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
if (IS_ERR(map))
return map;
- bpf_map_inc(map, true);
+ map = bpf_map_inc(map, true);
fdput(f);
return map;
@@ -658,6 +665,15 @@ static struct bpf_prog *__bpf_prog_get(struct fd f)
return f.file->private_data;
}
+struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+{
+ if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) {
+ atomic_dec(&prog->aux->refcnt);
+ return ERR_PTR(-EBUSY);
+ }
+ return prog;
+}
+
/* called by sockets/tracing/seccomp before attaching program to an event
* pairs with bpf_prog_put()
*/
@@ -670,7 +686,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
if (IS_ERR(prog))
return prog;
- atomic_inc(&prog->aux->refcnt);
+ prog = bpf_prog_inc(prog);
fdput(f);
return prog;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 56f18068b52b..63554b6d4e25 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -249,16 +249,6 @@ static const char * const reg_type_str[] = {
[CONST_IMM] = "imm",
};
-static const struct {
- int map_type;
- int func_id;
-} func_limit[] = {
- {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
- {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
- {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output},
- {BPF_MAP_TYPE_STACK_TRACE, BPF_FUNC_get_stackid},
-};
-
static void print_verifier_state(struct verifier_env *env)
{
enum bpf_reg_type t;
@@ -943,27 +933,52 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
static int check_map_func_compatibility(struct bpf_map *map, int func_id)
{
- bool bool_map, bool_func;
- int i;
-
if (!map)
return 0;
- for (i = 0; i < ARRAY_SIZE(func_limit); i++) {
- bool_map = (map->map_type == func_limit[i].map_type);
- bool_func = (func_id == func_limit[i].func_id);
- /* only when map & func pair match it can continue.
- * don't allow any other map type to be passed into
- * the special func;
- */
- if (bool_func && bool_map != bool_func) {
- verbose("cannot pass map_type %d into func %d\n",
- map->map_type, func_id);
- return -EINVAL;
- }
+ /* We need a two way check, first is from map perspective ... */
+ switch (map->map_type) {
+ case BPF_MAP_TYPE_PROG_ARRAY:
+ if (func_id != BPF_FUNC_tail_call)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
+ if (func_id != BPF_FUNC_perf_event_read &&
+ func_id != BPF_FUNC_perf_event_output)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_STACK_TRACE:
+ if (func_id != BPF_FUNC_get_stackid)
+ goto error;
+ break;
+ default:
+ break;
+ }
+
+ /* ... and second from the function itself. */
+ switch (func_id) {
+ case BPF_FUNC_tail_call:
+ if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+ goto error;
+ break;
+ case BPF_FUNC_perf_event_read:
+ case BPF_FUNC_perf_event_output:
+ if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
+ goto error;
+ break;
+ case BPF_FUNC_get_stackid:
+ if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
+ goto error;
+ break;
+ default:
+ break;
}
return 0;
+error:
+ verbose("cannot pass map_type %d into func %d\n",
+ map->map_type, func_id);
+ return -EINVAL;
}
static int check_raw_mode(const struct bpf_func_proto *fn)
@@ -2111,15 +2126,18 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
return -E2BIG;
}
- /* remember this map */
- env->used_maps[env->used_map_cnt++] = map;
-
/* hold the map. If the program is rejected by verifier,
* the map will be released by release_maps() or it
* will be used by the valid program until it's unloaded
* and all maps are released in free_bpf_prog_info()
*/
- bpf_map_inc(map, false);
+ map = bpf_map_inc(map, false);
+ if (IS_ERR(map)) {
+ fdput(f);
+ return PTR_ERR(map);
+ }
+ env->used_maps[env->used_map_cnt++] = map;
+
fdput(f);
next_insn:
insn++;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 671dc05c0b0f..909a7d31ffd3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2825,9 +2825,10 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off, bool threadgroup)
{
struct task_struct *tsk;
+ struct cgroup_subsys *ss;
struct cgroup *cgrp;
pid_t pid;
- int ret;
+ int ssid, ret;
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
return -EINVAL;
@@ -2875,8 +2876,10 @@ out_unlock_rcu:
rcu_read_unlock();
out_unlock_threadgroup:
percpu_up_write(&cgroup_threadgroup_rwsem);
+ for_each_subsys(ss, ssid)
+ if (ss->post_attach)
+ ss->post_attach();
cgroup_kn_unlock(of->kn);
- cpuset_post_attach_flush();
return ret ?: nbytes;
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 00ab5c2b7c5b..1902956baba1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -58,7 +58,6 @@
#include <asm/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
-#include <linux/workqueue.h>
#include <linux/cgroup.h>
#include <linux/wait.h>
@@ -1016,7 +1015,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
}
}
-void cpuset_post_attach_flush(void)
+static void cpuset_post_attach(void)
{
flush_workqueue(cpuset_migrate_mm_wq);
}
@@ -2087,6 +2086,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.can_attach = cpuset_can_attach,
.cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach,
+ .post_attach = cpuset_post_attach,
.bind = cpuset_bind,
.legacy_cftypes = files,
.early_init = true,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9eb23dc27462..0bdc6e7d4908 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -412,7 +412,8 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
if (ret || !write)
return ret;
- if (sysctl_perf_cpu_time_max_percent == 100) {
+ if (sysctl_perf_cpu_time_max_percent == 100 ||
+ sysctl_perf_cpu_time_max_percent == 0) {
printk(KERN_WARNING
"perf: Dynamic interrupt throttling disabled, can hang your system!\n");
WRITE_ONCE(perf_sample_allowed_ns, 0);
@@ -1105,6 +1106,7 @@ static void put_ctx(struct perf_event_context *ctx)
* function.
*
* Lock order:
+ * cred_guard_mutex
* task_struct::perf_event_mutex
* perf_event_context::mutex
* perf_event::child_mutex;
@@ -3420,7 +3422,6 @@ static struct task_struct *
find_lively_task_by_vpid(pid_t vpid)
{
struct task_struct *task;
- int err;
rcu_read_lock();
if (!vpid)
@@ -3434,16 +3435,7 @@ find_lively_task_by_vpid(pid_t vpid)
if (!task)
return ERR_PTR(-ESRCH);
- /* Reuse ptrace permission checks for now. */
- err = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
- goto errout;
-
return task;
-errout:
- put_task_struct(task);
- return ERR_PTR(err);
-
}
/*
@@ -8446,6 +8438,24 @@ SYSCALL_DEFINE5(perf_event_open,
get_online_cpus();
+ if (task) {
+ err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
+ if (err)
+ goto err_cpus;
+
+ /*
+ * Reuse ptrace permission checks for now.
+ *
+ * We must hold cred_guard_mutex across this and any potential
+ * perf_install_in_context() call for this new event to
+ * serialize against exec() altering our credentials (and the
+ * perf_event_exit_task() that could imply).
+ */
+ err = -EACCES;
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
+ goto err_cred;
+ }
+
if (flags & PERF_FLAG_PID_CGROUP)
cgroup_fd = pid;
@@ -8453,7 +8463,7 @@ SYSCALL_DEFINE5(perf_event_open,
NULL, NULL, cgroup_fd);
if (IS_ERR(event)) {
err = PTR_ERR(event);
- goto err_cpus;
+ goto err_cred;
}
if (is_sampling_event(event)) {
@@ -8512,11 +8522,6 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_context;
}
- if (task) {
- put_task_struct(task);
- task = NULL;
- }
-
/*
* Look up the group leader (we will attach this event to it):
*/
@@ -8614,6 +8619,11 @@ SYSCALL_DEFINE5(perf_event_open,
WARN_ON_ONCE(ctx->parent_ctx);
+ /*
+ * This is the point on no return; we cannot fail hereafter. This is
+ * where we start modifying current state.
+ */
+
if (move_group) {
/*
* See perf_event_ctx_lock() for comments on the details
@@ -8685,6 +8695,11 @@ SYSCALL_DEFINE5(perf_event_open,
mutex_unlock(&gctx->mutex);
mutex_unlock(&ctx->mutex);
+ if (task) {
+ mutex_unlock(&task->signal->cred_guard_mutex);
+ put_task_struct(task);
+ }
+
put_online_cpus();
mutex_lock(&current->perf_event_mutex);
@@ -8717,6 +8732,9 @@ err_alloc:
*/
if (!event_file)
free_event(event);
+err_cred:
+ if (task)
+ mutex_unlock(&task->signal->cred_guard_mutex);
err_cpus:
put_online_cpus();
err_task:
@@ -9001,6 +9019,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
/*
* When a child task exits, feed back event values to parent events.
+ *
+ * Can be called with cred_guard_mutex held when called from
+ * install_exec_creds().
*/
void perf_event_exit_task(struct task_struct *child)
{
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 3efbee0834a8..a02f2dddd1d7 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -1,5 +1,6 @@
#define pr_fmt(fmt) "kcov: " fmt
+#define DISABLE_BRANCH_PROFILING
#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/file.h>
@@ -43,7 +44,7 @@ struct kcov {
* Entry point from instrumented code.
* This is called once per basic-block/edge.
*/
-void __sanitizer_cov_trace_pc(void)
+void notrace __sanitizer_cov_trace_pc(void)
{
struct task_struct *t;
enum kcov_mode mode;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 8d34308ea449..1391d3ee3b86 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1415,6 +1415,9 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(page, lru);
VMCOREINFO_OFFSET(page, _mapcount);
VMCOREINFO_OFFSET(page, private);
+ VMCOREINFO_OFFSET(page, compound_dtor);
+ VMCOREINFO_OFFSET(page, compound_order);
+ VMCOREINFO_OFFSET(page, compound_head);
VMCOREINFO_OFFSET(pglist_data, node_zones);
VMCOREINFO_OFFSET(pglist_data, nr_zones);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
@@ -1447,8 +1450,8 @@ static int __init crash_save_vmcoreinfo_init(void)
#ifdef CONFIG_X86
VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
#endif
-#ifdef CONFIG_HUGETLBFS
- VMCOREINFO_SYMBOL(free_huge_page);
+#ifdef CONFIG_HUGETLB_PAGE
+ VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
#endif
arch_crash_save_vmcoreinfo();
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index ed9410936a22..78c1c0ee6dc1 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2176,15 +2176,37 @@ cache_hit:
chain->irq_context = hlock->irq_context;
i = get_first_held_lock(curr, hlock);
chain->depth = curr->lockdep_depth + 1 - i;
+
+ BUILD_BUG_ON((1UL << 24) <= ARRAY_SIZE(chain_hlocks));
+ BUILD_BUG_ON((1UL << 6) <= ARRAY_SIZE(curr->held_locks));
+ BUILD_BUG_ON((1UL << 8*sizeof(chain_hlocks[0])) <= ARRAY_SIZE(lock_classes));
+
if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
chain->base = nr_chain_hlocks;
- nr_chain_hlocks += chain->depth;
for (j = 0; j < chain->depth - 1; j++, i++) {
int lock_id = curr->held_locks[i].class_idx - 1;
chain_hlocks[chain->base + j] = lock_id;
}
chain_hlocks[chain->base + j] = class - lock_classes;
}
+
+ if (nr_chain_hlocks < MAX_LOCKDEP_CHAIN_HLOCKS)
+ nr_chain_hlocks += chain->depth;
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+ /*
+ * Important for check_no_collision().
+ */
+ if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) {
+ if (debug_locks_off_graph_unlock())
+ return 0;
+
+ print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!");
+ dump_stack();
+ return 0;
+ }
+#endif
+
hlist_add_head_rcu(&chain->entry, hash_head);
debug_atomic_inc(chain_lookup_misses);
inc_chains();
@@ -2932,6 +2954,11 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
return 1;
}
+static inline unsigned int task_irq_context(struct task_struct *task)
+{
+ return 2 * !!task->hardirq_context + !!task->softirq_context;
+}
+
static int separate_irq_context(struct task_struct *curr,
struct held_lock *hlock)
{
@@ -2940,8 +2967,6 @@ static int separate_irq_context(struct task_struct *curr,
/*
* Keep track of points where we cross into an interrupt context:
*/
- hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) +
- curr->softirq_context;
if (depth) {
struct held_lock *prev_hlock;
@@ -2973,6 +2998,11 @@ static inline int mark_irqflags(struct task_struct *curr,
return 1;
}
+static inline unsigned int task_irq_context(struct task_struct *task)
+{
+ return 0;
+}
+
static inline int separate_irq_context(struct task_struct *curr,
struct held_lock *hlock)
{
@@ -3241,6 +3271,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
hlock->acquire_ip = ip;
hlock->instance = lock;
hlock->nest_lock = nest_lock;
+ hlock->irq_context = task_irq_context(curr);
hlock->trylock = trylock;
hlock->read = read;
hlock->check = check;
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index dbb61a302548..a0f61effad25 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -141,6 +141,8 @@ static int lc_show(struct seq_file *m, void *v)
int i;
if (v == SEQ_START_TOKEN) {
+ if (nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)
+ seq_printf(m, "(buggered) ");
seq_printf(m, "all lock chains:\n");
return 0;
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 2232ae3e3ad6..3bfdff06eea7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -666,6 +666,35 @@ static void set_work_pool_and_clear_pending(struct work_struct *work,
*/
smp_wmb();
set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
+ /*
+ * The following mb guarantees that previous clear of a PENDING bit
+ * will not be reordered with any speculative LOADS or STORES from
+ * work->current_func, which is executed afterwards. This possible
+ * reordering can lead to a missed execution on attempt to qeueue
+ * the same @work. E.g. consider this case:
+ *
+ * CPU#0 CPU#1
+ * ---------------------------- --------------------------------
+ *
+ * 1 STORE event_indicated
+ * 2 queue_work_on() {
+ * 3 test_and_set_bit(PENDING)
+ * 4 } set_..._and_clear_pending() {
+ * 5 set_work_data() # clear bit
+ * 6 smp_mb()
+ * 7 work->current_func() {
+ * 8 LOAD event_indicated
+ * }
+ *
+ * Without an explicit full barrier speculative LOAD on line 8 can
+ * be executed before CPU#0 does STORE on line 1. If that happens,
+ * CPU#0 observes the PENDING bit is still set and new execution of
+ * a @work is not queued in a hope, that CPU#1 will eventually
+ * finish the queued @work. Meanwhile CPU#1 does not see
+ * event_indicated is set, because speculative LOAD was executed
+ * before actual STORE.
+ */
+ smp_mb();
}
static void clear_work_data(struct work_struct *work)