summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/core.c1
-rw-r--r--kernel/bpf/devmap.c9
-rw-r--r--kernel/bpf/lpm_trie.c9
-rw-r--r--kernel/bpf/syscall.c8
-rw-r--r--kernel/bpf/verifier.c12
-rw-r--r--kernel/cgroup/cgroup.c139
-rw-r--r--kernel/cgroup/cpuset.c15
-rw-r--r--kernel/cred.c9
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/livepatch/core.c6
-rw-r--r--kernel/memremap.c23
-rw-r--r--kernel/ptrace.c20
-rw-r--r--kernel/sysctl.c44
-rw-r--r--kernel/time/timekeeping.c5
-rw-r--r--kernel/trace/bpf_trace.c100
-rw-r--r--kernel/trace/ftrace.c22
-rw-r--r--kernel/trace/trace.c4
-rw-r--r--kernel/trace/trace_output.c2
-rw-r--r--kernel/trace/trace_uprobe.c15
19 files changed, 315 insertions, 130 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7c473f208a10..080e2bb644cc 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2097,7 +2097,6 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
EXPORT_SYMBOL(bpf_stats_enabled_key);
-int sysctl_bpf_stats_enabled __read_mostly;
/* All definitions of tracepoints related to BPF. */
#define CREATE_TRACE_POINTS
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 15dbc15c5b0c..cd8297b3bdb9 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -178,6 +178,7 @@ static void dev_map_free(struct bpf_map *map)
if (!dev)
continue;
+ free_percpu(dev->bulkq);
dev_put(dev->dev);
kfree(dev);
}
@@ -273,6 +274,7 @@ void __dev_map_flush(struct bpf_map *map)
unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
u32 bit;
+ rcu_read_lock();
for_each_set_bit(bit, bitmap, map->max_entries) {
struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
struct xdp_bulk_queue *bq;
@@ -283,11 +285,12 @@ void __dev_map_flush(struct bpf_map *map)
if (unlikely(!dev))
continue;
- __clear_bit(bit, bitmap);
-
bq = this_cpu_ptr(dev->bulkq);
bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true);
+
+ __clear_bit(bit, bitmap);
}
+ rcu_read_unlock();
}
/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
@@ -380,6 +383,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
int cpu;
+ rcu_read_lock();
for_each_online_cpu(cpu) {
bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
__clear_bit(dev->bit, bitmap);
@@ -387,6 +391,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
bq = per_cpu_ptr(dev->bulkq, cpu);
bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false);
}
+ rcu_read_unlock();
}
}
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index e61630c2e50b..864e2a496376 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -716,9 +716,14 @@ find_leftmost:
* have exact two children, so this function will never return NULL.
*/
for (node = search_root; node;) {
- if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+ if (node->flags & LPM_TREE_NODE_FLAG_IM) {
+ node = rcu_dereference(node->child[0]);
+ } else {
next_node = node;
- node = rcu_dereference(node->child[0]);
+ node = rcu_dereference(node->child[0]);
+ if (!node)
+ node = rcu_dereference(next_node->child[1]);
+ }
}
do_copy:
next_key->prefixlen = next_node->prefixlen;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ef63d26622f2..42d17f730780 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1573,6 +1573,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
return 0;
default:
return -EINVAL;
@@ -1867,6 +1869,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
break;
case BPF_CGROUP_SOCK_OPS:
@@ -1952,6 +1956,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
break;
case BPF_CGROUP_SOCK_OPS:
@@ -2003,6 +2009,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
case BPF_CGROUP_SOCK_OPS:
case BPF_CGROUP_DEVICE:
case BPF_CGROUP_SYSCTL:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d15cc4fafa89..a5c369e60343 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5353,9 +5353,12 @@ static int check_return_code(struct bpf_verifier_env *env)
struct tnum range = tnum_range(0, 1);
switch (env->prog->type) {
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
+ env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG)
+ range = tnum_range(1, 1);
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
- case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_SOCK_OPS:
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
@@ -5372,16 +5375,17 @@ static int check_return_code(struct bpf_verifier_env *env)
}
if (!tnum_in(range, reg->var_off)) {
+ char tn_buf[48];
+
verbose(env, "At program exit the register R0 ");
if (!tnum_is_unknown(reg->var_off)) {
- char tn_buf[48];
-
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
verbose(env, "has value %s", tn_buf);
} else {
verbose(env, "has unknown scalar value");
}
- verbose(env, " should have been 0 or 1\n");
+ tnum_strn(tn_buf, sizeof(tn_buf), range);
+ verbose(env, " should have been in %s\n", tn_buf);
return -EINVAL;
}
return 0;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 155048b0eca2..bf9dbffd46b1 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -215,7 +215,8 @@ static struct cftype cgroup_base_files[];
static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
-static void css_task_iter_advance(struct css_task_iter *it);
+static void css_task_iter_skip(struct css_task_iter *it,
+ struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
struct cgroup_subsys *ss);
@@ -738,6 +739,7 @@ struct css_set init_css_set = {
.dom_cset = &init_css_set,
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
+ .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
.threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
@@ -843,6 +845,21 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
cgroup_update_populated(link->cgrp, populated);
}
+/*
+ * @task is leaving, advance task iterators which are pointing to it so
+ * that they can resume at the next position. Advancing an iterator might
+ * remove it from the list, use safe walk. See css_task_iter_skip() for
+ * details.
+ */
+static void css_set_skip_task_iters(struct css_set *cset,
+ struct task_struct *task)
+{
+ struct css_task_iter *it, *pos;
+
+ list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
+ css_task_iter_skip(it, task);
+}
+
/**
* css_set_move_task - move a task from one css_set to another
* @task: task being moved
@@ -868,22 +885,9 @@ static void css_set_move_task(struct task_struct *task,
css_set_update_populated(to_cset, true);
if (from_cset) {
- struct css_task_iter *it, *pos;
-
WARN_ON_ONCE(list_empty(&task->cg_list));
- /*
- * @task is leaving, advance task iterators which are
- * pointing to it so that they can resume at the next
- * position. Advancing an iterator might remove it from
- * the list, use safe walk. See css_task_iter_advance*()
- * for details.
- */
- list_for_each_entry_safe(it, pos, &from_cset->task_iters,
- iters_node)
- if (it->task_pos == &task->cg_list)
- css_task_iter_advance(it);
-
+ css_set_skip_task_iters(from_cset, task);
list_del_init(&task->cg_list);
if (!css_set_populated(from_cset))
css_set_update_populated(from_cset, false);
@@ -1210,6 +1214,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
cset->dom_cset = cset;
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_tasks);
+ INIT_LIST_HEAD(&cset->dying_tasks);
INIT_LIST_HEAD(&cset->task_iters);
INIT_LIST_HEAD(&cset->threaded_csets);
INIT_HLIST_NODE(&cset->hlist);
@@ -1460,8 +1465,8 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task,
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
-static char *cgroup_fill_name(struct cgroup *cgrp, const struct cftype *cft,
- char *buf, bool write_link_name)
+static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
+ char *buf)
{
struct cgroup_subsys *ss = cft->ss;
@@ -1471,26 +1476,13 @@ static char *cgroup_fill_name(struct cgroup *cgrp, const struct cftype *cft,
snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
- write_link_name ? cft->link_name : cft->name);
+ cft->name);
} else {
- strscpy(buf, write_link_name ? cft->link_name : cft->name,
- CGROUP_FILE_NAME_MAX);
+ strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
}
return buf;
}
-static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
- char *buf)
-{
- return cgroup_fill_name(cgrp, cft, buf, false);
-}
-
-static char *cgroup_link_name(struct cgroup *cgrp, const struct cftype *cft,
- char *buf)
-{
- return cgroup_fill_name(cgrp, cft, buf, true);
-}
-
/**
* cgroup_file_mode - deduce file mode of a control file
* @cft: the control file in question
@@ -1649,9 +1641,6 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
}
kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
- if (cft->flags & CFTYPE_SYMLINKED)
- kernfs_remove_by_name(cgrp->kn,
- cgroup_link_name(cgrp, cft, name));
}
/**
@@ -3837,7 +3826,6 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
{
char name[CGROUP_FILE_NAME_MAX];
struct kernfs_node *kn;
- struct kernfs_node *kn_link;
struct lock_class_key *key = NULL;
int ret;
@@ -3868,14 +3856,6 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
spin_unlock_irq(&cgroup_file_kn_lock);
}
- if (cft->flags & CFTYPE_SYMLINKED) {
- kn_link = kernfs_create_link(cgrp->kn,
- cgroup_link_name(cgrp, cft, name),
- kn);
- if (IS_ERR(kn_link))
- return PTR_ERR(kn_link);
- }
-
return 0;
}
@@ -4433,15 +4413,18 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
it->task_pos = NULL;
return;
}
- } while (!css_set_populated(cset));
+ } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
if (!list_empty(&cset->tasks))
it->task_pos = cset->tasks.next;
- else
+ else if (!list_empty(&cset->mg_tasks))
it->task_pos = cset->mg_tasks.next;
+ else
+ it->task_pos = cset->dying_tasks.next;
it->tasks_head = &cset->tasks;
it->mg_tasks_head = &cset->mg_tasks;
+ it->dying_tasks_head = &cset->dying_tasks;
/*
* We don't keep css_sets locked across iteration steps and thus
@@ -4467,9 +4450,20 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
list_add(&it->iters_node, &cset->task_iters);
}
+static void css_task_iter_skip(struct css_task_iter *it,
+ struct task_struct *task)
+{
+ lockdep_assert_held(&css_set_lock);
+
+ if (it->task_pos == &task->cg_list) {
+ it->task_pos = it->task_pos->next;
+ it->flags |= CSS_TASK_ITER_SKIPPED;
+ }
+}
+
static void css_task_iter_advance(struct css_task_iter *it)
{
- struct list_head *next;
+ struct task_struct *task;
lockdep_assert_held(&css_set_lock);
repeat:
@@ -4479,25 +4473,40 @@ repeat:
* consumed first and then ->mg_tasks. After ->mg_tasks,
* we move onto the next cset.
*/
- next = it->task_pos->next;
-
- if (next == it->tasks_head)
- next = it->mg_tasks_head->next;
+ if (it->flags & CSS_TASK_ITER_SKIPPED)
+ it->flags &= ~CSS_TASK_ITER_SKIPPED;
+ else
+ it->task_pos = it->task_pos->next;
- if (next == it->mg_tasks_head)
+ if (it->task_pos == it->tasks_head)
+ it->task_pos = it->mg_tasks_head->next;
+ if (it->task_pos == it->mg_tasks_head)
+ it->task_pos = it->dying_tasks_head->next;
+ if (it->task_pos == it->dying_tasks_head)
css_task_iter_advance_css_set(it);
- else
- it->task_pos = next;
} else {
/* called from start, proceed to the first cset */
css_task_iter_advance_css_set(it);
}
- /* if PROCS, skip over tasks which aren't group leaders */
- if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
- !thread_group_leader(list_entry(it->task_pos, struct task_struct,
- cg_list)))
- goto repeat;
+ if (!it->task_pos)
+ return;
+
+ task = list_entry(it->task_pos, struct task_struct, cg_list);
+
+ if (it->flags & CSS_TASK_ITER_PROCS) {
+ /* if PROCS, skip over tasks which aren't group leaders */
+ if (!thread_group_leader(task))
+ goto repeat;
+
+ /* and dying leaders w/o live member threads */
+ if (!atomic_read(&task->signal->live))
+ goto repeat;
+ } else {
+ /* skip all dying ones */
+ if (task->flags & PF_EXITING)
+ goto repeat;
+ }
}
/**
@@ -4553,6 +4562,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
spin_lock_irq(&css_set_lock);
+ /* @it may be half-advanced by skips, finish advancing */
+ if (it->flags & CSS_TASK_ITER_SKIPPED)
+ css_task_iter_advance(it);
+
if (it->task_pos) {
it->cur_task = list_entry(it->task_pos, struct task_struct,
cg_list);
@@ -6034,6 +6047,7 @@ void cgroup_exit(struct task_struct *tsk)
if (!list_empty(&tsk->cg_list)) {
spin_lock_irq(&css_set_lock);
css_set_move_task(tsk, cset, NULL, false);
+ list_add_tail(&tsk->cg_list, &cset->dying_tasks);
cset->nr_tasks--;
WARN_ON_ONCE(cgroup_task_frozen(tsk));
@@ -6059,6 +6073,13 @@ void cgroup_release(struct task_struct *task)
do_each_subsys_mask(ss, ssid, have_release_callback) {
ss->release(task);
} while_each_subsys_mask();
+
+ if (use_task_css_set_links) {
+ spin_lock_irq(&css_set_lock);
+ css_set_skip_task_iters(task_css_set(task), task);
+ list_del_init(&task->cg_list);
+ spin_unlock_irq(&css_set_lock);
+ }
}
void cgroup_free(struct task_struct *task)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 6a1942ed781c..515525ff1cfd 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -3254,10 +3254,23 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
spin_unlock_irqrestore(&callback_lock, flags);
}
+/**
+ * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
+ * @tsk: pointer to task_struct with which the scheduler is struggling
+ *
+ * Description: In the case that the scheduler cannot find an allowed cpu in
+ * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
+ * mode however, this value is the same as task_cs(tsk)->effective_cpus,
+ * which will not contain a sane cpumask during cases such as cpu hotplugging.
+ * This is the absolute last resort for the scheduler and it is only used if
+ * _every_ other avenue has been traveled.
+ **/
+
void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
rcu_read_lock();
- do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
+ do_set_cpus_allowed(tsk, is_in_v2_mode() ?
+ task_cs(tsk)->cpus_allowed : cpu_possible_mask);
rcu_read_unlock();
/*
diff --git a/kernel/cred.c b/kernel/cred.c
index e74ffdc98a92..c73a87a4df13 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -446,6 +446,15 @@ int commit_creds(struct cred *new)
if (task->mm)
set_dumpable(task->mm, suid_dumpable);
task->pdeath_signal = 0;
+ /*
+ * If a task drops privileges and becomes nondumpable,
+ * the dumpability change must become visible before
+ * the credential change; otherwise, a __ptrace_may_access()
+ * racing with this change may be able to attach to a task it
+ * shouldn't be able to attach to (as if the task had dropped
+ * privileges without becoming nondumpable).
+ * Pairs with a read barrier in __ptrace_may_access().
+ */
smp_wmb();
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 1803efb2922f..a75b6a7f458a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -195,6 +195,7 @@ repeat:
rcu_read_unlock();
proc_flush_task(p);
+ cgroup_release(p);
write_lock_irq(&tasklist_lock);
ptrace_release_task(p);
@@ -220,7 +221,6 @@ repeat:
}
write_unlock_irq(&tasklist_lock);
- cgroup_release(p);
release_thread(p);
call_rcu(&p->rcu, delayed_put_task_struct);
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 2398832947c6..c4ce08f43bd6 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -18,6 +18,7 @@
#include <linux/elf.h>
#include <linux/moduleloader.h>
#include <linux/completion.h>
+#include <linux/memory.h>
#include <asm/cacheflush.h>
#include "core.h"
#include "patch.h"
@@ -718,16 +719,21 @@ static int klp_init_object_loaded(struct klp_patch *patch,
struct klp_func *func;
int ret;
+ mutex_lock(&text_mutex);
+
module_disable_ro(patch->mod);
ret = klp_write_object_relocations(patch->mod, obj);
if (ret) {
module_enable_ro(patch->mod, true);
+ mutex_unlock(&text_mutex);
return ret;
}
arch_klp_init_object_loaded(patch, obj);
module_enable_ro(patch->mod, true);
+ mutex_unlock(&text_mutex);
+
klp_for_each_func(obj, func) {
ret = klp_find_object_symbol(obj->name, func->old_name,
func->old_sympos,
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 1490e63f69a9..6e1970719dc2 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -95,6 +95,7 @@ static void devm_memremap_pages_release(void *data)
pgmap->kill(pgmap->ref);
for_each_device_pfn(pfn, pgmap)
put_page(pfn_to_page(pfn));
+ pgmap->cleanup(pgmap->ref);
/* pages are dead and unused, undo the arch mapping */
align_start = res->start & ~(SECTION_SIZE - 1);
@@ -133,8 +134,8 @@ static void devm_memremap_pages_release(void *data)
* 2/ The altmap field may optionally be initialized, in which case altmap_valid
* must be set to true
*
- * 3/ pgmap->ref must be 'live' on entry and will be killed at
- * devm_memremap_pages_release() time, or if this routine fails.
+ * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped
+ * at devm_memremap_pages_release() time, or if this routine fails.
*
* 4/ res is expected to be a host memory range that could feasibly be
* treated as a "System RAM" range, i.e. not a device mmio range, but
@@ -156,8 +157,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
pgprot_t pgprot = PAGE_KERNEL;
int error, nid, is_ram;
- if (!pgmap->ref || !pgmap->kill)
+ if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) {
+ WARN(1, "Missing reference count teardown definition\n");
return ERR_PTR(-EINVAL);
+ }
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -168,14 +171,16 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
if (conflict_pgmap) {
dev_WARN(dev, "Conflicting mapping in same section\n");
put_dev_pagemap(conflict_pgmap);
- return ERR_PTR(-ENOMEM);
+ error = -ENOMEM;
+ goto err_array;
}
conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL);
if (conflict_pgmap) {
dev_WARN(dev, "Conflicting mapping in same section\n");
put_dev_pagemap(conflict_pgmap);
- return ERR_PTR(-ENOMEM);
+ error = -ENOMEM;
+ goto err_array;
}
is_ram = region_intersects(align_start, align_size,
@@ -267,10 +272,18 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
pgmap_array_delete(res);
err_array:
pgmap->kill(pgmap->ref);
+ pgmap->cleanup(pgmap->ref);
+
return ERR_PTR(error);
}
EXPORT_SYMBOL_GPL(devm_memremap_pages);
+void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap)
+{
+ devm_release_action(dev, devm_memremap_pages_release, pgmap);
+}
+EXPORT_SYMBOL_GPL(devm_memunmap_pages);
+
unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
{
/* number of pfns from base where pfn_to_page() is valid */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 5710d07e67cf..8456b6e2205f 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -324,6 +324,16 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
return -EPERM;
ok:
rcu_read_unlock();
+ /*
+ * If a task drops privileges and becomes nondumpable (through a syscall
+ * like setresuid()) while we are trying to access it, we must ensure
+ * that the dumpability is read after the credentials; otherwise,
+ * we may be able to attach to a task that we shouldn't be able to
+ * attach to (as if the task had dropped privileges without becoming
+ * nondumpable).
+ * Pairs with a write barrier in commit_creds().
+ */
+ smp_rmb();
mm = task->mm;
if (mm &&
((get_dumpable(mm) != SUID_DUMP_USER) &&
@@ -705,6 +715,10 @@ static int ptrace_peek_siginfo(struct task_struct *child,
if (arg.nr < 0)
return -EINVAL;
+ /* Ensure arg.off fits in an unsigned long */
+ if (arg.off > ULONG_MAX)
+ return 0;
+
if (arg.flags & PTRACE_PEEKSIGINFO_SHARED)
pending = &child->signal->shared_pending;
else
@@ -712,18 +726,20 @@ static int ptrace_peek_siginfo(struct task_struct *child,
for (i = 0; i < arg.nr; ) {
kernel_siginfo_t info;
- s32 off = arg.off + i;
+ unsigned long off = arg.off + i;
+ bool found = false;
spin_lock_irq(&child->sighand->siglock);
list_for_each_entry(q, &pending->list, list) {
if (!off--) {
+ found = true;
copy_siginfo(&info, &q->info);
break;
}
}
spin_unlock_irq(&child->sighand->siglock);
- if (off >= 0) /* beyond the end of the list */
+ if (!found) /* beyond the end of the list */
break;
#ifdef CONFIG_COMPAT
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7d1008be6173..1beca96fb625 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -230,11 +230,6 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
#endif
static int proc_dopipe_max_size(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
-#ifdef CONFIG_BPF_SYSCALL
-static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
-#endif
#ifdef CONFIG_MAGIC_SYSRQ
/* Note: sysrq code uses its own private copy */
@@ -1253,12 +1248,10 @@ static struct ctl_table kern_table[] = {
},
{
.procname = "bpf_stats_enabled",
- .data = &sysctl_bpf_stats_enabled,
- .maxlen = sizeof(sysctl_bpf_stats_enabled),
+ .data = &bpf_stats_enabled_key.key,
+ .maxlen = sizeof(bpf_stats_enabled_key),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax_bpf_stats,
- .extra1 = &zero,
- .extra2 = &one,
+ .proc_handler = proc_do_static_key,
},
#endif
#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
@@ -3374,26 +3367,35 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
#endif /* CONFIG_PROC_SYSCTL */
-#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL)
-static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+#if defined(CONFIG_SYSCTL)
+int proc_do_static_key(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
{
- int ret, bpf_stats = *(int *)table->data;
- struct ctl_table tmp = *table;
+ struct static_key *key = (struct static_key *)table->data;
+ static DEFINE_MUTEX(static_key_mutex);
+ int val, ret;
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(val),
+ .mode = table->mode,
+ .extra1 = &zero,
+ .extra2 = &one,
+ };
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
- tmp.data = &bpf_stats;
+ mutex_lock(&static_key_mutex);
+ val = static_key_enabled(key);
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
if (write && !ret) {
- *(int *)table->data = bpf_stats;
- if (bpf_stats)
- static_branch_enable(&bpf_stats_enabled_key);
+ if (val)
+ static_key_enable(key);
else
- static_branch_disable(&bpf_stats_enabled_key);
+ static_key_disable(key);
}
+ mutex_unlock(&static_key_mutex);
return ret;
}
#endif
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 85f5912d8f70..44b726bab4bd 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -808,17 +808,18 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
ktime_t base, *offset = offsets[offs];
+ u64 nsecs;
WARN_ON(timekeeping_suspended);
do {
seq = read_seqcount_begin(&tk_core.seq);
base = ktime_add(tk->tkr_mono.base, *offset);
+ nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
} while (read_seqcount_retry(&tk_core.seq, seq));
- return base;
-
+ return base + nsecs;
}
EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f92d6ad5e080..1c9a4745e596 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -410,8 +410,6 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
.arg4_type = ARG_CONST_SIZE,
};
-static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
-
static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
u64 flags, struct perf_sample_data *sd)
@@ -442,24 +440,50 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
return perf_event_output(event, sd, regs);
}
+/*
+ * Support executing tracepoints in normal, irq, and nmi context that each call
+ * bpf_perf_event_output
+ */
+struct bpf_trace_sample_data {
+ struct perf_sample_data sds[3];
+};
+
+static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds);
+static DEFINE_PER_CPU(int, bpf_trace_nest_level);
BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags, void *, data, u64, size)
{
- struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);
+ struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
+ int nest_level = this_cpu_inc_return(bpf_trace_nest_level);
struct perf_raw_record raw = {
.frag = {
.size = size,
.data = data,
},
};
+ struct perf_sample_data *sd;
+ int err;
- if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
- return -EINVAL;
+ if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ sd = &sds->sds[nest_level - 1];
+
+ if (unlikely(flags & ~(BPF_F_INDEX_MASK))) {
+ err = -EINVAL;
+ goto out;
+ }
perf_sample_data_init(sd, 0, 0);
sd->raw = &raw;
- return __bpf_perf_event_output(regs, map, flags, sd);
+ err = __bpf_perf_event_output(regs, map, flags, sd);
+
+out:
+ this_cpu_dec(bpf_trace_nest_level);
+ return err;
}
static const struct bpf_func_proto bpf_perf_event_output_proto = {
@@ -822,16 +846,48 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
/*
* bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
* to avoid potential recursive reuse issue when/if tracepoints are added
- * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
+ * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack.
+ *
+ * Since raw tracepoints run despite bpf_prog_active, support concurrent usage
+ * in normal, irq, and nmi context.
*/
-static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
+struct bpf_raw_tp_regs {
+ struct pt_regs regs[3];
+};
+static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs);
+static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level);
+static struct pt_regs *get_bpf_raw_tp_regs(void)
+{
+ struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs);
+ int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level);
+
+ if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) {
+ this_cpu_dec(bpf_raw_tp_nest_level);
+ return ERR_PTR(-EBUSY);
+ }
+
+ return &tp_regs->regs[nest_level - 1];
+}
+
+static void put_bpf_raw_tp_regs(void)
+{
+ this_cpu_dec(bpf_raw_tp_nest_level);
+}
+
BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
struct bpf_map *, map, u64, flags, void *, data, u64, size)
{
- struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+ struct pt_regs *regs = get_bpf_raw_tp_regs();
+ int ret;
+
+ if (IS_ERR(regs))
+ return PTR_ERR(regs);
perf_fetch_caller_regs(regs);
- return ____bpf_perf_event_output(regs, map, flags, data, size);
+ ret = ____bpf_perf_event_output(regs, map, flags, data, size);
+
+ put_bpf_raw_tp_regs();
+ return ret;
}
static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
@@ -848,12 +904,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
struct bpf_map *, map, u64, flags)
{
- struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+ struct pt_regs *regs = get_bpf_raw_tp_regs();
+ int ret;
+
+ if (IS_ERR(regs))
+ return PTR_ERR(regs);
perf_fetch_caller_regs(regs);
/* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */
- return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
- flags, 0, 0);
+ ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map,
+ flags, 0, 0);
+ put_bpf_raw_tp_regs();
+ return ret;
}
static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
@@ -868,11 +930,17 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
void *, buf, u32, size, u64, flags)
{
- struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+ struct pt_regs *regs = get_bpf_raw_tp_regs();
+ int ret;
+
+ if (IS_ERR(regs))
+ return PTR_ERR(regs);
perf_fetch_caller_regs(regs);
- return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
- (unsigned long) size, flags, 0);
+ ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+ (unsigned long) size, flags, 0);
+ put_bpf_raw_tp_regs();
+ return ret;
}
static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a12aff849c04..38277af44f5c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -34,6 +34,7 @@
#include <linux/hash.h>
#include <linux/rcupdate.h>
#include <linux/kprobes.h>
+#include <linux/memory.h>
#include <trace/events/sched.h>
@@ -2610,10 +2611,12 @@ static void ftrace_run_update_code(int command)
{
int ret;
+ mutex_lock(&text_mutex);
+
ret = ftrace_arch_code_modify_prepare();
FTRACE_WARN_ON(ret);
if (ret)
- return;
+ goto out_unlock;
/*
* By default we use stop_machine() to modify the code.
@@ -2625,6 +2628,9 @@ static void ftrace_run_update_code(int command)
ret = ftrace_arch_code_modify_post_process();
FTRACE_WARN_ON(ret);
+
+out_unlock:
+ mutex_unlock(&text_mutex);
}
static void ftrace_run_modify_code(struct ftrace_ops *ops, int command,
@@ -2935,14 +2941,13 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
p = &pg->records[i];
p->flags = rec_flags;
-#ifndef CC_USING_NOP_MCOUNT
/*
* Do the initial record conversion from mcount jump
* to the NOP instructions.
*/
- if (!ftrace_code_disable(mod, p))
+ if (!__is_defined(CC_USING_NOP_MCOUNT) &&
+ !ftrace_code_disable(mod, p))
break;
-#endif
update_cnt++;
}
@@ -4221,10 +4226,13 @@ void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper,
struct ftrace_func_entry *entry;
struct ftrace_func_map *map;
struct hlist_head *hhd;
- int size = 1 << mapper->hash.size_bits;
- int i;
+ int size, i;
+
+ if (!mapper)
+ return;
if (free_func && mapper->hash.count) {
+ size = 1 << mapper->hash.size_bits;
for (i = 0; i < size; i++) {
hhd = &mapper->hash.buckets[i];
hlist_for_each_entry(entry, hhd, hlist) {
@@ -5776,6 +5784,7 @@ void ftrace_module_enable(struct module *mod)
struct ftrace_page *pg;
mutex_lock(&ftrace_lock);
+ mutex_lock(&text_mutex);
if (ftrace_disabled)
goto out_unlock;
@@ -5837,6 +5846,7 @@ void ftrace_module_enable(struct module *mod)
ftrace_arch_code_modify_post_process();
out_unlock:
+ mutex_unlock(&text_mutex);
mutex_unlock(&ftrace_lock);
process_cached_mods(mod->name);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1c80521fd436..83e08b78dbee 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6923,7 +6923,7 @@ struct tracing_log_err {
static DEFINE_MUTEX(tracing_err_log_lock);
-struct tracing_log_err *get_tracing_log_err(struct trace_array *tr)
+static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr)
{
struct tracing_log_err *err;
@@ -8192,7 +8192,7 @@ static const struct file_operations buffer_percent_fops = {
.llseek = default_llseek,
};
-struct dentry *trace_instance_dir;
+static struct dentry *trace_instance_dir;
static void
init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 54373d93e251..ba751f993c3b 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1057,7 +1057,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
trace_seq_puts(s, "<stack trace>\n");
- for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) {
+ for (p = field->caller; p && p < end && *p != ULONG_MAX; p++) {
if (trace_seq_has_overflowed(s))
break;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index eb7e06b54741..b55906c77ce0 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -426,8 +426,6 @@ end:
/*
* Argument syntax:
* - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS]
- *
- * - Remove uprobe: -:[GRP/]EVENT
*/
static int trace_uprobe_create(int argc, const char **argv)
{
@@ -443,10 +441,17 @@ static int trace_uprobe_create(int argc, const char **argv)
ret = 0;
ref_ctr_offset = 0;
- /* argc must be >= 1 */
- if (argv[0][0] == 'r')
+ switch (argv[0][0]) {
+ case 'r':
is_return = true;
- else if (argv[0][0] != 'p' || argc < 2)
+ break;
+ case 'p':
+ break;
+ default:
+ return -ECANCELED;
+ }
+
+ if (argc < 2)
return -ECANCELED;
if (argv[0][1] == ':')