summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/acct.c2
-rw-r--r--kernel/audit.c76
-rw-r--r--kernel/audit.h2
-rw-r--r--kernel/auditfilter.c39
-rw-r--r--kernel/auditsc.c23
-rw-r--r--kernel/bpf/Makefile3
-rw-r--r--kernel/bpf/arraymap.c67
-rw-r--r--kernel/bpf/cgroup.c570
-rw-r--r--kernel/bpf/core.c210
-rw-r--r--kernel/bpf/cpumap.c706
-rw-r--r--kernel/bpf/devmap.c5
-rw-r--r--kernel/bpf/disasm.c214
-rw-r--r--kernel/bpf/disasm.h32
-rw-r--r--kernel/bpf/hashtab.c7
-rw-r--r--kernel/bpf/inode.c105
-rw-r--r--kernel/bpf/lpm_trie.c98
-rw-r--r--kernel/bpf/offload.c206
-rw-r--r--kernel/bpf/percpu_freelist.c8
-rw-r--r--kernel/bpf/sockmap.c20
-rw-r--r--kernel/bpf/stackmap.c5
-rw-r--r--kernel/bpf/syscall.c315
-rw-r--r--kernel/bpf/verifier.c1859
-rw-r--r--kernel/cgroup/Makefile2
-rw-r--r--kernel/cgroup/cgroup-internal.h9
-rw-r--r--kernel/cgroup/cgroup-v1.c6
-rw-r--r--kernel/cgroup/cgroup.c216
-rw-r--r--kernel/cgroup/debug.c4
-rw-r--r--kernel/cgroup/stat.c338
-rw-r--r--kernel/compat.c77
-rw-r--r--kernel/configs/nopm.config15
-rw-r--r--kernel/cpu.c26
-rw-r--r--kernel/crash_core.c5
-rw-r--r--kernel/debug/kdb/kdb_io.c2
-rw-r--r--kernel/debug/kdb/kdb_main.c10
-rw-r--r--kernel/debug/kdb/kdb_private.h2
-rw-r--r--kernel/delayacct.c42
-rw-r--r--kernel/events/callchain.c15
-rw-r--r--kernel/events/core.c158
-rw-r--r--kernel/events/internal.h4
-rw-r--r--kernel/events/ring_buffer.c4
-rw-r--r--kernel/events/uprobes.c12
-rw-r--r--kernel/exit.c9
-rw-r--r--kernel/fork.c29
-rw-r--r--kernel/futex.c96
-rw-r--r--kernel/groups.c5
-rw-r--r--kernel/irq/Kconfig10
-rw-r--r--kernel/irq/affinity.c30
-rw-r--r--kernel/irq/debug.h5
-rw-r--r--kernel/irq/debugfs.c1
-rw-r--r--kernel/irq/generic-chip.c11
-rw-r--r--kernel/irq/internals.h2
-rw-r--r--kernel/irq/irqdesc.c1
-rw-r--r--kernel/irq/irqdomain.c131
-rw-r--r--kernel/irq/manage.c13
-rw-r--r--kernel/irq/matrix.c24
-rw-r--r--kernel/irq/msi.c64
-rw-r--r--kernel/irq/spurious.c4
-rw-r--r--kernel/irq_work.c2
-rw-r--r--kernel/jump_label.c14
-rw-r--r--kernel/kallsyms.c51
-rw-r--r--kernel/kcov.c216
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/locking/lockdep.c743
-rw-r--r--kernel/locking/locktorture.c108
-rw-r--r--kernel/locking/qspinlock.c12
-rw-r--r--kernel/locking/rtmutex.c26
-rw-r--r--kernel/locking/rtmutex_common.h1
-rw-r--r--kernel/locking/spinlock.c13
-rw-r--r--kernel/module.c23
-rw-r--r--kernel/padata.c6
-rw-r--r--kernel/panic.c47
-rw-r--r--kernel/pid.c271
-rw-r--r--kernel/pid_namespace.c59
-rw-r--r--kernel/power/main.c29
-rw-r--r--kernel/power/snapshot.c10
-rw-r--r--kernel/power/swap.c6
-rw-r--r--kernel/printk/printk.c9
-rw-r--r--kernel/printk/printk_safe.c17
-rw-r--r--kernel/ptrace.c5
-rw-r--r--kernel/rcu/rcu.h27
-rw-r--r--kernel/rcu/rcuperf.c6
-rw-r--r--kernel/rcu/rcutorture.c12
-rw-r--r--kernel/rcu/srcutree.c109
-rw-r--r--kernel/rcu/tree.c355
-rw-r--r--kernel/rcu/tree.h5
-rw-r--r--kernel/rcu/tree_plugin.h13
-rw-r--r--kernel/rcu/update.c2
-rw-r--r--kernel/reboot.c27
-rw-r--r--kernel/relay.c4
-rw-r--r--kernel/sched/completion.c5
-rw-r--r--kernel/sched/core.c308
-rw-r--r--kernel/sched/cpuacct.h18
-rw-r--r--kernel/sched/cpufreq_schedutil.c95
-rw-r--r--kernel/sched/cputime.c14
-rw-r--r--kernel/sched/deadline.c145
-rw-r--r--kernel/sched/fair.c147
-rw-r--r--kernel/sched/membarrier.c2
-rw-r--r--kernel/sched/rt.c12
-rw-r--r--kernel/sched/sched.h114
-rw-r--r--kernel/sched/stop_task.c2
-rw-r--r--kernel/sched/wait.c2
-rw-r--r--kernel/sched/wait_bit.c18
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/signal.c448
-rw-r--r--kernel/softirq.c12
-rw-r--r--kernel/sys.c14
-rw-r--r--kernel/sysctl.c84
-rw-r--r--kernel/time/Kconfig5
-rw-r--r--kernel/time/clocksource.c5
-rw-r--r--kernel/time/hrtimer.c654
-rw-r--r--kernel/time/posix-clock.c6
-rw-r--r--kernel/time/posix-cpu-timers.c25
-rw-r--r--kernel/time/posix-timers.c31
-rw-r--r--kernel/time/tick-internal.h13
-rw-r--r--kernel/time/tick-sched.c34
-rw-r--r--kernel/time/timekeeping.c45
-rw-r--r--kernel/time/timer.c167
-rw-r--r--kernel/time/timer_list.c2
-rw-r--r--kernel/torture.c39
-rw-r--r--kernel/trace/Kconfig14
-rw-r--r--kernel/trace/Makefile1
-rw-r--r--kernel/trace/blktrace.c30
-rw-r--r--kernel/trace/bpf_trace.c204
-rw-r--r--kernel/trace/ftrace.c383
-rw-r--r--kernel/trace/ring_buffer.c36
-rw-r--r--kernel/trace/trace.c196
-rw-r--r--kernel/trace/trace.h9
-rw-r--r--kernel/trace/trace_benchmark.c2
-rw-r--r--kernel/trace/trace_event_perf.c82
-rw-r--r--kernel/trace/trace_events.c47
-rw-r--r--kernel/trace/trace_events_hist.c128
-rw-r--r--kernel/trace/trace_events_trigger.c13
-rw-r--r--kernel/trace/trace_functions.c49
-rw-r--r--kernel/trace/trace_irqsoff.c133
-rw-r--r--kernel/trace/trace_kprobe.c28
-rw-r--r--kernel/trace/trace_probe.c86
-rw-r--r--kernel/trace/trace_probe.h7
-rw-r--r--kernel/trace/trace_selftest.c34
-rw-r--r--kernel/trace/trace_stack.c4
-rw-r--r--kernel/trace/trace_syscalls.c38
-rw-r--r--kernel/trace/trace_uprobe.c7
-rw-r--r--kernel/trace/tracing_map.c3
-rw-r--r--kernel/trace/tracing_map.h2
-rw-r--r--kernel/tracepoint.c9
-rw-r--r--kernel/uid16.c1
-rw-r--r--kernel/umh.c4
-rw-r--r--kernel/user.c30
-rw-r--r--kernel/user_namespace.c349
-rw-r--r--kernel/workqueue.c117
149 files changed, 8449 insertions, 4249 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index d15c0ee4d955..addf7732fb56 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -102,7 +102,7 @@ static int check_free_space(struct bsd_acct_struct *acct)
{
struct kstatfs sbuf;
- if (time_is_before_jiffies(acct->needcheck))
+ if (time_is_after_jiffies(acct->needcheck))
goto out;
/* May block */
diff --git a/kernel/audit.c b/kernel/audit.c
index be1c28fd4d57..227db99b0f19 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -85,13 +85,13 @@ static int audit_initialized;
#define AUDIT_OFF 0
#define AUDIT_ON 1
#define AUDIT_LOCKED 2
-u32 audit_enabled;
-u32 audit_ever_enabled;
+u32 audit_enabled = AUDIT_OFF;
+bool audit_ever_enabled = !!AUDIT_OFF;
EXPORT_SYMBOL_GPL(audit_enabled);
/* Default state when kernel boots without any parameters. */
-static u32 audit_default;
+static u32 audit_default = AUDIT_OFF;
/* If auditing cannot proceed, audit_failure selects what happens. */
static u32 audit_failure = AUDIT_FAIL_PRINTK;
@@ -1197,25 +1197,28 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
pid_t auditd_pid;
struct pid *req_pid = task_tgid(current);
- /* sanity check - PID values must match */
- if (new_pid != pid_vnr(req_pid))
+ /* Sanity check - PID values must match. Setting
+ * pid to 0 is how auditd ends auditing. */
+ if (new_pid && (new_pid != pid_vnr(req_pid)))
return -EINVAL;
/* test the auditd connection */
audit_replace(req_pid);
auditd_pid = auditd_pid_vnr();
- /* only the current auditd can unregister itself */
- if ((!new_pid) && (new_pid != auditd_pid)) {
- audit_log_config_change("audit_pid", new_pid,
- auditd_pid, 0);
- return -EACCES;
- }
- /* replacing a healthy auditd is not allowed */
- if (auditd_pid && new_pid) {
- audit_log_config_change("audit_pid", new_pid,
- auditd_pid, 0);
- return -EEXIST;
+ if (auditd_pid) {
+ /* replacing a healthy auditd is not allowed */
+ if (new_pid) {
+ audit_log_config_change("audit_pid",
+ new_pid, auditd_pid, 0);
+ return -EEXIST;
+ }
+ /* only current auditd can unregister itself */
+ if (pid_vnr(req_pid) != auditd_pid) {
+ audit_log_config_change("audit_pid",
+ new_pid, auditd_pid, 0);
+ return -EACCES;
+ }
}
if (new_pid) {
@@ -1549,8 +1552,6 @@ static int __init audit_init(void)
register_pernet_subsys(&audit_net_ops);
audit_initialized = AUDIT_INITIALIZED;
- audit_enabled = audit_default;
- audit_ever_enabled |= !!audit_default;
kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
if (IS_ERR(kauditd_task)) {
@@ -1564,14 +1565,21 @@ static int __init audit_init(void)
return 0;
}
-__initcall(audit_init);
+postcore_initcall(audit_init);
/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */
static int __init audit_enable(char *str)
{
- audit_default = !!simple_strtol(str, NULL, 0);
- if (!audit_default)
+ long val;
+
+ if (kstrtol(str, 0, &val))
+ panic("audit: invalid 'audit' parameter value (%s)\n", str);
+ audit_default = (val ? AUDIT_ON : AUDIT_OFF);
+
+ if (audit_default == AUDIT_OFF)
audit_initialized = AUDIT_DISABLED;
+ if (audit_set_enabled(audit_default))
+ panic("audit: error setting audit state (%d)\n", audit_default);
pr_info("%s\n", audit_default ?
"enabled (after initialization)" : "disabled (until reboot)");
@@ -2337,32 +2345,6 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
}
}
-#ifdef CONFIG_SECURITY
-/**
- * audit_log_secctx - Converts and logs SELinux context
- * @ab: audit_buffer
- * @secid: security number
- *
- * This is a helper function that calls security_secid_to_secctx to convert
- * secid to secctx and then adds the (converted) SELinux context to the audit
- * log by calling audit_log_format, thus also preventing leak of internal secid
- * to userspace. If secid cannot be converted audit_panic is called.
- */
-void audit_log_secctx(struct audit_buffer *ab, u32 secid)
-{
- u32 len;
- char *secctx;
-
- if (security_secid_to_secctx(secid, &secctx, &len)) {
- audit_panic("Cannot convert secid to context");
- } else {
- audit_log_format(ab, " obj=%s", secctx);
- security_release_secctx(secctx, len);
- }
-}
-EXPORT_SYMBOL(audit_log_secctx);
-#endif
-
EXPORT_SYMBOL(audit_log_start);
EXPORT_SYMBOL(audit_log_end);
EXPORT_SYMBOL(audit_log_format);
diff --git a/kernel/audit.h b/kernel/audit.h
index 9b110ae17ee3..af5bc59487ed 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -208,7 +208,7 @@ struct audit_context {
struct audit_proctitle proctitle;
};
-extern u32 audit_ever_enabled;
+extern bool audit_ever_enabled;
extern void audit_copy_inode(struct audit_names *name,
const struct dentry *dentry,
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 0b0aa5854dac..4a1758adb222 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -56,7 +56,8 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_filter_list[3]),
LIST_HEAD_INIT(audit_filter_list[4]),
LIST_HEAD_INIT(audit_filter_list[5]),
-#if AUDIT_NR_FILTERS != 6
+ LIST_HEAD_INIT(audit_filter_list[6]),
+#if AUDIT_NR_FILTERS != 7
#error Fix audit_filter_list initialiser
#endif
};
@@ -67,6 +68,7 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_rules_list[3]),
LIST_HEAD_INIT(audit_rules_list[4]),
LIST_HEAD_INIT(audit_rules_list[5]),
+ LIST_HEAD_INIT(audit_rules_list[6]),
};
DEFINE_MUTEX(audit_filter_mutex);
@@ -263,6 +265,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *
#endif
case AUDIT_FILTER_USER:
case AUDIT_FILTER_TYPE:
+ case AUDIT_FILTER_FS:
;
}
if (unlikely(rule->action == AUDIT_POSSIBLE)) {
@@ -338,6 +341,21 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
entry->rule.listnr != AUDIT_FILTER_USER)
return -EINVAL;
break;
+ case AUDIT_FSTYPE:
+ if (entry->rule.listnr != AUDIT_FILTER_FS)
+ return -EINVAL;
+ break;
+ }
+
+ switch(entry->rule.listnr) {
+ case AUDIT_FILTER_FS:
+ switch(f->type) {
+ case AUDIT_FSTYPE:
+ case AUDIT_FILTERKEY:
+ break;
+ default:
+ return -EINVAL;
+ }
}
switch(f->type) {
@@ -391,6 +409,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
return -EINVAL;
/* FALL THROUGH */
case AUDIT_ARCH:
+ case AUDIT_FSTYPE:
if (f->op != Audit_not_equal && f->op != Audit_equal)
return -EINVAL;
break;
@@ -910,10 +929,13 @@ static inline int audit_add_rule(struct audit_entry *entry)
#ifdef CONFIG_AUDITSYSCALL
int dont_count = 0;
- /* If either of these, don't count towards total */
- if (entry->rule.listnr == AUDIT_FILTER_USER ||
- entry->rule.listnr == AUDIT_FILTER_TYPE)
+ /* If any of these, don't count towards total */
+ switch(entry->rule.listnr) {
+ case AUDIT_FILTER_USER:
+ case AUDIT_FILTER_TYPE:
+ case AUDIT_FILTER_FS:
dont_count = 1;
+ }
#endif
mutex_lock(&audit_filter_mutex);
@@ -989,10 +1011,13 @@ int audit_del_rule(struct audit_entry *entry)
#ifdef CONFIG_AUDITSYSCALL
int dont_count = 0;
- /* If either of these, don't count towards total */
- if (entry->rule.listnr == AUDIT_FILTER_USER ||
- entry->rule.listnr == AUDIT_FILTER_TYPE)
+ /* If any of these, don't count towards total */
+ switch(entry->rule.listnr) {
+ case AUDIT_FILTER_USER:
+ case AUDIT_FILTER_TYPE:
+ case AUDIT_FILTER_FS:
dont_count = 1;
+ }
#endif
mutex_lock(&audit_filter_mutex);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9c723e978245..e80459f7e132 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1869,10 +1869,33 @@ void __audit_inode_child(struct inode *parent,
struct inode *inode = d_backing_inode(dentry);
const char *dname = dentry->d_name.name;
struct audit_names *n, *found_parent = NULL, *found_child = NULL;
+ struct audit_entry *e;
+ struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
+ int i;
if (!context->in_syscall)
return;
+ rcu_read_lock();
+ if (!list_empty(list)) {
+ list_for_each_entry_rcu(e, list, list) {
+ for (i = 0; i < e->rule.field_count; i++) {
+ struct audit_field *f = &e->rule.fields[i];
+
+ if (f->type == AUDIT_FSTYPE) {
+ if (audit_comparator(parent->i_sb->s_magic,
+ f->op, f->val)) {
+ if (e->rule.action == AUDIT_NEVER) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ }
+ }
+ }
+ }
+ rcu_read_unlock();
+
if (inode)
handle_one(inode);
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index af3ab6164ff5..e691da0b3bab 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -3,8 +3,11 @@ obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += disasm.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
+obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
+obj-$(CONFIG_BPF_SYSCALL) += offload.o
ifeq ($(CONFIG_STREAM_PARSER),y)
obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index c4b9ab01bba5..ab94d304a634 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -19,6 +19,9 @@
#include "map_in_map.h"
+#define ARRAY_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
static void bpf_array_free_percpu(struct bpf_array *array)
{
int i;
@@ -50,13 +53,15 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
int numa_node = bpf_map_attr_numa_node(attr);
+ u32 elem_size, index_mask, max_entries;
+ bool unpriv = !capable(CAP_SYS_ADMIN);
struct bpf_array *array;
- u64 array_size;
- u32 elem_size;
+ u64 array_size, mask64;
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE ||
+ attr->value_size == 0 ||
+ attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
(percpu && numa_node != NUMA_NO_NODE))
return ERR_PTR(-EINVAL);
@@ -68,11 +73,32 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
elem_size = round_up(attr->value_size, 8);
+ max_entries = attr->max_entries;
+
+ /* On 32 bit archs roundup_pow_of_two() with max_entries that has
+ * upper most bit set in u32 space is undefined behavior due to
+ * resulting 1U << 32, so do it manually here in u64 space.
+ */
+ mask64 = fls_long(max_entries - 1);
+ mask64 = 1ULL << mask64;
+ mask64 -= 1;
+
+ index_mask = mask64;
+ if (unpriv) {
+ /* round up array size to nearest power of 2,
+ * since cpu will speculate within index_mask limits
+ */
+ max_entries = index_mask + 1;
+ /* Check for overflows. */
+ if (max_entries < attr->max_entries)
+ return ERR_PTR(-E2BIG);
+ }
+
array_size = sizeof(*array);
if (percpu)
- array_size += (u64) attr->max_entries * sizeof(void *);
+ array_size += (u64) max_entries * sizeof(void *);
else
- array_size += (u64) attr->max_entries * elem_size;
+ array_size += (u64) max_entries * elem_size;
/* make sure there is no u32 overflow later in round_up() */
if (array_size >= U32_MAX - PAGE_SIZE)
@@ -82,6 +108,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
array = bpf_map_area_alloc(array_size, numa_node);
if (!array)
return ERR_PTR(-ENOMEM);
+ array->index_mask = index_mask;
+ array->map.unpriv_array = unpriv;
/* copy mandatory map attributes */
array->map.map_type = attr->map_type;
@@ -117,12 +145,13 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
if (unlikely(index >= array->map.max_entries))
return NULL;
- return array->value + array->elem_size * index;
+ return array->value + array->elem_size * (index & array->index_mask);
}
/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_insn *insn = insn_buf;
u32 elem_size = round_up(map->value_size, 8);
const int ret = BPF_REG_0;
@@ -131,7 +160,12 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
- *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
+ if (map->unpriv_array) {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
+ } else {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
+ }
if (is_power_of_2(elem_size)) {
*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
@@ -153,7 +187,7 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
if (unlikely(index >= array->map.max_entries))
return NULL;
- return this_cpu_ptr(array->pptrs[index]);
+ return this_cpu_ptr(array->pptrs[index & array->index_mask]);
}
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
@@ -173,7 +207,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
*/
size = round_up(map->value_size, 8);
rcu_read_lock();
- pptr = array->pptrs[index];
+ pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
off += size;
@@ -221,10 +255,11 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
return -EEXIST;
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
- memcpy(this_cpu_ptr(array->pptrs[index]),
+ memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
value, map->value_size);
else
- memcpy(array->value + array->elem_size * index,
+ memcpy(array->value +
+ array->elem_size * (index & array->index_mask),
value, map->value_size);
return 0;
}
@@ -258,7 +293,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
*/
size = round_up(map->value_size, 8);
rcu_read_lock();
- pptr = array->pptrs[index];
+ pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
off += size;
@@ -609,6 +644,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
static u32 array_of_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 elem_size = round_up(map->value_size, 8);
struct bpf_insn *insn = insn_buf;
const int ret = BPF_REG_0;
@@ -617,7 +653,12 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map,
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
- *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+ if (map->unpriv_array) {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
+ } else {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+ }
if (is_power_of_2(elem_size))
*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
else
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 546113430049..b789ab78d28f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -27,129 +27,405 @@ void cgroup_bpf_put(struct cgroup *cgrp)
{
unsigned int type;
- for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) {
- struct bpf_prog *prog = cgrp->bpf.prog[type];
-
- if (prog) {
- bpf_prog_put(prog);
+ for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
+ struct list_head *progs = &cgrp->bpf.progs[type];
+ struct bpf_prog_list *pl, *tmp;
+
+ list_for_each_entry_safe(pl, tmp, progs, node) {
+ list_del(&pl->node);
+ bpf_prog_put(pl->prog);
+ kfree(pl);
static_branch_dec(&cgroup_bpf_enabled_key);
}
+ bpf_prog_array_free(cgrp->bpf.effective[type]);
+ }
+}
+
+/* count number of elements in the list.
+ * it's slow but the list cannot be long
+ */
+static u32 prog_list_length(struct list_head *head)
+{
+ struct bpf_prog_list *pl;
+ u32 cnt = 0;
+
+ list_for_each_entry(pl, head, node) {
+ if (!pl->prog)
+ continue;
+ cnt++;
}
+ return cnt;
+}
+
+/* if parent has non-overridable prog attached,
+ * disallow attaching new programs to the descendent cgroup.
+ * if parent has overridable or multi-prog, allow attaching
+ */
+static bool hierarchy_allows_attach(struct cgroup *cgrp,
+ enum bpf_attach_type type,
+ u32 new_flags)
+{
+ struct cgroup *p;
+
+ p = cgroup_parent(cgrp);
+ if (!p)
+ return true;
+ do {
+ u32 flags = p->bpf.flags[type];
+ u32 cnt;
+
+ if (flags & BPF_F_ALLOW_MULTI)
+ return true;
+ cnt = prog_list_length(&p->bpf.progs[type]);
+ WARN_ON_ONCE(cnt > 1);
+ if (cnt == 1)
+ return !!(flags & BPF_F_ALLOW_OVERRIDE);
+ p = cgroup_parent(p);
+ } while (p);
+ return true;
+}
+
+/* compute a chain of effective programs for a given cgroup:
+ * start from the list of programs in this cgroup and add
+ * all parent programs.
+ * Note that parent's F_ALLOW_OVERRIDE-type program is yielding
+ * to programs in this cgroup
+ */
+static int compute_effective_progs(struct cgroup *cgrp,
+ enum bpf_attach_type type,
+ struct bpf_prog_array __rcu **array)
+{
+ struct bpf_prog_array __rcu *progs;
+ struct bpf_prog_list *pl;
+ struct cgroup *p = cgrp;
+ int cnt = 0;
+
+ /* count number of effective programs by walking parents */
+ do {
+ if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+ cnt += prog_list_length(&p->bpf.progs[type]);
+ p = cgroup_parent(p);
+ } while (p);
+
+ progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
+ if (!progs)
+ return -ENOMEM;
+
+ /* populate the array with effective progs */
+ cnt = 0;
+ p = cgrp;
+ do {
+ if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+ list_for_each_entry(pl,
+ &p->bpf.progs[type], node) {
+ if (!pl->prog)
+ continue;
+ rcu_dereference_protected(progs, 1)->
+ progs[cnt++] = pl->prog;
+ }
+ p = cgroup_parent(p);
+ } while (p);
+
+ *array = progs;
+ return 0;
+}
+
+static void activate_effective_progs(struct cgroup *cgrp,
+ enum bpf_attach_type type,
+ struct bpf_prog_array __rcu *array)
+{
+ struct bpf_prog_array __rcu *old_array;
+
+ old_array = xchg(&cgrp->bpf.effective[type], array);
+ /* free prog array after grace period, since __cgroup_bpf_run_*()
+ * might be still walking the array
+ */
+ bpf_prog_array_free(old_array);
}
/**
* cgroup_bpf_inherit() - inherit effective programs from parent
* @cgrp: the cgroup to modify
- * @parent: the parent to inherit from
*/
-void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
+int cgroup_bpf_inherit(struct cgroup *cgrp)
{
- unsigned int type;
+/* has to use marco instead of const int, since compiler thinks
+ * that array below is variable length
+ */
+#define NR ARRAY_SIZE(cgrp->bpf.effective)
+ struct bpf_prog_array __rcu *arrays[NR] = {};
+ int i;
- for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) {
- struct bpf_prog *e;
+ for (i = 0; i < NR; i++)
+ INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
- e = rcu_dereference_protected(parent->bpf.effective[type],
- lockdep_is_held(&cgroup_mutex));
- rcu_assign_pointer(cgrp->bpf.effective[type], e);
- cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type];
- }
+ for (i = 0; i < NR; i++)
+ if (compute_effective_progs(cgrp, i, &arrays[i]))
+ goto cleanup;
+
+ for (i = 0; i < NR; i++)
+ activate_effective_progs(cgrp, i, arrays[i]);
+
+ return 0;
+cleanup:
+ for (i = 0; i < NR; i++)
+ bpf_prog_array_free(arrays[i]);
+ return -ENOMEM;
}
+#define BPF_CGROUP_MAX_PROGS 64
+
/**
- * __cgroup_bpf_update() - Update the pinned program of a cgroup, and
+ * __cgroup_bpf_attach() - Attach the program to a cgroup, and
* propagate the change to descendants
* @cgrp: The cgroup which descendants to traverse
- * @parent: The parent of @cgrp, or %NULL if @cgrp is the root
- * @prog: A new program to pin
- * @type: Type of pinning operation (ingress/egress)
- *
- * Each cgroup has a set of two pointers for bpf programs; one for eBPF
- * programs it owns, and which is effective for execution.
- *
- * If @prog is not %NULL, this function attaches a new program to the cgroup
- * and releases the one that is currently attached, if any. @prog is then made
- * the effective program of type @type in that cgroup.
- *
- * If @prog is %NULL, the currently attached program of type @type is released,
- * and the effective program of the parent cgroup (if any) is inherited to
- * @cgrp.
- *
- * Then, the descendants of @cgrp are walked and the effective program for
- * each of them is set to the effective program of @cgrp unless the
- * descendant has its own program attached, in which case the subbranch is
- * skipped. This ensures that delegated subcgroups with own programs are left
- * untouched.
+ * @prog: A program to attach
+ * @type: Type of attach operation
*
* Must be called with cgroup_mutex held.
*/
-int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
- struct bpf_prog *prog, enum bpf_attach_type type,
- bool new_overridable)
+int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 flags)
{
- struct bpf_prog *old_prog, *effective = NULL;
- struct cgroup_subsys_state *pos;
- bool overridable = true;
-
- if (parent) {
- overridable = !parent->bpf.disallow_override[type];
- effective = rcu_dereference_protected(parent->bpf.effective[type],
- lockdep_is_held(&cgroup_mutex));
- }
-
- if (prog && effective && !overridable)
- /* if parent has non-overridable prog attached, disallow
- * attaching new programs to descendent cgroup
- */
+ struct list_head *progs = &cgrp->bpf.progs[type];
+ struct bpf_prog *old_prog = NULL;
+ struct cgroup_subsys_state *css;
+ struct bpf_prog_list *pl;
+ bool pl_was_allocated;
+ int err;
+
+ if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
+ /* invalid combination */
+ return -EINVAL;
+
+ if (!hierarchy_allows_attach(cgrp, type, flags))
return -EPERM;
- if (prog && effective && overridable != new_overridable)
- /* if parent has overridable prog attached, only
- * allow overridable programs in descendent cgroup
+ if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
+ /* Disallow attaching non-overridable on top
+ * of existing overridable in this cgroup.
+ * Disallow attaching multi-prog if overridable or none
*/
return -EPERM;
- old_prog = cgrp->bpf.prog[type];
-
- if (prog) {
- overridable = new_overridable;
- effective = prog;
- if (old_prog &&
- cgrp->bpf.disallow_override[type] == new_overridable)
- /* disallow attaching non-overridable on top
- * of existing overridable in this cgroup
- * and vice versa
- */
- return -EPERM;
+ if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
+ return -E2BIG;
+
+ if (flags & BPF_F_ALLOW_MULTI) {
+ list_for_each_entry(pl, progs, node)
+ if (pl->prog == prog)
+ /* disallow attaching the same prog twice */
+ return -EINVAL;
+
+ pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+ if (!pl)
+ return -ENOMEM;
+ pl_was_allocated = true;
+ pl->prog = prog;
+ list_add_tail(&pl->node, progs);
+ } else {
+ if (list_empty(progs)) {
+ pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+ if (!pl)
+ return -ENOMEM;
+ pl_was_allocated = true;
+ list_add_tail(&pl->node, progs);
+ } else {
+ pl = list_first_entry(progs, typeof(*pl), node);
+ old_prog = pl->prog;
+ pl_was_allocated = false;
+ }
+ pl->prog = prog;
}
- if (!prog && !old_prog)
- /* report error when trying to detach and nothing is attached */
- return -ENOENT;
+ cgrp->bpf.flags[type] = flags;
- cgrp->bpf.prog[type] = prog;
+ /* allocate and recompute effective prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
- css_for_each_descendant_pre(pos, &cgrp->self) {
- struct cgroup *desc = container_of(pos, struct cgroup, self);
-
- /* skip the subtree if the descendant has its own program */
- if (desc->bpf.prog[type] && desc != cgrp) {
- pos = css_rightmost_descendant(pos);
- } else {
- rcu_assign_pointer(desc->bpf.effective[type],
- effective);
- desc->bpf.disallow_override[type] = !overridable;
- }
+ err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+ if (err)
+ goto cleanup;
}
- if (prog)
- static_branch_inc(&cgroup_bpf_enabled_key);
+ /* all allocations were successful. Activate all prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+ activate_effective_progs(desc, type, desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ static_branch_inc(&cgroup_bpf_enabled_key);
if (old_prog) {
bpf_prog_put(old_prog);
static_branch_dec(&cgroup_bpf_enabled_key);
}
return 0;
+
+cleanup:
+ /* oom while computing effective. Free all computed effective arrays
+ * since they were not activated
+ */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ bpf_prog_array_free(desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ /* and cleanup the prog list */
+ pl->prog = old_prog;
+ if (pl_was_allocated) {
+ list_del(&pl->node);
+ kfree(pl);
+ }
+ return err;
+}
+
+/**
+ * __cgroup_bpf_detach() - Detach the program from a cgroup, and
+ * propagate the change to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @prog: A program to detach or NULL
+ * @type: Type of detach operation
+ *
+ * Must be called with cgroup_mutex held.
+ */
+int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 unused_flags)
+{
+ struct list_head *progs = &cgrp->bpf.progs[type];
+ u32 flags = cgrp->bpf.flags[type];
+ struct bpf_prog *old_prog = NULL;
+ struct cgroup_subsys_state *css;
+ struct bpf_prog_list *pl;
+ int err;
+
+ if (flags & BPF_F_ALLOW_MULTI) {
+ if (!prog)
+ /* to detach MULTI prog the user has to specify valid FD
+ * of the program to be detached
+ */
+ return -EINVAL;
+ } else {
+ if (list_empty(progs))
+ /* report error when trying to detach and nothing is attached */
+ return -ENOENT;
+ }
+
+ if (flags & BPF_F_ALLOW_MULTI) {
+ /* find the prog and detach it */
+ list_for_each_entry(pl, progs, node) {
+ if (pl->prog != prog)
+ continue;
+ old_prog = prog;
+ /* mark it deleted, so it's ignored while
+ * recomputing effective
+ */
+ pl->prog = NULL;
+ break;
+ }
+ if (!old_prog)
+ return -ENOENT;
+ } else {
+ /* to maintain backward compatibility NONE and OVERRIDE cgroups
+ * allow detaching with invalid FD (prog==NULL)
+ */
+ pl = list_first_entry(progs, typeof(*pl), node);
+ old_prog = pl->prog;
+ pl->prog = NULL;
+ }
+
+ /* allocate and recompute effective prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+ if (err)
+ goto cleanup;
+ }
+
+ /* all allocations were successful. Activate all prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ activate_effective_progs(desc, type, desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ /* now can actually delete it from this cgroup list */
+ list_del(&pl->node);
+ kfree(pl);
+ if (list_empty(progs))
+ /* last program was detached, reset flags to zero */
+ cgrp->bpf.flags[type] = 0;
+
+ bpf_prog_put(old_prog);
+ static_branch_dec(&cgroup_bpf_enabled_key);
+ return 0;
+
+cleanup:
+ /* oom while computing effective. Free all computed effective arrays
+ * since they were not activated
+ */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ bpf_prog_array_free(desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ /* and restore back old_prog */
+ pl->prog = old_prog;
+ return err;
+}
+
+/* Must be called with cgroup_mutex held to avoid races. */
+int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+ enum bpf_attach_type type = attr->query.attach_type;
+ struct list_head *progs = &cgrp->bpf.progs[type];
+ u32 flags = cgrp->bpf.flags[type];
+ int cnt, ret = 0, i;
+
+ if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
+ cnt = bpf_prog_array_length(cgrp->bpf.effective[type]);
+ else
+ cnt = prog_list_length(progs);
+
+ if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
+ return -EFAULT;
+ if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
+ return -EFAULT;
+ if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
+ /* return early if user requested only program count + flags */
+ return 0;
+ if (attr->query.prog_cnt < cnt) {
+ cnt = attr->query.prog_cnt;
+ ret = -ENOSPC;
+ }
+
+ if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
+ return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type],
+ prog_ids, cnt);
+ } else {
+ struct bpf_prog_list *pl;
+ u32 id;
+
+ i = 0;
+ list_for_each_entry(pl, progs, node) {
+ id = pl->prog->aux->id;
+ if (copy_to_user(prog_ids + i, &id, sizeof(id)))
+ return -EFAULT;
+ if (++i == cnt)
+ break;
+ }
+ }
+ return ret;
}
/**
@@ -171,36 +447,26 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
struct sk_buff *skb,
enum bpf_attach_type type)
{
- struct bpf_prog *prog;
+ unsigned int offset = skb->data - skb_network_header(skb);
+ struct sock *save_sk;
struct cgroup *cgrp;
- int ret = 0;
+ int ret;
if (!sk || !sk_fullsock(sk))
return 0;
- if (sk->sk_family != AF_INET &&
- sk->sk_family != AF_INET6)
+ if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
return 0;
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-
- rcu_read_lock();
-
- prog = rcu_dereference(cgrp->bpf.effective[type]);
- if (prog) {
- unsigned int offset = skb->data - skb_network_header(skb);
- struct sock *save_sk = skb->sk;
-
- skb->sk = sk;
- __skb_push(skb, offset);
- ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM;
- __skb_pull(skb, offset);
- skb->sk = save_sk;
- }
-
- rcu_read_unlock();
-
- return ret;
+ save_sk = skb->sk;
+ skb->sk = sk;
+ __skb_push(skb, offset);
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
+ bpf_prog_run_save_cb);
+ __skb_pull(skb, offset);
+ skb->sk = save_sk;
+ return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
@@ -221,19 +487,10 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
enum bpf_attach_type type)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- struct bpf_prog *prog;
- int ret = 0;
+ int ret;
-
- rcu_read_lock();
-
- prog = rcu_dereference(cgrp->bpf.effective[type]);
- if (prog)
- ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM;
-
- rcu_read_unlock();
-
- return ret;
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
+ return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
@@ -258,18 +515,77 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
enum bpf_attach_type type)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- struct bpf_prog *prog;
- int ret = 0;
+ int ret;
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
+ BPF_PROG_RUN);
+ return ret == 1 ? 0 : -EPERM;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
+
+int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
+ short access, enum bpf_attach_type type)
+{
+ struct cgroup *cgrp;
+ struct bpf_cgroup_dev_ctx ctx = {
+ .access_type = (access << 16) | dev_type,
+ .major = major,
+ .minor = minor,
+ };
+ int allow = 1;
rcu_read_lock();
+ cgrp = task_dfl_cgroup(current);
+ allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
+ BPF_PROG_RUN);
+ rcu_read_unlock();
- prog = rcu_dereference(cgrp->bpf.effective[type]);
- if (prog)
- ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM;
+ return !allow;
+}
+EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
- rcu_read_unlock();
+static const struct bpf_func_proto *
+cgroup_dev_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_map_lookup_elem:
+ return &bpf_map_lookup_elem_proto;
+ case BPF_FUNC_map_update_elem:
+ return &bpf_map_update_elem_proto;
+ case BPF_FUNC_map_delete_elem:
+ return &bpf_map_delete_elem_proto;
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
+ case BPF_FUNC_trace_printk:
+ if (capable(CAP_SYS_ADMIN))
+ return bpf_get_trace_printk_proto();
+ default:
+ return NULL;
+ }
+}
- return ret;
+static bool cgroup_dev_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
+{
+ if (type == BPF_WRITE)
+ return false;
+
+ if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
+ return false;
+ /* The verifier guarantees that size > 0. */
+ if (off % size != 0)
+ return false;
+ if (size != sizeof(__u32))
+ return false;
+
+ return true;
}
-EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
+
+const struct bpf_prog_ops cg_dev_prog_ops = {
+};
+
+const struct bpf_verifier_ops cg_dev_verifier_ops = {
+ .get_func_proto = cgroup_dev_func_proto,
+ .is_valid_access = cgroup_dev_is_valid_access,
+};
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7b62df86be1d..7949e8b8f94e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -85,8 +85,6 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
if (fp == NULL)
return NULL;
- kmemcheck_annotate_bitfield(fp, meta);
-
aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
if (aux == NULL) {
vfree(fp);
@@ -127,8 +125,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
if (fp == NULL) {
__bpf_prog_uncharge(fp_old->aux->user, delta);
} else {
- kmemcheck_annotate_bitfield(fp, meta);
-
memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
fp->pages = pages;
fp->aux->prog = fp;
@@ -309,12 +305,25 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,
static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
{
+ const char *end = sym + KSYM_NAME_LEN;
+
BUILD_BUG_ON(sizeof("bpf_prog_") +
- sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN);
+ sizeof(prog->tag) * 2 +
+ /* name has been null terminated.
+ * We should need +1 for the '_' preceding
+ * the name. However, the null character
+ * is double counted between the name and the
+ * sizeof("bpf_prog_") above, so we omit
+ * the +1 here.
+ */
+ sizeof(prog->aux->name) > KSYM_NAME_LEN);
sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
sym = bin2hex(sym, prog->tag, sizeof(prog->tag));
- *sym = 0;
+ if (prog->aux->name[0])
+ snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
+ else
+ *sym = 0;
}
static __always_inline unsigned long
@@ -662,8 +671,6 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL);
if (fp != NULL) {
- kmemcheck_annotate_bitfield(fp, meta);
-
/* aux->prog still points to the fp_other one, so
* when promoting the clone to the real program,
* this still needs to be adapted.
@@ -760,6 +767,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
}
EXPORT_SYMBOL_GPL(__bpf_call_base);
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
/**
* __bpf_prog_run - run eBPF program on a given context
* @ctx: is the data we are operating on
@@ -948,7 +956,7 @@ select_insn:
DST = tmp;
CONT;
ALU_MOD_X:
- if (unlikely(SRC == 0))
+ if (unlikely((u32)SRC == 0))
return 0;
tmp = (u32) DST;
DST = do_div(tmp, (u32) SRC);
@@ -967,7 +975,7 @@ select_insn:
DST = div64_u64(DST, SRC);
CONT;
ALU_DIV_X:
- if (unlikely(SRC == 0))
+ if (unlikely((u32)SRC == 0))
return 0;
tmp = (u32) DST;
do_div(tmp, (u32) SRC);
@@ -1310,6 +1318,14 @@ EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
};
+#else
+static unsigned int __bpf_prog_ret0(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ return 0;
+}
+#endif
+
bool bpf_prog_array_compatible(struct bpf_array *array,
const struct bpf_prog *fp)
{
@@ -1357,9 +1373,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
*/
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
+#else
+ fp->bpf_func = __bpf_prog_ret0;
+#endif
/* eBPF JITs can rewrite the program in case constant
* blinding is active. However, in case of error during
@@ -1367,7 +1387,19 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
* valid program, which in this case would simply not
* be JITed, but falls back to the interpreter.
*/
- fp = bpf_int_jit_compile(fp);
+ if (!bpf_prog_is_dev_bound(fp->aux)) {
+ fp = bpf_int_jit_compile(fp);
+#ifdef CONFIG_BPF_JIT_ALWAYS_ON
+ if (!fp->jited) {
+ *err = -ENOTSUPP;
+ return fp;
+ }
+#endif
+ } else {
+ *err = bpf_prog_offload_compile(fp);
+ if (*err)
+ return fp;
+ }
bpf_prog_lock_ro(fp);
/* The tail call compatibility check can only be done at
@@ -1381,11 +1413,164 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
+static unsigned int __bpf_prog_ret1(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ return 1;
+}
+
+static struct bpf_prog_dummy {
+ struct bpf_prog prog;
+} dummy_bpf_prog = {
+ .prog = {
+ .bpf_func = __bpf_prog_ret1,
+ },
+};
+
+/* to avoid allocating empty bpf_prog_array for cgroups that
+ * don't have bpf program attached use one global 'empty_prog_array'
+ * It will not be modified the caller of bpf_prog_array_alloc()
+ * (since caller requested prog_cnt == 0)
+ * that pointer should be 'freed' by bpf_prog_array_free()
+ */
+static struct {
+ struct bpf_prog_array hdr;
+ struct bpf_prog *null_prog;
+} empty_prog_array = {
+ .null_prog = NULL,
+};
+
+struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
+{
+ if (prog_cnt)
+ return kzalloc(sizeof(struct bpf_prog_array) +
+ sizeof(struct bpf_prog *) * (prog_cnt + 1),
+ flags);
+
+ return &empty_prog_array.hdr;
+}
+
+void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
+{
+ if (!progs ||
+ progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr)
+ return;
+ kfree_rcu(progs, rcu);
+}
+
+int bpf_prog_array_length(struct bpf_prog_array __rcu *progs)
+{
+ struct bpf_prog **prog;
+ u32 cnt = 0;
+
+ rcu_read_lock();
+ prog = rcu_dereference(progs)->progs;
+ for (; *prog; prog++)
+ if (*prog != &dummy_bpf_prog.prog)
+ cnt++;
+ rcu_read_unlock();
+ return cnt;
+}
+
+int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
+ __u32 __user *prog_ids, u32 cnt)
+{
+ struct bpf_prog **prog;
+ u32 i = 0, id;
+
+ rcu_read_lock();
+ prog = rcu_dereference(progs)->progs;
+ for (; *prog; prog++) {
+ id = (*prog)->aux->id;
+ if (copy_to_user(prog_ids + i, &id, sizeof(id))) {
+ rcu_read_unlock();
+ return -EFAULT;
+ }
+ if (++i == cnt) {
+ prog++;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ if (*prog)
+ return -ENOSPC;
+ return 0;
+}
+
+void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs,
+ struct bpf_prog *old_prog)
+{
+ struct bpf_prog **prog = progs->progs;
+
+ for (; *prog; prog++)
+ if (*prog == old_prog) {
+ WRITE_ONCE(*prog, &dummy_bpf_prog.prog);
+ break;
+ }
+}
+
+int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
+ struct bpf_prog *exclude_prog,
+ struct bpf_prog *include_prog,
+ struct bpf_prog_array **new_array)
+{
+ int new_prog_cnt, carry_prog_cnt = 0;
+ struct bpf_prog **existing_prog;
+ struct bpf_prog_array *array;
+ int new_prog_idx = 0;
+
+ /* Figure out how many existing progs we need to carry over to
+ * the new array.
+ */
+ if (old_array) {
+ existing_prog = old_array->progs;
+ for (; *existing_prog; existing_prog++) {
+ if (*existing_prog != exclude_prog &&
+ *existing_prog != &dummy_bpf_prog.prog)
+ carry_prog_cnt++;
+ if (*existing_prog == include_prog)
+ return -EEXIST;
+ }
+ }
+
+ /* How many progs (not NULL) will be in the new array? */
+ new_prog_cnt = carry_prog_cnt;
+ if (include_prog)
+ new_prog_cnt += 1;
+
+ /* Do we have any prog (not NULL) in the new array? */
+ if (!new_prog_cnt) {
+ *new_array = NULL;
+ return 0;
+ }
+
+ /* +1 as the end of prog_array is marked with NULL */
+ array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL);
+ if (!array)
+ return -ENOMEM;
+
+ /* Fill in the new prog array */
+ if (carry_prog_cnt) {
+ existing_prog = old_array->progs;
+ for (; *existing_prog; existing_prog++)
+ if (*existing_prog != exclude_prog &&
+ *existing_prog != &dummy_bpf_prog.prog)
+ array->progs[new_prog_idx++] = *existing_prog;
+ }
+ if (include_prog)
+ array->progs[new_prog_idx++] = include_prog;
+ array->progs[new_prog_idx] = NULL;
+ *new_array = array;
+ return 0;
+}
+
static void bpf_prog_free_deferred(struct work_struct *work)
{
struct bpf_prog_aux *aux;
aux = container_of(work, struct bpf_prog_aux, work);
+ if (bpf_prog_is_dev_bound(aux))
+ bpf_prog_offload_destroy(aux->prog);
bpf_jit_free(aux->prog);
}
@@ -1498,5 +1683,8 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
+/* These are only used within the BPF_SYSCALL code */
+#ifdef CONFIG_BPF_SYSCALL
EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type);
EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu);
+#endif
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
new file mode 100644
index 000000000000..ce5b669003b2
--- /dev/null
+++ b/kernel/bpf/cpumap.c
@@ -0,0 +1,706 @@
+/* bpf/cpumap.c
+ *
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ * Released under terms in GPL version 2. See COPYING.
+ */
+
+/* The 'cpumap' is primarily used as a backend map for XDP BPF helper
+ * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.
+ *
+ * Unlike devmap which redirects XDP frames out another NIC device,
+ * this map type redirects raw XDP frames to another CPU. The remote
+ * CPU will do SKB-allocation and call the normal network stack.
+ *
+ * This is a scalability and isolation mechanism, that allow
+ * separating the early driver network XDP layer, from the rest of the
+ * netstack, and assigning dedicated CPUs for this stage. This
+ * basically allows for 10G wirespeed pre-filtering via bpf.
+ */
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/ptr_ring.h>
+
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/capability.h>
+#include <trace/events/xdp.h>
+
+#include <linux/netdevice.h> /* netif_receive_skb_core */
+#include <linux/etherdevice.h> /* eth_type_trans */
+
+/* General idea: XDP packets getting XDP redirected to another CPU,
+ * will maximum be stored/queued for one driver ->poll() call. It is
+ * guaranteed that setting flush bit and flush operation happen on
+ * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr()
+ * which queue in bpf_cpu_map_entry contains packets.
+ */
+
+#define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */
+struct xdp_bulk_queue {
+ void *q[CPU_MAP_BULK_SIZE];
+ unsigned int count;
+};
+
+/* Struct for every remote "destination" CPU in map */
+struct bpf_cpu_map_entry {
+ u32 cpu; /* kthread CPU and map index */
+ int map_id; /* Back reference to map */
+ u32 qsize; /* Queue size placeholder for map lookup */
+
+ /* XDP can run multiple RX-ring queues, need __percpu enqueue store */
+ struct xdp_bulk_queue __percpu *bulkq;
+
+ /* Queue with potential multi-producers, and single-consumer kthread */
+ struct ptr_ring *queue;
+ struct task_struct *kthread;
+ struct work_struct kthread_stop_wq;
+
+ atomic_t refcnt; /* Control when this struct can be free'ed */
+ struct rcu_head rcu;
+};
+
+struct bpf_cpu_map {
+ struct bpf_map map;
+ /* Below members specific for map type */
+ struct bpf_cpu_map_entry **cpu_map;
+ unsigned long __percpu *flush_needed;
+};
+
+static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
+ struct xdp_bulk_queue *bq);
+
+static u64 cpu_map_bitmap_size(const union bpf_attr *attr)
+{
+ return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+}
+
+static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_cpu_map *cmap;
+ int err = -ENOMEM;
+ u64 cost;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+ return ERR_PTR(-EINVAL);
+
+ cmap = kzalloc(sizeof(*cmap), GFP_USER);
+ if (!cmap)
+ return ERR_PTR(-ENOMEM);
+
+ /* mandatory map attributes */
+ cmap->map.map_type = attr->map_type;
+ cmap->map.key_size = attr->key_size;
+ cmap->map.value_size = attr->value_size;
+ cmap->map.max_entries = attr->max_entries;
+ cmap->map.map_flags = attr->map_flags;
+ cmap->map.numa_node = bpf_map_attr_numa_node(attr);
+
+ /* Pre-limit array size based on NR_CPUS, not final CPU check */
+ if (cmap->map.max_entries > NR_CPUS) {
+ err = -E2BIG;
+ goto free_cmap;
+ }
+
+ /* make sure page count doesn't overflow */
+ cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
+ cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
+ if (cost >= U32_MAX - PAGE_SIZE)
+ goto free_cmap;
+ cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ /* Notice returns -EPERM on if map size is larger than memlock limit */
+ ret = bpf_map_precharge_memlock(cmap->map.pages);
+ if (ret) {
+ err = ret;
+ goto free_cmap;
+ }
+
+ /* A per cpu bitfield with a bit per possible CPU in map */
+ cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
+ __alignof__(unsigned long));
+ if (!cmap->flush_needed)
+ goto free_cmap;
+
+ /* Alloc array for possible remote "destination" CPUs */
+ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
+ sizeof(struct bpf_cpu_map_entry *),
+ cmap->map.numa_node);
+ if (!cmap->cpu_map)
+ goto free_percpu;
+
+ return &cmap->map;
+free_percpu:
+ free_percpu(cmap->flush_needed);
+free_cmap:
+ kfree(cmap);
+ return ERR_PTR(err);
+}
+
+void __cpu_map_queue_destructor(void *ptr)
+{
+ /* The tear-down procedure should have made sure that queue is
+ * empty. See __cpu_map_entry_replace() and work-queue
+ * invoked cpu_map_kthread_stop(). Catch any broken behaviour
+ * gracefully and warn once.
+ */
+ if (WARN_ON_ONCE(ptr))
+ page_frag_free(ptr);
+}
+
+static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+ if (atomic_dec_and_test(&rcpu->refcnt)) {
+ /* The queue should be empty at this point */
+ ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor);
+ kfree(rcpu->queue);
+ kfree(rcpu);
+ }
+}
+
+static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+ atomic_inc(&rcpu->refcnt);
+}
+
+/* called from workqueue, to workaround syscall using preempt_disable */
+static void cpu_map_kthread_stop(struct work_struct *work)
+{
+ struct bpf_cpu_map_entry *rcpu;
+
+ rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
+
+ /* Wait for flush in __cpu_map_entry_free(), via full RCU barrier,
+ * as it waits until all in-flight call_rcu() callbacks complete.
+ */
+ rcu_barrier();
+
+ /* kthread_stop will wake_up_process and wait for it to complete */
+ kthread_stop(rcpu->kthread);
+}
+
+/* For now, xdp_pkt is a cpumap internal data structure, with info
+ * carried between enqueue to dequeue. It is mapped into the top
+ * headroom of the packet, to avoid allocating separate mem.
+ */
+struct xdp_pkt {
+ void *data;
+ u16 len;
+ u16 headroom;
+ u16 metasize;
+ struct net_device *dev_rx;
+};
+
+/* Convert xdp_buff to xdp_pkt */
+static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
+{
+ struct xdp_pkt *xdp_pkt;
+ int metasize;
+ int headroom;
+
+ /* Assure headroom is available for storing info */
+ headroom = xdp->data - xdp->data_hard_start;
+ metasize = xdp->data - xdp->data_meta;
+ metasize = metasize > 0 ? metasize : 0;
+ if (unlikely((headroom - metasize) < sizeof(*xdp_pkt)))
+ return NULL;
+
+ /* Store info in top of packet */
+ xdp_pkt = xdp->data_hard_start;
+
+ xdp_pkt->data = xdp->data;
+ xdp_pkt->len = xdp->data_end - xdp->data;
+ xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
+ xdp_pkt->metasize = metasize;
+
+ return xdp_pkt;
+}
+
+struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
+ struct xdp_pkt *xdp_pkt)
+{
+ unsigned int frame_size;
+ void *pkt_data_start;
+ struct sk_buff *skb;
+
+ /* build_skb need to place skb_shared_info after SKB end, and
+ * also want to know the memory "truesize". Thus, need to
+ * know the memory frame size backing xdp_buff.
+ *
+ * XDP was designed to have PAGE_SIZE frames, but this
+ * assumption is not longer true with ixgbe and i40e. It
+ * would be preferred to set frame_size to 2048 or 4096
+ * depending on the driver.
+ * frame_size = 2048;
+ * frame_len = frame_size - sizeof(*xdp_pkt);
+ *
+ * Instead, with info avail, skb_shared_info in placed after
+ * packet len. This, unfortunately fakes the truesize.
+ * Another disadvantage of this approach, the skb_shared_info
+ * is not at a fixed memory location, with mixed length
+ * packets, which is bad for cache-line hotness.
+ */
+ frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+ pkt_data_start = xdp_pkt->data - xdp_pkt->headroom;
+ skb = build_skb(pkt_data_start, frame_size);
+ if (!skb)
+ return NULL;
+
+ skb_reserve(skb, xdp_pkt->headroom);
+ __skb_put(skb, xdp_pkt->len);
+ if (xdp_pkt->metasize)
+ skb_metadata_set(skb, xdp_pkt->metasize);
+
+ /* Essential SKB info: protocol and skb->dev */
+ skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx);
+
+ /* Optional SKB info, currently missing:
+ * - HW checksum info (skb->ip_summed)
+ * - HW RX hash (skb_set_hash)
+ * - RX ring dev queue index (skb_record_rx_queue)
+ */
+
+ return skb;
+}
+
+static int cpu_map_kthread_run(void *data)
+{
+ struct bpf_cpu_map_entry *rcpu = data;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ /* When kthread gives stop order, then rcpu have been disconnected
+ * from map, thus no new packets can enter. Remaining in-flight
+ * per CPU stored packets are flushed to this queue. Wait honoring
+ * kthread_stop signal until queue is empty.
+ */
+ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
+ unsigned int processed = 0, drops = 0, sched = 0;
+ struct xdp_pkt *xdp_pkt;
+
+ /* Release CPU reschedule checks */
+ if (__ptr_ring_empty(rcpu->queue)) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ /* Recheck to avoid lost wake-up */
+ if (__ptr_ring_empty(rcpu->queue)) {
+ schedule();
+ sched = 1;
+ } else {
+ __set_current_state(TASK_RUNNING);
+ }
+ } else {
+ sched = cond_resched();
+ }
+
+ /* Process packets in rcpu->queue */
+ local_bh_disable();
+ /*
+ * The bpf_cpu_map_entry is single consumer, with this
+ * kthread CPU pinned. Lockless access to ptr_ring
+ * consume side valid as no-resize allowed of queue.
+ */
+ while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) {
+ struct sk_buff *skb;
+ int ret;
+
+ skb = cpu_map_build_skb(rcpu, xdp_pkt);
+ if (!skb) {
+ page_frag_free(xdp_pkt);
+ continue;
+ }
+
+ /* Inject into network stack */
+ ret = netif_receive_skb_core(skb);
+ if (ret == NET_RX_DROP)
+ drops++;
+
+ /* Limit BH-disable period */
+ if (++processed == 8)
+ break;
+ }
+ /* Feedback loop via tracepoint */
+ trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched);
+
+ local_bh_enable(); /* resched point, may call do_softirq() */
+ }
+ __set_current_state(TASK_RUNNING);
+
+ put_cpu_map_entry(rcpu);
+ return 0;
+}
+
+struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id)
+{
+ gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN;
+ struct bpf_cpu_map_entry *rcpu;
+ int numa, err;
+
+ /* Have map->numa_node, but choose node of redirect target CPU */
+ numa = cpu_to_node(cpu);
+
+ rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa);
+ if (!rcpu)
+ return NULL;
+
+ /* Alloc percpu bulkq */
+ rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq),
+ sizeof(void *), gfp);
+ if (!rcpu->bulkq)
+ goto free_rcu;
+
+ /* Alloc queue */
+ rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
+ if (!rcpu->queue)
+ goto free_bulkq;
+
+ err = ptr_ring_init(rcpu->queue, qsize, gfp);
+ if (err)
+ goto free_queue;
+
+ rcpu->cpu = cpu;
+ rcpu->map_id = map_id;
+ rcpu->qsize = qsize;
+
+ /* Setup kthread */
+ rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
+ "cpumap/%d/map:%d", cpu, map_id);
+ if (IS_ERR(rcpu->kthread))
+ goto free_ptr_ring;
+
+ get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
+ get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
+
+ /* Make sure kthread runs on a single CPU */
+ kthread_bind(rcpu->kthread, cpu);
+ wake_up_process(rcpu->kthread);
+
+ return rcpu;
+
+free_ptr_ring:
+ ptr_ring_cleanup(rcpu->queue, NULL);
+free_queue:
+ kfree(rcpu->queue);
+free_bulkq:
+ free_percpu(rcpu->bulkq);
+free_rcu:
+ kfree(rcpu);
+ return NULL;
+}
+
+void __cpu_map_entry_free(struct rcu_head *rcu)
+{
+ struct bpf_cpu_map_entry *rcpu;
+ int cpu;
+
+ /* This cpu_map_entry have been disconnected from map and one
+ * RCU graze-period have elapsed. Thus, XDP cannot queue any
+ * new packets and cannot change/set flush_needed that can
+ * find this entry.
+ */
+ rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
+
+ /* Flush remaining packets in percpu bulkq */
+ for_each_online_cpu(cpu) {
+ struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
+
+ /* No concurrent bq_enqueue can run at this point */
+ bq_flush_to_queue(rcpu, bq);
+ }
+ free_percpu(rcpu->bulkq);
+ /* Cannot kthread_stop() here, last put free rcpu resources */
+ put_cpu_map_entry(rcpu);
+}
+
+/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to
+ * ensure any driver rcu critical sections have completed, but this
+ * does not guarantee a flush has happened yet. Because driver side
+ * rcu_read_lock/unlock only protects the running XDP program. The
+ * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a
+ * pending flush op doesn't fail.
+ *
+ * The bpf_cpu_map_entry is still used by the kthread, and there can
+ * still be pending packets (in queue and percpu bulkq). A refcnt
+ * makes sure to last user (kthread_stop vs. call_rcu) free memory
+ * resources.
+ *
+ * The rcu callback __cpu_map_entry_free flush remaining packets in
+ * percpu bulkq to queue. Due to caller map_delete_elem() disable
+ * preemption, cannot call kthread_stop() to make sure queue is empty.
+ * Instead a work_queue is started for stopping kthread,
+ * cpu_map_kthread_stop, which waits for an RCU graze period before
+ * stopping kthread, emptying the queue.
+ */
+void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
+ u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
+{
+ struct bpf_cpu_map_entry *old_rcpu;
+
+ old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu);
+ if (old_rcpu) {
+ call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
+ INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
+ schedule_work(&old_rcpu->kthread_stop_wq);
+ }
+}
+
+int cpu_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ u32 key_cpu = *(u32 *)key;
+
+ if (key_cpu >= map->max_entries)
+ return -EINVAL;
+
+ /* notice caller map_delete_elem() use preempt_disable() */
+ __cpu_map_entry_replace(cmap, key_cpu, NULL);
+ return 0;
+}
+
+int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ struct bpf_cpu_map_entry *rcpu;
+
+ /* Array index key correspond to CPU number */
+ u32 key_cpu = *(u32 *)key;
+ /* Value is the queue size */
+ u32 qsize = *(u32 *)value;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ return -EINVAL;
+ if (unlikely(key_cpu >= cmap->map.max_entries))
+ return -E2BIG;
+ if (unlikely(map_flags == BPF_NOEXIST))
+ return -EEXIST;
+ if (unlikely(qsize > 16384)) /* sanity limit on qsize */
+ return -EOVERFLOW;
+
+ /* Make sure CPU is a valid possible cpu */
+ if (!cpu_possible(key_cpu))
+ return -ENODEV;
+
+ if (qsize == 0) {
+ rcpu = NULL; /* Same as deleting */
+ } else {
+ /* Updating qsize cause re-allocation of bpf_cpu_map_entry */
+ rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
+ if (!rcpu)
+ return -ENOMEM;
+ }
+ rcu_read_lock();
+ __cpu_map_entry_replace(cmap, key_cpu, rcpu);
+ rcu_read_unlock();
+ return 0;
+}
+
+void cpu_map_free(struct bpf_map *map)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ int cpu;
+ u32 i;
+
+ /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the bpf programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete. The rcu critical section only guarantees
+ * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map.
+ * It does __not__ ensure pending flush operations (if any) are
+ * complete.
+ */
+ synchronize_rcu();
+
+ /* To ensure all pending flush operations have completed wait for flush
+ * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
+ * Because the above synchronize_rcu() ensures the map is disconnected
+ * from the program we can assume no new bits will be set.
+ */
+ for_each_online_cpu(cpu) {
+ unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu);
+
+ while (!bitmap_empty(bitmap, cmap->map.max_entries))
+ cond_resched();
+ }
+
+ /* For cpu_map the remote CPUs can still be using the entries
+ * (struct bpf_cpu_map_entry).
+ */
+ for (i = 0; i < cmap->map.max_entries; i++) {
+ struct bpf_cpu_map_entry *rcpu;
+
+ rcpu = READ_ONCE(cmap->cpu_map[i]);
+ if (!rcpu)
+ continue;
+
+ /* bq flush and cleanup happens after RCU graze-period */
+ __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
+ }
+ free_percpu(cmap->flush_needed);
+ bpf_map_area_free(cmap->cpu_map);
+ kfree(cmap);
+}
+
+struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ struct bpf_cpu_map_entry *rcpu;
+
+ if (key >= map->max_entries)
+ return NULL;
+
+ rcpu = READ_ONCE(cmap->cpu_map[key]);
+ return rcpu;
+}
+
+static void *cpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_cpu_map_entry *rcpu =
+ __cpu_map_lookup_elem(map, *(u32 *)key);
+
+ return rcpu ? &rcpu->qsize : NULL;
+}
+
+static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ u32 index = key ? *(u32 *)key : U32_MAX;
+ u32 *next = next_key;
+
+ if (index >= cmap->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == cmap->map.max_entries - 1)
+ return -ENOENT;
+ *next = index + 1;
+ return 0;
+}
+
+const struct bpf_map_ops cpu_map_ops = {
+ .map_alloc = cpu_map_alloc,
+ .map_free = cpu_map_free,
+ .map_delete_elem = cpu_map_delete_elem,
+ .map_update_elem = cpu_map_update_elem,
+ .map_lookup_elem = cpu_map_lookup_elem,
+ .map_get_next_key = cpu_map_get_next_key,
+};
+
+static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
+ struct xdp_bulk_queue *bq)
+{
+ unsigned int processed = 0, drops = 0;
+ const int to_cpu = rcpu->cpu;
+ struct ptr_ring *q;
+ int i;
+
+ if (unlikely(!bq->count))
+ return 0;
+
+ q = rcpu->queue;
+ spin_lock(&q->producer_lock);
+
+ for (i = 0; i < bq->count; i++) {
+ void *xdp_pkt = bq->q[i];
+ int err;
+
+ err = __ptr_ring_produce(q, xdp_pkt);
+ if (err) {
+ drops++;
+ page_frag_free(xdp_pkt); /* Free xdp_pkt */
+ }
+ processed++;
+ }
+ bq->count = 0;
+ spin_unlock(&q->producer_lock);
+
+ /* Feedback loop via tracepoints */
+ trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
+ return 0;
+}
+
+/* Runs under RCU-read-side, plus in softirq under NAPI protection.
+ * Thus, safe percpu variable access.
+ */
+static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
+{
+ struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
+
+ if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
+ bq_flush_to_queue(rcpu, bq);
+
+ /* Notice, xdp_buff/page MUST be queued here, long enough for
+ * driver to code invoking us to finished, due to driver
+ * (e.g. ixgbe) recycle tricks based on page-refcnt.
+ *
+ * Thus, incoming xdp_pkt is always queued here (else we race
+ * with another CPU on page-refcnt and remaining driver code).
+ * Queue time is very short, as driver will invoke flush
+ * operation, when completing napi->poll call.
+ */
+ bq->q[bq->count++] = xdp_pkt;
+ return 0;
+}
+
+int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
+{
+ struct xdp_pkt *xdp_pkt;
+
+ xdp_pkt = convert_to_xdp_pkt(xdp);
+ if (unlikely(!xdp_pkt))
+ return -EOVERFLOW;
+
+ /* Info needed when constructing SKB on remote CPU */
+ xdp_pkt->dev_rx = dev_rx;
+
+ bq_enqueue(rcpu, xdp_pkt);
+ return 0;
+}
+
+void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
+
+ __set_bit(bit, bitmap);
+}
+
+void __cpu_map_flush(struct bpf_map *map)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
+ u32 bit;
+
+ /* The napi->poll softirq makes sure __cpu_map_insert_ctx()
+ * and __cpu_map_flush() happen on same CPU. Thus, the percpu
+ * bitmap indicate which percpu bulkq have packets.
+ */
+ for_each_set_bit(bit, bitmap, map->max_entries) {
+ struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]);
+ struct xdp_bulk_queue *bq;
+
+ /* This is possible if entry is removed by user space
+ * between xdp redirect and flush op.
+ */
+ if (unlikely(!rcpu))
+ continue;
+
+ __clear_bit(bit, bitmap);
+
+ /* Flush all frames in bulkq to real queue */
+ bq = this_cpu_ptr(rcpu->bulkq);
+ bq_flush_to_queue(rcpu, bq);
+
+ /* If already running, costs spin_lock_irqsave + smb_mb */
+ wake_up_process(rcpu->kthread);
+ }
+}
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index e745d6a88224..ebdef54bf7df 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -50,6 +50,9 @@
#include <linux/bpf.h>
#include <linux/filter.h>
+#define DEV_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
struct bpf_dtab_netdev {
struct net_device *dev;
struct bpf_dtab *dtab;
@@ -83,7 +86,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+ attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
dtab = kzalloc(sizeof(*dtab), GFP_USER);
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
new file mode 100644
index 000000000000..e682850c9715
--- /dev/null
+++ b/kernel/bpf/disasm.c
@@ -0,0 +1,214 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/bpf.h>
+
+#include "disasm.h"
+
+#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
+static const char * const func_id_str[] = {
+ __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
+};
+#undef __BPF_FUNC_STR_FN
+
+const char *func_id_name(int id)
+{
+ BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
+
+ if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
+ return func_id_str[id];
+ else
+ return "unknown";
+}
+
+const char *const bpf_class_string[8] = {
+ [BPF_LD] = "ld",
+ [BPF_LDX] = "ldx",
+ [BPF_ST] = "st",
+ [BPF_STX] = "stx",
+ [BPF_ALU] = "alu",
+ [BPF_JMP] = "jmp",
+ [BPF_RET] = "BUG",
+ [BPF_ALU64] = "alu64",
+};
+
+const char *const bpf_alu_string[16] = {
+ [BPF_ADD >> 4] = "+=",
+ [BPF_SUB >> 4] = "-=",
+ [BPF_MUL >> 4] = "*=",
+ [BPF_DIV >> 4] = "/=",
+ [BPF_OR >> 4] = "|=",
+ [BPF_AND >> 4] = "&=",
+ [BPF_LSH >> 4] = "<<=",
+ [BPF_RSH >> 4] = ">>=",
+ [BPF_NEG >> 4] = "neg",
+ [BPF_MOD >> 4] = "%=",
+ [BPF_XOR >> 4] = "^=",
+ [BPF_MOV >> 4] = "=",
+ [BPF_ARSH >> 4] = "s>>=",
+ [BPF_END >> 4] = "endian",
+};
+
+static const char *const bpf_ldst_string[] = {
+ [BPF_W >> 3] = "u32",
+ [BPF_H >> 3] = "u16",
+ [BPF_B >> 3] = "u8",
+ [BPF_DW >> 3] = "u64",
+};
+
+static const char *const bpf_jmp_string[16] = {
+ [BPF_JA >> 4] = "jmp",
+ [BPF_JEQ >> 4] = "==",
+ [BPF_JGT >> 4] = ">",
+ [BPF_JLT >> 4] = "<",
+ [BPF_JGE >> 4] = ">=",
+ [BPF_JLE >> 4] = "<=",
+ [BPF_JSET >> 4] = "&",
+ [BPF_JNE >> 4] = "!=",
+ [BPF_JSGT >> 4] = "s>",
+ [BPF_JSLT >> 4] = "s<",
+ [BPF_JSGE >> 4] = "s>=",
+ [BPF_JSLE >> 4] = "s<=",
+ [BPF_CALL >> 4] = "call",
+ [BPF_EXIT >> 4] = "exit",
+};
+
+static void print_bpf_end_insn(bpf_insn_print_cb verbose,
+ struct bpf_verifier_env *env,
+ const struct bpf_insn *insn)
+{
+ verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
+ BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
+ insn->imm, insn->dst_reg);
+}
+
+void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
+ const struct bpf_insn *insn, bool allow_ptr_leaks)
+{
+ u8 class = BPF_CLASS(insn->code);
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ if (BPF_OP(insn->code) == BPF_END) {
+ if (class == BPF_ALU64)
+ verbose(env, "BUG_alu64_%02x\n", insn->code);
+ else
+ print_bpf_end_insn(verbose, env, insn);
+ } else if (BPF_OP(insn->code) == BPF_NEG) {
+ verbose(env, "(%02x) r%d = %s-r%d\n",
+ insn->code, insn->dst_reg,
+ class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ verbose(env, "(%02x) %sr%d %s %sr%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->src_reg);
+ } else {
+ verbose(env, "(%02x) %sr%d %s %s%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->imm);
+ }
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_MEM)
+ verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->src_reg);
+ else if (BPF_MODE(insn->code) == BPF_XADD)
+ verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off,
+ insn->src_reg);
+ else
+ verbose(env, "BUG_%02x\n", insn->code);
+ } else if (class == BPF_ST) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose(env, "BUG_st_%02x\n", insn->code);
+ return;
+ }
+ verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->imm);
+ } else if (class == BPF_LDX) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose(env, "BUG_ldx_%02x\n", insn->code);
+ return;
+ }
+ verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n",
+ insn->code, insn->dst_reg,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->off);
+ } else if (class == BPF_LD) {
+ if (BPF_MODE(insn->code) == BPF_ABS) {
+ verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IND) {
+ verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IMM &&
+ BPF_SIZE(insn->code) == BPF_DW) {
+ /* At this point, we already made sure that the second
+ * part of the ldimm64 insn is accessible.
+ */
+ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
+ bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+
+ if (map_ptr && !allow_ptr_leaks)
+ imm = 0;
+
+ verbose(env, "(%02x) r%d = 0x%llx\n", insn->code,
+ insn->dst_reg, (unsigned long long)imm);
+ } else {
+ verbose(env, "BUG_ld_%02x\n", insn->code);
+ return;
+ }
+ } else if (class == BPF_JMP) {
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_CALL) {
+ verbose(env, "(%02x) call %s#%d\n", insn->code,
+ func_id_name(insn->imm), insn->imm);
+ } else if (insn->code == (BPF_JMP | BPF_JA)) {
+ verbose(env, "(%02x) goto pc%+d\n",
+ insn->code, insn->off);
+ } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
+ verbose(env, "(%02x) exit\n", insn->code);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->src_reg, insn->off);
+ } else {
+ verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->imm, insn->off);
+ }
+ } else {
+ verbose(env, "(%02x) %s\n",
+ insn->code, bpf_class_string[class]);
+ }
+}
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
new file mode 100644
index 000000000000..8de977e420b6
--- /dev/null
+++ b/kernel/bpf/disasm.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef __BPF_DISASM_H__
+#define __BPF_DISASM_H__
+
+#include <linux/bpf.h>
+#include <linux/kernel.h>
+#include <linux/stringify.h>
+
+extern const char *const bpf_alu_string[16];
+extern const char *const bpf_class_string[8];
+
+const char *func_id_name(int id);
+
+struct bpf_verifier_env;
+typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env,
+ const char *, ...);
+void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
+ const struct bpf_insn *insn, bool allow_ptr_leaks);
+
+#endif
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 6533f08d1238..3905d4bc5b80 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -18,8 +18,9 @@
#include "bpf_lru_list.h"
#include "map_in_map.h"
-#define HTAB_CREATE_FLAG_MASK \
- (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE)
+#define HTAB_CREATE_FLAG_MASK \
+ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
+ BPF_F_RDONLY | BPF_F_WRONLY)
struct bucket {
struct hlist_nulls_head head;
@@ -113,6 +114,7 @@ static void htab_free_elems(struct bpf_htab *htab)
pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
htab->map.key_size);
free_percpu(pptr);
+ cond_resched();
}
free_elems:
bpf_map_area_free(htab->elems);
@@ -158,6 +160,7 @@ static int prealloc_init(struct bpf_htab *htab)
goto free_elems;
htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
pptr);
+ cond_resched();
}
skip_percpu_elems:
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index be1dde967208..81e2f6995adb 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -150,39 +150,29 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
return 0;
}
-static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry,
- umode_t mode, const struct inode_operations *iops)
+static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
+ const struct inode_operations *iops)
{
- struct inode *inode;
-
- inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG);
+ struct inode *dir = dentry->d_parent->d_inode;
+ struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
inode->i_op = iops;
- inode->i_private = dentry->d_fsdata;
+ inode->i_private = raw;
bpf_dentry_finalize(dentry, inode, dir);
return 0;
}
-static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode,
- dev_t devt)
+static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg)
{
- enum bpf_type type = MINOR(devt);
-
- if (MAJOR(devt) != UNNAMED_MAJOR || !S_ISREG(mode) ||
- dentry->d_fsdata == NULL)
- return -EPERM;
+ return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops);
+}
- switch (type) {
- case BPF_TYPE_PROG:
- return bpf_mkobj_ops(dir, dentry, mode, &bpf_prog_iops);
- case BPF_TYPE_MAP:
- return bpf_mkobj_ops(dir, dentry, mode, &bpf_map_iops);
- default:
- return -EPERM;
- }
+static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
+{
+ return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops);
}
static struct dentry *
@@ -218,7 +208,6 @@ static int bpf_symlink(struct inode *dir, struct dentry *dentry,
static const struct inode_operations bpf_dir_iops = {
.lookup = bpf_lookup,
- .mknod = bpf_mkobj,
.mkdir = bpf_mkdir,
.symlink = bpf_symlink,
.rmdir = simple_rmdir,
@@ -234,7 +223,6 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
struct inode *dir;
struct path path;
umode_t mode;
- dev_t devt;
int ret;
dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0);
@@ -242,9 +230,8 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
return PTR_ERR(dentry);
mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
- devt = MKDEV(UNNAMED_MAJOR, type);
- ret = security_path_mknod(&path, dentry, mode, devt);
+ ret = security_path_mknod(&path, dentry, mode, 0);
if (ret)
goto out;
@@ -254,9 +241,16 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
goto out;
}
- dentry->d_fsdata = raw;
- ret = vfs_mknod(dir, dentry, mode, devt);
- dentry->d_fsdata = NULL;
+ switch (type) {
+ case BPF_TYPE_PROG:
+ ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw);
+ break;
+ case BPF_TYPE_MAP:
+ ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw);
+ break;
+ default:
+ ret = -EPERM;
+ }
out:
done_path_create(&path, dentry);
return ret;
@@ -295,7 +289,7 @@ out:
}
static void *bpf_obj_do_get(const struct filename *pathname,
- enum bpf_type *type)
+ enum bpf_type *type, int flags)
{
struct inode *inode;
struct path path;
@@ -307,7 +301,7 @@ static void *bpf_obj_do_get(const struct filename *pathname,
return ERR_PTR(ret);
inode = d_backing_inode(path.dentry);
- ret = inode_permission(inode, MAY_WRITE);
+ ret = inode_permission(inode, ACC_MODE(flags));
if (ret)
goto out;
@@ -326,18 +320,23 @@ out:
return ERR_PTR(ret);
}
-int bpf_obj_get_user(const char __user *pathname)
+int bpf_obj_get_user(const char __user *pathname, int flags)
{
enum bpf_type type = BPF_TYPE_UNSPEC;
struct filename *pname;
int ret = -ENOENT;
+ int f_flags;
void *raw;
+ f_flags = bpf_get_file_flag(flags);
+ if (f_flags < 0)
+ return f_flags;
+
pname = getname(pathname);
if (IS_ERR(pname))
return PTR_ERR(pname);
- raw = bpf_obj_do_get(pname, &type);
+ raw = bpf_obj_do_get(pname, &type, f_flags);
if (IS_ERR(raw)) {
ret = PTR_ERR(raw);
goto out;
@@ -346,7 +345,7 @@ int bpf_obj_get_user(const char __user *pathname)
if (type == BPF_TYPE_PROG)
ret = bpf_prog_new_fd(raw);
else if (type == BPF_TYPE_MAP)
- ret = bpf_map_new_fd(raw);
+ ret = bpf_map_new_fd(raw, f_flags);
else
goto out;
@@ -363,7 +362,45 @@ out:
putname(pname);
return ret;
}
-EXPORT_SYMBOL_GPL(bpf_obj_get_user);
+
+static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
+{
+ struct bpf_prog *prog;
+ int ret = inode_permission(inode, MAY_READ | MAY_WRITE);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (inode->i_op == &bpf_map_iops)
+ return ERR_PTR(-EINVAL);
+ if (inode->i_op != &bpf_prog_iops)
+ return ERR_PTR(-EACCES);
+
+ prog = inode->i_private;
+
+ ret = security_bpf_prog(prog);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ if (!bpf_prog_get_ok(prog, &type, false))
+ return ERR_PTR(-EINVAL);
+
+ return bpf_prog_inc(prog);
+}
+
+struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type)
+{
+ struct bpf_prog *prog;
+ struct path path;
+ int ret = kern_path(name, LOOKUP_FOLLOW, &path);
+ if (ret)
+ return ERR_PTR(ret);
+ prog = __get_prog_inode(d_backing_inode(path.dentry), type);
+ if (!IS_ERR(prog))
+ touch_atime(&path);
+ path_put(&path);
+ return prog;
+}
+EXPORT_SYMBOL(bpf_prog_get_type_path);
static void bpf_evict_inode(struct inode *inode)
{
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 1b767844a76f..885e45479680 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -389,10 +389,99 @@ out:
return ret;
}
-static int trie_delete_elem(struct bpf_map *map, void *key)
+/* Called from syscall or from eBPF program */
+static int trie_delete_elem(struct bpf_map *map, void *_key)
{
- /* TODO */
- return -ENOSYS;
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ struct bpf_lpm_trie_key *key = _key;
+ struct lpm_trie_node __rcu **trim, **trim2;
+ struct lpm_trie_node *node, *parent;
+ unsigned long irq_flags;
+ unsigned int next_bit;
+ size_t matchlen = 0;
+ int ret = 0;
+
+ if (key->prefixlen > trie->max_prefixlen)
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&trie->lock, irq_flags);
+
+ /* Walk the tree looking for an exact key/length match and keeping
+ * track of the path we traverse. We will need to know the node
+ * we wish to delete, and the slot that points to the node we want
+ * to delete. We may also need to know the nodes parent and the
+ * slot that contains it.
+ */
+ trim = &trie->root;
+ trim2 = trim;
+ parent = NULL;
+ while ((node = rcu_dereference_protected(
+ *trim, lockdep_is_held(&trie->lock)))) {
+ matchlen = longest_prefix_match(trie, node, key);
+
+ if (node->prefixlen != matchlen ||
+ node->prefixlen == key->prefixlen)
+ break;
+
+ parent = node;
+ trim2 = trim;
+ next_bit = extract_bit(key->data, node->prefixlen);
+ trim = &node->child[next_bit];
+ }
+
+ if (!node || node->prefixlen != key->prefixlen ||
+ (node->flags & LPM_TREE_NODE_FLAG_IM)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ trie->n_entries--;
+
+ /* If the node we are removing has two children, simply mark it
+ * as intermediate and we are done.
+ */
+ if (rcu_access_pointer(node->child[0]) &&
+ rcu_access_pointer(node->child[1])) {
+ node->flags |= LPM_TREE_NODE_FLAG_IM;
+ goto out;
+ }
+
+ /* If the parent of the node we are about to delete is an intermediate
+ * node, and the deleted node doesn't have any children, we can delete
+ * the intermediate parent as well and promote its other child
+ * up the tree. Doing this maintains the invariant that all
+ * intermediate nodes have exactly 2 children and that there are no
+ * unnecessary intermediate nodes in the tree.
+ */
+ if (parent && (parent->flags & LPM_TREE_NODE_FLAG_IM) &&
+ !node->child[0] && !node->child[1]) {
+ if (node == rcu_access_pointer(parent->child[0]))
+ rcu_assign_pointer(
+ *trim2, rcu_access_pointer(parent->child[1]));
+ else
+ rcu_assign_pointer(
+ *trim2, rcu_access_pointer(parent->child[0]));
+ kfree_rcu(parent, rcu);
+ kfree_rcu(node, rcu);
+ goto out;
+ }
+
+ /* The node we are removing has either zero or one child. If there
+ * is a child, move it into the removed node's slot then delete
+ * the node. Otherwise just clear the slot and delete the node.
+ */
+ if (node->child[0])
+ rcu_assign_pointer(*trim, rcu_access_pointer(node->child[0]));
+ else if (node->child[1])
+ rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1]));
+ else
+ RCU_INIT_POINTER(*trim, NULL);
+ kfree_rcu(node, rcu);
+
+out:
+ raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+
+ return ret;
}
#define LPM_DATA_SIZE_MAX 256
@@ -406,7 +495,8 @@ static int trie_delete_elem(struct bpf_map *map, void *key)
#define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
#define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
-#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE)
+#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \
+ BPF_F_RDONLY | BPF_F_WRONLY)
static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
new file mode 100644
index 000000000000..8455b89d1bbf
--- /dev/null
+++ b/kernel/bpf/offload.c
@@ -0,0 +1,206 @@
+/*
+ * Copyright (C) 2017 Netronome Systems, Inc.
+ *
+ * This software is licensed under the GNU General License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree.
+ *
+ * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS"
+ * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
+ * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE
+ * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
+ * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+ */
+
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bug.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/printk.h>
+#include <linux/rtnetlink.h>
+
+/* protected by RTNL */
+static LIST_HEAD(bpf_prog_offload_devs);
+
+int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_dev_offload *offload;
+
+ if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
+ attr->prog_type != BPF_PROG_TYPE_XDP)
+ return -EINVAL;
+
+ if (attr->prog_flags)
+ return -EINVAL;
+
+ offload = kzalloc(sizeof(*offload), GFP_USER);
+ if (!offload)
+ return -ENOMEM;
+
+ offload->prog = prog;
+ init_waitqueue_head(&offload->verifier_done);
+
+ rtnl_lock();
+ offload->netdev = __dev_get_by_index(net, attr->prog_ifindex);
+ if (!offload->netdev) {
+ rtnl_unlock();
+ kfree(offload);
+ return -EINVAL;
+ }
+
+ prog->aux->offload = offload;
+ list_add_tail(&offload->offloads, &bpf_prog_offload_devs);
+ rtnl_unlock();
+
+ return 0;
+}
+
+static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
+ struct netdev_bpf *data)
+{
+ struct net_device *netdev = prog->aux->offload->netdev;
+
+ ASSERT_RTNL();
+
+ if (!netdev)
+ return -ENODEV;
+ if (!netdev->netdev_ops->ndo_bpf)
+ return -EOPNOTSUPP;
+
+ data->command = cmd;
+
+ return netdev->netdev_ops->ndo_bpf(netdev, data);
+}
+
+int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
+{
+ struct netdev_bpf data = {};
+ int err;
+
+ data.verifier.prog = env->prog;
+
+ rtnl_lock();
+ err = __bpf_offload_ndo(env->prog, BPF_OFFLOAD_VERIFIER_PREP, &data);
+ if (err)
+ goto exit_unlock;
+
+ env->dev_ops = data.verifier.ops;
+
+ env->prog->aux->offload->dev_state = true;
+ env->prog->aux->offload->verifier_running = true;
+exit_unlock:
+ rtnl_unlock();
+ return err;
+}
+
+static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+ struct bpf_dev_offload *offload = prog->aux->offload;
+ struct netdev_bpf data = {};
+
+ /* Caution - if netdev is destroyed before the program, this function
+ * will be called twice.
+ */
+
+ data.offload.prog = prog;
+
+ if (offload->verifier_running)
+ wait_event(offload->verifier_done, !offload->verifier_running);
+
+ if (offload->dev_state)
+ WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data));
+
+ offload->dev_state = false;
+ list_del_init(&offload->offloads);
+ offload->netdev = NULL;
+}
+
+void bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+ struct bpf_dev_offload *offload = prog->aux->offload;
+
+ offload->verifier_running = false;
+ wake_up(&offload->verifier_done);
+
+ rtnl_lock();
+ __bpf_prog_offload_destroy(prog);
+ rtnl_unlock();
+
+ kfree(offload);
+}
+
+static int bpf_prog_offload_translate(struct bpf_prog *prog)
+{
+ struct bpf_dev_offload *offload = prog->aux->offload;
+ struct netdev_bpf data = {};
+ int ret;
+
+ data.offload.prog = prog;
+
+ offload->verifier_running = false;
+ wake_up(&offload->verifier_done);
+
+ rtnl_lock();
+ ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data);
+ rtnl_unlock();
+
+ return ret;
+}
+
+static unsigned int bpf_prog_warn_on_exec(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ WARN(1, "attempt to execute device eBPF program on the host!");
+ return 0;
+}
+
+int bpf_prog_offload_compile(struct bpf_prog *prog)
+{
+ prog->bpf_func = bpf_prog_warn_on_exec;
+
+ return bpf_prog_offload_translate(prog);
+}
+
+const struct bpf_prog_ops bpf_offload_prog_ops = {
+};
+
+static int bpf_offload_notification(struct notifier_block *notifier,
+ ulong event, void *ptr)
+{
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+ struct bpf_dev_offload *offload, *tmp;
+
+ ASSERT_RTNL();
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ /* ignore namespace changes */
+ if (netdev->reg_state != NETREG_UNREGISTERING)
+ break;
+
+ list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs,
+ offloads) {
+ if (offload->netdev == netdev)
+ __bpf_prog_offload_destroy(offload->prog);
+ }
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block bpf_offload_notifier = {
+ .notifier_call = bpf_offload_notification,
+};
+
+static int __init bpf_offload_init(void)
+{
+ register_netdevice_notifier(&bpf_offload_notifier);
+ return 0;
+}
+
+subsys_initcall(bpf_offload_init);
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 5c51d1985b51..673fa6fe2d73 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -78,8 +78,10 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
{
struct pcpu_freelist_head *head;
struct pcpu_freelist_node *node;
+ unsigned long flags;
int orig_cpu, cpu;
+ local_irq_save(flags);
orig_cpu = cpu = raw_smp_processor_id();
while (1) {
head = per_cpu_ptr(s->freelist, cpu);
@@ -87,14 +89,16 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
node = head->first;
if (node) {
head->first = node->next;
- raw_spin_unlock(&head->lock);
+ raw_spin_unlock_irqrestore(&head->lock, flags);
return node;
}
raw_spin_unlock(&head->lock);
cpu = cpumask_next(cpu, cpu_possible_mask);
if (cpu >= nr_cpu_ids)
cpu = 0;
- if (cpu == orig_cpu)
+ if (cpu == orig_cpu) {
+ local_irq_restore(flags);
return NULL;
+ }
}
}
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index dbd7b322a86b..1712d319c2d8 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -41,6 +41,9 @@
#include <net/strparser.h>
#include <net/tcp.h>
+#define SOCK_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
struct bpf_stab {
struct bpf_map map;
struct sock **sock_map;
@@ -122,7 +125,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
*/
TCP_SKB_CB(skb)->bpf.map = NULL;
skb->sk = psock->sock;
- bpf_compute_data_end_sk_skb(skb);
+ bpf_compute_data_pointers(skb);
preempt_disable();
rc = (*prog->bpf_func)(skb, prog->insnsi);
preempt_enable();
@@ -385,7 +388,7 @@ static int smap_parse_func_strparser(struct strparser *strp,
* any socket yet.
*/
skb->sk = psock->sock;
- bpf_compute_data_end_sk_skb(skb);
+ bpf_compute_data_pointers(skb);
rc = (*prog->bpf_func)(skb, prog->insnsi);
skb->sk = NULL;
rcu_read_unlock();
@@ -508,7 +511,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+ attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
if (attr->value_size > KMALLOC_MAX_SIZE)
@@ -588,8 +591,15 @@ static void sock_map_free(struct bpf_map *map)
write_lock_bh(&sock->sk_callback_lock);
psock = smap_psock_sk(sock);
- smap_list_remove(psock, &stab->sock_map[i]);
- smap_release_sock(psock, sock);
+ /* This check handles a racing sock event that can get the
+ * sk_callback_lock before this case but after xchg happens
+ * causing the refcnt to hit zero and sock user data (psock)
+ * to be null and queued for garbage collection.
+ */
+ if (likely(psock)) {
+ smap_list_remove(psock, &stab->sock_map[i]);
+ smap_release_sock(psock, sock);
+ }
write_unlock_bh(&sock->sk_callback_lock);
}
rcu_read_unlock();
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 135be433e9a0..a15bc636cc98 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -11,6 +11,9 @@
#include <linux/perf_event.h>
#include "percpu_freelist.h"
+#define STACK_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
struct stack_map_bucket {
struct pcpu_freelist_node fnode;
u32 hash;
@@ -60,7 +63,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
- if (attr->map_flags & ~BPF_F_NUMA_NODE)
+ if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
/* check sanity of attributes */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 25d074920a00..5cb783fc8224 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -23,6 +23,9 @@
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/idr.h>
+#include <linux/cred.h>
+#include <linux/timekeeping.h>
+#include <linux/ctype.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -31,6 +34,8 @@
#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
+#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY)
+
DEFINE_PER_CPU(int, bpf_prog_active);
static DEFINE_IDR(prog_idr);
static DEFINE_SPINLOCK(prog_idr_lock);
@@ -207,6 +212,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
struct bpf_map *map = container_of(work, struct bpf_map, work);
bpf_map_uncharge_memlock(map);
+ security_bpf_map_free(map);
/* implementation dependent freeing */
map->ops->map_free(map);
}
@@ -291,17 +297,54 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
}
#endif
-static const struct file_operations bpf_map_fops = {
+static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
+ loff_t *ppos)
+{
+ /* We need this handler such that alloc_file() enables
+ * f_mode with FMODE_CAN_READ.
+ */
+ return -EINVAL;
+}
+
+static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
+ size_t siz, loff_t *ppos)
+{
+ /* We need this handler such that alloc_file() enables
+ * f_mode with FMODE_CAN_WRITE.
+ */
+ return -EINVAL;
+}
+
+const struct file_operations bpf_map_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_map_show_fdinfo,
#endif
.release = bpf_map_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
};
-int bpf_map_new_fd(struct bpf_map *map)
+int bpf_map_new_fd(struct bpf_map *map, int flags)
{
+ int ret;
+
+ ret = security_bpf_map(map, OPEN_FMODE(flags));
+ if (ret < 0)
+ return ret;
+
return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
- O_RDWR | O_CLOEXEC);
+ flags | O_CLOEXEC);
+}
+
+int bpf_get_file_flag(int flags)
+{
+ if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
+ return -EINVAL;
+ if (flags & BPF_F_RDONLY)
+ return O_RDONLY;
+ if (flags & BPF_F_WRONLY)
+ return O_WRONLY;
+ return O_RDWR;
}
/* helper macro to check that unused fields 'union bpf_attr' are zero */
@@ -312,18 +355,46 @@ int bpf_map_new_fd(struct bpf_map *map)
offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
sizeof(attr->CMD##_LAST_FIELD)) != NULL
-#define BPF_MAP_CREATE_LAST_FIELD numa_node
+/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes.
+ * Return 0 on success and < 0 on error.
+ */
+static int bpf_obj_name_cpy(char *dst, const char *src)
+{
+ const char *end = src + BPF_OBJ_NAME_LEN;
+
+ memset(dst, 0, BPF_OBJ_NAME_LEN);
+
+ /* Copy all isalnum() and '_' char */
+ while (src < end && *src) {
+ if (!isalnum(*src) && *src != '_')
+ return -EINVAL;
+ *dst++ = *src++;
+ }
+
+ /* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */
+ if (src == end)
+ return -EINVAL;
+
+ return 0;
+}
+
+#define BPF_MAP_CREATE_LAST_FIELD map_name
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_map *map;
+ int f_flags;
int err;
err = CHECK_ATTR(BPF_MAP_CREATE);
if (err)
return -EINVAL;
+ f_flags = bpf_get_file_flag(attr->map_flags);
+ if (f_flags < 0)
+ return f_flags;
+
if (numa_node != NUMA_NO_NODE &&
((unsigned int)numa_node >= nr_node_ids ||
!node_online(numa_node)))
@@ -334,18 +405,26 @@ static int map_create(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ err = bpf_obj_name_cpy(map->name, attr->map_name);
+ if (err)
+ goto free_map_nouncharge;
+
atomic_set(&map->refcnt, 1);
atomic_set(&map->usercnt, 1);
- err = bpf_map_charge_memlock(map);
+ err = security_bpf_map_alloc(map);
if (err)
goto free_map_nouncharge;
+ err = bpf_map_charge_memlock(map);
+ if (err)
+ goto free_map_sec;
+
err = bpf_map_alloc_id(map);
if (err)
goto free_map;
- err = bpf_map_new_fd(map);
+ err = bpf_map_new_fd(map, f_flags);
if (err < 0) {
/* failed to allocate fd.
* bpf_map_put() is needed because the above
@@ -362,6 +441,8 @@ static int map_create(union bpf_attr *attr)
free_map:
bpf_map_uncharge_memlock(map);
+free_map_sec:
+ security_bpf_map_free(map);
free_map_nouncharge:
map->ops->map_free(map);
return err;
@@ -460,6 +541,11 @@ static int map_lookup_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_READ)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -540,6 +626,11 @@ static int map_update_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -562,6 +653,12 @@ static int map_update_elem(union bpf_attr *attr)
if (copy_from_user(value, uvalue, value_size) != 0)
goto free_value;
+ /* Need to create a kthread, thus must support schedule */
+ if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+ err = map->ops->map_update_elem(map, key, value, attr->flags);
+ goto out;
+ }
+
/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
* inside bpf map update or delete otherwise deadlocks are possible
*/
@@ -592,7 +689,7 @@ static int map_update_elem(union bpf_attr *attr)
}
__this_cpu_dec(bpf_prog_active);
preempt_enable();
-
+out:
if (!err)
trace_bpf_map_update_elem(map, ufd, key, value);
free_value:
@@ -623,6 +720,11 @@ static int map_delete_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -666,6 +768,11 @@ static int map_get_next_key(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_READ)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
if (ukey) {
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
@@ -703,9 +810,9 @@ err_put:
return err;
}
-static const struct bpf_verifier_ops * const bpf_prog_types[] = {
-#define BPF_PROG_TYPE(_id, _ops) \
- [_id] = &_ops,
+static const struct bpf_prog_ops * const bpf_prog_types[] = {
+#define BPF_PROG_TYPE(_id, _name) \
+ [_id] = & _name ## _prog_ops,
#define BPF_MAP_TYPE(_id, _ops)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
@@ -717,7 +824,10 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type])
return -EINVAL;
- prog->aux->ops = bpf_prog_types[type];
+ if (!bpf_prog_is_dev_bound(prog->aux))
+ prog->aux->ops = bpf_prog_types[type];
+ else
+ prog->aux->ops = &bpf_offload_prog_ops;
prog->type = type;
return 0;
}
@@ -820,6 +930,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
free_used_maps(aux);
bpf_prog_uncharge_memlock(aux->prog);
+ security_bpf_prog_free(aux);
bpf_prog_free(aux->prog);
}
@@ -867,15 +978,23 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
}
#endif
-static const struct file_operations bpf_prog_fops = {
+const struct file_operations bpf_prog_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_prog_show_fdinfo,
#endif
.release = bpf_prog_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
};
int bpf_prog_new_fd(struct bpf_prog *prog)
{
+ int ret;
+
+ ret = security_bpf_prog(prog);
+ if (ret < 0)
+ return ret;
+
return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
O_RDWR | O_CLOEXEC);
}
@@ -938,7 +1057,23 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
}
EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
-static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
+bool bpf_prog_get_ok(struct bpf_prog *prog,
+ enum bpf_prog_type *attach_type, bool attach_drv)
+{
+ /* not an attachment, just a refcount inc, always allow */
+ if (!attach_type)
+ return true;
+
+ if (prog->type != *attach_type)
+ return false;
+ if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv)
+ return false;
+
+ return true;
+}
+
+static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
+ bool attach_drv)
{
struct fd f = fdget(ufd);
struct bpf_prog *prog;
@@ -946,7 +1081,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
prog = ____bpf_prog_get(f);
if (IS_ERR(prog))
return prog;
- if (type && prog->type != *type) {
+ if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) {
prog = ERR_PTR(-EINVAL);
goto out;
}
@@ -959,21 +1094,22 @@ out:
struct bpf_prog *bpf_prog_get(u32 ufd)
{
- return __bpf_prog_get(ufd, NULL);
+ return __bpf_prog_get(ufd, NULL, false);
}
-struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
+struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
+ bool attach_drv)
{
- struct bpf_prog *prog = __bpf_prog_get(ufd, &type);
+ struct bpf_prog *prog = __bpf_prog_get(ufd, &type, attach_drv);
if (!IS_ERR(prog))
trace_bpf_prog_get_type(prog);
return prog;
}
-EXPORT_SYMBOL_GPL(bpf_prog_get_type);
+EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD prog_flags
+#define BPF_PROG_LOAD_LAST_FIELD prog_ifindex
static int bpf_prog_load(union bpf_attr *attr)
{
@@ -1015,10 +1151,14 @@ static int bpf_prog_load(union bpf_attr *attr)
if (!prog)
return -ENOMEM;
- err = bpf_prog_charge_memlock(prog);
+ err = security_bpf_prog_alloc(prog->aux);
if (err)
goto free_prog_nouncharge;
+ err = bpf_prog_charge_memlock(prog);
+ if (err)
+ goto free_prog_sec;
+
prog->len = attr->insn_cnt;
err = -EFAULT;
@@ -1032,11 +1172,22 @@ static int bpf_prog_load(union bpf_attr *attr)
atomic_set(&prog->aux->refcnt, 1);
prog->gpl_compatible = is_gpl ? 1 : 0;
+ if (attr->prog_ifindex) {
+ err = bpf_prog_offload_init(prog, attr);
+ if (err)
+ goto free_prog;
+ }
+
/* find program type: socket_filter vs tracing_filter */
err = find_prog_type(type, prog);
if (err < 0)
goto free_prog;
+ prog->aux->load_time = ktime_get_boot_ns();
+ err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
+ if (err)
+ goto free_prog;
+
/* run eBPF verifier */
err = bpf_check(&prog, attr);
if (err < 0)
@@ -1071,16 +1222,18 @@ free_used_maps:
free_used_maps(prog->aux);
free_prog:
bpf_prog_uncharge_memlock(prog);
+free_prog_sec:
+ security_bpf_prog_free(prog->aux);
free_prog_nouncharge:
bpf_prog_free(prog);
return err;
}
-#define BPF_OBJ_LAST_FIELD bpf_fd
+#define BPF_OBJ_LAST_FIELD file_flags
static int bpf_obj_pin(const union bpf_attr *attr)
{
- if (CHECK_ATTR(BPF_OBJ))
+ if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
return -EINVAL;
return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
@@ -1088,10 +1241,12 @@ static int bpf_obj_pin(const union bpf_attr *attr)
static int bpf_obj_get(const union bpf_attr *attr)
{
- if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0)
+ if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
+ attr->file_flags & ~BPF_OBJ_FLAG_MASK)
return -EINVAL;
- return bpf_obj_get_user(u64_to_user_ptr(attr->pathname));
+ return bpf_obj_get_user(u64_to_user_ptr(attr->pathname),
+ attr->file_flags);
}
#ifdef CONFIG_CGROUP_BPF
@@ -1132,6 +1287,9 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach)
return 0;
}
+#define BPF_F_ATTACH_MASK \
+ (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
+
static int bpf_prog_attach(const union bpf_attr *attr)
{
enum bpf_prog_type ptype;
@@ -1145,7 +1303,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
if (CHECK_ATTR(BPF_PROG_ATTACH))
return -EINVAL;
- if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE)
+ if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
return -EINVAL;
switch (attr->attach_type) {
@@ -1159,6 +1317,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_CGROUP_SOCK_OPS:
ptype = BPF_PROG_TYPE_SOCK_OPS;
break;
+ case BPF_CGROUP_DEVICE:
+ ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
+ break;
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
return sockmap_get_from_fd(attr, true);
@@ -1176,8 +1337,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
return PTR_ERR(cgrp);
}
- ret = cgroup_bpf_update(cgrp, prog, attr->attach_type,
- attr->attach_flags & BPF_F_ALLOW_OVERRIDE);
+ ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+ attr->attach_flags);
if (ret)
bpf_prog_put(prog);
cgroup_put(cgrp);
@@ -1189,6 +1350,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
static int bpf_prog_detach(const union bpf_attr *attr)
{
+ enum bpf_prog_type ptype;
+ struct bpf_prog *prog;
struct cgroup *cgrp;
int ret;
@@ -1201,26 +1364,71 @@ static int bpf_prog_detach(const union bpf_attr *attr)
switch (attr->attach_type) {
case BPF_CGROUP_INET_INGRESS:
case BPF_CGROUP_INET_EGRESS:
+ ptype = BPF_PROG_TYPE_CGROUP_SKB;
+ break;
case BPF_CGROUP_INET_SOCK_CREATE:
+ ptype = BPF_PROG_TYPE_CGROUP_SOCK;
+ break;
case BPF_CGROUP_SOCK_OPS:
- cgrp = cgroup_get_from_fd(attr->target_fd);
- if (IS_ERR(cgrp))
- return PTR_ERR(cgrp);
-
- ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false);
- cgroup_put(cgrp);
+ ptype = BPF_PROG_TYPE_SOCK_OPS;
+ break;
+ case BPF_CGROUP_DEVICE:
+ ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
break;
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
- ret = sockmap_get_from_fd(attr, false);
- break;
+ return sockmap_get_from_fd(attr, false);
default:
return -EINVAL;
}
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+ if (IS_ERR(prog))
+ prog = NULL;
+
+ ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
+ if (prog)
+ bpf_prog_put(prog);
+ cgroup_put(cgrp);
return ret;
}
+#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
+
+static int bpf_prog_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct cgroup *cgrp;
+ int ret;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (CHECK_ATTR(BPF_PROG_QUERY))
+ return -EINVAL;
+ if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
+ return -EINVAL;
+
+ switch (attr->query.attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_SOCK_OPS:
+ case BPF_CGROUP_DEVICE:
+ break;
+ default:
+ return -EINVAL;
+ }
+ cgrp = cgroup_get_from_fd(attr->query.target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+ ret = cgroup_bpf_query(cgrp, attr, uattr);
+ cgroup_put(cgrp);
+ return ret;
+}
#endif /* CONFIG_CGROUP_BPF */
#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
@@ -1305,20 +1513,26 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
return fd;
}
-#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id
+#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags
static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
{
struct bpf_map *map;
u32 id = attr->map_id;
+ int f_flags;
int fd;
- if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID))
+ if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) ||
+ attr->open_flags & ~BPF_OBJ_FLAG_MASK)
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ f_flags = bpf_get_file_flag(attr->open_flags);
+ if (f_flags < 0)
+ return f_flags;
+
spin_lock_bh(&map_idr_lock);
map = idr_find(&map_idr, id);
if (map)
@@ -1330,7 +1544,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- fd = bpf_map_new_fd(map);
+ fd = bpf_map_new_fd(map, f_flags);
if (fd < 0)
bpf_map_put(map);
@@ -1358,8 +1572,25 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
info.type = prog->type;
info.id = prog->aux->id;
+ info.load_time = prog->aux->load_time;
+ info.created_by_uid = from_kuid_munged(current_user_ns(),
+ prog->aux->user->uid);
memcpy(info.tag, prog->tag, sizeof(prog->tag));
+ memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
+
+ ulen = info.nr_map_ids;
+ info.nr_map_ids = prog->aux->used_map_cnt;
+ ulen = min_t(u32, info.nr_map_ids, ulen);
+ if (ulen) {
+ u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
+ u32 i;
+
+ for (i = 0; i < ulen; i++)
+ if (put_user(prog->aux->used_maps[i]->id,
+ &user_map_ids[i]))
+ return -EFAULT;
+ }
if (!capable(CAP_SYS_ADMIN)) {
info.jited_prog_len = 0;
@@ -1413,6 +1644,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
info.value_size = map->value_size;
info.max_entries = map->max_entries;
info.map_flags = map->map_flags;
+ memcpy(info.name, map->name, sizeof(map->name));
if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
@@ -1467,6 +1699,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
if (copy_from_user(&attr, uattr, size) != 0)
return -EFAULT;
+ err = security_bpf(cmd, &attr, size);
+ if (err < 0)
+ return err;
+
switch (cmd) {
case BPF_MAP_CREATE:
err = map_create(&attr);
@@ -1499,6 +1735,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_PROG_DETACH:
err = bpf_prog_detach(&attr);
break;
+ case BPF_PROG_QUERY:
+ err = bpf_prog_query(&attr, uattr);
+ break;
#endif
case BPF_PROG_TEST_RUN:
err = bpf_prog_test_run(&attr, uattr);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c48ca2a34b5e..13551e623501 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -21,6 +21,17 @@
#include <linux/vmalloc.h>
#include <linux/stringify.h>
+#include "disasm.h"
+
+static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
+#define BPF_PROG_TYPE(_id, _name) \
+ [_id] = & _name ## _verifier_ops,
+#define BPF_MAP_TYPE(_id, _ops)
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+#undef BPF_MAP_TYPE
+};
+
/* bpf_check() is a static code analyzer that walks eBPF program
* instruction by instruction and updates register/stack state.
* All paths of conditional branches are analyzed until 'bpf_exit' insn.
@@ -153,28 +164,42 @@ struct bpf_call_arg_meta {
int access_size;
};
-/* verbose verifier prints what it's seeing
- * bpf_check() is called under lock, so no race to access these global vars
- */
-static u32 log_level, log_size, log_len;
-static char *log_buf;
-
static DEFINE_MUTEX(bpf_verifier_lock);
/* log_level controls verbosity level of eBPF verifier.
* verbose() is used to dump the verification trace to the log, so the user
* can figure out what's wrong with the program
*/
-static __printf(1, 2) void verbose(const char *fmt, ...)
+static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
+ const char *fmt, ...)
{
+ struct bpf_verifer_log *log = &env->log;
+ unsigned int n;
va_list args;
- if (log_level == 0 || log_len >= log_size - 1)
+ if (!log->level || !log->ubuf || bpf_verifier_log_full(log))
return;
va_start(args, fmt);
- log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
+ n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
va_end(args);
+
+ WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
+ "verifier log line truncated - local buffer too short\n");
+
+ n = min(log->len_total - log->len_used - 1, n);
+ log->kbuf[n] = '\0';
+
+ if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
+ log->len_used += n;
+ else
+ log->ubuf = NULL;
+}
+
+static bool type_is_pkt_pointer(enum bpf_reg_type type)
+{
+ return type == PTR_TO_PACKET ||
+ type == PTR_TO_PACKET_META;
}
/* string representation of 'enum bpf_reg_type' */
@@ -187,26 +212,12 @@ static const char * const reg_type_str[] = {
[PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
[PTR_TO_STACK] = "fp",
[PTR_TO_PACKET] = "pkt",
+ [PTR_TO_PACKET_META] = "pkt_meta",
[PTR_TO_PACKET_END] = "pkt_end",
};
-#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
-static const char * const func_id_str[] = {
- __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
-};
-#undef __BPF_FUNC_STR_FN
-
-static const char *func_id_name(int id)
-{
- BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
-
- if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
- return func_id_str[id];
- else
- return "unknown";
-}
-
-static void print_verifier_state(struct bpf_verifier_state *state)
+static void print_verifier_state(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *state)
{
struct bpf_reg_state *reg;
enum bpf_reg_type t;
@@ -217,21 +228,21 @@ static void print_verifier_state(struct bpf_verifier_state *state)
t = reg->type;
if (t == NOT_INIT)
continue;
- verbose(" R%d=%s", i, reg_type_str[t]);
+ verbose(env, " R%d=%s", i, reg_type_str[t]);
if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
tnum_is_const(reg->var_off)) {
/* reg->off should be 0 for SCALAR_VALUE */
- verbose("%lld", reg->var_off.value + reg->off);
+ verbose(env, "%lld", reg->var_off.value + reg->off);
} else {
- verbose("(id=%d", reg->id);
+ verbose(env, "(id=%d", reg->id);
if (t != SCALAR_VALUE)
- verbose(",off=%d", reg->off);
- if (t == PTR_TO_PACKET)
- verbose(",r=%d", reg->range);
+ verbose(env, ",off=%d", reg->off);
+ if (type_is_pkt_pointer(t))
+ verbose(env, ",r=%d", reg->range);
else if (t == CONST_PTR_TO_MAP ||
t == PTR_TO_MAP_VALUE ||
t == PTR_TO_MAP_VALUE_OR_NULL)
- verbose(",ks=%d,vs=%d",
+ verbose(env, ",ks=%d,vs=%d",
reg->map_ptr->key_size,
reg->map_ptr->value_size);
if (tnum_is_const(reg->var_off)) {
@@ -239,243 +250,174 @@ static void print_verifier_state(struct bpf_verifier_state *state)
* could be a pointer whose offset is too big
* for reg->off
*/
- verbose(",imm=%llx", reg->var_off.value);
+ verbose(env, ",imm=%llx", reg->var_off.value);
} else {
if (reg->smin_value != reg->umin_value &&
reg->smin_value != S64_MIN)
- verbose(",smin_value=%lld",
+ verbose(env, ",smin_value=%lld",
(long long)reg->smin_value);
if (reg->smax_value != reg->umax_value &&
reg->smax_value != S64_MAX)
- verbose(",smax_value=%lld",
+ verbose(env, ",smax_value=%lld",
(long long)reg->smax_value);
if (reg->umin_value != 0)
- verbose(",umin_value=%llu",
+ verbose(env, ",umin_value=%llu",
(unsigned long long)reg->umin_value);
if (reg->umax_value != U64_MAX)
- verbose(",umax_value=%llu",
+ verbose(env, ",umax_value=%llu",
(unsigned long long)reg->umax_value);
if (!tnum_is_unknown(reg->var_off)) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(",var_off=%s", tn_buf);
+ verbose(env, ",var_off=%s", tn_buf);
}
}
- verbose(")");
+ verbose(env, ")");
}
}
- for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
- if (state->stack_slot_type[i] == STACK_SPILL)
- verbose(" fp%d=%s", -MAX_BPF_STACK + i,
- reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]);
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] == STACK_SPILL)
+ verbose(env, " fp%d=%s",
+ -MAX_BPF_STACK + i * BPF_REG_SIZE,
+ reg_type_str[state->stack[i].spilled_ptr.type]);
}
- verbose("\n");
+ verbose(env, "\n");
}
-static const char *const bpf_class_string[] = {
- [BPF_LD] = "ld",
- [BPF_LDX] = "ldx",
- [BPF_ST] = "st",
- [BPF_STX] = "stx",
- [BPF_ALU] = "alu",
- [BPF_JMP] = "jmp",
- [BPF_RET] = "BUG",
- [BPF_ALU64] = "alu64",
-};
-
-static const char *const bpf_alu_string[16] = {
- [BPF_ADD >> 4] = "+=",
- [BPF_SUB >> 4] = "-=",
- [BPF_MUL >> 4] = "*=",
- [BPF_DIV >> 4] = "/=",
- [BPF_OR >> 4] = "|=",
- [BPF_AND >> 4] = "&=",
- [BPF_LSH >> 4] = "<<=",
- [BPF_RSH >> 4] = ">>=",
- [BPF_NEG >> 4] = "neg",
- [BPF_MOD >> 4] = "%=",
- [BPF_XOR >> 4] = "^=",
- [BPF_MOV >> 4] = "=",
- [BPF_ARSH >> 4] = "s>>=",
- [BPF_END >> 4] = "endian",
-};
-
-static const char *const bpf_ldst_string[] = {
- [BPF_W >> 3] = "u32",
- [BPF_H >> 3] = "u16",
- [BPF_B >> 3] = "u8",
- [BPF_DW >> 3] = "u64",
-};
-
-static const char *const bpf_jmp_string[16] = {
- [BPF_JA >> 4] = "jmp",
- [BPF_JEQ >> 4] = "==",
- [BPF_JGT >> 4] = ">",
- [BPF_JLT >> 4] = "<",
- [BPF_JGE >> 4] = ">=",
- [BPF_JLE >> 4] = "<=",
- [BPF_JSET >> 4] = "&",
- [BPF_JNE >> 4] = "!=",
- [BPF_JSGT >> 4] = "s>",
- [BPF_JSLT >> 4] = "s<",
- [BPF_JSGE >> 4] = "s>=",
- [BPF_JSLE >> 4] = "s<=",
- [BPF_CALL >> 4] = "call",
- [BPF_EXIT >> 4] = "exit",
-};
+static int copy_stack_state(struct bpf_verifier_state *dst,
+ const struct bpf_verifier_state *src)
+{
+ if (!src->stack)
+ return 0;
+ if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) {
+ /* internal bug, make state invalid to reject the program */
+ memset(dst, 0, sizeof(*dst));
+ return -EFAULT;
+ }
+ memcpy(dst->stack, src->stack,
+ sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE));
+ return 0;
+}
-static void print_bpf_insn(const struct bpf_verifier_env *env,
- const struct bpf_insn *insn)
+/* do_check() starts with zero-sized stack in struct bpf_verifier_state to
+ * make it consume minimal amount of memory. check_stack_write() access from
+ * the program calls into realloc_verifier_state() to grow the stack size.
+ * Note there is a non-zero 'parent' pointer inside bpf_verifier_state
+ * which this function copies over. It points to previous bpf_verifier_state
+ * which is never reallocated
+ */
+static int realloc_verifier_state(struct bpf_verifier_state *state, int size,
+ bool copy_old)
{
- u8 class = BPF_CLASS(insn->code);
-
- if (class == BPF_ALU || class == BPF_ALU64) {
- if (BPF_SRC(insn->code) == BPF_X)
- verbose("(%02x) %sr%d %s %sr%d\n",
- insn->code, class == BPF_ALU ? "(u32) " : "",
- insn->dst_reg,
- bpf_alu_string[BPF_OP(insn->code) >> 4],
- class == BPF_ALU ? "(u32) " : "",
- insn->src_reg);
- else
- verbose("(%02x) %sr%d %s %s%d\n",
- insn->code, class == BPF_ALU ? "(u32) " : "",
- insn->dst_reg,
- bpf_alu_string[BPF_OP(insn->code) >> 4],
- class == BPF_ALU ? "(u32) " : "",
- insn->imm);
- } else if (class == BPF_STX) {
- if (BPF_MODE(insn->code) == BPF_MEM)
- verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg,
- insn->off, insn->src_reg);
- else if (BPF_MODE(insn->code) == BPF_XADD)
- verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg, insn->off,
- insn->src_reg);
- else
- verbose("BUG_%02x\n", insn->code);
- } else if (class == BPF_ST) {
- if (BPF_MODE(insn->code) != BPF_MEM) {
- verbose("BUG_st_%02x\n", insn->code);
- return;
- }
- verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg,
- insn->off, insn->imm);
- } else if (class == BPF_LDX) {
- if (BPF_MODE(insn->code) != BPF_MEM) {
- verbose("BUG_ldx_%02x\n", insn->code);
- return;
+ u32 old_size = state->allocated_stack;
+ struct bpf_stack_state *new_stack;
+ int slot = size / BPF_REG_SIZE;
+
+ if (size <= old_size || !size) {
+ if (copy_old)
+ return 0;
+ state->allocated_stack = slot * BPF_REG_SIZE;
+ if (!size && old_size) {
+ kfree(state->stack);
+ state->stack = NULL;
}
- verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
- insn->code, insn->dst_reg,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->src_reg, insn->off);
- } else if (class == BPF_LD) {
- if (BPF_MODE(insn->code) == BPF_ABS) {
- verbose("(%02x) r0 = *(%s *)skb[%d]\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->imm);
- } else if (BPF_MODE(insn->code) == BPF_IND) {
- verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->src_reg, insn->imm);
- } else if (BPF_MODE(insn->code) == BPF_IMM &&
- BPF_SIZE(insn->code) == BPF_DW) {
- /* At this point, we already made sure that the second
- * part of the ldimm64 insn is accessible.
- */
- u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
- bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+ return 0;
+ }
+ new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state),
+ GFP_KERNEL);
+ if (!new_stack)
+ return -ENOMEM;
+ if (copy_old) {
+ if (state->stack)
+ memcpy(new_stack, state->stack,
+ sizeof(*new_stack) * (old_size / BPF_REG_SIZE));
+ memset(new_stack + old_size / BPF_REG_SIZE, 0,
+ sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE);
+ }
+ state->allocated_stack = slot * BPF_REG_SIZE;
+ kfree(state->stack);
+ state->stack = new_stack;
+ return 0;
+}
- if (map_ptr && !env->allow_ptr_leaks)
- imm = 0;
+static void free_verifier_state(struct bpf_verifier_state *state,
+ bool free_self)
+{
+ kfree(state->stack);
+ if (free_self)
+ kfree(state);
+}
- verbose("(%02x) r%d = 0x%llx\n", insn->code,
- insn->dst_reg, (unsigned long long)imm);
- } else {
- verbose("BUG_ld_%02x\n", insn->code);
- return;
- }
- } else if (class == BPF_JMP) {
- u8 opcode = BPF_OP(insn->code);
+/* copy verifier state from src to dst growing dst stack space
+ * when necessary to accommodate larger src stack
+ */
+static int copy_verifier_state(struct bpf_verifier_state *dst,
+ const struct bpf_verifier_state *src)
+{
+ int err;
- if (opcode == BPF_CALL) {
- verbose("(%02x) call %s#%d\n", insn->code,
- func_id_name(insn->imm), insn->imm);
- } else if (insn->code == (BPF_JMP | BPF_JA)) {
- verbose("(%02x) goto pc%+d\n",
- insn->code, insn->off);
- } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
- verbose("(%02x) exit\n", insn->code);
- } else if (BPF_SRC(insn->code) == BPF_X) {
- verbose("(%02x) if r%d %s r%d goto pc%+d\n",
- insn->code, insn->dst_reg,
- bpf_jmp_string[BPF_OP(insn->code) >> 4],
- insn->src_reg, insn->off);
- } else {
- verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
- insn->code, insn->dst_reg,
- bpf_jmp_string[BPF_OP(insn->code) >> 4],
- insn->imm, insn->off);
- }
- } else {
- verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
- }
+ err = realloc_verifier_state(dst, src->allocated_stack, false);
+ if (err)
+ return err;
+ memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack));
+ return copy_stack_state(dst, src);
}
-static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx)
+static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
+ int *insn_idx)
{
- struct bpf_verifier_stack_elem *elem;
- int insn_idx;
+ struct bpf_verifier_state *cur = env->cur_state;
+ struct bpf_verifier_stack_elem *elem, *head = env->head;
+ int err;
if (env->head == NULL)
- return -1;
+ return -ENOENT;
- memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state));
- insn_idx = env->head->insn_idx;
+ if (cur) {
+ err = copy_verifier_state(cur, &head->st);
+ if (err)
+ return err;
+ }
+ if (insn_idx)
+ *insn_idx = head->insn_idx;
if (prev_insn_idx)
- *prev_insn_idx = env->head->prev_insn_idx;
- elem = env->head->next;
- kfree(env->head);
+ *prev_insn_idx = head->prev_insn_idx;
+ elem = head->next;
+ free_verifier_state(&head->st, false);
+ kfree(head);
env->head = elem;
env->stack_size--;
- return insn_idx;
+ return 0;
}
static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
int insn_idx, int prev_insn_idx)
{
+ struct bpf_verifier_state *cur = env->cur_state;
struct bpf_verifier_stack_elem *elem;
+ int err;
- elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
+ elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
if (!elem)
goto err;
- memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state));
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
elem->next = env->head;
env->head = elem;
env->stack_size++;
+ err = copy_verifier_state(&elem->st, cur);
+ if (err)
+ goto err;
if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
- verbose("BPF program is too complex\n");
+ verbose(env, "BPF program is too complex\n");
goto err;
}
return &elem->st;
err:
/* pop all elements and return */
- while (pop_stack(env, NULL) >= 0);
+ while (!pop_stack(env, NULL, NULL));
return NULL;
}
@@ -507,10 +449,11 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg)
__mark_reg_known(reg, 0);
}
-static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_known_zero(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno)
{
if (WARN_ON(regno >= MAX_BPF_REG)) {
- verbose("mark_reg_known_zero(regs, %u)\n", regno);
+ verbose(env, "mark_reg_known_zero(regs, %u)\n", regno);
/* Something bad happened, let's kill all regs */
for (regno = 0; regno < MAX_BPF_REG; regno++)
__mark_reg_not_init(regs + regno);
@@ -519,6 +462,31 @@ static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
__mark_reg_known_zero(regs + regno);
}
+static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
+{
+ return type_is_pkt_pointer(reg->type);
+}
+
+static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
+{
+ return reg_is_pkt_pointer(reg) ||
+ reg->type == PTR_TO_PACKET_END;
+}
+
+/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
+static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
+ enum bpf_reg_type which)
+{
+ /* The register can already have a range from prior markings.
+ * This is fine as long as it hasn't been advanced from its
+ * origin.
+ */
+ return reg->type == which &&
+ reg->id == 0 &&
+ reg->off == 0 &&
+ tnum_equals_const(reg->var_off, 0);
+}
+
/* Attempts to improve min/max values based on var_off information */
static void __update_reg_bounds(struct bpf_reg_state *reg)
{
@@ -595,10 +563,11 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg)
__mark_reg_unbounded(reg);
}
-static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_unknown(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno)
{
if (WARN_ON(regno >= MAX_BPF_REG)) {
- verbose("mark_reg_unknown(regs, %u)\n", regno);
+ verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
/* Something bad happened, let's kill all regs */
for (regno = 0; regno < MAX_BPF_REG; regno++)
__mark_reg_not_init(regs + regno);
@@ -613,10 +582,11 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg)
reg->type = NOT_INIT;
}
-static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_not_init(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno)
{
if (WARN_ON(regno >= MAX_BPF_REG)) {
- verbose("mark_reg_not_init(regs, %u)\n", regno);
+ verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
/* Something bad happened, let's kill all regs */
for (regno = 0; regno < MAX_BPF_REG; regno++)
__mark_reg_not_init(regs + regno);
@@ -625,22 +595,23 @@ static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
__mark_reg_not_init(regs + regno);
}
-static void init_reg_state(struct bpf_reg_state *regs)
+static void init_reg_state(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs)
{
int i;
for (i = 0; i < MAX_BPF_REG; i++) {
- mark_reg_not_init(regs, i);
+ mark_reg_not_init(env, regs, i);
regs[i].live = REG_LIVE_NONE;
}
/* frame pointer */
regs[BPF_REG_FP].type = PTR_TO_STACK;
- mark_reg_known_zero(regs, BPF_REG_FP);
+ mark_reg_known_zero(env, regs, BPF_REG_FP);
/* 1st arg to a function */
regs[BPF_REG_1].type = PTR_TO_CTX;
- mark_reg_known_zero(regs, BPF_REG_1);
+ mark_reg_known_zero(env, regs, BPF_REG_1);
}
enum reg_arg_type {
@@ -671,29 +642,29 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
enum reg_arg_type t)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = env->cur_state->regs;
if (regno >= MAX_BPF_REG) {
- verbose("R%d is invalid\n", regno);
+ verbose(env, "R%d is invalid\n", regno);
return -EINVAL;
}
if (t == SRC_OP) {
/* check whether register used as source operand can be read */
if (regs[regno].type == NOT_INIT) {
- verbose("R%d !read_ok\n", regno);
+ verbose(env, "R%d !read_ok\n", regno);
return -EACCES;
}
- mark_reg_read(&env->cur_state, regno);
+ mark_reg_read(env->cur_state, regno);
} else {
/* check whether register used as dest operand can be written to */
if (regno == BPF_REG_FP) {
- verbose("frame pointer is read only\n");
+ verbose(env, "frame pointer is read only\n");
return -EACCES;
}
regs[regno].live |= REG_LIVE_WRITTEN;
if (t == DST_OP)
- mark_reg_unknown(regs, regno);
+ mark_reg_unknown(env, regs, regno);
}
return 0;
}
@@ -706,6 +677,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_STACK:
case PTR_TO_CTX:
case PTR_TO_PACKET:
+ case PTR_TO_PACKET_META:
case PTR_TO_PACKET_END:
case CONST_PTR_TO_MAP:
return true;
@@ -717,35 +689,48 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
/* check_stack_read/write functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
-static int check_stack_write(struct bpf_verifier_state *state, int off,
+static int check_stack_write(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *state, int off,
int size, int value_regno)
{
- int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
+ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
+
+ err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE),
+ true);
+ if (err)
+ return err;
/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
* so it's aligned access and [off, off + size) are within stack limits
*/
+ if (!env->allow_ptr_leaks &&
+ state->stack[spi].slot_type[0] == STACK_SPILL &&
+ size != BPF_REG_SIZE) {
+ verbose(env, "attempt to corrupt spilled pointer on stack\n");
+ return -EACCES;
+ }
if (value_regno >= 0 &&
is_spillable_regtype(state->regs[value_regno].type)) {
/* register containing pointer is being spilled into stack */
if (size != BPF_REG_SIZE) {
- verbose("invalid size of register spill\n");
+ verbose(env, "invalid size of register spill\n");
return -EACCES;
}
/* save register state */
- state->spilled_regs[spi] = state->regs[value_regno];
- state->spilled_regs[spi].live |= REG_LIVE_WRITTEN;
+ state->stack[spi].spilled_ptr = state->regs[value_regno];
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
for (i = 0; i < BPF_REG_SIZE; i++)
- state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
+ state->stack[spi].slot_type[i] = STACK_SPILL;
} else {
/* regular write of data into stack */
- state->spilled_regs[spi] = (struct bpf_reg_state) {};
+ state->stack[spi].spilled_ptr = (struct bpf_reg_state) {};
for (i = 0; i < size; i++)
- state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
+ state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
+ STACK_MISC;
}
return 0;
}
@@ -756,66 +741,72 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo
while (parent) {
/* if read wasn't screened by an earlier write ... */
- if (state->spilled_regs[slot].live & REG_LIVE_WRITTEN)
+ if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN)
break;
/* ... then we depend on parent's value */
- parent->spilled_regs[slot].live |= REG_LIVE_READ;
+ parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ;
state = parent;
parent = state->parent;
}
}
-static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
+static int check_stack_read(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *state, int off, int size,
int value_regno)
{
- u8 *slot_type;
- int i, spi;
+ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
+ u8 *stype;
- slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
+ if (state->allocated_stack <= slot) {
+ verbose(env, "invalid read from stack off %d+0 size %d\n",
+ off, size);
+ return -EACCES;
+ }
+ stype = state->stack[spi].slot_type;
- if (slot_type[0] == STACK_SPILL) {
+ if (stype[0] == STACK_SPILL) {
if (size != BPF_REG_SIZE) {
- verbose("invalid size of register spill\n");
+ verbose(env, "invalid size of register spill\n");
return -EACCES;
}
for (i = 1; i < BPF_REG_SIZE; i++) {
- if (slot_type[i] != STACK_SPILL) {
- verbose("corrupted spill memory\n");
+ if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) {
+ verbose(env, "corrupted spill memory\n");
return -EACCES;
}
}
- spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
-
if (value_regno >= 0) {
/* restore register state from stack */
- state->regs[value_regno] = state->spilled_regs[spi];
+ state->regs[value_regno] = state->stack[spi].spilled_ptr;
mark_stack_slot_read(state, spi);
}
return 0;
} else {
for (i = 0; i < size; i++) {
- if (slot_type[i] != STACK_MISC) {
- verbose("invalid read from stack off %d+%d size %d\n",
+ if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) {
+ verbose(env, "invalid read from stack off %d+%d size %d\n",
off, i, size);
return -EACCES;
}
}
if (value_regno >= 0)
/* have read misc data from the stack */
- mark_reg_unknown(state->regs, value_regno);
+ mark_reg_unknown(env, state->regs, value_regno);
return 0;
}
}
/* check read/write into map element returned by bpf_map_lookup_elem() */
static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
- int size)
+ int size, bool zero_size_allowed)
{
- struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_map *map = regs[regno].map_ptr;
- if (off < 0 || size <= 0 || off + size > map->value_size) {
- verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
+ if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
+ off + size > map->value_size) {
+ verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
map->value_size, off, size);
return -EACCES;
}
@@ -824,9 +815,9 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
/* check read/write into a map element with possible variable offset */
static int check_map_access(struct bpf_verifier_env *env, u32 regno,
- int off, int size)
+ int off, int size, bool zero_size_allowed)
{
- struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state = env->cur_state;
struct bpf_reg_state *reg = &state->regs[regno];
int err;
@@ -834,8 +825,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
* need to try adding each of min_value and max_value to off
* to make sure our theoretical access will be safe.
*/
- if (log_level)
- print_verifier_state(state);
+ if (env->log.level)
+ print_verifier_state(env, state);
/* The minimum value is only important with signed
* comparisons where we can't assume the floor of a
* value is 0. If we are using signed variables for our
@@ -843,13 +834,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
* will have a set floor within our range.
*/
if (reg->smin_value < 0) {
- verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
regno);
return -EACCES;
}
- err = __check_map_access(env, regno, reg->smin_value + off, size);
+ err = __check_map_access(env, regno, reg->smin_value + off, size,
+ zero_size_allowed);
if (err) {
- verbose("R%d min value is outside of the array range\n", regno);
+ verbose(env, "R%d min value is outside of the array range\n",
+ regno);
return err;
}
@@ -858,13 +851,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
* If reg->umax_value + off could overflow, treat that as unbounded too.
*/
if (reg->umax_value >= BPF_MAX_VAR_OFF) {
- verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
+ verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n",
regno);
return -EACCES;
}
- err = __check_map_access(env, regno, reg->umax_value + off, size);
+ err = __check_map_access(env, regno, reg->umax_value + off, size,
+ zero_size_allowed);
if (err)
- verbose("R%d max value is outside of the array range\n", regno);
+ verbose(env, "R%d max value is outside of the array range\n",
+ regno);
return err;
}
@@ -897,13 +892,14 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
}
static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
- int off, int size)
+ int off, int size, bool zero_size_allowed)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = &regs[regno];
- if (off < 0 || size <= 0 || (u64)off + size > reg->range) {
- verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
+ if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
+ (u64)off + size > reg->range) {
+ verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
off, size, regno, reg->id, reg->off, reg->range);
return -EACCES;
}
@@ -911,9 +907,9 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
}
static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
- int size)
+ int size, bool zero_size_allowed)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = &regs[regno];
int err;
@@ -926,13 +922,13 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
* detail to prove they're safe.
*/
if (reg->smin_value < 0) {
- verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
regno);
return -EACCES;
}
- err = __check_packet_access(env, regno, off, size);
+ err = __check_packet_access(env, regno, off, size, zero_size_allowed);
if (err) {
- verbose("R%d offset is outside of the packet\n", regno);
+ verbose(env, "R%d offset is outside of the packet\n", regno);
return err;
}
return err;
@@ -946,12 +942,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
.reg_type = *reg_type,
};
- /* for analyzer ctx accesses are already validated and converted */
- if (env->analyzer_ops)
- return 0;
-
- if (env->prog->aux->ops->is_valid_access &&
- env->prog->aux->ops->is_valid_access(off, size, t, &info)) {
+ if (env->ops->is_valid_access &&
+ env->ops->is_valid_access(off, size, t, &info)) {
/* A non zero info.ctx_field_size indicates that this field is a
* candidate for later verifier transformation to load the whole
* field and then apply a mask when accessed with a narrower
@@ -959,16 +951,16 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
* will only allow for whole field access and rejects any other
* type of narrower access.
*/
- env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
*reg_type = info.reg_type;
+ env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
/* remember the offset of last byte accessed in ctx */
if (env->prog->aux->max_ctx_offset < off + size)
env->prog->aux->max_ctx_offset = off + size;
return 0;
}
- verbose("invalid bpf_context access off=%d size=%d\n", off, size);
+ verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size);
return -EACCES;
}
@@ -983,10 +975,18 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
{
- return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]);
+ return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno);
}
-static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
+static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
+{
+ const struct bpf_reg_state *reg = cur_regs(env) + regno;
+
+ return reg->type == PTR_TO_CTX;
+}
+
+static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
int off, int size, bool strict)
{
struct tnum reg_off;
@@ -1011,7 +1011,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose("misaligned packet access off %d+%s+%d+%d size %d\n",
+ verbose(env,
+ "misaligned packet access off %d+%s+%d+%d size %d\n",
ip_align, tn_buf, reg->off, off, size);
return -EACCES;
}
@@ -1019,7 +1020,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
return 0;
}
-static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,
+static int check_generic_ptr_alignment(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
const char *pointer_desc,
int off, int size, bool strict)
{
@@ -1034,7 +1036,7 @@ static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose("misaligned %saccess off %s+%d+%d size %d\n",
+ verbose(env, "misaligned %saccess off %s+%d+%d size %d\n",
pointer_desc, tn_buf, reg->off, off, size);
return -EACCES;
}
@@ -1051,8 +1053,11 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
switch (reg->type) {
case PTR_TO_PACKET:
- /* special case, because of NET_IP_ALIGN */
- return check_pkt_ptr_alignment(reg, off, size, strict);
+ case PTR_TO_PACKET_META:
+ /* Special case, because of NET_IP_ALIGN. Given metadata sits
+ * right in front, treat it the very same way.
+ */
+ return check_pkt_ptr_alignment(env, reg, off, size, strict);
case PTR_TO_MAP_VALUE:
pointer_desc = "value ";
break;
@@ -1061,11 +1066,40 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
break;
case PTR_TO_STACK:
pointer_desc = "stack ";
+ /* The stack spill tracking logic in check_stack_write()
+ * and check_stack_read() relies on stack accesses being
+ * aligned.
+ */
+ strict = true;
break;
default:
break;
}
- return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict);
+ return check_generic_ptr_alignment(env, reg, pointer_desc, off, size,
+ strict);
+}
+
+/* truncate register to smaller size (in bytes)
+ * must be called with size < BPF_REG_SIZE
+ */
+static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
+{
+ u64 mask;
+
+ /* clear high bits in bit representation */
+ reg->var_off = tnum_cast(reg->var_off, size);
+
+ /* fix arithmetic bounds */
+ mask = ((u64)1 << (size * 8)) - 1;
+ if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) {
+ reg->umin_value &= mask;
+ reg->umax_value &= mask;
+ } else {
+ reg->umin_value = 0;
+ reg->umax_value = mask;
+ }
+ reg->smin_value = reg->umin_value;
+ reg->smax_value = reg->umax_value;
}
/* check whether memory at (regno + off) is accessible for t = (read | write)
@@ -1078,8 +1112,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
int bpf_size, enum bpf_access_type t,
int value_regno)
{
- struct bpf_verifier_state *state = &env->cur_state;
- struct bpf_reg_state *reg = &state->regs[regno];
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = regs + regno;
int size, err = 0;
size = bpf_size_to_bytes(bpf_size);
@@ -1097,27 +1132,28 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (reg->type == PTR_TO_MAP_VALUE) {
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
- verbose("R%d leaks addr into map\n", value_regno);
+ verbose(env, "R%d leaks addr into map\n", value_regno);
return -EACCES;
}
- err = check_map_access(env, regno, off, size);
+ err = check_map_access(env, regno, off, size, false);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown(state->regs, value_regno);
+ mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
enum bpf_reg_type reg_type = SCALAR_VALUE;
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
- verbose("R%d leaks addr into ctx\n", value_regno);
+ verbose(env, "R%d leaks addr into ctx\n", value_regno);
return -EACCES;
}
/* ctx accesses must be at a fixed offset, so that we can
* determine what type of data were returned.
*/
if (reg->off) {
- verbose("dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n",
+ verbose(env,
+ "dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n",
regno, reg->off, off - reg->off);
return -EACCES;
}
@@ -1125,24 +1161,26 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose("variable ctx access var_off=%s off=%d size=%d",
+ verbose(env,
+ "variable ctx access var_off=%s off=%d size=%d",
tn_buf, off, size);
return -EACCES;
}
err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
if (!err && t == BPF_READ && value_regno >= 0) {
/* ctx access returns either a scalar, or a
- * PTR_TO_PACKET[_END]. In the latter case, we know
- * the offset is zero.
+ * PTR_TO_PACKET[_META,_END]. In the latter
+ * case, we know the offset is zero.
*/
if (reg_type == SCALAR_VALUE)
- mark_reg_unknown(state->regs, value_regno);
+ mark_reg_unknown(env, regs, value_regno);
else
- mark_reg_known_zero(state->regs, value_regno);
- state->regs[value_regno].id = 0;
- state->regs[value_regno].off = 0;
- state->regs[value_regno].range = 0;
- state->regs[value_regno].type = reg_type;
+ mark_reg_known_zero(env, regs,
+ value_regno);
+ regs[value_regno].id = 0;
+ regs[value_regno].off = 0;
+ regs[value_regno].range = 0;
+ regs[value_regno].type = reg_type;
}
} else if (reg->type == PTR_TO_STACK) {
@@ -1154,55 +1192,50 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose("variable stack access var_off=%s off=%d size=%d",
+ verbose(env, "variable stack access var_off=%s off=%d size=%d",
tn_buf, off, size);
return -EACCES;
}
off += reg->var_off.value;
if (off >= 0 || off < -MAX_BPF_STACK) {
- verbose("invalid stack off=%d size=%d\n", off, size);
+ verbose(env, "invalid stack off=%d size=%d\n", off,
+ size);
return -EACCES;
}
if (env->prog->aux->stack_depth < -off)
env->prog->aux->stack_depth = -off;
- if (t == BPF_WRITE) {
- if (!env->allow_ptr_leaks &&
- state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
- size != BPF_REG_SIZE) {
- verbose("attempt to corrupt spilled pointer on stack\n");
- return -EACCES;
- }
- err = check_stack_write(state, off, size, value_regno);
- } else {
- err = check_stack_read(state, off, size, value_regno);
- }
- } else if (reg->type == PTR_TO_PACKET) {
+ if (t == BPF_WRITE)
+ err = check_stack_write(env, state, off, size,
+ value_regno);
+ else
+ err = check_stack_read(env, state, off, size,
+ value_regno);
+ } else if (reg_is_pkt_pointer(reg)) {
if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
- verbose("cannot write into packet\n");
+ verbose(env, "cannot write into packet\n");
return -EACCES;
}
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
- verbose("R%d leaks addr into packet\n", value_regno);
+ verbose(env, "R%d leaks addr into packet\n",
+ value_regno);
return -EACCES;
}
- err = check_packet_access(env, regno, off, size);
+ err = check_packet_access(env, regno, off, size, false);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown(state->regs, value_regno);
+ mark_reg_unknown(env, regs, value_regno);
} else {
- verbose("R%d invalid mem access '%s'\n",
- regno, reg_type_str[reg->type]);
+ verbose(env, "R%d invalid mem access '%s'\n", regno,
+ reg_type_str[reg->type]);
return -EACCES;
}
if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
- state->regs[value_regno].type == SCALAR_VALUE) {
+ regs[value_regno].type == SCALAR_VALUE) {
/* b/h/w load zero-extends, mark upper bits as known 0 */
- state->regs[value_regno].var_off = tnum_cast(
- state->regs[value_regno].var_off, size);
- __update_reg_bounds(&state->regs[value_regno]);
+ coerce_reg_to_size(&regs[value_regno], size);
}
return err;
}
@@ -1213,7 +1246,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
insn->imm != 0) {
- verbose("BPF_XADD uses reserved fields\n");
+ verbose(env, "BPF_XADD uses reserved fields\n");
return -EINVAL;
}
@@ -1228,7 +1261,13 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
return err;
if (is_pointer_value(env, insn->src_reg)) {
- verbose("R%d leaks addr into mem\n", insn->src_reg);
+ verbose(env, "R%d leaks addr into mem\n", insn->src_reg);
+ return -EACCES;
+ }
+
+ if (is_ctx_reg(env, insn->dst_reg)) {
+ verbose(env, "BPF_XADD stores into R%d context is not allowed\n",
+ insn->dst_reg);
return -EACCES;
}
@@ -1259,9 +1298,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
int access_size, bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
- struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state = env->cur_state;
struct bpf_reg_state *regs = state->regs;
- int off, i;
+ int off, i, slot, spi;
if (regs[regno].type != PTR_TO_STACK) {
/* Allow zero-byte read from NULL, regardless of pointer type */
@@ -1269,7 +1308,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
register_is_null(regs[regno]))
return 0;
- verbose("R%d type=%s expected=%s\n", regno,
+ verbose(env, "R%d type=%s expected=%s\n", regno,
reg_type_str[regs[regno].type],
reg_type_str[PTR_TO_STACK]);
return -EACCES;
@@ -1280,13 +1319,14 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);
- verbose("invalid variable stack read R%d var_off=%s\n",
+ verbose(env, "invalid variable stack read R%d var_off=%s\n",
regno, tn_buf);
+ return -EACCES;
}
off = regs[regno].off + regs[regno].var_off.value;
if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
- access_size <= 0) {
- verbose("invalid stack type R%d off=%d access_size=%d\n",
+ access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
+ verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
regno, off, access_size);
return -EACCES;
}
@@ -1301,8 +1341,12 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
}
for (i = 0; i < access_size; i++) {
- if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
- verbose("invalid indirect read from stack off %d+%d size %d\n",
+ slot = -(off + i) - 1;
+ spi = slot / BPF_REG_SIZE;
+ if (state->allocated_stack <= slot ||
+ state->stack[spi].slot_type[slot % BPF_REG_SIZE] !=
+ STACK_MISC) {
+ verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
off, i, access_size);
return -EACCES;
}
@@ -1314,13 +1358,16 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
int access_size, bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno];
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
switch (reg->type) {
case PTR_TO_PACKET:
- return check_packet_access(env, regno, reg->off, access_size);
+ case PTR_TO_PACKET_META:
+ return check_packet_access(env, regno, reg->off, access_size,
+ zero_size_allowed);
case PTR_TO_MAP_VALUE:
- return check_map_access(env, regno, reg->off, access_size);
+ return check_map_access(env, regno, reg->off, access_size,
+ zero_size_allowed);
default: /* scalar_value|ptr_to_stack or invalid ptr */
return check_stack_boundary(env, regno, access_size,
zero_size_allowed, meta);
@@ -1331,7 +1378,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
enum bpf_arg_type arg_type,
struct bpf_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno];
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
enum bpf_reg_type expected_type, type = reg->type;
int err = 0;
@@ -1344,22 +1391,24 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
if (arg_type == ARG_ANYTHING) {
if (is_pointer_value(env, regno)) {
- verbose("R%d leaks addr into helper function\n", regno);
+ verbose(env, "R%d leaks addr into helper function\n",
+ regno);
return -EACCES;
}
return 0;
}
- if (type == PTR_TO_PACKET &&
+ if (type_is_pkt_pointer(type) &&
!may_access_direct_pkt_data(env, meta, BPF_READ)) {
- verbose("helper access to the packet is not allowed\n");
+ verbose(env, "helper access to the packet is not allowed\n");
return -EACCES;
}
if (arg_type == ARG_PTR_TO_MAP_KEY ||
arg_type == ARG_PTR_TO_MAP_VALUE) {
expected_type = PTR_TO_STACK;
- if (type != PTR_TO_PACKET && type != expected_type)
+ if (!type_is_pkt_pointer(type) &&
+ type != expected_type)
goto err_type;
} else if (arg_type == ARG_CONST_SIZE ||
arg_type == ARG_CONST_SIZE_OR_ZERO) {
@@ -1375,20 +1424,23 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
if (type != expected_type)
goto err_type;
} else if (arg_type == ARG_PTR_TO_MEM ||
+ arg_type == ARG_PTR_TO_MEM_OR_NULL ||
arg_type == ARG_PTR_TO_UNINIT_MEM) {
expected_type = PTR_TO_STACK;
/* One exception here. In case function allows for NULL to be
* passed in as argument, it's a SCALAR_VALUE type. Final test
* happens during stack boundary checking.
*/
- if (register_is_null(*reg))
+ if (register_is_null(*reg) &&
+ arg_type == ARG_PTR_TO_MEM_OR_NULL)
/* final test in check_stack_boundary() */;
- else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE &&
+ else if (!type_is_pkt_pointer(type) &&
+ type != PTR_TO_MAP_VALUE &&
type != expected_type)
goto err_type;
meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
} else {
- verbose("unsupported arg_type %d\n", arg_type);
+ verbose(env, "unsupported arg_type %d\n", arg_type);
return -EFAULT;
}
@@ -1406,12 +1458,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
* we have to check map_key here. Otherwise it means
* that kernel subsystem misconfigured verifier
*/
- verbose("invalid map_ptr to access map->key\n");
+ verbose(env, "invalid map_ptr to access map->key\n");
return -EACCES;
}
- if (type == PTR_TO_PACKET)
+ if (type_is_pkt_pointer(type))
err = check_packet_access(env, regno, reg->off,
- meta->map_ptr->key_size);
+ meta->map_ptr->key_size,
+ false);
else
err = check_stack_boundary(env, regno,
meta->map_ptr->key_size,
@@ -1422,12 +1475,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
*/
if (!meta->map_ptr) {
/* kernel subsystem misconfigured verifier */
- verbose("invalid map_ptr to access map->value\n");
+ verbose(env, "invalid map_ptr to access map->value\n");
return -EACCES;
}
- if (type == PTR_TO_PACKET)
+ if (type_is_pkt_pointer(type))
err = check_packet_access(env, regno, reg->off,
- meta->map_ptr->value_size);
+ meta->map_ptr->value_size,
+ false);
else
err = check_stack_boundary(env, regno,
meta->map_ptr->value_size,
@@ -1442,7 +1496,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
*/
if (regno == 0) {
/* kernel subsystem misconfigured verifier */
- verbose("ARG_CONST_SIZE cannot be first argument\n");
+ verbose(env,
+ "ARG_CONST_SIZE cannot be first argument\n");
return -EACCES;
}
@@ -1459,7 +1514,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
meta = NULL;
if (reg->smin_value < 0) {
- verbose("R%d min value is negative, either use unsigned or 'var &= const'\n",
+ verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
regno);
return -EACCES;
}
@@ -1473,7 +1528,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
}
if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
- verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+ verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
regno);
return -EACCES;
}
@@ -1484,12 +1539,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return err;
err_type:
- verbose("R%d type=%s expected=%s\n", regno,
+ verbose(env, "R%d type=%s expected=%s\n", regno,
reg_type_str[type], reg_type_str[expected_type]);
return -EACCES;
}
-static int check_map_func_compatibility(struct bpf_map *map, int func_id)
+static int check_map_func_compatibility(struct bpf_verifier_env *env,
+ struct bpf_map *map, int func_id)
{
if (!map)
return 0;
@@ -1502,7 +1558,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
break;
case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
if (func_id != BPF_FUNC_perf_event_read &&
- func_id != BPF_FUNC_perf_event_output)
+ func_id != BPF_FUNC_perf_event_output &&
+ func_id != BPF_FUNC_perf_event_read_value)
goto error;
break;
case BPF_MAP_TYPE_STACK_TRACE:
@@ -1522,6 +1579,11 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
if (func_id != BPF_FUNC_redirect_map)
goto error;
break;
+ /* Restrict bpf side of cpumap, open when use-cases appear */
+ case BPF_MAP_TYPE_CPUMAP:
+ if (func_id != BPF_FUNC_redirect_map)
+ goto error;
+ break;
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
case BPF_MAP_TYPE_HASH_OF_MAPS:
if (func_id != BPF_FUNC_map_lookup_elem)
@@ -1545,6 +1607,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
break;
case BPF_FUNC_perf_event_read:
case BPF_FUNC_perf_event_output:
+ case BPF_FUNC_perf_event_read_value:
if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
goto error;
break;
@@ -1558,7 +1621,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
goto error;
break;
case BPF_FUNC_redirect_map:
- if (map->map_type != BPF_MAP_TYPE_DEVMAP)
+ if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
+ map->map_type != BPF_MAP_TYPE_CPUMAP)
goto error;
break;
case BPF_FUNC_sk_redirect_map:
@@ -1575,7 +1639,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
return 0;
error:
- verbose("cannot pass map_type %d into func %s#%d\n",
+ verbose(env, "cannot pass map_type %d into func %s#%d\n",
map->map_type, func_id_name(func_id), func_id);
return -EINVAL;
}
@@ -1598,61 +1662,65 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
return count > 1 ? -EINVAL : 0;
}
-/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid,
- * so turn them into unknown SCALAR_VALUE.
+/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
+ * are now invalid, so turn them into unknown SCALAR_VALUE.
*/
static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
- struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state = env->cur_state;
struct bpf_reg_state *regs = state->regs, *reg;
int i;
for (i = 0; i < MAX_BPF_REG; i++)
- if (regs[i].type == PTR_TO_PACKET ||
- regs[i].type == PTR_TO_PACKET_END)
- mark_reg_unknown(regs, i);
+ if (reg_is_pkt_pointer_any(&regs[i]))
+ mark_reg_unknown(env, regs, i);
- for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
- if (state->stack_slot_type[i] != STACK_SPILL)
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
continue;
- reg = &state->spilled_regs[i / BPF_REG_SIZE];
- if (reg->type != PTR_TO_PACKET &&
- reg->type != PTR_TO_PACKET_END)
- continue;
- __mark_reg_unknown(reg);
+ reg = &state->stack[i].spilled_ptr;
+ if (reg_is_pkt_pointer_any(reg))
+ __mark_reg_unknown(reg);
}
}
static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
{
- struct bpf_verifier_state *state = &env->cur_state;
const struct bpf_func_proto *fn = NULL;
- struct bpf_reg_state *regs = state->regs;
+ struct bpf_reg_state *regs;
struct bpf_call_arg_meta meta;
bool changes_data;
int i, err;
/* find function prototype */
if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
- verbose("invalid func %s#%d\n", func_id_name(func_id), func_id);
+ verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
+ func_id);
return -EINVAL;
}
- if (env->prog->aux->ops->get_func_proto)
- fn = env->prog->aux->ops->get_func_proto(func_id);
+ if (env->ops->get_func_proto)
+ fn = env->ops->get_func_proto(func_id);
if (!fn) {
- verbose("unknown func %s#%d\n", func_id_name(func_id), func_id);
+ verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
+ func_id);
return -EINVAL;
}
/* eBPF programs must be GPL compatible to use GPL-ed functions */
if (!env->prog->gpl_compatible && fn->gpl_only) {
- verbose("cannot call GPL only function from proprietary program\n");
+ verbose(env, "cannot call GPL only function from proprietary program\n");
return -EINVAL;
}
+ /* With LD_ABS/IND some JITs save/restore skb from r1. */
changes_data = bpf_helper_changes_pkt_data(fn->func);
+ if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
+ verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n",
+ func_id_name(func_id), func_id);
+ return -EINVAL;
+ }
memset(&meta, 0, sizeof(meta));
meta.pkt_access = fn->pkt_access;
@@ -1662,7 +1730,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
*/
err = check_raw_mode(fn);
if (err) {
- verbose("kernel subsystem misconfigured func %s#%d\n",
+ verbose(env, "kernel subsystem misconfigured func %s#%d\n",
func_id_name(func_id), func_id);
return err;
}
@@ -1674,6 +1742,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
if (err)
return err;
+ if (func_id == BPF_FUNC_tail_call) {
+ if (meta.map_ptr == NULL) {
+ verbose(env, "verifier bug\n");
+ return -EINVAL;
+ }
+ env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr;
+ }
err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
if (err)
return err;
@@ -1693,16 +1768,17 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
return err;
}
+ regs = cur_regs(env);
/* reset caller saved regs */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
- mark_reg_not_init(regs, caller_saved[i]);
+ mark_reg_not_init(env, regs, caller_saved[i]);
check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
}
/* update return register (already marked as written above) */
if (fn->ret_type == RET_INTEGER) {
/* sets type to SCALAR_VALUE */
- mark_reg_unknown(regs, BPF_REG_0);
+ mark_reg_unknown(env, regs, BPF_REG_0);
} else if (fn->ret_type == RET_VOID) {
regs[BPF_REG_0].type = NOT_INIT;
} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
@@ -1710,14 +1786,15 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
/* There is no offset yet applied, variable or fixed */
- mark_reg_known_zero(regs, BPF_REG_0);
+ mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].off = 0;
/* remember map_ptr, so that check_map_access()
* can check 'value_size' boundary of memory access
* to map element returned from bpf_map_lookup_elem()
*/
if (meta.map_ptr == NULL) {
- verbose("kernel subsystem misconfigured verifier\n");
+ verbose(env,
+ "kernel subsystem misconfigured verifier\n");
return -EINVAL;
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
@@ -1728,12 +1805,12 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
else if (insn_aux->map_ptr != meta.map_ptr)
insn_aux->map_ptr = BPF_MAP_PTR_POISON;
} else {
- verbose("unknown return type %d of func %s#%d\n",
+ verbose(env, "unknown return type %d of func %s#%d\n",
fn->ret_type, func_id_name(func_id), func_id);
return -EINVAL;
}
- err = check_map_func_compatibility(meta.map_ptr, func_id);
+ err = check_map_func_compatibility(env, meta.map_ptr, func_id);
if (err)
return err;
@@ -1742,14 +1819,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
return 0;
}
-static void coerce_reg_to_32(struct bpf_reg_state *reg)
-{
- /* clear high 32 bits */
- reg->var_off = tnum_cast(reg->var_off, 4);
- /* Update bounds */
- __update_reg_bounds(reg);
-}
-
static bool signed_add_overflows(s64 a, s64 b)
{
/* Do the add in u64, where overflow is well-defined */
@@ -1770,6 +1839,41 @@ static bool signed_sub_overflows(s64 a, s64 b)
return res > a;
}
+static bool check_reg_sane_offset(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ enum bpf_reg_type type)
+{
+ bool known = tnum_is_const(reg->var_off);
+ s64 val = reg->var_off.value;
+ s64 smin = reg->smin_value;
+
+ if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
+ verbose(env, "math between %s pointer and %lld is not allowed\n",
+ reg_type_str[type], val);
+ return false;
+ }
+
+ if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
+ verbose(env, "%s pointer offset %d is not allowed\n",
+ reg_type_str[type], reg->off);
+ return false;
+ }
+
+ if (smin == S64_MIN) {
+ verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
+ reg_type_str[type]);
+ return false;
+ }
+
+ if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) {
+ verbose(env, "value %lld makes %s pointer be out of bounds\n",
+ smin, reg_type_str[type]);
+ return false;
+ }
+
+ return true;
+}
+
/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
* Caller should also handle BPF_MOV case separately.
* If we return -EACCES, caller may want to try again treating pointer as a
@@ -1780,7 +1884,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
const struct bpf_reg_state *ptr_reg,
const struct bpf_reg_state *off_reg)
{
- struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
+ struct bpf_reg_state *regs = cur_regs(env), *dst_reg;
bool known = tnum_is_const(off_reg->var_off);
s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
@@ -1791,41 +1895,36 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst_reg = &regs[dst];
- if (WARN_ON_ONCE(known && (smin_val != smax_val))) {
- print_verifier_state(&env->cur_state);
- verbose("verifier internal error: known but bad sbounds\n");
- return -EINVAL;
- }
- if (WARN_ON_ONCE(known && (umin_val != umax_val))) {
- print_verifier_state(&env->cur_state);
- verbose("verifier internal error: known but bad ubounds\n");
- return -EINVAL;
+ if ((known && (smin_val != smax_val || umin_val != umax_val)) ||
+ smin_val > smax_val || umin_val > umax_val) {
+ /* Taint dst register if offset had invalid bounds derived from
+ * e.g. dead branches.
+ */
+ __mark_reg_unknown(dst_reg);
+ return 0;
}
if (BPF_CLASS(insn->code) != BPF_ALU64) {
/* 32-bit ALU ops on pointers produce (meaningless) scalars */
- if (!env->allow_ptr_leaks)
- verbose("R%d 32-bit pointer arithmetic prohibited\n",
- dst);
+ verbose(env,
+ "R%d 32-bit pointer arithmetic prohibited\n",
+ dst);
return -EACCES;
}
if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
- if (!env->allow_ptr_leaks)
- verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
- dst);
+ verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
+ dst);
return -EACCES;
}
if (ptr_reg->type == CONST_PTR_TO_MAP) {
- if (!env->allow_ptr_leaks)
- verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
- dst);
+ verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
+ dst);
return -EACCES;
}
if (ptr_reg->type == PTR_TO_PACKET_END) {
- if (!env->allow_ptr_leaks)
- verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
- dst);
+ verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
+ dst);
return -EACCES;
}
@@ -1835,6 +1934,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst_reg->type = ptr_reg->type;
dst_reg->id = ptr_reg->id;
+ if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) ||
+ !check_reg_sane_offset(env, ptr_reg, ptr_reg->type))
+ return -EINVAL;
+
switch (opcode) {
case BPF_ADD:
/* We can take a fixed offset as long as it doesn't overflow
@@ -1879,7 +1982,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
}
dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
dst_reg->off = ptr_reg->off;
- if (ptr_reg->type == PTR_TO_PACKET) {
+ if (reg_is_pkt_pointer(ptr_reg)) {
dst_reg->id = ++env->id_gen;
/* something was added to pkt_ptr, set range to zero */
dst_reg->range = 0;
@@ -1888,9 +1991,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
case BPF_SUB:
if (dst_reg == off_reg) {
/* scalar -= pointer. Creates an unknown scalar */
- if (!env->allow_ptr_leaks)
- verbose("R%d tried to subtract pointer from scalar\n",
- dst);
+ verbose(env, "R%d tried to subtract pointer from scalar\n",
+ dst);
return -EACCES;
}
/* We don't allow subtraction from FP, because (according to
@@ -1898,9 +2000,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
* be able to deal with it.
*/
if (ptr_reg->type == PTR_TO_STACK) {
- if (!env->allow_ptr_leaks)
- verbose("R%d subtraction from stack pointer prohibited\n",
- dst);
+ verbose(env, "R%d subtraction from stack pointer prohibited\n",
+ dst);
return -EACCES;
}
if (known && (ptr_reg->off - smin_val ==
@@ -1939,7 +2040,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
}
dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
dst_reg->off = ptr_reg->off;
- if (ptr_reg->type == PTR_TO_PACKET) {
+ if (reg_is_pkt_pointer(ptr_reg)) {
dst_reg->id = ++env->id_gen;
/* something was added to pkt_ptr, set range to zero */
if (smin_val < 0)
@@ -1949,44 +2050,42 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
case BPF_AND:
case BPF_OR:
case BPF_XOR:
- /* bitwise ops on pointers are troublesome, prohibit for now.
- * (However, in principle we could allow some cases, e.g.
- * ptr &= ~3 which would reduce min_value by 3.)
- */
- if (!env->allow_ptr_leaks)
- verbose("R%d bitwise operator %s on pointer prohibited\n",
- dst, bpf_alu_string[opcode >> 4]);
+ /* bitwise ops on pointers are troublesome, prohibit. */
+ verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
+ dst, bpf_alu_string[opcode >> 4]);
return -EACCES;
default:
/* other operators (e.g. MUL,LSH) produce non-pointer results */
- if (!env->allow_ptr_leaks)
- verbose("R%d pointer arithmetic with %s operator prohibited\n",
- dst, bpf_alu_string[opcode >> 4]);
+ verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
+ dst, bpf_alu_string[opcode >> 4]);
return -EACCES;
}
+ if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type))
+ return -EINVAL;
+
__update_reg_bounds(dst_reg);
__reg_deduce_bounds(dst_reg);
__reg_bound_offset(dst_reg);
return 0;
}
+/* WARNING: This function does calculations on 64-bit values, but the actual
+ * execution may occur on 32-bit values. Therefore, things like bitshifts
+ * need extra checks in the 32-bit case.
+ */
static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
struct bpf_insn *insn,
struct bpf_reg_state *dst_reg,
struct bpf_reg_state src_reg)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
u8 opcode = BPF_OP(insn->code);
bool src_known, dst_known;
s64 smin_val, smax_val;
u64 umin_val, umax_val;
+ u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
- if (BPF_CLASS(insn->code) != BPF_ALU64) {
- /* 32-bit ALU ops are (32,32)->64 */
- coerce_reg_to_32(dst_reg);
- coerce_reg_to_32(&src_reg);
- }
smin_val = src_reg.smin_value;
smax_val = src_reg.smax_value;
umin_val = src_reg.umin_value;
@@ -1994,6 +2093,21 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
src_known = tnum_is_const(src_reg.var_off);
dst_known = tnum_is_const(dst_reg->var_off);
+ if ((src_known && (smin_val != smax_val || umin_val != umax_val)) ||
+ smin_val > smax_val || umin_val > umax_val) {
+ /* Taint dst register if offset had invalid bounds derived from
+ * e.g. dead branches.
+ */
+ __mark_reg_unknown(dst_reg);
+ return 0;
+ }
+
+ if (!src_known &&
+ opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) {
+ __mark_reg_unknown(dst_reg);
+ return 0;
+ }
+
switch (opcode) {
case BPF_ADD:
if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
@@ -2122,11 +2236,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
__update_reg_bounds(dst_reg);
break;
case BPF_LSH:
- if (umax_val > 63) {
- /* Shifts greater than 63 are undefined. This includes
- * shifts by a negative number.
+ if (umax_val >= insn_bitness) {
+ /* Shifts greater than 31 or 63 are undefined.
+ * This includes shifts by a negative number.
*/
- mark_reg_unknown(regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->dst_reg);
break;
}
/* We lose all sign bit information (except what we can pick
@@ -2150,27 +2264,29 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
__update_reg_bounds(dst_reg);
break;
case BPF_RSH:
- if (umax_val > 63) {
- /* Shifts greater than 63 are undefined. This includes
- * shifts by a negative number.
+ if (umax_val >= insn_bitness) {
+ /* Shifts greater than 31 or 63 are undefined.
+ * This includes shifts by a negative number.
*/
- mark_reg_unknown(regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->dst_reg);
break;
}
- /* BPF_RSH is an unsigned shift, so make the appropriate casts */
- if (dst_reg->smin_value < 0) {
- if (umin_val) {
- /* Sign bit will be cleared */
- dst_reg->smin_value = 0;
- } else {
- /* Lost sign bit information */
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- }
- } else {
- dst_reg->smin_value =
- (u64)(dst_reg->smin_value) >> umax_val;
- }
+ /* BPF_RSH is an unsigned shift. If the value in dst_reg might
+ * be negative, then either:
+ * 1) src_reg might be zero, so the sign bit of the result is
+ * unknown, so we lose our signed bounds
+ * 2) it's known negative, thus the unsigned bounds capture the
+ * signed bounds
+ * 3) the signed bounds cross zero, so they tell us nothing
+ * about the result
+ * If the value in dst_reg is known nonnegative, then again the
+ * unsigned bounts capture the signed bounds.
+ * Thus, in all cases it suffices to blow away our signed bounds
+ * and rely on inferring new ones from the unsigned bounds and
+ * var_off of the result.
+ */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
if (src_known)
dst_reg->var_off = tnum_rshift(dst_reg->var_off,
umin_val);
@@ -2182,10 +2298,16 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
__update_reg_bounds(dst_reg);
break;
default:
- mark_reg_unknown(regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->dst_reg);
break;
}
+ if (BPF_CLASS(insn->code) != BPF_ALU64) {
+ /* 32-bit ALU ops are (32,32)->32 */
+ coerce_reg_to_size(dst_reg, 4);
+ coerce_reg_to_size(&src_reg, 4);
+ }
+
__reg_deduce_bounds(dst_reg);
__reg_bound_offset(dst_reg);
return 0;
@@ -2197,10 +2319,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg, *src_reg;
+ struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg;
struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
u8 opcode = BPF_OP(insn->code);
- int rc;
dst_reg = &regs[insn->dst_reg];
src_reg = NULL;
@@ -2211,43 +2332,29 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
if (src_reg->type != SCALAR_VALUE) {
if (dst_reg->type != SCALAR_VALUE) {
/* Combining two pointers by any ALU op yields
- * an arbitrary scalar.
+ * an arbitrary scalar. Disallow all math except
+ * pointer subtraction
*/
- if (!env->allow_ptr_leaks) {
- verbose("R%d pointer %s pointer prohibited\n",
- insn->dst_reg,
- bpf_alu_string[opcode >> 4]);
- return -EACCES;
+ if (opcode == BPF_SUB){
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ return 0;
}
- mark_reg_unknown(regs, insn->dst_reg);
- return 0;
+ verbose(env, "R%d pointer %s pointer prohibited\n",
+ insn->dst_reg,
+ bpf_alu_string[opcode >> 4]);
+ return -EACCES;
} else {
/* scalar += pointer
* This is legal, but we have to reverse our
* src/dest handling in computing the range
*/
- rc = adjust_ptr_min_max_vals(env, insn,
- src_reg, dst_reg);
- if (rc == -EACCES && env->allow_ptr_leaks) {
- /* scalar += unknown scalar */
- __mark_reg_unknown(&off_reg);
- return adjust_scalar_min_max_vals(
- env, insn,
- dst_reg, off_reg);
- }
- return rc;
+ return adjust_ptr_min_max_vals(env, insn,
+ src_reg, dst_reg);
}
} else if (ptr_reg) {
/* pointer += scalar */
- rc = adjust_ptr_min_max_vals(env, insn,
- dst_reg, src_reg);
- if (rc == -EACCES && env->allow_ptr_leaks) {
- /* unknown scalar += scalar */
- __mark_reg_unknown(dst_reg);
- return adjust_scalar_min_max_vals(
- env, insn, dst_reg, *src_reg);
- }
- return rc;
+ return adjust_ptr_min_max_vals(env, insn,
+ dst_reg, src_reg);
}
} else {
/* Pretend the src is a reg with a known value, since we only
@@ -2256,28 +2363,20 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
off_reg.type = SCALAR_VALUE;
__mark_reg_known(&off_reg, insn->imm);
src_reg = &off_reg;
- if (ptr_reg) { /* pointer += K */
- rc = adjust_ptr_min_max_vals(env, insn,
- ptr_reg, src_reg);
- if (rc == -EACCES && env->allow_ptr_leaks) {
- /* unknown scalar += K */
- __mark_reg_unknown(dst_reg);
- return adjust_scalar_min_max_vals(
- env, insn, dst_reg, off_reg);
- }
- return rc;
- }
+ if (ptr_reg) /* pointer += K */
+ return adjust_ptr_min_max_vals(env, insn,
+ ptr_reg, src_reg);
}
/* Got here implies adding two SCALAR_VALUEs */
if (WARN_ON_ONCE(ptr_reg)) {
- print_verifier_state(&env->cur_state);
- verbose("verifier internal error: unexpected ptr_reg\n");
+ print_verifier_state(env, env->cur_state);
+ verbose(env, "verifier internal error: unexpected ptr_reg\n");
return -EINVAL;
}
if (WARN_ON(!src_reg)) {
- print_verifier_state(&env->cur_state);
- verbose("verifier internal error: no src_reg\n");
+ print_verifier_state(env, env->cur_state);
+ verbose(env, "verifier internal error: no src_reg\n");
return -EINVAL;
}
return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
@@ -2286,7 +2385,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
/* check validity of 32-bit and 64-bit arithmetic operations */
static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
u8 opcode = BPF_OP(insn->code);
int err;
@@ -2295,14 +2394,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (BPF_SRC(insn->code) != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->off != 0 || insn->imm != 0) {
- verbose("BPF_NEG uses reserved fields\n");
+ verbose(env, "BPF_NEG uses reserved fields\n");
return -EINVAL;
}
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
(insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
BPF_CLASS(insn->code) == BPF_ALU64) {
- verbose("BPF_END uses reserved fields\n");
+ verbose(env, "BPF_END uses reserved fields\n");
return -EINVAL;
}
}
@@ -2313,7 +2412,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
if (is_pointer_value(env, insn->dst_reg)) {
- verbose("R%d pointer arithmetic prohibited\n",
+ verbose(env, "R%d pointer arithmetic prohibited\n",
insn->dst_reg);
return -EACCES;
}
@@ -2327,7 +2426,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0 || insn->off != 0) {
- verbose("BPF_MOV uses reserved fields\n");
+ verbose(env, "BPF_MOV uses reserved fields\n");
return -EINVAL;
}
@@ -2337,7 +2436,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
- verbose("BPF_MOV uses reserved fields\n");
+ verbose(env, "BPF_MOV uses reserved fields\n");
return -EINVAL;
}
}
@@ -2357,33 +2456,37 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
} else {
/* R1 = (u32) R2 */
if (is_pointer_value(env, insn->src_reg)) {
- verbose("R%d partial copy of pointer\n",
+ verbose(env,
+ "R%d partial copy of pointer\n",
insn->src_reg);
return -EACCES;
}
- mark_reg_unknown(regs, insn->dst_reg);
- /* high 32 bits are known zero. */
- regs[insn->dst_reg].var_off = tnum_cast(
- regs[insn->dst_reg].var_off, 4);
- __update_reg_bounds(&regs[insn->dst_reg]);
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ coerce_reg_to_size(&regs[insn->dst_reg], 4);
}
} else {
/* case: R = imm
* remember the value we stored into this reg
*/
regs[insn->dst_reg].type = SCALAR_VALUE;
- __mark_reg_known(regs + insn->dst_reg, insn->imm);
+ if (BPF_CLASS(insn->code) == BPF_ALU64) {
+ __mark_reg_known(regs + insn->dst_reg,
+ insn->imm);
+ } else {
+ __mark_reg_known(regs + insn->dst_reg,
+ (u32)insn->imm);
+ }
}
} else if (opcode > BPF_END) {
- verbose("invalid BPF_ALU opcode %x\n", opcode);
+ verbose(env, "invalid BPF_ALU opcode %x\n", opcode);
return -EINVAL;
} else { /* all other ALU ops: and, sub, xor, add, ... */
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0 || insn->off != 0) {
- verbose("BPF_ALU uses reserved fields\n");
+ verbose(env, "BPF_ALU uses reserved fields\n");
return -EINVAL;
}
/* check src1 operand */
@@ -2392,7 +2495,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
- verbose("BPF_ALU uses reserved fields\n");
+ verbose(env, "BPF_ALU uses reserved fields\n");
return -EINVAL;
}
}
@@ -2404,7 +2507,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
- verbose("div by zero\n");
+ verbose(env, "div by zero\n");
+ return -EINVAL;
+ }
+
+ if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) {
+ verbose(env, "BPF_ARSH not supported for 32 bit ALU\n");
return -EINVAL;
}
@@ -2413,7 +2521,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
if (insn->imm < 0 || insn->imm >= size) {
- verbose("invalid shift %d\n", insn->imm);
+ verbose(env, "invalid shift %d\n", insn->imm);
return -EINVAL;
}
}
@@ -2431,6 +2539,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
static void find_good_pkt_pointers(struct bpf_verifier_state *state,
struct bpf_reg_state *dst_reg,
+ enum bpf_reg_type type,
bool range_right_open)
{
struct bpf_reg_state *regs = state->regs, *reg;
@@ -2501,15 +2610,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
* dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
*/
for (i = 0; i < MAX_BPF_REG; i++)
- if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
+ if (regs[i].type == type && regs[i].id == dst_reg->id)
/* keep the maximum range already checked */
regs[i].range = max(regs[i].range, new_range);
- for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
- if (state->stack_slot_type[i] != STACK_SPILL)
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
continue;
- reg = &state->spilled_regs[i / BPF_REG_SIZE];
- if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
+ reg = &state->stack[i].spilled_ptr;
+ if (reg->type == type && reg->id == dst_reg->id)
reg->range = max(reg->range, new_range);
}
}
@@ -2758,29 +2867,122 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno,
for (i = 0; i < MAX_BPF_REG; i++)
mark_map_reg(regs, i, id, is_null);
- for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
- if (state->stack_slot_type[i] != STACK_SPILL)
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
continue;
- mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, is_null);
+ mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null);
}
}
+static bool try_match_pkt_pointers(const struct bpf_insn *insn,
+ struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg,
+ struct bpf_verifier_state *this_branch,
+ struct bpf_verifier_state *other_branch)
+{
+ if (BPF_SRC(insn->code) != BPF_X)
+ return false;
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_JGT:
+ if ((dst_reg->type == PTR_TO_PACKET &&
+ src_reg->type == PTR_TO_PACKET_END) ||
+ (dst_reg->type == PTR_TO_PACKET_META &&
+ reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+ /* pkt_data' > pkt_end, pkt_meta' > pkt_data */
+ find_good_pkt_pointers(this_branch, dst_reg,
+ dst_reg->type, false);
+ } else if ((dst_reg->type == PTR_TO_PACKET_END &&
+ src_reg->type == PTR_TO_PACKET) ||
+ (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+ src_reg->type == PTR_TO_PACKET_META)) {
+ /* pkt_end > pkt_data', pkt_data > pkt_meta' */
+ find_good_pkt_pointers(other_branch, src_reg,
+ src_reg->type, true);
+ } else {
+ return false;
+ }
+ break;
+ case BPF_JLT:
+ if ((dst_reg->type == PTR_TO_PACKET &&
+ src_reg->type == PTR_TO_PACKET_END) ||
+ (dst_reg->type == PTR_TO_PACKET_META &&
+ reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+ /* pkt_data' < pkt_end, pkt_meta' < pkt_data */
+ find_good_pkt_pointers(other_branch, dst_reg,
+ dst_reg->type, true);
+ } else if ((dst_reg->type == PTR_TO_PACKET_END &&
+ src_reg->type == PTR_TO_PACKET) ||
+ (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+ src_reg->type == PTR_TO_PACKET_META)) {
+ /* pkt_end < pkt_data', pkt_data > pkt_meta' */
+ find_good_pkt_pointers(this_branch, src_reg,
+ src_reg->type, false);
+ } else {
+ return false;
+ }
+ break;
+ case BPF_JGE:
+ if ((dst_reg->type == PTR_TO_PACKET &&
+ src_reg->type == PTR_TO_PACKET_END) ||
+ (dst_reg->type == PTR_TO_PACKET_META &&
+ reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+ /* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */
+ find_good_pkt_pointers(this_branch, dst_reg,
+ dst_reg->type, true);
+ } else if ((dst_reg->type == PTR_TO_PACKET_END &&
+ src_reg->type == PTR_TO_PACKET) ||
+ (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+ src_reg->type == PTR_TO_PACKET_META)) {
+ /* pkt_end >= pkt_data', pkt_data >= pkt_meta' */
+ find_good_pkt_pointers(other_branch, src_reg,
+ src_reg->type, false);
+ } else {
+ return false;
+ }
+ break;
+ case BPF_JLE:
+ if ((dst_reg->type == PTR_TO_PACKET &&
+ src_reg->type == PTR_TO_PACKET_END) ||
+ (dst_reg->type == PTR_TO_PACKET_META &&
+ reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+ /* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */
+ find_good_pkt_pointers(other_branch, dst_reg,
+ dst_reg->type, false);
+ } else if ((dst_reg->type == PTR_TO_PACKET_END &&
+ src_reg->type == PTR_TO_PACKET) ||
+ (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+ src_reg->type == PTR_TO_PACKET_META)) {
+ /* pkt_end <= pkt_data', pkt_data <= pkt_meta' */
+ find_good_pkt_pointers(this_branch, src_reg,
+ src_reg->type, true);
+ } else {
+ return false;
+ }
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_insn *insn, int *insn_idx)
{
- struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state;
+ struct bpf_verifier_state *other_branch, *this_branch = env->cur_state;
struct bpf_reg_state *regs = this_branch->regs, *dst_reg;
u8 opcode = BPF_OP(insn->code);
int err;
if (opcode > BPF_JSLE) {
- verbose("invalid BPF_JMP opcode %x\n", opcode);
+ verbose(env, "invalid BPF_JMP opcode %x\n", opcode);
return -EINVAL;
}
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0) {
- verbose("BPF_JMP uses reserved fields\n");
+ verbose(env, "BPF_JMP uses reserved fields\n");
return -EINVAL;
}
@@ -2790,13 +2992,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return err;
if (is_pointer_value(env, insn->src_reg)) {
- verbose("R%d pointer comparison prohibited\n",
+ verbose(env, "R%d pointer comparison prohibited\n",
insn->src_reg);
return -EACCES;
}
} else {
if (insn->src_reg != BPF_REG_0) {
- verbose("BPF_JMP uses reserved fields\n");
+ verbose(env, "BPF_JMP uses reserved fields\n");
return -EINVAL;
}
}
@@ -2871,52 +3073,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
*/
mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE);
mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ);
- } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
- dst_reg->type == PTR_TO_PACKET &&
- regs[insn->src_reg].type == PTR_TO_PACKET_END) {
- /* pkt_data' > pkt_end */
- find_good_pkt_pointers(this_branch, dst_reg, false);
- } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
- dst_reg->type == PTR_TO_PACKET_END &&
- regs[insn->src_reg].type == PTR_TO_PACKET) {
- /* pkt_end > pkt_data' */
- find_good_pkt_pointers(other_branch, &regs[insn->src_reg], true);
- } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
- dst_reg->type == PTR_TO_PACKET &&
- regs[insn->src_reg].type == PTR_TO_PACKET_END) {
- /* pkt_data' < pkt_end */
- find_good_pkt_pointers(other_branch, dst_reg, true);
- } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
- dst_reg->type == PTR_TO_PACKET_END &&
- regs[insn->src_reg].type == PTR_TO_PACKET) {
- /* pkt_end < pkt_data' */
- find_good_pkt_pointers(this_branch, &regs[insn->src_reg], false);
- } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
- dst_reg->type == PTR_TO_PACKET &&
- regs[insn->src_reg].type == PTR_TO_PACKET_END) {
- /* pkt_data' >= pkt_end */
- find_good_pkt_pointers(this_branch, dst_reg, true);
- } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
- dst_reg->type == PTR_TO_PACKET_END &&
- regs[insn->src_reg].type == PTR_TO_PACKET) {
- /* pkt_end >= pkt_data' */
- find_good_pkt_pointers(other_branch, &regs[insn->src_reg], false);
- } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
- dst_reg->type == PTR_TO_PACKET &&
- regs[insn->src_reg].type == PTR_TO_PACKET_END) {
- /* pkt_data' <= pkt_end */
- find_good_pkt_pointers(other_branch, dst_reg, false);
- } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
- dst_reg->type == PTR_TO_PACKET_END &&
- regs[insn->src_reg].type == PTR_TO_PACKET) {
- /* pkt_end <= pkt_data' */
- find_good_pkt_pointers(this_branch, &regs[insn->src_reg], true);
- } else if (is_pointer_value(env, insn->dst_reg)) {
- verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
+ } else if (!try_match_pkt_pointers(insn, dst_reg, &regs[insn->src_reg],
+ this_branch, other_branch) &&
+ is_pointer_value(env, insn->dst_reg)) {
+ verbose(env, "R%d pointer comparison prohibited\n",
+ insn->dst_reg);
return -EACCES;
}
- if (log_level)
- print_verifier_state(this_branch);
+ if (env->log.level)
+ print_verifier_state(env, this_branch);
return 0;
}
@@ -2931,15 +3096,15 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
/* verify BPF_LD_IMM64 instruction */
static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
int err;
if (BPF_SIZE(insn->code) != BPF_DW) {
- verbose("invalid BPF_LD_IMM insn\n");
+ verbose(env, "invalid BPF_LD_IMM insn\n");
return -EINVAL;
}
if (insn->off != 0) {
- verbose("BPF_LD_IMM64 uses reserved fields\n");
+ verbose(env, "BPF_LD_IMM64 uses reserved fields\n");
return -EINVAL;
}
@@ -2992,19 +3157,19 @@ static bool may_access_skb(enum bpf_prog_type type)
*/
static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
u8 mode = BPF_MODE(insn->code);
int i, err;
if (!may_access_skb(env->prog->type)) {
- verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
+ verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
return -EINVAL;
}
if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
BPF_SIZE(insn->code) == BPF_DW ||
(mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
- verbose("BPF_LD_[ABS|IND] uses reserved fields\n");
+ verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n");
return -EINVAL;
}
@@ -3014,7 +3179,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
if (regs[BPF_REG_6].type != PTR_TO_CTX) {
- verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
+ verbose(env,
+ "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
return -EINVAL;
}
@@ -3027,7 +3193,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
/* reset caller saved regs to unreadable */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
- mark_reg_not_init(regs, caller_saved[i]);
+ mark_reg_not_init(env, regs, caller_saved[i]);
check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
}
@@ -3035,7 +3201,45 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
* the value fetched from the packet.
* Already marked as written above.
*/
- mark_reg_unknown(regs, BPF_REG_0);
+ mark_reg_unknown(env, regs, BPF_REG_0);
+ return 0;
+}
+
+static int check_return_code(struct bpf_verifier_env *env)
+{
+ struct bpf_reg_state *reg;
+ struct tnum range = tnum_range(0, 1);
+
+ switch (env->prog->type) {
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_SOCK_OPS:
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
+ break;
+ default:
+ return 0;
+ }
+
+ reg = cur_regs(env) + BPF_REG_0;
+ if (reg->type != SCALAR_VALUE) {
+ verbose(env, "At program exit the register R0 is not a known value (%s)\n",
+ reg_type_str[reg->type]);
+ return -EINVAL;
+ }
+
+ if (!tnum_in(range, reg->var_off)) {
+ verbose(env, "At program exit the register R0 ");
+ if (!tnum_is_unknown(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "has value %s", tn_buf);
+ } else {
+ verbose(env, "has unknown scalar value");
+ }
+ verbose(env, " should have been 0 or 1\n");
+ return -EINVAL;
+ }
return 0;
}
@@ -3099,7 +3303,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
return 0;
if (w < 0 || w >= env->prog->len) {
- verbose("jump out of range from insn %d to %d\n", t, w);
+ verbose(env, "jump out of range from insn %d to %d\n", t, w);
return -EINVAL;
}
@@ -3116,13 +3320,13 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
insn_stack[cur_stack++] = w;
return 1;
} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
- verbose("back-edge from insn %d to %d\n", t, w);
+ verbose(env, "back-edge from insn %d to %d\n", t, w);
return -EINVAL;
} else if (insn_state[w] == EXPLORED) {
/* forward- or cross-edge */
insn_state[t] = DISCOVERED | e;
} else {
- verbose("insn state internal bug\n");
+ verbose(env, "insn state internal bug\n");
return -EFAULT;
}
return 0;
@@ -3216,7 +3420,7 @@ peek_stack:
mark_explored:
insn_state[t] = EXPLORED;
if (cur_stack-- <= 0) {
- verbose("pop stack internal bug\n");
+ verbose(env, "pop stack internal bug\n");
ret = -EFAULT;
goto err_free;
}
@@ -3225,7 +3429,7 @@ mark_explored:
check_state:
for (i = 0; i < insn_cnt; i++) {
if (insn_state[i] != EXPLORED) {
- verbose("unreachable insn %d\n", i);
+ verbose(env, "unreachable insn %d\n", i);
ret = -EINVAL;
goto err_free;
}
@@ -3307,15 +3511,14 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
return range_within(rold, rcur) &&
tnum_in(rold->var_off, rcur->var_off);
} else {
- /* if we knew anything about the old value, we're not
- * equal, because we can't know anything about the
- * scalar value of the pointer in the new value.
+ /* We're trying to use a pointer in place of a scalar.
+ * Even if the scalar was unbounded, this could lead to
+ * pointer leaks because scalars are allowed to leak
+ * while pointers are not. We could make this safe in
+ * special cases if root is calling us, but it's
+ * probably not worth the hassle.
*/
- return rold->umin_value == 0 &&
- rold->umax_value == U64_MAX &&
- rold->smin_value == S64_MIN &&
- rold->smax_value == S64_MAX &&
- tnum_is_unknown(rold->var_off);
+ return false;
}
case PTR_TO_MAP_VALUE:
/* If the new min/max/var_off satisfy the old ones and
@@ -3340,8 +3543,9 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
return false;
/* Check our ids match any regs they're supposed to */
return check_ids(rold->id, rcur->id, idmap);
+ case PTR_TO_PACKET_META:
case PTR_TO_PACKET:
- if (rcur->type != PTR_TO_PACKET)
+ if (rcur->type != rold->type)
return false;
/* We must have at least as much range as the old ptr
* did, so that any accesses which were safe before are
@@ -3379,6 +3583,57 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
return false;
}
+static bool stacksafe(struct bpf_verifier_state *old,
+ struct bpf_verifier_state *cur,
+ struct idpair *idmap)
+{
+ int i, spi;
+
+ /* if explored stack has more populated slots than current stack
+ * such stacks are not equivalent
+ */
+ if (old->allocated_stack > cur->allocated_stack)
+ return false;
+
+ /* walk slots of the explored stack and ignore any additional
+ * slots in the current stack, since explored(safe) state
+ * didn't use them
+ */
+ for (i = 0; i < old->allocated_stack; i++) {
+ spi = i / BPF_REG_SIZE;
+
+ if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
+ continue;
+ if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
+ cur->stack[spi].slot_type[i % BPF_REG_SIZE])
+ /* Ex: old explored (safe) state has STACK_SPILL in
+ * this stack slot, but current has has STACK_MISC ->
+ * this verifier states are not equivalent,
+ * return false to continue verification of this path
+ */
+ return false;
+ if (i % BPF_REG_SIZE)
+ continue;
+ if (old->stack[spi].slot_type[0] != STACK_SPILL)
+ continue;
+ if (!regsafe(&old->stack[spi].spilled_ptr,
+ &cur->stack[spi].spilled_ptr,
+ idmap))
+ /* when explored and current stack slot are both storing
+ * spilled registers, check that stored pointers types
+ * are the same as well.
+ * Ex: explored safe path could have stored
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
+ * but current path has stored:
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
+ * such verifier states are not equivalent.
+ * return false to continue verification of this path
+ */
+ return false;
+ }
+ return true;
+}
+
/* compare two verifier states
*
* all states stored in state_list are known to be valid, since
@@ -3423,37 +3678,8 @@ static bool states_equal(struct bpf_verifier_env *env,
goto out_free;
}
- for (i = 0; i < MAX_BPF_STACK; i++) {
- if (old->stack_slot_type[i] == STACK_INVALID)
- continue;
- if (old->stack_slot_type[i] != cur->stack_slot_type[i])
- /* Ex: old explored (safe) state has STACK_SPILL in
- * this stack slot, but current has has STACK_MISC ->
- * this verifier states are not equivalent,
- * return false to continue verification of this path
- */
- goto out_free;
- if (i % BPF_REG_SIZE)
- continue;
- if (old->stack_slot_type[i] != STACK_SPILL)
- continue;
- if (!regsafe(&old->spilled_regs[i / BPF_REG_SIZE],
- &cur->spilled_regs[i / BPF_REG_SIZE],
- idmap))
- /* when explored and current stack slot are both storing
- * spilled registers, check that stored pointers types
- * are the same as well.
- * Ex: explored safe path could have stored
- * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
- * but current path has stored:
- * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
- * such verifier states are not equivalent.
- * return false to continue verification of this path
- */
- goto out_free;
- else
- continue;
- }
+ if (!stacksafe(old, cur, idmap))
+ goto out_free;
ret = true;
out_free:
kfree(idmap);
@@ -3489,17 +3715,19 @@ static bool do_propagate_liveness(const struct bpf_verifier_state *state,
}
}
/* ... and stack slots */
- for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) {
- if (parent->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL)
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
+ i < parent->allocated_stack / BPF_REG_SIZE; i++) {
+ if (parent->stack[i].slot_type[0] != STACK_SPILL)
continue;
- if (state->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL)
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
continue;
- if (parent->spilled_regs[i].live & REG_LIVE_READ)
+ if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
continue;
- if (writes && (state->spilled_regs[i].live & REG_LIVE_WRITTEN))
+ if (writes &&
+ (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN))
continue;
- if (state->spilled_regs[i].live & REG_LIVE_READ) {
- parent->spilled_regs[i].live |= REG_LIVE_READ;
+ if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) {
+ parent->stack[i].spilled_ptr.live |= REG_LIVE_READ;
touched = true;
}
}
@@ -3529,7 +3757,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
struct bpf_verifier_state_list *new_sl;
struct bpf_verifier_state_list *sl;
- int i;
+ struct bpf_verifier_state *cur = env->cur_state;
+ int i, err;
sl = env->explored_states[insn_idx];
if (!sl)
@@ -3539,7 +3768,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
return 0;
while (sl != STATE_LIST_MARK) {
- if (states_equal(env, &sl->state, &env->cur_state)) {
+ if (states_equal(env, &sl->state, cur)) {
/* reached equivalent register/stack state,
* prune the search.
* Registers read by the continuation are read by us.
@@ -3550,7 +3779,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* they'll be immediately forgotten as we're pruning
* this state and will pop a new one.
*/
- propagate_liveness(&sl->state, &env->cur_state);
+ propagate_liveness(&sl->state, cur);
return 1;
}
sl = sl->next;
@@ -3562,16 +3791,21 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* it will be rejected. Since there are no loops, we won't be
* seeing this 'insn_idx' instruction again on the way to bpf_exit
*/
- new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER);
+ new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
if (!new_sl)
return -ENOMEM;
/* add new state to the head of linked list */
- memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
+ err = copy_verifier_state(&new_sl->state, cur);
+ if (err) {
+ free_verifier_state(&new_sl->state, false);
+ kfree(new_sl);
+ return err;
+ }
new_sl->next = env->explored_states[insn_idx];
env->explored_states[insn_idx] = new_sl;
/* connect new state to parentage chain */
- env->cur_state.parent = &new_sl->state;
+ cur->parent = &new_sl->state;
/* clear write marks in current state: the writes we did are not writes
* our child did, so they don't screen off its reads from us.
* (There are no read marks in current state, because reads always mark
@@ -3579,33 +3813,37 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* explored_states can get read marks.)
*/
for (i = 0; i < BPF_REG_FP; i++)
- env->cur_state.regs[i].live = REG_LIVE_NONE;
- for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++)
- if (env->cur_state.stack_slot_type[i * BPF_REG_SIZE] == STACK_SPILL)
- env->cur_state.spilled_regs[i].live = REG_LIVE_NONE;
+ cur->regs[i].live = REG_LIVE_NONE;
+ for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++)
+ if (cur->stack[i].slot_type[0] == STACK_SPILL)
+ cur->stack[i].spilled_ptr.live = REG_LIVE_NONE;
return 0;
}
static int ext_analyzer_insn_hook(struct bpf_verifier_env *env,
int insn_idx, int prev_insn_idx)
{
- if (!env->analyzer_ops || !env->analyzer_ops->insn_hook)
- return 0;
+ if (env->dev_ops && env->dev_ops->insn_hook)
+ return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
- return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx);
+ return 0;
}
static int do_check(struct bpf_verifier_env *env)
{
- struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state;
struct bpf_insn *insns = env->prog->insnsi;
- struct bpf_reg_state *regs = state->regs;
+ struct bpf_reg_state *regs;
int insn_cnt = env->prog->len;
int insn_idx, prev_insn_idx = 0;
int insn_processed = 0;
bool do_print_state = false;
- init_reg_state(regs);
+ state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
+ if (!state)
+ return -ENOMEM;
+ env->cur_state = state;
+ init_reg_state(env, state->regs);
state->parent = NULL;
insn_idx = 0;
for (;;) {
@@ -3614,7 +3852,7 @@ static int do_check(struct bpf_verifier_env *env)
int err;
if (insn_idx >= insn_cnt) {
- verbose("invalid insn idx %d insn_cnt %d\n",
+ verbose(env, "invalid insn idx %d insn_cnt %d\n",
insn_idx, insn_cnt);
return -EFAULT;
}
@@ -3623,7 +3861,8 @@ static int do_check(struct bpf_verifier_env *env)
class = BPF_CLASS(insn->code);
if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
- verbose("BPF program is too large. Processed %d insn\n",
+ verbose(env,
+ "BPF program is too large. Processed %d insn\n",
insn_processed);
return -E2BIG;
}
@@ -3633,12 +3872,12 @@ static int do_check(struct bpf_verifier_env *env)
return err;
if (err == 1) {
/* found equivalent state, can prune the search */
- if (log_level) {
+ if (env->log.level) {
if (do_print_state)
- verbose("\nfrom %d to %d: safe\n",
+ verbose(env, "\nfrom %d to %d: safe\n",
prev_insn_idx, insn_idx);
else
- verbose("%d: safe\n", insn_idx);
+ verbose(env, "%d: safe\n", insn_idx);
}
goto process_bpf_exit;
}
@@ -3646,25 +3885,28 @@ static int do_check(struct bpf_verifier_env *env)
if (need_resched())
cond_resched();
- if (log_level > 1 || (log_level && do_print_state)) {
- if (log_level > 1)
- verbose("%d:", insn_idx);
+ if (env->log.level > 1 || (env->log.level && do_print_state)) {
+ if (env->log.level > 1)
+ verbose(env, "%d:", insn_idx);
else
- verbose("\nfrom %d to %d:",
+ verbose(env, "\nfrom %d to %d:",
prev_insn_idx, insn_idx);
- print_verifier_state(&env->cur_state);
+ print_verifier_state(env, state);
do_print_state = false;
}
- if (log_level) {
- verbose("%d: ", insn_idx);
- print_bpf_insn(env, insn);
+ if (env->log.level) {
+ verbose(env, "%d: ", insn_idx);
+ print_bpf_insn(verbose, env, insn,
+ env->allow_ptr_leaks);
}
err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx);
if (err)
return err;
+ regs = cur_regs(env);
+ env->insn_aux_data[insn_idx].seen = true;
if (class == BPF_ALU || class == BPF_ALU64) {
err = check_alu_op(env, insn);
if (err)
@@ -3714,7 +3956,7 @@ static int do_check(struct bpf_verifier_env *env)
* src_reg == stack|map in some other branch.
* Reject it.
*/
- verbose("same insn cannot be used with different pointers\n");
+ verbose(env, "same insn cannot be used with different pointers\n");
return -EINVAL;
}
@@ -3754,14 +3996,14 @@ static int do_check(struct bpf_verifier_env *env)
} else if (dst_reg_type != *prev_dst_type &&
(dst_reg_type == PTR_TO_CTX ||
*prev_dst_type == PTR_TO_CTX)) {
- verbose("same insn cannot be used with different pointers\n");
+ verbose(env, "same insn cannot be used with different pointers\n");
return -EINVAL;
}
} else if (class == BPF_ST) {
if (BPF_MODE(insn->code) != BPF_MEM ||
insn->src_reg != BPF_REG_0) {
- verbose("BPF_ST uses reserved fields\n");
+ verbose(env, "BPF_ST uses reserved fields\n");
return -EINVAL;
}
/* check src operand */
@@ -3769,6 +4011,12 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
+ if (is_ctx_reg(env, insn->dst_reg)) {
+ verbose(env, "BPF_ST stores into R%d context is not allowed\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
+
/* check that memory (dst_reg + off) is writeable */
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE,
@@ -3784,7 +4032,7 @@ static int do_check(struct bpf_verifier_env *env)
insn->off != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0) {
- verbose("BPF_CALL uses reserved fields\n");
+ verbose(env, "BPF_CALL uses reserved fields\n");
return -EINVAL;
}
@@ -3797,7 +4045,7 @@ static int do_check(struct bpf_verifier_env *env)
insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0) {
- verbose("BPF_JA uses reserved fields\n");
+ verbose(env, "BPF_JA uses reserved fields\n");
return -EINVAL;
}
@@ -3809,7 +4057,7 @@ static int do_check(struct bpf_verifier_env *env)
insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0) {
- verbose("BPF_EXIT uses reserved fields\n");
+ verbose(env, "BPF_EXIT uses reserved fields\n");
return -EINVAL;
}
@@ -3824,13 +4072,18 @@ static int do_check(struct bpf_verifier_env *env)
return err;
if (is_pointer_value(env, BPF_REG_0)) {
- verbose("R0 leaks addr as return value\n");
+ verbose(env, "R0 leaks addr as return value\n");
return -EACCES;
}
+ err = check_return_code(env);
+ if (err)
+ return err;
process_bpf_exit:
- insn_idx = pop_stack(env, &prev_insn_idx);
- if (insn_idx < 0) {
+ err = pop_stack(env, &prev_insn_idx, &insn_idx);
+ if (err < 0) {
+ if (err != -ENOENT)
+ return err;
break;
} else {
do_print_state = true;
@@ -3855,20 +4108,21 @@ process_bpf_exit:
return err;
insn_idx++;
+ env->insn_aux_data[insn_idx].seen = true;
} else {
- verbose("invalid BPF_LD mode\n");
+ verbose(env, "invalid BPF_LD mode\n");
return -EINVAL;
}
} else {
- verbose("unknown insn class %d\n", class);
+ verbose(env, "unknown insn class %d\n", class);
return -EINVAL;
}
insn_idx++;
}
- verbose("processed %d insns, stack depth %d\n",
- insn_processed, env->prog->aux->stack_depth);
+ verbose(env, "processed %d insns, stack depth %d\n", insn_processed,
+ env->prog->aux->stack_depth);
return 0;
}
@@ -3880,7 +4134,8 @@ static int check_map_prealloc(struct bpf_map *map)
!(map->map_flags & BPF_F_NO_PREALLOC);
}
-static int check_map_prog_compatibility(struct bpf_map *map,
+static int check_map_prog_compatibility(struct bpf_verifier_env *env,
+ struct bpf_map *map,
struct bpf_prog *prog)
{
@@ -3891,12 +4146,12 @@ static int check_map_prog_compatibility(struct bpf_map *map,
*/
if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
if (!check_map_prealloc(map)) {
- verbose("perf_event programs can only use preallocated hash map\n");
+ verbose(env, "perf_event programs can only use preallocated hash map\n");
return -EINVAL;
}
if (map->inner_map_meta &&
!check_map_prealloc(map->inner_map_meta)) {
- verbose("perf_event programs can only use preallocated inner hash map\n");
+ verbose(env, "perf_event programs can only use preallocated inner hash map\n");
return -EINVAL;
}
}
@@ -3919,14 +4174,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
for (i = 0; i < insn_cnt; i++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
(BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
- verbose("BPF_LDX uses reserved fields\n");
+ verbose(env, "BPF_LDX uses reserved fields\n");
return -EINVAL;
}
if (BPF_CLASS(insn->code) == BPF_STX &&
((BPF_MODE(insn->code) != BPF_MEM &&
BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) {
- verbose("BPF_STX uses reserved fields\n");
+ verbose(env, "BPF_STX uses reserved fields\n");
return -EINVAL;
}
@@ -3937,7 +4192,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
if (i == insn_cnt - 1 || insn[1].code != 0 ||
insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
insn[1].off != 0) {
- verbose("invalid bpf_ld_imm64 insn\n");
+ verbose(env, "invalid bpf_ld_imm64 insn\n");
return -EINVAL;
}
@@ -3946,19 +4201,20 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
goto next_insn;
if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
- verbose("unrecognized bpf_ld_imm64 insn\n");
+ verbose(env,
+ "unrecognized bpf_ld_imm64 insn\n");
return -EINVAL;
}
f = fdget(insn->imm);
map = __bpf_map_get(f);
if (IS_ERR(map)) {
- verbose("fd %d is not pointing to valid bpf_map\n",
+ verbose(env, "fd %d is not pointing to valid bpf_map\n",
insn->imm);
return PTR_ERR(map);
}
- err = check_map_prog_compatibility(map, env->prog);
+ err = check_map_prog_compatibility(env, map, env->prog);
if (err) {
fdput(f);
return err;
@@ -4035,6 +4291,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
u32 off, u32 cnt)
{
struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
+ int i;
if (cnt == 1)
return 0;
@@ -4044,6 +4301,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
memcpy(new_data + off + cnt - 1, old_data + off,
sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
+ for (i = off; i < off + cnt - 1; i++)
+ new_data[i].seen = true;
env->insn_aux_data = new_data;
vfree(old_data);
return 0;
@@ -4062,12 +4321,31 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
return new_prog;
}
+/* The verifier does more data flow analysis than llvm and will not explore
+ * branches that are dead at run time. Malicious programs can have dead code
+ * too. Therefore replace all dead at-run-time code with nops.
+ */
+static void sanitize_dead_code(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0);
+ struct bpf_insn *insn = env->prog->insnsi;
+ const int insn_cnt = env->prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++) {
+ if (aux_data[i].seen)
+ continue;
+ memcpy(insn + i, &nop, sizeof(nop));
+ }
+}
+
/* convert load instructions that access fields of 'struct __sk_buff'
* into sequence of instructions that access fields of 'struct sk_buff'
*/
static int convert_ctx_accesses(struct bpf_verifier_env *env)
{
- const struct bpf_verifier_ops *ops = env->prog->aux->ops;
+ const struct bpf_verifier_ops *ops = env->ops;
int i, cnt, size, ctx_field_size, delta = 0;
const int insn_cnt = env->prog->len;
struct bpf_insn insn_buf[16], *insn;
@@ -4080,7 +4358,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
env->prog);
if (cnt >= ARRAY_SIZE(insn_buf)) {
- verbose("bpf verifier is misconfigured\n");
+ verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
} else if (cnt) {
new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
@@ -4128,7 +4406,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
u8 size_code;
if (type == BPF_WRITE) {
- verbose("bpf verifier narrow ctx access misconfigured\n");
+ verbose(env, "bpf verifier narrow ctx access misconfigured\n");
return -EINVAL;
}
@@ -4147,7 +4425,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
&target_size);
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
(ctx_field_size && !target_size)) {
- verbose("bpf verifier is misconfigured\n");
+ verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
}
@@ -4191,6 +4469,24 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
int i, cnt, delta = 0;
for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
+ insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
+ /* due to JIT bugs clear upper 32-bits of src register
+ * before div/mod operation
+ */
+ insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg);
+ insn_buf[1] = *insn;
+ cnt = 2;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
if (insn->code != (BPF_JMP | BPF_CALL))
continue;
@@ -4214,6 +4510,35 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
*/
insn->imm = 0;
insn->code = BPF_JMP | BPF_TAIL_CALL;
+
+ /* instead of changing every JIT dealing with tail_call
+ * emit two extra insns:
+ * if (index >= max_entries) goto out;
+ * index &= array->index_mask;
+ * to avoid out-of-bounds cpu speculation
+ */
+ map_ptr = env->insn_aux_data[i + delta].map_ptr;
+ if (map_ptr == BPF_MAP_PTR_POISON) {
+ verbose(env, "tail_call abusing map_ptr\n");
+ return -EINVAL;
+ }
+ if (!map_ptr->unpriv_array)
+ continue;
+ insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
+ map_ptr->max_entries, 2);
+ insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
+ container_of(map_ptr,
+ struct bpf_array,
+ map)->index_mask);
+ insn_buf[2] = *insn;
+ cnt = 3;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
continue;
}
@@ -4229,7 +4554,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
- verbose("bpf verifier is misconfigured\n");
+ verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
}
@@ -4268,12 +4593,13 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
insn = new_prog->insnsi + i + delta;
}
patch_call_imm:
- fn = prog->aux->ops->get_func_proto(insn->imm);
+ fn = env->ops->get_func_proto(insn->imm);
/* all functions that have prototype and verifier allowed
* programs to call them, must be real in-kernel functions
*/
if (!fn->func) {
- verbose("kernel subsystem misconfigured func %s#%d\n",
+ verbose(env,
+ "kernel subsystem misconfigured func %s#%d\n",
func_id_name(insn->imm), insn->imm);
return -EFAULT;
}
@@ -4297,6 +4623,7 @@ static void free_states(struct bpf_verifier_env *env)
if (sl)
while (sl != STATE_LIST_MARK) {
sln = sl->next;
+ free_verifier_state(&sl->state, false);
kfree(sl);
sl = sln;
}
@@ -4307,16 +4634,21 @@ static void free_states(struct bpf_verifier_env *env)
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
{
- char __user *log_ubuf = NULL;
struct bpf_verifier_env *env;
+ struct bpf_verifer_log *log;
int ret = -EINVAL;
+ /* no program is valid */
+ if (ARRAY_SIZE(bpf_verifier_ops) == 0)
+ return -EINVAL;
+
/* 'struct bpf_verifier_env' can be global, but since it's not small,
* allocate/free it every time bpf_check() is called
*/
env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
if (!env)
return -ENOMEM;
+ log = &env->log;
env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
(*prog)->len);
@@ -4324,6 +4656,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
if (!env->insn_aux_data)
goto err_free_env;
env->prog = *prog;
+ env->ops = bpf_verifier_ops[env->prog->type];
/* grab the mutex to protect few globals used by verifier */
mutex_lock(&bpf_verifier_lock);
@@ -4332,29 +4665,27 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
/* user requested verbose verifier output
* and supplied buffer to store the verification trace
*/
- log_level = attr->log_level;
- log_ubuf = (char __user *) (unsigned long) attr->log_buf;
- log_size = attr->log_size;
- log_len = 0;
+ log->level = attr->log_level;
+ log->ubuf = (char __user *) (unsigned long) attr->log_buf;
+ log->len_total = attr->log_size;
ret = -EINVAL;
- /* log_* values have to be sane */
- if (log_size < 128 || log_size > UINT_MAX >> 8 ||
- log_level == 0 || log_ubuf == NULL)
+ /* log attributes have to be sane */
+ if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
+ !log->level || !log->ubuf)
goto err_unlock;
-
- ret = -ENOMEM;
- log_buf = vmalloc(log_size);
- if (!log_buf)
- goto err_unlock;
- } else {
- log_level = 0;
}
env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
env->strict_alignment = true;
+ if (env->prog->aux->offload) {
+ ret = bpf_prog_offload_verifier_prep(env);
+ if (ret)
+ goto err_unlock;
+ }
+
ret = replace_map_fd_with_map_ptr(env);
if (ret < 0)
goto skip_full_check;
@@ -4373,29 +4704,30 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
ret = do_check(env);
+ if (env->cur_state) {
+ free_verifier_state(env->cur_state, true);
+ env->cur_state = NULL;
+ }
skip_full_check:
- while (pop_stack(env, NULL) >= 0);
+ while (!pop_stack(env, NULL, NULL));
free_states(env);
if (ret == 0)
+ sanitize_dead_code(env);
+
+ if (ret == 0)
/* program is valid, convert *(u32*)(ctx + off) accesses */
ret = convert_ctx_accesses(env);
if (ret == 0)
ret = fixup_bpf_calls(env);
- if (log_level && log_len >= log_size - 1) {
- BUG_ON(log_len >= log_size);
- /* verifier log exceeded user supplied buffer */
+ if (log->level && bpf_verifier_log_full(log))
ret = -ENOSPC;
- /* fall through to return what was recorded */
- }
-
- /* copy verifier log back to user space including trailing zero */
- if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
+ if (log->level && !log->ubuf) {
ret = -EFAULT;
- goto free_log_buf;
+ goto err_release_maps;
}
if (ret == 0 && env->used_map_cnt) {
@@ -4406,7 +4738,7 @@ skip_full_check:
if (!env->prog->aux->used_maps) {
ret = -ENOMEM;
- goto free_log_buf;
+ goto err_release_maps;
}
memcpy(env->prog->aux->used_maps, env->used_maps,
@@ -4419,9 +4751,7 @@ skip_full_check:
convert_pseudo_ld_imm64(env);
}
-free_log_buf:
- if (log_level)
- vfree(log_buf);
+err_release_maps:
if (!env->prog->aux->used_maps)
/* if we didn't copy map pointers into bpf_prog_info, release
* them now. Otherwise free_bpf_prog_info() will release them.
@@ -4435,58 +4765,3 @@ err_free_env:
kfree(env);
return ret;
}
-
-int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
- void *priv)
-{
- struct bpf_verifier_env *env;
- int ret;
-
- env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
- if (!env)
- return -ENOMEM;
-
- env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
- prog->len);
- ret = -ENOMEM;
- if (!env->insn_aux_data)
- goto err_free_env;
- env->prog = prog;
- env->analyzer_ops = ops;
- env->analyzer_priv = priv;
-
- /* grab the mutex to protect few globals used by verifier */
- mutex_lock(&bpf_verifier_lock);
-
- log_level = 0;
-
- env->strict_alignment = false;
- if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
- env->strict_alignment = true;
-
- env->explored_states = kcalloc(env->prog->len,
- sizeof(struct bpf_verifier_state_list *),
- GFP_KERNEL);
- ret = -ENOMEM;
- if (!env->explored_states)
- goto skip_full_check;
-
- ret = check_cfg(env);
- if (ret < 0)
- goto skip_full_check;
-
- env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
-
- ret = do_check(env);
-
-skip_full_check:
- while (pop_stack(env, NULL) >= 0);
- free_states(env);
-
- mutex_unlock(&bpf_verifier_lock);
- vfree(env->insn_aux_data);
-err_free_env:
- kfree(env);
- return ret;
-}
-EXPORT_SYMBOL_GPL(bpf_analyzer);
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index ae448f7632cc..2be89a003185 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-obj-y := cgroup.o namespace.o cgroup-v1.o
+obj-y := cgroup.o stat.o namespace.o cgroup-v1.o
obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
obj-$(CONFIG_CGROUP_PIDS) += pids.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index bf54ade001be..b928b27050c6 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -201,6 +201,15 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
int cgroup_task_count(const struct cgroup *cgrp);
/*
+ * stat.c
+ */
+void cgroup_stat_flush(struct cgroup *cgrp);
+int cgroup_stat_init(struct cgroup *cgrp);
+void cgroup_stat_exit(struct cgroup *cgrp);
+void cgroup_stat_show_cputime(struct seq_file *seq);
+void cgroup_stat_boot(void);
+
+/*
* namespace.c
*/
extern const struct proc_ns_operations cgroupns_operations;
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 024085daab1a..a2c05d2476ac 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -123,7 +123,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
*/
do {
css_task_iter_start(&from->self, 0, &it);
- task = css_task_iter_next(&it);
+
+ do {
+ task = css_task_iter_next(&it);
+ } while (task && (task->flags & PF_EXITING));
+
if (task)
get_task_struct(task);
css_task_iter_end(&it);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 44857278eb8a..8cda3bc3ae22 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -142,12 +142,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
};
#undef SUBSYS
+static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat);
+
/*
* The default hierarchy, reserved for the subsystems that are otherwise
* unattached - it never has more than a single cgroup, and all tasks are
* part of that cgroup.
*/
-struct cgroup_root cgrp_dfl_root;
+struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat };
EXPORT_SYMBOL_GPL(cgrp_dfl_root);
/*
@@ -462,6 +464,28 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
}
/**
+ * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get @cgrp's css assocaited with @ss. If the css doesn't exist
+ * or is offline, %NULL is returned.
+ */
+static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ struct cgroup_subsys_state *css;
+
+ rcu_read_lock();
+ css = cgroup_css(cgrp, ss);
+ if (!css || !css_tryget_online(css))
+ css = NULL;
+ rcu_read_unlock();
+
+ return css;
+}
+
+/**
* cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns @cgrp->self)
@@ -647,6 +671,14 @@ struct css_set init_css_set = {
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
+
+ /*
+ * The following field is re-initialized when this cset gets linked
+ * in cgroup_init(). However, let's initialize the field
+ * statically too so that the default cgroup can be accessed safely
+ * early during boot.
+ */
+ .dfl_cgrp = &cgrp_dfl_root.cgrp,
};
static int css_set_count = 1; /* 1 for init_css_set */
@@ -1365,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
cft->name);
else
- strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+ strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
return buf;
}
@@ -1832,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
root->flags = opts->flags;
if (opts->release_agent)
- strcpy(root->release_agent_path, opts->release_agent);
+ strscpy(root->release_agent_path, opts->release_agent, PATH_MAX);
if (opts->name)
- strcpy(root->name, opts->name);
+ strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
if (opts->cpuset_clone_children)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
@@ -1896,6 +1928,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
if (ret)
goto destroy_root;
+ ret = cgroup_bpf_inherit(root_cgrp);
+ WARN_ON_ONCE(ret);
+
trace_cgroup_setup_root(root);
/*
@@ -3312,6 +3347,37 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
return 0;
}
+static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
+ struct cgroup *cgrp, int ssid)
+{
+ struct cgroup_subsys *ss = cgroup_subsys[ssid];
+ struct cgroup_subsys_state *css;
+ int ret;
+
+ if (!ss->css_extra_stat_show)
+ return 0;
+
+ css = cgroup_tryget_css(cgrp, ss);
+ if (!css)
+ return 0;
+
+ ret = ss->css_extra_stat_show(seq, css);
+ css_put(css);
+ return ret;
+}
+
+static int cpu_stat_show(struct seq_file *seq, void *v)
+{
+ struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
+ int ret = 0;
+
+ cgroup_stat_show_cputime(seq);
+#ifdef CONFIG_CGROUP_SCHED
+ ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
+#endif
+ return ret;
+}
+
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of->kn->priv;
@@ -4059,26 +4125,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
static void css_task_iter_advance(struct css_task_iter *it)
{
- struct list_head *l = it->task_pos;
+ struct list_head *next;
lockdep_assert_held(&css_set_lock);
- WARN_ON_ONCE(!l);
-
repeat:
/*
* Advance iterator to find next entry. cset->tasks is consumed
* first and then ->mg_tasks. After ->mg_tasks, we move onto the
* next cset.
*/
- l = l->next;
+ next = it->task_pos->next;
- if (l == it->tasks_head)
- l = it->mg_tasks_head->next;
+ if (next == it->tasks_head)
+ next = it->mg_tasks_head->next;
- if (l == it->mg_tasks_head)
+ if (next == it->mg_tasks_head)
css_task_iter_advance_css_set(it);
else
- it->task_pos = l;
+ it->task_pos = next;
/* if PROCS, skip over tasks which aren't group leaders */
if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
@@ -4383,6 +4447,7 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "cgroup.threads",
+ .flags = CFTYPE_NS_DELEGATABLE,
.release = cgroup_procs_release,
.seq_start = cgroup_threads_start,
.seq_next = cgroup_procs_next,
@@ -4419,6 +4484,11 @@ static struct cftype cgroup_base_files[] = {
.name = "cgroup.stat",
.seq_show = cgroup_stat_show,
},
+ {
+ .name = "cpu.stat",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_stat_show,
+ },
{ } /* terminate */
};
@@ -4479,6 +4549,8 @@ static void css_free_work_fn(struct work_struct *work)
*/
cgroup_put(cgroup_parent(cgrp));
kernfs_put(cgrp->kn);
+ if (cgroup_on_dfl(cgrp))
+ cgroup_stat_exit(cgrp);
kfree(cgrp);
} else {
/*
@@ -4523,6 +4595,9 @@ static void css_release_work_fn(struct work_struct *work)
/* cgroup release path */
trace_cgroup_release(cgrp);
+ if (cgroup_on_dfl(cgrp))
+ cgroup_stat_flush(cgrp);
+
for (tcgrp = cgroup_parent(cgrp); tcgrp;
tcgrp = cgroup_parent(tcgrp))
tcgrp->nr_dying_descendants--;
@@ -4706,6 +4781,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
if (ret)
goto out_free_cgrp;
+ if (cgroup_on_dfl(parent)) {
+ ret = cgroup_stat_init(cgrp);
+ if (ret)
+ goto out_cancel_ref;
+ }
+
/*
* Temporarily set the pointer to NULL, so idr_find() won't return
* a half-baked cgroup.
@@ -4713,7 +4794,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
if (cgrp->id < 0) {
ret = -ENOMEM;
- goto out_cancel_ref;
+ goto out_stat_exit;
}
init_cgroup_housekeeping(cgrp);
@@ -4721,6 +4802,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
cgrp->self.parent = &parent->self;
cgrp->root = root;
cgrp->level = level;
+ ret = cgroup_bpf_inherit(cgrp);
+ if (ret)
+ goto out_idr_free;
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
@@ -4755,13 +4839,15 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
if (!cgroup_on_dfl(cgrp))
cgrp->subtree_control = cgroup_control(cgrp);
- if (parent)
- cgroup_bpf_inherit(cgrp, parent);
-
cgroup_propagate_control(cgrp);
return cgrp;
+out_idr_free:
+ cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
+out_stat_exit:
+ if (cgroup_on_dfl(parent))
+ cgroup_stat_exit(cgrp);
out_cancel_ref:
percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
@@ -5156,6 +5242,8 @@ int __init cgroup_init(void)
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
+ cgroup_stat_boot();
+
/*
* The latency of the synchronize_sched() is too high for cgroups,
* avoid it at the cost of forcing all readers into the slow path.
@@ -5744,15 +5832,103 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
#endif /* CONFIG_SOCK_CGROUP_DATA */
#ifdef CONFIG_CGROUP_BPF
-int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type, bool overridable)
+int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 flags)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 flags)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
{
- struct cgroup *parent = cgroup_parent(cgrp);
int ret;
mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable);
+ ret = __cgroup_bpf_query(cgrp, attr, uattr);
mutex_unlock(&cgroup_mutex);
return ret;
}
#endif /* CONFIG_CGROUP_BPF */
+
+#ifdef CONFIG_SYSFS
+static ssize_t show_delegatable_files(struct cftype *files, char *buf,
+ ssize_t size, const char *prefix)
+{
+ struct cftype *cft;
+ ssize_t ret = 0;
+
+ for (cft = files; cft && cft->name[0] != '\0'; cft++) {
+ if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
+ continue;
+
+ if (prefix)
+ ret += snprintf(buf + ret, size - ret, "%s.", prefix);
+
+ ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
+
+ if (unlikely(ret >= size)) {
+ WARN_ON(1);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct cgroup_subsys *ss;
+ int ssid;
+ ssize_t ret = 0;
+
+ ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
+ NULL);
+
+ for_each_subsys(ss, ssid)
+ ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
+ PAGE_SIZE - ret,
+ cgroup_subsys_name[ssid]);
+
+ return ret;
+}
+static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
+
+static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
+}
+static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
+
+static struct attribute *cgroup_sysfs_attrs[] = {
+ &cgroup_delegate_attr.attr,
+ &cgroup_features_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group cgroup_sysfs_attr_group = {
+ .attrs = cgroup_sysfs_attrs,
+ .name = "cgroup",
+};
+
+static int __init cgroup_sysfs_init(void)
+{
+ return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
+}
+subsys_initcall(cgroup_sysfs_init);
+#endif /* CONFIG_SYSFS */
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 5f780d8f6a9d..9caeda610249 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -50,7 +50,7 @@ static int current_css_set_read(struct seq_file *seq, void *v)
spin_lock_irq(&css_set_lock);
rcu_read_lock();
- cset = rcu_dereference(current->cgroups);
+ cset = task_css_set(current);
refcnt = refcount_read(&cset->refcount);
seq_printf(seq, "css_set %pK %d", cset, refcnt);
if (refcnt > cset->nr_tasks)
@@ -96,7 +96,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
spin_lock_irq(&css_set_lock);
rcu_read_lock();
- cset = rcu_dereference(current->cgroups);
+ cset = task_css_set(current);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
new file mode 100644
index 000000000000..1e111dd455c4
--- /dev/null
+++ b/kernel/cgroup/stat.c
@@ -0,0 +1,338 @@
+#include "cgroup-internal.h"
+
+#include <linux/sched/cputime.h>
+
+static DEFINE_MUTEX(cgroup_stat_mutex);
+static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock);
+
+static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu)
+{
+ return per_cpu_ptr(cgrp->cpu_stat, cpu);
+}
+
+/**
+ * cgroup_cpu_stat_updated - keep track of updated cpu_stat
+ * @cgrp: target cgroup
+ * @cpu: cpu on which cpu_stat was updated
+ *
+ * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching
+ * cpu_stat->updated_children list. See the comment on top of
+ * cgroup_cpu_stat definition for details.
+ */
+static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu)
+{
+ raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+ struct cgroup *parent;
+ unsigned long flags;
+
+ /*
+ * Speculative already-on-list test. This may race leading to
+ * temporary inaccuracies, which is fine.
+ *
+ * Because @parent's updated_children is terminated with @parent
+ * instead of NULL, we can tell whether @cgrp is on the list by
+ * testing the next pointer for NULL.
+ */
+ if (cgroup_cpu_stat(cgrp, cpu)->updated_next)
+ return;
+
+ raw_spin_lock_irqsave(cpu_lock, flags);
+
+ /* put @cgrp and all ancestors on the corresponding updated lists */
+ for (parent = cgroup_parent(cgrp); parent;
+ cgrp = parent, parent = cgroup_parent(cgrp)) {
+ struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+ struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+
+ /*
+ * Both additions and removals are bottom-up. If a cgroup
+ * is already in the tree, all ancestors are.
+ */
+ if (cstat->updated_next)
+ break;
+
+ cstat->updated_next = pcstat->updated_children;
+ pcstat->updated_children = cgrp;
+ }
+
+ raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+
+/**
+ * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree
+ * @pos: current position
+ * @root: root of the tree to traversal
+ * @cpu: target cpu
+ *
+ * Walks the udpated cpu_stat tree on @cpu from @root. %NULL @pos starts
+ * the traversal and %NULL return indicates the end. During traversal,
+ * each returned cgroup is unlinked from the tree. Must be called with the
+ * matching cgroup_cpu_stat_lock held.
+ *
+ * The only ordering guarantee is that, for a parent and a child pair
+ * covered by a given traversal, if a child is visited, its parent is
+ * guaranteed to be visited afterwards.
+ */
+static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos,
+ struct cgroup *root, int cpu)
+{
+ struct cgroup_cpu_stat *cstat;
+ struct cgroup *parent;
+
+ if (pos == root)
+ return NULL;
+
+ /*
+ * We're gonna walk down to the first leaf and visit/remove it. We
+ * can pick whatever unvisited node as the starting point.
+ */
+ if (!pos)
+ pos = root;
+ else
+ pos = cgroup_parent(pos);
+
+ /* walk down to the first leaf */
+ while (true) {
+ cstat = cgroup_cpu_stat(pos, cpu);
+ if (cstat->updated_children == pos)
+ break;
+ pos = cstat->updated_children;
+ }
+
+ /*
+ * Unlink @pos from the tree. As the updated_children list is
+ * singly linked, we have to walk it to find the removal point.
+ * However, due to the way we traverse, @pos will be the first
+ * child in most cases. The only exception is @root.
+ */
+ parent = cgroup_parent(pos);
+ if (parent && cstat->updated_next) {
+ struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+ struct cgroup_cpu_stat *ncstat;
+ struct cgroup **nextp;
+
+ nextp = &pcstat->updated_children;
+ while (true) {
+ ncstat = cgroup_cpu_stat(*nextp, cpu);
+ if (*nextp == pos)
+ break;
+
+ WARN_ON_ONCE(*nextp == parent);
+ nextp = &ncstat->updated_next;
+ }
+
+ *nextp = cstat->updated_next;
+ cstat->updated_next = NULL;
+ }
+
+ return pos;
+}
+
+static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat,
+ struct cgroup_stat *src_stat)
+{
+ dst_stat->cputime.utime += src_stat->cputime.utime;
+ dst_stat->cputime.stime += src_stat->cputime.stime;
+ dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime;
+}
+
+static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+ struct task_cputime *last_cputime = &cstat->last_cputime;
+ struct task_cputime cputime;
+ struct cgroup_stat delta;
+ unsigned seq;
+
+ lockdep_assert_held(&cgroup_stat_mutex);
+
+ /* fetch the current per-cpu values */
+ do {
+ seq = __u64_stats_fetch_begin(&cstat->sync);
+ cputime = cstat->cputime;
+ } while (__u64_stats_fetch_retry(&cstat->sync, seq));
+
+ /* accumulate the deltas to propgate */
+ delta.cputime.utime = cputime.utime - last_cputime->utime;
+ delta.cputime.stime = cputime.stime - last_cputime->stime;
+ delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
+ last_cputime->sum_exec_runtime;
+ *last_cputime = cputime;
+
+ /* transfer the pending stat into delta */
+ cgroup_stat_accumulate(&delta, &cgrp->pending_stat);
+ memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat));
+
+ /* propagate delta into the global stat and the parent's pending */
+ cgroup_stat_accumulate(&cgrp->stat, &delta);
+ if (parent)
+ cgroup_stat_accumulate(&parent->pending_stat, &delta);
+}
+
+/* see cgroup_stat_flush() */
+static void cgroup_stat_flush_locked(struct cgroup *cgrp)
+{
+ int cpu;
+
+ lockdep_assert_held(&cgroup_stat_mutex);
+
+ for_each_possible_cpu(cpu) {
+ raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+ struct cgroup *pos = NULL;
+
+ raw_spin_lock_irq(cpu_lock);
+ while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu)))
+ cgroup_cpu_stat_flush_one(pos, cpu);
+ raw_spin_unlock_irq(cpu_lock);
+ }
+}
+
+/**
+ * cgroup_stat_flush - flush stats in @cgrp's subtree
+ * @cgrp: target cgroup
+ *
+ * Collect all per-cpu stats in @cgrp's subtree into the global counters
+ * and propagate them upwards. After this function returns, all cgroups in
+ * the subtree have up-to-date ->stat.
+ *
+ * This also gets all cgroups in the subtree including @cgrp off the
+ * ->updated_children lists.
+ */
+void cgroup_stat_flush(struct cgroup *cgrp)
+{
+ mutex_lock(&cgroup_stat_mutex);
+ cgroup_stat_flush_locked(cgrp);
+ mutex_unlock(&cgroup_stat_mutex);
+}
+
+static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp)
+{
+ struct cgroup_cpu_stat *cstat;
+
+ cstat = get_cpu_ptr(cgrp->cpu_stat);
+ u64_stats_update_begin(&cstat->sync);
+ return cstat;
+}
+
+static void cgroup_cpu_stat_account_end(struct cgroup *cgrp,
+ struct cgroup_cpu_stat *cstat)
+{
+ u64_stats_update_end(&cstat->sync);
+ cgroup_cpu_stat_updated(cgrp, smp_processor_id());
+ put_cpu_ptr(cstat);
+}
+
+void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
+{
+ struct cgroup_cpu_stat *cstat;
+
+ cstat = cgroup_cpu_stat_account_begin(cgrp);
+ cstat->cputime.sum_exec_runtime += delta_exec;
+ cgroup_cpu_stat_account_end(cgrp, cstat);
+}
+
+void __cgroup_account_cputime_field(struct cgroup *cgrp,
+ enum cpu_usage_stat index, u64 delta_exec)
+{
+ struct cgroup_cpu_stat *cstat;
+
+ cstat = cgroup_cpu_stat_account_begin(cgrp);
+
+ switch (index) {
+ case CPUTIME_USER:
+ case CPUTIME_NICE:
+ cstat->cputime.utime += delta_exec;
+ break;
+ case CPUTIME_SYSTEM:
+ case CPUTIME_IRQ:
+ case CPUTIME_SOFTIRQ:
+ cstat->cputime.stime += delta_exec;
+ break;
+ default:
+ break;
+ }
+
+ cgroup_cpu_stat_account_end(cgrp, cstat);
+}
+
+void cgroup_stat_show_cputime(struct seq_file *seq)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ u64 usage, utime, stime;
+
+ if (!cgroup_parent(cgrp))
+ return;
+
+ mutex_lock(&cgroup_stat_mutex);
+
+ cgroup_stat_flush_locked(cgrp);
+
+ usage = cgrp->stat.cputime.sum_exec_runtime;
+ cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime,
+ &utime, &stime);
+
+ mutex_unlock(&cgroup_stat_mutex);
+
+ do_div(usage, NSEC_PER_USEC);
+ do_div(utime, NSEC_PER_USEC);
+ do_div(stime, NSEC_PER_USEC);
+
+ seq_printf(seq, "usage_usec %llu\n"
+ "user_usec %llu\n"
+ "system_usec %llu\n",
+ usage, utime, stime);
+}
+
+int cgroup_stat_init(struct cgroup *cgrp)
+{
+ int cpu;
+
+ /* the root cgrp has cpu_stat preallocated */
+ if (!cgrp->cpu_stat) {
+ cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat);
+ if (!cgrp->cpu_stat)
+ return -ENOMEM;
+ }
+
+ /* ->updated_children list is self terminated */
+ for_each_possible_cpu(cpu) {
+ struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+
+ cstat->updated_children = cgrp;
+ u64_stats_init(&cstat->sync);
+ }
+
+ prev_cputime_init(&cgrp->stat.prev_cputime);
+
+ return 0;
+}
+
+void cgroup_stat_exit(struct cgroup *cgrp)
+{
+ int cpu;
+
+ cgroup_stat_flush(cgrp);
+
+ /* sanity check */
+ for_each_possible_cpu(cpu) {
+ struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+
+ if (WARN_ON_ONCE(cstat->updated_children != cgrp) ||
+ WARN_ON_ONCE(cstat->updated_next))
+ return;
+ }
+
+ free_percpu(cgrp->cpu_stat);
+ cgrp->cpu_stat = NULL;
+}
+
+void __init cgroup_stat_boot(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu));
+
+ BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp));
+}
diff --git a/kernel/compat.c b/kernel/compat.c
index 772e038d04d9..d1cee656a7ed 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -367,24 +367,6 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
return ret;
}
-int get_compat_itimerspec(struct itimerspec *dst,
- const struct compat_itimerspec __user *src)
-{
- if (__compat_get_timespec(&dst->it_interval, &src->it_interval) ||
- __compat_get_timespec(&dst->it_value, &src->it_value))
- return -EFAULT;
- return 0;
-}
-
-int put_compat_itimerspec(struct compat_itimerspec __user *dst,
- const struct itimerspec *src)
-{
- if (__compat_put_timespec(&src->it_interval, &dst->it_interval) ||
- __compat_put_timespec(&src->it_value, &dst->it_value))
- return -EFAULT;
- return 0;
-}
-
int get_compat_itimerspec64(struct itimerspec64 *its,
const struct compat_itimerspec __user *uits)
{
@@ -485,27 +467,44 @@ Efault:
return -EFAULT;
}
-void
-sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)
+int
+get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)
{
+#ifdef __BIG_ENDIAN
+ compat_sigset_t v;
+ if (copy_from_user(&v, compat, sizeof(compat_sigset_t)))
+ return -EFAULT;
switch (_NSIG_WORDS) {
- case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
- case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 );
- case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 );
- case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
+ case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 );
+ case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 );
+ case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 );
+ case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 );
}
+#else
+ if (copy_from_user(set, compat, sizeof(compat_sigset_t)))
+ return -EFAULT;
+#endif
+ return 0;
}
-EXPORT_SYMBOL_GPL(sigset_from_compat);
+EXPORT_SYMBOL_GPL(get_compat_sigset);
-void
-sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
+int
+put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set,
+ unsigned int size)
{
+ /* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */
+#ifdef __BIG_ENDIAN
+ compat_sigset_t v;
switch (_NSIG_WORDS) {
- case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3];
- case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2];
- case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1];
- case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0];
+ case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3];
+ case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2];
+ case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1];
+ case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0];
}
+ return copy_to_user(compat, &v, size) ? -EFAULT : 0;
+#else
+ return copy_to_user(compat, set, size) ? -EFAULT : 0;
+#endif
}
#ifdef CONFIG_NUMA
@@ -563,22 +562,6 @@ COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
}
#endif
-COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
- compat_pid_t, pid,
- struct compat_timespec __user *, interval)
-{
- struct timespec t;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
- set_fs(old_fs);
- if (compat_put_timespec(&t, interval))
- return -EFAULT;
- return ret;
-}
-
/*
* Allocate user-space memory for the duration of a single system call,
* in order to marshall parameters inside a compat thunk.
diff --git a/kernel/configs/nopm.config b/kernel/configs/nopm.config
new file mode 100644
index 000000000000..81ff07863576
--- /dev/null
+++ b/kernel/configs/nopm.config
@@ -0,0 +1,15 @@
+CONFIG_PM=n
+CONFIG_SUSPEND=n
+CONFIG_HIBERNATION=n
+
+# Triggers PM on OMAP
+CONFIG_CPU_IDLE=n
+
+# Triggers enablement via hibernate callbacks
+CONFIG_XEN=n
+
+# ARM/ARM64 architectures that select PM unconditionally
+CONFIG_ARCH_OMAP2PLUS_TYPICAL=n
+CONFIG_ARCH_RENESAS=n
+CONFIG_ARCH_TEGRA=n
+CONFIG_ARCH_VEXPRESS=n
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 04892a82f6ac..53f7dc65f9a3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -80,19 +80,19 @@ static struct lockdep_map cpuhp_state_down_map =
STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);
-static void inline cpuhp_lock_acquire(bool bringup)
+static inline void cpuhp_lock_acquire(bool bringup)
{
lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
}
-static void inline cpuhp_lock_release(bool bringup)
+static inline void cpuhp_lock_release(bool bringup)
{
lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
}
#else
-static void inline cpuhp_lock_acquire(bool bringup) { }
-static void inline cpuhp_lock_release(bool bringup) { }
+static inline void cpuhp_lock_acquire(bool bringup) { }
+static inline void cpuhp_lock_release(bool bringup) { }
#endif
@@ -780,8 +780,8 @@ static int takedown_cpu(unsigned int cpu)
BUG_ON(cpu_online(cpu));
/*
- * The CPUHP_AP_SCHED_MIGRATE_DYING callback will have removed all
- * runnable tasks from the cpu, there's only the idle task left now
+ * The teardown callback for CPUHP_AP_SCHED_STARTING will have removed
+ * all runnable tasks from the CPU, there's only the idle task left now
* that the migration thread is done doing the stop_machine thing.
*
* Wait for the stop thread to go away.
@@ -1277,9 +1277,9 @@ static struct cpuhp_step cpuhp_bp_states[] = {
* before blk_mq_queue_reinit_notify() from notify_dead(),
* otherwise a RCU stall occurs.
*/
- [CPUHP_TIMERS_DEAD] = {
+ [CPUHP_TIMERS_PREPARE] = {
.name = "timers:dead",
- .startup.single = NULL,
+ .startup.single = timers_prepare_cpu,
.teardown.single = timers_dead_cpu,
},
/* Kicks the plugged cpu into life */
@@ -1289,11 +1289,6 @@ static struct cpuhp_step cpuhp_bp_states[] = {
.teardown.single = NULL,
.cant_stop = true,
},
- [CPUHP_AP_SMPCFD_DYING] = {
- .name = "smpcfd:dying",
- .startup.single = NULL,
- .teardown.single = smpcfd_dying_cpu,
- },
/*
* Handled on controll processor until the plugged processor manages
* this itself.
@@ -1335,6 +1330,11 @@ static struct cpuhp_step cpuhp_ap_states[] = {
.startup.single = NULL,
.teardown.single = rcutree_dying_cpu,
},
+ [CPUHP_AP_SMPCFD_DYING] = {
+ .name = "smpcfd:dying",
+ .startup.single = NULL,
+ .teardown.single = smpcfd_dying_cpu,
+ },
/* Entry state on starting. Interrupts enabled from here on. Transient
* state for synchronsization */
[CPUHP_AP_ONLINE] = {
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 6db80fc0810b..4f63597c824d 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -108,7 +108,8 @@ static int __init parse_crashkernel_mem(char *cmdline,
return -EINVAL;
}
}
- }
+ } else
+ pr_info("crashkernel size resulted in zero bytes\n");
return 0;
}
@@ -409,7 +410,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_SYMBOL(contig_page_data);
#endif
#ifdef CONFIG_SPARSEMEM
- VMCOREINFO_SYMBOL(mem_section);
+ VMCOREINFO_SYMBOL_ARRAY(mem_section);
VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
VMCOREINFO_STRUCT_SIZE(mem_section);
VMCOREINFO_OFFSET(mem_section, section_mem_map);
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index e74be38245ad..ed5d34925ad0 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -350,7 +350,7 @@ poll_again:
}
kdb_printf("\n");
for (i = 0; i < count; i++) {
- if (kallsyms_symbol_next(p_tmp, i) < 0)
+ if (WARN_ON(!kallsyms_symbol_next(p_tmp, i)))
break;
kdb_printf("%s ", p_tmp);
*(p_tmp + len) = '\0';
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index c8146d53ca67..dbb0781a0533 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2441,7 +2441,6 @@ static int kdb_kill(int argc, const char **argv)
long sig, pid;
char *endp;
struct task_struct *p;
- struct siginfo info;
if (argc != 2)
return KDB_ARGCOUNT;
@@ -2449,7 +2448,7 @@ static int kdb_kill(int argc, const char **argv)
sig = simple_strtol(argv[1], &endp, 0);
if (*endp)
return KDB_BADINT;
- if (sig >= 0) {
+ if ((sig >= 0) || !valid_signal(-sig)) {
kdb_printf("Invalid signal parameter.<-signal>\n");
return 0;
}
@@ -2470,12 +2469,7 @@ static int kdb_kill(int argc, const char **argv)
return 0;
}
p = p->group_leader;
- info.si_signo = sig;
- info.si_errno = 0;
- info.si_code = SI_USER;
- info.si_pid = pid; /* same capabilities as process being signalled */
- info.si_uid = 0; /* kdb has root authority */
- kdb_send_sig_info(p, &info);
+ kdb_send_sig(p, sig);
return 0;
}
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index fc224fbcf954..1e5a502ba4a7 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -208,7 +208,7 @@ extern unsigned long kdb_task_state(const struct task_struct *p,
extern void kdb_ps_suppressed(void);
extern void kdb_ps1(const struct task_struct *p);
extern void kdb_print_nameval(const char *name, unsigned long val);
-extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
+extern void kdb_send_sig(struct task_struct *p, int sig);
extern void kdb_meminfo_proc_show(void);
extern char *kdb_getstr(char *, size_t, const char *);
extern void kdb_gdb_state_pass(char *buf);
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 4a1c33416b6a..e2764d767f18 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -51,16 +51,16 @@ void __delayacct_tsk_init(struct task_struct *tsk)
* Finish delay accounting for a statistic using its timestamps (@start),
* accumalator (@total) and @count
*/
-static void delayacct_end(u64 *start, u64 *total, u32 *count)
+static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count)
{
s64 ns = ktime_get_ns() - *start;
unsigned long flags;
if (ns > 0) {
- spin_lock_irqsave(&current->delays->lock, flags);
+ spin_lock_irqsave(lock, flags);
*total += ns;
(*count)++;
- spin_unlock_irqrestore(&current->delays->lock, flags);
+ spin_unlock_irqrestore(lock, flags);
}
}
@@ -69,17 +69,25 @@ void __delayacct_blkio_start(void)
current->delays->blkio_start = ktime_get_ns();
}
-void __delayacct_blkio_end(void)
+/*
+ * We cannot rely on the `current` macro, as we haven't yet switched back to
+ * the process being woken.
+ */
+void __delayacct_blkio_end(struct task_struct *p)
{
- if (current->delays->flags & DELAYACCT_PF_SWAPIN)
- /* Swapin block I/O */
- delayacct_end(&current->delays->blkio_start,
- &current->delays->swapin_delay,
- &current->delays->swapin_count);
- else /* Other block I/O */
- delayacct_end(&current->delays->blkio_start,
- &current->delays->blkio_delay,
- &current->delays->blkio_count);
+ struct task_delay_info *delays = p->delays;
+ u64 *total;
+ u32 *count;
+
+ if (p->delays->flags & DELAYACCT_PF_SWAPIN) {
+ total = &delays->swapin_delay;
+ count = &delays->swapin_count;
+ } else {
+ total = &delays->blkio_delay;
+ count = &delays->blkio_count;
+ }
+
+ delayacct_end(&delays->lock, &delays->blkio_start, total, count);
}
int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
@@ -153,8 +161,10 @@ void __delayacct_freepages_start(void)
void __delayacct_freepages_end(void)
{
- delayacct_end(&current->delays->freepages_start,
- &current->delays->freepages_delay,
- &current->delays->freepages_count);
+ delayacct_end(
+ &current->delays->lock,
+ &current->delays->freepages_start,
+ &current->delays->freepages_delay,
+ &current->delays->freepages_count);
}
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 1b2be63c8528..772a43fea825 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -179,21 +179,6 @@ put_callchain_entry(int rctx)
}
struct perf_callchain_entry *
-perf_callchain(struct perf_event *event, struct pt_regs *regs)
-{
- bool kernel = !event->attr.exclude_callchain_kernel;
- bool user = !event->attr.exclude_callchain_user;
- /* Disallow cross-task user callchains. */
- bool crosstask = event->ctx->task && event->ctx->task != current;
- const u32 max_stack = event->attr.sample_max_stack;
-
- if (!kernel && !user)
- return NULL;
-
- return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true);
-}
-
-struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
u32 max_stack, bool crosstask, bool add_mark)
{
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4c39c05e029a..d0d9bfb47d2e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1231,6 +1231,10 @@ static void put_ctx(struct perf_event_context *ctx)
* perf_event_context::lock
* perf_event::mmap_mutex
* mmap_sem
+ *
+ * cpu_hotplug_lock
+ * pmus_lock
+ * cpuctx->mutex / perf_event_context::mutex
*/
static struct perf_event_context *
perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
@@ -3601,7 +3605,6 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
goto out;
}
-
/*
* If the event is currently on this CPU, its either a per-task event,
* or local to this CPU. Furthermore it means its ACTIVE (otherwise
@@ -4197,6 +4200,7 @@ int perf_event_release_kernel(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
struct perf_event *child, *tmp;
+ LIST_HEAD(free_list);
/*
* If we got here through err_file: fput(event_file); we will not have
@@ -4269,8 +4273,7 @@ again:
struct perf_event, child_list);
if (tmp == child) {
perf_remove_from_context(child, DETACH_GROUP);
- list_del(&child->child_list);
- free_event(child);
+ list_move(&child->child_list, &free_list);
/*
* This matches the refcount bump in inherit_event();
* this can't be the last reference.
@@ -4285,6 +4288,11 @@ again:
}
mutex_unlock(&event->child_mutex);
+ list_for_each_entry_safe(child, tmp, &free_list, child_list) {
+ list_del(&child->child_list);
+ free_event(child);
+ }
+
no_ctx:
put_event(event); /* Must be the 'last' reference */
return 0;
@@ -4512,11 +4520,11 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
return ret;
}
-static unsigned int perf_poll(struct file *file, poll_table *wait)
+static __poll_t perf_poll(struct file *file, poll_table *wait)
{
struct perf_event *event = file->private_data;
struct ring_buffer *rb;
- unsigned int events = POLLHUP;
+ __poll_t events = POLLHUP;
poll_wait(file, &event->waitq, wait);
@@ -4905,6 +4913,7 @@ void perf_event_update_userpage(struct perf_event *event)
unlock:
rcu_read_unlock();
}
+EXPORT_SYMBOL_GPL(perf_event_update_userpage);
static int perf_mmap_fault(struct vm_fault *vmf)
{
@@ -5816,19 +5825,11 @@ void perf_output_sample(struct perf_output_handle *handle,
perf_output_read(handle, event);
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
- if (data->callchain) {
- int size = 1;
-
- if (data->callchain)
- size += data->callchain->nr;
-
- size *= sizeof(u64);
+ int size = 1;
- __output_copy(handle, data->callchain, size);
- } else {
- u64 nr = 0;
- perf_output_put(handle, nr);
- }
+ size += data->callchain->nr;
+ size *= sizeof(u64);
+ __output_copy(handle, data->callchain, size);
}
if (sample_type & PERF_SAMPLE_RAW) {
@@ -5981,6 +5982,26 @@ static u64 perf_virt_to_phys(u64 virt)
return phys_addr;
}
+static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
+
+static struct perf_callchain_entry *
+perf_callchain(struct perf_event *event, struct pt_regs *regs)
+{
+ bool kernel = !event->attr.exclude_callchain_kernel;
+ bool user = !event->attr.exclude_callchain_user;
+ /* Disallow cross-task user callchains. */
+ bool crosstask = event->ctx->task && event->ctx->task != current;
+ const u32 max_stack = event->attr.sample_max_stack;
+ struct perf_callchain_entry *callchain;
+
+ if (!kernel && !user)
+ return &__empty_callchain;
+
+ callchain = get_perf_callchain(regs, 0, kernel, user,
+ max_stack, crosstask, true);
+ return callchain ?: &__empty_callchain;
+}
+
void perf_prepare_sample(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event,
@@ -6003,9 +6024,7 @@ void perf_prepare_sample(struct perf_event_header *header,
int size = 1;
data->callchain = perf_callchain(event, regs);
-
- if (data->callchain)
- size += data->callchain->nr;
+ size += data->callchain->nr;
header->size += size * sizeof(u64);
}
@@ -6640,6 +6659,7 @@ static void perf_event_namespaces_output(struct perf_event *event,
struct perf_namespaces_event *namespaces_event = data;
struct perf_output_handle handle;
struct perf_sample_data sample;
+ u16 header_size = namespaces_event->event_id.header.size;
int ret;
if (!perf_event_namespaces_match(event))
@@ -6650,7 +6670,7 @@ static void perf_event_namespaces_output(struct perf_event *event,
ret = perf_output_begin(&handle, event,
namespaces_event->event_id.header.size);
if (ret)
- return;
+ goto out;
namespaces_event->event_id.pid = perf_event_pid(event,
namespaces_event->task);
@@ -6662,6 +6682,8 @@ static void perf_event_namespaces_output(struct perf_event *event,
perf_event__output_id_sample(event, &handle, &sample);
perf_output_end(&handle);
+out:
+ namespaces_event->event_id.header.size = header_size;
}
static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
@@ -6677,6 +6699,7 @@ static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
ns_inode = ns_path.dentry->d_inode;
ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
ns_link_info->ino = ns_inode->i_ino;
+ path_put(&ns_path);
}
}
@@ -7867,25 +7890,24 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
struct pt_regs *regs, struct hlist_head *head,
struct task_struct *task)
{
- struct bpf_prog *prog = call->prog;
-
- if (prog) {
+ if (bpf_prog_array_valid(call)) {
*(struct pt_regs **)raw_data = regs;
- if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) {
+ if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
}
}
perf_tp_event(call->event.type, count, raw_data, size, regs, head,
- rctx, task, NULL);
+ rctx, task);
}
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct pt_regs *regs, struct hlist_head *head, int rctx,
- struct task_struct *task, struct perf_event *event)
+ struct task_struct *task)
{
struct perf_sample_data data;
+ struct perf_event *event;
struct perf_raw_record raw = {
.frag = {
@@ -7899,15 +7921,9 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
perf_trace_buf_update(record, event_type);
- /* Use the given event instead of the hlist */
- if (event) {
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
- } else {
- hlist_for_each_entry_rcu(event, head, hlist_entry) {
- if (perf_tp_event_match(event, &data, regs))
- perf_swevent_event(event, count, &data, regs);
- }
}
/*
@@ -7994,11 +8010,11 @@ static void bpf_overflow_handler(struct perf_event *event,
{
struct bpf_perf_event_data_kern ctx = {
.data = data,
- .regs = regs,
.event = event,
};
int ret = 0;
+ ctx.regs = perf_arch_bpf_user_pt_regs(regs);
preempt_disable();
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
goto out;
@@ -8060,13 +8076,11 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
bool is_kprobe, is_tracepoint, is_syscall_tp;
struct bpf_prog *prog;
+ int ret;
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return perf_event_set_bpf_handler(event, prog_fd);
- if (event->tp_event->prog)
- return -EEXIST;
-
is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
is_syscall_tp = is_syscall_trace_event(event->tp_event);
@@ -8094,26 +8108,20 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
return -EACCES;
}
}
- event->tp_event->prog = prog;
- event->tp_event->bpf_prog_owner = event;
- return 0;
+ ret = perf_event_attach_bpf_prog(event, prog);
+ if (ret)
+ bpf_prog_put(prog);
+ return ret;
}
static void perf_event_free_bpf_prog(struct perf_event *event)
{
- struct bpf_prog *prog;
-
- perf_event_free_bpf_handler(event);
-
- if (!event->tp_event)
+ if (event->attr.type != PERF_TYPE_TRACEPOINT) {
+ perf_event_free_bpf_handler(event);
return;
-
- prog = event->tp_event->prog;
- if (prog && event->tp_event->bpf_prog_owner == event) {
- event->tp_event->prog = NULL;
- bpf_prog_put(prog);
}
+ perf_event_detach_bpf_prog(event);
}
#else
@@ -8528,6 +8536,29 @@ fail_clear_files:
return ret;
}
+static int
+perf_tracepoint_set_filter(struct perf_event *event, char *filter_str)
+{
+ struct perf_event_context *ctx = event->ctx;
+ int ret;
+
+ /*
+ * Beware, here be dragons!!
+ *
+ * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint
+ * stuff does not actually need it. So temporarily drop ctx->mutex. As per
+ * perf_event_ctx_lock() we already have a reference on ctx.
+ *
+ * This can result in event getting moved to a different ctx, but that
+ * does not affect the tracepoint state.
+ */
+ mutex_unlock(&ctx->mutex);
+ ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+ mutex_lock(&ctx->mutex);
+
+ return ret;
+}
+
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
char *filter_str;
@@ -8544,8 +8575,7 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg)
if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
event->attr.type == PERF_TYPE_TRACEPOINT)
- ret = ftrace_profile_set_filter(event, event->attr.config,
- filter_str);
+ ret = perf_tracepoint_set_filter(event, filter_str);
else if (has_addr_filter(event))
ret = perf_event_set_addr_filter(event, filter_str);
@@ -9180,7 +9210,13 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
if (!try_module_get(pmu->module))
return -ENODEV;
- if (event->group_leader != event) {
+ /*
+ * A number of pmu->event_init() methods iterate the sibling_list to,
+ * for example, validate if the group fits on the PMU. Therefore,
+ * if this is a sibling event, acquire the ctx->mutex to protect
+ * the sibling_list.
+ */
+ if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
/*
* This ctx->mutex can nest when we're called through
* inheritance. See the perf_event_ctx_lock_nested() comment.
@@ -10715,6 +10751,19 @@ inherit_event(struct perf_event *parent_event,
if (IS_ERR(child_event))
return child_event;
+
+ if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
+ !child_ctx->task_ctx_data) {
+ struct pmu *pmu = child_event->pmu;
+
+ child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,
+ GFP_KERNEL);
+ if (!child_ctx->task_ctx_data) {
+ free_event(child_event);
+ return NULL;
+ }
+ }
+
/*
* is_orphaned_event() and list_add_tail(&parent_event->child_list)
* must be under the same lock in order to serialize against
@@ -10725,6 +10774,7 @@ inherit_event(struct perf_event *parent_event,
if (is_orphaned_event(parent_event) ||
!atomic_long_inc_not_zero(&parent_event->refcount)) {
mutex_unlock(&parent_event->child_mutex);
+ /* task_ctx_data is freed with child_ctx */
free_event(child_event);
return NULL;
}
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 09b1537ae06c..6dc725a7e7bc 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -201,10 +201,6 @@ arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
-/* Callchain handling */
-extern struct perf_callchain_entry *
-perf_callchain(struct perf_event *event, struct pt_regs *regs);
-
static inline int get_recursion_context(int *recursion)
{
int rctx;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index f3e37971c842..141aa2ca8728 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -411,6 +411,7 @@ err:
return NULL;
}
+EXPORT_SYMBOL_GPL(perf_aux_output_begin);
static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb)
{
@@ -480,6 +481,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
rb_free_aux(rb);
ring_buffer_put(rb);
}
+EXPORT_SYMBOL_GPL(perf_aux_output_end);
/*
* Skip over a given number of bytes in the AUX buffer, due to, for example,
@@ -505,6 +507,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
return 0;
}
+EXPORT_SYMBOL_GPL(perf_aux_output_skip);
void *perf_get_aux(struct perf_output_handle *handle)
{
@@ -514,6 +517,7 @@ void *perf_get_aux(struct perf_output_handle *handle)
return handle->rb->aux_priv;
}
+EXPORT_SYMBOL_GPL(perf_get_aux);
#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 267f6ef91d97..ce6848e46e94 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1167,8 +1167,8 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
}
ret = 0;
- smp_wmb(); /* pairs with get_xol_area() */
- mm->uprobes_state.xol_area = area;
+ /* pairs with get_xol_area() */
+ smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */
fail:
up_write(&mm->mmap_sem);
@@ -1230,8 +1230,8 @@ static struct xol_area *get_xol_area(void)
if (!mm->uprobes_state.xol_area)
__create_xol_area(0);
- area = mm->uprobes_state.xol_area;
- smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
+ /* Pairs with xol_add_vma() smp_store_release() */
+ area = READ_ONCE(mm->uprobes_state.xol_area); /* ^^^ */
return area;
}
@@ -1528,8 +1528,8 @@ static unsigned long get_trampoline_vaddr(void)
struct xol_area *area;
unsigned long trampoline_vaddr = -1;
- area = current->mm->uprobes_state.xol_area;
- smp_read_barrier_depends();
+ /* Pairs with xol_add_vma() smp_store_release() */
+ area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */
if (area)
trampoline_vaddr = area->vaddr;
diff --git a/kernel/exit.c b/kernel/exit.c
index 6b4298a41167..995453d9fb55 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1755,3 +1755,12 @@ Efault:
return -EFAULT;
}
#endif
+
+__weak void abort(void)
+{
+ BUG();
+
+ /* if that doesn't kill us, halt */
+ panic("Oops failed to kill thread");
+}
+EXPORT_SYMBOL(abort);
diff --git a/kernel/fork.c b/kernel/fork.c
index 07cc743698d3..2295fc69717f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -469,7 +469,7 @@ void __init fork_init(void)
/* create a slab on which task_structs can be allocated */
task_struct_cachep = kmem_cache_create("task_struct",
arch_task_struct_size, align,
- SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL);
+ SLAB_PANIC|SLAB_ACCOUNT, NULL);
#endif
/* do the arch specific task caches init */
@@ -721,8 +721,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
goto out;
}
/* a new mm has just been created */
- arch_dup_mmap(oldmm, mm);
- retval = 0;
+ retval = arch_dup_mmap(oldmm, mm);
out:
up_write(&mm->mmap_sem);
flush_tlb_mm(oldmm);
@@ -817,8 +816,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
init_rwsem(&mm->mmap_sem);
INIT_LIST_HEAD(&mm->mmlist);
mm->core_state = NULL;
- atomic_long_set(&mm->nr_ptes, 0);
- mm_nr_pmds_init(mm);
+ mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
mm->pinned_vm = 0;
@@ -872,12 +870,9 @@ static void check_mm(struct mm_struct *mm)
"mm:%p idx:%d val:%ld\n", mm, i, x);
}
- if (atomic_long_read(&mm->nr_ptes))
- pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n",
- atomic_long_read(&mm->nr_ptes));
- if (mm_nr_pmds(mm))
- pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
- mm_nr_pmds(mm));
+ if (mm_pgtables_bytes(mm))
+ pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
+ mm_pgtables_bytes(mm));
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
@@ -1875,7 +1870,7 @@ static __latent_entropy struct task_struct *copy_process(
retval = -ERESTARTNOINTR;
goto bad_fork_cancel_cgroup;
}
- if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) {
+ if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
retval = -ENOMEM;
goto bad_fork_cancel_cgroup;
}
@@ -2209,18 +2204,18 @@ void __init proc_caches_init(void)
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
- SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor);
+ SLAB_ACCOUNT, sighand_ctor);
signal_cachep = kmem_cache_create("signal_cache",
sizeof(struct signal_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
files_cachep = kmem_cache_create("files_cache",
sizeof(struct files_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
/*
* FIXME! The "sizeof(struct mm_struct)" currently includes the
@@ -2231,7 +2226,7 @@ void __init proc_caches_init(void)
*/
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
mmap_init();
diff --git a/kernel/futex.c b/kernel/futex.c
index 76ed5921117a..7f719d110908 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1582,8 +1582,8 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
{
unsigned int op = (encoded_op & 0x70000000) >> 28;
unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
- int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 12);
- int cmparg = sign_extend32(encoded_op & 0x00000fff, 12);
+ int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
+ int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
int oldval, ret;
if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
@@ -1878,6 +1878,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
struct futex_q *this, *next;
DEFINE_WAKE_Q(wake_q);
+ if (nr_wake < 0 || nr_requeue < 0)
+ return -EINVAL;
+
/*
* When PI not supported: return -ENOSYS if requeue_pi is true,
* consequently the compiler knows requeue_pi is always false past
@@ -2294,34 +2297,33 @@ static void unqueue_me_pi(struct futex_q *q)
spin_unlock(q->lock_ptr);
}
-/*
- * Fixup the pi_state owner with the new owner.
- *
- * Must be called with hash bucket lock held and mm->sem held for non
- * private futexes.
- */
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
- struct task_struct *newowner)
+ struct task_struct *argowner)
{
- u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
u32 uval, uninitialized_var(curval), newval;
- struct task_struct *oldowner;
+ struct task_struct *oldowner, *newowner;
+ u32 newtid;
int ret;
+ lockdep_assert_held(q->lock_ptr);
+
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
oldowner = pi_state->owner;
- /* Owner died? */
- if (!pi_state->owner)
- newtid |= FUTEX_OWNER_DIED;
/*
- * We are here either because we stole the rtmutex from the
- * previous highest priority waiter or we are the highest priority
- * waiter but have failed to get the rtmutex the first time.
+ * We are here because either:
+ *
+ * - we stole the lock and pi_state->owner needs updating to reflect
+ * that (@argowner == current),
+ *
+ * or:
*
- * We have to replace the newowner TID in the user space variable.
+ * - someone stole our lock and we need to fix things to point to the
+ * new owner (@argowner == NULL).
+ *
+ * Either way, we have to replace the TID in the user space variable.
* This must be atomic as we have to preserve the owner died bit here.
*
* Note: We write the user space value _before_ changing the pi_state
@@ -2334,6 +2336,45 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
* in the PID check in lookup_pi_state.
*/
retry:
+ if (!argowner) {
+ if (oldowner != current) {
+ /*
+ * We raced against a concurrent self; things are
+ * already fixed up. Nothing to do.
+ */
+ ret = 0;
+ goto out_unlock;
+ }
+
+ if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
+ /* We got the lock after all, nothing to fix. */
+ ret = 0;
+ goto out_unlock;
+ }
+
+ /*
+ * Since we just failed the trylock; there must be an owner.
+ */
+ newowner = rt_mutex_owner(&pi_state->pi_mutex);
+ BUG_ON(!newowner);
+ } else {
+ WARN_ON_ONCE(argowner != current);
+ if (oldowner == current) {
+ /*
+ * We raced against a concurrent self; things are
+ * already fixed up. Nothing to do.
+ */
+ ret = 0;
+ goto out_unlock;
+ }
+ newowner = argowner;
+ }
+
+ newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
+ /* Owner died? */
+ if (!pi_state->owner)
+ newtid |= FUTEX_OWNER_DIED;
+
if (get_futex_value_locked(&uval, uaddr))
goto handle_fault;
@@ -2434,9 +2475,9 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
* Got the lock. We might not be the anticipated owner if we
* did a lock-steal - fix up the PI-state in that case:
*
- * We can safely read pi_state->owner without holding wait_lock
- * because we now own the rt_mutex, only the owner will attempt
- * to change it.
+ * Speculative pi_state->owner read (we don't hold wait_lock);
+ * since we own the lock pi_state->owner == current is the
+ * stable state, anything else needs more attention.
*/
if (q->pi_state->owner != current)
ret = fixup_pi_state_owner(uaddr, q, current);
@@ -2444,6 +2485,19 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
}
/*
+ * If we didn't get the lock; check if anybody stole it from us. In
+ * that case, we need to fix up the uval to point to them instead of
+ * us, otherwise bad things happen. [10]
+ *
+ * Another speculative read; pi_state->owner == current is unstable
+ * but needs our attention.
+ */
+ if (q->pi_state->owner == current) {
+ ret = fixup_pi_state_owner(uaddr, q, NULL);
+ goto out;
+ }
+
+ /*
* Paranoia check. If we did not take the lock, then we should not be
* the owner of the rt_mutex.
*/
diff --git a/kernel/groups.c b/kernel/groups.c
index e357bc800111..daae2f2dc6d4 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -86,11 +86,12 @@ static int gid_cmp(const void *_a, const void *_b)
return gid_gt(a, b) - gid_lt(a, b);
}
-static void groups_sort(struct group_info *group_info)
+void groups_sort(struct group_info *group_info)
{
sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid),
gid_cmp, NULL);
}
+EXPORT_SYMBOL(groups_sort);
/* a simple bsearch */
int groups_search(const struct group_info *group_info, kgid_t grp)
@@ -122,7 +123,6 @@ int groups_search(const struct group_info *group_info, kgid_t grp)
void set_groups(struct cred *new, struct group_info *group_info)
{
put_group_info(new->group_info);
- groups_sort(group_info);
get_group_info(group_info);
new->group_info = group_info;
}
@@ -206,6 +206,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
return retval;
}
+ groups_sort(group_info);
retval = set_current_groups(group_info);
put_group_info(group_info);
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 89e355866450..6fc87ccda1d7 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -103,16 +103,6 @@ config GENERIC_IRQ_MATRIX_ALLOCATOR
config GENERIC_IRQ_RESERVATION_MODE
bool
-config IRQ_DOMAIN_DEBUG
- bool "Expose hardware/virtual IRQ mapping via debugfs"
- depends on IRQ_DOMAIN && DEBUG_FS
- help
- This option will show the mapping relationship between hardware irq
- numbers and Linux irq numbers. The mapping is exposed via debugfs
- in the file "irq_domain_mapping".
-
- If you don't know what this means you don't need it.
-
# Support forced irq threading
config IRQ_FORCED_THREADING
bool
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index e12d35108225..a37a3b4b6342 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
}
}
-static cpumask_var_t *alloc_node_to_present_cpumask(void)
+static cpumask_var_t *alloc_node_to_possible_cpumask(void)
{
cpumask_var_t *masks;
int node;
@@ -62,7 +62,7 @@ out_unwind:
return NULL;
}
-static void free_node_to_present_cpumask(cpumask_var_t *masks)
+static void free_node_to_possible_cpumask(cpumask_var_t *masks)
{
int node;
@@ -71,22 +71,22 @@ static void free_node_to_present_cpumask(cpumask_var_t *masks)
kfree(masks);
}
-static void build_node_to_present_cpumask(cpumask_var_t *masks)
+static void build_node_to_possible_cpumask(cpumask_var_t *masks)
{
int cpu;
- for_each_present_cpu(cpu)
+ for_each_possible_cpu(cpu)
cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
}
-static int get_nodes_in_cpumask(cpumask_var_t *node_to_present_cpumask,
+static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask,
const struct cpumask *mask, nodemask_t *nodemsk)
{
int n, nodes = 0;
/* Calculate the number of nodes in the supplied affinity mask */
for_each_node(n) {
- if (cpumask_intersects(mask, node_to_present_cpumask[n])) {
+ if (cpumask_intersects(mask, node_to_possible_cpumask[n])) {
node_set(n, *nodemsk);
nodes++;
}
@@ -109,7 +109,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
int last_affv = affv + affd->pre_vectors;
nodemask_t nodemsk = NODE_MASK_NONE;
struct cpumask *masks;
- cpumask_var_t nmsk, *node_to_present_cpumask;
+ cpumask_var_t nmsk, *node_to_possible_cpumask;
/*
* If there aren't any vectors left after applying the pre/post
@@ -125,8 +125,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
if (!masks)
goto out;
- node_to_present_cpumask = alloc_node_to_present_cpumask();
- if (!node_to_present_cpumask)
+ node_to_possible_cpumask = alloc_node_to_possible_cpumask();
+ if (!node_to_possible_cpumask)
goto out;
/* Fill out vectors at the beginning that don't need affinity */
@@ -135,8 +135,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
/* Stabilize the cpumasks */
get_online_cpus();
- build_node_to_present_cpumask(node_to_present_cpumask);
- nodes = get_nodes_in_cpumask(node_to_present_cpumask, cpu_present_mask,
+ build_node_to_possible_cpumask(node_to_possible_cpumask);
+ nodes = get_nodes_in_cpumask(node_to_possible_cpumask, cpu_possible_mask,
&nodemsk);
/*
@@ -146,7 +146,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
if (affv <= nodes) {
for_each_node_mask(n, nodemsk) {
cpumask_copy(masks + curvec,
- node_to_present_cpumask[n]);
+ node_to_possible_cpumask[n]);
if (++curvec == last_affv)
break;
}
@@ -160,7 +160,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes;
/* Get the cpus on this node which are in the mask */
- cpumask_and(nmsk, cpu_present_mask, node_to_present_cpumask[n]);
+ cpumask_and(nmsk, cpu_possible_mask, node_to_possible_cpumask[n]);
/* Calculate the number of cpus per vector */
ncpus = cpumask_weight(nmsk);
@@ -192,7 +192,7 @@ done:
/* Fill out vectors at the end that don't need affinity */
for (; curvec < nvecs; curvec++)
cpumask_copy(masks + curvec, irq_default_affinity);
- free_node_to_present_cpumask(node_to_present_cpumask);
+ free_node_to_possible_cpumask(node_to_possible_cpumask);
out:
free_cpumask_var(nmsk);
return masks;
@@ -214,7 +214,7 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity
return 0;
get_online_cpus();
- ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv;
+ ret = min_t(int, cpumask_weight(cpu_possible_mask), vecs) + resv;
put_online_cpus();
return ret;
}
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index 17f05ef8f575..e4d3819a91cc 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -12,6 +12,11 @@
static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
{
+ static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 5);
+
+ if (!__ratelimit(&ratelimit))
+ return;
+
printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
printk("->handle_irq(): %p, ", desc->handle_irq);
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 7f608ac39653..acfaaef8672a 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -113,6 +113,7 @@ static const struct irq_bit_descr irqdata_states[] = {
BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING),
BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
+ BIT_MASK_DESCR(IRQD_CAN_RESERVE),
BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index c26c5bb6b491..508c03dfef25 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -364,10 +364,11 @@ irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip);
/*
- * Separate lockdep class for interrupt chip which can nest irq_desc
- * lock.
+ * Separate lockdep classes for interrupt chip which can nest irq_desc
+ * lock and request mutex.
*/
static struct lock_class_key irq_nested_lock_class;
+static struct lock_class_key irq_nested_request_class;
/*
* irq_map_generic_chip - Map a generic chip for an irq domain
@@ -409,7 +410,8 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
set_bit(idx, &gc->installed);
if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK)
- irq_set_lockdep_class(virq, &irq_nested_lock_class);
+ irq_set_lockdep_class(virq, &irq_nested_lock_class,
+ &irq_nested_request_class);
if (chip->irq_calc_mask)
chip->irq_calc_mask(data);
@@ -479,7 +481,8 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
continue;
if (flags & IRQ_GC_INIT_NESTED_LOCK)
- irq_set_lockdep_class(i, &irq_nested_lock_class);
+ irq_set_lockdep_class(i, &irq_nested_lock_class,
+ &irq_nested_request_class);
if (!(flags & IRQ_GC_NO_MASK)) {
struct irq_data *d = irq_get_irq_data(i);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 07d08ca701ec..ab19371eab9b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -440,7 +440,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
#endif /* !CONFIG_GENERIC_PENDING_IRQ */
#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
-static inline int irq_domain_activate_irq(struct irq_data *data, bool early)
+static inline int irq_domain_activate_irq(struct irq_data *data, bool reserve)
{
irqd_set_activated(data);
return 0;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index f2edcf85780d..49b54e9979cc 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -862,6 +862,7 @@ int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity)
return 0;
}
+EXPORT_SYMBOL_GPL(irq_get_percpu_devid_partition);
void kstat_incr_irq_this_cpu(unsigned int irq)
{
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 4f4f60015e8a..e6a9c36470ee 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -897,124 +897,6 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
}
EXPORT_SYMBOL_GPL(irq_find_mapping);
-#ifdef CONFIG_IRQ_DOMAIN_DEBUG
-static void virq_debug_show_one(struct seq_file *m, struct irq_desc *desc)
-{
- struct irq_domain *domain;
- struct irq_data *data;
-
- domain = desc->irq_data.domain;
- data = &desc->irq_data;
-
- while (domain) {
- unsigned int irq = data->irq;
- unsigned long hwirq = data->hwirq;
- struct irq_chip *chip;
- bool direct;
-
- if (data == &desc->irq_data)
- seq_printf(m, "%5d ", irq);
- else
- seq_printf(m, "%5d+ ", irq);
- seq_printf(m, "0x%05lx ", hwirq);
-
- chip = irq_data_get_irq_chip(data);
- seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none");
-
- seq_printf(m, "0x%p ", irq_data_get_irq_chip_data(data));
-
- seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' ');
- direct = (irq == hwirq) && (irq < domain->revmap_direct_max_irq);
- seq_printf(m, "%6s%-8s ",
- (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX",
- direct ? "(DIRECT)" : "");
- seq_printf(m, "%s\n", domain->name);
-#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
- domain = domain->parent;
- data = data->parent_data;
-#else
- domain = NULL;
-#endif
- }
-}
-
-static int virq_debug_show(struct seq_file *m, void *private)
-{
- unsigned long flags;
- struct irq_desc *desc;
- struct irq_domain *domain;
- struct radix_tree_iter iter;
- void __rcu **slot;
- int i;
-
- seq_printf(m, " %-16s %-6s %-10s %-10s %s\n",
- "name", "mapped", "linear-max", "direct-max", "devtree-node");
- mutex_lock(&irq_domain_mutex);
- list_for_each_entry(domain, &irq_domain_list, link) {
- struct device_node *of_node;
- const char *name;
-
- int count = 0;
-
- of_node = irq_domain_get_of_node(domain);
- if (of_node)
- name = of_node_full_name(of_node);
- else if (is_fwnode_irqchip(domain->fwnode))
- name = container_of(domain->fwnode, struct irqchip_fwid,
- fwnode)->name;
- else
- name = "";
-
- radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0)
- count++;
- seq_printf(m, "%c%-16s %6u %10u %10u %s\n",
- domain == irq_default_domain ? '*' : ' ', domain->name,
- domain->revmap_size + count, domain->revmap_size,
- domain->revmap_direct_max_irq,
- name);
- }
- mutex_unlock(&irq_domain_mutex);
-
- seq_printf(m, "%-5s %-7s %-15s %-*s %6s %-14s %s\n", "irq", "hwirq",
- "chip name", (int)(2 * sizeof(void *) + 2), "chip data",
- "active", "type", "domain");
-
- for (i = 1; i < nr_irqs; i++) {
- desc = irq_to_desc(i);
- if (!desc)
- continue;
-
- raw_spin_lock_irqsave(&desc->lock, flags);
- virq_debug_show_one(m, desc);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- }
-
- return 0;
-}
-
-static int virq_debug_open(struct inode *inode, struct file *file)
-{
- return single_open(file, virq_debug_show, inode->i_private);
-}
-
-static const struct file_operations virq_debug_fops = {
- .open = virq_debug_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static int __init irq_debugfs_init(void)
-{
- if (debugfs_create_file("irq_domain_mapping", S_IRUGO, NULL,
- NULL, &virq_debug_fops) == NULL)
- return -ENOMEM;
-
- return 0;
-}
-__initcall(irq_debugfs_init);
-#endif /* CONFIG_IRQ_DOMAIN_DEBUG */
-
/**
* irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings
*
@@ -1693,7 +1575,7 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
}
}
-static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
+static int __irq_domain_activate_irq(struct irq_data *irqd, bool reserve)
{
int ret = 0;
@@ -1702,9 +1584,9 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
if (irqd->parent_data)
ret = __irq_domain_activate_irq(irqd->parent_data,
- early);
+ reserve);
if (!ret && domain->ops->activate) {
- ret = domain->ops->activate(domain, irqd, early);
+ ret = domain->ops->activate(domain, irqd, reserve);
/* Rollback in case of error */
if (ret && irqd->parent_data)
__irq_domain_deactivate_irq(irqd->parent_data);
@@ -1716,17 +1598,18 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
/**
* irq_domain_activate_irq - Call domain_ops->activate recursively to activate
* interrupt
- * @irq_data: outermost irq_data associated with interrupt
+ * @irq_data: Outermost irq_data associated with interrupt
+ * @reserve: If set only reserve an interrupt vector instead of assigning one
*
* This is the second step to call domain_ops->activate to program interrupt
* controllers, so the interrupt could actually get delivered.
*/
-int irq_domain_activate_irq(struct irq_data *irq_data, bool early)
+int irq_domain_activate_irq(struct irq_data *irq_data, bool reserve)
{
int ret = 0;
if (!irqd_is_activated(irq_data))
- ret = __irq_domain_activate_irq(irq_data, early);
+ ret = __irq_domain_activate_irq(irq_data, reserve);
if (!ret)
irqd_set_activated(irq_data);
return ret;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 2ff1c0c82fc9..0f922729bab9 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1246,7 +1246,18 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* set the trigger type must match. Also all must
* agree on ONESHOT.
*/
- unsigned int oldtype = irqd_get_trigger_type(&desc->irq_data);
+ unsigned int oldtype;
+
+ /*
+ * If nobody did set the configuration before, inherit
+ * the one provided by the requester.
+ */
+ if (irqd_trigger_type_was_set(&desc->irq_data)) {
+ oldtype = irqd_get_trigger_type(&desc->irq_data);
+ } else {
+ oldtype = new->flags & IRQF_TRIGGER_MASK;
+ irqd_set_trigger_type(&desc->irq_data, oldtype);
+ }
if (!((old->flags & new->flags) & IRQF_SHARED) ||
(oldtype != (new->flags & IRQF_TRIGGER_MASK)) ||
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index a3cbbc8191c5..5187dfe809ac 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -321,15 +321,23 @@ void irq_matrix_remove_reserved(struct irq_matrix *m)
int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
bool reserved, unsigned int *mapped_cpu)
{
- unsigned int cpu;
+ unsigned int cpu, best_cpu, maxavl = 0;
+ struct cpumap *cm;
+ unsigned int bit;
+ best_cpu = UINT_MAX;
for_each_cpu(cpu, msk) {
- struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
- unsigned int bit;
+ cm = per_cpu_ptr(m->maps, cpu);
- if (!cm->online)
+ if (!cm->online || cm->available <= maxavl)
continue;
+ best_cpu = cpu;
+ maxavl = cm->available;
+ }
+
+ if (maxavl) {
+ cm = per_cpu_ptr(m->maps, best_cpu);
bit = matrix_alloc_area(m, cm, 1, false);
if (bit < m->alloc_end) {
cm->allocated++;
@@ -338,8 +346,8 @@ int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
m->global_available--;
if (reserved)
m->global_reserved--;
- *mapped_cpu = cpu;
- trace_irq_matrix_alloc(bit, cpu, m, cm);
+ *mapped_cpu = best_cpu;
+ trace_irq_matrix_alloc(bit, best_cpu, m, cm);
return bit;
}
}
@@ -384,7 +392,9 @@ unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown)
{
struct cpumap *cm = this_cpu_ptr(m->maps);
- return m->global_available - cpudown ? cm->available : 0;
+ if (!cpudown)
+ return m->global_available;
+ return m->global_available - cm->available;
}
/**
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index edb987b2c58d..2f3c4f5382cc 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -339,6 +339,40 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
return ret;
}
+/*
+ * Carefully check whether the device can use reservation mode. If
+ * reservation mode is enabled then the early activation will assign a
+ * dummy vector to the device. If the PCI/MSI device does not support
+ * masking of the entry then this can result in spurious interrupts when
+ * the device driver is not absolutely careful. But even then a malfunction
+ * of the hardware could result in a spurious interrupt on the dummy vector
+ * and render the device unusable. If the entry can be masked then the core
+ * logic will prevent the spurious interrupt and reservation mode can be
+ * used. For now reservation mode is restricted to PCI/MSI.
+ */
+static bool msi_check_reservation_mode(struct irq_domain *domain,
+ struct msi_domain_info *info,
+ struct device *dev)
+{
+ struct msi_desc *desc;
+
+ if (domain->bus_token != DOMAIN_BUS_PCI_MSI)
+ return false;
+
+ if (!(info->flags & MSI_FLAG_MUST_REACTIVATE))
+ return false;
+
+ if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask)
+ return false;
+
+ /*
+ * Checking the first MSI descriptor is sufficient. MSIX supports
+ * masking and MSI does so when the maskbit is set.
+ */
+ desc = first_msi_entry(dev);
+ return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit;
+}
+
/**
* msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
* @domain: The domain to allocate from
@@ -353,9 +387,11 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
- msi_alloc_info_t arg;
+ struct irq_data *irq_data;
struct msi_desc *desc;
+ msi_alloc_info_t arg;
int i, ret, virq;
+ bool can_reserve;
ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
if (ret)
@@ -385,6 +421,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
if (ops->msi_finish)
ops->msi_finish(&arg, 0);
+ can_reserve = msi_check_reservation_mode(domain, info, dev);
+
for_each_msi_entry(desc, dev) {
virq = desc->irq;
if (desc->nvec_used == 1)
@@ -397,15 +435,25 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
* the MSI entries before the PCI layer enables MSI in the
* card. Otherwise the card latches a random msi message.
*/
- if (info->flags & MSI_FLAG_ACTIVATE_EARLY) {
- struct irq_data *irq_data;
+ if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY))
+ continue;
+ irq_data = irq_domain_get_irq_data(domain, desc->irq);
+ if (!can_reserve)
+ irqd_clr_can_reserve(irq_data);
+ ret = irq_domain_activate_irq(irq_data, can_reserve);
+ if (ret)
+ goto cleanup;
+ }
+
+ /*
+ * If these interrupts use reservation mode, clear the activated bit
+ * so request_irq() will assign the final vector.
+ */
+ if (can_reserve) {
+ for_each_msi_entry(desc, dev) {
irq_data = irq_domain_get_irq_data(domain, desc->irq);
- ret = irq_domain_activate_irq(irq_data, true);
- if (ret)
- goto cleanup;
- if (info->flags & MSI_FLAG_MUST_REACTIVATE)
- irqd_clr_activated(irq_data);
+ irqd_clr_activated(irq_data);
}
}
return 0;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 1215229d1c12..ef2a47e0eab6 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -20,7 +20,7 @@
static int irqfixup __read_mostly;
#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
-static void poll_spurious_irqs(unsigned long dummy);
+static void poll_spurious_irqs(struct timer_list *unused);
static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
static int irq_poll_cpu;
static atomic_t irq_poll_active;
@@ -143,7 +143,7 @@ out:
return ok;
}
-static void poll_spurious_irqs(unsigned long dummy)
+static void poll_spurious_irqs(struct timer_list *unused)
{
struct irq_desc *desc;
int i;
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 40e9d739c169..6b7cdf17ccf8 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -36,7 +36,7 @@ static bool irq_work_claim(struct irq_work *work)
*/
flags = work->flags & ~IRQ_WORK_PENDING;
for (;;) {
- nflags = flags | IRQ_WORK_FLAGS;
+ nflags = flags | IRQ_WORK_CLAIMED;
oflags = cmpxchg(&work->flags, flags, nflags);
if (oflags == flags)
break;
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 8ff4ca4665ff..b4517095db6a 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -79,7 +79,7 @@ int static_key_count(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_count);
-static void static_key_slow_inc_cpuslocked(struct static_key *key)
+void static_key_slow_inc_cpuslocked(struct static_key *key)
{
int v, v1;
@@ -180,7 +180,7 @@ void static_key_disable(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_disable);
-static void static_key_slow_dec_cpuslocked(struct static_key *key,
+static void __static_key_slow_dec_cpuslocked(struct static_key *key,
unsigned long rate_limit,
struct delayed_work *work)
{
@@ -211,7 +211,7 @@ static void __static_key_slow_dec(struct static_key *key,
struct delayed_work *work)
{
cpus_read_lock();
- static_key_slow_dec_cpuslocked(key, rate_limit, work);
+ __static_key_slow_dec_cpuslocked(key, rate_limit, work);
cpus_read_unlock();
}
@@ -229,6 +229,12 @@ void static_key_slow_dec(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_slow_dec);
+void static_key_slow_dec_cpuslocked(struct static_key *key)
+{
+ STATIC_KEY_CHECK_USE(key);
+ __static_key_slow_dec_cpuslocked(key, 0, NULL);
+}
+
void static_key_slow_dec_deferred(struct static_key_deferred *key)
{
STATIC_KEY_CHECK_USE(key);
@@ -769,7 +775,7 @@ static __init int jump_label_test(void)
return 0;
}
-late_initcall(jump_label_test);
+early_initcall(jump_label_test);
#endif /* STATIC_KEYS_SELFTEST */
#endif /* HAVE_JUMP_LABEL */
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 1e6ae66c6244..d5fa4116688a 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -24,6 +24,7 @@
#include <linux/ctype.h>
#include <linux/slab.h>
#include <linux/filter.h>
+#include <linux/ftrace.h>
#include <linux/compiler.h>
#include <asm/sections.h>
@@ -337,6 +338,10 @@ const char *kallsyms_lookup(unsigned long addr,
if (!ret)
ret = bpf_address_lookup(addr, symbolsize,
offset, modname, namebuf);
+
+ if (!ret)
+ ret = ftrace_mod_address_lookup(addr, symbolsize,
+ offset, modname, namebuf);
return ret;
}
@@ -474,6 +479,7 @@ EXPORT_SYMBOL(__print_symbol);
struct kallsym_iter {
loff_t pos;
loff_t pos_mod_end;
+ loff_t pos_ftrace_mod_end;
unsigned long value;
unsigned int nameoff; /* If iterating in core kernel symbols. */
char type;
@@ -497,11 +503,25 @@ static int get_ksymbol_mod(struct kallsym_iter *iter)
return 1;
}
+static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)
+{
+ int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end,
+ &iter->value, &iter->type,
+ iter->name, iter->module_name,
+ &iter->exported);
+ if (ret < 0) {
+ iter->pos_ftrace_mod_end = iter->pos;
+ return 0;
+ }
+
+ return 1;
+}
+
static int get_ksymbol_bpf(struct kallsym_iter *iter)
{
iter->module_name[0] = '\0';
iter->exported = 0;
- return bpf_get_kallsym(iter->pos - iter->pos_mod_end,
+ return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
&iter->value, &iter->type,
iter->name) < 0 ? 0 : 1;
}
@@ -526,20 +546,31 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
iter->name[0] = '\0';
iter->nameoff = get_symbol_offset(new_pos);
iter->pos = new_pos;
- if (new_pos == 0)
+ if (new_pos == 0) {
iter->pos_mod_end = 0;
+ iter->pos_ftrace_mod_end = 0;
+ }
}
static int update_iter_mod(struct kallsym_iter *iter, loff_t pos)
{
iter->pos = pos;
- if (iter->pos_mod_end > 0 &&
- iter->pos_mod_end < iter->pos)
+ if (iter->pos_ftrace_mod_end > 0 &&
+ iter->pos_ftrace_mod_end < iter->pos)
return get_ksymbol_bpf(iter);
- if (!get_ksymbol_mod(iter))
- return get_ksymbol_bpf(iter);
+ if (iter->pos_mod_end > 0 &&
+ iter->pos_mod_end < iter->pos) {
+ if (!get_ksymbol_ftrace_mod(iter))
+ return get_ksymbol_bpf(iter);
+ return 1;
+ }
+
+ if (!get_ksymbol_mod(iter)) {
+ if (!get_ksymbol_ftrace_mod(iter))
+ return get_ksymbol_bpf(iter);
+ }
return 1;
}
@@ -583,14 +614,14 @@ static void s_stop(struct seq_file *m, void *p)
static int s_show(struct seq_file *m, void *p)
{
- unsigned long value;
+ void *value;
struct kallsym_iter *iter = m->private;
/* Some debugging symbols have no name. Ignore them. */
if (!iter->name[0])
return 0;
- value = iter->show_value ? iter->value : 0;
+ value = iter->show_value ? (void *)iter->value : NULL;
if (iter->module_name[0]) {
char type;
@@ -601,10 +632,10 @@ static int s_show(struct seq_file *m, void *p)
*/
type = iter->exported ? toupper(iter->type) :
tolower(iter->type);
- seq_printf(m, KALLSYM_FMT " %c %s\t[%s]\n", value,
+ seq_printf(m, "%px %c %s\t[%s]\n", value,
type, iter->name, iter->module_name);
} else
- seq_printf(m, KALLSYM_FMT " %c %s\n", value,
+ seq_printf(m, "%px %c %s\n", value,
iter->type, iter->name);
return 0;
}
diff --git a/kernel/kcov.c b/kernel/kcov.c
index fc6af9e1308b..7594c033d98a 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -22,13 +22,21 @@
#include <linux/kcov.h>
#include <asm/setup.h>
+/* Number of 64-bit words written per one comparison: */
+#define KCOV_WORDS_PER_CMP 4
+
/*
* kcov descriptor (one per opened debugfs file).
* State transitions of the descriptor:
* - initial state after open()
* - then there must be a single ioctl(KCOV_INIT_TRACE) call
* - then, mmap() call (several calls are allowed but not useful)
- * - then, repeated enable/disable for a task (only one task a time allowed)
+ * - then, ioctl(KCOV_ENABLE, arg), where arg is
+ * KCOV_TRACE_PC - to trace only the PCs
+ * or
+ * KCOV_TRACE_CMP - to trace only the comparison operands
+ * - then, ioctl(KCOV_DISABLE) to disable the task.
+ * Enabling/disabling ioctls can be repeated (only one task a time allowed).
*/
struct kcov {
/*
@@ -48,51 +56,176 @@ struct kcov {
struct task_struct *t;
};
-/*
- * Entry point from instrumented code.
- * This is called once per basic-block/edge.
- */
-void notrace __sanitizer_cov_trace_pc(void)
+static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
{
- struct task_struct *t;
enum kcov_mode mode;
- t = current;
/*
* We are interested in code coverage as a function of a syscall inputs,
* so we ignore code executed in interrupts.
*/
- if (!t || !in_task())
- return;
+ if (!in_task())
+ return false;
mode = READ_ONCE(t->kcov_mode);
- if (mode == KCOV_MODE_TRACE) {
- unsigned long *area;
- unsigned long pos;
- unsigned long ip = _RET_IP_;
+ /*
+ * There is some code that runs in interrupts but for which
+ * in_interrupt() returns false (e.g. preempt_schedule_irq()).
+ * READ_ONCE()/barrier() effectively provides load-acquire wrt
+ * interrupts, there are paired barrier()/WRITE_ONCE() in
+ * kcov_ioctl_locked().
+ */
+ barrier();
+ return mode == needed_mode;
+}
+static unsigned long canonicalize_ip(unsigned long ip)
+{
#ifdef CONFIG_RANDOMIZE_BASE
- ip -= kaslr_offset();
+ ip -= kaslr_offset();
#endif
+ return ip;
+}
- /*
- * There is some code that runs in interrupts but for which
- * in_interrupt() returns false (e.g. preempt_schedule_irq()).
- * READ_ONCE()/barrier() effectively provides load-acquire wrt
- * interrupts, there are paired barrier()/WRITE_ONCE() in
- * kcov_ioctl_locked().
- */
- barrier();
- area = t->kcov_area;
- /* The first word is number of subsequent PCs. */
- pos = READ_ONCE(area[0]) + 1;
- if (likely(pos < t->kcov_size)) {
- area[pos] = ip;
- WRITE_ONCE(area[0], pos);
- }
+/*
+ * Entry point from instrumented code.
+ * This is called once per basic-block/edge.
+ */
+void notrace __sanitizer_cov_trace_pc(void)
+{
+ struct task_struct *t;
+ unsigned long *area;
+ unsigned long ip = canonicalize_ip(_RET_IP_);
+ unsigned long pos;
+
+ t = current;
+ if (!check_kcov_mode(KCOV_MODE_TRACE_PC, t))
+ return;
+
+ area = t->kcov_area;
+ /* The first 64-bit word is the number of subsequent PCs. */
+ pos = READ_ONCE(area[0]) + 1;
+ if (likely(pos < t->kcov_size)) {
+ area[pos] = ip;
+ WRITE_ONCE(area[0], pos);
}
}
EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+static void write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip)
+{
+ struct task_struct *t;
+ u64 *area;
+ u64 count, start_index, end_pos, max_pos;
+
+ t = current;
+ if (!check_kcov_mode(KCOV_MODE_TRACE_CMP, t))
+ return;
+
+ ip = canonicalize_ip(ip);
+
+ /*
+ * We write all comparison arguments and types as u64.
+ * The buffer was allocated for t->kcov_size unsigned longs.
+ */
+ area = (u64 *)t->kcov_area;
+ max_pos = t->kcov_size * sizeof(unsigned long);
+
+ count = READ_ONCE(area[0]);
+
+ /* Every record is KCOV_WORDS_PER_CMP 64-bit words. */
+ start_index = 1 + count * KCOV_WORDS_PER_CMP;
+ end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64);
+ if (likely(end_pos <= max_pos)) {
+ area[start_index] = type;
+ area[start_index + 1] = arg1;
+ area[start_index + 2] = arg2;
+ area[start_index + 3] = ip;
+ WRITE_ONCE(area[0], count + 1);
+ }
+}
+
+void notrace __sanitizer_cov_trace_cmp1(u8 arg1, u8 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(0), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp1);
+
+void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(1), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2);
+
+void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp4);
+
+void notrace __sanitizer_cov_trace_cmp8(u64 arg1, u64 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(3), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp8);
+
+void notrace __sanitizer_cov_trace_const_cmp1(u8 arg1, u8 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(0) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp1);
+
+void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(1) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2);
+
+void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp4);
+
+void notrace __sanitizer_cov_trace_const_cmp8(u64 arg1, u64 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(3) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp8);
+
+void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases)
+{
+ u64 i;
+ u64 count = cases[0];
+ u64 size = cases[1];
+ u64 type = KCOV_CMP_CONST;
+
+ switch (size) {
+ case 8:
+ type |= KCOV_CMP_SIZE(0);
+ break;
+ case 16:
+ type |= KCOV_CMP_SIZE(1);
+ break;
+ case 32:
+ type |= KCOV_CMP_SIZE(2);
+ break;
+ case 64:
+ type |= KCOV_CMP_SIZE(3);
+ break;
+ default:
+ return;
+ }
+ for (i = 0; i < count; i++)
+ write_comp_data(type, cases[i + 2], val, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_switch);
+#endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */
+
static void kcov_get(struct kcov *kcov)
{
atomic_inc(&kcov->refcount);
@@ -129,6 +262,7 @@ void kcov_task_exit(struct task_struct *t)
/* Just to not leave dangling references behind. */
kcov_task_init(t);
kcov->t = NULL;
+ kcov->mode = KCOV_MODE_INIT;
spin_unlock(&kcov->lock);
kcov_put(kcov);
}
@@ -147,7 +281,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
spin_lock(&kcov->lock);
size = kcov->size * sizeof(unsigned long);
- if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 ||
+ if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 ||
vma->vm_end - vma->vm_start != size) {
res = -EINVAL;
goto exit;
@@ -176,6 +310,7 @@ static int kcov_open(struct inode *inode, struct file *filep)
kcov = kzalloc(sizeof(*kcov), GFP_KERNEL);
if (!kcov)
return -ENOMEM;
+ kcov->mode = KCOV_MODE_DISABLED;
atomic_set(&kcov->refcount, 1);
spin_lock_init(&kcov->lock);
filep->private_data = kcov;
@@ -211,7 +346,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
if (size < 2 || size > INT_MAX / sizeof(unsigned long))
return -EINVAL;
kcov->size = size;
- kcov->mode = KCOV_MODE_TRACE;
+ kcov->mode = KCOV_MODE_INIT;
return 0;
case KCOV_ENABLE:
/*
@@ -221,17 +356,25 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
* at task exit or voluntary by KCOV_DISABLE. After that it can
* be enabled for another task.
*/
- unused = arg;
- if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED ||
- kcov->area == NULL)
+ if (kcov->mode != KCOV_MODE_INIT || !kcov->area)
return -EINVAL;
if (kcov->t != NULL)
return -EBUSY;
+ if (arg == KCOV_TRACE_PC)
+ kcov->mode = KCOV_MODE_TRACE_PC;
+ else if (arg == KCOV_TRACE_CMP)
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+ kcov->mode = KCOV_MODE_TRACE_CMP;
+#else
+ return -ENOTSUPP;
+#endif
+ else
+ return -EINVAL;
t = current;
/* Cache in task struct for performance. */
t->kcov_size = kcov->size;
t->kcov_area = kcov->area;
- /* See comment in __sanitizer_cov_trace_pc(). */
+ /* See comment in check_kcov_mode(). */
barrier();
WRITE_ONCE(t->kcov_mode, kcov->mode);
t->kcov = kcov;
@@ -249,6 +392,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
return -EINVAL;
kcov_task_init(t);
kcov->t = NULL;
+ kcov->mode = KCOV_MODE_INIT;
kcov_put(kcov);
return 0;
default:
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 8af313081b0d..cd50e99202b0 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -843,7 +843,7 @@ void __kthread_queue_delayed_work(struct kthread_worker *worker,
struct timer_list *timer = &dwork->timer;
struct kthread_work *work = &dwork->work;
- WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn);
+ WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn);
/*
* If @delay is 0, queue @dwork->work immediately. This is for
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index db933d063bfc..89b5f83f1969 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -47,9 +47,9 @@
#include <linux/stringify.h>
#include <linux/bitops.h>
#include <linux/gfp.h>
-#include <linux/kmemcheck.h>
#include <linux/random.h>
#include <linux/jhash.h>
+#include <linux/nmi.h>
#include <asm/sections.h>
@@ -58,10 +58,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/lock.h>
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-#include <linux/slab.h>
-#endif
-
#ifdef CONFIG_PROVE_LOCKING
int prove_locking = 1;
module_param(prove_locking, int, 0644);
@@ -76,19 +72,6 @@ module_param(lock_stat, int, 0644);
#define lock_stat 0
#endif
-#ifdef CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK
-static int crossrelease_fullstack = 1;
-#else
-static int crossrelease_fullstack;
-#endif
-static int __init allow_crossrelease_fullstack(char *str)
-{
- crossrelease_fullstack = 1;
- return 0;
-}
-
-early_param("crossrelease_fullstack", allow_crossrelease_fullstack);
-
/*
* lockdep_lock: protects the lockdep graph, the hashes and the
* class/list/hash allocators.
@@ -665,18 +648,12 @@ static int count_matching_names(struct lock_class *new_class)
return count + 1;
}
-/*
- * Register a lock's class in the hash-table, if the class is not present
- * yet. Otherwise we look it up. We cache the result in the lock object
- * itself, so actual lookup of the hash should be once per lock object.
- */
static inline struct lock_class *
-look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
+look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
{
struct lockdep_subclass_key *key;
struct hlist_head *hash_head;
struct lock_class *class;
- bool is_static = false;
if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
debug_locks_off();
@@ -689,24 +666,11 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
}
/*
- * Static locks do not have their class-keys yet - for them the key
- * is the lock object itself. If the lock is in the per cpu area,
- * the canonical address of the lock (per cpu offset removed) is
- * used.
+ * If it is not initialised then it has never been locked,
+ * so it won't be present in the hash table.
*/
- if (unlikely(!lock->key)) {
- unsigned long can_addr, addr = (unsigned long)lock;
-
- if (__is_kernel_percpu_address(addr, &can_addr))
- lock->key = (void *)can_addr;
- else if (__is_module_percpu_address(addr, &can_addr))
- lock->key = (void *)can_addr;
- else if (static_obj(lock))
- lock->key = (void *)lock;
- else
- return ERR_PTR(-EINVAL);
- is_static = true;
- }
+ if (unlikely(!lock->key))
+ return NULL;
/*
* NOTE: the class-key must be unique. For dynamic locks, a static
@@ -738,20 +702,36 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
}
}
- return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
+ return NULL;
}
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-static void cross_init(struct lockdep_map *lock, int cross);
-static int cross_lock(struct lockdep_map *lock);
-static int lock_acquire_crosslock(struct held_lock *hlock);
-static int lock_release_crosslock(struct lockdep_map *lock);
-#else
-static inline void cross_init(struct lockdep_map *lock, int cross) {}
-static inline int cross_lock(struct lockdep_map *lock) { return 0; }
-static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; }
-static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; }
-#endif
+/*
+ * Static locks do not have their class-keys yet - for them the key is
+ * the lock object itself. If the lock is in the per cpu area, the
+ * canonical address of the lock (per cpu offset removed) is used.
+ */
+static bool assign_lock_key(struct lockdep_map *lock)
+{
+ unsigned long can_addr, addr = (unsigned long)lock;
+
+ if (__is_kernel_percpu_address(addr, &can_addr))
+ lock->key = (void *)can_addr;
+ else if (__is_module_percpu_address(addr, &can_addr))
+ lock->key = (void *)can_addr;
+ else if (static_obj(lock))
+ lock->key = (void *)lock;
+ else {
+ /* Debug-check: all keys must be persistent! */
+ debug_locks_off();
+ pr_err("INFO: trying to register non-static key.\n");
+ pr_err("the code is fine but needs lockdep annotation.\n");
+ pr_err("turning off the locking correctness validator.\n");
+ dump_stack();
+ return false;
+ }
+
+ return true;
+}
/*
* Register a lock's class in the hash-table, if the class is not present
@@ -768,18 +748,13 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
DEBUG_LOCKS_WARN_ON(!irqs_disabled());
class = look_up_lock_class(lock, subclass);
- if (likely(!IS_ERR_OR_NULL(class)))
+ if (likely(class))
goto out_set_class_cache;
- /*
- * Debug-check: all keys must be persistent!
- */
- if (IS_ERR(class)) {
- debug_locks_off();
- printk("INFO: trying to register non-static key.\n");
- printk("the code is fine but needs lockdep annotation.\n");
- printk("turning off the locking correctness validator.\n");
- dump_stack();
+ if (!lock->key) {
+ if (!assign_lock_key(lock))
+ return NULL;
+ } else if (!static_obj(lock->key)) {
return NULL;
}
@@ -1152,41 +1127,22 @@ print_circular_lock_scenario(struct held_lock *src,
printk(KERN_CONT "\n\n");
}
- if (cross_lock(tgt->instance)) {
- printk(" Possible unsafe locking scenario by crosslock:\n\n");
- printk(" CPU0 CPU1\n");
- printk(" ---- ----\n");
- printk(" lock(");
- __print_lock_name(parent);
- printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(target);
- printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(source);
- printk(KERN_CONT ");\n");
- printk(" unlock(");
- __print_lock_name(target);
- printk(KERN_CONT ");\n");
- printk("\n *** DEADLOCK ***\n\n");
- } else {
- printk(" Possible unsafe locking scenario:\n\n");
- printk(" CPU0 CPU1\n");
- printk(" ---- ----\n");
- printk(" lock(");
- __print_lock_name(target);
- printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(parent);
- printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(target);
- printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(source);
- printk(KERN_CONT ");\n");
- printk("\n *** DEADLOCK ***\n\n");
- }
+ printk(" Possible unsafe locking scenario:\n\n");
+ printk(" CPU0 CPU1\n");
+ printk(" ---- ----\n");
+ printk(" lock(");
+ __print_lock_name(target);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(parent);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(target);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(source);
+ printk(KERN_CONT ");\n");
+ printk("\n *** DEADLOCK ***\n\n");
}
/*
@@ -1212,10 +1168,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
curr->comm, task_pid_nr(curr));
print_lock(check_src);
- if (cross_lock(check_tgt->instance))
- pr_warn("\nbut now in release context of a crosslock acquired at the following:\n");
- else
- pr_warn("\nbut task is already holding lock:\n");
+ pr_warn("\nbut task is already holding lock:\n");
print_lock(check_tgt);
pr_warn("\nwhich lock already depends on the new lock.\n\n");
@@ -1245,9 +1198,7 @@ static noinline int print_circular_bug(struct lock_list *this,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
- if (cross_lock(check_tgt->instance))
- this->trace = *trace;
- else if (!save_trace(&this->trace))
+ if (!save_trace(&this->trace))
return 0;
depth = get_lock_depth(target);
@@ -1851,9 +1802,6 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
if (nest)
return 2;
- if (cross_lock(prev->instance))
- continue;
-
return print_deadlock_bug(curr, prev, next);
}
return 1;
@@ -2019,31 +1967,26 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
for (;;) {
int distance = curr->lockdep_depth - depth + 1;
hlock = curr->held_locks + depth - 1;
+
/*
- * Only non-crosslock entries get new dependencies added.
- * Crosslock entries will be added by commit later:
+ * Only non-recursive-read entries get new dependencies
+ * added:
*/
- if (!cross_lock(hlock->instance)) {
+ if (hlock->read != 2 && hlock->check) {
+ int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace);
+ if (!ret)
+ return 0;
+
/*
- * Only non-recursive-read entries get new dependencies
- * added:
+ * Stop after the first non-trylock entry,
+ * as non-trylock entries have added their
+ * own direct dependencies already, so this
+ * lock is connected to them indirectly:
*/
- if (hlock->read != 2 && hlock->check) {
- int ret = check_prev_add(curr, hlock, next,
- distance, &trace, save_trace);
- if (!ret)
- return 0;
-
- /*
- * Stop after the first non-trylock entry,
- * as non-trylock entries have added their
- * own direct dependencies already, so this
- * lock is connected to them indirectly:
- */
- if (!hlock->trylock)
- break;
- }
+ if (!hlock->trylock)
+ break;
}
+
depth--;
/*
* End of lock-stack?
@@ -3238,8 +3181,6 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
{
int i;
- kmemcheck_mark_initialized(lock, sizeof(*lock));
-
for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
lock->class_cache[i] = NULL;
@@ -3295,21 +3236,10 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
void lockdep_init_map(struct lockdep_map *lock, const char *name,
struct lock_class_key *key, int subclass)
{
- cross_init(lock, 0);
__lockdep_init_map(lock, name, key, subclass);
}
EXPORT_SYMBOL_GPL(lockdep_init_map);
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name,
- struct lock_class_key *key, int subclass)
-{
- cross_init(lock, 1);
- __lockdep_init_map(lock, name, key, subclass);
-}
-EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock);
-#endif
-
struct lock_class_key __lockdep_no_validate__;
EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
@@ -3347,7 +3277,7 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
return 0;
}
-static int __lock_is_held(struct lockdep_map *lock, int read);
+static int __lock_is_held(const struct lockdep_map *lock, int read);
/*
* This gets called for every mutex_lock*()/spin_lock*() operation.
@@ -3365,7 +3295,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int chain_head = 0;
int class_idx;
u64 chain_key;
- int ret;
if (unlikely(!debug_locks))
return 0;
@@ -3414,8 +3343,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
class_idx = class - lock_classes + 1;
- /* TODO: nest_lock is not implemented for crosslock yet. */
- if (depth && !cross_lock(lock)) {
+ if (depth) {
hlock = curr->held_locks + depth - 1;
if (hlock->class_idx == class_idx && nest_lock) {
if (hlock->references) {
@@ -3503,14 +3431,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
return 0;
- ret = lock_acquire_crosslock(hlock);
- /*
- * 2 means normal acquire operations are needed. Otherwise, it's
- * ok just to return with '0:fail, 1:success'.
- */
- if (ret != 2)
- return ret;
-
curr->curr_chain_key = chain_key;
curr->lockdep_depth++;
check_chain_key(curr);
@@ -3566,13 +3486,14 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
return 0;
}
-static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
+static int match_held_lock(const struct held_lock *hlock,
+ const struct lockdep_map *lock)
{
if (hlock->instance == lock)
return 1;
if (hlock->references) {
- struct lock_class *class = lock->class_cache[0];
+ const struct lock_class *class = lock->class_cache[0];
if (!class)
class = look_up_lock_class(lock, 0);
@@ -3583,7 +3504,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
* Clearly if the lock hasn't been acquired _ever_, we're not
* holding it either, so report failure.
*/
- if (IS_ERR_OR_NULL(class))
+ if (!class)
return 0;
/*
@@ -3748,19 +3669,11 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
struct task_struct *curr = current;
struct held_lock *hlock;
unsigned int depth;
- int ret, i;
+ int i;
if (unlikely(!debug_locks))
return 0;
- ret = lock_release_crosslock(lock);
- /*
- * 2 means normal release operations are needed. Otherwise, it's
- * ok just to return with '0:fail, 1:success'.
- */
- if (ret != 2)
- return ret;
-
depth = curr->lockdep_depth;
/*
* So we're all set to release this lock.. wait what lock? We don't
@@ -3816,7 +3729,7 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
return 1;
}
-static int __lock_is_held(struct lockdep_map *lock, int read)
+static int __lock_is_held(const struct lockdep_map *lock, int read)
{
struct task_struct *curr = current;
int i;
@@ -4030,7 +3943,7 @@ void lock_release(struct lockdep_map *lock, int nested,
}
EXPORT_SYMBOL_GPL(lock_release);
-int lock_is_held_type(struct lockdep_map *lock, int read)
+int lock_is_held_type(const struct lockdep_map *lock, int read)
{
unsigned long flags;
int ret = 0;
@@ -4387,7 +4300,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
* If the class exists we look it up and zap it:
*/
class = look_up_lock_class(lock, j);
- if (!IS_ERR_OR_NULL(class))
+ if (class)
zap_class(class);
}
/*
@@ -4583,6 +4496,7 @@ retry:
if (!unlock)
if (read_trylock(&tasklist_lock))
unlock = 1;
+ touch_nmi_watchdog();
} while_each_thread(g, p);
pr_warn("\n");
@@ -4678,494 +4592,3 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
dump_stack();
}
EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
-
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-
-/*
- * Crossrelease works by recording a lock history for each thread and
- * connecting those historic locks that were taken after the
- * wait_for_completion() in the complete() context.
- *
- * Task-A Task-B
- *
- * mutex_lock(&A);
- * mutex_unlock(&A);
- *
- * wait_for_completion(&C);
- * lock_acquire_crosslock();
- * atomic_inc_return(&cross_gen_id);
- * |
- * | mutex_lock(&B);
- * | mutex_unlock(&B);
- * |
- * | complete(&C);
- * `-- lock_commit_crosslock();
- *
- * Which will then add a dependency between B and C.
- */
-
-#define xhlock(i) (current->xhlocks[(i) % MAX_XHLOCKS_NR])
-
-/*
- * Whenever a crosslock is held, cross_gen_id will be increased.
- */
-static atomic_t cross_gen_id; /* Can be wrapped */
-
-/*
- * Make an entry of the ring buffer invalid.
- */
-static inline void invalidate_xhlock(struct hist_lock *xhlock)
-{
- /*
- * Normally, xhlock->hlock.instance must be !NULL.
- */
- xhlock->hlock.instance = NULL;
-}
-
-/*
- * Lock history stacks; we have 2 nested lock history stacks:
- *
- * HARD(IRQ)
- * SOFT(IRQ)
- *
- * The thing is that once we complete a HARD/SOFT IRQ the future task locks
- * should not depend on any of the locks observed while running the IRQ. So
- * what we do is rewind the history buffer and erase all our knowledge of that
- * temporal event.
- */
-
-void crossrelease_hist_start(enum xhlock_context_t c)
-{
- struct task_struct *cur = current;
-
- if (!cur->xhlocks)
- return;
-
- cur->xhlock_idx_hist[c] = cur->xhlock_idx;
- cur->hist_id_save[c] = cur->hist_id;
-}
-
-void crossrelease_hist_end(enum xhlock_context_t c)
-{
- struct task_struct *cur = current;
-
- if (cur->xhlocks) {
- unsigned int idx = cur->xhlock_idx_hist[c];
- struct hist_lock *h = &xhlock(idx);
-
- cur->xhlock_idx = idx;
-
- /* Check if the ring was overwritten. */
- if (h->hist_id != cur->hist_id_save[c])
- invalidate_xhlock(h);
- }
-}
-
-/*
- * lockdep_invariant_state() is used to annotate independence inside a task, to
- * make one task look like multiple independent 'tasks'.
- *
- * Take for instance workqueues; each work is independent of the last. The
- * completion of a future work does not depend on the completion of a past work
- * (in general). Therefore we must not carry that (lock) dependency across
- * works.
- *
- * This is true for many things; pretty much all kthreads fall into this
- * pattern, where they have an invariant state and future completions do not
- * depend on past completions. Its just that since they all have the 'same'
- * form -- the kthread does the same over and over -- it doesn't typically
- * matter.
- *
- * The same is true for system-calls, once a system call is completed (we've
- * returned to userspace) the next system call does not depend on the lock
- * history of the previous system call.
- *
- * They key property for independence, this invariant state, is that it must be
- * a point where we hold no locks and have no history. Because if we were to
- * hold locks, the restore at _end() would not necessarily recover it's history
- * entry. Similarly, independence per-definition means it does not depend on
- * prior state.
- */
-void lockdep_invariant_state(bool force)
-{
- /*
- * We call this at an invariant point, no current state, no history.
- * Verify the former, enforce the latter.
- */
- WARN_ON_ONCE(!force && current->lockdep_depth);
- invalidate_xhlock(&xhlock(current->xhlock_idx));
-}
-
-static int cross_lock(struct lockdep_map *lock)
-{
- return lock ? lock->cross : 0;
-}
-
-/*
- * This is needed to decide the relationship between wrapable variables.
- */
-static inline int before(unsigned int a, unsigned int b)
-{
- return (int)(a - b) < 0;
-}
-
-static inline struct lock_class *xhlock_class(struct hist_lock *xhlock)
-{
- return hlock_class(&xhlock->hlock);
-}
-
-static inline struct lock_class *xlock_class(struct cross_lock *xlock)
-{
- return hlock_class(&xlock->hlock);
-}
-
-/*
- * Should we check a dependency with previous one?
- */
-static inline int depend_before(struct held_lock *hlock)
-{
- return hlock->read != 2 && hlock->check && !hlock->trylock;
-}
-
-/*
- * Should we check a dependency with next one?
- */
-static inline int depend_after(struct held_lock *hlock)
-{
- return hlock->read != 2 && hlock->check;
-}
-
-/*
- * Check if the xhlock is valid, which would be false if,
- *
- * 1. Has not used after initializaion yet.
- * 2. Got invalidated.
- *
- * Remind hist_lock is implemented as a ring buffer.
- */
-static inline int xhlock_valid(struct hist_lock *xhlock)
-{
- /*
- * xhlock->hlock.instance must be !NULL.
- */
- return !!xhlock->hlock.instance;
-}
-
-/*
- * Record a hist_lock entry.
- *
- * Irq disable is only required.
- */
-static void add_xhlock(struct held_lock *hlock)
-{
- unsigned int idx = ++current->xhlock_idx;
- struct hist_lock *xhlock = &xhlock(idx);
-
-#ifdef CONFIG_DEBUG_LOCKDEP
- /*
- * This can be done locklessly because they are all task-local
- * state, we must however ensure IRQs are disabled.
- */
- WARN_ON_ONCE(!irqs_disabled());
-#endif
-
- /* Initialize hist_lock's members */
- xhlock->hlock = *hlock;
- xhlock->hist_id = ++current->hist_id;
-
- xhlock->trace.nr_entries = 0;
- xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES;
- xhlock->trace.entries = xhlock->trace_entries;
-
- if (crossrelease_fullstack) {
- xhlock->trace.skip = 3;
- save_stack_trace(&xhlock->trace);
- } else {
- xhlock->trace.nr_entries = 1;
- xhlock->trace.entries[0] = hlock->acquire_ip;
- }
-}
-
-static inline int same_context_xhlock(struct hist_lock *xhlock)
-{
- return xhlock->hlock.irq_context == task_irq_context(current);
-}
-
-/*
- * This should be lockless as far as possible because this would be
- * called very frequently.
- */
-static void check_add_xhlock(struct held_lock *hlock)
-{
- /*
- * Record a hist_lock, only in case that acquisitions ahead
- * could depend on the held_lock. For example, if the held_lock
- * is trylock then acquisitions ahead never depends on that.
- * In that case, we don't need to record it. Just return.
- */
- if (!current->xhlocks || !depend_before(hlock))
- return;
-
- add_xhlock(hlock);
-}
-
-/*
- * For crosslock.
- */
-static int add_xlock(struct held_lock *hlock)
-{
- struct cross_lock *xlock;
- unsigned int gen_id;
-
- if (!graph_lock())
- return 0;
-
- xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock;
-
- /*
- * When acquisitions for a crosslock are overlapped, we use
- * nr_acquire to perform commit for them, based on cross_gen_id
- * of the first acquisition, which allows to add additional
- * dependencies.
- *
- * Moreover, when no acquisition of a crosslock is in progress,
- * we should not perform commit because the lock might not exist
- * any more, which might cause incorrect memory access. So we
- * have to track the number of acquisitions of a crosslock.
- *
- * depend_after() is necessary to initialize only the first
- * valid xlock so that the xlock can be used on its commit.
- */
- if (xlock->nr_acquire++ && depend_after(&xlock->hlock))
- goto unlock;
-
- gen_id = (unsigned int)atomic_inc_return(&cross_gen_id);
- xlock->hlock = *hlock;
- xlock->hlock.gen_id = gen_id;
-unlock:
- graph_unlock();
- return 1;
-}
-
-/*
- * Called for both normal and crosslock acquires. Normal locks will be
- * pushed on the hist_lock queue. Cross locks will record state and
- * stop regular lock_acquire() to avoid being placed on the held_lock
- * stack.
- *
- * Return: 0 - failure;
- * 1 - crosslock, done;
- * 2 - normal lock, continue to held_lock[] ops.
- */
-static int lock_acquire_crosslock(struct held_lock *hlock)
-{
- /*
- * CONTEXT 1 CONTEXT 2
- * --------- ---------
- * lock A (cross)
- * X = atomic_inc_return(&cross_gen_id)
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- * Y = atomic_read_acquire(&cross_gen_id)
- * lock B
- *
- * atomic_read_acquire() is for ordering between A and B,
- * IOW, A happens before B, when CONTEXT 2 see Y >= X.
- *
- * Pairs with atomic_inc_return() in add_xlock().
- */
- hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id);
-
- if (cross_lock(hlock->instance))
- return add_xlock(hlock);
-
- check_add_xhlock(hlock);
- return 2;
-}
-
-static int copy_trace(struct stack_trace *trace)
-{
- unsigned long *buf = stack_trace + nr_stack_trace_entries;
- unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
- unsigned int nr = min(max_nr, trace->nr_entries);
-
- trace->nr_entries = nr;
- memcpy(buf, trace->entries, nr * sizeof(trace->entries[0]));
- trace->entries = buf;
- nr_stack_trace_entries += nr;
-
- if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
- if (!debug_locks_off_graph_unlock())
- return 0;
-
- print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
- dump_stack();
-
- return 0;
- }
-
- return 1;
-}
-
-static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock)
-{
- unsigned int xid, pid;
- u64 chain_key;
-
- xid = xlock_class(xlock) - lock_classes;
- chain_key = iterate_chain_key((u64)0, xid);
- pid = xhlock_class(xhlock) - lock_classes;
- chain_key = iterate_chain_key(chain_key, pid);
-
- if (lookup_chain_cache(chain_key))
- return 1;
-
- if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context,
- chain_key))
- return 0;
-
- if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1,
- &xhlock->trace, copy_trace))
- return 0;
-
- return 1;
-}
-
-static void commit_xhlocks(struct cross_lock *xlock)
-{
- unsigned int cur = current->xhlock_idx;
- unsigned int prev_hist_id = xhlock(cur).hist_id;
- unsigned int i;
-
- if (!graph_lock())
- return;
-
- if (xlock->nr_acquire) {
- for (i = 0; i < MAX_XHLOCKS_NR; i++) {
- struct hist_lock *xhlock = &xhlock(cur - i);
-
- if (!xhlock_valid(xhlock))
- break;
-
- if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id))
- break;
-
- if (!same_context_xhlock(xhlock))
- break;
-
- /*
- * Filter out the cases where the ring buffer was
- * overwritten and the current entry has a bigger
- * hist_id than the previous one, which is impossible
- * otherwise:
- */
- if (unlikely(before(prev_hist_id, xhlock->hist_id)))
- break;
-
- prev_hist_id = xhlock->hist_id;
-
- /*
- * commit_xhlock() returns 0 with graph_lock already
- * released if fail.
- */
- if (!commit_xhlock(xlock, xhlock))
- return;
- }
- }
-
- graph_unlock();
-}
-
-void lock_commit_crosslock(struct lockdep_map *lock)
-{
- struct cross_lock *xlock;
- unsigned long flags;
-
- if (unlikely(!debug_locks || current->lockdep_recursion))
- return;
-
- if (!current->xhlocks)
- return;
-
- /*
- * Do commit hist_locks with the cross_lock, only in case that
- * the cross_lock could depend on acquisitions after that.
- *
- * For example, if the cross_lock does not have the 'check' flag
- * then we don't need to check dependencies and commit for that.
- * Just skip it. In that case, of course, the cross_lock does
- * not depend on acquisitions ahead, either.
- *
- * WARNING: Don't do that in add_xlock() in advance. When an
- * acquisition context is different from the commit context,
- * invalid(skipped) cross_lock might be accessed.
- */
- if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock))
- return;
-
- raw_local_irq_save(flags);
- check_flags(flags);
- current->lockdep_recursion = 1;
- xlock = &((struct lockdep_map_cross *)lock)->xlock;
- commit_xhlocks(xlock);
- current->lockdep_recursion = 0;
- raw_local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(lock_commit_crosslock);
-
-/*
- * Return: 0 - failure;
- * 1 - crosslock, done;
- * 2 - normal lock, continue to held_lock[] ops.
- */
-static int lock_release_crosslock(struct lockdep_map *lock)
-{
- if (cross_lock(lock)) {
- if (!graph_lock())
- return 0;
- ((struct lockdep_map_cross *)lock)->xlock.nr_acquire--;
- graph_unlock();
- return 1;
- }
- return 2;
-}
-
-static void cross_init(struct lockdep_map *lock, int cross)
-{
- if (cross)
- ((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0;
-
- lock->cross = cross;
-
- /*
- * Crossrelease assumes that the ring buffer size of xhlocks
- * is aligned with power of 2. So force it on build.
- */
- BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1));
-}
-
-void lockdep_init_task(struct task_struct *task)
-{
- int i;
-
- task->xhlock_idx = UINT_MAX;
- task->hist_id = 0;
-
- for (i = 0; i < XHLOCK_CTX_NR; i++) {
- task->xhlock_idx_hist[i] = UINT_MAX;
- task->hist_id_save[i] = 0;
- }
-
- task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR,
- GFP_KERNEL);
-}
-
-void lockdep_free_task(struct task_struct *task)
-{
- if (task->xhlocks) {
- void *tmp = task->xhlocks;
- /* Diable crossrelease for current */
- task->xhlocks = NULL;
- kfree(tmp);
- }
-}
-#endif
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index f24582d4dad3..6850ffd69125 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -77,10 +77,6 @@ struct lock_stress_stats {
long n_lock_acquired;
};
-int torture_runnable = IS_ENABLED(MODULE);
-module_param(torture_runnable, int, 0444);
-MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init");
-
/* Forward reference. */
static void lock_torture_cleanup(void);
@@ -130,10 +126,8 @@ static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
if (!(torture_random(trsp) %
(cxt.nrealwriters_stress * 2000 * longdelay_ms)))
mdelay(longdelay_ms);
-#ifdef CONFIG_PREEMPT
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
- preempt_schedule(); /* Allow test to be preempted. */
-#endif
+ torture_preempt_schedule(); /* Allow test to be preempted. */
}
static void torture_lock_busted_write_unlock(void)
@@ -179,10 +173,8 @@ static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
if (!(torture_random(trsp) %
(cxt.nrealwriters_stress * 2 * shortdelay_us)))
udelay(shortdelay_us);
-#ifdef CONFIG_PREEMPT
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
- preempt_schedule(); /* Allow test to be preempted. */
-#endif
+ torture_preempt_schedule(); /* Allow test to be preempted. */
}
static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock)
@@ -352,10 +344,8 @@ static void torture_mutex_delay(struct torture_random_state *trsp)
mdelay(longdelay_ms * 5);
else
mdelay(longdelay_ms / 5);
-#ifdef CONFIG_PREEMPT
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
- preempt_schedule(); /* Allow test to be preempted. */
-#endif
+ torture_preempt_schedule(); /* Allow test to be preempted. */
}
static void torture_mutex_unlock(void) __releases(torture_mutex)
@@ -507,10 +497,8 @@ static void torture_rtmutex_delay(struct torture_random_state *trsp)
if (!(torture_random(trsp) %
(cxt.nrealwriters_stress * 2 * shortdelay_us)))
udelay(shortdelay_us);
-#ifdef CONFIG_PREEMPT
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
- preempt_schedule(); /* Allow test to be preempted. */
-#endif
+ torture_preempt_schedule(); /* Allow test to be preempted. */
}
static void torture_rtmutex_unlock(void) __releases(torture_rtmutex)
@@ -547,10 +535,8 @@ static void torture_rwsem_write_delay(struct torture_random_state *trsp)
mdelay(longdelay_ms * 10);
else
mdelay(longdelay_ms / 10);
-#ifdef CONFIG_PREEMPT
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
- preempt_schedule(); /* Allow test to be preempted. */
-#endif
+ torture_preempt_schedule(); /* Allow test to be preempted. */
}
static void torture_rwsem_up_write(void) __releases(torture_rwsem)
@@ -570,14 +556,12 @@ static void torture_rwsem_read_delay(struct torture_random_state *trsp)
/* We want a long delay occasionally to force massive contention. */
if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
mdelay(longdelay_ms * 2);
else
mdelay(longdelay_ms / 2);
-#ifdef CONFIG_PREEMPT
if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000)))
- preempt_schedule(); /* Allow test to be preempted. */
-#endif
+ torture_preempt_schedule(); /* Allow test to be preempted. */
}
static void torture_rwsem_up_read(void) __releases(torture_rwsem)
@@ -715,8 +699,7 @@ static void __torture_print_stats(char *page,
{
bool fail = 0;
int i, n_stress;
- long max = 0;
- long min = statp[0].n_lock_acquired;
+ long max = 0, min = statp ? statp[0].n_lock_acquired : 0;
long long sum = 0;
n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress;
@@ -823,7 +806,7 @@ static void lock_torture_cleanup(void)
* such, only perform the underlying torture-specific cleanups,
* and avoid anything related to locktorture.
*/
- if (!cxt.lwsa)
+ if (!cxt.lwsa && !cxt.lrsa)
goto end;
if (writer_tasks) {
@@ -879,7 +862,7 @@ static int __init lock_torture_init(void)
&percpu_rwsem_lock_ops,
};
- if (!torture_init_begin(torture_type, verbose, &torture_runnable))
+ if (!torture_init_begin(torture_type, verbose))
return -EBUSY;
/* Process args and tell the world that the torturer is on the job. */
@@ -898,6 +881,13 @@ static int __init lock_torture_init(void)
firsterr = -EINVAL;
goto unwind;
}
+
+ if (nwriters_stress == 0 && nreaders_stress == 0) {
+ pr_alert("lock-torture: must run at least one locking thread\n");
+ firsterr = -EINVAL;
+ goto unwind;
+ }
+
if (cxt.cur_ops->init)
cxt.cur_ops->init();
@@ -921,17 +911,19 @@ static int __init lock_torture_init(void)
#endif
/* Initialize the statistics so that each run gets its own numbers. */
+ if (nwriters_stress) {
+ lock_is_write_held = 0;
+ cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL);
+ if (cxt.lwsa == NULL) {
+ VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
- lock_is_write_held = 0;
- cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL);
- if (cxt.lwsa == NULL) {
- VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory");
- firsterr = -ENOMEM;
- goto unwind;
- }
- for (i = 0; i < cxt.nrealwriters_stress; i++) {
- cxt.lwsa[i].n_lock_fail = 0;
- cxt.lwsa[i].n_lock_acquired = 0;
+ for (i = 0; i < cxt.nrealwriters_stress; i++) {
+ cxt.lwsa[i].n_lock_fail = 0;
+ cxt.lwsa[i].n_lock_acquired = 0;
+ }
}
if (cxt.cur_ops->readlock) {
@@ -948,19 +940,21 @@ static int __init lock_torture_init(void)
cxt.nrealreaders_stress = cxt.nrealwriters_stress;
}
- lock_is_read_held = 0;
- cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL);
- if (cxt.lrsa == NULL) {
- VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory");
- firsterr = -ENOMEM;
- kfree(cxt.lwsa);
- cxt.lwsa = NULL;
- goto unwind;
- }
-
- for (i = 0; i < cxt.nrealreaders_stress; i++) {
- cxt.lrsa[i].n_lock_fail = 0;
- cxt.lrsa[i].n_lock_acquired = 0;
+ if (nreaders_stress) {
+ lock_is_read_held = 0;
+ cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL);
+ if (cxt.lrsa == NULL) {
+ VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory");
+ firsterr = -ENOMEM;
+ kfree(cxt.lwsa);
+ cxt.lwsa = NULL;
+ goto unwind;
+ }
+
+ for (i = 0; i < cxt.nrealreaders_stress; i++) {
+ cxt.lrsa[i].n_lock_fail = 0;
+ cxt.lrsa[i].n_lock_acquired = 0;
+ }
}
}
@@ -990,12 +984,14 @@ static int __init lock_torture_init(void)
goto unwind;
}
- writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]),
- GFP_KERNEL);
- if (writer_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
- firsterr = -ENOMEM;
- goto unwind;
+ if (nwriters_stress) {
+ writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]),
+ GFP_KERNEL);
+ if (writer_tasks == NULL) {
+ VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
}
if (cxt.cur_ops->readlock) {
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 294294c71ba4..38ece035039e 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -170,7 +170,7 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
* @tail : The new queue tail code word
* Return: The previous queue tail code word
*
- * xchg(lock, tail)
+ * xchg(lock, tail), which heads an address dependency
*
* p,*,* -> n,*,* ; prev = xchg(lock, node)
*/
@@ -409,13 +409,11 @@ queue:
if (old & _Q_TAIL_MASK) {
prev = decode_tail(old);
/*
- * The above xchg_tail() is also a load of @lock which generates,
- * through decode_tail(), a pointer.
- *
- * The address dependency matches the RELEASE of xchg_tail()
- * such that the access to @prev must happen after.
+ * The above xchg_tail() is also a load of @lock which
+ * generates, through decode_tail(), a pointer. The address
+ * dependency matches the RELEASE of xchg_tail() such that
+ * the subsequent access to @prev happens after.
*/
- smp_read_barrier_depends();
WRITE_ONCE(prev->next, node);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 6f3dba6e4e9e..65cc0cb984e6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1290,6 +1290,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
return ret;
}
+static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock)
+{
+ int ret = try_to_take_rt_mutex(lock, current, NULL);
+
+ /*
+ * try_to_take_rt_mutex() sets the lock waiters bit
+ * unconditionally. Clean this up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ return ret;
+}
+
/*
* Slow path try-lock function:
*/
@@ -1312,13 +1325,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
*/
raw_spin_lock_irqsave(&lock->wait_lock, flags);
- ret = try_to_take_rt_mutex(lock, current, NULL);
-
- /*
- * try_to_take_rt_mutex() sets the lock waiters bit
- * unconditionally. Clean this up.
- */
- fixup_rt_mutex_waiters(lock);
+ ret = __rt_mutex_slowtrylock(lock);
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
@@ -1505,6 +1512,11 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
return rt_mutex_slowtrylock(lock);
}
+int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
+{
+ return __rt_mutex_slowtrylock(lock);
+}
+
/**
* rt_mutex_timed_lock - lock a rt_mutex interruptible
* the timeout structure is provided
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 124e98ca0b17..68686b3ec3c1 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -148,6 +148,7 @@ extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter);
extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+extern int __rt_mutex_futex_trylock(struct rt_mutex *l);
extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 1fd1a7543cdd..936f3d14dd6b 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -66,12 +66,8 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
break; \
preempt_enable(); \
\
- if (!(lock)->break_lock) \
- (lock)->break_lock = 1; \
- while ((lock)->break_lock) \
- arch_##op##_relax(&lock->raw_lock); \
+ arch_##op##_relax(&lock->raw_lock); \
} \
- (lock)->break_lock = 0; \
} \
\
unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
@@ -86,12 +82,9 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
local_irq_restore(flags); \
preempt_enable(); \
\
- if (!(lock)->break_lock) \
- (lock)->break_lock = 1; \
- while ((lock)->break_lock) \
- arch_##op##_relax(&lock->raw_lock); \
+ arch_##op##_relax(&lock->raw_lock); \
} \
- (lock)->break_lock = 0; \
+ \
return flags; \
} \
\
diff --git a/kernel/module.c b/kernel/module.c
index 32c2cdaccd93..09e48eee4d55 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -847,10 +847,8 @@ static int add_module_usage(struct module *a, struct module *b)
pr_debug("Allocating new usage for %s.\n", a->name);
use = kmalloc(sizeof(*use), GFP_ATOMIC);
- if (!use) {
- pr_warn("%s: out of memory loading\n", a->name);
+ if (!use)
return -ENOMEM;
- }
use->source = a;
use->target = b;
@@ -2865,6 +2863,15 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
}
#endif /* CONFIG_LIVEPATCH */
+static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
+{
+ if (retpoline_module_ok(get_modinfo(info, "retpoline")))
+ return;
+
+ pr_warn("%s: loading module not compiled with retpoline compiler.\n",
+ mod->name);
+}
+
/* Sets info->hdr and info->len. */
static int copy_module_from_user(const void __user *umod, unsigned long len,
struct load_info *info)
@@ -3031,6 +3038,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
}
+ check_modinfo_retpoline(mod, info);
+
if (get_modinfo(info, "staging")) {
add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
pr_warn("%s: module is from the staging directory, the quality "
@@ -3483,6 +3492,8 @@ static noinline int do_init_module(struct module *mod)
if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC))
async_synchronize_full();
+ ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base +
+ mod->init_layout.size);
mutex_lock(&module_mutex);
/* Drop initial reference. */
module_put(mod);
@@ -4157,7 +4168,7 @@ static int m_show(struct seq_file *m, void *p)
{
struct module *mod = list_entry(p, struct module, list);
char buf[MODULE_FLAGS_BUF_SIZE];
- unsigned long value;
+ void *value;
/* We always ignore unformed modules. */
if (mod->state == MODULE_STATE_UNFORMED)
@@ -4173,8 +4184,8 @@ static int m_show(struct seq_file *m, void *p)
mod->state == MODULE_STATE_COMING ? "Loading" :
"Live");
/* Used by oprofile and other similar tools. */
- value = m->private ? 0 : (unsigned long)mod->core_layout.base;
- seq_printf(m, " 0x" KALLSYM_FMT, value);
+ value = m->private ? NULL : mod->core_layout.base;
+ seq_printf(m, " 0x%px", value);
/* Taints info */
if (mod->taints)
diff --git a/kernel/padata.c b/kernel/padata.c
index f262c9a4e70a..57c0074d50cc 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -288,9 +288,9 @@ static void invoke_padata_reorder(struct work_struct *work)
local_bh_enable();
}
-static void padata_reorder_timer(unsigned long arg)
+static void padata_reorder_timer(struct timer_list *t)
{
- struct parallel_data *pd = (struct parallel_data *)arg;
+ struct parallel_data *pd = from_timer(pd, t, timer);
unsigned int weight;
int target_cpu, cpu;
@@ -485,7 +485,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
padata_init_pqueues(pd);
padata_init_squeues(pd);
- setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
+ timer_setup(&pd->timer, padata_reorder_timer, 0);
atomic_set(&pd->seq_nr, -1);
atomic_set(&pd->reorder_objects, 0);
atomic_set(&pd->refcnt, 0);
diff --git a/kernel/panic.c b/kernel/panic.c
index bdd18afa19a4..2cfef408fec9 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -27,6 +27,8 @@
#include <linux/console.h>
#include <linux/bug.h>
#include <linux/ratelimit.h>
+#include <linux/debugfs.h>
+#include <asm/sections.h>
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
@@ -322,6 +324,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
{ 'E', ' ', true }, /* TAINT_UNSIGNED_MODULE */
{ 'L', ' ', false }, /* TAINT_SOFTLOCKUP */
{ 'K', ' ', true }, /* TAINT_LIVEPATCH */
+ { 'X', ' ', true }, /* TAINT_AUX */
};
/**
@@ -343,6 +346,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
* 'E' - Unsigned module has been loaded.
* 'L' - A soft lockup has previously occurred.
* 'K' - Kernel has been live patched.
+ * 'X' - Auxiliary taint, for distros' use.
*
* The string is overwritten by the next call to print_tainted().
*/
@@ -518,7 +522,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
{
disable_trace_on_warning();
- pr_warn("------------[ cut here ]------------\n");
+ if (args)
+ pr_warn(CUT_HERE);
if (file)
pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
@@ -582,9 +587,49 @@ EXPORT_SYMBOL(warn_slowpath_fmt_taint);
void warn_slowpath_null(const char *file, int line)
{
+ pr_warn(CUT_HERE);
__warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL);
}
EXPORT_SYMBOL(warn_slowpath_null);
+#else
+void __warn_printk(const char *fmt, ...)
+{
+ va_list args;
+
+ pr_warn(CUT_HERE);
+
+ va_start(args, fmt);
+ vprintk(fmt, args);
+ va_end(args);
+}
+EXPORT_SYMBOL(__warn_printk);
+#endif
+
+#ifdef CONFIG_BUG
+
+/* Support resetting WARN*_ONCE state */
+
+static int clear_warn_once_set(void *data, u64 val)
+{
+ generic_bug_clear_once();
+ memset(__start_once, 0, __end_once - __start_once);
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops,
+ NULL,
+ clear_warn_once_set,
+ "%lld\n");
+
+static __init int register_warn_debugfs(void)
+{
+ /* Don't care about failure */
+ debugfs_create_file("clear_warn_once", 0200, NULL,
+ NULL, &clear_warn_once_fops);
+ return 0;
+}
+
+device_initcall(register_warn_debugfs);
#endif
#ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/kernel/pid.c b/kernel/pid.c
index 020dedbdf066..5d30c87e3c42 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -39,12 +39,21 @@
#include <linux/proc_ns.h>
#include <linux/proc_fs.h>
#include <linux/sched/task.h>
-
-#define pid_hashfn(nr, ns) \
- hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
-static struct hlist_head *pid_hash;
-static unsigned int pidhash_shift = 4;
-struct pid init_struct_pid = INIT_STRUCT_PID;
+#include <linux/idr.h>
+
+struct pid init_struct_pid = {
+ .count = ATOMIC_INIT(1),
+ .tasks = {
+ { .first = NULL },
+ { .first = NULL },
+ { .first = NULL },
+ },
+ .level = 0,
+ .numbers = { {
+ .nr = 0,
+ .ns = &init_pid_ns,
+ }, }
+};
int pid_max = PID_MAX_DEFAULT;
@@ -53,15 +62,6 @@ int pid_max = PID_MAX_DEFAULT;
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
-static inline int mk_pid(struct pid_namespace *pid_ns,
- struct pidmap *map, int off)
-{
- return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
-}
-
-#define find_next_offset(map, off) \
- find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
-
/*
* PID-map pages start out as NULL, they get allocated upon
* first use and are never deallocated. This way a low pid_max
@@ -70,11 +70,8 @@ static inline int mk_pid(struct pid_namespace *pid_ns,
*/
struct pid_namespace init_pid_ns = {
.kref = KREF_INIT(2),
- .pidmap = {
- [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
- },
- .last_pid = 0,
- .nr_hashed = PIDNS_HASH_ADDING,
+ .idr = IDR_INIT,
+ .pid_allocated = PIDNS_ADDING,
.level = 0,
.child_reaper = &init_task,
.user_ns = &init_user_ns,
@@ -101,138 +98,6 @@ EXPORT_SYMBOL_GPL(init_pid_ns);
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
-static void free_pidmap(struct upid *upid)
-{
- int nr = upid->nr;
- struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
- int offset = nr & BITS_PER_PAGE_MASK;
-
- clear_bit(offset, map->page);
- atomic_inc(&map->nr_free);
-}
-
-/*
- * If we started walking pids at 'base', is 'a' seen before 'b'?
- */
-static int pid_before(int base, int a, int b)
-{
- /*
- * This is the same as saying
- *
- * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
- * and that mapping orders 'a' and 'b' with respect to 'base'.
- */
- return (unsigned)(a - base) < (unsigned)(b - base);
-}
-
-/*
- * We might be racing with someone else trying to set pid_ns->last_pid
- * at the pid allocation time (there's also a sysctl for this, but racing
- * with this one is OK, see comment in kernel/pid_namespace.c about it).
- * We want the winner to have the "later" value, because if the
- * "earlier" value prevails, then a pid may get reused immediately.
- *
- * Since pids rollover, it is not sufficient to just pick the bigger
- * value. We have to consider where we started counting from.
- *
- * 'base' is the value of pid_ns->last_pid that we observed when
- * we started looking for a pid.
- *
- * 'pid' is the pid that we eventually found.
- */
-static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
-{
- int prev;
- int last_write = base;
- do {
- prev = last_write;
- last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
- } while ((prev != last_write) && (pid_before(base, last_write, pid)));
-}
-
-static int alloc_pidmap(struct pid_namespace *pid_ns)
-{
- int i, offset, max_scan, pid, last = pid_ns->last_pid;
- struct pidmap *map;
-
- pid = last + 1;
- if (pid >= pid_max)
- pid = RESERVED_PIDS;
- offset = pid & BITS_PER_PAGE_MASK;
- map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
- /*
- * If last_pid points into the middle of the map->page we
- * want to scan this bitmap block twice, the second time
- * we start with offset == 0 (or RESERVED_PIDS).
- */
- max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
- for (i = 0; i <= max_scan; ++i) {
- if (unlikely(!map->page)) {
- void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- /*
- * Free the page if someone raced with us
- * installing it:
- */
- spin_lock_irq(&pidmap_lock);
- if (!map->page) {
- map->page = page;
- page = NULL;
- }
- spin_unlock_irq(&pidmap_lock);
- kfree(page);
- if (unlikely(!map->page))
- return -ENOMEM;
- }
- if (likely(atomic_read(&map->nr_free))) {
- for ( ; ; ) {
- if (!test_and_set_bit(offset, map->page)) {
- atomic_dec(&map->nr_free);
- set_last_pid(pid_ns, last, pid);
- return pid;
- }
- offset = find_next_offset(map, offset);
- if (offset >= BITS_PER_PAGE)
- break;
- pid = mk_pid(pid_ns, map, offset);
- if (pid >= pid_max)
- break;
- }
- }
- if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
- ++map;
- offset = 0;
- } else {
- map = &pid_ns->pidmap[0];
- offset = RESERVED_PIDS;
- if (unlikely(last == offset))
- break;
- }
- pid = mk_pid(pid_ns, map, offset);
- }
- return -EAGAIN;
-}
-
-int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
-{
- int offset;
- struct pidmap *map, *end;
-
- if (last >= PID_MAX_LIMIT)
- return -1;
-
- offset = (last + 1) & BITS_PER_PAGE_MASK;
- map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
- end = &pid_ns->pidmap[PIDMAP_ENTRIES];
- for (; map < end; map++, offset = 0) {
- if (unlikely(!map->page))
- continue;
- offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
- if (offset < BITS_PER_PAGE)
- return mk_pid(pid_ns, map, offset);
- }
- return -1;
-}
-
void put_pid(struct pid *pid)
{
struct pid_namespace *ns;
@@ -265,8 +130,7 @@ void free_pid(struct pid *pid)
for (i = 0; i <= pid->level; i++) {
struct upid *upid = pid->numbers + i;
struct pid_namespace *ns = upid->ns;
- hlist_del_rcu(&upid->pid_chain);
- switch(--ns->nr_hashed) {
+ switch (--ns->pid_allocated) {
case 2:
case 1:
/* When all that is left in the pid namespace
@@ -275,21 +139,20 @@ void free_pid(struct pid *pid)
*/
wake_up_process(ns->child_reaper);
break;
- case PIDNS_HASH_ADDING:
+ case PIDNS_ADDING:
/* Handle a fork failure of the first process */
WARN_ON(ns->child_reaper);
- ns->nr_hashed = 0;
+ ns->pid_allocated = 0;
/* fall through */
case 0:
schedule_work(&ns->proc_work);
break;
}
+
+ idr_remove(&ns->idr, upid->nr);
}
spin_unlock_irqrestore(&pidmap_lock, flags);
- for (i = 0; i <= pid->level; i++)
- free_pidmap(pid->numbers + i);
-
call_rcu(&pid->rcu, delayed_put_pid);
}
@@ -308,8 +171,29 @@ struct pid *alloc_pid(struct pid_namespace *ns)
tmp = ns;
pid->level = ns->level;
+
for (i = ns->level; i >= 0; i--) {
- nr = alloc_pidmap(tmp);
+ int pid_min = 1;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock_irq(&pidmap_lock);
+
+ /*
+ * init really needs pid 1, but after reaching the maximum
+ * wrap back to RESERVED_PIDS
+ */
+ if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
+ pid_min = RESERVED_PIDS;
+
+ /*
+ * Store a null pointer so find_pid_ns does not find
+ * a partially initialized PID (see below).
+ */
+ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
+ pid_max, GFP_ATOMIC);
+ spin_unlock_irq(&pidmap_lock);
+ idr_preload_end();
+
if (nr < 0) {
retval = nr;
goto out_free;
@@ -321,10 +205,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
}
if (unlikely(is_child_reaper(pid))) {
- if (pid_ns_prepare_proc(ns)) {
- disable_pid_allocation(ns);
+ if (pid_ns_prepare_proc(ns))
goto out_free;
- }
}
get_pid_ns(ns);
@@ -334,12 +216,12 @@ struct pid *alloc_pid(struct pid_namespace *ns)
upid = pid->numbers + ns->level;
spin_lock_irq(&pidmap_lock);
- if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
+ if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock;
for ( ; upid >= pid->numbers; --upid) {
- hlist_add_head_rcu(&upid->pid_chain,
- &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
- upid->ns->nr_hashed++;
+ /* Make the PID visible to find_pid_ns. */
+ idr_replace(&upid->ns->idr, pid, upid->nr);
+ upid->ns->pid_allocated++;
}
spin_unlock_irq(&pidmap_lock);
@@ -350,8 +232,15 @@ out_unlock:
put_pid_ns(ns);
out_free:
+ spin_lock_irq(&pidmap_lock);
while (++i <= ns->level)
- free_pidmap(pid->numbers + i);
+ idr_remove(&ns->idr, (pid->numbers + i)->nr);
+
+ /* On failure to allocate the first pid, reset the state */
+ if (ns->pid_allocated == PIDNS_ADDING)
+ idr_set_cursor(&ns->idr, 0);
+
+ spin_unlock_irq(&pidmap_lock);
kmem_cache_free(ns->pid_cachep, pid);
return ERR_PTR(retval);
@@ -360,21 +249,13 @@ out_free:
void disable_pid_allocation(struct pid_namespace *ns)
{
spin_lock_irq(&pidmap_lock);
- ns->nr_hashed &= ~PIDNS_HASH_ADDING;
+ ns->pid_allocated &= ~PIDNS_ADDING;
spin_unlock_irq(&pidmap_lock);
}
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
- struct upid *pnr;
-
- hlist_for_each_entry_rcu(pnr,
- &pid_hash[pid_hashfn(nr, ns)], pid_chain)
- if (pnr->nr == nr && pnr->ns == ns)
- return container_of(pnr, struct pid,
- numbers[ns->level]);
-
- return NULL;
+ return idr_find(&ns->idr, nr);
}
EXPORT_SYMBOL_GPL(find_pid_ns);
@@ -530,6 +411,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
if (type != PIDTYPE_PID) {
if (type == __PIDTYPE_TGID)
type = PIDTYPE_PID;
+
task = task->group_leader;
}
nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
@@ -553,35 +435,13 @@ EXPORT_SYMBOL_GPL(task_active_pid_ns);
*/
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
- struct pid *pid;
-
- do {
- pid = find_pid_ns(nr, ns);
- if (pid)
- break;
- nr = next_pidmap(ns, nr);
- } while (nr > 0);
-
- return pid;
-}
-
-/*
- * The pid hash table is scaled according to the amount of memory in the
- * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
- * more.
- */
-void __init pidhash_init(void)
-{
- pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
- HASH_EARLY | HASH_SMALL | HASH_ZERO,
- &pidhash_shift, NULL,
- 0, 4096);
+ return idr_get_next(&ns->idr, &nr);
}
-void __init pidmap_init(void)
+void __init pid_idr_init(void)
{
/* Verify no one has done anything silly: */
- BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
+ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
/* bump default and minimum pid_max based on number of cpus */
pid_max = min(pid_max_max, max_t(int, pid_max,
@@ -590,10 +450,7 @@ void __init pidmap_init(void)
PIDS_PER_CPU_MIN * num_possible_cpus());
pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
- init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- /* Reserve PID 0. We never call free_pidmap(0) */
- set_bit(0, init_pid_ns.pidmap[0].page);
- atomic_dec(&init_pid_ns.pidmap[0].nr_free);
+ idr_init(&init_pid_ns.idr);
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 4918314893bc..0b53eef7d34b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -21,6 +21,7 @@
#include <linux/export.h>
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
+#include <linux/idr.h>
struct pid_cache {
int nr_ids;
@@ -98,7 +99,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
struct pid_namespace *ns;
unsigned int level = parent_pid_ns->level + 1;
struct ucounts *ucounts;
- int i;
int err;
err = -EINVAL;
@@ -117,17 +117,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
if (ns == NULL)
goto out_dec;
- ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- if (!ns->pidmap[0].page)
- goto out_free;
+ idr_init(&ns->idr);
ns->pid_cachep = create_pid_cachep(level + 1);
if (ns->pid_cachep == NULL)
- goto out_free_map;
+ goto out_free_idr;
err = ns_alloc_inum(&ns->ns);
if (err)
- goto out_free_map;
+ goto out_free_idr;
ns->ns.ops = &pidns_operations;
kref_init(&ns->kref);
@@ -135,20 +133,13 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
- ns->nr_hashed = PIDNS_HASH_ADDING;
+ ns->pid_allocated = PIDNS_ADDING;
INIT_WORK(&ns->proc_work, proc_cleanup_work);
- set_bit(0, ns->pidmap[0].page);
- atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
-
- for (i = 1; i < PIDMAP_ENTRIES; i++)
- atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
-
return ns;
-out_free_map:
- kfree(ns->pidmap[0].page);
-out_free:
+out_free_idr:
+ idr_destroy(&ns->idr);
kmem_cache_free(pid_ns_cachep, ns);
out_dec:
dec_pid_namespaces(ucounts);
@@ -168,11 +159,9 @@ static void delayed_free_pidns(struct rcu_head *p)
static void destroy_pid_namespace(struct pid_namespace *ns)
{
- int i;
-
ns_free_inum(&ns->ns);
- for (i = 0; i < PIDMAP_ENTRIES; i++)
- kfree(ns->pidmap[i].page);
+
+ idr_destroy(&ns->idr);
call_rcu(&ns->rcu, delayed_free_pidns);
}
@@ -213,6 +202,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
int rc;
struct task_struct *task, *me = current;
int init_pids = thread_group_leader(me) ? 1 : 2;
+ struct pid *pid;
/* Don't allow any more processes into the pid namespace */
disable_pid_allocation(pid_ns);
@@ -239,20 +229,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
* maintain a tasklist for each pid namespace.
*
*/
+ rcu_read_lock();
read_lock(&tasklist_lock);
- nr = next_pidmap(pid_ns, 1);
- while (nr > 0) {
- rcu_read_lock();
-
- task = pid_task(find_vpid(nr), PIDTYPE_PID);
+ nr = 2;
+ idr_for_each_entry_continue(&pid_ns->idr, pid, nr) {
+ task = pid_task(pid, PIDTYPE_PID);
if (task && !__fatal_signal_pending(task))
send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
-
- rcu_read_unlock();
-
- nr = next_pidmap(pid_ns, nr);
}
read_unlock(&tasklist_lock);
+ rcu_read_unlock();
/*
* Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
@@ -268,7 +254,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
* sys_wait4() above can't reap the EXIT_DEAD children but we do not
* really care, we could reparent them to the global init. We could
* exit and reap ->child_reaper even if it is not the last thread in
- * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(),
+ * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(),
* pid_ns can not go away until proc_kill_sb() drops the reference.
*
* But this ns can also have other tasks injected by setns()+fork().
@@ -282,7 +268,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
*/
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
- if (pid_ns->nr_hashed == init_pids)
+ if (pid_ns->pid_allocated == init_pids)
break;
schedule();
}
@@ -301,6 +287,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
{
struct pid_namespace *pid_ns = task_active_pid_ns(current);
struct ctl_table tmp = *table;
+ int ret, next;
if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
@@ -311,8 +298,14 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
* it should synchronize its usage with external means.
*/
- tmp.data = &pid_ns->last_pid;
- return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ next = idr_get_cursor(&pid_ns->idr) - 1;
+
+ tmp.data = &next;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (!ret && write)
+ idr_set_cursor(&pid_ns->idr, next + 1);
+
+ return ret;
}
extern int pid_max;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 3a2ca9066583..705c2366dafe 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -22,6 +22,35 @@ DEFINE_MUTEX(pm_mutex);
#ifdef CONFIG_PM_SLEEP
+void lock_system_sleep(void)
+{
+ current->flags |= PF_FREEZER_SKIP;
+ mutex_lock(&pm_mutex);
+}
+EXPORT_SYMBOL_GPL(lock_system_sleep);
+
+void unlock_system_sleep(void)
+{
+ /*
+ * Don't use freezer_count() because we don't want the call to
+ * try_to_freeze() here.
+ *
+ * Reason:
+ * Fundamentally, we just don't need it, because freezing condition
+ * doesn't come into effect until we release the pm_mutex lock,
+ * since the freezer always works with pm_mutex held.
+ *
+ * More importantly, in the case of hibernation,
+ * unlock_system_sleep() gets called in snapshot_read() and
+ * snapshot_write() when the freezing condition is still in effect.
+ * Which means, if we use try_to_freeze() here, it would make them
+ * enter the refrigerator, thus causing hibernation to lockup.
+ */
+ current->flags &= ~PF_FREEZER_SKIP;
+ mutex_unlock(&pm_mutex);
+}
+EXPORT_SYMBOL_GPL(unlock_system_sleep);
+
/* Routines for PM-transition notifications */
static BLOCKING_NOTIFIER_HEAD(pm_chain_head);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index a917a301e201..3d37c279c090 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1645,8 +1645,7 @@ static unsigned long free_unnecessary_pages(void)
* [number of saveable pages] - [number of pages that can be freed in theory]
*
* where the second term is the sum of (1) reclaimable slab pages, (2) active
- * and (3) inactive anonymous pages, (4) active and (5) inactive file pages,
- * minus mapped file pages.
+ * and (3) inactive anonymous pages, (4) active and (5) inactive file pages.
*/
static unsigned long minimum_image_size(unsigned long saveable)
{
@@ -1656,8 +1655,7 @@ static unsigned long minimum_image_size(unsigned long saveable)
+ global_node_page_state(NR_ACTIVE_ANON)
+ global_node_page_state(NR_INACTIVE_ANON)
+ global_node_page_state(NR_ACTIVE_FILE)
- + global_node_page_state(NR_INACTIVE_FILE)
- - global_node_page_state(NR_FILE_MAPPED);
+ + global_node_page_state(NR_INACTIVE_FILE);
return saveable <= size ? 0 : saveable - size;
}
@@ -1884,7 +1882,7 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
*/
static inline int get_highmem_buffer(int safe_needed)
{
- buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
+ buffer = get_image_page(GFP_ATOMIC, safe_needed);
return buffer ? 0 : -ENOMEM;
}
@@ -1945,7 +1943,7 @@ static int swsusp_alloc(struct memory_bitmap *copy_bm,
while (nr_pages-- > 0) {
struct page *page;
- page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
+ page = alloc_image_page(GFP_ATOMIC);
if (!page)
goto err_out;
memory_bm_set_bit(copy_bm, page_to_pfn(page));
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 293ead59eccc..11b4282c2d20 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -240,7 +240,7 @@ static void hib_init_batch(struct hib_bio_batch *hb)
static void hib_end_io(struct bio *bio)
{
struct hib_bio_batch *hb = bio->bi_private;
- struct page *page = bio->bi_io_vec[0].bv_page;
+ struct page *page = bio_first_page_all(bio);
if (bio->bi_status) {
pr_alert("Read-error on swap-device (%u:%u:%Lu)\n",
@@ -879,7 +879,7 @@ out_clean:
* space avaiable from the resume partition.
*/
-static int enough_swap(unsigned int nr_pages, unsigned int flags)
+static int enough_swap(unsigned int nr_pages)
{
unsigned int free_swap = count_swap_pages(root_swap, 1);
unsigned int required;
@@ -915,7 +915,7 @@ int swsusp_write(unsigned int flags)
return error;
}
if (flags & SF_NOCOMPRESS_MODE) {
- if (!enough_swap(pages, flags)) {
+ if (!enough_swap(pages)) {
pr_err("Not enough free swap\n");
error = -ENOSPC;
goto out_finish;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 512f7c2baedd..c2e713f6ae2e 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -920,10 +920,10 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
return ret;
}
-static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
+static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
{
struct devkmsg_user *user = file->private_data;
- int ret = 0;
+ __poll_t ret = 0;
if (!user)
return POLLERR|POLLNVAL;
@@ -2190,7 +2190,7 @@ again:
}
if (console_seq < log_first_seq) {
- len = sprintf(text, "** %u printk messages dropped ** ",
+ len = sprintf(text, "** %u printk messages dropped **\n",
(unsigned)(log_first_seq - console_seq));
/* messages are gone, move to first one */
@@ -3141,9 +3141,6 @@ void dump_stack_print_info(const char *log_lvl)
void show_regs_print_info(const char *log_lvl)
{
dump_stack_print_info(log_lvl);
-
- printk("%stask: %p task.stack: %p\n",
- log_lvl, current, task_stack_page(current));
}
#endif
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 3cdaeaef9ce1..3e3c2004bb23 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -39,7 +39,7 @@
* There are situations when we want to make sure that all buffers
* were handled or when IRQs are blocked.
*/
-static int printk_safe_irq_ready;
+static int printk_safe_irq_ready __read_mostly;
#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \
sizeof(atomic_t) - \
@@ -63,11 +63,8 @@ static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
/* Get flushed in a more safe context. */
static void queue_flush_work(struct printk_safe_seq_buf *s)
{
- if (printk_safe_irq_ready) {
- /* Make sure that IRQ work is really initialized. */
- smp_rmb();
+ if (printk_safe_irq_ready)
irq_work_queue(&s->work);
- }
}
/*
@@ -75,7 +72,7 @@ static void queue_flush_work(struct printk_safe_seq_buf *s)
* have dedicated buffers, because otherwise printk-safe preempted by
* NMI-printk would have overwritten the NMI messages.
*
- * The messages are fushed from irq work (or from panic()), possibly,
+ * The messages are flushed from irq work (or from panic()), possibly,
* from other CPU, concurrently with printk_safe_log_store(). Should this
* happen, printk_safe_log_store() will notice the buffer->len mismatch
* and repeat the write.
@@ -398,8 +395,12 @@ void __init printk_safe_init(void)
#endif
}
- /* Make sure that IRQ works are initialized before enabling. */
- smp_wmb();
+ /*
+ * In the highly unlikely event that a NMI were to trigger at
+ * this moment. Make sure IRQ work is set up before this
+ * variable is set.
+ */
+ barrier();
printk_safe_irq_ready = 1;
/* Flush pending messages that did not have scheduled IRQ works. */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 84b1367935e4..f3c82e26b995 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -659,7 +659,7 @@ static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
if (lock_task_sighand(child, &flags)) {
error = -EINVAL;
if (likely(child->last_siginfo != NULL)) {
- *info = *child->last_siginfo;
+ copy_siginfo(info, child->last_siginfo);
error = 0;
}
unlock_task_sighand(child, &flags);
@@ -675,7 +675,7 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
if (lock_task_sighand(child, &flags)) {
error = -EINVAL;
if (likely(child->last_siginfo != NULL)) {
- *child->last_siginfo = *info;
+ copy_siginfo(child->last_siginfo, info);
error = 0;
}
unlock_task_sighand(child, &flags);
@@ -1226,7 +1226,6 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
break;
case PTRACE_SETSIGINFO:
- memset(&siginfo, 0, sizeof siginfo);
if (copy_siginfo_from_user32(
&siginfo, (struct compat_siginfo __user *) datap))
ret = -EFAULT;
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 59c471de342a..6334f2c1abd0 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -30,31 +30,8 @@
#define RCU_TRACE(stmt)
#endif /* #else #ifdef CONFIG_RCU_TRACE */
-/*
- * Process-level increment to ->dynticks_nesting field. This allows for
- * architectures that use half-interrupts and half-exceptions from
- * process context.
- *
- * DYNTICK_TASK_NEST_MASK defines a field of width DYNTICK_TASK_NEST_WIDTH
- * that counts the number of process-based reasons why RCU cannot
- * consider the corresponding CPU to be idle, and DYNTICK_TASK_NEST_VALUE
- * is the value used to increment or decrement this field.
- *
- * The rest of the bits could in principle be used to count interrupts,
- * but this would mean that a negative-one value in the interrupt
- * field could incorrectly zero out the DYNTICK_TASK_NEST_MASK field.
- * We therefore provide a two-bit guard field defined by DYNTICK_TASK_MASK
- * that is set to DYNTICK_TASK_FLAG upon initial exit from idle.
- * The DYNTICK_TASK_EXIT_IDLE value is thus the combined value used upon
- * initial exit from idle.
- */
-#define DYNTICK_TASK_NEST_WIDTH 7
-#define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1)
-#define DYNTICK_TASK_NEST_MASK (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1)
-#define DYNTICK_TASK_FLAG ((DYNTICK_TASK_NEST_VALUE / 8) * 2)
-#define DYNTICK_TASK_MASK ((DYNTICK_TASK_NEST_VALUE / 8) * 3)
-#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \
- DYNTICK_TASK_FLAG)
+/* Offset to allow for unmatched rcu_irq_{enter,exit}(). */
+#define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1)
/*
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 1f87a02c3399..d1ebdf9868bb 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -106,10 +106,6 @@ static int rcu_perf_writer_state;
#define MAX_MEAS 10000
#define MIN_MEAS 100
-static int perf_runnable = IS_ENABLED(MODULE);
-module_param(perf_runnable, int, 0444);
-MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot");
-
/*
* Operations vector for selecting different types of tests.
*/
@@ -646,7 +642,7 @@ rcu_perf_init(void)
&tasks_ops,
};
- if (!torture_init_begin(perf_type, verbose, &perf_runnable))
+ if (!torture_init_begin(perf_type, verbose))
return -EBUSY;
/* Process args and tell the world that the perf'er is on the job. */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 74f6b0146b98..308e6fdbced8 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -187,10 +187,6 @@ static const char *rcu_torture_writer_state_getname(void)
return rcu_torture_writer_state_names[i];
}
-static int torture_runnable = IS_ENABLED(MODULE);
-module_param(torture_runnable, int, 0444);
-MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot");
-
#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
#define rcu_can_boost() 1
#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
@@ -315,11 +311,9 @@ static void rcu_read_delay(struct torture_random_state *rrsp)
}
if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
udelay(shortdelay_us);
-#ifdef CONFIG_PREEMPT
if (!preempt_count() &&
- !(torture_random(rrsp) % (nrealreaders * 20000)))
- preempt_schedule(); /* No QS if preempt_disable() in effect */
-#endif
+ !(torture_random(rrsp) % (nrealreaders * 500)))
+ torture_preempt_schedule(); /* QS only if preemptible. */
}
static void rcu_torture_read_unlock(int idx) __releases(RCU)
@@ -1731,7 +1725,7 @@ rcu_torture_init(void)
&sched_ops, &tasks_ops,
};
- if (!torture_init_begin(torture_type, verbose, &torture_runnable))
+ if (!torture_init_begin(torture_type, verbose))
return -EBUSY;
/* Process args and tell the world that the torturer is on the job. */
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 6d5880089ff6..d5cea81378cc 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -53,6 +53,33 @@ static void srcu_invoke_callbacks(struct work_struct *work);
static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
static void process_srcu(struct work_struct *work);
+/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */
+#define spin_lock_rcu_node(p) \
+do { \
+ spin_lock(&ACCESS_PRIVATE(p, lock)); \
+ smp_mb__after_unlock_lock(); \
+} while (0)
+
+#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock))
+
+#define spin_lock_irq_rcu_node(p) \
+do { \
+ spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
+ smp_mb__after_unlock_lock(); \
+} while (0)
+
+#define spin_unlock_irq_rcu_node(p) \
+ spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
+
+#define spin_lock_irqsave_rcu_node(p, flags) \
+do { \
+ spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
+ smp_mb__after_unlock_lock(); \
+} while (0)
+
+#define spin_unlock_irqrestore_rcu_node(p, flags) \
+ spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
+
/*
* Initialize SRCU combining tree. Note that statically allocated
* srcu_struct structures might already have srcu_read_lock() and
@@ -77,7 +104,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
/* Each pass through this loop initializes one srcu_node structure. */
rcu_for_each_node_breadth_first(sp, snp) {
- raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock));
+ spin_lock_init(&ACCESS_PRIVATE(snp, lock));
WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
ARRAY_SIZE(snp->srcu_data_have_cbs));
for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
@@ -111,7 +138,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
snp_first = sp->level[level];
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(sp->sda, cpu);
- raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
+ spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
rcu_segcblist_init(&sdp->srcu_cblist);
sdp->srcu_cblist_invoking = false;
sdp->srcu_gp_seq_needed = sp->srcu_gp_seq;
@@ -170,7 +197,7 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name,
/* Don't re-initialize a lock while it is held. */
debug_check_no_locks_freed((void *)sp, sizeof(*sp));
lockdep_init_map(&sp->dep_map, name, key, 0);
- raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
+ spin_lock_init(&ACCESS_PRIVATE(sp, lock));
return init_srcu_struct_fields(sp, false);
}
EXPORT_SYMBOL_GPL(__init_srcu_struct);
@@ -187,7 +214,7 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct);
*/
int init_srcu_struct(struct srcu_struct *sp)
{
- raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
+ spin_lock_init(&ACCESS_PRIVATE(sp, lock));
return init_srcu_struct_fields(sp, false);
}
EXPORT_SYMBOL_GPL(init_srcu_struct);
@@ -210,13 +237,13 @@ static void check_init_srcu_struct(struct srcu_struct *sp)
/* The smp_load_acquire() pairs with the smp_store_release(). */
if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/
return; /* Already initialized. */
- raw_spin_lock_irqsave_rcu_node(sp, flags);
+ spin_lock_irqsave_rcu_node(sp, flags);
if (!rcu_seq_state(sp->srcu_gp_seq_needed)) {
- raw_spin_unlock_irqrestore_rcu_node(sp, flags);
+ spin_unlock_irqrestore_rcu_node(sp, flags);
return;
}
init_srcu_struct_fields(sp, true);
- raw_spin_unlock_irqrestore_rcu_node(sp, flags);
+ spin_unlock_irqrestore_rcu_node(sp, flags);
}
/*
@@ -513,7 +540,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
mutex_lock(&sp->srcu_cb_mutex);
/* End the current grace period. */
- raw_spin_lock_irq_rcu_node(sp);
+ spin_lock_irq_rcu_node(sp);
idx = rcu_seq_state(sp->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
cbdelay = srcu_get_delay(sp);
@@ -522,7 +549,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
gpseq = rcu_seq_current(&sp->srcu_gp_seq);
if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq))
sp->srcu_gp_seq_needed_exp = gpseq;
- raw_spin_unlock_irq_rcu_node(sp);
+ spin_unlock_irq_rcu_node(sp);
mutex_unlock(&sp->srcu_gp_mutex);
/* A new grace period can start at this point. But only one. */
@@ -530,7 +557,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
rcu_for_each_node_breadth_first(sp, snp) {
- raw_spin_lock_irq_rcu_node(snp);
+ spin_lock_irq_rcu_node(snp);
cbs = false;
if (snp >= sp->level[rcu_num_lvls - 1])
cbs = snp->srcu_have_cbs[idx] == gpseq;
@@ -540,7 +567,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
snp->srcu_gp_seq_needed_exp = gpseq;
mask = snp->srcu_data_have_cbs[idx];
snp->srcu_data_have_cbs[idx] = 0;
- raw_spin_unlock_irq_rcu_node(snp);
+ spin_unlock_irq_rcu_node(snp);
if (cbs)
srcu_schedule_cbs_snp(sp, snp, mask, cbdelay);
@@ -548,11 +575,11 @@ static void srcu_gp_end(struct srcu_struct *sp)
if (!(gpseq & counter_wrap_check))
for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
sdp = per_cpu_ptr(sp->sda, cpu);
- raw_spin_lock_irqsave_rcu_node(sdp, flags);
+ spin_lock_irqsave_rcu_node(sdp, flags);
if (ULONG_CMP_GE(gpseq,
sdp->srcu_gp_seq_needed + 100))
sdp->srcu_gp_seq_needed = gpseq;
- raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
}
}
@@ -560,17 +587,17 @@ static void srcu_gp_end(struct srcu_struct *sp)
mutex_unlock(&sp->srcu_cb_mutex);
/* Start a new grace period if needed. */
- raw_spin_lock_irq_rcu_node(sp);
+ spin_lock_irq_rcu_node(sp);
gpseq = rcu_seq_current(&sp->srcu_gp_seq);
if (!rcu_seq_state(gpseq) &&
ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
srcu_gp_start(sp);
- raw_spin_unlock_irq_rcu_node(sp);
+ spin_unlock_irq_rcu_node(sp);
/* Throttle expedited grace periods: Should be rare! */
srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff
? 0 : SRCU_INTERVAL);
} else {
- raw_spin_unlock_irq_rcu_node(sp);
+ spin_unlock_irq_rcu_node(sp);
}
}
@@ -590,18 +617,18 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp,
if (rcu_seq_done(&sp->srcu_gp_seq, s) ||
ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
return;
- raw_spin_lock_irqsave_rcu_node(snp, flags);
+ spin_lock_irqsave_rcu_node(snp, flags);
if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
- raw_spin_unlock_irqrestore_rcu_node(snp, flags);
+ spin_unlock_irqrestore_rcu_node(snp, flags);
return;
}
WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
- raw_spin_unlock_irqrestore_rcu_node(snp, flags);
+ spin_unlock_irqrestore_rcu_node(snp, flags);
}
- raw_spin_lock_irqsave_rcu_node(sp, flags);
+ spin_lock_irqsave_rcu_node(sp, flags);
if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
sp->srcu_gp_seq_needed_exp = s;
- raw_spin_unlock_irqrestore_rcu_node(sp, flags);
+ spin_unlock_irqrestore_rcu_node(sp, flags);
}
/*
@@ -623,12 +650,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
for (; snp != NULL; snp = snp->srcu_parent) {
if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode)
return; /* GP already done and CBs recorded. */
- raw_spin_lock_irqsave_rcu_node(snp, flags);
+ spin_lock_irqsave_rcu_node(snp, flags);
if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
snp_seq = snp->srcu_have_cbs[idx];
if (snp == sdp->mynode && snp_seq == s)
snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
- raw_spin_unlock_irqrestore_rcu_node(snp, flags);
+ spin_unlock_irqrestore_rcu_node(snp, flags);
if (snp == sdp->mynode && snp_seq != s) {
srcu_schedule_cbs_sdp(sdp, do_norm
? SRCU_INTERVAL
@@ -644,11 +671,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
snp->srcu_gp_seq_needed_exp = s;
- raw_spin_unlock_irqrestore_rcu_node(snp, flags);
+ spin_unlock_irqrestore_rcu_node(snp, flags);
}
/* Top of tree, must ensure the grace period will be started. */
- raw_spin_lock_irqsave_rcu_node(sp, flags);
+ spin_lock_irqsave_rcu_node(sp, flags);
if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) {
/*
* Record need for grace period s. Pair with load
@@ -667,7 +694,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
queue_delayed_work(system_power_efficient_wq, &sp->work,
srcu_get_delay(sp));
}
- raw_spin_unlock_irqrestore_rcu_node(sp, flags);
+ spin_unlock_irqrestore_rcu_node(sp, flags);
}
/*
@@ -830,7 +857,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
rhp->func = func;
local_irq_save(flags);
sdp = this_cpu_ptr(sp->sda);
- raw_spin_lock_rcu_node(sdp);
+ spin_lock_rcu_node(sdp);
rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
rcu_segcblist_advance(&sdp->srcu_cblist,
rcu_seq_current(&sp->srcu_gp_seq));
@@ -844,7 +871,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
sdp->srcu_gp_seq_needed_exp = s;
needexp = true;
}
- raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
if (needgp)
srcu_funnel_gp_start(sp, sdp, s, do_norm);
else if (needexp)
@@ -900,7 +927,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
/*
* Make sure that later code is ordered after the SRCU grace
- * period. This pairs with the raw_spin_lock_irq_rcu_node()
+ * period. This pairs with the spin_lock_irq_rcu_node()
* in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed
* because the current CPU might have been totally uninvolved with
* (and thus unordered against) that grace period.
@@ -1024,7 +1051,7 @@ void srcu_barrier(struct srcu_struct *sp)
*/
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(sp->sda, cpu);
- raw_spin_lock_irq_rcu_node(sdp);
+ spin_lock_irq_rcu_node(sdp);
atomic_inc(&sp->srcu_barrier_cpu_cnt);
sdp->srcu_barrier_head.func = srcu_barrier_cb;
debug_rcu_head_queue(&sdp->srcu_barrier_head);
@@ -1033,7 +1060,7 @@ void srcu_barrier(struct srcu_struct *sp)
debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
atomic_dec(&sp->srcu_barrier_cpu_cnt);
}
- raw_spin_unlock_irq_rcu_node(sdp);
+ spin_unlock_irq_rcu_node(sdp);
}
/* Remove the initial count, at which point reaching zero can happen. */
@@ -1082,17 +1109,17 @@ static void srcu_advance_state(struct srcu_struct *sp)
*/
idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */
if (idx == SRCU_STATE_IDLE) {
- raw_spin_lock_irq_rcu_node(sp);
+ spin_lock_irq_rcu_node(sp);
if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq));
- raw_spin_unlock_irq_rcu_node(sp);
+ spin_unlock_irq_rcu_node(sp);
mutex_unlock(&sp->srcu_gp_mutex);
return;
}
idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
if (idx == SRCU_STATE_IDLE)
srcu_gp_start(sp);
- raw_spin_unlock_irq_rcu_node(sp);
+ spin_unlock_irq_rcu_node(sp);
if (idx != SRCU_STATE_IDLE) {
mutex_unlock(&sp->srcu_gp_mutex);
return; /* Someone else started the grace period. */
@@ -1141,19 +1168,19 @@ static void srcu_invoke_callbacks(struct work_struct *work)
sdp = container_of(work, struct srcu_data, work.work);
sp = sdp->sp;
rcu_cblist_init(&ready_cbs);
- raw_spin_lock_irq_rcu_node(sdp);
+ spin_lock_irq_rcu_node(sdp);
rcu_segcblist_advance(&sdp->srcu_cblist,
rcu_seq_current(&sp->srcu_gp_seq));
if (sdp->srcu_cblist_invoking ||
!rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
- raw_spin_unlock_irq_rcu_node(sdp);
+ spin_unlock_irq_rcu_node(sdp);
return; /* Someone else on the job or nothing to do. */
}
/* We are on the job! Extract and invoke ready callbacks. */
sdp->srcu_cblist_invoking = true;
rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
- raw_spin_unlock_irq_rcu_node(sdp);
+ spin_unlock_irq_rcu_node(sdp);
rhp = rcu_cblist_dequeue(&ready_cbs);
for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
debug_rcu_head_unqueue(rhp);
@@ -1166,13 +1193,13 @@ static void srcu_invoke_callbacks(struct work_struct *work)
* Update counts, accelerate new callbacks, and if needed,
* schedule another round of callback invocation.
*/
- raw_spin_lock_irq_rcu_node(sdp);
+ spin_lock_irq_rcu_node(sdp);
rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
rcu_seq_snap(&sp->srcu_gp_seq));
sdp->srcu_cblist_invoking = false;
more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
- raw_spin_unlock_irq_rcu_node(sdp);
+ spin_unlock_irq_rcu_node(sdp);
if (more)
srcu_schedule_cbs_sdp(sdp, 0);
}
@@ -1185,7 +1212,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
{
bool pushgp = true;
- raw_spin_lock_irq_rcu_node(sp);
+ spin_lock_irq_rcu_node(sp);
if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) {
/* All requests fulfilled, time to go idle. */
@@ -1195,7 +1222,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
/* Outstanding request and no GP. Start one. */
srcu_gp_start(sp);
}
- raw_spin_unlock_irq_rcu_node(sp);
+ spin_unlock_irq_rcu_node(sp);
if (pushgp)
queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f9c0ca2ccf0c..491bdf39f276 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -265,25 +265,12 @@ void rcu_bh_qs(void)
#endif
static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
- .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
+ .dynticks_nesting = 1,
+ .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
.dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
};
/*
- * There's a few places, currently just in the tracing infrastructure,
- * that uses rcu_irq_enter() to make sure RCU is watching. But there's
- * a small location where that will not even work. In those cases
- * rcu_irq_enter_disabled() needs to be checked to make sure rcu_irq_enter()
- * can be called.
- */
-static DEFINE_PER_CPU(bool, disable_rcu_irq_enter);
-
-bool rcu_irq_enter_disabled(void)
-{
- return this_cpu_read(disable_rcu_irq_enter);
-}
-
-/*
* Record entry into an extended quiescent state. This is only to be
* called when not already in an extended quiescent state.
*/
@@ -762,68 +749,39 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
}
/*
- * rcu_eqs_enter_common - current CPU is entering an extended quiescent state
+ * Enter an RCU extended quiescent state, which can be either the
+ * idle loop or adaptive-tickless usermode execution.
*
- * Enter idle, doing appropriate accounting. The caller must have
- * disabled interrupts.
+ * We crowbar the ->dynticks_nmi_nesting field to zero to allow for
+ * the possibility of usermode upcalls having messed up our count
+ * of interrupt nesting level during the prior busy period.
*/
-static void rcu_eqs_enter_common(bool user)
+static void rcu_eqs_enter(bool user)
{
struct rcu_state *rsp;
struct rcu_data *rdp;
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ struct rcu_dynticks *rdtp;
- lockdep_assert_irqs_disabled();
- trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0);
- if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- !user && !is_idle_task(current)) {
- struct task_struct *idle __maybe_unused =
- idle_task(smp_processor_id());
-
- trace_rcu_dyntick(TPS("Error on entry: not idle task"), rdtp->dynticks_nesting, 0);
- rcu_ftrace_dump(DUMP_ORIG);
- WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
- current->pid, current->comm,
- idle->pid, idle->comm); /* must be idle task! */
+ rdtp = this_cpu_ptr(&rcu_dynticks);
+ WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ rdtp->dynticks_nesting == 0);
+ if (rdtp->dynticks_nesting != 1) {
+ rdtp->dynticks_nesting--;
+ return;
}
+
+ lockdep_assert_irqs_disabled();
+ trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0, rdtp->dynticks);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
for_each_rcu_flavor(rsp) {
rdp = this_cpu_ptr(rsp->rda);
do_nocb_deferred_wakeup(rdp);
}
rcu_prepare_for_idle();
- __this_cpu_inc(disable_rcu_irq_enter);
- rdtp->dynticks_nesting = 0; /* Breaks tracing momentarily. */
- rcu_dynticks_eqs_enter(); /* After this, tracing works again. */
- __this_cpu_dec(disable_rcu_irq_enter);
+ WRITE_ONCE(rdtp->dynticks_nesting, 0); /* Avoid irq-access tearing. */
+ rcu_dynticks_eqs_enter();
rcu_dynticks_task_enter();
-
- /*
- * It is illegal to enter an extended quiescent state while
- * in an RCU read-side critical section.
- */
- RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map),
- "Illegal idle entry in RCU read-side critical section.");
- RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map),
- "Illegal idle entry in RCU-bh read-side critical section.");
- RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map),
- "Illegal idle entry in RCU-sched read-side critical section.");
-}
-
-/*
- * Enter an RCU extended quiescent state, which can be either the
- * idle loop or adaptive-tickless usermode execution.
- */
-static void rcu_eqs_enter(bool user)
-{
- struct rcu_dynticks *rdtp;
-
- rdtp = this_cpu_ptr(&rcu_dynticks);
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- (rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
- if ((rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
- rcu_eqs_enter_common(user);
- else
- rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
}
/**
@@ -834,10 +792,6 @@ static void rcu_eqs_enter(bool user)
* critical sections can occur in irq handlers in idle, a possibility
* handled by irq_enter() and irq_exit().)
*
- * We crowbar the ->dynticks_nesting field to zero to allow for
- * the possibility of usermode upcalls having messed up our count
- * of interrupt nesting level during the prior busy period.
- *
* If you add or remove a call to rcu_idle_enter(), be sure to test with
* CONFIG_RCU_EQS_DEBUG=y.
*/
@@ -867,6 +821,46 @@ void rcu_user_enter(void)
#endif /* CONFIG_NO_HZ_FULL */
/**
+ * rcu_nmi_exit - inform RCU of exit from NMI context
+ *
+ * If we are returning from the outermost NMI handler that interrupted an
+ * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
+ * to let the RCU grace-period handling know that the CPU is back to
+ * being RCU-idle.
+ *
+ * If you add or remove a call to rcu_nmi_exit(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void rcu_nmi_exit(void)
+{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ /*
+ * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
+ * (We are exiting an NMI handler, so RCU better be paying attention
+ * to us!)
+ */
+ WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0);
+ WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
+
+ /*
+ * If the nesting level is not 1, the CPU wasn't RCU-idle, so
+ * leave it in non-RCU-idle state.
+ */
+ if (rdtp->dynticks_nmi_nesting != 1) {
+ trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nmi_nesting, rdtp->dynticks_nmi_nesting - 2, rdtp->dynticks);
+ WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* No store tearing. */
+ rdtp->dynticks_nmi_nesting - 2);
+ return;
+ }
+
+ /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
+ trace_rcu_dyntick(TPS("Startirq"), rdtp->dynticks_nmi_nesting, 0, rdtp->dynticks);
+ WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
+ rcu_dynticks_eqs_enter();
+}
+
+/**
* rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
*
* Exit from an interrupt handler, which might possibly result in entering
@@ -875,8 +869,8 @@ void rcu_user_enter(void)
*
* This code assumes that the idle loop never does anything that might
* result in unbalanced calls to irq_enter() and irq_exit(). If your
- * architecture violates this assumption, RCU will give you what you
- * deserve, good and hard. But very infrequently and irreproducibly.
+ * architecture's idle loop violates this assumption, RCU will give you what
+ * you deserve, good and hard. But very infrequently and irreproducibly.
*
* Use things like work queues to work around this limitation.
*
@@ -887,23 +881,14 @@ void rcu_user_enter(void)
*/
void rcu_irq_exit(void)
{
- struct rcu_dynticks *rdtp;
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
lockdep_assert_irqs_disabled();
- rdtp = this_cpu_ptr(&rcu_dynticks);
-
- /* Page faults can happen in NMI handlers, so check... */
- if (rdtp->dynticks_nmi_nesting)
- return;
-
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- rdtp->dynticks_nesting < 1);
- if (rdtp->dynticks_nesting <= 1) {
- rcu_eqs_enter_common(true);
- } else {
- trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1);
- rdtp->dynticks_nesting--;
- }
+ if (rdtp->dynticks_nmi_nesting == 1)
+ rcu_prepare_for_idle();
+ rcu_nmi_exit();
+ if (rdtp->dynticks_nmi_nesting == 0)
+ rcu_dynticks_task_enter();
}
/*
@@ -922,55 +907,33 @@ void rcu_irq_exit_irqson(void)
}
/*
- * rcu_eqs_exit_common - current CPU moving away from extended quiescent state
- *
- * If the new value of the ->dynticks_nesting counter was previously zero,
- * we really have exited idle, and must do the appropriate accounting.
- * The caller must have disabled interrupts.
- */
-static void rcu_eqs_exit_common(long long oldval, int user)
-{
- RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);)
-
- rcu_dynticks_task_exit();
- rcu_dynticks_eqs_exit();
- rcu_cleanup_after_idle();
- trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
- if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- !user && !is_idle_task(current)) {
- struct task_struct *idle __maybe_unused =
- idle_task(smp_processor_id());
-
- trace_rcu_dyntick(TPS("Error on exit: not idle task"),
- oldval, rdtp->dynticks_nesting);
- rcu_ftrace_dump(DUMP_ORIG);
- WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
- current->pid, current->comm,
- idle->pid, idle->comm); /* must be idle task! */
- }
-}
-
-/*
* Exit an RCU extended quiescent state, which can be either the
* idle loop or adaptive-tickless usermode execution.
+ *
+ * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to
+ * allow for the possibility of usermode upcalls messing up our count of
+ * interrupt nesting level during the busy period that is just now starting.
*/
static void rcu_eqs_exit(bool user)
{
struct rcu_dynticks *rdtp;
- long long oldval;
+ long oldval;
lockdep_assert_irqs_disabled();
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
- if (oldval & DYNTICK_TASK_NEST_MASK) {
- rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
- } else {
- __this_cpu_inc(disable_rcu_irq_enter);
- rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
- rcu_eqs_exit_common(oldval, user);
- __this_cpu_dec(disable_rcu_irq_enter);
+ if (oldval) {
+ rdtp->dynticks_nesting++;
+ return;
}
+ rcu_dynticks_task_exit();
+ rcu_dynticks_eqs_exit();
+ rcu_cleanup_after_idle();
+ trace_rcu_dyntick(TPS("End"), rdtp->dynticks_nesting, 1, rdtp->dynticks);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
+ WRITE_ONCE(rdtp->dynticks_nesting, 1);
+ WRITE_ONCE(rdtp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
}
/**
@@ -979,11 +942,6 @@ static void rcu_eqs_exit(bool user)
* Exit idle mode, in other words, -enter- the mode in which RCU
* read-side critical sections can occur.
*
- * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to
- * allow for the possibility of usermode upcalls messing up our count
- * of interrupt nesting level during the busy period that is just
- * now starting.
- *
* If you add or remove a call to rcu_idle_exit(), be sure to test with
* CONFIG_RCU_EQS_DEBUG=y.
*/
@@ -1013,65 +971,6 @@ void rcu_user_exit(void)
#endif /* CONFIG_NO_HZ_FULL */
/**
- * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
- *
- * Enter an interrupt handler, which might possibly result in exiting
- * idle mode, in other words, entering the mode in which read-side critical
- * sections can occur. The caller must have disabled interrupts.
- *
- * Note that the Linux kernel is fully capable of entering an interrupt
- * handler that it never exits, for example when doing upcalls to
- * user mode! This code assumes that the idle loop never does upcalls to
- * user mode. If your architecture does do upcalls from the idle loop (or
- * does anything else that results in unbalanced calls to the irq_enter()
- * and irq_exit() functions), RCU will give you what you deserve, good
- * and hard. But very infrequently and irreproducibly.
- *
- * Use things like work queues to work around this limitation.
- *
- * You have been warned.
- *
- * If you add or remove a call to rcu_irq_enter(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-void rcu_irq_enter(void)
-{
- struct rcu_dynticks *rdtp;
- long long oldval;
-
- lockdep_assert_irqs_disabled();
- rdtp = this_cpu_ptr(&rcu_dynticks);
-
- /* Page faults can happen in NMI handlers, so check... */
- if (rdtp->dynticks_nmi_nesting)
- return;
-
- oldval = rdtp->dynticks_nesting;
- rdtp->dynticks_nesting++;
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- rdtp->dynticks_nesting == 0);
- if (oldval)
- trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
- else
- rcu_eqs_exit_common(oldval, true);
-}
-
-/*
- * Wrapper for rcu_irq_enter() where interrupts are enabled.
- *
- * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
- */
-void rcu_irq_enter_irqson(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- rcu_irq_enter();
- local_irq_restore(flags);
-}
-
-/**
* rcu_nmi_enter - inform RCU of entry to NMI context
*
* If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and
@@ -1086,7 +985,7 @@ void rcu_irq_enter_irqson(void)
void rcu_nmi_enter(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- int incby = 2;
+ long incby = 2;
/* Complain about underflow. */
WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0);
@@ -1103,45 +1002,61 @@ void rcu_nmi_enter(void)
rcu_dynticks_eqs_exit();
incby = 1;
}
- rdtp->dynticks_nmi_nesting += incby;
+ trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
+ rdtp->dynticks_nmi_nesting,
+ rdtp->dynticks_nmi_nesting + incby, rdtp->dynticks);
+ WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* Prevent store tearing. */
+ rdtp->dynticks_nmi_nesting + incby);
barrier();
}
/**
- * rcu_nmi_exit - inform RCU of exit from NMI context
+ * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
*
- * If we are returning from the outermost NMI handler that interrupted an
- * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
- * to let the RCU grace-period handling know that the CPU is back to
- * being RCU-idle.
+ * Enter an interrupt handler, which might possibly result in exiting
+ * idle mode, in other words, entering the mode in which read-side critical
+ * sections can occur. The caller must have disabled interrupts.
*
- * If you add or remove a call to rcu_nmi_exit(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
+ * Note that the Linux kernel is fully capable of entering an interrupt
+ * handler that it never exits, for example when doing upcalls to user mode!
+ * This code assumes that the idle loop never does upcalls to user mode.
+ * If your architecture's idle loop does do upcalls to user mode (or does
+ * anything else that results in unbalanced calls to the irq_enter() and
+ * irq_exit() functions), RCU will give you what you deserve, good and hard.
+ * But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
+ *
+ * If you add or remove a call to rcu_irq_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
-void rcu_nmi_exit(void)
+void rcu_irq_enter(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- /*
- * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
- * (We are exiting an NMI handler, so RCU better be paying attention
- * to us!)
- */
- WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0);
- WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
+ lockdep_assert_irqs_disabled();
+ if (rdtp->dynticks_nmi_nesting == 0)
+ rcu_dynticks_task_exit();
+ rcu_nmi_enter();
+ if (rdtp->dynticks_nmi_nesting == 1)
+ rcu_cleanup_after_idle();
+}
- /*
- * If the nesting level is not 1, the CPU wasn't RCU-idle, so
- * leave it in non-RCU-idle state.
- */
- if (rdtp->dynticks_nmi_nesting != 1) {
- rdtp->dynticks_nmi_nesting -= 2;
- return;
- }
+/*
+ * Wrapper for rcu_irq_enter() where interrupts are enabled.
+ *
+ * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void rcu_irq_enter_irqson(void)
+{
+ unsigned long flags;
- /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
- rdtp->dynticks_nmi_nesting = 0;
- rcu_dynticks_eqs_enter();
+ local_irq_save(flags);
+ rcu_irq_enter();
+ local_irq_restore(flags);
}
/**
@@ -1233,7 +1148,8 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
*/
static int rcu_is_cpu_rrupt_from_idle(void)
{
- return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1;
+ return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 0 &&
+ __this_cpu_read(rcu_dynticks.dynticks_nmi_nesting) <= 1;
}
/*
@@ -2789,6 +2705,11 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
rdp->n_force_qs_snap = rsp->n_force_qs;
} else if (count < rdp->qlen_last_fqs_check - qhimark)
rdp->qlen_last_fqs_check = count;
+
+ /*
+ * The following usually indicates a double call_rcu(). To track
+ * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
+ */
WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0));
local_irq_restore(flags);
@@ -3723,7 +3644,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
- WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
+ WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1);
WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp->dynticks)));
rdp->cpu = cpu;
rdp->rsp = rsp;
@@ -3752,7 +3673,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
!init_nocb_callback_list(rdp))
rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
- rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
+ rdp->dynticks->dynticks_nesting = 1; /* CPU not up, no tearing. */
rcu_dynticks_eqs_online();
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 46a5d1991450..6488a3b0e729 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -38,9 +38,8 @@
* Dynticks per-CPU state.
*/
struct rcu_dynticks {
- long long dynticks_nesting; /* Track irq/process nesting level. */
- /* Process level is worth LLONG_MAX/2. */
- int dynticks_nmi_nesting; /* Track NMI nesting level. */
+ long dynticks_nesting; /* Track process nesting level. */
+ long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */
atomic_t dynticks; /* Even value for idle, else odd. */
bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */
unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index db85ca3975f1..fb88a028deec 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -61,7 +61,6 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
#ifdef CONFIG_RCU_NOCB_CPU
static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
-static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
@@ -1687,7 +1686,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
}
print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
delta = rdp->mynode->gpnum - rdp->rcu_iw_gpnum;
- pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
+ pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%ld softirq=%u/%u fqs=%ld %s\n",
cpu,
"O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
@@ -1752,7 +1751,6 @@ static void increment_cpu_stall_ticks(void)
static int __init rcu_nocb_setup(char *str)
{
alloc_bootmem_cpumask_var(&rcu_nocb_mask);
- have_rcu_nocb_mask = true;
cpulist_parse(str, rcu_nocb_mask);
return 1;
}
@@ -1801,7 +1799,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
/* Is the specified CPU a no-CBs CPU? */
bool rcu_is_nocb_cpu(int cpu)
{
- if (have_rcu_nocb_mask)
+ if (cpumask_available(rcu_nocb_mask))
return cpumask_test_cpu(cpu, rcu_nocb_mask);
return false;
}
@@ -2295,14 +2293,13 @@ void __init rcu_init_nohz(void)
need_rcu_nocb_mask = true;
#endif /* #if defined(CONFIG_NO_HZ_FULL) */
- if (!have_rcu_nocb_mask && need_rcu_nocb_mask) {
+ if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) {
if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
return;
}
- have_rcu_nocb_mask = true;
}
- if (!have_rcu_nocb_mask)
+ if (!cpumask_available(rcu_nocb_mask))
return;
#if defined(CONFIG_NO_HZ_FULL)
@@ -2428,7 +2425,7 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp)
struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */
struct rcu_data *rdp_prev = NULL;
- if (!have_rcu_nocb_mask)
+ if (!cpumask_available(rcu_nocb_mask))
return;
if (ls == -1) {
ls = int_sqrt(nr_cpu_ids);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index fbd56d6e575b..68fa19a5e7bd 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -422,11 +422,13 @@ void init_rcu_head(struct rcu_head *head)
{
debug_object_init(head, &rcuhead_debug_descr);
}
+EXPORT_SYMBOL_GPL(init_rcu_head);
void destroy_rcu_head(struct rcu_head *head)
{
debug_object_free(head, &rcuhead_debug_descr);
}
+EXPORT_SYMBOL_GPL(destroy_rcu_head);
static bool rcuhead_is_static_object(void *addr)
{
diff --git a/kernel/reboot.c b/kernel/reboot.c
index bd30a973fe94..e4ced883d8de 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -104,6 +104,33 @@ int unregister_reboot_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_reboot_notifier);
+static void devm_unregister_reboot_notifier(struct device *dev, void *res)
+{
+ WARN_ON(unregister_reboot_notifier(*(struct notifier_block **)res));
+}
+
+int devm_register_reboot_notifier(struct device *dev, struct notifier_block *nb)
+{
+ struct notifier_block **rcnb;
+ int ret;
+
+ rcnb = devres_alloc(devm_unregister_reboot_notifier,
+ sizeof(*rcnb), GFP_KERNEL);
+ if (!rcnb)
+ return -ENOMEM;
+
+ ret = register_reboot_notifier(nb);
+ if (!ret) {
+ *rcnb = nb;
+ devres_add(dev, rcnb);
+ } else {
+ devres_free(rcnb);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(devm_register_reboot_notifier);
+
/*
* Notifier list for kernel code which wants to be called
* to restart the system.
diff --git a/kernel/relay.c b/kernel/relay.c
index 39a9dfc69486..41280033a4c5 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -919,9 +919,9 @@ static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
*
* Poll implemention.
*/
-static unsigned int relay_file_poll(struct file *filp, poll_table *wait)
+static __poll_t relay_file_poll(struct file *filp, poll_table *wait)
{
- unsigned int mask = 0;
+ __poll_t mask = 0;
struct rchan_buf *buf = filp->private_data;
if (buf->finalized)
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 2ddaec40956f..0926aef10dad 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -34,11 +34,6 @@ void complete(struct completion *x)
spin_lock_irqsave(&x->wait.lock, flags);
- /*
- * Perform commit of crossrelease here.
- */
- complete_release_commit(x);
-
if (x->done != UINT_MAX)
x->done++;
__wake_up_locked(&x->wait, TASK_NORMAL, 1);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5b82a0073532..3da7a2444a91 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -16,6 +16,7 @@
#include <linux/init_task.h>
#include <linux/context_tracking.h>
#include <linux/rcupdate_wait.h>
+#include <linux/compat.h>
#include <linux/blkdev.h>
#include <linux/kprobes.h>
@@ -507,7 +508,8 @@ void resched_cpu(int cpu)
unsigned long flags;
raw_spin_lock_irqsave(&rq->lock, flags);
- resched_curr(rq);
+ if (cpu_online(cpu) || cpu == smp_processor_id())
+ resched_curr(rq);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -2044,7 +2046,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* If the owning (remote) CPU is still in the middle of schedule() with
* this task as prev, wait until its done referencing the task.
*
- * Pairs with the smp_store_release() in finish_lock_switch().
+ * Pairs with the smp_store_release() in finish_task().
*
* This ensures that tasks getting woken will be fully ordered against
* their previous state and preserve Program Order.
@@ -2055,7 +2057,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
p->state = TASK_WAKING;
if (p->in_iowait) {
- delayacct_blkio_end();
+ delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
@@ -2068,7 +2070,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
#else /* CONFIG_SMP */
if (p->in_iowait) {
- delayacct_blkio_end();
+ delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
@@ -2121,7 +2123,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
if (!task_on_rq_queued(p)) {
if (p->in_iowait) {
- delayacct_blkio_end();
+ delayacct_blkio_end(p);
atomic_dec(&rq->nr_iowait);
}
ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
@@ -2570,6 +2572,50 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
#endif /* CONFIG_PREEMPT_NOTIFIERS */
+static inline void prepare_task(struct task_struct *next)
+{
+#ifdef CONFIG_SMP
+ /*
+ * Claim the task as running, we do this before switching to it
+ * such that any running task will have this set.
+ */
+ next->on_cpu = 1;
+#endif
+}
+
+static inline void finish_task(struct task_struct *prev)
+{
+#ifdef CONFIG_SMP
+ /*
+ * After ->on_cpu is cleared, the task can be moved to a different CPU.
+ * We must ensure this doesn't happen until the switch is completely
+ * finished.
+ *
+ * In particular, the load of prev->state in finish_task_switch() must
+ * happen before this.
+ *
+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
+ */
+ smp_store_release(&prev->on_cpu, 0);
+#endif
+}
+
+static inline void finish_lock_switch(struct rq *rq)
+{
+#ifdef CONFIG_DEBUG_SPINLOCK
+ /* this is a valid case when another task releases the spinlock */
+ rq->lock.owner = current;
+#endif
+ /*
+ * If we are tracking spinlock dependencies then we have to
+ * fix up the runqueue lock - which gets 'carried over' from
+ * prev into current:
+ */
+ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+
+ raw_spin_unlock_irq(&rq->lock);
+}
+
/**
* prepare_task_switch - prepare to switch tasks
* @rq: the runqueue preparing to switch
@@ -2590,7 +2636,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
- prepare_lock_switch(rq, next);
+ prepare_task(next);
prepare_arch_switch(next);
}
@@ -2645,7 +2691,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* the scheduled task must drop that reference.
*
* We must observe prev->state before clearing prev->on_cpu (in
- * finish_lock_switch), otherwise a concurrent wakeup can get prev
+ * finish_task), otherwise a concurrent wakeup can get prev
* running on another CPU and we could rave with its RUNNING -> DEAD
* transition, resulting in a double drop.
*/
@@ -2662,7 +2708,8 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* to use.
*/
smp_mb__after_unlock_lock();
- finish_lock_switch(rq, prev);
+ finish_task(prev);
+ finish_lock_switch(rq);
finish_arch_post_lock_switch();
fire_sched_in_preempt_notifiers(current);
@@ -4039,8 +4086,7 @@ recheck:
return -EINVAL;
}
- if (attr->sched_flags &
- ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM))
+ if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
return -EINVAL;
/*
@@ -4107,6 +4153,9 @@ recheck:
}
if (user) {
+ if (attr->sched_flags & SCHED_FLAG_SUGOV)
+ return -EINVAL;
+
retval = security_task_setscheduler(p);
if (retval)
return retval;
@@ -4162,7 +4211,8 @@ change:
}
#endif
#ifdef CONFIG_SMP
- if (dl_bandwidth_enabled() && dl_policy(policy)) {
+ if (dl_bandwidth_enabled() && dl_policy(policy) &&
+ !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
cpumask_t *span = rq->rd->span;
/*
@@ -4292,6 +4342,11 @@ int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
}
EXPORT_SYMBOL_GPL(sched_setattr);
+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
+{
+ return __sched_setscheduler(p, attr, false, true);
+}
+
/**
* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
* @p: the task in question.
@@ -5096,24 +5151,11 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
return ret;
}
-/**
- * sys_sched_rr_get_interval - return the default timeslice of a process.
- * @pid: pid of the process.
- * @interval: userspace pointer to the timeslice value.
- *
- * this syscall writes the default timeslice value of a given process
- * into the user-space timespec buffer. A value of '0' means infinity.
- *
- * Return: On success, 0 and the timeslice is in @interval. Otherwise,
- * an error code.
- */
-SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
- struct timespec __user *, interval)
+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
{
struct task_struct *p;
unsigned int time_slice;
struct rq_flags rf;
- struct timespec t;
struct rq *rq;
int retval;
@@ -5137,15 +5179,51 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
task_rq_unlock(rq, p, &rf);
rcu_read_unlock();
- jiffies_to_timespec(time_slice, &t);
- retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
- return retval;
+ jiffies_to_timespec64(time_slice, t);
+ return 0;
out_unlock:
rcu_read_unlock();
return retval;
}
+/**
+ * sys_sched_rr_get_interval - return the default timeslice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the timeslice value.
+ *
+ * this syscall writes the default timeslice value of a given process
+ * into the user-space timespec buffer. A value of '0' means infinity.
+ *
+ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
+ * an error code.
+ */
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+ struct timespec __user *, interval)
+{
+ struct timespec64 t;
+ int retval = sched_rr_get_interval(pid, &t);
+
+ if (retval == 0)
+ retval = put_timespec64(&t, interval);
+
+ return retval;
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
+ compat_pid_t, pid,
+ struct compat_timespec __user *, interval)
+{
+ struct timespec64 t;
+ int retval = sched_rr_get_interval(pid, &t);
+
+ if (retval == 0)
+ retval = compat_put_timespec64(&t, interval);
+ return retval;
+}
+#endif
+
void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
@@ -6620,7 +6698,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
return ret;
}
-static int cpu_stats_show(struct seq_file *sf, void *v)
+static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
{
struct task_group *tg = css_tg(seq_css(sf));
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
@@ -6660,7 +6738,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
}
#endif /* CONFIG_RT_GROUP_SCHED */
-static struct cftype cpu_files[] = {
+static struct cftype cpu_legacy_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
{
.name = "shares",
@@ -6681,7 +6759,7 @@ static struct cftype cpu_files[] = {
},
{
.name = "stat",
- .seq_show = cpu_stats_show,
+ .seq_show = cpu_cfs_stat_show,
},
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@@ -6699,16 +6777,182 @@ static struct cftype cpu_files[] = {
{ } /* Terminate */
};
+static int cpu_extra_stat_show(struct seq_file *sf,
+ struct cgroup_subsys_state *css)
+{
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ struct task_group *tg = css_tg(css);
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+ u64 throttled_usec;
+
+ throttled_usec = cfs_b->throttled_time;
+ do_div(throttled_usec, NSEC_PER_USEC);
+
+ seq_printf(sf, "nr_periods %d\n"
+ "nr_throttled %d\n"
+ "throttled_usec %llu\n",
+ cfs_b->nr_periods, cfs_b->nr_throttled,
+ throttled_usec);
+ }
+#endif
+ return 0;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct task_group *tg = css_tg(css);
+ u64 weight = scale_load_down(tg->shares);
+
+ return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
+}
+
+static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 weight)
+{
+ /*
+ * cgroup weight knobs should use the common MIN, DFL and MAX
+ * values which are 1, 100 and 10000 respectively. While it loses
+ * a bit of range on both ends, it maps pretty well onto the shares
+ * value used by scheduler and the round-trip conversions preserve
+ * the original value over the entire range.
+ */
+ if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
+ return -ERANGE;
+
+ weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
+
+ return sched_group_set_shares(css_tg(css), scale_load(weight));
+}
+
+static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ unsigned long weight = scale_load_down(css_tg(css)->shares);
+ int last_delta = INT_MAX;
+ int prio, delta;
+
+ /* find the closest nice value to the current weight */
+ for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
+ delta = abs(sched_prio_to_weight[prio] - weight);
+ if (delta >= last_delta)
+ break;
+ last_delta = delta;
+ }
+
+ return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
+}
+
+static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 nice)
+{
+ unsigned long weight;
+
+ if (nice < MIN_NICE || nice > MAX_NICE)
+ return -ERANGE;
+
+ weight = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO];
+ return sched_group_set_shares(css_tg(css), scale_load(weight));
+}
+#endif
+
+static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
+ long period, long quota)
+{
+ if (quota < 0)
+ seq_puts(sf, "max");
+ else
+ seq_printf(sf, "%ld", quota);
+
+ seq_printf(sf, " %ld\n", period);
+}
+
+/* caller should put the current value in *@periodp before calling */
+static int __maybe_unused cpu_period_quota_parse(char *buf,
+ u64 *periodp, u64 *quotap)
+{
+ char tok[21]; /* U64_MAX */
+
+ if (!sscanf(buf, "%s %llu", tok, periodp))
+ return -EINVAL;
+
+ *periodp *= NSEC_PER_USEC;
+
+ if (sscanf(tok, "%llu", quotap))
+ *quotap *= NSEC_PER_USEC;
+ else if (!strcmp(tok, "max"))
+ *quotap = RUNTIME_INF;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+static int cpu_max_show(struct seq_file *sf, void *v)
+{
+ struct task_group *tg = css_tg(seq_css(sf));
+
+ cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
+ return 0;
+}
+
+static ssize_t cpu_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct task_group *tg = css_tg(of_css(of));
+ u64 period = tg_get_cfs_period(tg);
+ u64 quota;
+ int ret;
+
+ ret = cpu_period_quota_parse(buf, &period, &quota);
+ if (!ret)
+ ret = tg_set_cfs_bandwidth(tg, period, quota);
+ return ret ?: nbytes;
+}
+#endif
+
+static struct cftype cpu_files[] = {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ {
+ .name = "weight",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = cpu_weight_read_u64,
+ .write_u64 = cpu_weight_write_u64,
+ },
+ {
+ .name = "weight.nice",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = cpu_weight_nice_read_s64,
+ .write_s64 = cpu_weight_nice_write_s64,
+ },
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ .name = "max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_max_show,
+ .write = cpu_max_write,
+ },
+#endif
+ { } /* terminate */
+};
+
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
.css_online = cpu_cgroup_css_online,
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
+ .css_extra_stat_show = cpu_extra_stat_show,
.fork = cpu_cgroup_fork,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
- .legacy_cftypes = cpu_files,
+ .legacy_cftypes = cpu_legacy_files,
+ .dfl_cftypes = cpu_files,
.early_init = true,
+ .threaded = true,
};
#endif /* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
deleted file mode 100644
index a8358a57a316..000000000000
--- a/kernel/sched/cpuacct.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifdef CONFIG_CGROUP_CPUACCT
-
-extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
-
-#else
-
-static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
-{
-}
-
-static inline void
-cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
-{
-}
-
-#endif
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 2f52ec0f1539..dd062a1c8cf0 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -60,7 +60,8 @@ struct sugov_cpu {
u64 last_update;
/* The fields below are only needed when sharing a policy. */
- unsigned long util;
+ unsigned long util_cfs;
+ unsigned long util_dl;
unsigned long max;
unsigned int flags;
@@ -176,21 +177,28 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
return cpufreq_driver_resolve_freq(policy, freq);
}
-static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu)
+static void sugov_get_util(struct sugov_cpu *sg_cpu)
{
- struct rq *rq = cpu_rq(cpu);
- unsigned long cfs_max;
+ struct rq *rq = cpu_rq(sg_cpu->cpu);
- cfs_max = arch_scale_cpu_capacity(NULL, cpu);
+ sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
+ sg_cpu->util_cfs = cpu_util_cfs(rq);
+ sg_cpu->util_dl = cpu_util_dl(rq);
+}
- *util = min(rq->cfs.avg.util_avg, cfs_max);
- *max = cfs_max;
+static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
+{
+ /*
+ * Ideally we would like to set util_dl as min/guaranteed freq and
+ * util_cfs + util_dl as requested freq. However, cpufreq is not yet
+ * ready for such an interface. So, we only do the latter for now.
+ */
+ return min(sg_cpu->util_cfs + sg_cpu->util_dl, sg_cpu->max);
}
-static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
- unsigned int flags)
+static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time)
{
- if (flags & SCHED_CPUFREQ_IOWAIT) {
+ if (sg_cpu->flags & SCHED_CPUFREQ_IOWAIT) {
if (sg_cpu->iowait_boost_pending)
return;
@@ -244,7 +252,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
#ifdef CONFIG_NO_HZ_COMMON
static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
{
- unsigned long idle_calls = tick_nohz_get_idle_calls();
+ unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
bool ret = idle_calls == sg_cpu->saved_idle_calls;
sg_cpu->saved_idle_calls = idle_calls;
@@ -264,7 +272,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
unsigned int next_f;
bool busy;
- sugov_set_iowait_boost(sg_cpu, time, flags);
+ sugov_set_iowait_boost(sg_cpu, time);
sg_cpu->last_update = time;
if (!sugov_should_update_freq(sg_policy, time))
@@ -272,10 +280,12 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
busy = sugov_cpu_is_busy(sg_cpu);
- if (flags & SCHED_CPUFREQ_RT_DL) {
+ if (flags & SCHED_CPUFREQ_RT) {
next_f = policy->cpuinfo.max_freq;
} else {
- sugov_get_util(&util, &max, sg_cpu->cpu);
+ sugov_get_util(sg_cpu);
+ max = sg_cpu->max;
+ util = sugov_aggregate_util(sg_cpu);
sugov_iowait_boost(sg_cpu, &util, &max);
next_f = get_next_freq(sg_policy, util, max);
/*
@@ -305,23 +315,27 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
s64 delta_ns;
/*
- * If the CPU utilization was last updated before the previous
- * frequency update and the time elapsed between the last update
- * of the CPU utilization and the last frequency update is long
- * enough, don't take the CPU into account as it probably is
- * idle now (and clear iowait_boost for it).
+ * If the CFS CPU utilization was last updated before the
+ * previous frequency update and the time elapsed between the
+ * last update of the CPU utilization and the last frequency
+ * update is long enough, reset iowait_boost and util_cfs, as
+ * they are now probably stale. However, still consider the
+ * CPU contribution if it has some DEADLINE utilization
+ * (util_dl).
*/
delta_ns = time - j_sg_cpu->last_update;
if (delta_ns > TICK_NSEC) {
j_sg_cpu->iowait_boost = 0;
j_sg_cpu->iowait_boost_pending = false;
- continue;
+ j_sg_cpu->util_cfs = 0;
+ if (j_sg_cpu->util_dl == 0)
+ continue;
}
- if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
+ if (j_sg_cpu->flags & SCHED_CPUFREQ_RT)
return policy->cpuinfo.max_freq;
- j_util = j_sg_cpu->util;
j_max = j_sg_cpu->max;
+ j_util = sugov_aggregate_util(j_sg_cpu);
if (j_util * max > j_max * util) {
util = j_util;
max = j_max;
@@ -338,22 +352,18 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
- unsigned long util, max;
unsigned int next_f;
- sugov_get_util(&util, &max, sg_cpu->cpu);
-
raw_spin_lock(&sg_policy->update_lock);
- sg_cpu->util = util;
- sg_cpu->max = max;
+ sugov_get_util(sg_cpu);
sg_cpu->flags = flags;
- sugov_set_iowait_boost(sg_cpu, time, flags);
+ sugov_set_iowait_boost(sg_cpu, time);
sg_cpu->last_update = time;
if (sugov_should_update_freq(sg_policy, time)) {
- if (flags & SCHED_CPUFREQ_RT_DL)
+ if (flags & SCHED_CPUFREQ_RT)
next_f = sg_policy->policy->cpuinfo.max_freq;
else
next_f = sugov_next_freq_shared(sg_cpu, time);
@@ -383,9 +393,9 @@ static void sugov_irq_work(struct irq_work *irq_work)
sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
/*
- * For RT and deadline tasks, the schedutil governor shoots the
- * frequency to maximum. Special care must be taken to ensure that this
- * kthread doesn't result in the same behavior.
+ * For RT tasks, the schedutil governor shoots the frequency to maximum.
+ * Special care must be taken to ensure that this kthread doesn't result
+ * in the same behavior.
*
* This is (mostly) guaranteed by the work_in_progress flag. The flag is
* updated only at the end of the sugov_work() function and before that
@@ -470,7 +480,20 @@ static void sugov_policy_free(struct sugov_policy *sg_policy)
static int sugov_kthread_create(struct sugov_policy *sg_policy)
{
struct task_struct *thread;
- struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
+ struct sched_attr attr = {
+ .size = sizeof(struct sched_attr),
+ .sched_policy = SCHED_DEADLINE,
+ .sched_flags = SCHED_FLAG_SUGOV,
+ .sched_nice = 0,
+ .sched_priority = 0,
+ /*
+ * Fake (unused) bandwidth; workaround to "fix"
+ * priority inheritance.
+ */
+ .sched_runtime = 1000000,
+ .sched_deadline = 10000000,
+ .sched_period = 10000000,
+ };
struct cpufreq_policy *policy = sg_policy->policy;
int ret;
@@ -488,10 +511,10 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
return PTR_ERR(thread);
}
- ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
+ ret = sched_setattr_nocheck(thread, &attr);
if (ret) {
kthread_stop(thread);
- pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+ pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
return ret;
}
@@ -655,7 +678,7 @@ static int sugov_start(struct cpufreq_policy *policy)
memset(sg_cpu, 0, sizeof(*sg_cpu));
sg_cpu->cpu = cpu;
sg_cpu->sg_policy = sg_policy;
- sg_cpu->flags = SCHED_CPUFREQ_RT;
+ sg_cpu->flags = 0;
sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 9be8b68a66da..bac6ac9a4ec7 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -109,7 +109,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
*/
__this_cpu_add(kernel_cpustat.cpustat[index], tmp);
- cpuacct_account_field(p, index, tmp);
+ cgroup_account_cputime_field(p, index, tmp);
}
/*
@@ -446,6 +446,13 @@ void vtime_account_irq_enter(struct task_struct *tsk)
EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
+void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
+ u64 *ut, u64 *st)
+{
+ *ut = curr->utime;
+ *st = curr->stime;
+}
+
void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
*ut = p->utime;
@@ -584,9 +591,8 @@ drop_precision:
*
* Assuming that rtime_i+1 >= rtime_i.
*/
-static void cputime_adjust(struct task_cputime *curr,
- struct prev_cputime *prev,
- u64 *ut, u64 *st)
+void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
+ u64 *ut, u64 *st)
{
u64 rtime, stime, utime;
unsigned long flags;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index f349f7e98dec..9bb0e0c412ec 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -78,7 +78,7 @@ static inline int dl_bw_cpus(int i)
#endif
static inline
-void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
+void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->running_bw;
@@ -86,10 +86,12 @@ void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
dl_rq->running_bw += dl_bw;
SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
+ /* kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL);
}
static inline
-void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
+void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->running_bw;
@@ -98,10 +100,12 @@ void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
if (dl_rq->running_bw > old)
dl_rq->running_bw = 0;
+ /* kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL);
}
static inline
-void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
+void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->this_bw;
@@ -111,7 +115,7 @@ void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
}
static inline
-void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
+void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->this_bw;
@@ -123,16 +127,46 @@ void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
}
+static inline
+void add_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ if (!dl_entity_is_special(dl_se))
+ __add_rq_bw(dl_se->dl_bw, dl_rq);
+}
+
+static inline
+void sub_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ if (!dl_entity_is_special(dl_se))
+ __sub_rq_bw(dl_se->dl_bw, dl_rq);
+}
+
+static inline
+void add_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ if (!dl_entity_is_special(dl_se))
+ __add_running_bw(dl_se->dl_bw, dl_rq);
+}
+
+static inline
+void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ if (!dl_entity_is_special(dl_se))
+ __sub_running_bw(dl_se->dl_bw, dl_rq);
+}
+
void dl_change_utilization(struct task_struct *p, u64 new_bw)
{
struct rq *rq;
+ BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV);
+
if (task_on_rq_queued(p))
return;
rq = task_rq(p);
if (p->dl.dl_non_contending) {
- sub_running_bw(p->dl.dl_bw, &rq->dl);
+ sub_running_bw(&p->dl, &rq->dl);
p->dl.dl_non_contending = 0;
/*
* If the timer handler is currently running and the
@@ -144,8 +178,8 @@ void dl_change_utilization(struct task_struct *p, u64 new_bw)
if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
put_task_struct(p);
}
- sub_rq_bw(p->dl.dl_bw, &rq->dl);
- add_rq_bw(new_bw, &rq->dl);
+ __sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ __add_rq_bw(new_bw, &rq->dl);
}
/*
@@ -217,6 +251,9 @@ static void task_non_contending(struct task_struct *p)
if (dl_se->dl_runtime == 0)
return;
+ if (dl_entity_is_special(dl_se))
+ return;
+
WARN_ON(hrtimer_active(&dl_se->inactive_timer));
WARN_ON(dl_se->dl_non_contending);
@@ -236,12 +273,12 @@ static void task_non_contending(struct task_struct *p)
*/
if (zerolag_time < 0) {
if (dl_task(p))
- sub_running_bw(dl_se->dl_bw, dl_rq);
+ sub_running_bw(dl_se, dl_rq);
if (!dl_task(p) || p->state == TASK_DEAD) {
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
if (p->state == TASK_DEAD)
- sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ sub_rq_bw(&p->dl, &rq->dl);
raw_spin_lock(&dl_b->lock);
__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
__dl_clear_params(p);
@@ -268,7 +305,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
return;
if (flags & ENQUEUE_MIGRATED)
- add_rq_bw(dl_se->dl_bw, dl_rq);
+ add_rq_bw(dl_se, dl_rq);
if (dl_se->dl_non_contending) {
dl_se->dl_non_contending = 0;
@@ -289,7 +326,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
* when the "inactive timer" fired).
* So, add it back.
*/
- add_running_bw(dl_se->dl_bw, dl_rq);
+ add_running_bw(dl_se, dl_rq);
}
}
@@ -1114,7 +1151,8 @@ static void update_curr_dl(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_dl_entity *dl_se = &curr->dl;
- u64 delta_exec;
+ u64 delta_exec, scaled_delta_exec;
+ int cpu = cpu_of(rq);
if (!dl_task(curr) || !on_dl_rq(dl_se))
return;
@@ -1134,9 +1172,6 @@ static void update_curr_dl(struct rq *rq)
return;
}
- /* kick cpufreq (see the comment in kernel/sched/sched.h). */
- cpufreq_update_util(rq, SCHED_CPUFREQ_DL);
-
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -1144,17 +1179,43 @@ static void update_curr_dl(struct rq *rq)
account_group_exec_runtime(curr, delta_exec);
curr->se.exec_start = rq_clock_task(rq);
- cpuacct_charge(curr, delta_exec);
+ cgroup_account_cputime(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
- if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM))
- delta_exec = grub_reclaim(delta_exec, rq, &curr->dl);
- dl_se->runtime -= delta_exec;
+ if (dl_entity_is_special(dl_se))
+ return;
+
+ /*
+ * For tasks that participate in GRUB, we implement GRUB-PA: the
+ * spare reclaimed bandwidth is used to clock down frequency.
+ *
+ * For the others, we still need to scale reservation parameters
+ * according to current frequency and CPU maximum capacity.
+ */
+ if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) {
+ scaled_delta_exec = grub_reclaim(delta_exec,
+ rq,
+ &curr->dl);
+ } else {
+ unsigned long scale_freq = arch_scale_freq_capacity(cpu);
+ unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+ scaled_delta_exec = cap_scale(delta_exec, scale_freq);
+ scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu);
+ }
+
+ dl_se->runtime -= scaled_delta_exec;
throttle:
if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
dl_se->dl_throttled = 1;
+
+ /* If requested, inform the user about runtime overruns. */
+ if (dl_runtime_exceeded(dl_se) &&
+ (dl_se->flags & SCHED_FLAG_DL_OVERRUN))
+ dl_se->dl_overrun = 1;
+
__dequeue_task_dl(rq, curr, 0);
if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
@@ -1204,8 +1265,8 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
if (p->state == TASK_DEAD && dl_se->dl_non_contending) {
- sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl));
- sub_rq_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl));
+ sub_running_bw(&p->dl, dl_rq_of_se(&p->dl));
+ sub_rq_bw(&p->dl, dl_rq_of_se(&p->dl));
dl_se->dl_non_contending = 0;
}
@@ -1222,7 +1283,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
sched_clock_tick();
update_rq_clock(rq);
- sub_running_bw(dl_se->dl_bw, &rq->dl);
+ sub_running_bw(dl_se, &rq->dl);
dl_se->dl_non_contending = 0;
unlock:
task_rq_unlock(rq, p, &rf);
@@ -1416,8 +1477,8 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
dl_check_constrained_dl(&p->dl);
if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) {
- add_rq_bw(p->dl.dl_bw, &rq->dl);
- add_running_bw(p->dl.dl_bw, &rq->dl);
+ add_rq_bw(&p->dl, &rq->dl);
+ add_running_bw(&p->dl, &rq->dl);
}
/*
@@ -1457,8 +1518,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
__dequeue_task_dl(rq, p, flags);
if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) {
- sub_running_bw(p->dl.dl_bw, &rq->dl);
- sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ sub_running_bw(&p->dl, &rq->dl);
+ sub_rq_bw(&p->dl, &rq->dl);
}
/*
@@ -1564,7 +1625,7 @@ static void migrate_task_rq_dl(struct task_struct *p)
*/
raw_spin_lock(&rq->lock);
if (p->dl.dl_non_contending) {
- sub_running_bw(p->dl.dl_bw, &rq->dl);
+ sub_running_bw(&p->dl, &rq->dl);
p->dl.dl_non_contending = 0;
/*
* If the timer handler is currently running and the
@@ -1576,7 +1637,7 @@ static void migrate_task_rq_dl(struct task_struct *p)
if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
put_task_struct(p);
}
- sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ sub_rq_bw(&p->dl, &rq->dl);
raw_spin_unlock(&rq->lock);
}
@@ -2019,11 +2080,11 @@ retry:
}
deactivate_task(rq, next_task, 0);
- sub_running_bw(next_task->dl.dl_bw, &rq->dl);
- sub_rq_bw(next_task->dl.dl_bw, &rq->dl);
+ sub_running_bw(&next_task->dl, &rq->dl);
+ sub_rq_bw(&next_task->dl, &rq->dl);
set_task_cpu(next_task, later_rq->cpu);
- add_rq_bw(next_task->dl.dl_bw, &later_rq->dl);
- add_running_bw(next_task->dl.dl_bw, &later_rq->dl);
+ add_rq_bw(&next_task->dl, &later_rq->dl);
+ add_running_bw(&next_task->dl, &later_rq->dl);
activate_task(later_rq, next_task, 0);
ret = 1;
@@ -2111,11 +2172,11 @@ static void pull_dl_task(struct rq *this_rq)
resched = true;
deactivate_task(src_rq, p, 0);
- sub_running_bw(p->dl.dl_bw, &src_rq->dl);
- sub_rq_bw(p->dl.dl_bw, &src_rq->dl);
+ sub_running_bw(&p->dl, &src_rq->dl);
+ sub_rq_bw(&p->dl, &src_rq->dl);
set_task_cpu(p, this_cpu);
- add_rq_bw(p->dl.dl_bw, &this_rq->dl);
- add_running_bw(p->dl.dl_bw, &this_rq->dl);
+ add_rq_bw(&p->dl, &this_rq->dl);
+ add_running_bw(&p->dl, &this_rq->dl);
activate_task(this_rq, p, 0);
dmin = p->dl.deadline;
@@ -2224,7 +2285,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
task_non_contending(p);
if (!task_on_rq_queued(p))
- sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ sub_rq_bw(&p->dl, &rq->dl);
/*
* We cannot use inactive_task_timer() to invoke sub_running_bw()
@@ -2256,7 +2317,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
/* If p is not queued we will update its parameters at next wakeup. */
if (!task_on_rq_queued(p)) {
- add_rq_bw(p->dl.dl_bw, &rq->dl);
+ add_rq_bw(&p->dl, &rq->dl);
return;
}
@@ -2435,6 +2496,9 @@ int sched_dl_overflow(struct task_struct *p, int policy,
u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
int cpus, err = -1;
+ if (attr->sched_flags & SCHED_FLAG_SUGOV)
+ return 0;
+
/* !deadline task may carry old deadline bandwidth */
if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
return 0;
@@ -2521,6 +2585,10 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
*/
bool __checkparam_dl(const struct sched_attr *attr)
{
+ /* special dl tasks don't actually use any parameter */
+ if (attr->sched_flags & SCHED_FLAG_SUGOV)
+ return true;
+
/* deadline != 0 */
if (attr->sched_deadline == 0)
return false;
@@ -2566,6 +2634,7 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_throttled = 0;
dl_se->dl_yielded = 0;
dl_se->dl_non_contending = 0;
+ dl_se->dl_overrun = 0;
}
bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0989676c50e9..7b6535987500 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -844,7 +844,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
struct task_struct *curtask = task_of(curr);
trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
- cpuacct_charge(curtask, delta_exec);
+ cgroup_account_cputime(curtask, delta_exec);
account_group_exec_runtime(curtask, delta_exec);
}
@@ -3020,9 +3020,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
/*
* There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be
- * a real problem -- added to that it only calls on the local
- * CPU, so if we enqueue remotely we'll miss an update, but
- * the next tick/schedule should update.
+ * a real problem.
*
* It will not get called when we go idle, because the idle
* thread is a different class (!fair), nor will the utilization
@@ -3091,8 +3089,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
return c1 + c2 + c3;
}
-#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
-
/*
* Accumulate the three separate parts of the sum; d1 the remainder
* of the last (incomplete) period, d2 the span of full periods and d3
@@ -3122,7 +3118,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
u64 periods;
- scale_freq = arch_scale_freq_capacity(NULL, cpu);
+ scale_freq = arch_scale_freq_capacity(cpu);
scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
delta += sa->period_contrib;
@@ -3413,9 +3409,9 @@ void set_task_rq_fair(struct sched_entity *se,
* _IFF_ we look at the pure running and runnable sums. Because they
* represent the very same entity, just at different points in the hierarchy.
*
- *
- * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and
- * simply copies the running sum over.
+ * Per the above update_tg_cfs_util() is trivial and simply copies the running
+ * sum over (but still wrong, because the group entity and group rq do not have
+ * their PELT windows aligned).
*
* However, update_tg_cfs_runnable() is more complex. So we have:
*
@@ -3424,11 +3420,11 @@ void set_task_rq_fair(struct sched_entity *se,
* And since, like util, the runnable part should be directly transferable,
* the following would _appear_ to be the straight forward approach:
*
- * grq->avg.load_avg = grq->load.weight * grq->avg.running_avg (3)
+ * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3)
*
* And per (1) we have:
*
- * ge->avg.running_avg == grq->avg.running_avg
+ * ge->avg.runnable_avg == grq->avg.runnable_avg
*
* Which gives:
*
@@ -3447,27 +3443,28 @@ void set_task_rq_fair(struct sched_entity *se,
* to (shortly) return to us. This only works by keeping the weights as
* integral part of the sum. We therefore cannot decompose as per (3).
*
- * OK, so what then?
+ * Another reason this doesn't work is that runnable isn't a 0-sum entity.
+ * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
+ * rq itself is runnable anywhere between 2/3 and 1 depending on how the
+ * runnable section of these tasks overlap (or not). If they were to perfectly
+ * align the rq as a whole would be runnable 2/3 of the time. If however we
+ * always have at least 1 runnable task, the rq as a whole is always runnable.
*
+ * So we'll have to approximate.. :/
*
- * Another way to look at things is:
+ * Given the constraint:
*
- * grq->avg.load_avg = \Sum se->avg.load_avg
+ * ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
*
- * Therefore, per (2):
+ * We can construct a rule that adds runnable to a rq by assuming minimal
+ * overlap.
*
- * grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg
+ * On removal, we'll assume each task is equally runnable; which yields:
*
- * And the very thing we're propagating is a change in that sum (someone
- * joined/left). So we can easily know the runnable change, which would be, per
- * (2) the already tracked se->load_avg divided by the corresponding
- * se->weight.
+ * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
*
- * Basically (4) but in differential form:
+ * XXX: only do this for the part of runnable > running ?
*
- * d(runnable_avg) += se->avg.load_avg / se->load.weight
- * (5)
- * ge->avg.load_avg += ge->load.weight * d(runnable_avg)
*/
static inline void
@@ -3479,6 +3476,14 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
if (!delta)
return;
+ /*
+ * The relation between sum and avg is:
+ *
+ * LOAD_AVG_MAX - 1024 + sa->period_contrib
+ *
+ * however, the PELT windows are not aligned between grq and gse.
+ */
+
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
@@ -3491,33 +3496,68 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long runnable_sum = gcfs_rq->prop_runnable_sum;
- long runnable_load_avg, load_avg;
- s64 runnable_load_sum, load_sum;
+ long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
+ unsigned long runnable_load_avg, load_avg;
+ u64 runnable_load_sum, load_sum = 0;
+ s64 delta_sum;
if (!runnable_sum)
return;
gcfs_rq->prop_runnable_sum = 0;
+ if (runnable_sum >= 0) {
+ /*
+ * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
+ * the CPU is saturated running == runnable.
+ */
+ runnable_sum += se->avg.load_sum;
+ runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
+ } else {
+ /*
+ * Estimate the new unweighted runnable_sum of the gcfs_rq by
+ * assuming all tasks are equally runnable.
+ */
+ if (scale_load_down(gcfs_rq->load.weight)) {
+ load_sum = div_s64(gcfs_rq->avg.load_sum,
+ scale_load_down(gcfs_rq->load.weight));
+ }
+
+ /* But make sure to not inflate se's runnable */
+ runnable_sum = min(se->avg.load_sum, load_sum);
+ }
+
+ /*
+ * runnable_sum can't be lower than running_sum
+ * As running sum is scale with cpu capacity wehreas the runnable sum
+ * is not we rescale running_sum 1st
+ */
+ running_sum = se->avg.util_sum /
+ arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
+ runnable_sum = max(runnable_sum, running_sum);
+
load_sum = (s64)se_weight(se) * runnable_sum;
load_avg = div_s64(load_sum, LOAD_AVG_MAX);
- add_positive(&se->avg.load_sum, runnable_sum);
- add_positive(&se->avg.load_avg, load_avg);
+ delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
+ delta_avg = load_avg - se->avg.load_avg;
- add_positive(&cfs_rq->avg.load_avg, load_avg);
- add_positive(&cfs_rq->avg.load_sum, load_sum);
+ se->avg.load_sum = runnable_sum;
+ se->avg.load_avg = load_avg;
+ add_positive(&cfs_rq->avg.load_avg, delta_avg);
+ add_positive(&cfs_rq->avg.load_sum, delta_sum);
runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
+ delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
+ delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
- add_positive(&se->avg.runnable_load_sum, runnable_sum);
- add_positive(&se->avg.runnable_load_avg, runnable_load_avg);
+ se->avg.runnable_load_sum = runnable_sum;
+ se->avg.runnable_load_avg = runnable_load_avg;
if (se->on_rq) {
- add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg);
- add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum);
+ add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
+ add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
}
}
@@ -4321,12 +4361,12 @@ static inline bool cfs_bandwidth_used(void)
void cfs_bandwidth_usage_inc(void)
{
- static_key_slow_inc(&__cfs_bandwidth_used);
+ static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
}
void cfs_bandwidth_usage_dec(void)
{
- static_key_slow_dec(&__cfs_bandwidth_used);
+ static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
}
#else /* HAVE_JUMP_LABEL */
static bool cfs_bandwidth_used(void)
@@ -5645,8 +5685,8 @@ static int wake_wide(struct task_struct *p)
* soonest. For the purpose of speed we only consider the waking and previous
* CPU.
*
- * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
- * will be) idle.
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is
+ * cache-affine and is (or will be) idle.
*
* wake_affine_weight() - considers the weight to reflect the average
* scheduling latency of the CPUs. This seems to work
@@ -5657,7 +5697,13 @@ static bool
wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int prev_cpu, int sync)
{
- if (idle_cpu(this_cpu))
+ /*
+ * If this_cpu is idle, it implies the wakeup is from interrupt
+ * context. Only allow the move if cache is shared. Otherwise an
+ * interrupt intensive workload could force all tasks onto one
+ * node depending on the IO topology or IRQ affinity settings.
+ */
+ if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
return true;
if (sync && cpu_rq(this_cpu)->nr_running == 1)
@@ -5721,12 +5767,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
return affine;
}
-static inline int task_util(struct task_struct *p);
-static int cpu_util_wake(int cpu, struct task_struct *p);
+static inline unsigned long task_util(struct task_struct *p);
+static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
{
- return capacity_orig_of(cpu) - cpu_util_wake(cpu, p);
+ return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
}
/*
@@ -5906,7 +5952,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
}
} else if (shallowest_idle_cpu == -1) {
load = weighted_cpuload(cpu_rq(i));
- if (load < min_load || (load == min_load && i == this_cpu)) {
+ if (load < min_load) {
min_load = load;
least_loaded_cpu = i;
}
@@ -6203,7 +6249,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* capacity_orig) as it useful for predicting the capacity required after task
* migrations (scheduler-driven DVFS).
*/
-static int cpu_util(int cpu)
+static unsigned long cpu_util(int cpu)
{
unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
unsigned long capacity = capacity_orig_of(cpu);
@@ -6211,7 +6257,7 @@ static int cpu_util(int cpu)
return (util >= capacity) ? capacity : util;
}
-static inline int task_util(struct task_struct *p)
+static inline unsigned long task_util(struct task_struct *p)
{
return p->se.avg.util_avg;
}
@@ -6220,7 +6266,7 @@ static inline int task_util(struct task_struct *p)
* cpu_util_wake: Compute cpu utilization with any contributions from
* the waking task p removed.
*/
-static int cpu_util_wake(int cpu, struct task_struct *p)
+static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
{
unsigned long util, capacity;
@@ -6405,8 +6451,7 @@ static void task_dead_fair(struct task_struct *p)
}
#endif /* CONFIG_SMP */
-static unsigned long
-wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
+static unsigned long wakeup_gran(struct sched_entity *se)
{
unsigned long gran = sysctl_sched_wakeup_granularity;
@@ -6448,7 +6493,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
if (vdiff <= 0)
return -1;
- gran = wakeup_gran(curr, se);
+ gran = wakeup_gran(se);
if (vdiff > gran)
return 1;
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index dd7908743dab..9bcbacba82a8 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -89,7 +89,9 @@ static int membarrier_private_expedited(void)
rcu_read_unlock();
}
if (!fallback) {
+ preempt_disable();
smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+ preempt_enable();
free_cpumask_var(tmpmask);
}
cpus_read_unlock();
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d8c43d73e078..862a513adca3 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -969,7 +969,7 @@ static void update_curr_rt(struct rq *rq)
account_group_exec_runtime(curr, delta_exec);
curr->se.exec_start = rq_clock_task(rq);
- cpuacct_charge(curr, delta_exec);
+ cgroup_account_cputime(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
@@ -2034,8 +2034,9 @@ static void pull_rt_task(struct rq *this_rq)
bool resched = false;
struct task_struct *p;
struct rq *src_rq;
+ int rt_overload_count = rt_overloaded(this_rq);
- if (likely(!rt_overloaded(this_rq)))
+ if (likely(!rt_overload_count))
return;
/*
@@ -2044,6 +2045,11 @@ static void pull_rt_task(struct rq *this_rq)
*/
smp_rmb();
+ /* If we are the only overloaded CPU do nothing */
+ if (rt_overload_count == 1 &&
+ cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
+ return;
+
#ifdef HAVE_RT_PUSH_IPI
if (sched_feat(RT_PUSH_IPI)) {
tell_cpu_to_push(this_rq);
@@ -2206,7 +2212,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
queue_push_tasks(rq);
#endif /* CONFIG_SMP */
- if (p->prio < rq->curr->prio)
+ if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
resched_curr(rq);
}
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 45ab0bf564e7..2e95505e23c6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -30,6 +30,7 @@
#include <linux/irq_work.h>
#include <linux/tick.h>
#include <linux/slab.h>
+#include <linux/cgroup.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
@@ -37,7 +38,6 @@
#include "cpupri.h"
#include "cpudeadline.h"
-#include "cpuacct.h"
#ifdef CONFIG_SCHED_DEBUG
# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
@@ -156,13 +156,39 @@ static inline int task_has_dl_policy(struct task_struct *p)
return dl_policy(p->policy);
}
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
+/*
+ * !! For sched_setattr_nocheck() (kernel) only !!
+ *
+ * This is actually gross. :(
+ *
+ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE
+ * tasks, but still be able to sleep. We need this on platforms that cannot
+ * atomically change clock frequency. Remove once fast switching will be
+ * available on such platforms.
+ *
+ * SUGOV stands for SchedUtil GOVernor.
+ */
+#define SCHED_FLAG_SUGOV 0x10000000
+
+static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
+{
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+ return unlikely(dl_se->flags & SCHED_FLAG_SUGOV);
+#else
+ return false;
+#endif
+}
+
/*
* Tells if entity @a should preempt entity @b.
*/
static inline bool
dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
{
- return dl_time_before(a->deadline, b->deadline);
+ return dl_entity_is_special(a) ||
+ dl_time_before(a->deadline, b->deadline);
}
/*
@@ -1328,47 +1354,6 @@ static inline int task_on_rq_migrating(struct task_struct *p)
# define finish_arch_post_lock_switch() do { } while (0)
#endif
-static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
-{
-#ifdef CONFIG_SMP
- /*
- * We can optimise this out completely for !SMP, because the
- * SMP rebalancing from interrupt is the only thing that cares
- * here.
- */
- next->on_cpu = 1;
-#endif
-}
-
-static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
-{
-#ifdef CONFIG_SMP
- /*
- * After ->on_cpu is cleared, the task can be moved to a different CPU.
- * We must ensure this doesn't happen until the switch is completely
- * finished.
- *
- * In particular, the load of prev->state in finish_task_switch() must
- * happen before this.
- *
- * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
- */
- smp_store_release(&prev->on_cpu, 0);
-#endif
-#ifdef CONFIG_DEBUG_SPINLOCK
- /* this is a valid case when another task releases the spinlock */
- rq->lock.owner = current;
-#endif
- /*
- * If we are tracking spinlock dependencies then we have to
- * fix up the runqueue lock - which gets 'carried over' from
- * prev into current:
- */
- spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
-
- raw_spin_unlock_irq(&rq->lock);
-}
-
/*
* wake flags
*/
@@ -1687,17 +1672,17 @@ static inline int hrtick_enabled(struct rq *rq)
#endif /* CONFIG_SCHED_HRTICK */
-#ifdef CONFIG_SMP
-extern void sched_avg_update(struct rq *rq);
-
#ifndef arch_scale_freq_capacity
static __always_inline
-unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
+unsigned long arch_scale_freq_capacity(int cpu)
{
return SCHED_CAPACITY_SCALE;
}
#endif
+#ifdef CONFIG_SMP
+extern void sched_avg_update(struct rq *rq);
+
#ifndef arch_scale_cpu_capacity
static __always_inline
unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
@@ -1711,10 +1696,17 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
- rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
+ rq->rt_avg += rt_delta * arch_scale_freq_capacity(cpu_of(rq));
sched_avg_update(rq);
}
#else
+#ifndef arch_scale_cpu_capacity
+static __always_inline
+unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+#endif
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
static inline void sched_avg_update(struct rq *rq) { }
#endif
@@ -2096,14 +2088,14 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
* The way cpufreq is currently arranged requires it to evaluate the CPU
* performance state (frequency/voltage) on a regular basis to prevent it from
* being stuck in a completely inadequate performance level for too long.
- * That is not guaranteed to happen if the updates are only triggered from CFS,
- * though, because they may not be coming in if RT or deadline tasks are active
- * all the time (or there are RT and DL tasks only).
+ * That is not guaranteed to happen if the updates are only triggered from CFS
+ * and DL, though, because they may not be coming in if only RT tasks are
+ * active all the time (or there are RT tasks only).
*
- * As a workaround for that issue, this function is called by the RT and DL
- * sched classes to trigger extra cpufreq updates to prevent it from stalling,
+ * As a workaround for that issue, this function is called periodically by the
+ * RT sched class to trigger extra cpufreq updates to prevent it from stalling,
* but that really is a band-aid. Going forward it should be replaced with
- * solutions targeted more specifically at RT and DL tasks.
+ * solutions targeted more specifically at RT tasks.
*/
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
{
@@ -2125,3 +2117,17 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#else /* arch_scale_freq_capacity */
#define arch_scale_freq_invariant() (false)
#endif
+
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+
+static inline unsigned long cpu_util_dl(struct rq *rq)
+{
+ return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
+}
+
+static inline unsigned long cpu_util_cfs(struct rq *rq)
+{
+ return rq->cfs.avg.util_avg;
+}
+
+#endif
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 45caf90b24cd..210b1f2146ff 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -72,7 +72,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
account_group_exec_runtime(curr, delta_exec);
curr->se.exec_start = rq_clock_task(rq);
- cpuacct_charge(curr, delta_exec);
+ cgroup_account_cputime(curr, delta_exec);
}
static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 98feab7933c7..929ecb7d6b78 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -27,7 +27,7 @@ void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&wq_head->lock, flags);
- __add_wait_queue_entry_tail(wq_head, wq_entry);
+ __add_wait_queue(wq_head, wq_entry);
spin_unlock_irqrestore(&wq_head->lock, flags);
}
EXPORT_SYMBOL(add_wait_queue);
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index f8159698aa4d..84cb3acd9260 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -183,7 +183,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo
*/
static __sched
int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
- int (*action)(atomic_t *), unsigned mode)
+ wait_atomic_t_action_f action, unsigned int mode)
{
atomic_t *val;
int ret = 0;
@@ -193,7 +193,7 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en
val = wbq_entry->key.flags;
if (atomic_read(val) == 0)
break;
- ret = (*action)(val);
+ ret = (*action)(val, mode);
} while (!ret && atomic_read(val) != 0);
finish_wait(wq_head, &wbq_entry->wq_entry);
return ret;
@@ -210,8 +210,9 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en
}, \
}
-__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
- unsigned mode)
+__sched int out_of_line_wait_on_atomic_t(atomic_t *p,
+ wait_atomic_t_action_f action,
+ unsigned int mode)
{
struct wait_queue_head *wq_head = atomic_t_waitqueue(p);
DEFINE_WAIT_ATOMIC_T(wq_entry, p);
@@ -220,6 +221,15 @@ __sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
}
EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
+__sched int atomic_t_wait(atomic_t *counter, unsigned int mode)
+{
+ schedule();
+ if (signal_pending_state(mode, current))
+ return -EINTR;
+ return 0;
+}
+EXPORT_SYMBOL(atomic_t_wait);
+
/**
* wake_up_atomic_t - Wake up a waiter on a atomic_t
* @p: The atomic_t being waited on, a kernel virtual address
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 5f0dfb2abb8d..3153c9ea51bf 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -515,7 +515,7 @@ void put_seccomp_filter(struct task_struct *tsk)
static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason)
{
- memset(info, 0, sizeof(*info));
+ clear_siginfo(info);
info->si_signo = SIGSYS;
info->si_code = SYS_SECCOMP;
info->si_call_addr = (void __user *)KSTK_EIP(current);
diff --git a/kernel/signal.c b/kernel/signal.c
index 186143b06d00..c6e4c83dc090 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -79,7 +79,7 @@ static int sig_task_ignored(struct task_struct *t, int sig, bool force)
handler = sig_handler(t, sig);
if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
- handler == SIG_DFL && !force)
+ handler == SIG_DFL && !(force && sig_kernel_only(sig)))
return 1;
return sig_handler_ignored(handler, sig);
@@ -95,13 +95,15 @@ static int sig_ignored(struct task_struct *t, int sig, bool force)
if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
return 0;
- if (!sig_task_ignored(t, sig, force))
- return 0;
-
/*
- * Tracers may want to know about even ignored signals.
+ * Tracers may want to know about even ignored signal unless it
+ * is SIGKILL which can't be reported anyway but can be ignored
+ * by SIGNAL_UNKILLABLE task.
*/
- return !t->ptrace;
+ if (t->ptrace && sig != SIGKILL)
+ return 0;
+
+ return sig_task_ignored(t, sig, force);
}
/*
@@ -549,6 +551,7 @@ still_pending:
* a fast-pathed signal or we must have been
* out of queue space. So zero out the info.
*/
+ clear_siginfo(info);
info->si_signo = sig;
info->si_errno = 0;
info->si_code = SI_USER;
@@ -642,6 +645,9 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
spin_unlock(&tsk->sighand->siglock);
posixtimer_rearm(info);
spin_lock(&tsk->sighand->siglock);
+
+ /* Don't expose the si_sys_private value to userspace */
+ info->si_sys_private = 0;
}
#endif
return signr;
@@ -931,9 +937,9 @@ static void complete_signal(int sig, struct task_struct *p, int group)
* then start taking the whole group down immediately.
*/
if (sig_fatal(p, sig) &&
- !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
+ !(signal->flags & SIGNAL_GROUP_EXIT) &&
!sigismember(&t->real_blocked, sig) &&
- (sig == SIGKILL || !t->ptrace)) {
+ (sig == SIGKILL || !p->ptrace)) {
/*
* This signal will be fatal to the whole group.
*/
@@ -1038,12 +1044,12 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
else
override_rlimit = 0;
- q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
- override_rlimit);
+ q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit);
if (q) {
list_add_tail(&q->list, &pending->list);
switch ((unsigned long) info) {
case (unsigned long) SEND_SIG_NOINFO:
+ clear_siginfo(&q->info);
q->info.si_signo = sig;
q->info.si_errno = 0;
q->info.si_code = SI_USER;
@@ -1052,6 +1058,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
break;
case (unsigned long) SEND_SIG_PRIV:
+ clear_siginfo(&q->info);
q->info.si_signo = sig;
q->info.si_errno = 0;
q->info.si_code = SI_KERNEL;
@@ -1486,6 +1493,129 @@ force_sigsegv(int sig, struct task_struct *p)
return 0;
}
+int force_sig_fault(int sig, int code, void __user *addr
+ ___ARCH_SI_TRAPNO(int trapno)
+ ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
+ , struct task_struct *t)
+{
+ struct siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = code;
+ info.si_addr = addr;
+#ifdef __ARCH_SI_TRAPNO
+ info.si_trapno = trapno;
+#endif
+#ifdef __ia64__
+ info.si_imm = imm;
+ info.si_flags = flags;
+ info.si_isr = isr;
+#endif
+ return force_sig_info(info.si_signo, &info, t);
+}
+
+int send_sig_fault(int sig, int code, void __user *addr
+ ___ARCH_SI_TRAPNO(int trapno)
+ ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
+ , struct task_struct *t)
+{
+ struct siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = code;
+ info.si_addr = addr;
+#ifdef __ARCH_SI_TRAPNO
+ info.si_trapno = trapno;
+#endif
+#ifdef __ia64__
+ info.si_imm = imm;
+ info.si_flags = flags;
+ info.si_isr = isr;
+#endif
+ return send_sig_info(info.si_signo, &info, t);
+}
+
+#if defined(BUS_MCEERR_AO) && defined(BUS_MCEERR_AR)
+int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
+{
+ struct siginfo info;
+
+ WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR));
+ clear_siginfo(&info);
+ info.si_signo = SIGBUS;
+ info.si_errno = 0;
+ info.si_code = code;
+ info.si_addr = addr;
+ info.si_addr_lsb = lsb;
+ return force_sig_info(info.si_signo, &info, t);
+}
+
+int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
+{
+ struct siginfo info;
+
+ WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR));
+ clear_siginfo(&info);
+ info.si_signo = SIGBUS;
+ info.si_errno = 0;
+ info.si_code = code;
+ info.si_addr = addr;
+ info.si_addr_lsb = lsb;
+ return send_sig_info(info.si_signo, &info, t);
+}
+EXPORT_SYMBOL(send_sig_mceerr);
+#endif
+
+#ifdef SEGV_BNDERR
+int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper)
+{
+ struct siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = SIGSEGV;
+ info.si_errno = 0;
+ info.si_code = SEGV_BNDERR;
+ info.si_addr = addr;
+ info.si_lower = lower;
+ info.si_upper = upper;
+ return force_sig_info(info.si_signo, &info, current);
+}
+#endif
+
+#ifdef SEGV_PKUERR
+int force_sig_pkuerr(void __user *addr, u32 pkey)
+{
+ struct siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = SIGSEGV;
+ info.si_errno = 0;
+ info.si_code = SEGV_PKUERR;
+ info.si_addr = addr;
+ info.si_pkey = pkey;
+ return force_sig_info(info.si_signo, &info, current);
+}
+#endif
+
+/* For the crazy architectures that include trap information in
+ * the errno field, instead of an actual errno value.
+ */
+int force_sig_ptrace_errno_trap(int errno, void __user *addr)
+{
+ struct siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = SIGTRAP;
+ info.si_errno = errno;
+ info.si_code = TRAP_HWBKPT;
+ info.si_addr = addr;
+ return force_sig_info(info.si_signo, &info, current);
+}
+
int kill_pgrp(struct pid *pid, int sig, int priv)
{
int ret;
@@ -1624,6 +1754,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
sig = SIGCHLD;
}
+ clear_siginfo(&info);
info.si_signo = sig;
info.si_errno = 0;
/*
@@ -1718,6 +1849,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
parent = tsk->real_parent;
}
+ clear_siginfo(&info);
info.si_signo = SIGCHLD;
info.si_errno = 0;
/*
@@ -1930,7 +2062,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
{
siginfo_t info;
- memset(&info, 0, sizeof info);
+ clear_siginfo(&info);
info.si_signo = signr;
info.si_code = exit_code;
info.si_pid = task_pid_vnr(current);
@@ -2137,6 +2269,7 @@ static int ptrace_signal(int signr, siginfo_t *info)
* have updated *info via PTRACE_SETSIGINFO.
*/
if (signr != info->si_signo) {
+ clear_siginfo(info);
info->si_signo = signr;
info->si_errno = 0;
info->si_code = SI_USER;
@@ -2602,7 +2735,6 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
compat_sigset_t __user *, oset, compat_size_t, sigsetsize)
{
-#ifdef __BIG_ENDIAN
sigset_t old_set = current->blocked;
/* XXX: Don't preclude handling different sized sigset_t's. */
@@ -2610,38 +2742,22 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
return -EINVAL;
if (nset) {
- compat_sigset_t new32;
sigset_t new_set;
int error;
- if (copy_from_user(&new32, nset, sizeof(compat_sigset_t)))
+ if (get_compat_sigset(&new_set, nset))
return -EFAULT;
-
- sigset_from_compat(&new_set, &new32);
sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
error = sigprocmask(how, &new_set, NULL);
if (error)
return error;
}
- if (oset) {
- compat_sigset_t old32;
- sigset_to_compat(&old32, &old_set);
- if (copy_to_user(oset, &old32, sizeof(compat_sigset_t)))
- return -EFAULT;
- }
- return 0;
-#else
- return sys_rt_sigprocmask(how, (sigset_t __user *)nset,
- (sigset_t __user *)oset, sigsetsize);
-#endif
+ return oset ? put_compat_sigset(oset, &old_set, sizeof(*oset)) : 0;
}
#endif
-static int do_sigpending(void *set, unsigned long sigsetsize)
+static int do_sigpending(sigset_t *set)
{
- if (sigsetsize > sizeof(sigset_t))
- return -EINVAL;
-
spin_lock_irq(&current->sighand->siglock);
sigorsets(set, &current->pending.signal,
&current->signal->shared_pending.signal);
@@ -2661,7 +2777,12 @@ static int do_sigpending(void *set, unsigned long sigsetsize)
SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
{
sigset_t set;
- int err = do_sigpending(&set, sigsetsize);
+ int err;
+
+ if (sigsetsize > sizeof(*uset))
+ return -EINVAL;
+
+ err = do_sigpending(&set);
if (!err && copy_to_user(uset, &set, sigsetsize))
err = -EFAULT;
return err;
@@ -2671,20 +2792,16 @@ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
compat_size_t, sigsetsize)
{
-#ifdef __BIG_ENDIAN
sigset_t set;
- int err = do_sigpending(&set, sigsetsize);
- if (!err) {
- compat_sigset_t set32;
- sigset_to_compat(&set32, &set);
- /* we can get here only if sigsetsize <= sizeof(set) */
- if (copy_to_user(uset, &set32, sigsetsize))
- err = -EFAULT;
- }
+ int err;
+
+ if (sigsetsize > sizeof(*uset))
+ return -EINVAL;
+
+ err = do_sigpending(&set);
+ if (!err)
+ err = put_compat_sigset(uset, &set, sigsetsize);
return err;
-#else
- return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize);
-#endif
}
#endif
@@ -2705,9 +2822,7 @@ enum siginfo_layout siginfo_layout(int sig, int si_code)
#endif
[SIGCHLD] = { NSIGCHLD, SIL_CHLD },
[SIGPOLL] = { NSIGPOLL, SIL_POLL },
-#ifdef __ARCH_SIGSYS
[SIGSYS] = { NSIGSYS, SIL_SYS },
-#endif
};
if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit))
layout = filter[sig].layout;
@@ -2729,12 +2844,14 @@ enum siginfo_layout siginfo_layout(int sig, int si_code)
if ((sig == SIGFPE) && (si_code == FPE_FIXME))
layout = SIL_FAULT;
#endif
+#ifdef BUS_FIXME
+ if ((sig == SIGBUS) && (si_code == BUS_FIXME))
+ layout = SIL_FAULT;
+#endif
}
return layout;
}
-#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
-
int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
{
int err;
@@ -2773,13 +2890,21 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
#ifdef __ARCH_SI_TRAPNO
err |= __put_user(from->si_trapno, &to->si_trapno);
#endif
-#ifdef BUS_MCEERR_AO
+#ifdef __ia64__
+ err |= __put_user(from->si_imm, &to->si_imm);
+ err |= __put_user(from->si_flags, &to->si_flags);
+ err |= __put_user(from->si_isr, &to->si_isr);
+#endif
/*
* Other callers might not initialize the si_lsb field,
* so check explicitly for the right codes here.
*/
- if (from->si_signo == SIGBUS &&
- (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO))
+#ifdef BUS_MCEERR_AR
+ if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AR)
+ err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
+#endif
+#ifdef BUS_MCEERR_AO
+ if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AO)
err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
#endif
#ifdef SEGV_BNDERR
@@ -2805,18 +2930,185 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_ptr, &to->si_ptr);
break;
-#ifdef __ARCH_SIGSYS
case SIL_SYS:
err |= __put_user(from->si_call_addr, &to->si_call_addr);
err |= __put_user(from->si_syscall, &to->si_syscall);
err |= __put_user(from->si_arch, &to->si_arch);
break;
-#endif
}
return err;
}
+#ifdef CONFIG_COMPAT
+int copy_siginfo_to_user32(struct compat_siginfo __user *to,
+ const struct siginfo *from)
+#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION)
+{
+ return __copy_siginfo_to_user32(to, from, in_x32_syscall());
+}
+int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
+ const struct siginfo *from, bool x32_ABI)
+#endif
+{
+ struct compat_siginfo new;
+ memset(&new, 0, sizeof(new));
+
+ new.si_signo = from->si_signo;
+ new.si_errno = from->si_errno;
+ new.si_code = from->si_code;
+ switch(siginfo_layout(from->si_signo, from->si_code)) {
+ case SIL_KILL:
+ new.si_pid = from->si_pid;
+ new.si_uid = from->si_uid;
+ break;
+ case SIL_TIMER:
+ new.si_tid = from->si_tid;
+ new.si_overrun = from->si_overrun;
+ new.si_int = from->si_int;
+ break;
+ case SIL_POLL:
+ new.si_band = from->si_band;
+ new.si_fd = from->si_fd;
+ break;
+ case SIL_FAULT:
+ new.si_addr = ptr_to_compat(from->si_addr);
+#ifdef __ARCH_SI_TRAPNO
+ new.si_trapno = from->si_trapno;
+#endif
+#ifdef BUS_MCEERR_AR
+ if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AR))
+ new.si_addr_lsb = from->si_addr_lsb;
+#endif
+#ifdef BUS_MCEERR_AO
+ if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AO))
+ new.si_addr_lsb = from->si_addr_lsb;
+#endif
+#ifdef SEGV_BNDERR
+ if ((from->si_signo == SIGSEGV) &&
+ (from->si_code == SEGV_BNDERR)) {
+ new.si_lower = ptr_to_compat(from->si_lower);
+ new.si_upper = ptr_to_compat(from->si_upper);
+ }
+#endif
+#ifdef SEGV_PKUERR
+ if ((from->si_signo == SIGSEGV) &&
+ (from->si_code == SEGV_PKUERR))
+ new.si_pkey = from->si_pkey;
+#endif
+
+ break;
+ case SIL_CHLD:
+ new.si_pid = from->si_pid;
+ new.si_uid = from->si_uid;
+ new.si_status = from->si_status;
+#ifdef CONFIG_X86_X32_ABI
+ if (x32_ABI) {
+ new._sifields._sigchld_x32._utime = from->si_utime;
+ new._sifields._sigchld_x32._stime = from->si_stime;
+ } else
#endif
+ {
+ new.si_utime = from->si_utime;
+ new.si_stime = from->si_stime;
+ }
+ break;
+ case SIL_RT:
+ new.si_pid = from->si_pid;
+ new.si_uid = from->si_uid;
+ new.si_int = from->si_int;
+ break;
+ case SIL_SYS:
+ new.si_call_addr = ptr_to_compat(from->si_call_addr);
+ new.si_syscall = from->si_syscall;
+ new.si_arch = from->si_arch;
+ break;
+ }
+
+ if (copy_to_user(to, &new, sizeof(struct compat_siginfo)))
+ return -EFAULT;
+
+ return 0;
+}
+
+int copy_siginfo_from_user32(struct siginfo *to,
+ const struct compat_siginfo __user *ufrom)
+{
+ struct compat_siginfo from;
+
+ if (copy_from_user(&from, ufrom, sizeof(struct compat_siginfo)))
+ return -EFAULT;
+
+ clear_siginfo(to);
+ to->si_signo = from.si_signo;
+ to->si_errno = from.si_errno;
+ to->si_code = from.si_code;
+ switch(siginfo_layout(from.si_signo, from.si_code)) {
+ case SIL_KILL:
+ to->si_pid = from.si_pid;
+ to->si_uid = from.si_uid;
+ break;
+ case SIL_TIMER:
+ to->si_tid = from.si_tid;
+ to->si_overrun = from.si_overrun;
+ to->si_int = from.si_int;
+ break;
+ case SIL_POLL:
+ to->si_band = from.si_band;
+ to->si_fd = from.si_fd;
+ break;
+ case SIL_FAULT:
+ to->si_addr = compat_ptr(from.si_addr);
+#ifdef __ARCH_SI_TRAPNO
+ to->si_trapno = from.si_trapno;
+#endif
+#ifdef BUS_MCEERR_AR
+ if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AR))
+ to->si_addr_lsb = from.si_addr_lsb;
+#endif
+#ifdef BUS_MCEER_AO
+ if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AO))
+ to->si_addr_lsb = from.si_addr_lsb;
+#endif
+#ifdef SEGV_BNDERR
+ if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_BNDERR)) {
+ to->si_lower = compat_ptr(from.si_lower);
+ to->si_upper = compat_ptr(from.si_upper);
+ }
+#endif
+#ifdef SEGV_PKUERR
+ if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_PKUERR))
+ to->si_pkey = from.si_pkey;
+#endif
+ break;
+ case SIL_CHLD:
+ to->si_pid = from.si_pid;
+ to->si_uid = from.si_uid;
+ to->si_status = from.si_status;
+#ifdef CONFIG_X86_X32_ABI
+ if (in_x32_syscall()) {
+ to->si_utime = from._sifields._sigchld_x32._utime;
+ to->si_stime = from._sifields._sigchld_x32._stime;
+ } else
+#endif
+ {
+ to->si_utime = from.si_utime;
+ to->si_stime = from.si_stime;
+ }
+ break;
+ case SIL_RT:
+ to->si_pid = from.si_pid;
+ to->si_uid = from.si_uid;
+ to->si_int = from.si_int;
+ break;
+ case SIL_SYS:
+ to->si_call_addr = compat_ptr(from.si_call_addr);
+ to->si_syscall = from.si_syscall;
+ to->si_arch = from.si_arch;
+ break;
+ }
+ return 0;
+}
+#endif /* CONFIG_COMPAT */
/**
* do_sigtimedwait - wait for queued signals specified in @which
@@ -2918,7 +3210,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
struct compat_siginfo __user *, uinfo,
struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
{
- compat_sigset_t s32;
sigset_t s;
struct timespec t;
siginfo_t info;
@@ -2927,9 +3218,8 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
- if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
+ if (get_compat_sigset(&s, uthese))
return -EFAULT;
- sigset_from_compat(&s, &s32);
if (uts) {
if (compat_get_timespec(&t, uts))
@@ -2956,6 +3246,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
{
struct siginfo info;
+ clear_siginfo(&info);
info.si_signo = sig;
info.si_errno = 0;
info.si_code = SI_USER;
@@ -2997,8 +3288,9 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
static int do_tkill(pid_t tgid, pid_t pid, int sig)
{
- struct siginfo info = {};
+ struct siginfo info;
+ clear_siginfo(&info);
info.si_signo = sig;
info.si_errno = 0;
info.si_code = SI_TKILL;
@@ -3079,7 +3371,7 @@ COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo,
int, sig,
struct compat_siginfo __user *, uinfo)
{
- siginfo_t info = {};
+ siginfo_t info;
int ret = copy_siginfo_from_user32(&info, uinfo);
if (unlikely(ret))
return ret;
@@ -3123,7 +3415,7 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
int, sig,
struct compat_siginfo __user *, uinfo)
{
- siginfo_t info = {};
+ siginfo_t info;
if (copy_siginfo_from_user32(&info, uinfo))
return -EFAULT;
@@ -3347,15 +3639,11 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
{
-#ifdef __BIG_ENDIAN
sigset_t set;
- int err = do_sigpending(&set, sizeof(set.sig[0]));
+ int err = do_sigpending(&set);
if (!err)
err = put_user(set.sig[0], set32);
return err;
-#else
- return sys_rt_sigpending((sigset_t __user *)set32, sizeof(*set32));
-#endif
}
#endif
@@ -3453,7 +3741,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
compat_size_t, sigsetsize)
{
struct k_sigaction new_ka, old_ka;
- compat_sigset_t mask;
#ifdef __ARCH_HAS_SA_RESTORER
compat_uptr_t restorer;
#endif
@@ -3471,19 +3758,18 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
ret |= get_user(restorer, &act->sa_restorer);
new_ka.sa.sa_restorer = compat_ptr(restorer);
#endif
- ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask));
+ ret |= get_compat_sigset(&new_ka.sa.sa_mask, &act->sa_mask);
ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags);
if (ret)
return -EFAULT;
- sigset_from_compat(&new_ka.sa.sa_mask, &mask);
}
ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
if (!ret && oact) {
- sigset_to_compat(&mask, &old_ka.sa.sa_mask);
ret = put_user(ptr_to_compat(old_ka.sa.sa_handler),
&oact->sa_handler);
- ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask));
+ ret |= put_compat_sigset(&oact->sa_mask, &old_ka.sa.sa_mask,
+ sizeof(oact->sa_mask));
ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags);
#ifdef __ARCH_HAS_SA_RESTORER
ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
@@ -3663,22 +3949,15 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize)
{
-#ifdef __BIG_ENDIAN
sigset_t newset;
- compat_sigset_t newset32;
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
- if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
+ if (get_compat_sigset(&newset, unewset))
return -EFAULT;
- sigset_from_compat(&newset, &newset32);
return sigsuspend(&newset);
-#else
- /* on little-endian bitmaps don't care about granularity */
- return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize);
-#endif
}
#endif
@@ -3709,6 +3988,7 @@ void __init signals_init(void)
/* If this check fails, the __ARCH_SI_PREAMBLE_SIZE value is wrong! */
BUILD_BUG_ON(__ARCH_SI_PREAMBLE_SIZE
!= offsetof(struct siginfo, _sifields._pad));
+ BUILD_BUG_ON(sizeof(struct siginfo) != SI_MAX_SIZE);
sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
}
@@ -3716,26 +3996,25 @@ void __init signals_init(void)
#ifdef CONFIG_KGDB_KDB
#include <linux/kdb.h>
/*
- * kdb_send_sig_info - Allows kdb to send signals without exposing
+ * kdb_send_sig - Allows kdb to send signals without exposing
* signal internals. This function checks if the required locks are
* available before calling the main signal code, to avoid kdb
* deadlocks.
*/
-void
-kdb_send_sig_info(struct task_struct *t, struct siginfo *info)
+void kdb_send_sig(struct task_struct *t, int sig)
{
static struct task_struct *kdb_prev_t;
- int sig, new_t;
+ int new_t, ret;
if (!spin_trylock(&t->sighand->siglock)) {
kdb_printf("Can't do kill command now.\n"
"The sigmask lock is held somewhere else in "
"kernel, try again later\n");
return;
}
- spin_unlock(&t->sighand->siglock);
new_t = kdb_prev_t != t;
kdb_prev_t = t;
if (t->state != TASK_RUNNING && new_t) {
+ spin_unlock(&t->sighand->siglock);
kdb_printf("Process is not RUNNING, sending a signal from "
"kdb risks deadlock\n"
"on the run queue locks. "
@@ -3744,8 +4023,9 @@ kdb_send_sig_info(struct task_struct *t, struct siginfo *info)
"the deadlock.\n");
return;
}
- sig = info->si_signo;
- if (send_sig_info(sig, info, t))
+ ret = send_signal(sig, SEND_SIG_PRIV, t, false);
+ spin_unlock(&t->sighand->siglock);
+ if (ret)
kdb_printf("Fail to deliver Signal %d to process %d.\n",
sig, t->pid);
else
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 662f7b1b7a78..24d243ef8e71 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -486,16 +486,6 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
}
EXPORT_SYMBOL(__tasklet_hi_schedule);
-void __tasklet_hi_schedule_first(struct tasklet_struct *t)
-{
- lockdep_assert_irqs_disabled();
-
- t->next = __this_cpu_read(tasklet_hi_vec.head);
- __this_cpu_write(tasklet_hi_vec.head, t);
- __raise_softirq_irqoff(HI_SOFTIRQ);
-}
-EXPORT_SYMBOL(__tasklet_hi_schedule_first);
-
static __latent_entropy void tasklet_action(struct softirq_action *a)
{
struct tasklet_struct *list;
@@ -675,7 +665,7 @@ static void run_ksoftirqd(unsigned int cpu)
*/
__do_softirq();
local_irq_enable();
- cond_resched_rcu_qs();
+ cond_resched();
return;
}
local_irq_enable();
diff --git a/kernel/sys.c b/kernel/sys.c
index 524a4cb9bbe2..f2289de20e19 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -111,6 +111,12 @@
#ifndef SET_FP_MODE
# define SET_FP_MODE(a,b) (-EINVAL)
#endif
+#ifndef SVE_SET_VL
+# define SVE_SET_VL(a) (-EINVAL)
+#endif
+#ifndef SVE_GET_VL
+# define SVE_GET_VL() (-EINVAL)
+#endif
/*
* this is where the system-wide overflow UID and GID are defined, for
@@ -129,7 +135,7 @@ EXPORT_SYMBOL(overflowgid);
*/
int fs_overflowuid = DEFAULT_FS_OVERFLOWUID;
-int fs_overflowgid = DEFAULT_FS_OVERFLOWUID;
+int fs_overflowgid = DEFAULT_FS_OVERFLOWGID;
EXPORT_SYMBOL(fs_overflowuid);
EXPORT_SYMBOL(fs_overflowgid);
@@ -2386,6 +2392,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_GET_FP_MODE:
error = GET_FP_MODE(me);
break;
+ case PR_SVE_SET_VL:
+ error = SVE_SET_VL(arg2);
+ break;
+ case PR_SVE_GET_VL:
+ error = SVE_GET_VL();
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9576bd582d4a..557d46728577 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -30,7 +30,6 @@
#include <linux/proc_fs.h>
#include <linux/security.h>
#include <linux/ctype.h>
-#include <linux/kmemcheck.h>
#include <linux/kmemleak.h>
#include <linux/fs.h>
#include <linux/init.h>
@@ -67,6 +66,7 @@
#include <linux/kexec.h>
#include <linux/bpf.h>
#include <linux/mount.h>
+#include <linux/pipe_fs_i.h>
#include <linux/uaccess.h>
#include <asm/processor.h>
@@ -1174,15 +1174,6 @@ static struct ctl_table kern_table[] = {
.extra2 = &one_thousand,
},
#endif
-#ifdef CONFIG_KMEMCHECK
- {
- .procname = "kmemcheck",
- .data = &kmemcheck_enabled,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
{
.procname = "panic_on_warn",
.data = &panic_on_warn,
@@ -1366,6 +1357,15 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = &hugetlb_mempolicy_sysctl_handler,
},
+ {
+ .procname = "numa_stat",
+ .data = &sysctl_vm_numa_stat,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sysctl_vm_numa_stat_handler,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
#endif
{
.procname = "hugetlb_shm_group",
@@ -1817,7 +1817,7 @@ static struct ctl_table fs_table[] = {
{
.procname = "pipe-max-size",
.data = &pipe_max_size,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(pipe_max_size),
.mode = 0644,
.proc_handler = &pipe_proc_fn,
.extra1 = &pipe_min_size,
@@ -2576,12 +2576,13 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
if (write) {
unsigned int val = *lvalp;
+ if (*lvalp > UINT_MAX)
+ return -EINVAL;
+
if ((param->min && *param->min > val) ||
(param->max && *param->max < val))
return -ERANGE;
- if (*lvalp > UINT_MAX)
- return -EINVAL;
*valp = val;
} else {
unsigned int val = *valp;
@@ -2621,6 +2622,48 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
do_proc_douintvec_minmax_conv, &param);
}
+struct do_proc_dopipe_max_size_conv_param {
+ unsigned int *min;
+};
+
+static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data)
+{
+ struct do_proc_dopipe_max_size_conv_param *param = data;
+
+ if (write) {
+ unsigned int val;
+
+ if (*lvalp > UINT_MAX)
+ return -EINVAL;
+
+ val = round_pipe_size(*lvalp);
+ if (val == 0)
+ return -EINVAL;
+
+ if (param->min && *param->min > val)
+ return -ERANGE;
+
+ *valp = val;
+ } else {
+ unsigned int val = *valp;
+ *lvalp = (unsigned long) val;
+ }
+
+ return 0;
+}
+
+int proc_dopipe_max_size(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct do_proc_dopipe_max_size_conv_param param = {
+ .min = (unsigned int *) table->extra1,
+ };
+ return do_proc_douintvec(table, write, buffer, lenp, ppos,
+ do_proc_dopipe_max_size_conv, &param);
+}
+
static void validate_coredump_safety(void)
{
#ifdef CONFIG_COREDUMP
@@ -3084,14 +3127,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
else
bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
}
- kfree(tmp_bitmap);
*lenp -= left;
*ppos += *lenp;
- return 0;
- } else {
- kfree(tmp_bitmap);
- return err;
}
+
+ kfree(tmp_bitmap);
+ return err;
}
#else /* CONFIG_PROC_SYSCTL */
@@ -3126,6 +3167,12 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
return -ENOSYS;
}
+int proc_dopipe_max_size(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
int proc_dointvec_jiffies(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -3169,6 +3216,7 @@ EXPORT_SYMBOL(proc_douintvec);
EXPORT_SYMBOL(proc_dointvec_jiffies);
EXPORT_SYMBOL(proc_dointvec_minmax);
EXPORT_SYMBOL_GPL(proc_douintvec_minmax);
+EXPORT_SYMBOL_GPL(proc_dopipe_max_size);
EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
EXPORT_SYMBOL(proc_dostring);
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index d689a9557e17..f6b5f19223d6 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -21,10 +21,6 @@ config CLOCKSOURCE_VALIDATE_LAST_CYCLE
config GENERIC_TIME_VSYSCALL
bool
-# Timekeeping vsyscall support
-config GENERIC_TIME_VSYSCALL_OLD
- bool
-
# Old style timekeeping
config ARCH_USES_GETTIMEOFFSET
bool
@@ -99,6 +95,7 @@ config NO_HZ_FULL
select RCU_NOCB_CPU
select VIRT_CPU_ACCOUNTING_GEN
select IRQ_WORK
+ select CPU_ISOLATION
help
Adaptively try to shutdown the tick whenever possible, even when
the CPU is running tasks. Typically this requires running a single
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 03918a19cf2d..65f9e3f24dde 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -171,7 +171,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
spin_unlock_irqrestore(&watchdog_lock, flags);
}
-static void clocksource_watchdog(unsigned long data)
+static void clocksource_watchdog(struct timer_list *unused)
{
struct clocksource *cs;
u64 csnow, wdnow, cslast, wdlast, delta;
@@ -290,8 +290,7 @@ static inline void clocksource_start_watchdog(void)
{
if (watchdog_running || !watchdog || list_empty(&watchdog_list))
return;
- init_timer(&watchdog_timer);
- watchdog_timer.function = clocksource_watchdog;
+ timer_setup(&watchdog_timer, clocksource_watchdog, 0);
watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
watchdog_running = 1;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index d32520840fde..ae0c8a411fe7 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -60,6 +60,15 @@
#include "tick-internal.h"
/*
+ * Masks for selecting the soft and hard context timers from
+ * cpu_base->active
+ */
+#define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT)
+#define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1)
+#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
+#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
+
+/*
* The timer bases:
*
* There are more clockids than hrtimer bases. Thus, we index
@@ -70,7 +79,6 @@
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
- .seq = SEQCNT_ZERO(hrtimer_bases.seq),
.clock_base =
{
{
@@ -93,6 +101,26 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.clockid = CLOCK_TAI,
.get_time = &ktime_get_clocktai,
},
+ {
+ .index = HRTIMER_BASE_MONOTONIC_SOFT,
+ .clockid = CLOCK_MONOTONIC,
+ .get_time = &ktime_get,
+ },
+ {
+ .index = HRTIMER_BASE_REALTIME_SOFT,
+ .clockid = CLOCK_REALTIME,
+ .get_time = &ktime_get_real,
+ },
+ {
+ .index = HRTIMER_BASE_BOOTTIME_SOFT,
+ .clockid = CLOCK_BOOTTIME,
+ .get_time = &ktime_get_boottime,
+ },
+ {
+ .index = HRTIMER_BASE_TAI_SOFT,
+ .clockid = CLOCK_TAI,
+ .get_time = &ktime_get_clocktai,
+ },
}
};
@@ -118,7 +146,6 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
* timer->base->cpu_base
*/
static struct hrtimer_cpu_base migration_cpu_base = {
- .seq = SEQCNT_ZERO(migration_cpu_base),
.clock_base = { { .cpu_base = &migration_cpu_base, }, },
};
@@ -156,45 +183,33 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
}
/*
- * With HIGHRES=y we do not migrate the timer when it is expiring
- * before the next event on the target cpu because we cannot reprogram
- * the target cpu hardware and we would cause it to fire late.
+ * We do not migrate the timer when it is expiring before the next
+ * event on the target cpu. When high resolution is enabled, we cannot
+ * reprogram the target cpu hardware and we would cause it to fire
+ * late. To keep it simple, we handle the high resolution enabled and
+ * disabled case similar.
*
* Called with cpu_base->lock of target cpu held.
*/
static int
hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
{
-#ifdef CONFIG_HIGH_RES_TIMERS
ktime_t expires;
- if (!new_base->cpu_base->hres_active)
- return 0;
-
expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
- return expires <= new_base->cpu_base->expires_next;
-#else
- return 0;
-#endif
+ return expires < new_base->cpu_base->expires_next;
}
-#ifdef CONFIG_NO_HZ_COMMON
-static inline
-struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
- int pinned)
-{
- if (pinned || !base->migration_enabled)
- return base;
- return &per_cpu(hrtimer_bases, get_nohz_timer_target());
-}
-#else
static inline
struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
int pinned)
{
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+ if (static_branch_likely(&timers_migration_enabled) && !pinned)
+ return &per_cpu(hrtimer_bases, get_nohz_timer_target());
+#endif
return base;
}
-#endif
/*
* We switch the timer base to a power-optimized selected CPU target,
@@ -396,7 +411,8 @@ static inline void debug_hrtimer_init(struct hrtimer *timer)
debug_object_init(timer, &hrtimer_debug_descr);
}
-static inline void debug_hrtimer_activate(struct hrtimer *timer)
+static inline void debug_hrtimer_activate(struct hrtimer *timer,
+ enum hrtimer_mode mode)
{
debug_object_activate(timer, &hrtimer_debug_descr);
}
@@ -429,8 +445,10 @@ void destroy_hrtimer_on_stack(struct hrtimer *timer)
EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
#else
+
static inline void debug_hrtimer_init(struct hrtimer *timer) { }
-static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
+static inline void debug_hrtimer_activate(struct hrtimer *timer,
+ enum hrtimer_mode mode) { }
static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
#endif
@@ -442,10 +460,11 @@ debug_init(struct hrtimer *timer, clockid_t clockid,
trace_hrtimer_init(timer, clockid, mode);
}
-static inline void debug_activate(struct hrtimer *timer)
+static inline void debug_activate(struct hrtimer *timer,
+ enum hrtimer_mode mode)
{
- debug_hrtimer_activate(timer);
- trace_hrtimer_start(timer);
+ debug_hrtimer_activate(timer, mode);
+ trace_hrtimer_start(timer, mode);
}
static inline void debug_deactivate(struct hrtimer *timer)
@@ -454,35 +473,43 @@ static inline void debug_deactivate(struct hrtimer *timer)
trace_hrtimer_cancel(timer);
}
-#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
-static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base,
- struct hrtimer *timer)
+static struct hrtimer_clock_base *
+__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
{
-#ifdef CONFIG_HIGH_RES_TIMERS
- cpu_base->next_timer = timer;
-#endif
+ unsigned int idx;
+
+ if (!*active)
+ return NULL;
+
+ idx = __ffs(*active);
+ *active &= ~(1U << idx);
+
+ return &cpu_base->clock_base[idx];
}
-static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
+#define for_each_active_base(base, cpu_base, active) \
+ while ((base = __next_base((cpu_base), &(active))))
+
+static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
+ unsigned int active,
+ ktime_t expires_next)
{
- struct hrtimer_clock_base *base = cpu_base->clock_base;
- unsigned int active = cpu_base->active_bases;
- ktime_t expires, expires_next = KTIME_MAX;
+ struct hrtimer_clock_base *base;
+ ktime_t expires;
- hrtimer_update_next_timer(cpu_base, NULL);
- for (; active; base++, active >>= 1) {
+ for_each_active_base(base, cpu_base, active) {
struct timerqueue_node *next;
struct hrtimer *timer;
- if (!(active & 0x01))
- continue;
-
next = timerqueue_getnext(&base->active);
timer = container_of(next, struct hrtimer, node);
expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
if (expires < expires_next) {
expires_next = expires;
- hrtimer_update_next_timer(cpu_base, timer);
+ if (timer->is_soft)
+ cpu_base->softirq_next_timer = timer;
+ else
+ cpu_base->next_timer = timer;
}
}
/*
@@ -494,7 +521,47 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
expires_next = 0;
return expires_next;
}
-#endif
+
+/*
+ * Recomputes cpu_base::*next_timer and returns the earliest expires_next but
+ * does not set cpu_base::*expires_next, that is done by hrtimer_reprogram.
+ *
+ * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases,
+ * those timers will get run whenever the softirq gets handled, at the end of
+ * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases.
+ *
+ * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases.
+ * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual
+ * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD.
+ *
+ * @active_mask must be one of:
+ * - HRTIMER_ACTIVE_ALL,
+ * - HRTIMER_ACTIVE_SOFT, or
+ * - HRTIMER_ACTIVE_HARD.
+ */
+static ktime_t
+__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
+{
+ unsigned int active;
+ struct hrtimer *next_timer = NULL;
+ ktime_t expires_next = KTIME_MAX;
+
+ if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
+ active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
+ cpu_base->softirq_next_timer = NULL;
+ expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX);
+
+ next_timer = cpu_base->softirq_next_timer;
+ }
+
+ if (active_mask & HRTIMER_ACTIVE_HARD) {
+ active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
+ cpu_base->next_timer = next_timer;
+ expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next);
+ }
+
+ return expires_next;
+}
static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
{
@@ -502,36 +569,14 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
- return ktime_get_update_offsets_now(&base->clock_was_set_seq,
+ ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
offs_real, offs_boot, offs_tai);
-}
-/* High resolution timer related functions */
-#ifdef CONFIG_HIGH_RES_TIMERS
-
-/*
- * High resolution timer enabled ?
- */
-static bool hrtimer_hres_enabled __read_mostly = true;
-unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
-EXPORT_SYMBOL_GPL(hrtimer_resolution);
-
-/*
- * Enable / Disable high resolution mode
- */
-static int __init setup_hrtimer_hres(char *str)
-{
- return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
-}
-
-__setup("highres=", setup_hrtimer_hres);
+ base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real;
+ base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot;
+ base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai;
-/*
- * hrtimer_high_res_enabled - query, if the highres mode is enabled
- */
-static inline int hrtimer_is_hres_enabled(void)
-{
- return hrtimer_hres_enabled;
+ return now;
}
/*
@@ -539,7 +584,8 @@ static inline int hrtimer_is_hres_enabled(void)
*/
static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
{
- return cpu_base->hres_active;
+ return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
+ cpu_base->hres_active : 0;
}
static inline int hrtimer_hres_active(void)
@@ -557,10 +603,23 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
{
ktime_t expires_next;
- if (!cpu_base->hres_active)
- return;
+ /*
+ * Find the current next expiration time.
+ */
+ expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
- expires_next = __hrtimer_get_next_event(cpu_base);
+ if (cpu_base->next_timer && cpu_base->next_timer->is_soft) {
+ /*
+ * When the softirq is activated, hrtimer has to be
+ * programmed with the first hard hrtimer because soft
+ * timer interrupt could occur too late.
+ */
+ if (cpu_base->softirq_activated)
+ expires_next = __hrtimer_get_next_event(cpu_base,
+ HRTIMER_ACTIVE_HARD);
+ else
+ cpu_base->softirq_expires_next = expires_next;
+ }
if (skip_equal && expires_next == cpu_base->expires_next)
return;
@@ -568,6 +627,9 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
cpu_base->expires_next = expires_next;
/*
+ * If hres is not active, hardware does not have to be
+ * reprogrammed yet.
+ *
* If a hang was detected in the last timer interrupt then we
* leave the hang delay active in the hardware. We want the
* system to make progress. That also prevents the following
@@ -581,81 +643,38 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
* set. So we'd effectivly block all timers until the T2 event
* fires.
*/
- if (cpu_base->hang_detected)
+ if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
return;
tick_program_event(cpu_base->expires_next, 1);
}
+/* High resolution timer related functions */
+#ifdef CONFIG_HIGH_RES_TIMERS
+
/*
- * When a timer is enqueued and expires earlier than the already enqueued
- * timers, we have to check, whether it expires earlier than the timer for
- * which the clock event device was armed.
- *
- * Called with interrupts disabled and base->cpu_base.lock held
+ * High resolution timer enabled ?
*/
-static void hrtimer_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
-{
- struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
- ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
-
- WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
-
- /*
- * If the timer is not on the current cpu, we cannot reprogram
- * the other cpus clock event device.
- */
- if (base->cpu_base != cpu_base)
- return;
-
- /*
- * If the hrtimer interrupt is running, then it will
- * reevaluate the clock bases and reprogram the clock event
- * device. The callbacks are always executed in hard interrupt
- * context so we don't need an extra check for a running
- * callback.
- */
- if (cpu_base->in_hrtirq)
- return;
-
- /*
- * CLOCK_REALTIME timer might be requested with an absolute
- * expiry time which is less than base->offset. Set it to 0.
- */
- if (expires < 0)
- expires = 0;
-
- if (expires >= cpu_base->expires_next)
- return;
-
- /* Update the pointer to the next expiring timer */
- cpu_base->next_timer = timer;
-
- /*
- * If a hang was detected in the last timer interrupt then we
- * do not schedule a timer which is earlier than the expiry
- * which we enforced in the hang detection. We want the system
- * to make progress.
- */
- if (cpu_base->hang_detected)
- return;
+static bool hrtimer_hres_enabled __read_mostly = true;
+unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
+EXPORT_SYMBOL_GPL(hrtimer_resolution);
- /*
- * Program the timer hardware. We enforce the expiry for
- * events which are already in the past.
- */
- cpu_base->expires_next = expires;
- tick_program_event(expires, 1);
+/*
+ * Enable / Disable high resolution mode
+ */
+static int __init setup_hrtimer_hres(char *str)
+{
+ return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
}
+__setup("highres=", setup_hrtimer_hres);
+
/*
- * Initialize the high resolution related parts of cpu_base
+ * hrtimer_high_res_enabled - query, if the highres mode is enabled
*/
-static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
+static inline int hrtimer_is_hres_enabled(void)
{
- base->expires_next = KTIME_MAX;
- base->hres_active = 0;
+ return hrtimer_hres_enabled;
}
/*
@@ -667,7 +686,7 @@ static void retrigger_next_event(void *arg)
{
struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
- if (!base->hres_active)
+ if (!__hrtimer_hres_active(base))
return;
raw_spin_lock(&base->lock);
@@ -714,23 +733,102 @@ void clock_was_set_delayed(void)
#else
-static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; }
-static inline int hrtimer_hres_active(void) { return 0; }
static inline int hrtimer_is_hres_enabled(void) { return 0; }
static inline void hrtimer_switch_to_hres(void) { }
-static inline void
-hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
-static inline int hrtimer_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
-{
- return 0;
-}
-static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
static inline void retrigger_next_event(void *arg) { }
#endif /* CONFIG_HIGH_RES_TIMERS */
/*
+ * When a timer is enqueued and expires earlier than the already enqueued
+ * timers, we have to check, whether it expires earlier than the timer for
+ * which the clock event device was armed.
+ *
+ * Called with interrupts disabled and base->cpu_base.lock held
+ */
+static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
+{
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ struct hrtimer_clock_base *base = timer->base;
+ ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
+
+ WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
+
+ /*
+ * CLOCK_REALTIME timer might be requested with an absolute
+ * expiry time which is less than base->offset. Set it to 0.
+ */
+ if (expires < 0)
+ expires = 0;
+
+ if (timer->is_soft) {
+ /*
+ * soft hrtimer could be started on a remote CPU. In this
+ * case softirq_expires_next needs to be updated on the
+ * remote CPU. The soft hrtimer will not expire before the
+ * first hard hrtimer on the remote CPU -
+ * hrtimer_check_target() prevents this case.
+ */
+ struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base;
+
+ if (timer_cpu_base->softirq_activated)
+ return;
+
+ if (!ktime_before(expires, timer_cpu_base->softirq_expires_next))
+ return;
+
+ timer_cpu_base->softirq_next_timer = timer;
+ timer_cpu_base->softirq_expires_next = expires;
+
+ if (!ktime_before(expires, timer_cpu_base->expires_next) ||
+ !reprogram)
+ return;
+ }
+
+ /*
+ * If the timer is not on the current cpu, we cannot reprogram
+ * the other cpus clock event device.
+ */
+ if (base->cpu_base != cpu_base)
+ return;
+
+ /*
+ * If the hrtimer interrupt is running, then it will
+ * reevaluate the clock bases and reprogram the clock event
+ * device. The callbacks are always executed in hard interrupt
+ * context so we don't need an extra check for a running
+ * callback.
+ */
+ if (cpu_base->in_hrtirq)
+ return;
+
+ if (expires >= cpu_base->expires_next)
+ return;
+
+ /* Update the pointer to the next expiring timer */
+ cpu_base->next_timer = timer;
+ cpu_base->expires_next = expires;
+
+ /*
+ * If hres is not active, hardware does not have to be
+ * programmed yet.
+ *
+ * If a hang was detected in the last timer interrupt then we
+ * do not schedule a timer which is earlier than the expiry
+ * which we enforced in the hang detection. We want the system
+ * to make progress.
+ */
+ if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
+ return;
+
+ /*
+ * Program the timer hardware. We enforce the expiry for
+ * events which are already in the past.
+ */
+ tick_program_event(expires, 1);
+}
+
+/*
* Clock realtime was set
*
* Change the offset of the realtime clock vs. the monotonic
@@ -835,9 +933,10 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
* Returns 1 when the new timer is the leftmost timer in the tree.
*/
static int enqueue_hrtimer(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
+ struct hrtimer_clock_base *base,
+ enum hrtimer_mode mode)
{
- debug_activate(timer);
+ debug_activate(timer, mode);
base->cpu_base->active_bases |= 1 << base->index;
@@ -870,7 +969,6 @@ static void __remove_hrtimer(struct hrtimer *timer,
if (!timerqueue_del(&base->active, &timer->node))
cpu_base->active_bases &= ~(1 << base->index);
-#ifdef CONFIG_HIGH_RES_TIMERS
/*
* Note: If reprogram is false we do not update
* cpu_base->next_timer. This happens when we remove the first
@@ -881,7 +979,6 @@ static void __remove_hrtimer(struct hrtimer *timer,
*/
if (reprogram && timer == cpu_base->next_timer)
hrtimer_force_reprogram(cpu_base, 1);
-#endif
}
/*
@@ -930,22 +1027,36 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
return tim;
}
-/**
- * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
- * @timer: the timer to be added
- * @tim: expiry time
- * @delta_ns: "slack" range for the timer
- * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
- * relative (HRTIMER_MODE_REL)
- */
-void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
- u64 delta_ns, const enum hrtimer_mode mode)
+static void
+hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
{
- struct hrtimer_clock_base *base, *new_base;
- unsigned long flags;
- int leftmost;
+ ktime_t expires;
- base = lock_hrtimer_base(timer, &flags);
+ /*
+ * Find the next SOFT expiration.
+ */
+ expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
+
+ /*
+ * reprogramming needs to be triggered, even if the next soft
+ * hrtimer expires at the same time than the next hard
+ * hrtimer. cpu_base->softirq_expires_next needs to be updated!
+ */
+ if (expires == KTIME_MAX)
+ return;
+
+ /*
+ * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event()
+ * cpu_base->*expires_next is only set by hrtimer_reprogram()
+ */
+ hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
+}
+
+static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+ u64 delta_ns, const enum hrtimer_mode mode,
+ struct hrtimer_clock_base *base)
+{
+ struct hrtimer_clock_base *new_base;
/* Remove an active timer from the queue: */
remove_hrtimer(timer, base, true);
@@ -960,21 +1071,35 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
/* Switch the timer base, if necessary: */
new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
- leftmost = enqueue_hrtimer(timer, new_base);
- if (!leftmost)
- goto unlock;
+ return enqueue_hrtimer(timer, new_base, mode);
+}
+
+/**
+ * hrtimer_start_range_ns - (re)start an hrtimer
+ * @timer: the timer to be added
+ * @tim: expiry time
+ * @delta_ns: "slack" range for the timer
+ * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or
+ * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
+ * softirq based mode is considered for debug purpose only!
+ */
+void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+ u64 delta_ns, const enum hrtimer_mode mode)
+{
+ struct hrtimer_clock_base *base;
+ unsigned long flags;
+
+ /*
+ * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
+ * match.
+ */
+ WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
+
+ base = lock_hrtimer_base(timer, &flags);
+
+ if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base))
+ hrtimer_reprogram(timer, true);
- if (!hrtimer_is_hres_active(timer)) {
- /*
- * Kick to reschedule the next tick to handle the new timer
- * on dynticks target.
- */
- if (new_base->cpu_base->nohz_active)
- wake_up_nohz_cpu(new_base->cpu_base->cpu);
- } else {
- hrtimer_reprogram(timer, new_base);
- }
-unlock:
unlock_hrtimer_base(timer, &flags);
}
EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
@@ -1072,7 +1197,7 @@ u64 hrtimer_get_next_event(void)
raw_spin_lock_irqsave(&cpu_base->lock, flags);
if (!__hrtimer_hres_active(cpu_base))
- expires = __hrtimer_get_next_event(cpu_base);
+ expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
@@ -1095,17 +1220,24 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
enum hrtimer_mode mode)
{
+ bool softtimer = !!(mode & HRTIMER_MODE_SOFT);
+ int base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
struct hrtimer_cpu_base *cpu_base;
- int base;
memset(timer, 0, sizeof(struct hrtimer));
cpu_base = raw_cpu_ptr(&hrtimer_bases);
- if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
+ /*
+ * POSIX magic: Relative CLOCK_REALTIME timers are not affected by
+ * clock modifications, so they needs to become CLOCK_MONOTONIC to
+ * ensure POSIX compliance.
+ */
+ if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL)
clock_id = CLOCK_MONOTONIC;
- base = hrtimer_clockid_to_base(clock_id);
+ base += hrtimer_clockid_to_base(clock_id);
+ timer->is_soft = softtimer;
timer->base = &cpu_base->clock_base[base];
timerqueue_init(&timer->node);
}
@@ -1114,7 +1246,13 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
* hrtimer_init - initialize a timer to the given clock
* @timer: the timer to be initialized
* @clock_id: the clock to be used
- * @mode: timer mode abs/rel
+ * @mode: The modes which are relevant for intitialization:
+ * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
+ * HRTIMER_MODE_REL_SOFT
+ *
+ * The PINNED variants of the above can be handed in,
+ * but the PINNED bit is ignored as pinning happens
+ * when the hrtimer is started
*/
void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
enum hrtimer_mode mode)
@@ -1133,19 +1271,19 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
*/
bool hrtimer_active(const struct hrtimer *timer)
{
- struct hrtimer_cpu_base *cpu_base;
+ struct hrtimer_clock_base *base;
unsigned int seq;
do {
- cpu_base = READ_ONCE(timer->base->cpu_base);
- seq = raw_read_seqcount_begin(&cpu_base->seq);
+ base = READ_ONCE(timer->base);
+ seq = raw_read_seqcount_begin(&base->seq);
if (timer->state != HRTIMER_STATE_INACTIVE ||
- cpu_base->running == timer)
+ base->running == timer)
return true;
- } while (read_seqcount_retry(&cpu_base->seq, seq) ||
- cpu_base != READ_ONCE(timer->base->cpu_base));
+ } while (read_seqcount_retry(&base->seq, seq) ||
+ base != READ_ONCE(timer->base));
return false;
}
@@ -1171,7 +1309,8 @@ EXPORT_SYMBOL_GPL(hrtimer_active);
static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
struct hrtimer_clock_base *base,
- struct hrtimer *timer, ktime_t *now)
+ struct hrtimer *timer, ktime_t *now,
+ unsigned long flags)
{
enum hrtimer_restart (*fn)(struct hrtimer *);
int restart;
@@ -1179,16 +1318,16 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
lockdep_assert_held(&cpu_base->lock);
debug_deactivate(timer);
- cpu_base->running = timer;
+ base->running = timer;
/*
* Separate the ->running assignment from the ->state assignment.
*
* As with a regular write barrier, this ensures the read side in
- * hrtimer_active() cannot observe cpu_base->running == NULL &&
+ * hrtimer_active() cannot observe base->running == NULL &&
* timer->state == INACTIVE.
*/
- raw_write_seqcount_barrier(&cpu_base->seq);
+ raw_write_seqcount_barrier(&base->seq);
__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
fn = timer->function;
@@ -1202,15 +1341,15 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
timer->is_rel = false;
/*
- * Because we run timers from hardirq context, there is no chance
- * they get migrated to another cpu, therefore its safe to unlock
- * the timer base.
+ * The timer is marked as running in the CPU base, so it is
+ * protected against migration to a different CPU even if the lock
+ * is dropped.
*/
- raw_spin_unlock(&cpu_base->lock);
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
trace_hrtimer_expire_entry(timer, now);
restart = fn(timer);
trace_hrtimer_expire_exit(timer);
- raw_spin_lock(&cpu_base->lock);
+ raw_spin_lock_irq(&cpu_base->lock);
/*
* Note: We clear the running state after enqueue_hrtimer and
@@ -1223,33 +1362,31 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
*/
if (restart != HRTIMER_NORESTART &&
!(timer->state & HRTIMER_STATE_ENQUEUED))
- enqueue_hrtimer(timer, base);
+ enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
/*
* Separate the ->running assignment from the ->state assignment.
*
* As with a regular write barrier, this ensures the read side in
- * hrtimer_active() cannot observe cpu_base->running == NULL &&
+ * hrtimer_active() cannot observe base->running.timer == NULL &&
* timer->state == INACTIVE.
*/
- raw_write_seqcount_barrier(&cpu_base->seq);
+ raw_write_seqcount_barrier(&base->seq);
- WARN_ON_ONCE(cpu_base->running != timer);
- cpu_base->running = NULL;
+ WARN_ON_ONCE(base->running != timer);
+ base->running = NULL;
}
-static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
+static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
+ unsigned long flags, unsigned int active_mask)
{
- struct hrtimer_clock_base *base = cpu_base->clock_base;
- unsigned int active = cpu_base->active_bases;
+ struct hrtimer_clock_base *base;
+ unsigned int active = cpu_base->active_bases & active_mask;
- for (; active; base++, active >>= 1) {
+ for_each_active_base(base, cpu_base, active) {
struct timerqueue_node *node;
ktime_t basenow;
- if (!(active & 0x01))
- continue;
-
basenow = ktime_add(now, base->offset);
while ((node = timerqueue_getnext(&base->active))) {
@@ -1272,11 +1409,28 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
if (basenow < hrtimer_get_softexpires_tv64(timer))
break;
- __run_hrtimer(cpu_base, base, timer, &basenow);
+ __run_hrtimer(cpu_base, base, timer, &basenow, flags);
}
}
}
+static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
+{
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ unsigned long flags;
+ ktime_t now;
+
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
+
+ now = hrtimer_update_base(cpu_base);
+ __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);
+
+ cpu_base->softirq_activated = 0;
+ hrtimer_update_softirq_timer(cpu_base, true);
+
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+}
+
#ifdef CONFIG_HIGH_RES_TIMERS
/*
@@ -1287,13 +1441,14 @@ void hrtimer_interrupt(struct clock_event_device *dev)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
ktime_t expires_next, now, entry_time, delta;
+ unsigned long flags;
int retries = 0;
BUG_ON(!cpu_base->hres_active);
cpu_base->nr_events++;
dev->next_event = KTIME_MAX;
- raw_spin_lock(&cpu_base->lock);
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
entry_time = now = hrtimer_update_base(cpu_base);
retry:
cpu_base->in_hrtirq = 1;
@@ -1306,17 +1461,23 @@ retry:
*/
cpu_base->expires_next = KTIME_MAX;
- __hrtimer_run_queues(cpu_base, now);
+ if (!ktime_before(now, cpu_base->softirq_expires_next)) {
+ cpu_base->softirq_expires_next = KTIME_MAX;
+ cpu_base->softirq_activated = 1;
+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ }
+
+ __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
/* Reevaluate the clock bases for the next expiry */
- expires_next = __hrtimer_get_next_event(cpu_base);
+ expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
/*
* Store the new expiry value so the migration code can verify
* against it.
*/
cpu_base->expires_next = expires_next;
cpu_base->in_hrtirq = 0;
- raw_spin_unlock(&cpu_base->lock);
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
/* Reprogramming necessary ? */
if (!tick_program_event(expires_next, 0)) {
@@ -1337,7 +1498,7 @@ retry:
* Acquire base lock for updating the offsets and retrieving
* the current time.
*/
- raw_spin_lock(&cpu_base->lock);
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
now = hrtimer_update_base(cpu_base);
cpu_base->nr_retries++;
if (++retries < 3)
@@ -1350,7 +1511,8 @@ retry:
*/
cpu_base->nr_hangs++;
cpu_base->hang_detected = 1;
- raw_spin_unlock(&cpu_base->lock);
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+
delta = ktime_sub(now, entry_time);
if ((unsigned int)delta > cpu_base->max_hang_time)
cpu_base->max_hang_time = (unsigned int) delta;
@@ -1392,6 +1554,7 @@ static inline void __hrtimer_peek_ahead_timers(void) { }
void hrtimer_run_queues(void)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ unsigned long flags;
ktime_t now;
if (__hrtimer_hres_active(cpu_base))
@@ -1409,10 +1572,17 @@ void hrtimer_run_queues(void)
return;
}
- raw_spin_lock(&cpu_base->lock);
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
now = hrtimer_update_base(cpu_base);
- __hrtimer_run_queues(cpu_base, now);
- raw_spin_unlock(&cpu_base->lock);
+
+ if (!ktime_before(now, cpu_base->softirq_expires_next)) {
+ cpu_base->softirq_expires_next = KTIME_MAX;
+ cpu_base->softirq_activated = 1;
+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ }
+
+ __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
}
/*
@@ -1590,7 +1760,13 @@ int hrtimers_prepare_cpu(unsigned int cpu)
}
cpu_base->cpu = cpu;
- hrtimer_init_hres(cpu_base);
+ cpu_base->active_bases = 0;
+ cpu_base->hres_active = 0;
+ cpu_base->hang_detected = 0;
+ cpu_base->next_timer = NULL;
+ cpu_base->softirq_next_timer = NULL;
+ cpu_base->expires_next = KTIME_MAX;
+ cpu_base->softirq_expires_next = KTIME_MAX;
return 0;
}
@@ -1622,7 +1798,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
* sort out already expired timers and reprogram the
* event device.
*/
- enqueue_hrtimer(timer, new_base);
+ enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
}
}
@@ -1634,6 +1810,12 @@ int hrtimers_dead_cpu(unsigned int scpu)
BUG_ON(cpu_online(scpu));
tick_cancel_sched_timer(scpu);
+ /*
+ * this BH disable ensures that raise_softirq_irqoff() does
+ * not wakeup ksoftirqd (and acquire the pi-lock) while
+ * holding the cpu_base lock
+ */
+ local_bh_disable();
local_irq_disable();
old_base = &per_cpu(hrtimer_bases, scpu);
new_base = this_cpu_ptr(&hrtimer_bases);
@@ -1649,12 +1831,19 @@ int hrtimers_dead_cpu(unsigned int scpu)
&new_base->clock_base[i]);
}
+ /*
+ * The migration might have changed the first expiring softirq
+ * timer on this CPU. Update it.
+ */
+ hrtimer_update_softirq_timer(new_base, false);
+
raw_spin_unlock(&old_base->lock);
raw_spin_unlock(&new_base->lock);
/* Check, if we got expired work to do */
__hrtimer_peek_ahead_timers();
local_irq_enable();
+ local_bh_enable();
return 0;
}
@@ -1663,18 +1852,19 @@ int hrtimers_dead_cpu(unsigned int scpu)
void __init hrtimers_init(void)
{
hrtimers_prepare_cpu(smp_processor_id());
+ open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
}
/**
* schedule_hrtimeout_range_clock - sleep until timeout
* @expires: timeout value (ktime_t)
* @delta: slack in expires timeout (ktime_t)
- * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
- * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
+ * @mode: timer mode
+ * @clock_id: timer clock to be used
*/
int __sched
schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
- const enum hrtimer_mode mode, int clock)
+ const enum hrtimer_mode mode, clockid_t clock_id)
{
struct hrtimer_sleeper t;
@@ -1695,7 +1885,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
return -EINTR;
}
- hrtimer_init_on_stack(&t.timer, clock, mode);
+ hrtimer_init_on_stack(&t.timer, clock_id, mode);
hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
hrtimer_init_sleeper(&t, current);
@@ -1717,7 +1907,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
* schedule_hrtimeout_range - sleep until timeout
* @expires: timeout value (ktime_t)
* @delta: slack in expires timeout (ktime_t)
- * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+ * @mode: timer mode
*
* Make the current task sleep until the given expiry time has
* elapsed. The routine will return immediately unless
@@ -1756,7 +1946,7 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
/**
* schedule_hrtimeout - sleep until timeout
* @expires: timeout value (ktime_t)
- * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+ * @mode: timer mode
*
* Make the current task sleep until the given expiry time has
* elapsed. The routine will return immediately unless
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 17cdc554c9fe..94ad46d50b56 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -68,10 +68,10 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf,
return err;
}
-static unsigned int posix_clock_poll(struct file *fp, poll_table *wait)
+static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
{
struct posix_clock *clk = get_posix_clock(fp);
- unsigned int result = 0;
+ __poll_t result = 0;
if (!clk)
return POLLERR;
@@ -216,7 +216,7 @@ struct posix_clock_desc {
static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd)
{
- struct file *fp = fget(CLOCKID_TO_FD(id));
+ struct file *fp = fget(clockid_to_fd(id));
int err = -EINVAL;
if (!fp)
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 1f27887aa194..2541bd89f20e 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -14,6 +14,7 @@
#include <linux/tick.h>
#include <linux/workqueue.h>
#include <linux/compat.h>
+#include <linux/sched/deadline.h>
#include "posix-timers.h"
@@ -791,6 +792,14 @@ check_timers_list(struct list_head *timers,
return 0;
}
+static inline void check_dl_overrun(struct task_struct *tsk)
+{
+ if (tsk->dl.dl_overrun) {
+ tsk->dl.dl_overrun = 0;
+ __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
+ }
+}
+
/*
* Check for any per-thread CPU timers that have fired and move them off
* the tsk->cpu_timers[N] list onto the firing list. Here we update the
@@ -804,6 +813,9 @@ static void check_thread_timers(struct task_struct *tsk,
u64 expires;
unsigned long soft;
+ if (dl_task(tsk))
+ check_dl_overrun(tsk);
+
/*
* If cputime_expires is zero, then there are no active
* per thread CPU timers.
@@ -906,6 +918,9 @@ static void check_process_timers(struct task_struct *tsk,
struct task_cputime cputime;
unsigned long soft;
+ if (dl_task(tsk))
+ check_dl_overrun(tsk);
+
/*
* If cputimer is not running, then there are no active
* process wide timers (POSIX 1.b, itimers, RLIMIT_CPU).
@@ -1111,6 +1126,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
return 1;
}
+ if (dl_task(tsk) && tsk->dl.dl_overrun)
+ return 1;
+
return 0;
}
@@ -1189,9 +1207,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
u64 now;
WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED);
- cpu_timer_sample_group(clock_idx, tsk, &now);
- if (oldval) {
+ if (oldval && cpu_timer_sample_group(clock_idx, tsk, &now) != -EINVAL) {
/*
* We are setting itimer. The *oldval is absolute and we update
* it to be relative, *newval argument is relative and we update
@@ -1363,8 +1380,8 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t);
}
-#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
-#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
+#define PROCESS_CLOCK make_process_cpuclock(0, CPUCLOCK_SCHED)
+#define THREAD_CLOCK make_thread_cpuclock(0, CPUCLOCK_SCHED)
static int process_cpu_clock_getres(const clockid_t which_clock,
struct timespec64 *tp)
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 13d6881f908b..75043046914e 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -434,17 +434,22 @@ static struct pid *good_sigevent(sigevent_t * event)
{
struct task_struct *rtn = current->group_leader;
- if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
- (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
- !same_thread_group(rtn, current) ||
- (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL))
+ switch (event->sigev_notify) {
+ case SIGEV_SIGNAL | SIGEV_THREAD_ID:
+ rtn = find_task_by_vpid(event->sigev_notify_thread_id);
+ if (!rtn || !same_thread_group(rtn, current))
+ return NULL;
+ /* FALLTHRU */
+ case SIGEV_SIGNAL:
+ case SIGEV_THREAD:
+ if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX)
+ return NULL;
+ /* FALLTHRU */
+ case SIGEV_NONE:
+ return task_pid(rtn);
+ default:
return NULL;
-
- if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
- ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
- return NULL;
-
- return task_pid(rtn);
+ }
}
static struct k_itimer * alloc_posix_timer(void)
@@ -457,7 +462,7 @@ static struct k_itimer * alloc_posix_timer(void)
kmem_cache_free(posix_timers_cache, tmr);
return NULL;
}
- memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
+ clear_siginfo(&tmr->sigq->info);
return tmr;
}
@@ -669,7 +674,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
struct timespec64 ts64;
bool sig_none;
- sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE;
+ sig_none = timr->it_sigev_notify == SIGEV_NONE;
iv = timr->it_interval;
/* interval timer ? */
@@ -856,7 +861,7 @@ int common_timer_set(struct k_itimer *timr, int flags,
timr->it_interval = timespec64_to_ktime(new_setting->it_interval);
expires = timespec64_to_ktime(new_setting->it_value);
- sigev_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE;
+ sigev_none = timr->it_sigev_notify == SIGEV_NONE;
kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none);
timr->it_active = !sigev_none;
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f8e1845aa464..e277284c2831 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -150,16 +150,15 @@ static inline void tick_nohz_init(void) { }
#ifdef CONFIG_NO_HZ_COMMON
extern unsigned long tick_nohz_active;
-#else
+extern void timers_update_nohz(void);
+# ifdef CONFIG_SMP
+extern struct static_key_false timers_migration_enabled;
+# endif
+#else /* CONFIG_NO_HZ_COMMON */
+static inline void timers_update_nohz(void) { }
#define tick_nohz_active (0)
#endif
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-extern void timers_update_migration(bool update_nohz);
-#else
-static inline void timers_update_migration(bool update_nohz) { }
-#endif
-
DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 99578f06c8d4..29a5733eff83 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -650,6 +650,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
ts->next_tick = 0;
}
+static inline bool local_timer_softirq_pending(void)
+{
+ return local_softirq_pending() & TIMER_SOFTIRQ;
+}
+
static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
ktime_t now, int cpu)
{
@@ -666,8 +671,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
} while (read_seqretry(&jiffies_lock, seq));
ts->last_jiffies = basejiff;
- if (rcu_needs_cpu(basemono, &next_rcu) ||
- arch_needs_cpu() || irq_work_needs_cpu()) {
+ /*
+ * Keep the periodic tick, when RCU, architecture or irq_work
+ * requests it.
+ * Aside of that check whether the local timer softirq is
+ * pending. If so its a bad idea to call get_next_timer_interrupt()
+ * because there is an already expired timer, so it will request
+ * immeditate expiry, which rearms the hardware timer with a
+ * minimal delta which brings us back to this place
+ * immediately. Lather, rinse and repeat...
+ */
+ if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() ||
+ irq_work_needs_cpu() || local_timer_softirq_pending()) {
next_tick = basemono + TICK_NSEC;
} else {
/*
@@ -986,6 +1001,19 @@ ktime_t tick_nohz_get_sleep_length(void)
}
/**
+ * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
+ * for a particular CPU.
+ *
+ * Called from the schedutil frequency scaling governor in scheduler context.
+ */
+unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
+{
+ struct tick_sched *ts = tick_get_tick_sched(cpu);
+
+ return ts->idle_calls;
+}
+
+/**
* tick_nohz_get_idle_calls - return the current idle calls counter value
*
* Called from the schedutil frequency scaling governor in scheduler context.
@@ -1079,7 +1107,7 @@ static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
ts->nohz_mode = mode;
/* One update is enough */
if (!test_and_set_bit(0, &tick_nohz_active))
- timers_update_migration(true);
+ timers_update_nohz();
}
/**
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 198afa78bf69..cd03317e7b57 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -557,45 +557,6 @@ static void halt_fast_timekeeper(struct timekeeper *tk)
update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
}
-#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
-#warning Please contact your maintainers, as GENERIC_TIME_VSYSCALL_OLD compatibity will disappear soon.
-
-static inline void update_vsyscall(struct timekeeper *tk)
-{
- struct timespec xt, wm;
-
- xt = timespec64_to_timespec(tk_xtime(tk));
- wm = timespec64_to_timespec(tk->wall_to_monotonic);
- update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult,
- tk->tkr_mono.cycle_last);
-}
-
-static inline void old_vsyscall_fixup(struct timekeeper *tk)
-{
- s64 remainder;
-
- /*
- * Store only full nanoseconds into xtime_nsec after rounding
- * it up and add the remainder to the error difference.
- * XXX - This is necessary to avoid small 1ns inconsistnecies caused
- * by truncating the remainder in vsyscalls. However, it causes
- * additional work to be done in timekeeping_adjust(). Once
- * the vsyscall implementations are converted to use xtime_nsec
- * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
- * users are removed, this can be killed.
- */
- remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1);
- if (remainder != 0) {
- tk->tkr_mono.xtime_nsec -= remainder;
- tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift;
- tk->ntp_error += remainder << tk->ntp_error_shift;
- tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift;
- }
-}
-#else
-#define old_vsyscall_fixup(tk)
-#endif
-
static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
@@ -2164,12 +2125,6 @@ void update_wall_time(void)
timekeeping_adjust(tk, offset);
/*
- * XXX This can be killed once everyone converts
- * to the new update_vsyscall.
- */
- old_vsyscall_fixup(tk);
-
- /*
* Finally, make sure that after the rounding
* xtime_nsec isn't larger than NSEC_PER_SEC
*/
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index af0b8bae4502..48150ab42de9 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -200,8 +200,6 @@ struct timer_base {
unsigned long clk;
unsigned long next_expiry;
unsigned int cpu;
- bool migration_enabled;
- bool nohz_active;
bool is_idle;
bool must_forward_clk;
DECLARE_BITMAP(pending_map, WHEEL_SIZE);
@@ -210,45 +208,64 @@ struct timer_base {
static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+#ifdef CONFIG_NO_HZ_COMMON
+
+static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
+static DEFINE_MUTEX(timer_keys_mutex);
+
+static void timer_update_keys(struct work_struct *work);
+static DECLARE_WORK(timer_update_work, timer_update_keys);
+
+#ifdef CONFIG_SMP
unsigned int sysctl_timer_migration = 1;
-void timers_update_migration(bool update_nohz)
+DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);
+
+static void timers_update_migration(void)
{
- bool on = sysctl_timer_migration && tick_nohz_active;
- unsigned int cpu;
+ if (sysctl_timer_migration && tick_nohz_active)
+ static_branch_enable(&timers_migration_enabled);
+ else
+ static_branch_disable(&timers_migration_enabled);
+}
+#else
+static inline void timers_update_migration(void) { }
+#endif /* !CONFIG_SMP */
- /* Avoid the loop, if nothing to update */
- if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on)
- return;
+static void timer_update_keys(struct work_struct *work)
+{
+ mutex_lock(&timer_keys_mutex);
+ timers_update_migration();
+ static_branch_enable(&timers_nohz_active);
+ mutex_unlock(&timer_keys_mutex);
+}
- for_each_possible_cpu(cpu) {
- per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on;
- per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on;
- per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
- if (!update_nohz)
- continue;
- per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true;
- per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true;
- per_cpu(hrtimer_bases.nohz_active, cpu) = true;
- }
+void timers_update_nohz(void)
+{
+ schedule_work(&timer_update_work);
}
int timer_migration_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- static DEFINE_MUTEX(mutex);
int ret;
- mutex_lock(&mutex);
+ mutex_lock(&timer_keys_mutex);
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write)
- timers_update_migration(false);
- mutex_unlock(&mutex);
+ timers_update_migration();
+ mutex_unlock(&timer_keys_mutex);
return ret;
}
-#endif
+
+static inline bool is_timers_nohz_active(void)
+{
+ return static_branch_unlikely(&timers_nohz_active);
+}
+#else
+static inline bool is_timers_nohz_active(void) { return false; }
+#endif /* NO_HZ_COMMON */
static unsigned long round_jiffies_common(unsigned long j, int cpu,
bool force_up)
@@ -534,7 +551,7 @@ __internal_add_timer(struct timer_base *base, struct timer_list *timer)
static void
trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
{
- if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
+ if (!is_timers_nohz_active())
return;
/*
@@ -707,14 +724,18 @@ static inline void debug_timer_assert_init(struct timer_list *timer)
debug_object_assert_init(timer, &timer_debug_descr);
}
-static void do_init_timer(struct timer_list *timer, unsigned int flags,
+static void do_init_timer(struct timer_list *timer,
+ void (*func)(struct timer_list *),
+ unsigned int flags,
const char *name, struct lock_class_key *key);
-void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags,
+void init_timer_on_stack_key(struct timer_list *timer,
+ void (*func)(struct timer_list *),
+ unsigned int flags,
const char *name, struct lock_class_key *key)
{
debug_object_init_on_stack(timer, &timer_debug_descr);
- do_init_timer(timer, flags, name, key);
+ do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
@@ -755,10 +776,13 @@ static inline void debug_assert_init(struct timer_list *timer)
debug_timer_assert_init(timer);
}
-static void do_init_timer(struct timer_list *timer, unsigned int flags,
+static void do_init_timer(struct timer_list *timer,
+ void (*func)(struct timer_list *),
+ unsigned int flags,
const char *name, struct lock_class_key *key)
{
timer->entry.pprev = NULL;
+ timer->function = func;
timer->flags = flags | raw_smp_processor_id();
lockdep_init_map(&timer->lockdep_map, name, key, 0);
}
@@ -766,6 +790,7 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
/**
* init_timer_key - initialize a timer
* @timer: the timer to be initialized
+ * @func: timer callback function
* @flags: timer flags
* @name: name of the timer
* @key: lockdep class key of the fake lock used for tracking timer
@@ -774,11 +799,12 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
* init_timer_key() must be done to a timer prior calling *any* of the
* other timer functions.
*/
-void init_timer_key(struct timer_list *timer, unsigned int flags,
+void init_timer_key(struct timer_list *timer,
+ void (*func)(struct timer_list *), unsigned int flags,
const char *name, struct lock_class_key *key)
{
debug_init(timer);
- do_init_timer(timer, flags, name, key);
+ do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL(init_timer_key);
@@ -814,11 +840,10 @@ static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
/*
- * If the timer is deferrable and nohz is active then we need to use
- * the deferrable base.
+ * If the timer is deferrable and NO_HZ_COMMON is set then we need
+ * to use the deferrable base.
*/
- if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
- (tflags & TIMER_DEFERRABLE))
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
return base;
}
@@ -828,11 +853,10 @@ static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
/*
- * If the timer is deferrable and nohz is active then we need to use
- * the deferrable base.
+ * If the timer is deferrable and NO_HZ_COMMON is set then we need
+ * to use the deferrable base.
*/
- if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
- (tflags & TIMER_DEFERRABLE))
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
base = this_cpu_ptr(&timer_bases[BASE_DEF]);
return base;
}
@@ -842,21 +866,20 @@ static inline struct timer_base *get_timer_base(u32 tflags)
return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
}
-#ifdef CONFIG_NO_HZ_COMMON
static inline struct timer_base *
get_target_base(struct timer_base *base, unsigned tflags)
{
-#ifdef CONFIG_SMP
- if ((tflags & TIMER_PINNED) || !base->migration_enabled)
- return get_timer_this_cpu_base(tflags);
- return get_timer_cpu_base(tflags, get_nohz_timer_target());
-#else
- return get_timer_this_cpu_base(tflags);
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+ if (static_branch_likely(&timers_migration_enabled) &&
+ !(tflags & TIMER_PINNED))
+ return get_timer_cpu_base(tflags, get_nohz_timer_target());
#endif
+ return get_timer_this_cpu_base(tflags);
}
static inline void forward_timer_base(struct timer_base *base)
{
+#ifdef CONFIG_NO_HZ_COMMON
unsigned long jnow;
/*
@@ -880,16 +903,8 @@ static inline void forward_timer_base(struct timer_base *base)
base->clk = jnow;
else
base->clk = base->next_expiry;
-}
-#else
-static inline struct timer_base *
-get_target_base(struct timer_base *base, unsigned tflags)
-{
- return get_timer_this_cpu_base(tflags);
-}
-
-static inline void forward_timer_base(struct timer_base *base) { }
#endif
+}
/*
@@ -1000,8 +1015,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
if (!ret && (options & MOD_TIMER_PENDING_ONLY))
goto out_unlock;
- debug_activate(timer, expires);
-
new_base = get_target_base(base, timer->flags);
if (base != new_base) {
@@ -1025,6 +1038,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
}
}
+ debug_activate(timer, expires);
+
timer->expires = expires;
/*
* If 'idx' was calculated above and the base time did not advance
@@ -1107,12 +1122,12 @@ EXPORT_SYMBOL(timer_reduce);
* add_timer - start a timer
* @timer: the timer to be added
*
- * The kernel will do a ->function(->data) callback from the
+ * The kernel will do a ->function(@timer) callback from the
* timer interrupt at the ->expires point in the future. The
* current time is 'jiffies'.
*
- * The timer's ->expires, ->function (and if the handler uses it, ->data)
- * fields must be set prior calling this function.
+ * The timer's ->expires, ->function fields must be set prior calling this
+ * function.
*
* Timers with an ->expires field in the past will be executed in the next
* timer tick.
@@ -1284,8 +1299,7 @@ int del_timer_sync(struct timer_list *timer)
EXPORT_SYMBOL(del_timer_sync);
#endif
-static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
- unsigned long data)
+static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *))
{
int count = preempt_count();
@@ -1309,7 +1323,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
lock_map_acquire(&lockdep_map);
trace_timer_expire_entry(timer);
- fn(data);
+ fn(timer);
trace_timer_expire_exit(timer);
lock_map_release(&lockdep_map);
@@ -1331,8 +1345,7 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
{
while (!hlist_empty(head)) {
struct timer_list *timer;
- void (*fn)(unsigned long);
- unsigned long data;
+ void (*fn)(struct timer_list *);
timer = hlist_entry(head->first, struct timer_list, entry);
@@ -1340,15 +1353,14 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
detach_timer(timer, true);
fn = timer->function;
- data = timer->data;
if (timer->flags & TIMER_IRQSAFE) {
raw_spin_unlock(&base->lock);
- call_timer_fn(timer, fn, data);
+ call_timer_fn(timer, fn);
raw_spin_lock(&base->lock);
} else {
raw_spin_unlock_irq(&base->lock);
- call_timer_fn(timer, fn, data);
+ call_timer_fn(timer, fn);
raw_spin_lock_irq(&base->lock);
}
}
@@ -1678,7 +1690,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
base->must_forward_clk = false;
__run_timers(base);
- if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
}
@@ -1692,7 +1704,7 @@ void run_local_timers(void)
hrtimer_run_queues();
/* Raise the softirq only if required. */
if (time_before(jiffies, base->clk)) {
- if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
+ if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
return;
/* CPU is awake, so check the deferrable base. */
base++;
@@ -1849,6 +1861,21 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h
}
}
+int timers_prepare_cpu(unsigned int cpu)
+{
+ struct timer_base *base;
+ int b;
+
+ for (b = 0; b < NR_BASES; b++) {
+ base = per_cpu_ptr(&timer_bases[b], cpu);
+ base->clk = jiffies;
+ base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+ base->is_idle = false;
+ base->must_forward_clk = true;
+ }
+ return 0;
+}
+
int timers_dead_cpu(unsigned int cpu)
{
struct timer_base *old_base;
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 0e7f5428a148..0ed768b56c60 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -389,7 +389,7 @@ static int __init init_timer_list_procfs(void)
{
struct proc_dir_entry *pe;
- pe = proc_create("timer_list", 0444, NULL, &timer_list_fops);
+ pe = proc_create("timer_list", 0400, NULL, &timer_list_fops);
if (!pe)
return -ENOMEM;
return 0;
diff --git a/kernel/torture.c b/kernel/torture.c
index 637e172835d8..37b94012a3f8 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -47,6 +47,7 @@
#include <linux/ktime.h>
#include <asm/byteorder.h>
#include <linux/torture.h>
+#include "rcu/rcu.h"
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
@@ -60,7 +61,6 @@ static bool verbose;
#define FULLSTOP_RMMOD 2 /* Normal rmmod of torture. */
static int fullstop = FULLSTOP_RMMOD;
static DEFINE_MUTEX(fullstop_mutex);
-static int *torture_runnable;
#ifdef CONFIG_HOTPLUG_CPU
@@ -500,7 +500,7 @@ static int torture_shutdown(void *arg)
torture_shutdown_hook();
else
VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping.");
- ftrace_dump(DUMP_ALL);
+ rcu_ftrace_dump(DUMP_ALL);
kernel_power_off(); /* Shut down the system. */
return 0;
}
@@ -572,17 +572,19 @@ static int stutter;
*/
void stutter_wait(const char *title)
{
+ int spt;
+
cond_resched_rcu_qs();
- while (READ_ONCE(stutter_pause_test) ||
- (torture_runnable && !READ_ONCE(*torture_runnable))) {
- if (stutter_pause_test)
- if (READ_ONCE(stutter_pause_test) == 1)
- schedule_timeout_interruptible(1);
- else
- while (READ_ONCE(stutter_pause_test))
- cond_resched();
- else
+ spt = READ_ONCE(stutter_pause_test);
+ for (; spt; spt = READ_ONCE(stutter_pause_test)) {
+ if (spt == 1) {
+ schedule_timeout_interruptible(1);
+ } else if (spt == 2) {
+ while (READ_ONCE(stutter_pause_test))
+ cond_resched();
+ } else {
schedule_timeout_interruptible(round_jiffies_relative(HZ));
+ }
torture_shutdown_absorb(title);
}
}
@@ -596,17 +598,15 @@ static int torture_stutter(void *arg)
{
VERBOSE_TOROUT_STRING("torture_stutter task started");
do {
- if (!torture_must_stop()) {
- if (stutter > 1) {
- schedule_timeout_interruptible(stutter - 1);
- WRITE_ONCE(stutter_pause_test, 2);
- }
- schedule_timeout_interruptible(1);
+ if (!torture_must_stop() && stutter > 1) {
WRITE_ONCE(stutter_pause_test, 1);
+ schedule_timeout_interruptible(stutter - 1);
+ WRITE_ONCE(stutter_pause_test, 2);
+ schedule_timeout_interruptible(1);
}
+ WRITE_ONCE(stutter_pause_test, 0);
if (!torture_must_stop())
schedule_timeout_interruptible(stutter);
- WRITE_ONCE(stutter_pause_test, 0);
torture_shutdown_absorb("torture_stutter");
} while (!torture_must_stop());
torture_kthread_stopping("torture_stutter");
@@ -647,7 +647,7 @@ static void torture_stutter_cleanup(void)
* The runnable parameter points to a flag that controls whether or not
* the test is currently runnable. If there is no such flag, pass in NULL.
*/
-bool torture_init_begin(char *ttype, bool v, int *runnable)
+bool torture_init_begin(char *ttype, bool v)
{
mutex_lock(&fullstop_mutex);
if (torture_type != NULL) {
@@ -659,7 +659,6 @@ bool torture_init_begin(char *ttype, bool v, int *runnable)
}
torture_type = ttype;
verbose = v;
- torture_runnable = runnable;
fullstop = FULLSTOP_DONTSTOP;
return true;
}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f54b7b6b4a4b..f54dc62b599c 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -160,6 +160,18 @@ config FUNCTION_GRAPH_TRACER
address on the current task structure into a stack of calls.
+config PREEMPTIRQ_EVENTS
+ bool "Enable trace events for preempt and irq disable/enable"
+ select TRACE_IRQFLAGS
+ depends on DEBUG_PREEMPT || !PROVE_LOCKING
+ depends on TRACING
+ default n
+ help
+ Enable tracing of disable and enable events for preemption and irqs.
+ For tracing preempt disable/enable events, DEBUG_PREEMPT must be
+ enabled. For tracing irq disable/enable events, PROVE_LOCKING must
+ be disabled.
+
config IRQSOFF_TRACER
bool "Interrupts-off Latency Tracer"
default n
@@ -343,7 +355,7 @@ config PROFILE_ANNOTATED_BRANCHES
on if you need to profile the system's use of these macros.
config PROFILE_ALL_BRANCHES
- bool "Profile all if conditionals"
+ bool "Profile all if conditionals" if !FORTIFY_SOURCE
select TRACE_BRANCH_PROFILING
help
This tracer profiles all branch conditions. Every if ()
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 19a15b2f1190..e2538c7638d4 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -35,6 +35,7 @@ obj-$(CONFIG_TRACING) += trace_printk.o
obj-$(CONFIG_TRACING_MAP) += tracing_map.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
+obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o
obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 206e0e2ace53..987d9a9ae283 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -591,7 +591,7 @@ static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
return ret;
if (copy_to_user(arg, &buts, sizeof(buts))) {
- blk_trace_remove(q);
+ __blk_trace_remove(q);
return -EFAULT;
}
return 0;
@@ -637,7 +637,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
return ret;
if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
- blk_trace_remove(q);
+ __blk_trace_remove(q);
return -EFAULT;
}
@@ -872,7 +872,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
*
**/
static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
- u32 what, int error, union kernfs_node_id *cgid)
+ u32 what, int error)
{
struct blk_trace *bt = q->blk_trace;
@@ -880,22 +880,21 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
return;
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio_op(bio), bio->bi_opf, what, error, 0, NULL, cgid);
+ bio_op(bio), bio->bi_opf, what, error, 0, NULL,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_bio_bounce(void *ignore,
struct request_queue *q, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0,
- blk_trace_bio_get_cgid(q, bio));
+ blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
}
static void blk_add_trace_bio_complete(void *ignore,
struct request_queue *q, struct bio *bio,
int error)
{
- blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error,
- blk_trace_bio_get_cgid(q, bio));
+ blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
}
static void blk_add_trace_bio_backmerge(void *ignore,
@@ -903,8 +902,7 @@ static void blk_add_trace_bio_backmerge(void *ignore,
struct request *rq,
struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0,
- blk_trace_bio_get_cgid(q, bio));
+ blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
}
static void blk_add_trace_bio_frontmerge(void *ignore,
@@ -912,15 +910,13 @@ static void blk_add_trace_bio_frontmerge(void *ignore,
struct request *rq,
struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0,
- blk_trace_bio_get_cgid(q, bio));
+ blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
}
static void blk_add_trace_bio_queue(void *ignore,
struct request_queue *q, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0,
- blk_trace_bio_get_cgid(q, bio));
+ blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
}
static void blk_add_trace_getrq(void *ignore,
@@ -928,8 +924,7 @@ static void blk_add_trace_getrq(void *ignore,
struct bio *bio, int rw)
{
if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0,
- blk_trace_bio_get_cgid(q, bio));
+ blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
else {
struct blk_trace *bt = q->blk_trace;
@@ -945,8 +940,7 @@ static void blk_add_trace_sleeprq(void *ignore,
struct bio *bio, int rw)
{
if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0,
- blk_trace_bio_get_cgid(q, bio));
+ blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
else {
struct blk_trace *bt = q->blk_trace;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 95888ae6c263..40207c2a4113 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -15,9 +15,11 @@
#include <linux/ctype.h>
#include "trace.h"
+u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+
/**
* trace_call_bpf - invoke BPF program
- * @prog: BPF program
+ * @call: tracepoint event
* @ctx: opaque context pointer
*
* kprobe handlers execute BPF programs via this helper.
@@ -29,7 +31,7 @@
* 1 - store kprobe event into ring buffer
* Other values are reserved and currently alias to 1
*/
-unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
{
unsigned int ret;
@@ -49,9 +51,22 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
goto out;
}
- rcu_read_lock();
- ret = BPF_PROG_RUN(prog, ctx);
- rcu_read_unlock();
+ /*
+ * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
+ * to all call sites, we did a bpf_prog_array_valid() there to check
+ * whether call->prog_array is empty or not, which is
+ * a heurisitc to speed up execution.
+ *
+ * If bpf_prog_array_valid() fetched prog_array was
+ * non-NULL, we go into trace_call_bpf() and do the actual
+ * proper rcu_dereference() under RCU lock.
+ * If it turns out that prog_array is NULL then, we bail out.
+ * For the opposite, if the bpf_prog_array_valid() fetched pointer
+ * was NULL, you'll skip the prog_array with the risk of missing
+ * out of events when it was updated in between this and the
+ * rcu_dereference() which is accepted risk.
+ */
+ ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN);
out:
__this_cpu_dec(bpf_prog_active);
@@ -77,7 +92,7 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
- .arg2_type = ARG_CONST_SIZE,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
};
@@ -255,14 +270,14 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
return &bpf_trace_printk_proto;
}
-BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
+static __always_inline int
+get_map_perf_counter(struct bpf_map *map, u64 flags,
+ u64 *value, u64 *enabled, u64 *running)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
unsigned int cpu = smp_processor_id();
u64 index = flags & BPF_F_INDEX_MASK;
struct bpf_event_entry *ee;
- u64 value = 0;
- int err;
if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
return -EINVAL;
@@ -275,7 +290,15 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
if (!ee)
return -ENOENT;
- err = perf_event_read_local(ee->event, &value, NULL, NULL);
+ return perf_event_read_local(ee->event, value, enabled, running);
+}
+
+BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
+{
+ u64 value = 0;
+ int err;
+
+ err = get_map_perf_counter(map, flags, &value, NULL, NULL);
/*
* this api is ugly since we miss [-22..-2] range of valid
* counter values, but that's uapi
@@ -293,14 +316,40 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
.arg2_type = ARG_ANYTHING,
};
-static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
+BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags,
+ struct bpf_perf_event_value *, buf, u32, size)
+{
+ int err = -EINVAL;
+
+ if (unlikely(size != sizeof(struct bpf_perf_event_value)))
+ goto clear;
+ err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled,
+ &buf->running);
+ if (unlikely(err))
+ goto clear;
+ return 0;
+clear:
+ memset(buf, 0, size);
+ return err;
+}
+
+static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
+ .func = bpf_perf_event_read_value,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+};
+
+static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
- u64 flags, struct perf_raw_record *raw)
+ u64 flags, struct perf_sample_data *sd)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
- struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd);
unsigned int cpu = smp_processor_id();
u64 index = flags & BPF_F_INDEX_MASK;
struct bpf_event_entry *ee;
@@ -323,8 +372,6 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
if (unlikely(event->oncpu != cpu))
return -EOPNOTSUPP;
- perf_sample_data_init(sd, 0, 0);
- sd->raw = raw;
perf_event_output(event, sd, regs);
return 0;
}
@@ -332,6 +379,7 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags, void *, data, u64, size)
{
+ struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);
struct perf_raw_record raw = {
.frag = {
.size = size,
@@ -342,7 +390,10 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
return -EINVAL;
- return __bpf_perf_event_output(regs, map, flags, &raw);
+ perf_sample_data_init(sd, 0, 0);
+ sd->raw = &raw;
+
+ return __bpf_perf_event_output(regs, map, flags, sd);
}
static const struct bpf_func_proto bpf_perf_event_output_proto = {
@@ -353,14 +404,16 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_MEM,
- .arg5_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
+static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd);
u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
+ struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd);
struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
struct perf_raw_frag frag = {
.copy = ctx_copy,
@@ -378,8 +431,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
};
perf_fetch_caller_regs(regs);
+ perf_sample_data_init(sd, 0, 0);
+ sd->raw = &raw;
- return __bpf_perf_event_output(regs, map, flags, &raw);
+ return __bpf_perf_event_output(regs, map, flags, sd);
}
BPF_CALL_0(bpf_get_current_task)
@@ -444,7 +499,7 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = {
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
- .arg2_type = ARG_CONST_SIZE,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
};
@@ -499,6 +554,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return &bpf_perf_event_output_proto;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto;
+ case BPF_FUNC_perf_event_read_value:
+ return &bpf_perf_event_read_value_proto;
default:
return tracing_func_proto(func_id);
}
@@ -524,11 +581,14 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type
return true;
}
-const struct bpf_verifier_ops kprobe_prog_ops = {
+const struct bpf_verifier_ops kprobe_verifier_ops = {
.get_func_proto = kprobe_prog_func_proto,
.is_valid_access = kprobe_prog_is_valid_access,
};
+const struct bpf_prog_ops kprobe_prog_ops = {
+};
+
BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map,
u64, flags, void *, data, u64, size)
{
@@ -550,7 +610,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_MEM,
- .arg5_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map,
@@ -576,6 +636,32 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
.arg3_type = ARG_ANYTHING,
};
+BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx,
+ struct bpf_perf_event_value *, buf, u32, size)
+{
+ int err = -EINVAL;
+
+ if (unlikely(size != sizeof(struct bpf_perf_event_value)))
+ goto clear;
+ err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled,
+ &buf->running);
+ if (unlikely(err))
+ goto clear;
+ return 0;
+clear:
+ memset(buf, 0, size);
+ return err;
+}
+
+static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = {
+ .func = bpf_perf_prog_read_value_tp,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -583,6 +669,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
return &bpf_perf_event_output_proto_tp;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto_tp;
+ case BPF_FUNC_perf_prog_read_value:
+ return &bpf_perf_prog_read_value_proto_tp;
default:
return tracing_func_proto(func_id);
}
@@ -602,11 +690,14 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type
return true;
}
-const struct bpf_verifier_ops tracepoint_prog_ops = {
+const struct bpf_verifier_ops tracepoint_verifier_ops = {
.get_func_proto = tp_prog_func_proto,
.is_valid_access = tp_prog_is_valid_access,
};
+const struct bpf_prog_ops tracepoint_prog_ops = {
+};
+
static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
@@ -662,8 +753,75 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
}
-const struct bpf_verifier_ops perf_event_prog_ops = {
+const struct bpf_verifier_ops perf_event_verifier_ops = {
.get_func_proto = tp_prog_func_proto,
.is_valid_access = pe_prog_is_valid_access,
.convert_ctx_access = pe_prog_convert_ctx_access,
};
+
+const struct bpf_prog_ops perf_event_prog_ops = {
+};
+
+static DEFINE_MUTEX(bpf_event_mutex);
+
+#define BPF_TRACE_MAX_PROGS 64
+
+int perf_event_attach_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog)
+{
+ struct bpf_prog_array __rcu *old_array;
+ struct bpf_prog_array *new_array;
+ int ret = -EEXIST;
+
+ mutex_lock(&bpf_event_mutex);
+
+ if (event->prog)
+ goto unlock;
+
+ old_array = event->tp_event->prog_array;
+ if (old_array &&
+ bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) {
+ ret = -E2BIG;
+ goto unlock;
+ }
+
+ ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
+ if (ret < 0)
+ goto unlock;
+
+ /* set the new array to event->tp_event and set event->prog */
+ event->prog = prog;
+ rcu_assign_pointer(event->tp_event->prog_array, new_array);
+ bpf_prog_array_free(old_array);
+
+unlock:
+ mutex_unlock(&bpf_event_mutex);
+ return ret;
+}
+
+void perf_event_detach_bpf_prog(struct perf_event *event)
+{
+ struct bpf_prog_array __rcu *old_array;
+ struct bpf_prog_array *new_array;
+ int ret;
+
+ mutex_lock(&bpf_event_mutex);
+
+ if (!event->prog)
+ goto unlock;
+
+ old_array = event->tp_event->prog_array;
+ ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
+ if (ret < 0) {
+ bpf_prog_array_delete_safe(old_array, event->prog);
+ } else {
+ rcu_assign_pointer(event->tp_event->prog_array, new_array);
+ bpf_prog_array_free(old_array);
+ }
+
+ bpf_prog_put(event->prog);
+ event->prog = NULL;
+
+unlock:
+ mutex_unlock(&bpf_event_mutex);
+}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8319e09e15b9..554b517c61a0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -203,30 +203,6 @@ void clear_ftrace_function(void)
ftrace_trace_function = ftrace_stub;
}
-static void per_cpu_ops_disable_all(struct ftrace_ops *ops)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- *per_cpu_ptr(ops->disabled, cpu) = 1;
-}
-
-static int per_cpu_ops_alloc(struct ftrace_ops *ops)
-{
- int __percpu *disabled;
-
- if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)))
- return -EINVAL;
-
- disabled = alloc_percpu(int);
- if (!disabled)
- return -ENOMEM;
-
- ops->disabled = disabled;
- per_cpu_ops_disable_all(ops);
- return 0;
-}
-
static void ftrace_sync(struct work_struct *work)
{
/*
@@ -262,8 +238,8 @@ static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops)
* If this is a dynamic, RCU, or per CPU ops, or we force list func,
* then it needs to call the list anyway.
*/
- if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU |
- FTRACE_OPS_FL_RCU) || FTRACE_FORCE_LIST_FUNC)
+ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_RCU) ||
+ FTRACE_FORCE_LIST_FUNC)
return ftrace_ops_list_func;
return ftrace_ops_get_func(ops);
@@ -422,11 +398,6 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
if (!core_kernel_data((unsigned long)ops))
ops->flags |= FTRACE_OPS_FL_DYNAMIC;
- if (ops->flags & FTRACE_OPS_FL_PER_CPU) {
- if (per_cpu_ops_alloc(ops))
- return -ENOMEM;
- }
-
add_ftrace_ops(&ftrace_ops_list, ops);
/* Always save the function, and reset at unregistering */
@@ -1148,15 +1119,11 @@ static struct ftrace_ops global_ops = {
};
/*
- * This is used by __kernel_text_address() to return true if the
- * address is on a dynamically allocated trampoline that would
- * not return true for either core_kernel_text() or
- * is_module_text_address().
+ * Used by the stack undwinder to know about dynamic ftrace trampolines.
*/
-bool is_ftrace_trampoline(unsigned long addr)
+struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr)
{
- struct ftrace_ops *op;
- bool ret = false;
+ struct ftrace_ops *op = NULL;
/*
* Some of the ops may be dynamically allocated,
@@ -1173,15 +1140,24 @@ bool is_ftrace_trampoline(unsigned long addr)
if (op->trampoline && op->trampoline_size)
if (addr >= op->trampoline &&
addr < op->trampoline + op->trampoline_size) {
- ret = true;
- goto out;
+ preempt_enable_notrace();
+ return op;
}
} while_for_each_ftrace_op(op);
-
- out:
preempt_enable_notrace();
- return ret;
+ return NULL;
+}
+
+/*
+ * This is used by __kernel_text_address() to return true if the
+ * address is on a dynamically allocated trampoline that would
+ * not return true for either core_kernel_text() or
+ * is_module_text_address().
+ */
+bool is_ftrace_trampoline(unsigned long addr)
+{
+ return ftrace_ops_trampoline(addr) != NULL;
}
struct ftrace_page {
@@ -2727,11 +2703,6 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
{
}
-static void per_cpu_ops_free(struct ftrace_ops *ops)
-{
- free_percpu(ops->disabled);
-}
-
static void ftrace_startup_enable(int command)
{
if (saved_ftrace_func != ftrace_trace_function) {
@@ -2833,7 +2804,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
* not currently active, we can just free them
* without synchronizing all CPUs.
*/
- if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU))
+ if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
goto free_ops;
return 0;
@@ -2880,7 +2851,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
* The same goes for freeing the per_cpu data of the per_cpu
* ops.
*/
- if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) {
+ if (ops->flags & FTRACE_OPS_FL_DYNAMIC) {
/*
* We need to do a hard force of sched synchronization.
* This is because we use preempt_disable() to do RCU, but
@@ -2903,9 +2874,6 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
free_ops:
arch_ftrace_trampoline_free(ops);
-
- if (ops->flags & FTRACE_OPS_FL_PER_CPU)
- per_cpu_ops_free(ops);
}
return 0;
@@ -5672,10 +5640,29 @@ static int ftrace_process_locs(struct module *mod,
return ret;
}
+struct ftrace_mod_func {
+ struct list_head list;
+ char *name;
+ unsigned long ip;
+ unsigned int size;
+};
+
+struct ftrace_mod_map {
+ struct rcu_head rcu;
+ struct list_head list;
+ struct module *mod;
+ unsigned long start_addr;
+ unsigned long end_addr;
+ struct list_head funcs;
+ unsigned int num_funcs;
+};
+
#ifdef CONFIG_MODULES
#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
+static LIST_HEAD(ftrace_mod_maps);
+
static int referenced_filters(struct dyn_ftrace *rec)
{
struct ftrace_ops *ops;
@@ -5729,8 +5716,26 @@ static void clear_mod_from_hashes(struct ftrace_page *pg)
mutex_unlock(&trace_types_lock);
}
+static void ftrace_free_mod_map(struct rcu_head *rcu)
+{
+ struct ftrace_mod_map *mod_map = container_of(rcu, struct ftrace_mod_map, rcu);
+ struct ftrace_mod_func *mod_func;
+ struct ftrace_mod_func *n;
+
+ /* All the contents of mod_map are now not visible to readers */
+ list_for_each_entry_safe(mod_func, n, &mod_map->funcs, list) {
+ kfree(mod_func->name);
+ list_del(&mod_func->list);
+ kfree(mod_func);
+ }
+
+ kfree(mod_map);
+}
+
void ftrace_release_mod(struct module *mod)
{
+ struct ftrace_mod_map *mod_map;
+ struct ftrace_mod_map *n;
struct dyn_ftrace *rec;
struct ftrace_page **last_pg;
struct ftrace_page *tmp_page = NULL;
@@ -5742,6 +5747,14 @@ void ftrace_release_mod(struct module *mod)
if (ftrace_disabled)
goto out_unlock;
+ list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) {
+ if (mod_map->mod == mod) {
+ list_del_rcu(&mod_map->list);
+ call_rcu_sched(&mod_map->rcu, ftrace_free_mod_map);
+ break;
+ }
+ }
+
/*
* Each module has its own ftrace_pages, remove
* them from the list.
@@ -5749,7 +5762,8 @@ void ftrace_release_mod(struct module *mod)
last_pg = &ftrace_pages_start;
for (pg = ftrace_pages_start; pg; pg = *last_pg) {
rec = &pg->records[0];
- if (within_module_core(rec->ip, mod)) {
+ if (within_module_core(rec->ip, mod) ||
+ within_module_init(rec->ip, mod)) {
/*
* As core pages are first, the first
* page should never be a module page.
@@ -5818,7 +5832,8 @@ void ftrace_module_enable(struct module *mod)
* not part of this module, then skip this pg,
* which the "break" will do.
*/
- if (!within_module_core(rec->ip, mod))
+ if (!within_module_core(rec->ip, mod) &&
+ !within_module_init(rec->ip, mod))
break;
cnt = 0;
@@ -5863,23 +5878,245 @@ void ftrace_module_init(struct module *mod)
ftrace_process_locs(mod, mod->ftrace_callsites,
mod->ftrace_callsites + mod->num_ftrace_callsites);
}
+
+static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map,
+ struct dyn_ftrace *rec)
+{
+ struct ftrace_mod_func *mod_func;
+ unsigned long symsize;
+ unsigned long offset;
+ char str[KSYM_SYMBOL_LEN];
+ char *modname;
+ const char *ret;
+
+ ret = kallsyms_lookup(rec->ip, &symsize, &offset, &modname, str);
+ if (!ret)
+ return;
+
+ mod_func = kmalloc(sizeof(*mod_func), GFP_KERNEL);
+ if (!mod_func)
+ return;
+
+ mod_func->name = kstrdup(str, GFP_KERNEL);
+ if (!mod_func->name) {
+ kfree(mod_func);
+ return;
+ }
+
+ mod_func->ip = rec->ip - offset;
+ mod_func->size = symsize;
+
+ mod_map->num_funcs++;
+
+ list_add_rcu(&mod_func->list, &mod_map->funcs);
+}
+
+static struct ftrace_mod_map *
+allocate_ftrace_mod_map(struct module *mod,
+ unsigned long start, unsigned long end)
+{
+ struct ftrace_mod_map *mod_map;
+
+ mod_map = kmalloc(sizeof(*mod_map), GFP_KERNEL);
+ if (!mod_map)
+ return NULL;
+
+ mod_map->mod = mod;
+ mod_map->start_addr = start;
+ mod_map->end_addr = end;
+ mod_map->num_funcs = 0;
+
+ INIT_LIST_HEAD_RCU(&mod_map->funcs);
+
+ list_add_rcu(&mod_map->list, &ftrace_mod_maps);
+
+ return mod_map;
+}
+
+static const char *
+ftrace_func_address_lookup(struct ftrace_mod_map *mod_map,
+ unsigned long addr, unsigned long *size,
+ unsigned long *off, char *sym)
+{
+ struct ftrace_mod_func *found_func = NULL;
+ struct ftrace_mod_func *mod_func;
+
+ list_for_each_entry_rcu(mod_func, &mod_map->funcs, list) {
+ if (addr >= mod_func->ip &&
+ addr < mod_func->ip + mod_func->size) {
+ found_func = mod_func;
+ break;
+ }
+ }
+
+ if (found_func) {
+ if (size)
+ *size = found_func->size;
+ if (off)
+ *off = addr - found_func->ip;
+ if (sym)
+ strlcpy(sym, found_func->name, KSYM_NAME_LEN);
+
+ return found_func->name;
+ }
+
+ return NULL;
+}
+
+const char *
+ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
+ unsigned long *off, char **modname, char *sym)
+{
+ struct ftrace_mod_map *mod_map;
+ const char *ret = NULL;
+
+ /* mod_map is freed via call_rcu_sched() */
+ preempt_disable();
+ list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
+ ret = ftrace_func_address_lookup(mod_map, addr, size, off, sym);
+ if (ret) {
+ if (modname)
+ *modname = mod_map->mod->name;
+ break;
+ }
+ }
+ preempt_enable();
+
+ return ret;
+}
+
+int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
+ char *type, char *name,
+ char *module_name, int *exported)
+{
+ struct ftrace_mod_map *mod_map;
+ struct ftrace_mod_func *mod_func;
+
+ preempt_disable();
+ list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
+
+ if (symnum >= mod_map->num_funcs) {
+ symnum -= mod_map->num_funcs;
+ continue;
+ }
+
+ list_for_each_entry_rcu(mod_func, &mod_map->funcs, list) {
+ if (symnum > 1) {
+ symnum--;
+ continue;
+ }
+
+ *value = mod_func->ip;
+ *type = 'T';
+ strlcpy(name, mod_func->name, KSYM_NAME_LEN);
+ strlcpy(module_name, mod_map->mod->name, MODULE_NAME_LEN);
+ *exported = 1;
+ preempt_enable();
+ return 0;
+ }
+ WARN_ON(1);
+ break;
+ }
+ preempt_enable();
+ return -ERANGE;
+}
+
+#else
+static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map,
+ struct dyn_ftrace *rec) { }
+static inline struct ftrace_mod_map *
+allocate_ftrace_mod_map(struct module *mod,
+ unsigned long start, unsigned long end)
+{
+ return NULL;
+}
#endif /* CONFIG_MODULES */
-void __init ftrace_free_init_mem(void)
+struct ftrace_init_func {
+ struct list_head list;
+ unsigned long ip;
+};
+
+/* Clear any init ips from hashes */
+static void
+clear_func_from_hash(struct ftrace_init_func *func, struct ftrace_hash *hash)
+{
+ struct ftrace_func_entry *entry;
+
+ if (ftrace_hash_empty(hash))
+ return;
+
+ entry = __ftrace_lookup_ip(hash, func->ip);
+
+ /*
+ * Do not allow this rec to match again.
+ * Yeah, it may waste some memory, but will be removed
+ * if/when the hash is modified again.
+ */
+ if (entry)
+ entry->ip = 0;
+}
+
+static void
+clear_func_from_hashes(struct ftrace_init_func *func)
+{
+ struct trace_array *tr;
+
+ mutex_lock(&trace_types_lock);
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (!tr->ops || !tr->ops->func_hash)
+ continue;
+ mutex_lock(&tr->ops->func_hash->regex_lock);
+ clear_func_from_hash(func, tr->ops->func_hash->filter_hash);
+ clear_func_from_hash(func, tr->ops->func_hash->notrace_hash);
+ mutex_unlock(&tr->ops->func_hash->regex_lock);
+ }
+ mutex_unlock(&trace_types_lock);
+}
+
+static void add_to_clear_hash_list(struct list_head *clear_list,
+ struct dyn_ftrace *rec)
{
- unsigned long start = (unsigned long)(&__init_begin);
- unsigned long end = (unsigned long)(&__init_end);
+ struct ftrace_init_func *func;
+
+ func = kmalloc(sizeof(*func), GFP_KERNEL);
+ if (!func) {
+ WARN_ONCE(1, "alloc failure, ftrace filter could be stale\n");
+ return;
+ }
+
+ func->ip = rec->ip;
+ list_add(&func->list, clear_list);
+}
+
+void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
+{
+ unsigned long start = (unsigned long)(start_ptr);
+ unsigned long end = (unsigned long)(end_ptr);
struct ftrace_page **last_pg = &ftrace_pages_start;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
struct dyn_ftrace key;
+ struct ftrace_mod_map *mod_map = NULL;
+ struct ftrace_init_func *func, *func_next;
+ struct list_head clear_hash;
int order;
+ INIT_LIST_HEAD(&clear_hash);
+
key.ip = start;
key.flags = end; /* overload flags, as it is unsigned long */
mutex_lock(&ftrace_lock);
+ /*
+ * If we are freeing module init memory, then check if
+ * any tracer is active. If so, we need to save a mapping of
+ * the module functions being freed with the address.
+ */
+ if (mod && ftrace_ops_list != &ftrace_list_end)
+ mod_map = allocate_ftrace_mod_map(mod, start, end);
+
for (pg = ftrace_pages_start; pg; last_pg = &pg->next, pg = *last_pg) {
if (end < pg->records[0].ip ||
start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
@@ -5890,6 +6127,13 @@ void __init ftrace_free_init_mem(void)
ftrace_cmp_recs);
if (!rec)
continue;
+
+ /* rec will be cleared from hashes after ftrace_lock unlock */
+ add_to_clear_hash_list(&clear_hash, rec);
+
+ if (mod_map)
+ save_ftrace_mod_rec(mod_map, rec);
+
pg->index--;
ftrace_update_tot_cnt--;
if (!pg->index) {
@@ -5908,6 +6152,19 @@ void __init ftrace_free_init_mem(void)
goto again;
}
mutex_unlock(&ftrace_lock);
+
+ list_for_each_entry_safe(func, func_next, &clear_hash, list) {
+ clear_func_from_hashes(func);
+ kfree(func);
+ }
+}
+
+void __init ftrace_free_init_mem(void)
+{
+ void *start = (void *)(&__init_begin);
+ void *end = (void *)(&__init_end);
+
+ ftrace_free_mem(NULL, start, end);
}
void __init ftrace_init(void)
@@ -6063,10 +6320,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
* If any of the above fails then the op->func() is not executed.
*/
if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) &&
- (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
- !ftrace_function_local_disabled(op)) &&
ftrace_ops_test(op, ip, regs)) {
-
if (FTRACE_WARN_ON(!op->func)) {
pr_warn("op=%p %pS\n", op, op);
goto out;
@@ -6124,10 +6378,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
preempt_disable_notrace();
- if (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
- !ftrace_function_local_disabled(op)) {
- op->func(ip, parent_ip, op, regs);
- }
+ op->func(ip, parent_ip, op, regs);
preempt_enable_notrace();
trace_clear_recursion(bit);
@@ -6151,7 +6402,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
* or does per cpu logic, then we need to call the assist handler.
*/
if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) ||
- ops->flags & (FTRACE_OPS_FL_RCU | FTRACE_OPS_FL_PER_CPU))
+ ops->flags & FTRACE_OPS_FL_RCU)
return ftrace_ops_assist_func;
return ops->func;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 845f3805c73d..ca6930e0d25e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -13,7 +13,6 @@
#include <linux/uaccess.h>
#include <linux/hardirq.h>
#include <linux/kthread.h> /* for self test */
-#include <linux/kmemcheck.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/mutex.h>
@@ -281,6 +280,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
/* Missed count stored at end */
#define RB_MISSED_STORED (1 << 30)
+#define RB_MISSED_FLAGS (RB_MISSED_EVENTS|RB_MISSED_STORED)
+
struct buffer_data_page {
u64 time_stamp; /* page time stamp */
local_t commit; /* write committed index */
@@ -332,7 +333,9 @@ static void rb_init_page(struct buffer_data_page *bpage)
*/
size_t ring_buffer_page_len(void *page)
{
- return local_read(&((struct buffer_data_page *)page)->commit)
+ struct buffer_data_page *bpage = page;
+
+ return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS)
+ BUF_PAGE_HDR_SIZE;
}
@@ -627,7 +630,7 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
* Returns POLLIN | POLLRDNORM if data exists in the buffers,
* zero otherwise.
*/
-int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
+__poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
struct file *filp, poll_table *poll_table)
{
struct ring_buffer_per_cpu *cpu_buffer;
@@ -1800,12 +1803,6 @@ void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
}
EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
-static __always_inline void *
-__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
-{
- return bpage->data + index;
-}
-
static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
{
return bpage->page->data + index;
@@ -2055,7 +2052,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
}
event = __rb_page_index(tail_page, tail);
- kmemcheck_annotate_bitfield(event, bitfield);
/* account for padding bytes */
local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
@@ -2576,17 +2572,14 @@ static __always_inline int
trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
{
unsigned int val = cpu_buffer->current_context;
+ unsigned long pc = preempt_count();
int bit;
- if (in_interrupt()) {
- if (in_nmi())
- bit = RB_CTX_NMI;
- else if (in_irq())
- bit = RB_CTX_IRQ;
- else
- bit = RB_CTX_SOFTIRQ;
- } else
+ if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
bit = RB_CTX_NORMAL;
+ else
+ bit = pc & NMI_MASK ? RB_CTX_NMI :
+ pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
if (unlikely(val & (1 << bit)))
return 1;
@@ -2686,7 +2679,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
/* We reserved something on the buffer */
event = __rb_page_index(tail_page, tail);
- kmemcheck_annotate_bitfield(event, bitfield);
rb_update_event(cpu_buffer, event, info);
local_inc(&tail_page->entries);
@@ -4439,8 +4431,13 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct buffer_data_page *bpage = data;
+ struct page *page = virt_to_page(bpage);
unsigned long flags;
+ /* If the page is still in use someplace else, we can't reuse it */
+ if (page_ref_count(page) > 1)
+ goto out;
+
local_irq_save(flags);
arch_spin_lock(&cpu_buffer->lock);
@@ -4452,6 +4449,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
arch_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
+ out:
free_page((unsigned long)bpage);
}
EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 752e5daf0896..32c069bbf41b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -362,7 +362,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct
}
/**
- * trace_pid_filter_add_remove - Add or remove a task from a pid_list
+ * trace_pid_filter_add_remove_task - Add or remove a task from a pid_list
* @pid_list: The list to modify
* @self: The current task for fork or NULL for exit
* @task: The task to add or remove
@@ -925,7 +925,7 @@ static void tracing_snapshot_instance(struct trace_array *tr)
}
/**
- * trace_snapshot - take a snapshot of the current buffer.
+ * tracing_snapshot - take a snapshot of the current buffer.
*
* This causes a swap between the snapshot buffer and the current live
* tracing buffer. You can use this to take snapshots of the live
@@ -1004,9 +1004,9 @@ int tracing_alloc_snapshot(void)
EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
/**
- * trace_snapshot_alloc - allocate and take a snapshot of the current buffer.
+ * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer.
*
- * This is similar to trace_snapshot(), but it will allocate the
+ * This is similar to tracing_snapshot(), but it will allocate the
* snapshot buffer if it isn't already allocated. Use this only
* where it is safe to sleep, as the allocation may sleep.
*
@@ -1303,7 +1303,7 @@ unsigned long __read_mostly tracing_thresh;
/*
* Copy the new maximum trace into the separate maximum-trace
* structure. (this way the maximum trace is permanently saved,
- * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
+ * for later retrieval via /sys/kernel/tracing/tracing_max_latency)
*/
static void
__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
@@ -2374,6 +2374,15 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
}
EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
+/*
+ * Skip 3:
+ *
+ * trace_buffer_unlock_commit_regs()
+ * trace_event_buffer_commit()
+ * trace_event_raw_event_xxx()
+*/
+# define STACK_SKIP 3
+
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
struct ring_buffer *buffer,
struct ring_buffer_event *event,
@@ -2383,16 +2392,12 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
__buffer_unlock_commit(buffer, event);
/*
- * If regs is not set, then skip the following callers:
- * trace_buffer_unlock_commit_regs
- * event_trigger_unlock_commit
- * trace_event_buffer_commit
- * trace_event_raw_event_sched_switch
+ * If regs is not set, then skip the necessary functions.
* Note, we can still get here via blktrace, wakeup tracer
* and mmiotrace, but that's ok if they lose a function or
- * two. They are that meaningful.
+ * two. They are not that meaningful.
*/
- ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs);
+ ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs);
ftrace_trace_userstack(buffer, flags, pc);
}
@@ -2415,7 +2420,7 @@ trace_process_export(struct trace_export *export,
entry = ring_buffer_event_data(event);
size = ring_buffer_event_length(event);
- export->write(entry, size);
+ export->write(export, entry, size);
}
static DEFINE_MUTEX(ftrace_export_lock);
@@ -2579,11 +2584,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
trace.skip = skip;
/*
- * Add two, for this function and the call to save_stack_trace()
+ * Add one, for this function and the call to save_stack_trace()
* If regs is set, then these functions will not be in the way.
*/
+#ifndef CONFIG_UNWINDER_ORC
if (!regs)
- trace.skip += 2;
+ trace.skip++;
+#endif
/*
* Since events can happen in NMIs there's no safe way to
@@ -2682,17 +2689,6 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
if (unlikely(in_nmi()))
return;
- /*
- * It is possible that a function is being traced in a
- * location that RCU is not watching. A call to
- * rcu_irq_enter() will make sure that it is, but there's
- * a few internal rcu functions that could be traced
- * where that wont work either. In those cases, we just
- * do nothing.
- */
- if (unlikely(rcu_irq_enter_disabled()))
- return;
-
rcu_irq_enter_irqson();
__ftrace_trace_stack(buffer, flags, skip, pc, NULL);
rcu_irq_exit_irqson();
@@ -2711,11 +2707,10 @@ void trace_dump_stack(int skip)
local_save_flags(flags);
- /*
- * Skip 3 more, seems to get us at the caller of
- * this function.
- */
- skip += 3;
+#ifndef CONFIG_UNWINDER_ORC
+ /* Skip 1 to skip this function. */
+ skip++;
+#endif
__ftrace_trace_stack(global_trace.trace_buffer.buffer,
flags, skip, preempt_count(), NULL);
}
@@ -4178,37 +4173,30 @@ static const struct file_operations show_traces_fops = {
.llseek = seq_lseek,
};
-/*
- * The tracer itself will not take this lock, but still we want
- * to provide a consistent cpumask to user-space:
- */
-static DEFINE_MUTEX(tracing_cpumask_update_lock);
-
-/*
- * Temporary storage for the character representation of the
- * CPU bitmask (and one more byte for the newline):
- */
-static char mask_str[NR_CPUS + 1];
-
static ssize_t
tracing_cpumask_read(struct file *filp, char __user *ubuf,
size_t count, loff_t *ppos)
{
struct trace_array *tr = file_inode(filp)->i_private;
+ char *mask_str;
int len;
- mutex_lock(&tracing_cpumask_update_lock);
+ len = snprintf(NULL, 0, "%*pb\n",
+ cpumask_pr_args(tr->tracing_cpumask)) + 1;
+ mask_str = kmalloc(len, GFP_KERNEL);
+ if (!mask_str)
+ return -ENOMEM;
- len = snprintf(mask_str, count, "%*pb\n",
+ len = snprintf(mask_str, len, "%*pb\n",
cpumask_pr_args(tr->tracing_cpumask));
if (len >= count) {
count = -EINVAL;
goto out_err;
}
- count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
+ count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
out_err:
- mutex_unlock(&tracing_cpumask_update_lock);
+ kfree(mask_str);
return count;
}
@@ -4228,8 +4216,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
if (err)
goto err_unlock;
- mutex_lock(&tracing_cpumask_update_lock);
-
local_irq_disable();
arch_spin_lock(&tr->max_lock);
for_each_tracing_cpu(cpu) {
@@ -4252,8 +4238,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
local_irq_enable();
cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
-
- mutex_unlock(&tracing_cpumask_update_lock);
free_cpumask_var(tracing_cpumask_new);
return count;
@@ -5632,7 +5616,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
return 0;
}
-static unsigned int
+static __poll_t
trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)
{
struct trace_array *tr = iter->tr;
@@ -5651,7 +5635,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl
filp, poll_table);
}
-static unsigned int
+static __poll_t
tracing_poll_pipe(struct file *filp, poll_table *poll_table)
{
struct trace_iterator *iter = filp->private_data;
@@ -6605,7 +6589,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
return ret;
}
-static unsigned int
+static __poll_t
tracing_buffers_poll(struct file *filp, poll_table *poll_table)
{
struct ftrace_buffer_info *info = filp->private_data;
@@ -6780,7 +6764,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
.spd_release = buffer_spd_release,
};
struct buffer_ref *ref;
- int entries, size, i;
+ int entries, i;
ssize_t ret = 0;
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -6834,14 +6818,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
break;
}
- /*
- * zero out any left over data, this is going to
- * user land.
- */
- size = ring_buffer_page_len(ref->page);
- if (size < PAGE_SIZE)
- memset(ref->page + size, 0, PAGE_SIZE - size);
-
page = virt_to_page(ref->page);
spd.pages[i] = page;
@@ -7599,6 +7575,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
buf->data = alloc_percpu(struct trace_array_cpu);
if (!buf->data) {
ring_buffer_free(buf->buffer);
+ buf->buffer = NULL;
return -ENOMEM;
}
@@ -7622,7 +7599,9 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
allocate_snapshot ? size : 1);
if (WARN_ON(ret)) {
ring_buffer_free(tr->trace_buffer.buffer);
+ tr->trace_buffer.buffer = NULL;
free_percpu(tr->trace_buffer.data);
+ tr->trace_buffer.data = NULL;
return -ENOMEM;
}
tr->allocated_snapshot = allocate_snapshot;
@@ -7687,6 +7666,7 @@ static int instance_mkdir(const char *name)
struct trace_array *tr;
int ret;
+ mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
ret = -EEXIST;
@@ -7742,6 +7722,7 @@ static int instance_mkdir(const char *name)
list_add(&tr->list, &ftrace_trace_arrays);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
return 0;
@@ -7753,6 +7734,7 @@ static int instance_mkdir(const char *name)
out_unlock:
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
return ret;
@@ -7765,6 +7747,7 @@ static int instance_rmdir(const char *name)
int ret;
int i;
+ mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
ret = -ENODEV;
@@ -7810,6 +7793,7 @@ static int instance_rmdir(const char *name)
out_unlock:
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
return ret;
}
@@ -8276,6 +8260,92 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
}
EXPORT_SYMBOL_GPL(ftrace_dump);
+int trace_run_command(const char *buf, int (*createfn)(int, char **))
+{
+ char **argv;
+ int argc, ret;
+
+ argc = 0;
+ ret = 0;
+ argv = argv_split(GFP_KERNEL, buf, &argc);
+ if (!argv)
+ return -ENOMEM;
+
+ if (argc)
+ ret = createfn(argc, argv);
+
+ argv_free(argv);
+
+ return ret;
+}
+
+#define WRITE_BUFSIZE 4096
+
+ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos,
+ int (*createfn)(int, char **))
+{
+ char *kbuf, *buf, *tmp;
+ int ret = 0;
+ size_t done = 0;
+ size_t size;
+
+ kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
+ if (!kbuf)
+ return -ENOMEM;
+
+ while (done < count) {
+ size = count - done;
+
+ if (size >= WRITE_BUFSIZE)
+ size = WRITE_BUFSIZE - 1;
+
+ if (copy_from_user(kbuf, buffer + done, size)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ kbuf[size] = '\0';
+ buf = kbuf;
+ do {
+ tmp = strchr(buf, '\n');
+ if (tmp) {
+ *tmp = '\0';
+ size = tmp - buf + 1;
+ } else {
+ size = strlen(buf);
+ if (done + size < count) {
+ if (buf != kbuf)
+ break;
+ /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
+ pr_warn("Line length is too long: Should be less than %d\n",
+ WRITE_BUFSIZE - 2);
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ done += size;
+
+ /* Remove comments */
+ tmp = strchr(buf, '#');
+
+ if (tmp)
+ *tmp = '\0';
+
+ ret = trace_run_command(buf, createfn);
+ if (ret)
+ goto out;
+ buf += size;
+
+ } while (done < count);
+ }
+ ret = done;
+
+out:
+ kfree(kbuf);
+
+ return ret;
+}
+
__init static int tracer_alloc_buffers(void)
{
int ring_buf_size;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6b0b343a36a2..2a6d0325a761 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -739,8 +739,6 @@ extern int trace_selftest_startup_wakeup(struct tracer *trace,
struct trace_array *tr);
extern int trace_selftest_startup_nop(struct tracer *trace,
struct trace_array *tr);
-extern int trace_selftest_startup_sched_switch(struct tracer *trace,
- struct trace_array *tr);
extern int trace_selftest_startup_branch(struct tracer *trace,
struct trace_array *tr);
/*
@@ -1755,6 +1753,13 @@ void trace_printk_start_comm(void);
int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
+#define MAX_EVENT_NAME_LEN 64
+
+extern int trace_run_command(const char *buf, int (*createfn)(int, char**));
+extern ssize_t trace_parse_run_command(struct file *file,
+ const char __user *buffer, size_t count, loff_t *ppos,
+ int (*createfn)(int, char**));
+
/*
* Normal trace_printk() and friends allocates special buffers
* to do the manipulation, as well as saves the print formats
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 79f838a75077..22fee766081b 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -165,7 +165,7 @@ static int benchmark_event_kthread(void *arg)
* this thread will never voluntarily schedule which would
* block synchronize_rcu_tasks() indefinitely.
*/
- cond_resched_rcu_qs();
+ cond_resched();
}
return 0;
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 13ba2d3f6a91..55d6dff37daf 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -240,27 +240,41 @@ void perf_trace_destroy(struct perf_event *p_event)
int perf_trace_add(struct perf_event *p_event, int flags)
{
struct trace_event_call *tp_event = p_event->tp_event;
- struct hlist_head __percpu *pcpu_list;
- struct hlist_head *list;
-
- pcpu_list = tp_event->perf_events;
- if (WARN_ON_ONCE(!pcpu_list))
- return -EINVAL;
if (!(flags & PERF_EF_START))
p_event->hw.state = PERF_HES_STOPPED;
- list = this_cpu_ptr(pcpu_list);
- hlist_add_head_rcu(&p_event->hlist_entry, list);
+ /*
+ * If TRACE_REG_PERF_ADD returns false; no custom action was performed
+ * and we need to take the default action of enqueueing our event on
+ * the right per-cpu hlist.
+ */
+ if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {
+ struct hlist_head __percpu *pcpu_list;
+ struct hlist_head *list;
+
+ pcpu_list = tp_event->perf_events;
+ if (WARN_ON_ONCE(!pcpu_list))
+ return -EINVAL;
+
+ list = this_cpu_ptr(pcpu_list);
+ hlist_add_head_rcu(&p_event->hlist_entry, list);
+ }
- return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
+ return 0;
}
void perf_trace_del(struct perf_event *p_event, int flags)
{
struct trace_event_call *tp_event = p_event->tp_event;
- hlist_del_rcu(&p_event->hlist_entry);
- tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
+
+ /*
+ * If TRACE_REG_PERF_DEL returns false; no custom action was performed
+ * and we need to take the default action of dequeueing our event from
+ * the right per-cpu hlist.
+ */
+ if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))
+ hlist_del_rcu(&p_event->hlist_entry);
}
void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
@@ -306,16 +320,25 @@ static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct pt_regs *pt_regs)
{
- struct perf_event *event;
struct ftrace_entry *entry;
- struct hlist_head *head;
+ struct perf_event *event;
+ struct hlist_head head;
struct pt_regs regs;
int rctx;
- head = this_cpu_ptr(event_function.perf_events);
- if (hlist_empty(head))
+ if ((unsigned long)ops->private != smp_processor_id())
return;
+ event = container_of(ops, struct perf_event, ftrace_ops);
+
+ /*
+ * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all
+ * the perf code does is hlist_for_each_entry_rcu(), so we can
+ * get away with simply setting the @head.first pointer in order
+ * to create a singular list.
+ */
+ head.first = &event->hlist_entry;
+
#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
sizeof(u64)) - sizeof(u32))
@@ -330,9 +353,8 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
entry->ip = ip;
entry->parent_ip = parent_ip;
- event = container_of(ops, struct perf_event, ftrace_ops);
perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
- 1, &regs, head, NULL, event);
+ 1, &regs, &head, NULL);
#undef ENTRY_SIZE
}
@@ -341,8 +363,10 @@ static int perf_ftrace_function_register(struct perf_event *event)
{
struct ftrace_ops *ops = &event->ftrace_ops;
- ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
- ops->func = perf_ftrace_function_call;
+ ops->flags = FTRACE_OPS_FL_RCU;
+ ops->func = perf_ftrace_function_call;
+ ops->private = (void *)(unsigned long)nr_cpu_ids;
+
return register_ftrace_function(ops);
}
@@ -354,19 +378,11 @@ static int perf_ftrace_function_unregister(struct perf_event *event)
return ret;
}
-static void perf_ftrace_function_enable(struct perf_event *event)
-{
- ftrace_function_local_enable(&event->ftrace_ops);
-}
-
-static void perf_ftrace_function_disable(struct perf_event *event)
-{
- ftrace_function_local_disable(&event->ftrace_ops);
-}
-
int perf_ftrace_event_register(struct trace_event_call *call,
enum trace_reg type, void *data)
{
+ struct perf_event *event = data;
+
switch (type) {
case TRACE_REG_REGISTER:
case TRACE_REG_UNREGISTER:
@@ -379,11 +395,11 @@ int perf_ftrace_event_register(struct trace_event_call *call,
case TRACE_REG_PERF_CLOSE:
return perf_ftrace_function_unregister(data);
case TRACE_REG_PERF_ADD:
- perf_ftrace_function_enable(data);
- return 0;
+ event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id();
+ return 1;
case TRACE_REG_PERF_DEL:
- perf_ftrace_function_disable(data);
- return 0;
+ event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids;
+ return 1;
}
return -EINVAL;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 87468398b9ed..1b87157edbff 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1406,8 +1406,8 @@ static int subsystem_open(struct inode *inode, struct file *filp)
return -ENODEV;
/* Make sure the system still exists */
- mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
+ mutex_lock(&trace_types_lock);
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
list_for_each_entry(dir, &tr->systems, list) {
if (dir == inode->i_private) {
@@ -1421,8 +1421,8 @@ static int subsystem_open(struct inode *inode, struct file *filp)
}
}
exit_loop:
- mutex_unlock(&event_mutex);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
if (!system)
return -ENODEV;
@@ -2213,6 +2213,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
{
struct trace_event_call *call, *p;
const char *last_system = NULL;
+ bool first = false;
int last_i;
int i;
@@ -2220,15 +2221,28 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
list_for_each_entry_safe(call, p, &ftrace_events, list) {
/* events are usually grouped together with systems */
if (!last_system || call->class->system != last_system) {
+ first = true;
last_i = 0;
last_system = call->class->system;
}
+ /*
+ * Since calls are grouped by systems, the likelyhood that the
+ * next call in the iteration belongs to the same system as the
+ * previous call is high. As an optimization, we skip seaching
+ * for a map[] that matches the call's system if the last call
+ * was from the same system. That's what last_i is for. If the
+ * call has the same system as the previous call, then last_i
+ * will be the index of the first map[] that has a matching
+ * system.
+ */
for (i = last_i; i < len; i++) {
if (call->class->system == map[i]->system) {
/* Save the first system if need be */
- if (!last_i)
+ if (first) {
last_i = i;
+ first = false;
+ }
update_event_printk(call, map[i]);
}
}
@@ -2294,15 +2308,15 @@ static void __add_event_to_tracers(struct trace_event_call *call);
int trace_add_event_call(struct trace_event_call *call)
{
int ret;
- mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
+ mutex_lock(&trace_types_lock);
ret = __register_event(call, NULL);
if (ret >= 0)
__add_event_to_tracers(call);
- mutex_unlock(&event_mutex);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
return ret;
}
@@ -2356,13 +2370,13 @@ int trace_remove_event_call(struct trace_event_call *call)
{
int ret;
- mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
+ mutex_lock(&trace_types_lock);
down_write(&trace_event_sem);
ret = probe_remove_event_call(call);
up_write(&trace_event_sem);
- mutex_unlock(&event_mutex);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
return ret;
}
@@ -2424,8 +2438,8 @@ static int trace_module_notify(struct notifier_block *self,
{
struct module *mod = data;
- mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
+ mutex_lock(&trace_types_lock);
switch (val) {
case MODULE_STATE_COMING:
trace_module_add_events(mod);
@@ -2434,8 +2448,8 @@ static int trace_module_notify(struct notifier_block *self,
trace_module_remove_events(mod);
break;
}
- mutex_unlock(&event_mutex);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
return 0;
}
@@ -2950,24 +2964,24 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
* creates the event hierachry in the @parent/events directory.
*
* Returns 0 on success.
+ *
+ * Must be called with event_mutex held.
*/
int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
{
int ret;
- mutex_lock(&event_mutex);
+ lockdep_assert_held(&event_mutex);
ret = create_event_toplevel_files(parent, tr);
if (ret)
- goto out_unlock;
+ goto out;
down_write(&trace_event_sem);
__trace_add_event_dirs(tr);
up_write(&trace_event_sem);
- out_unlock:
- mutex_unlock(&event_mutex);
-
+ out:
return ret;
}
@@ -2996,9 +3010,10 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
return ret;
}
+/* Must be called with event_mutex held */
int event_trace_del_tracer(struct trace_array *tr)
{
- mutex_lock(&event_mutex);
+ lockdep_assert_held(&event_mutex);
/* Disable any event triggers and associated soft-disabled events */
clear_event_triggers(tr);
@@ -3019,8 +3034,6 @@ int event_trace_del_tracer(struct trace_array *tr)
tr->event_dir = NULL;
- mutex_unlock(&event_mutex);
-
return 0;
}
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 1c21d0e2a145..1e1558c99d56 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -28,12 +28,16 @@ struct hist_field;
typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event);
+#define HIST_FIELD_OPERANDS_MAX 2
+
struct hist_field {
struct ftrace_event_field *field;
unsigned long flags;
hist_field_fn_t fn;
unsigned int size;
unsigned int offset;
+ unsigned int is_signed;
+ struct hist_field *operands[HIST_FIELD_OPERANDS_MAX];
};
static u64 hist_field_none(struct hist_field *field, void *event)
@@ -71,7 +75,9 @@ static u64 hist_field_pstring(struct hist_field *hist_field, void *event)
static u64 hist_field_log2(struct hist_field *hist_field, void *event)
{
- u64 val = *(u64 *)(event + hist_field->field->offset);
+ struct hist_field *operand = hist_field->operands[0];
+
+ u64 val = operand->fn(operand, event);
return (u64) ilog2(roundup_pow_of_two(val));
}
@@ -110,16 +116,16 @@ DEFINE_HIST_FIELD_FN(u8);
#define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE)
enum hist_field_flags {
- HIST_FIELD_FL_HITCOUNT = 1,
- HIST_FIELD_FL_KEY = 2,
- HIST_FIELD_FL_STRING = 4,
- HIST_FIELD_FL_HEX = 8,
- HIST_FIELD_FL_SYM = 16,
- HIST_FIELD_FL_SYM_OFFSET = 32,
- HIST_FIELD_FL_EXECNAME = 64,
- HIST_FIELD_FL_SYSCALL = 128,
- HIST_FIELD_FL_STACKTRACE = 256,
- HIST_FIELD_FL_LOG2 = 512,
+ HIST_FIELD_FL_HITCOUNT = 1 << 0,
+ HIST_FIELD_FL_KEY = 1 << 1,
+ HIST_FIELD_FL_STRING = 1 << 2,
+ HIST_FIELD_FL_HEX = 1 << 3,
+ HIST_FIELD_FL_SYM = 1 << 4,
+ HIST_FIELD_FL_SYM_OFFSET = 1 << 5,
+ HIST_FIELD_FL_EXECNAME = 1 << 6,
+ HIST_FIELD_FL_SYSCALL = 1 << 7,
+ HIST_FIELD_FL_STACKTRACE = 1 << 8,
+ HIST_FIELD_FL_LOG2 = 1 << 9,
};
struct hist_trigger_attrs {
@@ -146,6 +152,25 @@ struct hist_trigger_data {
struct tracing_map *map;
};
+static const char *hist_field_name(struct hist_field *field,
+ unsigned int level)
+{
+ const char *field_name = "";
+
+ if (level > 1)
+ return field_name;
+
+ if (field->field)
+ field_name = field->field->name;
+ else if (field->flags & HIST_FIELD_FL_LOG2)
+ field_name = hist_field_name(field->operands[0], ++level);
+
+ if (field_name == NULL)
+ field_name = "";
+
+ return field_name;
+}
+
static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
{
hist_field_fn_t fn = NULL;
@@ -340,8 +365,20 @@ static const struct tracing_map_ops hist_trigger_elt_comm_ops = {
.elt_init = hist_trigger_elt_comm_init,
};
-static void destroy_hist_field(struct hist_field *hist_field)
+static void destroy_hist_field(struct hist_field *hist_field,
+ unsigned int level)
{
+ unsigned int i;
+
+ if (level > 2)
+ return;
+
+ if (!hist_field)
+ return;
+
+ for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++)
+ destroy_hist_field(hist_field->operands[i], level + 1);
+
kfree(hist_field);
}
@@ -368,7 +405,10 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
}
if (flags & HIST_FIELD_FL_LOG2) {
+ unsigned long fl = flags & ~HIST_FIELD_FL_LOG2;
hist_field->fn = hist_field_log2;
+ hist_field->operands[0] = create_hist_field(field, fl);
+ hist_field->size = hist_field->operands[0]->size;
goto out;
}
@@ -388,7 +428,7 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
hist_field->fn = select_value_fn(field->size,
field->is_signed);
if (!hist_field->fn) {
- destroy_hist_field(hist_field);
+ destroy_hist_field(hist_field, 0);
return NULL;
}
}
@@ -405,7 +445,7 @@ static void destroy_hist_fields(struct hist_trigger_data *hist_data)
for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) {
if (hist_data->fields[i]) {
- destroy_hist_field(hist_data->fields[i]);
+ destroy_hist_field(hist_data->fields[i], 0);
hist_data->fields[i] = NULL;
}
}
@@ -450,7 +490,7 @@ static int create_val_field(struct hist_trigger_data *hist_data,
}
field = trace_find_event_field(file->event_call, field_name);
- if (!field) {
+ if (!field || !field->size) {
ret = -EINVAL;
goto out;
}
@@ -548,7 +588,7 @@ static int create_key_field(struct hist_trigger_data *hist_data,
}
field = trace_find_event_field(file->event_call, field_name);
- if (!field) {
+ if (!field || !field->size) {
ret = -EINVAL;
goto out;
}
@@ -653,7 +693,6 @@ static int is_descending(const char *str)
static int create_sort_keys(struct hist_trigger_data *hist_data)
{
char *fields_str = hist_data->attrs->sort_key_str;
- struct ftrace_event_field *field = NULL;
struct tracing_map_sort_key *sort_key;
int descending, ret = 0;
unsigned int i, j;
@@ -670,7 +709,9 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
}
for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) {
+ struct hist_field *hist_field;
char *field_str, *field_name;
+ const char *test_name;
sort_key = &hist_data->sort_keys[i];
@@ -703,8 +744,10 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
}
for (j = 1; j < hist_data->n_fields; j++) {
- field = hist_data->fields[j]->field;
- if (field && (strcmp(field_name, field->name) == 0)) {
+ hist_field = hist_data->fields[j];
+ test_name = hist_field_name(hist_field, 0);
+
+ if (strcmp(field_name, test_name) == 0) {
sort_key->field_idx = j;
descending = is_descending(field_str);
if (descending < 0) {
@@ -952,6 +995,7 @@ hist_trigger_entry_print(struct seq_file *m,
struct hist_field *key_field;
char str[KSYM_SYMBOL_LEN];
bool multiline = false;
+ const char *field_name;
unsigned int i;
u64 uval;
@@ -963,26 +1007,27 @@ hist_trigger_entry_print(struct seq_file *m,
if (i > hist_data->n_vals)
seq_puts(m, ", ");
+ field_name = hist_field_name(key_field, 0);
+
if (key_field->flags & HIST_FIELD_FL_HEX) {
uval = *(u64 *)(key + key_field->offset);
- seq_printf(m, "%s: %llx",
- key_field->field->name, uval);
+ seq_printf(m, "%s: %llx", field_name, uval);
} else if (key_field->flags & HIST_FIELD_FL_SYM) {
uval = *(u64 *)(key + key_field->offset);
sprint_symbol_no_offset(str, uval);
- seq_printf(m, "%s: [%llx] %-45s",
- key_field->field->name, uval, str);
+ seq_printf(m, "%s: [%llx] %-45s", field_name,
+ uval, str);
} else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) {
uval = *(u64 *)(key + key_field->offset);
sprint_symbol(str, uval);
- seq_printf(m, "%s: [%llx] %-55s",
- key_field->field->name, uval, str);
+ seq_printf(m, "%s: [%llx] %-55s", field_name,
+ uval, str);
} else if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
char *comm = elt->private_data;
uval = *(u64 *)(key + key_field->offset);
- seq_printf(m, "%s: %-16s[%10llu]",
- key_field->field->name, comm, uval);
+ seq_printf(m, "%s: %-16s[%10llu]", field_name,
+ comm, uval);
} else if (key_field->flags & HIST_FIELD_FL_SYSCALL) {
const char *syscall_name;
@@ -991,8 +1036,8 @@ hist_trigger_entry_print(struct seq_file *m,
if (!syscall_name)
syscall_name = "unknown_syscall";
- seq_printf(m, "%s: %-30s[%3llu]",
- key_field->field->name, syscall_name, uval);
+ seq_printf(m, "%s: %-30s[%3llu]", field_name,
+ syscall_name, uval);
} else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
seq_puts(m, "stacktrace:\n");
hist_trigger_stacktrace_print(m,
@@ -1000,15 +1045,14 @@ hist_trigger_entry_print(struct seq_file *m,
HIST_STACKTRACE_DEPTH);
multiline = true;
} else if (key_field->flags & HIST_FIELD_FL_LOG2) {
- seq_printf(m, "%s: ~ 2^%-2llu", key_field->field->name,
+ seq_printf(m, "%s: ~ 2^%-2llu", field_name,
*(u64 *)(key + key_field->offset));
} else if (key_field->flags & HIST_FIELD_FL_STRING) {
- seq_printf(m, "%s: %-50s", key_field->field->name,
+ seq_printf(m, "%s: %-50s", field_name,
(char *)(key + key_field->offset));
} else {
uval = *(u64 *)(key + key_field->offset);
- seq_printf(m, "%s: %10llu", key_field->field->name,
- uval);
+ seq_printf(m, "%s: %10llu", field_name, uval);
}
}
@@ -1021,13 +1065,13 @@ hist_trigger_entry_print(struct seq_file *m,
tracing_map_read_sum(elt, HITCOUNT_IDX));
for (i = 1; i < hist_data->n_vals; i++) {
+ field_name = hist_field_name(hist_data->fields[i], 0);
+
if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) {
- seq_printf(m, " %s: %10llx",
- hist_data->fields[i]->field->name,
+ seq_printf(m, " %s: %10llx", field_name,
tracing_map_read_sum(elt, i));
} else {
- seq_printf(m, " %s: %10llu",
- hist_data->fields[i]->field->name,
+ seq_printf(m, " %s: %10llu", field_name,
tracing_map_read_sum(elt, i));
}
}
@@ -1062,7 +1106,7 @@ static void hist_trigger_show(struct seq_file *m,
struct event_trigger_data *data, int n)
{
struct hist_trigger_data *hist_data;
- int n_entries, ret = 0;
+ int n_entries;
if (n > 0)
seq_puts(m, "\n\n");
@@ -1073,10 +1117,8 @@ static void hist_trigger_show(struct seq_file *m,
hist_data = data->private_data;
n_entries = print_entries(m, hist_data);
- if (n_entries < 0) {
- ret = n_entries;
+ if (n_entries < 0)
n_entries = 0;
- }
seq_printf(m, "\nTotals:\n Hits: %llu\n Entries: %u\n Dropped: %llu\n",
(u64)atomic64_read(&hist_data->map->hits),
@@ -1142,7 +1184,9 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
{
- seq_printf(m, "%s", hist_field->field->name);
+ const char *field_name = hist_field_name(hist_field, 0);
+
+ seq_printf(m, "%s", field_name);
if (hist_field->flags) {
const char *flags_str = get_hist_field_flags(hist_field);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index f2ac9d44f6c4..87411482a46f 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1123,13 +1123,22 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; }
#endif /* CONFIG_TRACER_SNAPSHOT */
#ifdef CONFIG_STACKTRACE
+#ifdef CONFIG_UNWINDER_ORC
+/* Skip 2:
+ * event_triggers_post_call()
+ * trace_event_raw_event_xxx()
+ */
+# define STACK_SKIP 2
+#else
/*
- * Skip 3:
+ * Skip 4:
* stacktrace_trigger()
* event_triggers_post_call()
+ * trace_event_buffer_commit()
* trace_event_raw_event_xxx()
*/
-#define STACK_SKIP 3
+#define STACK_SKIP 4
+#endif
static void
stacktrace_trigger(struct event_trigger_data *data, void *rec)
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 27f7ad12c4b1..b611cd36e22d 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -154,6 +154,24 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
preempt_enable_notrace();
}
+#ifdef CONFIG_UNWINDER_ORC
+/*
+ * Skip 2:
+ *
+ * function_stack_trace_call()
+ * ftrace_call()
+ */
+#define STACK_SKIP 2
+#else
+/*
+ * Skip 3:
+ * __trace_stack()
+ * function_stack_trace_call()
+ * ftrace_call()
+ */
+#define STACK_SKIP 3
+#endif
+
static void
function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
@@ -180,15 +198,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
if (likely(disabled == 1)) {
pc = preempt_count();
trace_function(tr, ip, parent_ip, flags, pc);
- /*
- * skip over 5 funcs:
- * __ftrace_trace_stack,
- * __trace_stack,
- * function_stack_trace_call
- * ftrace_list_func
- * ftrace_call
- */
- __trace_stack(tr, flags, 5, pc);
+ __trace_stack(tr, flags, STACK_SKIP, pc);
}
atomic_dec(&data->disabled);
@@ -367,14 +377,27 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip,
tracer_tracing_off(tr);
}
+#ifdef CONFIG_UNWINDER_ORC
/*
- * Skip 4:
+ * Skip 3:
+ *
+ * function_trace_probe_call()
+ * ftrace_ops_assist_func()
+ * ftrace_call()
+ */
+#define FTRACE_STACK_SKIP 3
+#else
+/*
+ * Skip 5:
+ *
+ * __trace_stack()
* ftrace_stacktrace()
* function_trace_probe_call()
- * ftrace_ops_list_func()
+ * ftrace_ops_assist_func()
* ftrace_call()
*/
-#define STACK_SKIP 4
+#define FTRACE_STACK_SKIP 5
+#endif
static __always_inline void trace_stack(struct trace_array *tr)
{
@@ -384,7 +407,7 @@ static __always_inline void trace_stack(struct trace_array *tr)
local_save_flags(flags);
pc = preempt_count();
- __trace_stack(tr, flags, STACK_SKIP, pc);
+ __trace_stack(tr, flags, FTRACE_STACK_SKIP, pc);
}
static void
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 7758bc0617cb..03ecb4465ee4 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -16,6 +16,10 @@
#include "trace.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/preemptirq.h>
+
+#if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER)
static struct trace_array *irqsoff_trace __read_mostly;
static int tracer_enabled __read_mostly;
@@ -463,63 +467,43 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
#else /* !CONFIG_PROVE_LOCKING */
/*
- * Stubs:
- */
-
-void trace_softirqs_on(unsigned long ip)
-{
-}
-
-void trace_softirqs_off(unsigned long ip)
-{
-}
-
-inline void print_irqtrace_events(struct task_struct *curr)
-{
-}
-
-/*
* We are only interested in hardirq on/off events:
*/
-void trace_hardirqs_on(void)
+static inline void tracer_hardirqs_on(void)
{
if (!preempt_trace() && irq_trace())
stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
-EXPORT_SYMBOL(trace_hardirqs_on);
-void trace_hardirqs_off(void)
+static inline void tracer_hardirqs_off(void)
{
if (!preempt_trace() && irq_trace())
start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
-EXPORT_SYMBOL(trace_hardirqs_off);
-__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+static inline void tracer_hardirqs_on_caller(unsigned long caller_addr)
{
if (!preempt_trace() && irq_trace())
stop_critical_timing(CALLER_ADDR0, caller_addr);
}
-EXPORT_SYMBOL(trace_hardirqs_on_caller);
-__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+static inline void tracer_hardirqs_off_caller(unsigned long caller_addr)
{
if (!preempt_trace() && irq_trace())
start_critical_timing(CALLER_ADDR0, caller_addr);
}
-EXPORT_SYMBOL(trace_hardirqs_off_caller);
#endif /* CONFIG_PROVE_LOCKING */
#endif /* CONFIG_IRQSOFF_TRACER */
#ifdef CONFIG_PREEMPT_TRACER
-void trace_preempt_on(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1)
{
if (preempt_trace() && !irq_trace())
stop_critical_timing(a0, a1);
}
-void trace_preempt_off(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1)
{
if (preempt_trace() && !irq_trace())
start_critical_timing(a0, a1);
@@ -781,3 +765,100 @@ __init static int init_irqsoff_tracer(void)
return 0;
}
core_initcall(init_irqsoff_tracer);
+#endif /* IRQSOFF_TRACER || PREEMPTOFF_TRACER */
+
+#ifndef CONFIG_IRQSOFF_TRACER
+static inline void tracer_hardirqs_on(void) { }
+static inline void tracer_hardirqs_off(void) { }
+static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { }
+static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { }
+#endif
+
+#ifndef CONFIG_PREEMPT_TRACER
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { }
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { }
+#endif
+
+#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING)
+/* Per-cpu variable to prevent redundant calls when IRQs already off */
+static DEFINE_PER_CPU(int, tracing_irq_cpu);
+
+void trace_hardirqs_on(void)
+{
+ if (!this_cpu_read(tracing_irq_cpu))
+ return;
+
+ trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+ tracer_hardirqs_on();
+
+ this_cpu_write(tracing_irq_cpu, 0);
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void trace_hardirqs_off(void)
+{
+ if (this_cpu_read(tracing_irq_cpu))
+ return;
+
+ this_cpu_write(tracing_irq_cpu, 1);
+
+ trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+ tracer_hardirqs_off();
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+ if (!this_cpu_read(tracing_irq_cpu))
+ return;
+
+ trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
+ tracer_hardirqs_on_caller(caller_addr);
+
+ this_cpu_write(tracing_irq_cpu, 0);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+ if (this_cpu_read(tracing_irq_cpu))
+ return;
+
+ this_cpu_write(tracing_irq_cpu, 1);
+
+ trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
+ tracer_hardirqs_off_caller(caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+/*
+ * Stubs:
+ */
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+#endif
+
+#if defined(CONFIG_PREEMPT_TRACER) || \
+ (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS))
+void trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+ trace_preempt_enable_rcuidle(a0, a1);
+ tracer_preempt_on(a0, a1);
+}
+
+void trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+ trace_preempt_disable_rcuidle(a0, a1);
+ tracer_preempt_off(a0, a1);
+}
+#endif
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 8a907e12b6b9..492700c5fb4d 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -907,8 +907,8 @@ static int probes_open(struct inode *inode, struct file *file)
static ssize_t probes_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
- return traceprobe_probes_write(file, buffer, count, ppos,
- create_trace_kprobe);
+ return trace_parse_run_command(file, buffer, count, ppos,
+ create_trace_kprobe);
}
static const struct file_operations kprobe_events_ops = {
@@ -1174,13 +1174,12 @@ static void
kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
{
struct trace_event_call *call = &tk->tp.call;
- struct bpf_prog *prog = call->prog;
struct kprobe_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
int rctx;
- if (prog && !trace_call_bpf(prog, regs))
+ if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
return;
head = this_cpu_ptr(call->perf_events);
@@ -1200,7 +1199,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
memset(&entry[1], 0, dsize);
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
- head, NULL, NULL);
+ head, NULL);
}
NOKPROBE_SYMBOL(kprobe_perf_func);
@@ -1210,13 +1209,12 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs)
{
struct trace_event_call *call = &tk->tp.call;
- struct bpf_prog *prog = call->prog;
struct kretprobe_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
int rctx;
- if (prog && !trace_call_bpf(prog, regs))
+ if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
return;
head = this_cpu_ptr(call->perf_events);
@@ -1236,7 +1234,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
entry->ret_ip = (unsigned long)ri->ret_addr;
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
- head, NULL, NULL);
+ head, NULL);
}
NOKPROBE_SYMBOL(kretprobe_perf_func);
#endif /* CONFIG_PERF_EVENTS */
@@ -1433,9 +1431,9 @@ static __init int kprobe_trace_self_tests_init(void)
pr_info("Testing kprobe tracing: ");
- ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target "
- "$stack $stack0 +0($stack)",
- create_trace_kprobe);
+ ret = trace_run_command("p:testprobe kprobe_trace_selftest_target "
+ "$stack $stack0 +0($stack)",
+ create_trace_kprobe);
if (WARN_ON_ONCE(ret)) {
pr_warn("error on probing function entry.\n");
warn++;
@@ -1455,8 +1453,8 @@ static __init int kprobe_trace_self_tests_init(void)
}
}
- ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "
- "$retval", create_trace_kprobe);
+ ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target "
+ "$retval", create_trace_kprobe);
if (WARN_ON_ONCE(ret)) {
pr_warn("error on probing function return.\n");
warn++;
@@ -1526,13 +1524,13 @@ static __init int kprobe_trace_self_tests_init(void)
disable_trace_kprobe(tk, file);
}
- ret = traceprobe_command("-:testprobe", create_trace_kprobe);
+ ret = trace_run_command("-:testprobe", create_trace_kprobe);
if (WARN_ON_ONCE(ret)) {
pr_warn("error on deleting a probe.\n");
warn++;
}
- ret = traceprobe_command("-:testprobe2", create_trace_kprobe);
+ ret = trace_run_command("-:testprobe2", create_trace_kprobe);
if (WARN_ON_ONCE(ret)) {
pr_warn("error on deleting a probe.\n");
warn++;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 52478f033f88..d59357308677 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -623,92 +623,6 @@ void traceprobe_free_probe_arg(struct probe_arg *arg)
kfree(arg->comm);
}
-int traceprobe_command(const char *buf, int (*createfn)(int, char **))
-{
- char **argv;
- int argc, ret;
-
- argc = 0;
- ret = 0;
- argv = argv_split(GFP_KERNEL, buf, &argc);
- if (!argv)
- return -ENOMEM;
-
- if (argc)
- ret = createfn(argc, argv);
-
- argv_free(argv);
-
- return ret;
-}
-
-#define WRITE_BUFSIZE 4096
-
-ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
- size_t count, loff_t *ppos,
- int (*createfn)(int, char **))
-{
- char *kbuf, *buf, *tmp;
- int ret = 0;
- size_t done = 0;
- size_t size;
-
- kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
- if (!kbuf)
- return -ENOMEM;
-
- while (done < count) {
- size = count - done;
-
- if (size >= WRITE_BUFSIZE)
- size = WRITE_BUFSIZE - 1;
-
- if (copy_from_user(kbuf, buffer + done, size)) {
- ret = -EFAULT;
- goto out;
- }
- kbuf[size] = '\0';
- buf = kbuf;
- do {
- tmp = strchr(buf, '\n');
- if (tmp) {
- *tmp = '\0';
- size = tmp - buf + 1;
- } else {
- size = strlen(buf);
- if (done + size < count) {
- if (buf != kbuf)
- break;
- /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
- pr_warn("Line length is too long: Should be less than %d\n",
- WRITE_BUFSIZE - 2);
- ret = -EINVAL;
- goto out;
- }
- }
- done += size;
-
- /* Remove comments */
- tmp = strchr(buf, '#');
-
- if (tmp)
- *tmp = '\0';
-
- ret = traceprobe_command(buf, createfn);
- if (ret)
- goto out;
- buf += size;
-
- } while (done < count);
- }
- ret = done;
-
-out:
- kfree(kbuf);
-
- return ret;
-}
-
static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
bool is_return)
{
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 903273c93e61..fb66e3eaa192 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -42,7 +42,6 @@
#define MAX_TRACE_ARGS 128
#define MAX_ARGSTR_LEN 63
-#define MAX_EVENT_NAME_LEN 64
#define MAX_STRING_SIZE PATH_MAX
/* Reserved field names */
@@ -356,12 +355,6 @@ extern void traceprobe_free_probe_arg(struct probe_arg *arg);
extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset);
-extern ssize_t traceprobe_probes_write(struct file *file,
- const char __user *buffer, size_t count, loff_t *ppos,
- int (*createfn)(int, char**));
-
-extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
-
/* Sum up total data length for dynamic arraies (strings) */
static nokprobe_inline int
__get_data_size(struct trace_probe *tp, struct pt_regs *regs)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index cd70eb5df38e..11e9daa4a568 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -60,7 +60,7 @@ static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu)
* Test the trace buffer to see if all the elements
* are still sane.
*/
-static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
+static int __maybe_unused trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
{
unsigned long flags, cnt = 0;
int cpu, ret = 0;
@@ -1151,38 +1151,6 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
}
#endif /* CONFIG_SCHED_TRACER */
-#ifdef CONFIG_CONTEXT_SWITCH_TRACER
-int
-trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr)
-{
- unsigned long count;
- int ret;
-
- /* start the tracing */
- ret = tracer_init(trace, tr);
- if (ret) {
- warn_failed_init_tracer(trace, ret);
- return ret;
- }
-
- /* Sleep for a 1/10 of a second */
- msleep(100);
- /* stop the tracing. */
- tracing_stop();
- /* check the trace buffer */
- ret = trace_test_buffer(&tr->trace_buffer, &count);
- trace->reset(tr);
- tracing_start();
-
- if (!ret && !count) {
- printk(KERN_CONT ".. no entries found ..");
- ret = -1;
- }
-
- return ret;
-}
-#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
-
#ifdef CONFIG_BRANCH_TRACER
int
trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 734accc02418..3c7bfc4bf5e9 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -209,6 +209,10 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
if (__this_cpu_read(disable_stack_tracer) != 1)
goto out;
+ /* If rcu is not watching, then save stack trace can fail */
+ if (!rcu_is_watching())
+ goto out;
+
ip += MCOUNT_INSN_SIZE;
check_stack(ip, &stack);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index a2a642f2c64f..f93a56d2db27 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -560,9 +560,10 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
-static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs,
- struct syscall_metadata *sys_data,
- struct syscall_trace_enter *rec) {
+static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs,
+ struct syscall_metadata *sys_data,
+ struct syscall_trace_enter *rec)
+{
struct syscall_tp_t {
unsigned long long regs;
unsigned long syscall_nr;
@@ -574,7 +575,7 @@ static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs,
param.syscall_nr = rec->nr;
for (i = 0; i < sys_data->nb_args; i++)
param.args[i] = rec->args[i];
- return trace_call_bpf(prog, &param);
+ return trace_call_bpf(call, &param);
}
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
@@ -582,7 +583,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
struct hlist_head *head;
- struct bpf_prog *prog;
+ bool valid_prog_array;
int syscall_nr;
int rctx;
int size;
@@ -597,9 +598,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
if (!sys_data)
return;
- prog = READ_ONCE(sys_data->enter_event->prog);
head = this_cpu_ptr(sys_data->enter_event->perf_events);
- if (!prog && hlist_empty(head))
+ valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
+ if (!valid_prog_array && hlist_empty(head))
return;
/* get the size after alignment with the u32 buffer size field */
@@ -615,7 +616,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
syscall_get_arguments(current, regs, 0, sys_data->nb_args,
(unsigned long *)&rec->args);
- if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) ||
+ if ((valid_prog_array &&
+ !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) ||
hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
@@ -623,7 +625,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
perf_trace_buf_submit(rec, size, rctx,
sys_data->enter_event->event.type, 1, regs,
- head, NULL, NULL);
+ head, NULL);
}
static int perf_sysenter_enable(struct trace_event_call *call)
@@ -660,8 +662,9 @@ static void perf_sysenter_disable(struct trace_event_call *call)
mutex_unlock(&syscall_trace_lock);
}
-static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs,
- struct syscall_trace_exit *rec) {
+static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
+ struct syscall_trace_exit *rec)
+{
struct syscall_tp_t {
unsigned long long regs;
unsigned long syscall_nr;
@@ -671,7 +674,7 @@ static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs,
*(struct pt_regs **)&param = regs;
param.syscall_nr = rec->nr;
param.ret = rec->ret;
- return trace_call_bpf(prog, &param);
+ return trace_call_bpf(call, &param);
}
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
@@ -679,7 +682,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
struct hlist_head *head;
- struct bpf_prog *prog;
+ bool valid_prog_array;
int syscall_nr;
int rctx;
int size;
@@ -694,9 +697,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
if (!sys_data)
return;
- prog = READ_ONCE(sys_data->exit_event->prog);
head = this_cpu_ptr(sys_data->exit_event->perf_events);
- if (!prog && hlist_empty(head))
+ valid_prog_array = bpf_prog_array_valid(sys_data->exit_event);
+ if (!valid_prog_array && hlist_empty(head))
return;
/* We can probably do that at build time */
@@ -710,14 +713,15 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
- if ((prog && !perf_call_bpf_exit(prog, regs, rec)) ||
+ if ((valid_prog_array &&
+ !perf_call_bpf_exit(sys_data->exit_event, regs, rec)) ||
hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
}
perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
- 1, regs, head, NULL, NULL);
+ 1, regs, head, NULL);
}
static int perf_sysexit_enable(struct trace_event_call *call)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 4525e0271a53..40592e7b3568 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -651,7 +651,7 @@ static int probes_open(struct inode *inode, struct file *file)
static ssize_t probes_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
- return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe);
+ return trace_parse_run_command(file, buffer, count, ppos, create_trace_uprobe);
}
static const struct file_operations uprobe_events_ops = {
@@ -1113,13 +1113,12 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
{
struct trace_event_call *call = &tu->tp.call;
struct uprobe_trace_entry_head *entry;
- struct bpf_prog *prog = call->prog;
struct hlist_head *head;
void *data;
int size, esize;
int rctx;
- if (prog && !trace_call_bpf(prog, regs))
+ if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
return;
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
@@ -1156,7 +1155,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
}
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
- head, NULL, NULL);
+ head, NULL);
out:
preempt_enable();
}
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 305039b122fa..07e75344725b 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -428,7 +428,8 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
if (test_key && test_key == key_hash && entry->val &&
keys_match(key, entry->val->key, map->key_size)) {
- atomic64_inc(&map->hits);
+ if (!lookup_only)
+ atomic64_inc(&map->hits);
return entry->val;
}
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
index ab0ca77331d0..5b5bbf8ae550 100644
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -6,7 +6,7 @@
#define TRACING_MAP_BITS_MAX 17
#define TRACING_MAP_BITS_MIN 7
-#define TRACING_MAP_KEYS_MAX 2
+#define TRACING_MAP_KEYS_MAX 3
#define TRACING_MAP_VALS_MAX 3
#define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \
TRACING_MAP_VALS_MAX)
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 685c50ae6300..671b13457387 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -212,11 +212,10 @@ static int tracepoint_add_func(struct tracepoint *tp,
}
/*
- * rcu_assign_pointer has a smp_wmb() which makes sure that the new
- * probe callbacks array is consistent before setting a pointer to it.
- * This array is referenced by __DO_TRACE from
- * include/linux/tracepoints.h. A matching smp_read_barrier_depends()
- * is used.
+ * rcu_assign_pointer has as smp_store_release() which makes sure
+ * that the new probe callbacks array is consistent before setting
+ * a pointer to it. This array is referenced by __DO_TRACE from
+ * include/linux/tracepoint.h using rcu_dereference_sched().
*/
rcu_assign_pointer(tp->funcs, tp_funcs);
if (!static_key_enabled(&tp->key))
diff --git a/kernel/uid16.c b/kernel/uid16.c
index ce74a4901d2b..ef1da2a5f9bd 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -192,6 +192,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
return retval;
}
+ groups_sort(group_info);
retval = set_current_groups(group_info);
put_group_info(group_info);
diff --git a/kernel/umh.c b/kernel/umh.c
index 6ff9905250ff..18e5fa4b0e71 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -537,14 +537,14 @@ static int proc_cap_handler(struct ctl_table *table, int write,
/*
* Drop everything not in the new_cap (but don't add things)
*/
- spin_lock(&umh_sysctl_lock);
if (write) {
+ spin_lock(&umh_sysctl_lock);
if (table->data == CAP_BSET)
usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
if (table->data == CAP_PI)
usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
+ spin_unlock(&umh_sysctl_lock);
}
- spin_unlock(&umh_sysctl_lock);
return 0;
}
diff --git a/kernel/user.c b/kernel/user.c
index 00281add65b2..9a20acce460d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -26,26 +26,32 @@
struct user_namespace init_user_ns = {
.uid_map = {
.nr_extents = 1,
- .extent[0] = {
- .first = 0,
- .lower_first = 0,
- .count = 4294967295U,
+ {
+ .extent[0] = {
+ .first = 0,
+ .lower_first = 0,
+ .count = 4294967295U,
+ },
},
},
.gid_map = {
.nr_extents = 1,
- .extent[0] = {
- .first = 0,
- .lower_first = 0,
- .count = 4294967295U,
+ {
+ .extent[0] = {
+ .first = 0,
+ .lower_first = 0,
+ .count = 4294967295U,
+ },
},
},
.projid_map = {
.nr_extents = 1,
- .extent[0] = {
- .first = 0,
- .lower_first = 0,
- .count = 4294967295U,
+ {
+ .extent[0] = {
+ .first = 0,
+ .lower_first = 0,
+ .count = 4294967295U,
+ },
},
},
.count = ATOMIC_INIT(3),
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index d32b45662fb6..246d4d4ce5c7 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -23,6 +23,8 @@
#include <linux/ctype.h>
#include <linux/projid.h>
#include <linux/fs_struct.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
static struct kmem_cache *user_ns_cachep __read_mostly;
static DEFINE_MUTEX(userns_state_mutex);
@@ -181,6 +183,18 @@ static void free_user_ns(struct work_struct *work)
do {
struct ucounts *ucounts = ns->ucounts;
parent = ns->parent;
+ if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+ kfree(ns->gid_map.forward);
+ kfree(ns->gid_map.reverse);
+ }
+ if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+ kfree(ns->uid_map.forward);
+ kfree(ns->uid_map.reverse);
+ }
+ if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+ kfree(ns->projid_map.forward);
+ kfree(ns->projid_map.reverse);
+ }
retire_userns_sysctls(ns);
#ifdef CONFIG_PERSISTENT_KEYRINGS
key_put(ns->persistent_keyring_register);
@@ -198,26 +212,101 @@ void __put_user_ns(struct user_namespace *ns)
}
EXPORT_SYMBOL(__put_user_ns);
-static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
+/**
+ * idmap_key struct holds the information necessary to find an idmapping in a
+ * sorted idmap array. It is passed to cmp_map_id() as first argument.
+ */
+struct idmap_key {
+ bool map_up; /* true -> id from kid; false -> kid from id */
+ u32 id; /* id to find */
+ u32 count; /* == 0 unless used with map_id_range_down() */
+};
+
+/**
+ * cmp_map_id - Function to be passed to bsearch() to find the requested
+ * idmapping. Expects struct idmap_key to be passed via @k.
+ */
+static int cmp_map_id(const void *k, const void *e)
{
- unsigned idx, extents;
+ u32 first, last, id2;
+ const struct idmap_key *key = k;
+ const struct uid_gid_extent *el = e;
+
+ id2 = key->id + key->count - 1;
+
+ /* handle map_id_{down,up}() */
+ if (key->map_up)
+ first = el->lower_first;
+ else
+ first = el->first;
+
+ last = first + el->count - 1;
+
+ if (key->id >= first && key->id <= last &&
+ (id2 >= first && id2 <= last))
+ return 0;
+
+ if (key->id < first || id2 < first)
+ return -1;
+
+ return 1;
+}
+
+/**
+ * map_id_range_down_max - Find idmap via binary search in ordered idmap array.
+ * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static struct uid_gid_extent *
+map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
+{
+ struct idmap_key key;
+
+ key.map_up = false;
+ key.count = count;
+ key.id = id;
+
+ return bsearch(&key, map->forward, extents,
+ sizeof(struct uid_gid_extent), cmp_map_id);
+}
+
+/**
+ * map_id_range_down_base - Find idmap via binary search in static extent array.
+ * Can only be called if number of mappings is equal or less than
+ * UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static struct uid_gid_extent *
+map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
+{
+ unsigned idx;
u32 first, last, id2;
id2 = id + count - 1;
/* Find the matching extent */
- extents = map->nr_extents;
- smp_rmb();
for (idx = 0; idx < extents; idx++) {
first = map->extent[idx].first;
last = first + map->extent[idx].count - 1;
if (id >= first && id <= last &&
(id2 >= first && id2 <= last))
- break;
+ return &map->extent[idx];
}
+ return NULL;
+}
+
+static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
+{
+ struct uid_gid_extent *extent;
+ unsigned extents = map->nr_extents;
+ smp_rmb();
+
+ if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+ extent = map_id_range_down_base(extents, map, id, count);
+ else
+ extent = map_id_range_down_max(extents, map, id, count);
+
/* Map the id or note failure */
- if (idx < extents)
- id = (id - first) + map->extent[idx].lower_first;
+ if (extent)
+ id = (id - extent->first) + extent->lower_first;
else
id = (u32) -1;
@@ -226,44 +315,61 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
static u32 map_id_down(struct uid_gid_map *map, u32 id)
{
- unsigned idx, extents;
+ return map_id_range_down(map, id, 1);
+}
+
+/**
+ * map_id_up_base - Find idmap via binary search in static extent array.
+ * Can only be called if number of mappings is equal or less than
+ * UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static struct uid_gid_extent *
+map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id)
+{
+ unsigned idx;
u32 first, last;
/* Find the matching extent */
- extents = map->nr_extents;
- smp_rmb();
for (idx = 0; idx < extents; idx++) {
- first = map->extent[idx].first;
+ first = map->extent[idx].lower_first;
last = first + map->extent[idx].count - 1;
if (id >= first && id <= last)
- break;
+ return &map->extent[idx];
}
- /* Map the id or note failure */
- if (idx < extents)
- id = (id - first) + map->extent[idx].lower_first;
- else
- id = (u32) -1;
+ return NULL;
+}
- return id;
+/**
+ * map_id_up_max - Find idmap via binary search in ordered idmap array.
+ * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static struct uid_gid_extent *
+map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id)
+{
+ struct idmap_key key;
+
+ key.map_up = true;
+ key.count = 1;
+ key.id = id;
+
+ return bsearch(&key, map->reverse, extents,
+ sizeof(struct uid_gid_extent), cmp_map_id);
}
static u32 map_id_up(struct uid_gid_map *map, u32 id)
{
- unsigned idx, extents;
- u32 first, last;
-
- /* Find the matching extent */
- extents = map->nr_extents;
+ struct uid_gid_extent *extent;
+ unsigned extents = map->nr_extents;
smp_rmb();
- for (idx = 0; idx < extents; idx++) {
- first = map->extent[idx].lower_first;
- last = first + map->extent[idx].count - 1;
- if (id >= first && id <= last)
- break;
- }
+
+ if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+ extent = map_id_up_base(extents, map, id);
+ else
+ extent = map_id_up_max(extents, map, id);
+
/* Map the id or note failure */
- if (idx < extents)
- id = (id - first) + map->extent[idx].first;
+ if (extent)
+ id = (id - extent->lower_first) + extent->first;
else
id = (u32) -1;
@@ -540,13 +646,17 @@ static int projid_m_show(struct seq_file *seq, void *v)
static void *m_start(struct seq_file *seq, loff_t *ppos,
struct uid_gid_map *map)
{
- struct uid_gid_extent *extent = NULL;
loff_t pos = *ppos;
+ unsigned extents = map->nr_extents;
+ smp_rmb();
- if (pos < map->nr_extents)
- extent = &map->extent[pos];
+ if (pos >= extents)
+ return NULL;
- return extent;
+ if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+ return &map->extent[pos];
+
+ return &map->forward[pos];
}
static void *uid_m_start(struct seq_file *seq, loff_t *ppos)
@@ -618,7 +728,10 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
u32 prev_upper_last, prev_lower_last;
struct uid_gid_extent *prev;
- prev = &new_map->extent[idx];
+ if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+ prev = &new_map->extent[idx];
+ else
+ prev = &new_map->forward[idx];
prev_upper_first = prev->first;
prev_lower_first = prev->lower_first;
@@ -638,6 +751,101 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
return false;
}
+/**
+ * insert_extent - Safely insert a new idmap extent into struct uid_gid_map.
+ * Takes care to allocate a 4K block of memory if the number of mappings exceeds
+ * UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent)
+{
+ struct uid_gid_extent *dest;
+
+ if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) {
+ struct uid_gid_extent *forward;
+
+ /* Allocate memory for 340 mappings. */
+ forward = kmalloc(sizeof(struct uid_gid_extent) *
+ UID_GID_MAP_MAX_EXTENTS, GFP_KERNEL);
+ if (!forward)
+ return -ENOMEM;
+
+ /* Copy over memory. Only set up memory for the forward pointer.
+ * Defer the memory setup for the reverse pointer.
+ */
+ memcpy(forward, map->extent,
+ map->nr_extents * sizeof(map->extent[0]));
+
+ map->forward = forward;
+ map->reverse = NULL;
+ }
+
+ if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS)
+ dest = &map->extent[map->nr_extents];
+ else
+ dest = &map->forward[map->nr_extents];
+
+ *dest = *extent;
+ map->nr_extents++;
+ return 0;
+}
+
+/* cmp function to sort() forward mappings */
+static int cmp_extents_forward(const void *a, const void *b)
+{
+ const struct uid_gid_extent *e1 = a;
+ const struct uid_gid_extent *e2 = b;
+
+ if (e1->first < e2->first)
+ return -1;
+
+ if (e1->first > e2->first)
+ return 1;
+
+ return 0;
+}
+
+/* cmp function to sort() reverse mappings */
+static int cmp_extents_reverse(const void *a, const void *b)
+{
+ const struct uid_gid_extent *e1 = a;
+ const struct uid_gid_extent *e2 = b;
+
+ if (e1->lower_first < e2->lower_first)
+ return -1;
+
+ if (e1->lower_first > e2->lower_first)
+ return 1;
+
+ return 0;
+}
+
+/**
+ * sort_idmaps - Sorts an array of idmap entries.
+ * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static int sort_idmaps(struct uid_gid_map *map)
+{
+ if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+ return 0;
+
+ /* Sort forward array. */
+ sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent),
+ cmp_extents_forward, NULL);
+
+ /* Only copy the memory from forward we actually need. */
+ map->reverse = kmemdup(map->forward,
+ map->nr_extents * sizeof(struct uid_gid_extent),
+ GFP_KERNEL);
+ if (!map->reverse)
+ return -ENOMEM;
+
+ /* Sort reverse array. */
+ sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent),
+ cmp_extents_reverse, NULL);
+
+ return 0;
+}
+
static ssize_t map_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos,
int cap_setid,
@@ -648,7 +856,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
struct user_namespace *ns = seq->private;
struct uid_gid_map new_map;
unsigned idx;
- struct uid_gid_extent *extent = NULL;
+ struct uid_gid_extent extent;
char *kbuf = NULL, *pos, *next_line;
ssize_t ret = -EINVAL;
@@ -673,6 +881,8 @@ static ssize_t map_write(struct file *file, const char __user *buf,
*/
mutex_lock(&userns_state_mutex);
+ memset(&new_map, 0, sizeof(struct uid_gid_map));
+
ret = -EPERM;
/* Only allow one successful write to the map */
if (map->nr_extents != 0)
@@ -700,9 +910,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
/* Parse the user data */
ret = -EINVAL;
pos = kbuf;
- new_map.nr_extents = 0;
for (; pos; pos = next_line) {
- extent = &new_map.extent[new_map.nr_extents];
/* Find the end of line and ensure I don't look past it */
next_line = strchr(pos, '\n');
@@ -714,17 +922,17 @@ static ssize_t map_write(struct file *file, const char __user *buf,
}
pos = skip_spaces(pos);
- extent->first = simple_strtoul(pos, &pos, 10);
+ extent.first = simple_strtoul(pos, &pos, 10);
if (!isspace(*pos))
goto out;
pos = skip_spaces(pos);
- extent->lower_first = simple_strtoul(pos, &pos, 10);
+ extent.lower_first = simple_strtoul(pos, &pos, 10);
if (!isspace(*pos))
goto out;
pos = skip_spaces(pos);
- extent->count = simple_strtoul(pos, &pos, 10);
+ extent.count = simple_strtoul(pos, &pos, 10);
if (*pos && !isspace(*pos))
goto out;
@@ -734,29 +942,31 @@ static ssize_t map_write(struct file *file, const char __user *buf,
goto out;
/* Verify we have been given valid starting values */
- if ((extent->first == (u32) -1) ||
- (extent->lower_first == (u32) -1))
+ if ((extent.first == (u32) -1) ||
+ (extent.lower_first == (u32) -1))
goto out;
/* Verify count is not zero and does not cause the
* extent to wrap
*/
- if ((extent->first + extent->count) <= extent->first)
+ if ((extent.first + extent.count) <= extent.first)
goto out;
- if ((extent->lower_first + extent->count) <=
- extent->lower_first)
+ if ((extent.lower_first + extent.count) <=
+ extent.lower_first)
goto out;
/* Do the ranges in extent overlap any previous extents? */
- if (mappings_overlap(&new_map, extent))
+ if (mappings_overlap(&new_map, &extent))
goto out;
- new_map.nr_extents++;
-
- /* Fail if the file contains too many extents */
- if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) &&
+ if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS &&
(next_line != NULL))
goto out;
+
+ ret = insert_extent(&new_map, &extent);
+ if (ret < 0)
+ goto out;
+ ret = -EINVAL;
}
/* Be very certaint the new map actually exists */
if (new_map.nr_extents == 0)
@@ -767,16 +977,26 @@ static ssize_t map_write(struct file *file, const char __user *buf,
if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
goto out;
+ ret = sort_idmaps(&new_map);
+ if (ret < 0)
+ goto out;
+
+ ret = -EPERM;
/* Map the lower ids from the parent user namespace to the
* kernel global id space.
*/
for (idx = 0; idx < new_map.nr_extents; idx++) {
+ struct uid_gid_extent *e;
u32 lower_first;
- extent = &new_map.extent[idx];
+
+ if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+ e = &new_map.extent[idx];
+ else
+ e = &new_map.forward[idx];
lower_first = map_id_range_down(parent_map,
- extent->lower_first,
- extent->count);
+ e->lower_first,
+ e->count);
/* Fail if we can not map the specified extent to
* the kernel global id space.
@@ -784,18 +1004,31 @@ static ssize_t map_write(struct file *file, const char __user *buf,
if (lower_first == (u32) -1)
goto out;
- extent->lower_first = lower_first;
+ e->lower_first = lower_first;
}
/* Install the map */
- memcpy(map->extent, new_map.extent,
- new_map.nr_extents*sizeof(new_map.extent[0]));
+ if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
+ memcpy(map->extent, new_map.extent,
+ new_map.nr_extents * sizeof(new_map.extent[0]));
+ } else {
+ map->forward = new_map.forward;
+ map->reverse = new_map.reverse;
+ }
smp_wmb();
map->nr_extents = new_map.nr_extents;
*ppos = count;
ret = count;
out:
+ if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+ kfree(new_map.forward);
+ kfree(new_map.reverse);
+ map->forward = NULL;
+ map->reverse = NULL;
+ map->nr_extents = 0;
+ }
+
mutex_unlock(&userns_state_mutex);
kfree(kbuf);
return ret;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 7368b57842ea..8dd2e66e8383 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -38,7 +38,6 @@
#include <linux/hardirq.h>
#include <linux/mempolicy.h>
#include <linux/freezer.h>
-#include <linux/kallsyms.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
#include <linux/idr.h>
@@ -48,6 +47,8 @@
#include <linux/nodemask.h>
#include <linux/moduleparam.h>
#include <linux/uaccess.h>
+#include <linux/sched/isolation.h>
+#include <linux/nmi.h>
#include "workqueue_internal.h"
@@ -1509,7 +1510,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
struct work_struct *work = &dwork->work;
WARN_ON_ONCE(!wq);
- WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)delayed_work_timer_fn);
+ WARN_ON_ONCE(timer->function != delayed_work_timer_fn);
WARN_ON_ONCE(timer_pending(timer));
WARN_ON_ONCE(!list_empty(&work->entry));
@@ -1634,7 +1635,7 @@ static void worker_enter_idle(struct worker *worker)
mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
/*
- * Sanity check nr_running. Because wq_unbind_fn() releases
+ * Sanity check nr_running. Because unbind_workers() releases
* pool->lock between setting %WORKER_UNBOUND and zapping
* nr_running, the warning may trigger spuriously. Check iff
* unbind is not in progress.
@@ -2135,7 +2136,7 @@ __acquires(&pool->lock)
* stop_machine. At the same time, report a quiescent RCU state so
* the same condition doesn't freeze RCU.
*/
- cond_resched_rcu_qs();
+ cond_resched();
spin_lock_irq(&pool->lock);
@@ -3939,6 +3940,37 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
return clamp_val(max_active, 1, lim);
}
+/*
+ * Workqueues which may be used during memory reclaim should have a rescuer
+ * to guarantee forward progress.
+ */
+static int init_rescuer(struct workqueue_struct *wq)
+{
+ struct worker *rescuer;
+ int ret;
+
+ if (!(wq->flags & WQ_MEM_RECLAIM))
+ return 0;
+
+ rescuer = alloc_worker(NUMA_NO_NODE);
+ if (!rescuer)
+ return -ENOMEM;
+
+ rescuer->rescue_wq = wq;
+ rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name);
+ ret = PTR_ERR_OR_ZERO(rescuer->task);
+ if (ret) {
+ kfree(rescuer);
+ return ret;
+ }
+
+ wq->rescuer = rescuer;
+ kthread_bind_mask(rescuer->task, cpu_possible_mask);
+ wake_up_process(rescuer->task);
+
+ return 0;
+}
+
struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
unsigned int flags,
int max_active,
@@ -4001,29 +4033,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
if (alloc_and_link_pwqs(wq) < 0)
goto err_free_wq;
- /*
- * Workqueues which may be used during memory reclaim should
- * have a rescuer to guarantee forward progress.
- */
- if (flags & WQ_MEM_RECLAIM) {
- struct worker *rescuer;
-
- rescuer = alloc_worker(NUMA_NO_NODE);
- if (!rescuer)
- goto err_destroy;
-
- rescuer->rescue_wq = wq;
- rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
- wq->name);
- if (IS_ERR(rescuer->task)) {
- kfree(rescuer);
- goto err_destroy;
- }
-
- wq->rescuer = rescuer;
- kthread_bind_mask(rescuer->task, cpu_possible_mask);
- wake_up_process(rescuer->task);
- }
+ if (wq_online && init_rescuer(wq) < 0)
+ goto err_destroy;
if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
goto err_destroy;
@@ -4463,6 +4474,12 @@ void show_workqueue_state(void)
if (pwq->nr_active || !list_empty(&pwq->delayed_works))
show_pwq(pwq);
spin_unlock_irqrestore(&pwq->pool->lock, flags);
+ /*
+ * We could be printing a lot from atomic context, e.g.
+ * sysrq-t -> show_workqueue_state(). Avoid triggering
+ * hard lockup.
+ */
+ touch_nmi_watchdog();
}
}
@@ -4490,6 +4507,12 @@ void show_workqueue_state(void)
pr_cont("\n");
next_pool:
spin_unlock_irqrestore(&pool->lock, flags);
+ /*
+ * We could be printing a lot from atomic context, e.g.
+ * sysrq-t -> show_workqueue_state(). Avoid triggering
+ * hard lockup.
+ */
+ touch_nmi_watchdog();
}
rcu_read_unlock_sched();
@@ -4510,9 +4533,8 @@ void show_workqueue_state(void)
* cpu comes back online.
*/
-static void wq_unbind_fn(struct work_struct *work)
+static void unbind_workers(int cpu)
{
- int cpu = smp_processor_id();
struct worker_pool *pool;
struct worker *worker;
@@ -4589,16 +4611,6 @@ static void rebind_workers(struct worker_pool *pool)
spin_lock_irq(&pool->lock);
- /*
- * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED
- * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is
- * being reworked and this can go away in time.
- */
- if (!(pool->flags & POOL_DISASSOCIATED)) {
- spin_unlock_irq(&pool->lock);
- return;
- }
-
pool->flags &= ~POOL_DISASSOCIATED;
for_each_pool_worker(worker, pool) {
@@ -4709,12 +4721,13 @@ int workqueue_online_cpu(unsigned int cpu)
int workqueue_offline_cpu(unsigned int cpu)
{
- struct work_struct unbind_work;
struct workqueue_struct *wq;
/* unbinding per-cpu workers should happen on the local CPU */
- INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
- queue_work_on(cpu, system_highpri_wq, &unbind_work);
+ if (WARN_ON(cpu != smp_processor_id()))
+ return -1;
+
+ unbind_workers(cpu);
/* update NUMA affinity of unbound workqueues */
mutex_lock(&wq_pool_mutex);
@@ -4722,9 +4735,6 @@ int workqueue_offline_cpu(unsigned int cpu)
wq_update_unbound_numa(wq, cpu, false);
mutex_unlock(&wq_pool_mutex);
- /* wait for per-cpu unbinding to finish */
- flush_work(&unbind_work);
- destroy_work_on_stack(&unbind_work);
return 0;
}
@@ -4957,6 +4967,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL))
return -ENOMEM;
+ /*
+ * Not excluding isolated cpus on purpose.
+ * If the user wishes to include them, we allow that.
+ */
cpumask_and(cpumask, cpumask, cpu_possible_mask);
if (!cpumask_empty(cpumask)) {
apply_wqattrs_lock();
@@ -4990,9 +5004,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
*
* Unbound workqueues have the following extra attributes.
*
- * id RO int : the associated pool ID
+ * pool_ids RO int : the associated pool IDs for each node
* nice RW int : nice value of the workers
* cpumask RW mask : bitmask of allowed CPUs for the workers
+ * numa RW bool : whether enable NUMA affinity
*/
struct wq_device {
struct workqueue_struct *wq;
@@ -5554,7 +5569,7 @@ int __init workqueue_init_early(void)
WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
- cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
+ cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN));
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
@@ -5637,6 +5652,8 @@ int __init workqueue_init(void)
* archs such as power and arm64. As per-cpu pools created
* previously could be missing node hint and unbound pools NUMA
* affinity, fix them up.
+ *
+ * Also, while iterating workqueues, create rescuers if requested.
*/
wq_numa_init();
@@ -5648,8 +5665,12 @@ int __init workqueue_init(void)
}
}
- list_for_each_entry(wq, &workqueues, list)
+ list_for_each_entry(wq, &workqueues, list) {
wq_update_unbound_numa(wq, smp_processor_id(), true);
+ WARN(init_rescuer(wq),
+ "workqueue: failed to create early rescuer for %s",
+ wq->name);
+ }
mutex_unlock(&wq_pool_mutex);