summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/arraymap.c18
-rw-r--r--kernel/bpf/core.c4
-rw-r--r--kernel/bpf/verifier.c71
-rw-r--r--kernel/cgroup/cgroup.c1
-rw-r--r--kernel/delayacct.c42
-rw-r--r--kernel/events/core.c47
-rw-r--r--kernel/futex.c92
-rw-r--r--kernel/irq/matrix.c20
-rw-r--r--kernel/jump_label.c12
-rw-r--r--kernel/locking/lockdep.c2
-rw-r--r--kernel/locking/rtmutex.c26
-rw-r--r--kernel/locking/rtmutex_common.h1
-rw-r--r--kernel/pid.c14
-rw-r--r--kernel/sched/core.c6
-rw-r--r--kernel/sched/fair.c4
-rw-r--r--kernel/time/hrtimer.c3
-rw-r--r--kernel/time/timer.c2
-rw-r--r--kernel/trace/Kconfig2
-rw-r--r--kernel/trace/ftrace.c29
-rw-r--r--kernel/trace/ring_buffer.c61
-rw-r--r--kernel/trace/trace.c34
-rw-r--r--kernel/trace/trace_events.c16
-rw-r--r--kernel/trace/trace_events_trigger.c13
-rw-r--r--kernel/trace/trace_functions.c49
-rw-r--r--kernel/workqueue.c13
25 files changed, 442 insertions, 140 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index aaa319848e7d..ab94d304a634 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -56,7 +56,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
u32 elem_size, index_mask, max_entries;
bool unpriv = !capable(CAP_SYS_ADMIN);
struct bpf_array *array;
- u64 array_size;
+ u64 array_size, mask64;
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -74,13 +74,25 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
elem_size = round_up(attr->value_size, 8);
max_entries = attr->max_entries;
- index_mask = roundup_pow_of_two(max_entries) - 1;
- if (unpriv)
+ /* On 32 bit archs roundup_pow_of_two() with max_entries that has
+ * upper most bit set in u32 space is undefined behavior due to
+ * resulting 1U << 32, so do it manually here in u64 space.
+ */
+ mask64 = fls_long(max_entries - 1);
+ mask64 = 1ULL << mask64;
+ mask64 -= 1;
+
+ index_mask = mask64;
+ if (unpriv) {
/* round up array size to nearest power of 2,
* since cpu will speculate within index_mask limits
*/
max_entries = index_mask + 1;
+ /* Check for overflows. */
+ if (max_entries < attr->max_entries)
+ return ERR_PTR(-E2BIG);
+ }
array_size = sizeof(*array);
if (percpu)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 51ec2dda7f08..7949e8b8f94e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -956,7 +956,7 @@ select_insn:
DST = tmp;
CONT;
ALU_MOD_X:
- if (unlikely(SRC == 0))
+ if (unlikely((u32)SRC == 0))
return 0;
tmp = (u32) DST;
DST = do_div(tmp, (u32) SRC);
@@ -975,7 +975,7 @@ select_insn:
DST = div64_u64(DST, SRC);
CONT;
ALU_DIV_X:
- if (unlikely(SRC == 0))
+ if (unlikely((u32)SRC == 0))
return 0;
tmp = (u32) DST;
do_div(tmp, (u32) SRC);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b414d6b2d470..13551e623501 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -978,6 +978,13 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno);
}
+static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
+{
+ const struct bpf_reg_state *reg = cur_regs(env) + regno;
+
+ return reg->type == PTR_TO_CTX;
+}
+
static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
int off, int size, bool strict)
@@ -1258,6 +1265,12 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
return -EACCES;
}
+ if (is_ctx_reg(env, insn->dst_reg)) {
+ verbose(env, "BPF_XADD stores into R%d context is not allowed\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
+
/* check whether atomic_add can read the memory */
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ, -1);
@@ -1882,17 +1895,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst_reg = &regs[dst];
- if (WARN_ON_ONCE(known && (smin_val != smax_val))) {
- print_verifier_state(env, env->cur_state);
- verbose(env,
- "verifier internal error: known but bad sbounds\n");
- return -EINVAL;
- }
- if (WARN_ON_ONCE(known && (umin_val != umax_val))) {
- print_verifier_state(env, env->cur_state);
- verbose(env,
- "verifier internal error: known but bad ubounds\n");
- return -EINVAL;
+ if ((known && (smin_val != smax_val || umin_val != umax_val)) ||
+ smin_val > smax_val || umin_val > umax_val) {
+ /* Taint dst register if offset had invalid bounds derived from
+ * e.g. dead branches.
+ */
+ __mark_reg_unknown(dst_reg);
+ return 0;
}
if (BPF_CLASS(insn->code) != BPF_ALU64) {
@@ -2084,6 +2093,15 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
src_known = tnum_is_const(src_reg.var_off);
dst_known = tnum_is_const(dst_reg->var_off);
+ if ((src_known && (smin_val != smax_val || umin_val != umax_val)) ||
+ smin_val > smax_val || umin_val > umax_val) {
+ /* Taint dst register if offset had invalid bounds derived from
+ * e.g. dead branches.
+ */
+ __mark_reg_unknown(dst_reg);
+ return 0;
+ }
+
if (!src_known &&
opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) {
__mark_reg_unknown(dst_reg);
@@ -2493,6 +2511,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
+ if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) {
+ verbose(env, "BPF_ARSH not supported for 32 bit ALU\n");
+ return -EINVAL;
+ }
+
if ((opcode == BPF_LSH || opcode == BPF_RSH ||
opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
@@ -3988,6 +4011,12 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
+ if (is_ctx_reg(env, insn->dst_reg)) {
+ verbose(env, "BPF_ST stores into R%d context is not allowed\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
+
/* check that memory (dst_reg + off) is writeable */
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE,
@@ -4440,6 +4469,24 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
int i, cnt, delta = 0;
for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
+ insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
+ /* due to JIT bugs clear upper 32-bits of src register
+ * before div/mod operation
+ */
+ insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg);
+ insn_buf[1] = *insn;
+ cnt = 2;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
if (insn->code != (BPF_JMP | BPF_CALL))
continue;
@@ -4472,7 +4519,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
*/
map_ptr = env->insn_aux_data[i + delta].map_ptr;
if (map_ptr == BPF_MAP_PTR_POISON) {
- verbose(env, "tail_call obusing map_ptr\n");
+ verbose(env, "tail_call abusing map_ptr\n");
return -EINVAL;
}
if (!map_ptr->unpriv_array)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 2cf06c274e4c..7e4c44538119 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4447,6 +4447,7 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "cgroup.threads",
+ .flags = CFTYPE_NS_DELEGATABLE,
.release = cgroup_procs_release,
.seq_start = cgroup_threads_start,
.seq_next = cgroup_procs_next,
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 4a1c33416b6a..e2764d767f18 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -51,16 +51,16 @@ void __delayacct_tsk_init(struct task_struct *tsk)
* Finish delay accounting for a statistic using its timestamps (@start),
* accumalator (@total) and @count
*/
-static void delayacct_end(u64 *start, u64 *total, u32 *count)
+static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count)
{
s64 ns = ktime_get_ns() - *start;
unsigned long flags;
if (ns > 0) {
- spin_lock_irqsave(&current->delays->lock, flags);
+ spin_lock_irqsave(lock, flags);
*total += ns;
(*count)++;
- spin_unlock_irqrestore(&current->delays->lock, flags);
+ spin_unlock_irqrestore(lock, flags);
}
}
@@ -69,17 +69,25 @@ void __delayacct_blkio_start(void)
current->delays->blkio_start = ktime_get_ns();
}
-void __delayacct_blkio_end(void)
+/*
+ * We cannot rely on the `current` macro, as we haven't yet switched back to
+ * the process being woken.
+ */
+void __delayacct_blkio_end(struct task_struct *p)
{
- if (current->delays->flags & DELAYACCT_PF_SWAPIN)
- /* Swapin block I/O */
- delayacct_end(&current->delays->blkio_start,
- &current->delays->swapin_delay,
- &current->delays->swapin_count);
- else /* Other block I/O */
- delayacct_end(&current->delays->blkio_start,
- &current->delays->blkio_delay,
- &current->delays->blkio_count);
+ struct task_delay_info *delays = p->delays;
+ u64 *total;
+ u32 *count;
+
+ if (p->delays->flags & DELAYACCT_PF_SWAPIN) {
+ total = &delays->swapin_delay;
+ count = &delays->swapin_count;
+ } else {
+ total = &delays->blkio_delay;
+ count = &delays->blkio_count;
+ }
+
+ delayacct_end(&delays->lock, &delays->blkio_start, total, count);
}
int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
@@ -153,8 +161,10 @@ void __delayacct_freepages_start(void)
void __delayacct_freepages_end(void)
{
- delayacct_end(&current->delays->freepages_start,
- &current->delays->freepages_delay,
- &current->delays->freepages_count);
+ delayacct_end(
+ &current->delays->lock,
+ &current->delays->freepages_start,
+ &current->delays->freepages_delay,
+ &current->delays->freepages_count);
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4df5b695bf0d..5d8f4031f8d5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1231,6 +1231,10 @@ static void put_ctx(struct perf_event_context *ctx)
* perf_event_context::lock
* perf_event::mmap_mutex
* mmap_sem
+ *
+ * cpu_hotplug_lock
+ * pmus_lock
+ * cpuctx->mutex / perf_event_context::mutex
*/
static struct perf_event_context *
perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
@@ -4196,6 +4200,7 @@ int perf_event_release_kernel(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
struct perf_event *child, *tmp;
+ LIST_HEAD(free_list);
/*
* If we got here through err_file: fput(event_file); we will not have
@@ -4268,8 +4273,7 @@ again:
struct perf_event, child_list);
if (tmp == child) {
perf_remove_from_context(child, DETACH_GROUP);
- list_del(&child->child_list);
- free_event(child);
+ list_move(&child->child_list, &free_list);
/*
* This matches the refcount bump in inherit_event();
* this can't be the last reference.
@@ -4284,6 +4288,11 @@ again:
}
mutex_unlock(&event->child_mutex);
+ list_for_each_entry_safe(child, tmp, &free_list, child_list) {
+ list_del(&child->child_list);
+ free_event(child);
+ }
+
no_ctx:
put_event(event); /* Must be the 'last' reference */
return 0;
@@ -8516,6 +8525,29 @@ fail_clear_files:
return ret;
}
+static int
+perf_tracepoint_set_filter(struct perf_event *event, char *filter_str)
+{
+ struct perf_event_context *ctx = event->ctx;
+ int ret;
+
+ /*
+ * Beware, here be dragons!!
+ *
+ * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint
+ * stuff does not actually need it. So temporarily drop ctx->mutex. As per
+ * perf_event_ctx_lock() we already have a reference on ctx.
+ *
+ * This can result in event getting moved to a different ctx, but that
+ * does not affect the tracepoint state.
+ */
+ mutex_unlock(&ctx->mutex);
+ ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+ mutex_lock(&ctx->mutex);
+
+ return ret;
+}
+
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
char *filter_str;
@@ -8532,8 +8564,7 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg)
if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
event->attr.type == PERF_TYPE_TRACEPOINT)
- ret = ftrace_profile_set_filter(event, event->attr.config,
- filter_str);
+ ret = perf_tracepoint_set_filter(event, filter_str);
else if (has_addr_filter(event))
ret = perf_event_set_addr_filter(event, filter_str);
@@ -9168,7 +9199,13 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
if (!try_module_get(pmu->module))
return -ENODEV;
- if (event->group_leader != event) {
+ /*
+ * A number of pmu->event_init() methods iterate the sibling_list to,
+ * for example, validate if the group fits on the PMU. Therefore,
+ * if this is a sibling event, acquire the ctx->mutex to protect
+ * the sibling_list.
+ */
+ if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
/*
* This ctx->mutex can nest when we're called through
* inheritance. See the perf_event_ctx_lock_nested() comment.
diff --git a/kernel/futex.c b/kernel/futex.c
index 57d0b3657e16..7f719d110908 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1878,6 +1878,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
struct futex_q *this, *next;
DEFINE_WAKE_Q(wake_q);
+ if (nr_wake < 0 || nr_requeue < 0)
+ return -EINVAL;
+
/*
* When PI not supported: return -ENOSYS if requeue_pi is true,
* consequently the compiler knows requeue_pi is always false past
@@ -2294,34 +2297,33 @@ static void unqueue_me_pi(struct futex_q *q)
spin_unlock(q->lock_ptr);
}
-/*
- * Fixup the pi_state owner with the new owner.
- *
- * Must be called with hash bucket lock held and mm->sem held for non
- * private futexes.
- */
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
- struct task_struct *newowner)
+ struct task_struct *argowner)
{
- u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
u32 uval, uninitialized_var(curval), newval;
- struct task_struct *oldowner;
+ struct task_struct *oldowner, *newowner;
+ u32 newtid;
int ret;
+ lockdep_assert_held(q->lock_ptr);
+
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
oldowner = pi_state->owner;
- /* Owner died? */
- if (!pi_state->owner)
- newtid |= FUTEX_OWNER_DIED;
/*
- * We are here either because we stole the rtmutex from the
- * previous highest priority waiter or we are the highest priority
- * waiter but have failed to get the rtmutex the first time.
+ * We are here because either:
+ *
+ * - we stole the lock and pi_state->owner needs updating to reflect
+ * that (@argowner == current),
+ *
+ * or:
*
- * We have to replace the newowner TID in the user space variable.
+ * - someone stole our lock and we need to fix things to point to the
+ * new owner (@argowner == NULL).
+ *
+ * Either way, we have to replace the TID in the user space variable.
* This must be atomic as we have to preserve the owner died bit here.
*
* Note: We write the user space value _before_ changing the pi_state
@@ -2334,6 +2336,45 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
* in the PID check in lookup_pi_state.
*/
retry:
+ if (!argowner) {
+ if (oldowner != current) {
+ /*
+ * We raced against a concurrent self; things are
+ * already fixed up. Nothing to do.
+ */
+ ret = 0;
+ goto out_unlock;
+ }
+
+ if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
+ /* We got the lock after all, nothing to fix. */
+ ret = 0;
+ goto out_unlock;
+ }
+
+ /*
+ * Since we just failed the trylock; there must be an owner.
+ */
+ newowner = rt_mutex_owner(&pi_state->pi_mutex);
+ BUG_ON(!newowner);
+ } else {
+ WARN_ON_ONCE(argowner != current);
+ if (oldowner == current) {
+ /*
+ * We raced against a concurrent self; things are
+ * already fixed up. Nothing to do.
+ */
+ ret = 0;
+ goto out_unlock;
+ }
+ newowner = argowner;
+ }
+
+ newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
+ /* Owner died? */
+ if (!pi_state->owner)
+ newtid |= FUTEX_OWNER_DIED;
+
if (get_futex_value_locked(&uval, uaddr))
goto handle_fault;
@@ -2434,9 +2475,9 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
* Got the lock. We might not be the anticipated owner if we
* did a lock-steal - fix up the PI-state in that case:
*
- * We can safely read pi_state->owner without holding wait_lock
- * because we now own the rt_mutex, only the owner will attempt
- * to change it.
+ * Speculative pi_state->owner read (we don't hold wait_lock);
+ * since we own the lock pi_state->owner == current is the
+ * stable state, anything else needs more attention.
*/
if (q->pi_state->owner != current)
ret = fixup_pi_state_owner(uaddr, q, current);
@@ -2444,6 +2485,19 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
}
/*
+ * If we didn't get the lock; check if anybody stole it from us. In
+ * that case, we need to fix up the uval to point to them instead of
+ * us, otherwise bad things happen. [10]
+ *
+ * Another speculative read; pi_state->owner == current is unstable
+ * but needs our attention.
+ */
+ if (q->pi_state->owner == current) {
+ ret = fixup_pi_state_owner(uaddr, q, NULL);
+ goto out;
+ }
+
+ /*
* Paranoia check. If we did not take the lock, then we should not be
* the owner of the rt_mutex.
*/
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 0ba0dd8863a7..5187dfe809ac 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -321,15 +321,23 @@ void irq_matrix_remove_reserved(struct irq_matrix *m)
int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
bool reserved, unsigned int *mapped_cpu)
{
- unsigned int cpu;
+ unsigned int cpu, best_cpu, maxavl = 0;
+ struct cpumap *cm;
+ unsigned int bit;
+ best_cpu = UINT_MAX;
for_each_cpu(cpu, msk) {
- struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
- unsigned int bit;
+ cm = per_cpu_ptr(m->maps, cpu);
- if (!cm->online)
+ if (!cm->online || cm->available <= maxavl)
continue;
+ best_cpu = cpu;
+ maxavl = cm->available;
+ }
+
+ if (maxavl) {
+ cm = per_cpu_ptr(m->maps, best_cpu);
bit = matrix_alloc_area(m, cm, 1, false);
if (bit < m->alloc_end) {
cm->allocated++;
@@ -338,8 +346,8 @@ int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
m->global_available--;
if (reserved)
m->global_reserved--;
- *mapped_cpu = cpu;
- trace_irq_matrix_alloc(bit, cpu, m, cm);
+ *mapped_cpu = best_cpu;
+ trace_irq_matrix_alloc(bit, best_cpu, m, cm);
return bit;
}
}
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 8594d24e4adc..b4517095db6a 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -79,7 +79,7 @@ int static_key_count(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_count);
-static void static_key_slow_inc_cpuslocked(struct static_key *key)
+void static_key_slow_inc_cpuslocked(struct static_key *key)
{
int v, v1;
@@ -180,7 +180,7 @@ void static_key_disable(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_disable);
-static void static_key_slow_dec_cpuslocked(struct static_key *key,
+static void __static_key_slow_dec_cpuslocked(struct static_key *key,
unsigned long rate_limit,
struct delayed_work *work)
{
@@ -211,7 +211,7 @@ static void __static_key_slow_dec(struct static_key *key,
struct delayed_work *work)
{
cpus_read_lock();
- static_key_slow_dec_cpuslocked(key, rate_limit, work);
+ __static_key_slow_dec_cpuslocked(key, rate_limit, work);
cpus_read_unlock();
}
@@ -229,6 +229,12 @@ void static_key_slow_dec(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_slow_dec);
+void static_key_slow_dec_cpuslocked(struct static_key *key)
+{
+ STATIC_KEY_CHECK_USE(key);
+ __static_key_slow_dec_cpuslocked(key, 0, NULL);
+}
+
void static_key_slow_dec_deferred(struct static_key_deferred *key)
{
STATIC_KEY_CHECK_USE(key);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 5fa1324a4f29..521659044719 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -49,6 +49,7 @@
#include <linux/gfp.h>
#include <linux/random.h>
#include <linux/jhash.h>
+#include <linux/nmi.h>
#include <asm/sections.h>
@@ -4490,6 +4491,7 @@ retry:
if (!unlock)
if (read_trylock(&tasklist_lock))
unlock = 1;
+ touch_nmi_watchdog();
} while_each_thread(g, p);
pr_warn("\n");
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 6f3dba6e4e9e..65cc0cb984e6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1290,6 +1290,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
return ret;
}
+static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock)
+{
+ int ret = try_to_take_rt_mutex(lock, current, NULL);
+
+ /*
+ * try_to_take_rt_mutex() sets the lock waiters bit
+ * unconditionally. Clean this up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ return ret;
+}
+
/*
* Slow path try-lock function:
*/
@@ -1312,13 +1325,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
*/
raw_spin_lock_irqsave(&lock->wait_lock, flags);
- ret = try_to_take_rt_mutex(lock, current, NULL);
-
- /*
- * try_to_take_rt_mutex() sets the lock waiters bit
- * unconditionally. Clean this up.
- */
- fixup_rt_mutex_waiters(lock);
+ ret = __rt_mutex_slowtrylock(lock);
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
@@ -1505,6 +1512,11 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
return rt_mutex_slowtrylock(lock);
}
+int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
+{
+ return __rt_mutex_slowtrylock(lock);
+}
+
/**
* rt_mutex_timed_lock - lock a rt_mutex interruptible
* the timeout structure is provided
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 124e98ca0b17..68686b3ec3c1 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -148,6 +148,7 @@ extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter);
extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+extern int __rt_mutex_futex_trylock(struct rt_mutex *l);
extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
diff --git a/kernel/pid.c b/kernel/pid.c
index 1e8bb6550ec4..5d30c87e3c42 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -41,7 +41,19 @@
#include <linux/sched/task.h>
#include <linux/idr.h>
-struct pid init_struct_pid = INIT_STRUCT_PID;
+struct pid init_struct_pid = {
+ .count = ATOMIC_INIT(1),
+ .tasks = {
+ { .first = NULL },
+ { .first = NULL },
+ { .first = NULL },
+ },
+ .level = 0,
+ .numbers = { {
+ .nr = 0,
+ .ns = &init_pid_ns,
+ }, }
+};
int pid_max = PID_MAX_DEFAULT;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 644fa2e3d993..a7bf32aabfda 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2056,7 +2056,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
p->state = TASK_WAKING;
if (p->in_iowait) {
- delayacct_blkio_end();
+ delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
@@ -2069,7 +2069,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
#else /* CONFIG_SMP */
if (p->in_iowait) {
- delayacct_blkio_end();
+ delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
@@ -2122,7 +2122,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
if (!task_on_rq_queued(p)) {
if (p->in_iowait) {
- delayacct_blkio_end();
+ delayacct_blkio_end(p);
atomic_dec(&rq->nr_iowait);
}
ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2fe3aa853e4d..26a71ebcd3c2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4365,12 +4365,12 @@ static inline bool cfs_bandwidth_used(void)
void cfs_bandwidth_usage_inc(void)
{
- static_key_slow_inc(&__cfs_bandwidth_used);
+ static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
}
void cfs_bandwidth_usage_dec(void)
{
- static_key_slow_dec(&__cfs_bandwidth_used);
+ static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
}
#else /* HAVE_JUMP_LABEL */
static bool cfs_bandwidth_used(void)
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index d32520840fde..aa9d2a2b1210 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -655,7 +655,9 @@ static void hrtimer_reprogram(struct hrtimer *timer,
static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
{
base->expires_next = KTIME_MAX;
+ base->hang_detected = 0;
base->hres_active = 0;
+ base->next_timer = NULL;
}
/*
@@ -1589,6 +1591,7 @@ int hrtimers_prepare_cpu(unsigned int cpu)
timerqueue_init_head(&cpu_base->clock_base[i].active);
}
+ cpu_base->active_bases = 0;
cpu_base->cpu = cpu;
hrtimer_init_hres(cpu_base);
return 0;
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 89a9e1b4264a..0bcf00e3ce48 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1696,7 +1696,7 @@ void run_local_timers(void)
hrtimer_run_queues();
/* Raise the softirq only if required. */
if (time_before(jiffies, base->clk)) {
- if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
+ if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
return;
/* CPU is awake, so check the deferrable base. */
base++;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 904c952ac383..f54dc62b599c 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -355,7 +355,7 @@ config PROFILE_ANNOTATED_BRANCHES
on if you need to profile the system's use of these macros.
config PROFILE_ALL_BRANCHES
- bool "Profile all if conditionals"
+ bool "Profile all if conditionals" if !FORTIFY_SOURCE
select TRACE_BRANCH_PROFILING
help
This tracer profiles all branch conditions. Every if ()
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ccdf3664e4a9..554b517c61a0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1119,15 +1119,11 @@ static struct ftrace_ops global_ops = {
};
/*
- * This is used by __kernel_text_address() to return true if the
- * address is on a dynamically allocated trampoline that would
- * not return true for either core_kernel_text() or
- * is_module_text_address().
+ * Used by the stack undwinder to know about dynamic ftrace trampolines.
*/
-bool is_ftrace_trampoline(unsigned long addr)
+struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr)
{
- struct ftrace_ops *op;
- bool ret = false;
+ struct ftrace_ops *op = NULL;
/*
* Some of the ops may be dynamically allocated,
@@ -1144,15 +1140,24 @@ bool is_ftrace_trampoline(unsigned long addr)
if (op->trampoline && op->trampoline_size)
if (addr >= op->trampoline &&
addr < op->trampoline + op->trampoline_size) {
- ret = true;
- goto out;
+ preempt_enable_notrace();
+ return op;
}
} while_for_each_ftrace_op(op);
-
- out:
preempt_enable_notrace();
- return ret;
+ return NULL;
+}
+
+/*
+ * This is used by __kernel_text_address() to return true if the
+ * address is on a dynamically allocated trampoline that would
+ * not return true for either core_kernel_text() or
+ * is_module_text_address().
+ */
+bool is_ftrace_trampoline(unsigned long addr)
+{
+ return ftrace_ops_trampoline(addr) != NULL;
}
struct ftrace_page {
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9ab18995ff1e..5af2842dea96 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2534,29 +2534,58 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
* The lock and unlock are done within a preempt disable section.
* The current_context per_cpu variable can only be modified
* by the current task between lock and unlock. But it can
- * be modified more than once via an interrupt. There are four
- * different contexts that we need to consider.
+ * be modified more than once via an interrupt. To pass this
+ * information from the lock to the unlock without having to
+ * access the 'in_interrupt()' functions again (which do show
+ * a bit of overhead in something as critical as function tracing,
+ * we use a bitmask trick.
*
- * Normal context.
- * SoftIRQ context
- * IRQ context
- * NMI context
+ * bit 0 = NMI context
+ * bit 1 = IRQ context
+ * bit 2 = SoftIRQ context
+ * bit 3 = normal context.
*
- * If for some reason the ring buffer starts to recurse, we
- * only allow that to happen at most 4 times (one for each
- * context). If it happens 5 times, then we consider this a
- * recusive loop and do not let it go further.
+ * This works because this is the order of contexts that can
+ * preempt other contexts. A SoftIRQ never preempts an IRQ
+ * context.
+ *
+ * When the context is determined, the corresponding bit is
+ * checked and set (if it was set, then a recursion of that context
+ * happened).
+ *
+ * On unlock, we need to clear this bit. To do so, just subtract
+ * 1 from the current_context and AND it to itself.
+ *
+ * (binary)
+ * 101 - 1 = 100
+ * 101 & 100 = 100 (clearing bit zero)
+ *
+ * 1010 - 1 = 1001
+ * 1010 & 1001 = 1000 (clearing bit 1)
+ *
+ * The least significant bit can be cleared this way, and it
+ * just so happens that it is the same bit corresponding to
+ * the current context.
*/
static __always_inline int
trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
{
- if (cpu_buffer->current_context >= 4)
+ unsigned int val = cpu_buffer->current_context;
+ unsigned long pc = preempt_count();
+ int bit;
+
+ if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
+ bit = RB_CTX_NORMAL;
+ else
+ bit = pc & NMI_MASK ? RB_CTX_NMI :
+ pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
+
+ if (unlikely(val & (1 << bit)))
return 1;
- cpu_buffer->current_context++;
- /* Interrupts must see this update */
- barrier();
+ val |= (1 << bit);
+ cpu_buffer->current_context = val;
return 0;
}
@@ -2564,9 +2593,7 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
static __always_inline void
trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
{
- /* Don't let the dec leak out */
- barrier();
- cpu_buffer->current_context--;
+ cpu_buffer->current_context &= cpu_buffer->current_context - 1;
}
/**
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2a8d8a294345..8e3f20a18a06 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2374,6 +2374,15 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
}
EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
+/*
+ * Skip 3:
+ *
+ * trace_buffer_unlock_commit_regs()
+ * trace_event_buffer_commit()
+ * trace_event_raw_event_xxx()
+*/
+# define STACK_SKIP 3
+
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
struct ring_buffer *buffer,
struct ring_buffer_event *event,
@@ -2383,16 +2392,12 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
__buffer_unlock_commit(buffer, event);
/*
- * If regs is not set, then skip the following callers:
- * trace_buffer_unlock_commit_regs
- * event_trigger_unlock_commit
- * trace_event_buffer_commit
- * trace_event_raw_event_sched_switch
+ * If regs is not set, then skip the necessary functions.
* Note, we can still get here via blktrace, wakeup tracer
* and mmiotrace, but that's ok if they lose a function or
- * two. They are that meaningful.
+ * two. They are not that meaningful.
*/
- ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs);
+ ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs);
ftrace_trace_userstack(buffer, flags, pc);
}
@@ -2579,11 +2584,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
trace.skip = skip;
/*
- * Add two, for this function and the call to save_stack_trace()
+ * Add one, for this function and the call to save_stack_trace()
* If regs is set, then these functions will not be in the way.
*/
+#ifndef CONFIG_UNWINDER_ORC
if (!regs)
- trace.skip += 2;
+ trace.skip++;
+#endif
/*
* Since events can happen in NMIs there's no safe way to
@@ -2711,11 +2718,10 @@ void trace_dump_stack(int skip)
local_save_flags(flags);
- /*
- * Skip 3 more, seems to get us at the caller of
- * this function.
- */
- skip += 3;
+#ifndef CONFIG_UNWINDER_ORC
+ /* Skip 1 to skip this function. */
+ skip++;
+#endif
__ftrace_trace_stack(global_trace.trace_buffer.buffer,
flags, skip, preempt_count(), NULL);
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ec0f9aa4e151..1b87157edbff 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2213,6 +2213,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
{
struct trace_event_call *call, *p;
const char *last_system = NULL;
+ bool first = false;
int last_i;
int i;
@@ -2220,15 +2221,28 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
list_for_each_entry_safe(call, p, &ftrace_events, list) {
/* events are usually grouped together with systems */
if (!last_system || call->class->system != last_system) {
+ first = true;
last_i = 0;
last_system = call->class->system;
}
+ /*
+ * Since calls are grouped by systems, the likelyhood that the
+ * next call in the iteration belongs to the same system as the
+ * previous call is high. As an optimization, we skip seaching
+ * for a map[] that matches the call's system if the last call
+ * was from the same system. That's what last_i is for. If the
+ * call has the same system as the previous call, then last_i
+ * will be the index of the first map[] that has a matching
+ * system.
+ */
for (i = last_i; i < len; i++) {
if (call->class->system == map[i]->system) {
/* Save the first system if need be */
- if (!last_i)
+ if (first) {
last_i = i;
+ first = false;
+ }
update_event_printk(call, map[i]);
}
}
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index f2ac9d44f6c4..87411482a46f 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1123,13 +1123,22 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; }
#endif /* CONFIG_TRACER_SNAPSHOT */
#ifdef CONFIG_STACKTRACE
+#ifdef CONFIG_UNWINDER_ORC
+/* Skip 2:
+ * event_triggers_post_call()
+ * trace_event_raw_event_xxx()
+ */
+# define STACK_SKIP 2
+#else
/*
- * Skip 3:
+ * Skip 4:
* stacktrace_trigger()
* event_triggers_post_call()
+ * trace_event_buffer_commit()
* trace_event_raw_event_xxx()
*/
-#define STACK_SKIP 3
+#define STACK_SKIP 4
+#endif
static void
stacktrace_trigger(struct event_trigger_data *data, void *rec)
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 27f7ad12c4b1..b611cd36e22d 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -154,6 +154,24 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
preempt_enable_notrace();
}
+#ifdef CONFIG_UNWINDER_ORC
+/*
+ * Skip 2:
+ *
+ * function_stack_trace_call()
+ * ftrace_call()
+ */
+#define STACK_SKIP 2
+#else
+/*
+ * Skip 3:
+ * __trace_stack()
+ * function_stack_trace_call()
+ * ftrace_call()
+ */
+#define STACK_SKIP 3
+#endif
+
static void
function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
@@ -180,15 +198,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
if (likely(disabled == 1)) {
pc = preempt_count();
trace_function(tr, ip, parent_ip, flags, pc);
- /*
- * skip over 5 funcs:
- * __ftrace_trace_stack,
- * __trace_stack,
- * function_stack_trace_call
- * ftrace_list_func
- * ftrace_call
- */
- __trace_stack(tr, flags, 5, pc);
+ __trace_stack(tr, flags, STACK_SKIP, pc);
}
atomic_dec(&data->disabled);
@@ -367,14 +377,27 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip,
tracer_tracing_off(tr);
}
+#ifdef CONFIG_UNWINDER_ORC
/*
- * Skip 4:
+ * Skip 3:
+ *
+ * function_trace_probe_call()
+ * ftrace_ops_assist_func()
+ * ftrace_call()
+ */
+#define FTRACE_STACK_SKIP 3
+#else
+/*
+ * Skip 5:
+ *
+ * __trace_stack()
* ftrace_stacktrace()
* function_trace_probe_call()
- * ftrace_ops_list_func()
+ * ftrace_ops_assist_func()
* ftrace_call()
*/
-#define STACK_SKIP 4
+#define FTRACE_STACK_SKIP 5
+#endif
static __always_inline void trace_stack(struct trace_array *tr)
{
@@ -384,7 +407,7 @@ static __always_inline void trace_stack(struct trace_array *tr)
local_save_flags(flags);
pc = preempt_count();
- __trace_stack(tr, flags, STACK_SKIP, pc);
+ __trace_stack(tr, flags, FTRACE_STACK_SKIP, pc);
}
static void
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 43d18cb46308..f699122dab32 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -48,6 +48,7 @@
#include <linux/moduleparam.h>
#include <linux/uaccess.h>
#include <linux/sched/isolation.h>
+#include <linux/nmi.h>
#include "workqueue_internal.h"
@@ -4463,6 +4464,12 @@ void show_workqueue_state(void)
if (pwq->nr_active || !list_empty(&pwq->delayed_works))
show_pwq(pwq);
spin_unlock_irqrestore(&pwq->pool->lock, flags);
+ /*
+ * We could be printing a lot from atomic context, e.g.
+ * sysrq-t -> show_workqueue_state(). Avoid triggering
+ * hard lockup.
+ */
+ touch_nmi_watchdog();
}
}
@@ -4490,6 +4497,12 @@ void show_workqueue_state(void)
pr_cont("\n");
next_pool:
spin_unlock_irqrestore(&pool->lock, flags);
+ /*
+ * We could be printing a lot from atomic context, e.g.
+ * sysrq-t -> show_workqueue_state(). Avoid triggering
+ * hard lockup.
+ */
+ touch_nmi_watchdog();
}
rcu_read_unlock_sched();