summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit_watch.c14
-rw-r--r--kernel/bpf/hashtab.c30
-rw-r--r--kernel/bpf/syscall.c4
-rw-r--r--kernel/bpf/verifier.c21
-rw-r--r--kernel/cgroup/cgroup-internal.h3
-rw-r--r--kernel/cgroup/cgroup.c66
-rw-r--r--kernel/cgroup/cpuset.c2
-rw-r--r--kernel/cpu_pm.c50
-rw-r--r--kernel/events/core.c101
-rw-r--r--kernel/events/uprobes.c2
-rw-r--r--kernel/fork.c11
-rw-r--r--kernel/futex.c5
-rw-r--r--kernel/irq/chip.c10
-rw-r--r--kernel/irq/cpuhotplug.c9
-rw-r--r--kernel/irq/ipi.c4
-rw-r--r--kernel/kmod.c25
-rw-r--r--kernel/kthread.c1
-rw-r--r--kernel/pid.c14
-rw-r--r--kernel/power/snapshot.c2
-rw-r--r--kernel/sched/core.c2
-rw-r--r--kernel/sched/wait.c7
-rw-r--r--kernel/signal.c17
-rw-r--r--kernel/time/timekeeping.c4
-rw-r--r--kernel/time/timer.c52
-rw-r--r--kernel/trace/bpf_trace.c34
-rw-r--r--kernel/trace/ftrace.c4
-rw-r--r--kernel/trace/ring_buffer.c14
-rw-r--r--kernel/trace/ring_buffer_benchmark.c2
-rw-r--r--kernel/trace/trace.c19
-rw-r--r--kernel/trace/trace_event_perf.c4
-rw-r--r--kernel/trace/trace_events_filter.c4
-rw-r--r--kernel/trace/trace_kprobe.c4
-rw-r--r--kernel/trace/trace_syscalls.c4
-rw-r--r--kernel/trace/trace_uprobe.c2
-rw-r--r--kernel/trace/tracing_map.c11
-rw-r--r--kernel/watchdog.c1
-rw-r--r--kernel/watchdog_hld.c59
-rw-r--r--kernel/workqueue.c30
38 files changed, 447 insertions, 201 deletions
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 62d686d96581..9eb8b3511636 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -66,7 +66,7 @@ static struct fsnotify_group *audit_watch_group;
/* fsnotify events we care about. */
#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
- FS_MOVE_SELF | FS_EVENT_ON_CHILD)
+ FS_MOVE_SELF | FS_EVENT_ON_CHILD | FS_UNMOUNT)
static void audit_free_parent(struct audit_parent *parent)
{
@@ -457,13 +457,15 @@ void audit_remove_watch_rule(struct audit_krule *krule)
list_del(&krule->rlist);
if (list_empty(&watch->rules)) {
+ /*
+ * audit_remove_watch() drops our reference to 'parent' which
+ * can get freed. Grab our own reference to be safe.
+ */
+ audit_get_parent(parent);
audit_remove_watch(watch);
-
- if (list_empty(&parent->watches)) {
- audit_get_parent(parent);
+ if (list_empty(&parent->watches))
fsnotify_destroy_mark(&parent->mark, audit_watch_group);
- audit_put_parent(parent);
- }
+ audit_put_parent(parent);
}
}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 4fb463172aa8..d11c8181f4c5 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -652,12 +652,27 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
}
}
+static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
+{
+ return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
+ BITS_PER_LONG == 64;
+}
+
+static u32 htab_size_value(const struct bpf_htab *htab, bool percpu)
+{
+ u32 size = htab->map.value_size;
+
+ if (percpu || fd_htab_map_needs_adjust(htab))
+ size = round_up(size, 8);
+ return size;
+}
+
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
void *value, u32 key_size, u32 hash,
bool percpu, bool onallcpus,
struct htab_elem *old_elem)
{
- u32 size = htab->map.value_size;
+ u32 size = htab_size_value(htab, percpu);
bool prealloc = htab_is_prealloc(htab);
struct htab_elem *l_new, **pl_new;
void __percpu *pptr;
@@ -696,9 +711,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
memcpy(l_new->key, key, key_size);
if (percpu) {
- /* round up value_size to 8 bytes */
- size = round_up(size, 8);
-
if (prealloc) {
pptr = htab_elem_get_ptr(l_new, key_size);
} else {
@@ -1209,17 +1221,9 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr)
{
- struct bpf_map *map;
-
if (attr->value_size != sizeof(u32))
return ERR_PTR(-EINVAL);
-
- /* pointer is stored internally */
- attr->value_size = sizeof(void *);
- map = htab_map_alloc(attr);
- attr->value_size = sizeof(u32);
-
- return map;
+ return htab_map_alloc(attr);
}
static void fd_htab_map_free(struct bpf_map *map)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 045646da97cc..6c772adabad2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1289,7 +1289,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
info_len = min_t(u32, sizeof(info), info_len);
if (copy_from_user(&info, uinfo, info_len))
- return err;
+ return -EFAULT;
info.type = prog->type;
info.id = prog->aux->id;
@@ -1312,7 +1312,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
}
ulen = info.xlated_prog_len;
- info.xlated_prog_len = bpf_prog_size(prog->len);
+ info.xlated_prog_len = bpf_prog_insn_size(prog);
if (info.xlated_prog_len && ulen) {
uinsns = u64_to_user_ptr(info.xlated_prog_insns);
ulen = min_t(u32, info.xlated_prog_len, ulen);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index af9e84a4944e..664d93972373 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1865,10 +1865,12 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
* do our normal operations to the register, we need to set the values
* to the min/max since they are undefined.
*/
- if (min_val == BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- if (max_val == BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+ if (opcode != BPF_SUB) {
+ if (min_val == BPF_REGISTER_MIN_RANGE)
+ dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+ if (max_val == BPF_REGISTER_MAX_RANGE)
+ dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+ }
switch (opcode) {
case BPF_ADD:
@@ -1879,10 +1881,17 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
dst_reg->min_align = min(src_align, dst_align);
break;
case BPF_SUB:
+ /* If one of our values was at the end of our ranges, then the
+ * _opposite_ value in the dst_reg goes to the end of our range.
+ */
+ if (min_val == BPF_REGISTER_MIN_RANGE)
+ dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+ if (max_val == BPF_REGISTER_MAX_RANGE)
+ dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value -= min_val;
+ dst_reg->min_value -= max_val;
if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value -= max_val;
+ dst_reg->max_value -= min_val;
dst_reg->min_align = min(src_align, dst_align);
break;
case BPF_MUL:
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 793565c05742..8b4c3c2f2509 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -33,6 +33,9 @@ struct cgroup_taskset {
struct list_head src_csets;
struct list_head dst_csets;
+ /* the number of tasks in the set */
+ int nr_tasks;
+
/* the subsys currently being processed */
int ssid;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 620794a20a33..df2e0f14a95d 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2006,6 +2006,8 @@ static void cgroup_migrate_add_task(struct task_struct *task,
if (!cset->mg_src_cgrp)
return;
+ mgctx->tset.nr_tasks++;
+
list_move_tail(&task->cg_list, &cset->mg_tasks);
if (list_empty(&cset->mg_node))
list_add_tail(&cset->mg_node,
@@ -2094,21 +2096,19 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
struct css_set *cset, *tmp_cset;
int ssid, failed_ssid, ret;
- /* methods shouldn't be called if no task is actually migrating */
- if (list_empty(&tset->src_csets))
- return 0;
-
/* check that we can legitimately attach to the cgroup */
- do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
- if (ss->can_attach) {
- tset->ssid = ssid;
- ret = ss->can_attach(tset);
- if (ret) {
- failed_ssid = ssid;
- goto out_cancel_attach;
+ if (tset->nr_tasks) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+ if (ss->can_attach) {
+ tset->ssid = ssid;
+ ret = ss->can_attach(tset);
+ if (ret) {
+ failed_ssid = ssid;
+ goto out_cancel_attach;
+ }
}
- }
- } while_each_subsys_mask();
+ } while_each_subsys_mask();
+ }
/*
* Now that we're guaranteed success, proceed to move all tasks to
@@ -2137,25 +2137,29 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
*/
tset->csets = &tset->dst_csets;
- do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
- if (ss->attach) {
- tset->ssid = ssid;
- ss->attach(tset);
- }
- } while_each_subsys_mask();
+ if (tset->nr_tasks) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+ if (ss->attach) {
+ tset->ssid = ssid;
+ ss->attach(tset);
+ }
+ } while_each_subsys_mask();
+ }
ret = 0;
goto out_release_tset;
out_cancel_attach:
- do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
- if (ssid == failed_ssid)
- break;
- if (ss->cancel_attach) {
- tset->ssid = ssid;
- ss->cancel_attach(tset);
- }
- } while_each_subsys_mask();
+ if (tset->nr_tasks) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+ if (ssid == failed_ssid)
+ break;
+ if (ss->cancel_attach) {
+ tset->ssid = ssid;
+ ss->cancel_attach(tset);
+ }
+ } while_each_subsys_mask();
+ }
out_release_tset:
spin_lock_irq(&css_set_lock);
list_splice_init(&tset->dst_csets, &tset->src_csets);
@@ -2997,11 +3001,11 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
cgrp->subtree_control &= ~disable;
ret = cgroup_apply_control(cgrp);
-
cgroup_finalize_control(cgrp, ret);
+ if (ret)
+ goto out_unlock;
kernfs_activate(cgrp->kn);
- ret = 0;
out_unlock:
cgroup_kn_unlock(of->kn);
return ret ?: nbytes;
@@ -4669,6 +4673,10 @@ int __init cgroup_init(void)
if (ss->bind)
ss->bind(init_css_set.subsys[ssid]);
+
+ mutex_lock(&cgroup_mutex);
+ css_populate_dir(init_css_set.subsys[ssid]);
+ mutex_unlock(&cgroup_mutex);
}
/* init_css_set.subsys[] has been updated, re-hash */
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index ca8376e5008c..87a1213dd326 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -63,6 +63,7 @@
#include <linux/cgroup.h>
#include <linux/wait.h>
+DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
/* See "Frequency meter" comments, below. */
@@ -1891,6 +1892,7 @@ static struct cftype files[] = {
{
.name = "memory_pressure",
.read_u64 = cpuset_read_u64,
+ .private = FILE_MEMORY_PRESSURE,
},
{
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 009cc9a17d95..67b02e138a47 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -22,15 +22,21 @@
#include <linux/spinlock.h>
#include <linux/syscore_ops.h>
-static DEFINE_RWLOCK(cpu_pm_notifier_lock);
-static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
+static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain);
static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
{
int ret;
- ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
+ /*
+ * __atomic_notifier_call_chain has a RCU read critical section, which
+ * could be disfunctional in cpu idle. Copy RCU_NONIDLE code to let
+ * RCU know this.
+ */
+ rcu_irq_enter_irqson();
+ ret = __atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
nr_to_call, nr_calls);
+ rcu_irq_exit_irqson();
return notifier_to_errno(ret);
}
@@ -47,14 +53,7 @@ static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
*/
int cpu_pm_register_notifier(struct notifier_block *nb)
{
- unsigned long flags;
- int ret;
-
- write_lock_irqsave(&cpu_pm_notifier_lock, flags);
- ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb);
- write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
-
- return ret;
+ return atomic_notifier_chain_register(&cpu_pm_notifier_chain, nb);
}
EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
@@ -69,14 +68,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
*/
int cpu_pm_unregister_notifier(struct notifier_block *nb)
{
- unsigned long flags;
- int ret;
-
- write_lock_irqsave(&cpu_pm_notifier_lock, flags);
- ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
- write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
-
- return ret;
+ return atomic_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
}
EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
@@ -100,7 +92,6 @@ int cpu_pm_enter(void)
int nr_calls;
int ret = 0;
- read_lock(&cpu_pm_notifier_lock);
ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
if (ret)
/*
@@ -108,7 +99,6 @@ int cpu_pm_enter(void)
* PM entry who are notified earlier to prepare for it.
*/
cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
- read_unlock(&cpu_pm_notifier_lock);
return ret;
}
@@ -128,13 +118,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_enter);
*/
int cpu_pm_exit(void)
{
- int ret;
-
- read_lock(&cpu_pm_notifier_lock);
- ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
- read_unlock(&cpu_pm_notifier_lock);
-
- return ret;
+ return cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
}
EXPORT_SYMBOL_GPL(cpu_pm_exit);
@@ -159,7 +143,6 @@ int cpu_cluster_pm_enter(void)
int nr_calls;
int ret = 0;
- read_lock(&cpu_pm_notifier_lock);
ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
if (ret)
/*
@@ -167,7 +150,6 @@ int cpu_cluster_pm_enter(void)
* PM entry who are notified earlier to prepare for it.
*/
cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
- read_unlock(&cpu_pm_notifier_lock);
return ret;
}
@@ -190,13 +172,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
*/
int cpu_cluster_pm_exit(void)
{
- int ret;
-
- read_lock(&cpu_pm_notifier_lock);
- ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
- read_unlock(&cpu_pm_notifier_lock);
-
- return ret;
+ return cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
}
EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 426c2ffba16d..03ac9c8b02fb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2217,6 +2217,33 @@ static int group_can_go_on(struct perf_event *event,
return can_add_hw;
}
+/*
+ * Complement to update_event_times(). This computes the tstamp_* values to
+ * continue 'enabled' state from @now, and effectively discards the time
+ * between the prior tstamp_stopped and now (as we were in the OFF state, or
+ * just switched (context) time base).
+ *
+ * This further assumes '@event->state == INACTIVE' (we just came from OFF) and
+ * cannot have been scheduled in yet. And going into INACTIVE state means
+ * '@event->tstamp_stopped = @now'.
+ *
+ * Thus given the rules of update_event_times():
+ *
+ * total_time_enabled = tstamp_stopped - tstamp_enabled
+ * total_time_running = tstamp_stopped - tstamp_running
+ *
+ * We can insert 'tstamp_stopped == now' and reverse them to compute new
+ * tstamp_* values.
+ */
+static void __perf_event_enable_time(struct perf_event *event, u64 now)
+{
+ WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE);
+
+ event->tstamp_stopped = now;
+ event->tstamp_enabled = now - event->total_time_enabled;
+ event->tstamp_running = now - event->total_time_running;
+}
+
static void add_event_to_ctx(struct perf_event *event,
struct perf_event_context *ctx)
{
@@ -2224,9 +2251,12 @@ static void add_event_to_ctx(struct perf_event *event,
list_add_event(event, ctx);
perf_group_attach(event);
- event->tstamp_enabled = tstamp;
- event->tstamp_running = tstamp;
- event->tstamp_stopped = tstamp;
+ /*
+ * We can be called with event->state == STATE_OFF when we create with
+ * .disabled = 1. In that case the IOC_ENABLE will call this function.
+ */
+ if (event->state == PERF_EVENT_STATE_INACTIVE)
+ __perf_event_enable_time(event, tstamp);
}
static void ctx_sched_out(struct perf_event_context *ctx,
@@ -2471,10 +2501,11 @@ static void __perf_event_mark_enabled(struct perf_event *event)
u64 tstamp = perf_event_time(event);
event->state = PERF_EVENT_STATE_INACTIVE;
- event->tstamp_enabled = tstamp - event->total_time_enabled;
+ __perf_event_enable_time(event, tstamp);
list_for_each_entry(sub, &event->sibling_list, group_entry) {
+ /* XXX should not be > INACTIVE if event isn't */
if (sub->state >= PERF_EVENT_STATE_INACTIVE)
- sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+ __perf_event_enable_time(sub, tstamp);
}
}
@@ -5090,7 +5121,7 @@ static void perf_mmap_open(struct vm_area_struct *vma)
atomic_inc(&event->rb->aux_mmap_count);
if (event->pmu->event_mapped)
- event->pmu->event_mapped(event);
+ event->pmu->event_mapped(event, vma->vm_mm);
}
static void perf_pmu_output_stop(struct perf_event *event);
@@ -5113,7 +5144,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
unsigned long size = perf_data_size(rb);
if (event->pmu->event_unmapped)
- event->pmu->event_unmapped(event);
+ event->pmu->event_unmapped(event, vma->vm_mm);
/*
* rb->aux_mmap_count will always drop before rb->mmap_count and
@@ -5411,7 +5442,7 @@ aux_unlock:
vma->vm_ops = &perf_mmap_vmops;
if (event->pmu->event_mapped)
- event->pmu->event_mapped(event);
+ event->pmu->event_mapped(event, vma->vm_mm);
return ret;
}
@@ -7875,16 +7906,15 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
}
}
perf_tp_event(call->event.type, count, raw_data, size, regs, head,
- rctx, task);
+ rctx, task, NULL);
}
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct pt_regs *regs, struct hlist_head *head, int rctx,
- struct task_struct *task)
+ struct task_struct *task, struct perf_event *event)
{
struct perf_sample_data data;
- struct perf_event *event;
struct perf_raw_record raw = {
.frag = {
@@ -7898,9 +7928,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
perf_trace_buf_update(record, event_type);
- hlist_for_each_entry_rcu(event, head, hlist_entry) {
+ /* Use the given event instead of the hlist */
+ if (event) {
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
+ } else {
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
+ if (perf_tp_event_match(event, &data, regs))
+ perf_swevent_event(event, count, &data, regs);
+ }
}
/*
@@ -9580,6 +9616,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
if (ret)
return -EFAULT;
+ attr->size = size;
+
if (attr->__reserved_1)
return -EINVAL;
@@ -10001,28 +10039,27 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_context;
/*
- * Do not allow to attach to a group in a different
- * task or CPU context:
+ * Make sure we're both events for the same CPU;
+ * grouping events for different CPUs is broken; since
+ * you can never concurrently schedule them anyhow.
*/
- if (move_group) {
- /*
- * Make sure we're both on the same task, or both
- * per-cpu events.
- */
- if (group_leader->ctx->task != ctx->task)
- goto err_context;
+ if (group_leader->cpu != event->cpu)
+ goto err_context;
- /*
- * Make sure we're both events for the same CPU;
- * grouping events for different CPUs is broken; since
- * you can never concurrently schedule them anyhow.
- */
- if (group_leader->cpu != event->cpu)
- goto err_context;
- } else {
- if (group_leader->ctx != ctx)
- goto err_context;
- }
+ /*
+ * Make sure we're both on the same task, or both
+ * per-CPU events.
+ */
+ if (group_leader->ctx->task != ctx->task)
+ goto err_context;
+
+ /*
+ * Do not allow to attach to a group in a different task
+ * or CPU context. If we're moving SW events, we'll fix
+ * this up later, so allow that.
+ */
+ if (!move_group && group_leader->ctx != ctx)
+ goto err_context;
/*
* Only a group leader can be exclusive or pinned
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 0e137f98a50c..267f6ef91d97 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1262,8 +1262,6 @@ void uprobe_end_dup_mmap(void)
void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
{
- newmm->uprobes_state.xol_area = NULL;
-
if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
set_bit(MMF_HAS_UPROBES, &newmm->flags);
/* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
diff --git a/kernel/fork.c b/kernel/fork.c
index 17921b0390b4..b7e9e57b71ea 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -785,6 +785,13 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
#endif
}
+static void mm_init_uprobes_state(struct mm_struct *mm)
+{
+#ifdef CONFIG_UPROBES
+ mm->uprobes_state.xol_area = NULL;
+#endif
+}
+
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
@@ -806,11 +813,13 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
+ RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_mm_init(mm);
- clear_tlb_flush_pending(mm);
+ init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
#endif
+ mm_init_uprobes_state(mm);
if (current->mm) {
mm->flags = current->mm->flags & MMF_INIT_MASK;
diff --git a/kernel/futex.c b/kernel/futex.c
index 16dbe4c93895..f50b434756c1 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -670,13 +670,14 @@ again:
* this reference was taken by ihold under the page lock
* pinning the inode in place so i_lock was unnecessary. The
* only way for this check to fail is if the inode was
- * truncated in parallel so warn for now if this happens.
+ * truncated in parallel which is almost certainly an
+ * application bug. In such a case, just retry.
*
* We are not calling into get_futex_key_refs() in file-backed
* cases, therefore a successful atomic_inc return below will
* guarantee that get_futex_key() will still imply smp_mb(); (B).
*/
- if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) {
+ if (!atomic_inc_not_zero(&inode->i_count)) {
rcu_read_unlock();
put_page(page);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index a3cc37c0c85e..3675c6004f2a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1000,7 +1000,7 @@ EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name);
void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
{
- unsigned long flags;
+ unsigned long flags, trigger, tmp;
struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
if (!desc)
@@ -1014,6 +1014,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
irq_settings_clr_and_set(desc, clr, set);
+ trigger = irqd_get_trigger_type(&desc->irq_data);
+
irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
if (irq_settings_has_no_balance_set(desc))
@@ -1025,7 +1027,11 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
if (irq_settings_is_level(desc))
irqd_set(&desc->irq_data, IRQD_LEVEL);
- irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
+ tmp = irq_settings_get_trigger_mask(desc);
+ if (tmp != IRQ_TYPE_NONE)
+ trigger = tmp;
+
+ irqd_set(&desc->irq_data, trigger);
irq_put_desc_unlock(desc, flags);
}
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index aee8f7ec40af..638eb9c83d9f 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -95,8 +95,13 @@ static bool migrate_one_irq(struct irq_desc *desc)
affinity = cpu_online_mask;
brokeaff = true;
}
-
- err = irq_do_set_affinity(d, affinity, true);
+ /*
+ * Do not set the force argument of irq_do_set_affinity() as this
+ * disables the masking of offline CPUs from the supplied affinity
+ * mask and therefore might keep/reassign the irq to the outgoing
+ * CPU.
+ */
+ err = irq_do_set_affinity(d, affinity, false);
if (err) {
pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
d->irq, err);
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 1a9abc1c8ea0..259a22aa9934 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -165,7 +165,7 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)
struct irq_data *data = irq_get_irq_data(irq);
struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
- if (!data || !ipimask || cpu > nr_cpu_ids)
+ if (!data || !ipimask || cpu >= nr_cpu_ids)
return INVALID_HWIRQ;
if (!cpumask_test_cpu(cpu, ipimask))
@@ -195,7 +195,7 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
if (!chip->ipi_send_single && !chip->ipi_send_mask)
return -EINVAL;
- if (cpu > nr_cpu_ids)
+ if (cpu >= nr_cpu_ids)
return -EINVAL;
if (dest) {
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 6d016c5d97c8..2f37acde640b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -71,6 +71,18 @@ static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
static DECLARE_WAIT_QUEUE_HEAD(kmod_wq);
/*
+ * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads
+ * running at the same time without returning. When this happens we
+ * believe you've somehow ended up with a recursive module dependency
+ * creating a loop.
+ *
+ * We have no option but to fail.
+ *
+ * Userspace should proactively try to detect and prevent these.
+ */
+#define MAX_KMOD_ALL_BUSY_TIMEOUT 5
+
+/*
modprobe_path is set via /proc/sys.
*/
char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
@@ -167,8 +179,17 @@ int __request_module(bool wait, const char *fmt, ...)
pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...",
atomic_read(&kmod_concurrent_max),
MAX_KMOD_CONCURRENT, module_name);
- wait_event_interruptible(kmod_wq,
- atomic_dec_if_positive(&kmod_concurrent_max) >= 0);
+ ret = wait_event_killable_timeout(kmod_wq,
+ atomic_dec_if_positive(&kmod_concurrent_max) >= 0,
+ MAX_KMOD_ALL_BUSY_TIMEOUT * HZ);
+ if (!ret) {
+ pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now",
+ module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT);
+ return -ETIME;
+ } else if (ret == -ERESTARTSYS) {
+ pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name);
+ return ret;
+ }
}
trace_module_request(module_name, wait, _RET_IP_);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 26db528c1d88..1c19edf82427 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -637,6 +637,7 @@ repeat:
schedule();
try_to_freeze();
+ cond_resched();
goto repeat;
}
EXPORT_SYMBOL_GPL(kthread_worker_fn);
diff --git a/kernel/pid.c b/kernel/pid.c
index 731c4e528f4e..020dedbdf066 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -527,8 +527,11 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
if (!ns)
ns = task_active_pid_ns(current);
if (likely(pid_alive(task))) {
- if (type != PIDTYPE_PID)
+ if (type != PIDTYPE_PID) {
+ if (type == __PIDTYPE_TGID)
+ type = PIDTYPE_PID;
task = task->group_leader;
+ }
nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
}
rcu_read_unlock();
@@ -537,12 +540,6 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
}
EXPORT_SYMBOL(__task_pid_nr_ns);
-pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
- return pid_nr_ns(task_tgid(tsk), ns);
-}
-EXPORT_SYMBOL(task_tgid_nr_ns);
-
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
return ns_of_pid(task_pid(tsk));
@@ -575,13 +572,10 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
*/
void __init pidhash_init(void)
{
- unsigned int pidhash_size;
-
pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
HASH_EARLY | HASH_SMALL | HASH_ZERO,
&pidhash_shift, NULL,
0, 4096);
- pidhash_size = 1U << pidhash_shift;
}
void __init pidmap_init(void)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 222317721c5a..0972a8e09d08 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1650,7 +1650,7 @@ static unsigned long minimum_image_size(unsigned long saveable)
{
unsigned long size;
- size = global_page_state(NR_SLAB_RECLAIMABLE)
+ size = global_node_page_state(NR_SLAB_RECLAIMABLE)
+ global_node_page_state(NR_ACTIVE_ANON)
+ global_node_page_state(NR_INACTIVE_ANON)
+ global_node_page_state(NR_ACTIVE_FILE)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 17c667b427b4..0869b20fba81 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2069,7 +2069,7 @@ out:
/**
* try_to_wake_up_local - try to wake up a local task with rq lock held
* @p: the thread to be awakened
- * @cookie: context's cookie for pinning
+ * @rf: request-queue flags for pinning
*
* Put @p on the run-queue if it's not already there. The caller must
* ensure that this_rq() is locked, @p is bound to this_rq() and not
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 17f11c6b0a9f..d6afed6d0752 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -70,9 +70,10 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
unsigned flags = curr->flags;
-
- if (curr->func(curr, mode, wake_flags, key) &&
- (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+ int ret = curr->func(curr, mode, wake_flags, key);
+ if (ret < 0)
+ break;
+ if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
}
}
diff --git a/kernel/signal.c b/kernel/signal.c
index caed9133ae52..ed804a470dcd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1194,7 +1194,11 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
recalc_sigpending_and_wake(t);
}
}
- if (action->sa.sa_handler == SIG_DFL)
+ /*
+ * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect
+ * debugging to leave init killable.
+ */
+ if (action->sa.sa_handler == SIG_DFL && !t->ptrace)
t->signal->flags &= ~SIGNAL_UNKILLABLE;
ret = specific_send_sig_info(sig, info, t);
spin_unlock_irqrestore(&t->sighand->siglock, flags);
@@ -3303,12 +3307,15 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
{
+#ifdef __BIG_ENDIAN
sigset_t set;
- int err = do_sigpending(&set, sizeof(old_sigset_t));
- if (err == 0)
- if (copy_to_user(set32, &set, sizeof(old_sigset_t)))
- err = -EFAULT;
+ int err = do_sigpending(&set, sizeof(set.sig[0]));
+ if (!err)
+ err = put_user(set.sig[0], set32);
return err;
+#else
+ return sys_rt_sigpending((sigset_t __user *)set32, sizeof(*set32));
+#endif
}
#endif
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index cedafa008de5..7e7e61c00d61 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -637,9 +637,7 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
tk->ktime_sec = seconds;
/* Update the monotonic raw base */
- seconds = tk->raw_sec;
- nsec = (u32)(tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift);
- tk->tkr_raw.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
+ tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC);
}
/* must hold timekeeper_lock */
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 71ce3f4eead3..f2674a056c26 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -203,6 +203,7 @@ struct timer_base {
bool migration_enabled;
bool nohz_active;
bool is_idle;
+ bool must_forward_clk;
DECLARE_BITMAP(pending_map, WHEEL_SIZE);
struct hlist_head vectors[WHEEL_SIZE];
} ____cacheline_aligned;
@@ -856,13 +857,19 @@ get_target_base(struct timer_base *base, unsigned tflags)
static inline void forward_timer_base(struct timer_base *base)
{
- unsigned long jnow = READ_ONCE(jiffies);
+ unsigned long jnow;
/*
- * We only forward the base when it's idle and we have a delta between
- * base clock and jiffies.
+ * We only forward the base when we are idle or have just come out of
+ * idle (must_forward_clk logic), and have a delta between base clock
+ * and jiffies. In the common case, run_timers will take care of it.
*/
- if (!base->is_idle || (long) (jnow - base->clk) < 2)
+ if (likely(!base->must_forward_clk))
+ return;
+
+ jnow = READ_ONCE(jiffies);
+ base->must_forward_clk = base->is_idle;
+ if ((long)(jnow - base->clk) < 2)
return;
/*
@@ -938,6 +945,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
* same array bucket then just return:
*/
if (timer_pending(timer)) {
+ /*
+ * The downside of this optimization is that it can result in
+ * larger granularity than you would get from adding a new
+ * timer with this expiry.
+ */
if (timer->expires == expires)
return 1;
@@ -948,6 +960,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
* dequeue/enqueue dance.
*/
base = lock_timer_base(timer, &flags);
+ forward_timer_base(base);
clk = base->clk;
idx = calc_wheel_index(expires, clk);
@@ -964,6 +977,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
}
} else {
base = lock_timer_base(timer, &flags);
+ forward_timer_base(base);
}
ret = detach_if_pending(timer, base, false);
@@ -991,12 +1005,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
raw_spin_lock(&base->lock);
WRITE_ONCE(timer->flags,
(timer->flags & ~TIMER_BASEMASK) | base->cpu);
+ forward_timer_base(base);
}
}
- /* Try to forward a stale timer base clock */
- forward_timer_base(base);
-
timer->expires = expires;
/*
* If 'idx' was calculated above and the base time did not advance
@@ -1112,6 +1124,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
WRITE_ONCE(timer->flags,
(timer->flags & ~TIMER_BASEMASK) | cpu);
}
+ forward_timer_base(base);
debug_activate(timer, timer->expires);
internal_add_timer(base, timer);
@@ -1495,12 +1508,18 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
base->is_idle = false;
} else {
if (!is_max_delta)
- expires = basem + (nextevt - basej) * TICK_NSEC;
+ expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
/*
- * If we expect to sleep more than a tick, mark the base idle:
+ * If we expect to sleep more than a tick, mark the base idle.
+ * Also the tick is stopped so any added timer must forward
+ * the base clk itself to keep granularity small. This idle
+ * logic is only maintained for the BASE_STD base, deferrable
+ * timers may still see large granularity skew (by design).
*/
- if ((expires - basem) > TICK_NSEC)
+ if ((expires - basem) > TICK_NSEC) {
+ base->must_forward_clk = true;
base->is_idle = true;
+ }
}
raw_spin_unlock(&base->lock);
@@ -1611,6 +1630,19 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+ /*
+ * must_forward_clk must be cleared before running timers so that any
+ * timer functions that call mod_timer will not try to forward the
+ * base. idle trcking / clock forwarding logic is only used with
+ * BASE_STD timers.
+ *
+ * The deferrable base does not do idle tracking at all, so we do
+ * not forward it. This can result in very large variations in
+ * granularity for deferrable timers, but they can be deferred for
+ * long periods due to idle.
+ */
+ base->must_forward_clk = false;
+
__run_timers(base);
if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 37385193a608..dc498b605d5d 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -204,10 +204,36 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
fmt_cnt++;
}
- return __trace_printk(1/* fake ip will not be printed */, fmt,
- mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1,
- mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2,
- mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3);
+/* Horrid workaround for getting va_list handling working with different
+ * argument type combinations generically for 32 and 64 bit archs.
+ */
+#define __BPF_TP_EMIT() __BPF_ARG3_TP()
+#define __BPF_TP(...) \
+ __trace_printk(1 /* Fake ip will not be printed. */, \
+ fmt, ##__VA_ARGS__)
+
+#define __BPF_ARG1_TP(...) \
+ ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \
+ ? __BPF_TP(arg1, ##__VA_ARGS__) \
+ : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32)) \
+ ? __BPF_TP((long)arg1, ##__VA_ARGS__) \
+ : __BPF_TP((u32)arg1, ##__VA_ARGS__)))
+
+#define __BPF_ARG2_TP(...) \
+ ((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64)) \
+ ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__) \
+ : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32)) \
+ ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__) \
+ : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__)))
+
+#define __BPF_ARG3_TP(...) \
+ ((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64)) \
+ ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__) \
+ : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32)) \
+ ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__) \
+ : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__)))
+
+ return __BPF_TP_EMIT();
}
static const struct bpf_func_proto bpf_trace_printk_proto = {
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 02004ae91860..96cea88fa00f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -889,6 +889,10 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace)
function_profile_call(trace->func, 0, NULL, NULL);
+ /* If function graph is shutting down, ret_stack can be NULL */
+ if (!current->ret_stack)
+ return 0;
+
if (index >= 0 && index < FTRACE_RETFUNC_DEPTH)
current->ret_stack[index].subtime = 0;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 529cc50d7243..81279c6602ff 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4386,15 +4386,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
* the page that was allocated, with the read page of the buffer.
*
* Returns:
- * The page allocated, or NULL on error.
+ * The page allocated, or ERR_PTR
*/
void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
{
- struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_data_page *bpage = NULL;
unsigned long flags;
struct page *page;
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return ERR_PTR(-ENODEV);
+
+ cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
arch_spin_lock(&cpu_buffer->lock);
@@ -4412,7 +4416,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
page = alloc_pages_node(cpu_to_node(cpu),
GFP_KERNEL | __GFP_NORETRY, 0);
if (!page)
- return NULL;
+ return ERR_PTR(-ENOMEM);
bpage = page_address(page);
@@ -4467,8 +4471,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
*
* for example:
* rpage = ring_buffer_alloc_read_page(buffer, cpu);
- * if (!rpage)
- * return error;
+ * if (IS_ERR(rpage))
+ * return PTR_ERR(rpage);
* ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
* if (ret >= 0)
* process_page(rpage, ret);
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 9fbcaf567886..68ee79afe31c 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -113,7 +113,7 @@ static enum event_status read_page(int cpu)
int i;
bpage = ring_buffer_alloc_read_page(buffer, cpu);
- if (!bpage)
+ if (IS_ERR(bpage))
return EVENT_DROPPED;
ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 42b9355033d4..44004d8aa3b3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6598,7 +6598,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
{
struct ftrace_buffer_info *info = filp->private_data;
struct trace_iterator *iter = &info->iter;
- ssize_t ret;
+ ssize_t ret = 0;
ssize_t size;
if (!count)
@@ -6612,10 +6612,15 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
if (!info->spare) {
info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
iter->cpu_file);
- info->spare_cpu = iter->cpu_file;
+ if (IS_ERR(info->spare)) {
+ ret = PTR_ERR(info->spare);
+ info->spare = NULL;
+ } else {
+ info->spare_cpu = iter->cpu_file;
+ }
}
if (!info->spare)
- return -ENOMEM;
+ return ret;
/* Do we have previous read data to read? */
if (info->read < PAGE_SIZE)
@@ -6790,8 +6795,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
ref->ref = 1;
ref->buffer = iter->trace_buffer->buffer;
ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
- if (!ref->page) {
- ret = -ENOMEM;
+ if (IS_ERR(ref->page)) {
+ ret = PTR_ERR(ref->page);
+ ref->page = NULL;
kfree(ref);
break;
}
@@ -8293,6 +8299,7 @@ __init static int tracer_alloc_buffers(void)
if (ret < 0)
goto out_free_cpumask;
/* Used for event triggers */
+ ret = -ENOMEM;
temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE);
if (!temp_buffer)
goto out_rm_hp_state;
@@ -8407,4 +8414,4 @@ __init static int clear_boot_tracer(void)
}
fs_initcall(tracer_init_tracefs);
-late_initcall(clear_boot_tracer);
+late_initcall_sync(clear_boot_tracer);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 562fa69df5d3..13ba2d3f6a91 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -306,6 +306,7 @@ static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct pt_regs *pt_regs)
{
+ struct perf_event *event;
struct ftrace_entry *entry;
struct hlist_head *head;
struct pt_regs regs;
@@ -329,8 +330,9 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
entry->ip = ip;
entry->parent_ip = parent_ip;
+ event = container_of(ops, struct perf_event, ftrace_ops);
perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
- 1, &regs, head, NULL);
+ 1, &regs, head, NULL, event);
#undef ENTRY_SIZE
}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 59a411ff60c7..181e139a8057 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1959,6 +1959,10 @@ static int create_filter(struct trace_event_call *call,
if (err && set_str)
append_filter_err(ps, filter);
}
+ if (err && !set_str) {
+ free_event_filter(filter);
+ filter = NULL;
+ }
create_filter_finish(ps);
*filterp = filter;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index c9b5aa10fbf9..8a907e12b6b9 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1200,7 +1200,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
memset(&entry[1], 0, dsize);
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
- head, NULL);
+ head, NULL, NULL);
}
NOKPROBE_SYMBOL(kprobe_perf_func);
@@ -1236,7 +1236,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
entry->ret_ip = (unsigned long)ri->ret_addr;
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
- head, NULL);
+ head, NULL, NULL);
}
NOKPROBE_SYMBOL(kretprobe_perf_func);
#endif /* CONFIG_PERF_EVENTS */
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5e10395da88e..74d9a86eccc0 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -596,7 +596,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
(unsigned long *)&rec->args);
perf_trace_buf_submit(rec, size, rctx,
sys_data->enter_event->event.type, 1, regs,
- head, NULL);
+ head, NULL, NULL);
}
static int perf_sysenter_enable(struct trace_event_call *call)
@@ -667,7 +667,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
- 1, regs, head, NULL);
+ 1, regs, head, NULL, NULL);
}
static int perf_sysexit_enable(struct trace_event_call *call)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index a7581fec9681..4525e0271a53 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1156,7 +1156,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
}
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
- head, NULL);
+ head, NULL, NULL);
out:
preempt_enable();
}
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 0a689bbb78ef..305039b122fa 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -221,16 +221,19 @@ void tracing_map_array_free(struct tracing_map_array *a)
if (!a)
return;
- if (!a->pages) {
- kfree(a);
- return;
- }
+ if (!a->pages)
+ goto free;
for (i = 0; i < a->n_pages; i++) {
if (!a->pages[i])
break;
free_page((unsigned long)a->pages[i]);
}
+
+ kfree(a->pages);
+
+ free:
+ kfree(a);
}
struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts,
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 06d3389bca0d..f5d52024f6b7 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -240,6 +240,7 @@ static void set_sample_period(void)
* hardlockup detector generates a warning
*/
sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
+ watchdog_update_hrtimer_threshold(sample_period);
}
/* Commands for resetting the watchdog */
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 295a0d84934c..3a09ea1b1d3d 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -37,6 +37,62 @@ void arch_touch_nmi_watchdog(void)
}
EXPORT_SYMBOL(arch_touch_nmi_watchdog);
+#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP
+static DEFINE_PER_CPU(ktime_t, last_timestamp);
+static DEFINE_PER_CPU(unsigned int, nmi_rearmed);
+static ktime_t watchdog_hrtimer_sample_threshold __read_mostly;
+
+void watchdog_update_hrtimer_threshold(u64 period)
+{
+ /*
+ * The hrtimer runs with a period of (watchdog_threshold * 2) / 5
+ *
+ * So it runs effectively with 2.5 times the rate of the NMI
+ * watchdog. That means the hrtimer should fire 2-3 times before
+ * the NMI watchdog expires. The NMI watchdog on x86 is based on
+ * unhalted CPU cycles, so if Turbo-Mode is enabled the CPU cycles
+ * might run way faster than expected and the NMI fires in a
+ * smaller period than the one deduced from the nominal CPU
+ * frequency. Depending on the Turbo-Mode factor this might be fast
+ * enough to get the NMI period smaller than the hrtimer watchdog
+ * period and trigger false positives.
+ *
+ * The sample threshold is used to check in the NMI handler whether
+ * the minimum time between two NMI samples has elapsed. That
+ * prevents false positives.
+ *
+ * Set this to 4/5 of the actual watchdog threshold period so the
+ * hrtimer is guaranteed to fire at least once within the real
+ * watchdog threshold.
+ */
+ watchdog_hrtimer_sample_threshold = period * 2;
+}
+
+static bool watchdog_check_timestamp(void)
+{
+ ktime_t delta, now = ktime_get_mono_fast_ns();
+
+ delta = now - __this_cpu_read(last_timestamp);
+ if (delta < watchdog_hrtimer_sample_threshold) {
+ /*
+ * If ktime is jiffies based, a stalled timer would prevent
+ * jiffies from being incremented and the filter would look
+ * at a stale timestamp and never trigger.
+ */
+ if (__this_cpu_inc_return(nmi_rearmed) < 10)
+ return false;
+ }
+ __this_cpu_write(nmi_rearmed, 0);
+ __this_cpu_write(last_timestamp, now);
+ return true;
+}
+#else
+static inline bool watchdog_check_timestamp(void)
+{
+ return true;
+}
+#endif
+
static struct perf_event_attr wd_hw_attr = {
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES,
@@ -61,6 +117,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
return;
}
+ if (!watchdog_check_timestamp())
+ return;
+
/* check for a hardlockup
* This is done by making sure our timer interrupt
* is incrementing. The timer interrupt should have
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a86688fabc55..ca937b0c3a96 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3577,6 +3577,13 @@ static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
/* yeap, return possible CPUs in @node that @attrs wants */
cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
+
+ if (cpumask_empty(cpumask)) {
+ pr_warn_once("WARNING: workqueue cpumask: online intersect > "
+ "possible intersect\n");
+ return false;
+ }
+
return !cpumask_equal(cpumask, attrs->cpumask);
use_dfl:
@@ -3744,8 +3751,12 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
return -EINVAL;
/* creating multiple pwqs breaks ordering guarantee */
- if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
- return -EINVAL;
+ if (!list_empty(&wq->pwqs)) {
+ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
+ return -EINVAL;
+
+ wq->flags &= ~__WQ_ORDERED;
+ }
ctx = apply_wqattrs_prepare(wq, attrs);
if (!ctx)
@@ -3929,6 +3940,16 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
+ /*
+ * Unbound && max_active == 1 used to imply ordered, which is no
+ * longer the case on NUMA machines due to per-node pools. While
+ * alloc_ordered_workqueue() is the right way to create an ordered
+ * workqueue, keep the previous behavior to avoid subtle breakages
+ * on NUMA.
+ */
+ if ((flags & WQ_UNBOUND) && max_active == 1)
+ flags |= __WQ_ORDERED;
+
/* see the comment above the definition of WQ_POWER_EFFICIENT */
if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
flags |= WQ_UNBOUND;
@@ -4119,13 +4140,14 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
struct pool_workqueue *pwq;
/* disallow meddling with max_active for ordered workqueues */
- if (WARN_ON(wq->flags & __WQ_ORDERED))
+ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
return;
max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
mutex_lock(&wq->mutex);
+ wq->flags &= ~__WQ_ORDERED;
wq->saved_max_active = max_active;
for_each_pwq(pwq, wq)
@@ -5253,7 +5275,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
* attributes breaks ordering guarantee. Disallow exposing ordered
* workqueues.
*/
- if (WARN_ON(wq->flags & __WQ_ORDERED))
+ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
return -EINVAL;
wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);