summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2019-02-11 08:01:50 +0100
committerIngo Molnar <mingo@kernel.org>2019-02-11 08:01:50 +0100
commitc9ba7560c550fe6c1f4a8f0666bea41d2a349d1d (patch)
treeb15d53e69f73b160b4625e91187cd436c5a3e683 /kernel
parentf6783319737f28e4436a69611853a5a098cbe974 (diff)
parentd13937116f1e82bf508a6325111b322c30c85eb9 (diff)
downloadlinux-c9ba7560c550fe6c1f4a8f0666bea41d2a349d1d.tar.bz2
Merge tag 'v5.0-rc6' into sched/core, to pick up fixes
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/btf.c3
-rw-r--r--kernel/bpf/cgroup.c2
-rw-r--r--kernel/bpf/hashtab.c4
-rw-r--r--kernel/bpf/percpu_freelist.c41
-rw-r--r--kernel/bpf/percpu_freelist.h4
-rw-r--r--kernel/bpf/syscall.c12
-rw-r--r--kernel/cpu.c38
-rw-r--r--kernel/events/core.c14
-rw-r--r--kernel/events/ring_buffer.c3
-rw-r--r--kernel/exit.c14
-rw-r--r--kernel/futex.c45
-rw-r--r--kernel/irq/irqdesc.c2
-rw-r--r--kernel/irq/manage.c3
-rw-r--r--kernel/locking/rtmutex.c37
-rw-r--r--kernel/locking/rwsem-xadd.c11
-rw-r--r--kernel/relay.c4
-rw-r--r--kernel/sched/core.c19
-rw-r--r--kernel/sched/fair.c1
-rw-r--r--kernel/sched/psi.c21
-rw-r--r--kernel/signal.c63
-rw-r--r--kernel/smp.c2
-rw-r--r--kernel/time/posix-cpu-timers.c1
-rw-r--r--kernel/trace/bpf_trace.c14
-rw-r--r--kernel/trace/trace_uprobe.c9
-rw-r--r--kernel/workqueue.c23
-rw-r--r--kernel/workqueue_internal.h6
26 files changed, 282 insertions, 114 deletions
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index befe570be5ba..c57bd10340ed 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1459,7 +1459,8 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
/* "typedef void new_void", "const void"...etc */
if (!btf_type_is_void(next_type) &&
- !btf_type_is_fwd(next_type)) {
+ !btf_type_is_fwd(next_type) &&
+ !btf_type_is_func_proto(next_type)) {
btf_verifier_log_type(env, v->t, "Invalid type_id");
return -EINVAL;
}
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ab612fe9862f..d17d05570a3f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -572,7 +572,7 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
bpf_compute_and_save_data_end(skb, &saved_data_end);
ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
- bpf_prog_run_save_cb);
+ __bpf_prog_run_save_cb);
bpf_restore_data_end(skb, saved_data_end);
__skb_pull(skb, offset);
skb->sk = save_sk;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 4b7c76765d9d..f9274114c88d 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -686,7 +686,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
}
if (htab_is_prealloc(htab)) {
- pcpu_freelist_push(&htab->freelist, &l->fnode);
+ __pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
atomic_dec(&htab->count);
l->htab = htab;
@@ -748,7 +748,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
} else {
struct pcpu_freelist_node *l;
- l = pcpu_freelist_pop(&htab->freelist);
+ l = __pcpu_freelist_pop(&htab->freelist);
if (!l)
return ERR_PTR(-E2BIG);
l_new = container_of(l, struct htab_elem, fnode);
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 673fa6fe2d73..0c1b4ba9e90e 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -28,8 +28,8 @@ void pcpu_freelist_destroy(struct pcpu_freelist *s)
free_percpu(s->freelist);
}
-static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head,
- struct pcpu_freelist_node *node)
+static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
+ struct pcpu_freelist_node *node)
{
raw_spin_lock(&head->lock);
node->next = head->first;
@@ -37,12 +37,22 @@ static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head,
raw_spin_unlock(&head->lock);
}
-void pcpu_freelist_push(struct pcpu_freelist *s,
+void __pcpu_freelist_push(struct pcpu_freelist *s,
struct pcpu_freelist_node *node)
{
struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist);
- __pcpu_freelist_push(head, node);
+ ___pcpu_freelist_push(head, node);
+}
+
+void pcpu_freelist_push(struct pcpu_freelist *s,
+ struct pcpu_freelist_node *node)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __pcpu_freelist_push(s, node);
+ local_irq_restore(flags);
}
void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
@@ -63,7 +73,7 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
for_each_possible_cpu(cpu) {
again:
head = per_cpu_ptr(s->freelist, cpu);
- __pcpu_freelist_push(head, buf);
+ ___pcpu_freelist_push(head, buf);
i++;
buf += elem_size;
if (i == nr_elems)
@@ -74,14 +84,12 @@ again:
local_irq_restore(flags);
}
-struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
+struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
{
struct pcpu_freelist_head *head;
struct pcpu_freelist_node *node;
- unsigned long flags;
int orig_cpu, cpu;
- local_irq_save(flags);
orig_cpu = cpu = raw_smp_processor_id();
while (1) {
head = per_cpu_ptr(s->freelist, cpu);
@@ -89,16 +97,25 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
node = head->first;
if (node) {
head->first = node->next;
- raw_spin_unlock_irqrestore(&head->lock, flags);
+ raw_spin_unlock(&head->lock);
return node;
}
raw_spin_unlock(&head->lock);
cpu = cpumask_next(cpu, cpu_possible_mask);
if (cpu >= nr_cpu_ids)
cpu = 0;
- if (cpu == orig_cpu) {
- local_irq_restore(flags);
+ if (cpu == orig_cpu)
return NULL;
- }
}
}
+
+struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
+{
+ struct pcpu_freelist_node *ret;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ret = __pcpu_freelist_pop(s);
+ local_irq_restore(flags);
+ return ret;
+}
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
index 3049aae8ea1e..c3960118e617 100644
--- a/kernel/bpf/percpu_freelist.h
+++ b/kernel/bpf/percpu_freelist.h
@@ -22,8 +22,12 @@ struct pcpu_freelist_node {
struct pcpu_freelist_node *next;
};
+/* pcpu_freelist_* do spin_lock_irqsave. */
void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *);
struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *);
+/* __pcpu_freelist_* do spin_lock only. caller must disable irqs. */
+void __pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *);
+struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *);
void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
u32 nr_elems);
int pcpu_freelist_init(struct pcpu_freelist *);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b155cd17c1bd..8577bb7f8be6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -713,8 +713,13 @@ static int map_lookup_elem(union bpf_attr *attr)
if (bpf_map_is_dev_bound(map)) {
err = bpf_map_offload_lookup_elem(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ goto done;
+ }
+
+ preempt_disable();
+ this_cpu_inc(bpf_prog_active);
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_copy(map, key, value);
@@ -744,7 +749,10 @@ static int map_lookup_elem(union bpf_attr *attr)
}
rcu_read_unlock();
}
+ this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+done:
if (err)
goto free_value;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 91d5c38eb7e5..d1c6d152da89 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -376,9 +376,6 @@ void __weak arch_smt_update(void) { }
#ifdef CONFIG_HOTPLUG_SMT
enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
-EXPORT_SYMBOL_GPL(cpu_smt_control);
-
-static bool cpu_smt_available __read_mostly;
void __init cpu_smt_disable(bool force)
{
@@ -397,25 +394,11 @@ void __init cpu_smt_disable(bool force)
/*
* The decision whether SMT is supported can only be done after the full
- * CPU identification. Called from architecture code before non boot CPUs
- * are brought up.
- */
-void __init cpu_smt_check_topology_early(void)
-{
- if (!topology_smt_supported())
- cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
-}
-
-/*
- * If SMT was disabled by BIOS, detect it here, after the CPUs have been
- * brought online. This ensures the smt/l1tf sysfs entries are consistent
- * with reality. cpu_smt_available is set to true during the bringup of non
- * boot CPUs when a SMT sibling is detected. Note, this may overwrite
- * cpu_smt_control's previous setting.
+ * CPU identification. Called from architecture code.
*/
void __init cpu_smt_check_topology(void)
{
- if (!cpu_smt_available)
+ if (!topology_smt_supported())
cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
}
@@ -428,18 +411,10 @@ early_param("nosmt", smt_cmdline_disable);
static inline bool cpu_smt_allowed(unsigned int cpu)
{
- if (topology_is_primary_thread(cpu))
+ if (cpu_smt_control == CPU_SMT_ENABLED)
return true;
- /*
- * If the CPU is not a 'primary' thread and the booted_once bit is
- * set then the processor has SMT support. Store this information
- * for the late check of SMT support in cpu_smt_check_topology().
- */
- if (per_cpu(cpuhp_state, cpu).booted_once)
- cpu_smt_available = true;
-
- if (cpu_smt_control == CPU_SMT_ENABLED)
+ if (topology_is_primary_thread(cpu))
return true;
/*
@@ -2090,10 +2065,8 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
*/
cpuhp_offline_cpu_device(cpu);
}
- if (!ret) {
+ if (!ret)
cpu_smt_control = ctrlval;
- arch_smt_update();
- }
cpu_maps_update_done();
return ret;
}
@@ -2104,7 +2077,6 @@ static int cpuhp_smt_enable(void)
cpu_maps_update_begin();
cpu_smt_control = CPU_SMT_ENABLED;
- arch_smt_update();
for_each_present_cpu(cpu) {
/* Skip online CPUs and CPUs on offline nodes */
if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3cd13a30f732..e5ede6918050 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -436,18 +436,18 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-
- if (ret || !write)
- return ret;
-
+ int ret;
+ int perf_cpu = sysctl_perf_cpu_time_max_percent;
/*
* If throttling is disabled don't allow the write:
*/
- if (sysctl_perf_cpu_time_max_percent == 100 ||
- sysctl_perf_cpu_time_max_percent == 0)
+ if (write && (perf_cpu == 100 || perf_cpu == 0))
return -EINVAL;
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret || !write)
+ return ret;
+
max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
update_perf_cpu_limits();
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 4a9937076331..309ef5a64af5 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -734,6 +734,9 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
size = sizeof(struct ring_buffer);
size += nr_pages * sizeof(void *);
+ if (order_base_2(size) >= MAX_ORDER)
+ goto fail;
+
rb = kzalloc(size, GFP_KERNEL);
if (!rb)
goto fail;
diff --git a/kernel/exit.c b/kernel/exit.c
index 284f2fe9a293..2639a30a8aa5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -307,7 +307,7 @@ void rcuwait_wake_up(struct rcuwait *w)
* MB (A) MB (B)
* [L] cond [L] tsk
*/
- smp_rmb(); /* (B) */
+ smp_mb(); /* (B) */
/*
* Avoid using task_rcu_dereference() magic as long as we are careful,
@@ -558,12 +558,14 @@ static struct task_struct *find_alive_thread(struct task_struct *p)
return NULL;
}
-static struct task_struct *find_child_reaper(struct task_struct *father)
+static struct task_struct *find_child_reaper(struct task_struct *father,
+ struct list_head *dead)
__releases(&tasklist_lock)
__acquires(&tasklist_lock)
{
struct pid_namespace *pid_ns = task_active_pid_ns(father);
struct task_struct *reaper = pid_ns->child_reaper;
+ struct task_struct *p, *n;
if (likely(reaper != father))
return reaper;
@@ -579,6 +581,12 @@ static struct task_struct *find_child_reaper(struct task_struct *father)
panic("Attempted to kill init! exitcode=0x%08x\n",
father->signal->group_exit_code ?: father->exit_code);
}
+
+ list_for_each_entry_safe(p, n, dead, ptrace_entry) {
+ list_del_init(&p->ptrace_entry);
+ release_task(p);
+ }
+
zap_pid_ns_processes(pid_ns);
write_lock_irq(&tasklist_lock);
@@ -668,7 +676,7 @@ static void forget_original_parent(struct task_struct *father,
exit_ptrace(father, dead);
/* Can drop and reacquire tasklist_lock */
- reaper = find_child_reaper(father);
+ reaper = find_child_reaper(father, dead);
if (list_empty(&father->children))
return;
diff --git a/kernel/futex.c b/kernel/futex.c
index be3bff2315ff..a0514e01c3eb 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1452,11 +1452,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
return;
- /*
- * Queue the task for later wakeup for after we've released
- * the hb->lock. wake_q_add() grabs reference to p.
- */
- wake_q_add(wake_q, p);
+ get_task_struct(p);
__unqueue_futex(q);
/*
* The waiting task can free the futex_q as soon as q->lock_ptr = NULL
@@ -1466,6 +1462,13 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
* plist_del in __unqueue_futex().
*/
smp_store_release(&q->lock_ptr, NULL);
+
+ /*
+ * Queue the task for later wakeup for after we've released
+ * the hb->lock. wake_q_add() grabs reference to p.
+ */
+ wake_q_add(wake_q, p);
+ put_task_struct(p);
}
/*
@@ -2218,11 +2221,11 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
* decrement the counter at queue_unlock() when some error has
* occurred and we don't end up adding the task to the list.
*/
- hb_waiters_inc(hb);
+ hb_waiters_inc(hb); /* implies smp_mb(); (A) */
q->lock_ptr = &hb->lock;
- spin_lock(&hb->lock); /* implies smp_mb(); (A) */
+ spin_lock(&hb->lock);
return hb;
}
@@ -2858,35 +2861,39 @@ retry_private:
* and BUG when futex_unlock_pi() interleaves with this.
*
* Therefore acquire wait_lock while holding hb->lock, but drop the
- * latter before calling rt_mutex_start_proxy_lock(). This still fully
- * serializes against futex_unlock_pi() as that does the exact same
- * lock handoff sequence.
+ * latter before calling __rt_mutex_start_proxy_lock(). This
+ * interleaves with futex_unlock_pi() -- which does a similar lock
+ * handoff -- such that the latter can observe the futex_q::pi_state
+ * before __rt_mutex_start_proxy_lock() is done.
*/
raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
spin_unlock(q.lock_ptr);
+ /*
+ * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
+ * such that futex_unlock_pi() is guaranteed to observe the waiter when
+ * it sees the futex_q::pi_state.
+ */
ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
if (ret) {
if (ret == 1)
ret = 0;
-
- spin_lock(q.lock_ptr);
- goto no_block;
+ goto cleanup;
}
-
if (unlikely(to))
hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+cleanup:
spin_lock(q.lock_ptr);
/*
- * If we failed to acquire the lock (signal/timeout), we must
+ * If we failed to acquire the lock (deadlock/signal/timeout), we must
* first acquire the hb->lock before removing the lock from the
- * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
- * wait lists consistent.
+ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
+ * lists consistent.
*
* In particular; it is important that futex_unlock_pi() can not
* observe this inconsistency.
@@ -3010,6 +3017,10 @@ retry:
* there is no point where we hold neither; and therefore
* wake_futex_pi() must observe a state consistent with what we
* observed.
+ *
+ * In particular; this forces __rt_mutex_start_proxy() to
+ * complete such that we're guaranteed to observe the
+ * rt_waiter. Also see the WARN in wake_futex_pi().
*/
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index ee062b7939d3..ef8ad36cadcf 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -457,7 +457,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
/* Validate affinity mask(s) */
if (affinity) {
- for (i = 0; i < cnt; i++, i++) {
+ for (i = 0; i < cnt; i++) {
if (cpumask_empty(&affinity[i].mask))
return -EINVAL;
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index a4888ce4667a..84b54a17b95d 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -393,6 +393,9 @@ int irq_setup_affinity(struct irq_desc *desc)
}
cpumask_and(&mask, cpu_online_mask, set);
+ if (cpumask_empty(&mask))
+ cpumask_copy(&mask, cpu_online_mask);
+
if (node != NUMA_NO_NODE) {
const struct cpumask *nodemask = cpumask_of_node(node);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 581edcc63c26..978d63a8261c 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1726,12 +1726,33 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
rt_mutex_set_owner(lock, NULL);
}
+/**
+ * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
+ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
+ *
+ * NOTE: does _NOT_ remove the @waiter on failure; must either call
+ * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this.
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for PI-futex support.
+ */
int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task)
{
int ret;
+ lockdep_assert_held(&lock->wait_lock);
+
if (try_to_take_rt_mutex(lock, task, NULL))
return 1;
@@ -1749,9 +1770,6 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
ret = 0;
}
- if (unlikely(ret))
- remove_waiter(lock, waiter);
-
debug_rt_mutex_print_deadlock(waiter);
return ret;
@@ -1763,12 +1781,18 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
* @waiter: the pre-initialized rt_mutex_waiter
* @task: the task to prepare
*
+ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
+ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
+ *
+ * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter
+ * on failure.
+ *
* Returns:
* 0 - task blocked on lock
* 1 - acquired the lock for task, caller should wake it up
* <0 - error
*
- * Special API call for FUTEX_REQUEUE_PI support.
+ * Special API call for PI-futex support.
*/
int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
@@ -1778,6 +1802,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
raw_spin_lock_irq(&lock->wait_lock);
ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
+ if (unlikely(ret))
+ remove_waiter(lock, waiter);
raw_spin_unlock_irq(&lock->wait_lock);
return ret;
@@ -1845,7 +1871,8 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
* @lock: the rt_mutex we were woken on
* @waiter: the pre-initialized rt_mutex_waiter
*
- * Attempt to clean up after a failed rt_mutex_wait_proxy_lock().
+ * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or
+ * rt_mutex_wait_proxy_lock().
*
* Unless we acquired the lock; we're still enqueued on the wait-list and can
* in fact still be granted ownership until we're removed. Therefore we can
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 09b180063ee1..50d9af615dc4 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -198,15 +198,22 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
woken++;
tsk = waiter->task;
- wake_q_add(wake_q, tsk);
+ get_task_struct(tsk);
list_del(&waiter->list);
/*
- * Ensure that the last operation is setting the reader
+ * Ensure calling get_task_struct() before setting the reader
* waiter to nil such that rwsem_down_read_failed() cannot
* race with do_exit() by always holding a reference count
* to the task to wakeup.
*/
smp_store_release(&waiter->task, NULL);
+ /*
+ * Ensure issuing the wakeup (either by us or someone else)
+ * after setting the reader waiter to nil.
+ */
+ wake_q_add(wake_q, tsk);
+ /* wake_q_add() already take the task ref */
+ put_task_struct(tsk);
}
adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
diff --git a/kernel/relay.c b/kernel/relay.c
index 04f248644e06..9e0f52375487 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -428,6 +428,8 @@ static struct dentry *relay_create_buf_file(struct rchan *chan,
dentry = chan->cb->create_buf_file(tmpname, chan->parent,
S_IRUSR, buf,
&chan->is_global);
+ if (IS_ERR(dentry))
+ dentry = NULL;
kfree(tmpname);
@@ -461,7 +463,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
dentry = chan->cb->create_buf_file(NULL, NULL,
S_IRUSR, buf,
&chan->is_global);
- if (WARN_ON(dentry))
+ if (IS_ERR_OR_NULL(dentry))
goto free_buf;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ec1b67a195cc..e86e2b8f6922 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -398,6 +398,18 @@ static bool set_nr_if_polling(struct task_struct *p)
#endif
#endif
+/**
+ * wake_q_add() - queue a wakeup for 'later' waking.
+ * @head: the wake_q_head to add @task to
+ * @task: the task to queue for 'later' wakeup
+ *
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
+ * instantly.
+ *
+ * This function must be used as-if it were wake_up_process(); IOW the task
+ * must be ready to be woken at this location.
+ */
void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
struct wake_q_node *node = &task->wake_q;
@@ -407,10 +419,11 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
* its already queued (either by us or someone else) and will get the
* wakeup due to that.
*
- * This cmpxchg() executes a full barrier, which pairs with the full
- * barrier executed by the wakeup in wake_up_q().
+ * In order to ensure that a pending wakeup will observe our pending
+ * state, even in the failed case, an explicit smp_mb() must be used.
*/
- if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
+ smp_mb__before_atomic();
+ if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))
return;
get_task_struct(task);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ffd1ae7237e7..38d4669aa2ef 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6009,6 +6009,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
#ifdef CONFIG_SCHED_SMT
DEFINE_STATIC_KEY_FALSE(sched_smt_present);
+EXPORT_SYMBOL_GPL(sched_smt_present);
static inline void set_idle_cores(int cpu, int val)
{
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index fe24de3fbc93..c3484785b179 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -124,6 +124,7 @@
* sampling of the aggregate task states would be.
*/
+#include "../workqueue_internal.h"
#include <linux/sched/loadavg.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
@@ -480,9 +481,6 @@ static void psi_group_change(struct psi_group *group, int cpu,
groupc->tasks[t]++;
write_seqcount_end(&groupc->seq);
-
- if (!delayed_work_pending(&group->clock_work))
- schedule_delayed_work(&group->clock_work, PSI_FREQ);
}
static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
@@ -513,6 +511,7 @@ void psi_task_change(struct task_struct *task, int clear, int set)
{
int cpu = task_cpu(task);
struct psi_group *group;
+ bool wake_clock = true;
void *iter = NULL;
if (!task->pid)
@@ -530,8 +529,22 @@ void psi_task_change(struct task_struct *task, int clear, int set)
task->psi_flags &= ~clear;
task->psi_flags |= set;
- while ((group = iterate_groups(task, &iter)))
+ /*
+ * Periodic aggregation shuts off if there is a period of no
+ * task changes, so we wake it back up if necessary. However,
+ * don't do this if the task change is the aggregation worker
+ * itself going to sleep, or we'll ping-pong forever.
+ */
+ if (unlikely((clear & TSK_RUNNING) &&
+ (task->flags & PF_WQ_WORKER) &&
+ wq_worker_last_func(task) == psi_update_work))
+ wake_clock = false;
+
+ while ((group = iterate_groups(task, &iter))) {
psi_group_change(group, cpu, clear, set);
+ if (wake_clock && !delayed_work_pending(&group->clock_work))
+ schedule_delayed_work(&group->clock_work, PSI_FREQ);
+ }
}
void psi_memstall_tick(struct task_struct *task, int cpu)
diff --git a/kernel/signal.c b/kernel/signal.c
index e1d7ad8e6ab1..99fa8ff06fd9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -688,6 +688,48 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *in
}
EXPORT_SYMBOL_GPL(dequeue_signal);
+static int dequeue_synchronous_signal(kernel_siginfo_t *info)
+{
+ struct task_struct *tsk = current;
+ struct sigpending *pending = &tsk->pending;
+ struct sigqueue *q, *sync = NULL;
+
+ /*
+ * Might a synchronous signal be in the queue?
+ */
+ if (!((pending->signal.sig[0] & ~tsk->blocked.sig[0]) & SYNCHRONOUS_MASK))
+ return 0;
+
+ /*
+ * Return the first synchronous signal in the queue.
+ */
+ list_for_each_entry(q, &pending->list, list) {
+ /* Synchronous signals have a postive si_code */
+ if ((q->info.si_code > SI_USER) &&
+ (sigmask(q->info.si_signo) & SYNCHRONOUS_MASK)) {
+ sync = q;
+ goto next;
+ }
+ }
+ return 0;
+next:
+ /*
+ * Check if there is another siginfo for the same signal.
+ */
+ list_for_each_entry_continue(q, &pending->list, list) {
+ if (q->info.si_signo == sync->info.si_signo)
+ goto still_pending;
+ }
+
+ sigdelset(&pending->signal, sync->info.si_signo);
+ recalc_sigpending();
+still_pending:
+ list_del_init(&sync->list);
+ copy_siginfo(info, &sync->info);
+ __sigqueue_free(sync);
+ return info->si_signo;
+}
+
/*
* Tell a process that it has a new active signal..
*
@@ -1057,10 +1099,9 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc
result = TRACE_SIGNAL_DELIVERED;
/*
- * Skip useless siginfo allocation for SIGKILL SIGSTOP,
- * and kernel threads.
+ * Skip useless siginfo allocation for SIGKILL and kernel threads.
*/
- if (sig_kernel_only(sig) || (t->flags & PF_KTHREAD))
+ if ((sig == SIGKILL) || (t->flags & PF_KTHREAD))
goto out_set;
/*
@@ -2394,6 +2435,11 @@ relock:
goto relock;
}
+ /* Has this task already been marked for death? */
+ ksig->info.si_signo = signr = SIGKILL;
+ if (signal_group_exit(signal))
+ goto fatal;
+
for (;;) {
struct k_sigaction *ka;
@@ -2407,7 +2453,15 @@ relock:
goto relock;
}
- signr = dequeue_signal(current, &current->blocked, &ksig->info);
+ /*
+ * Signals generated by the execution of an instruction
+ * need to be delivered before any other pending signals
+ * so that the instruction pointer in the signal stack
+ * frame points to the faulting instruction.
+ */
+ signr = dequeue_synchronous_signal(&ksig->info);
+ if (!signr)
+ signr = dequeue_signal(current, &current->blocked, &ksig->info);
if (!signr)
break; /* will return 0 */
@@ -2489,6 +2543,7 @@ relock:
continue;
}
+ fatal:
spin_unlock_irq(&sighand->siglock);
/*
diff --git a/kernel/smp.c b/kernel/smp.c
index 163c451af42e..f4cf1b0bb3b8 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -584,8 +584,6 @@ void __init smp_init(void)
num_nodes, (num_nodes > 1 ? "s" : ""),
num_cpus, (num_cpus > 1 ? "s" : ""));
- /* Final decision about SMT support */
- cpu_smt_check_topology();
/* Any cleanup work */
smp_cpus_done(setup_max_cpus);
}
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 8f0644af40be..80f955210861 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -685,6 +685,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
* set up the signal and overrun bookkeeping.
*/
timer->it.cpu.incr = timespec64_to_ns(&new->it_interval);
+ timer->it_interval = ns_to_ktime(timer->it.cpu.incr);
/*
* This acts as a modification timestamp for the timer,
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 8b068adb9da1..f1a86a0d881d 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1204,22 +1204,12 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *
int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
{
- int err;
-
- mutex_lock(&bpf_event_mutex);
- err = __bpf_probe_register(btp, prog);
- mutex_unlock(&bpf_event_mutex);
- return err;
+ return __bpf_probe_register(btp, prog);
}
int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
{
- int err;
-
- mutex_lock(&bpf_event_mutex);
- err = tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog);
- mutex_unlock(&bpf_event_mutex);
- return err;
+ return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog);
}
int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index e335576b9411..9bde07c06362 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -5,7 +5,7 @@
* Copyright (C) IBM Corporation, 2010-2012
* Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
*/
-#define pr_fmt(fmt) "trace_kprobe: " fmt
+#define pr_fmt(fmt) "trace_uprobe: " fmt
#include <linux/ctype.h>
#include <linux/module.h>
@@ -160,6 +160,13 @@ fetch_store_string(unsigned long addr, void *dest, void *base)
if (ret >= 0) {
if (ret == maxlen)
dst[ret - 1] = '\0';
+ else
+ /*
+ * Include the terminating null byte. In this case it
+ * was copied by strncpy_from_user but not accounted
+ * for in ret.
+ */
+ ret++;
*(u32 *)dest = make_data_loc(ret, (void *)dst - base);
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 392be4b252f6..fc5d23d752a5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -910,6 +910,26 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
}
/**
+ * wq_worker_last_func - retrieve worker's last work function
+ *
+ * Determine the last function a worker executed. This is called from
+ * the scheduler to get a worker's last known identity.
+ *
+ * CONTEXT:
+ * spin_lock_irq(rq->lock)
+ *
+ * Return:
+ * The last work function %current executed as a worker, NULL if it
+ * hasn't executed any work yet.
+ */
+work_func_t wq_worker_last_func(struct task_struct *task)
+{
+ struct worker *worker = kthread_data(task);
+
+ return worker->last_func;
+}
+
+/**
* worker_set_flags - set worker flags and adjust nr_running accordingly
* @worker: self
* @flags: flags to set
@@ -2184,6 +2204,9 @@ __acquires(&pool->lock)
if (unlikely(cpu_intensive))
worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
+ /* tag the worker for identification in schedule() */
+ worker->last_func = worker->current_func;
+
/* we're done with it, release */
hash_del(&worker->hentry);
worker->current_work = NULL;
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 66fbb5a9e633..cb68b03ca89a 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -53,6 +53,9 @@ struct worker {
/* used only by rescuers to point to the target workqueue */
struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
+
+ /* used by the scheduler to determine a worker's last known identity */
+ work_func_t last_func;
};
/**
@@ -67,9 +70,10 @@ static inline struct worker *current_wq_worker(void)
/*
* Scheduler hooks for concurrency managed workqueue. Only to be used from
- * sched/core.c and workqueue.c.
+ * sched/ and workqueue.c.
*/
void wq_worker_waking_up(struct task_struct *task, int cpu);
struct task_struct *wq_worker_sleeping(struct task_struct *task);
+work_func_t wq_worker_last_func(struct task_struct *task);
#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */