summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/hashtab.c119
-rw-r--r--kernel/bpf/lpm_trie.c6
-rw-r--r--kernel/cgroup/cgroup-v1.c2
-rw-r--r--kernel/cgroup/pids.c2
-rw-r--r--kernel/events/core.c64
-rw-r--r--kernel/futex.c22
-rw-r--r--kernel/locking/rwsem-spinlock.c16
-rw-r--r--kernel/memremap.c4
-rw-r--r--kernel/sched/deadline.c63
-rw-r--r--kernel/sched/loadavg.c20
-rw-r--r--kernel/workqueue.c1
11 files changed, 230 insertions, 89 deletions
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3ea87fb19a94..afe5bab376c9 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -13,11 +13,12 @@
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
+#include <linux/rculist_nulls.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
struct bucket {
- struct hlist_head head;
+ struct hlist_nulls_head head;
raw_spinlock_t lock;
};
@@ -44,9 +45,14 @@ enum extra_elem_state {
/* each htab element is struct htab_elem + key + value */
struct htab_elem {
union {
- struct hlist_node hash_node;
- struct bpf_htab *htab;
- struct pcpu_freelist_node fnode;
+ struct hlist_nulls_node hash_node;
+ struct {
+ void *padding;
+ union {
+ struct bpf_htab *htab;
+ struct pcpu_freelist_node fnode;
+ };
+ };
};
union {
struct rcu_head rcu;
@@ -162,7 +168,8 @@ skip_percpu_elems:
offsetof(struct htab_elem, lru_node),
htab->elem_size, htab->map.max_entries);
else
- pcpu_freelist_populate(&htab->freelist, htab->elems,
+ pcpu_freelist_populate(&htab->freelist,
+ htab->elems + offsetof(struct htab_elem, fnode),
htab->elem_size, htab->map.max_entries);
return 0;
@@ -217,6 +224,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
int err, i;
u64 cost;
+ BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
+ offsetof(struct htab_elem, hash_node.pprev));
+ BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
+ offsetof(struct htab_elem, hash_node.pprev));
+
if (lru && !capable(CAP_SYS_ADMIN))
/* LRU implementation is much complicated than other
* maps. Hence, limit to CAP_SYS_ADMIN for now.
@@ -326,7 +338,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
goto free_htab;
for (i = 0; i < htab->n_buckets; i++) {
- INIT_HLIST_HEAD(&htab->buckets[i].head);
+ INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
raw_spin_lock_init(&htab->buckets[i].lock);
}
@@ -366,20 +378,44 @@ static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
return &htab->buckets[hash & (htab->n_buckets - 1)];
}
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
return &__select_bucket(htab, hash)->head;
}
-static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
+/* this lookup function can only be called with bucket lock taken */
+static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
void *key, u32 key_size)
{
+ struct hlist_nulls_node *n;
+ struct htab_elem *l;
+
+ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
+ if (l->hash == hash && !memcmp(&l->key, key, key_size))
+ return l;
+
+ return NULL;
+}
+
+/* can be called without bucket lock. it will repeat the loop in
+ * the unlikely event when elements moved from one bucket into another
+ * while link list is being walked
+ */
+static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
+ u32 hash, void *key,
+ u32 key_size, u32 n_buckets)
+{
+ struct hlist_nulls_node *n;
struct htab_elem *l;
- hlist_for_each_entry_rcu(l, head, hash_node)
+again:
+ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
if (l->hash == hash && !memcmp(&l->key, key, key_size))
return l;
+ if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
+ goto again;
+
return NULL;
}
@@ -387,7 +423,7 @@ static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
struct htab_elem *l;
u32 hash, key_size;
@@ -400,7 +436,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
head = select_bucket(htab, hash);
- l = lookup_elem_raw(head, hash, key, key_size);
+ l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
return l;
}
@@ -433,8 +469,9 @@ static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
{
struct bpf_htab *htab = (struct bpf_htab *)arg;
- struct htab_elem *l, *tgt_l;
- struct hlist_head *head;
+ struct htab_elem *l = NULL, *tgt_l;
+ struct hlist_nulls_head *head;
+ struct hlist_nulls_node *n;
unsigned long flags;
struct bucket *b;
@@ -444,9 +481,9 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
raw_spin_lock_irqsave(&b->lock, flags);
- hlist_for_each_entry_rcu(l, head, hash_node)
+ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
if (l == tgt_l) {
- hlist_del_rcu(&l->hash_node);
+ hlist_nulls_del_rcu(&l->hash_node);
break;
}
@@ -459,7 +496,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
struct htab_elem *l, *next_l;
u32 hash, key_size;
int i;
@@ -473,7 +510,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
head = select_bucket(htab, hash);
/* lookup the key */
- l = lookup_elem_raw(head, hash, key, key_size);
+ l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
if (!l) {
i = 0;
@@ -481,7 +518,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
}
/* key was found, get next key in the same bucket */
- next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+ next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
struct htab_elem, hash_node);
if (next_l) {
@@ -500,7 +537,7 @@ find_first_elem:
head = select_bucket(htab, i);
/* pick first element in the bucket */
- next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+ next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
struct htab_elem, hash_node);
if (next_l) {
/* if it's not empty, just return it */
@@ -582,9 +619,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
int err = 0;
if (prealloc) {
- l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
- if (!l_new)
+ struct pcpu_freelist_node *l;
+
+ l = pcpu_freelist_pop(&htab->freelist);
+ if (!l)
err = -E2BIG;
+ else
+ l_new = container_of(l, struct htab_elem, fnode);
} else {
if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
atomic_dec(&htab->count);
@@ -661,7 +702,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
unsigned long flags;
struct bucket *b;
u32 key_size, hash;
@@ -700,9 +741,9 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
/* add new element to the head of the list, so that
* concurrent search will find it before old elem
*/
- hlist_add_head_rcu(&l_new->hash_node, head);
+ hlist_nulls_add_head_rcu(&l_new->hash_node, head);
if (l_old) {
- hlist_del_rcu(&l_old->hash_node);
+ hlist_nulls_del_rcu(&l_old->hash_node);
free_htab_elem(htab, l_old);
}
ret = 0;
@@ -716,7 +757,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new, *l_old = NULL;
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
unsigned long flags;
struct bucket *b;
u32 key_size, hash;
@@ -757,10 +798,10 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
/* add new element to the head of the list, so that
* concurrent search will find it before old elem
*/
- hlist_add_head_rcu(&l_new->hash_node, head);
+ hlist_nulls_add_head_rcu(&l_new->hash_node, head);
if (l_old) {
bpf_lru_node_set_ref(&l_new->lru_node);
- hlist_del_rcu(&l_old->hash_node);
+ hlist_nulls_del_rcu(&l_old->hash_node);
}
ret = 0;
@@ -781,7 +822,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
unsigned long flags;
struct bucket *b;
u32 key_size, hash;
@@ -820,7 +861,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
ret = PTR_ERR(l_new);
goto err;
}
- hlist_add_head_rcu(&l_new->hash_node, head);
+ hlist_nulls_add_head_rcu(&l_new->hash_node, head);
}
ret = 0;
err:
@@ -834,7 +875,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
unsigned long flags;
struct bucket *b;
u32 key_size, hash;
@@ -882,7 +923,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
} else {
pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),
value, onallcpus);
- hlist_add_head_rcu(&l_new->hash_node, head);
+ hlist_nulls_add_head_rcu(&l_new->hash_node, head);
l_new = NULL;
}
ret = 0;
@@ -910,7 +951,7 @@ static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
struct bucket *b;
struct htab_elem *l;
unsigned long flags;
@@ -930,7 +971,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
l = lookup_elem_raw(head, hash, key, key_size);
if (l) {
- hlist_del_rcu(&l->hash_node);
+ hlist_nulls_del_rcu(&l->hash_node);
free_htab_elem(htab, l);
ret = 0;
}
@@ -942,7 +983,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
struct bucket *b;
struct htab_elem *l;
unsigned long flags;
@@ -962,7 +1003,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
l = lookup_elem_raw(head, hash, key, key_size);
if (l) {
- hlist_del_rcu(&l->hash_node);
+ hlist_nulls_del_rcu(&l->hash_node);
ret = 0;
}
@@ -977,12 +1018,12 @@ static void delete_all_elements(struct bpf_htab *htab)
int i;
for (i = 0; i < htab->n_buckets; i++) {
- struct hlist_head *head = select_bucket(htab, i);
- struct hlist_node *n;
+ struct hlist_nulls_head *head = select_bucket(htab, i);
+ struct hlist_nulls_node *n;
struct htab_elem *l;
- hlist_for_each_entry_safe(l, n, head, hash_node) {
- hlist_del_rcu(&l->hash_node);
+ hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
+ hlist_nulls_del_rcu(&l->hash_node);
if (l->state != HTAB_EXTRA_ELEM_USED)
htab_elem_free(htab, l);
}
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 8bfe0afaee10..b37bd9ab7f57 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -500,9 +500,15 @@ unlock:
raw_spin_unlock(&trie->lock);
}
+static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -ENOTSUPP;
+}
+
static const struct bpf_map_ops trie_ops = {
.map_alloc = trie_alloc,
.map_free = trie_free,
+ .map_get_next_key = trie_get_next_key,
.map_lookup_elem = trie_lookup_elem,
.map_update_elem = trie_update_elem,
.map_delete_elem = trie_delete_elem,
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 56eba9caa632..1dc22f6b49f5 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1329,7 +1329,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
struct task_struct *task;
int count = 0;
- seq_printf(seq, "css_set %p\n", cset);
+ seq_printf(seq, "css_set %pK\n", cset);
list_for_each_entry(task, &cset->tasks, cg_list) {
if (count++ > MAX_TASKS_SHOWN_PER_CSS)
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index e756dae49300..2237201d66d5 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -229,7 +229,7 @@ static int pids_can_fork(struct task_struct *task)
/* Only log the first time events_limit is incremented. */
if (atomic64_inc_return(&pids->events_limit) == 1) {
pr_info("cgroup: fork rejected by pids controller in ");
- pr_cont_cgroup_path(task_cgroup(current, pids_cgrp_id));
+ pr_cont_cgroup_path(css->cgroup);
pr_cont("\n");
}
cgroup_file_notify(&pids->events_file);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a17ed56c8ce1..ff01cba86f43 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4256,7 +4256,7 @@ int perf_event_release_kernel(struct perf_event *event)
raw_spin_lock_irq(&ctx->lock);
/*
- * Mark this even as STATE_DEAD, there is no external reference to it
+ * Mark this event as STATE_DEAD, there is no external reference to it
* anymore.
*
* Anybody acquiring event->child_mutex after the below loop _must_
@@ -10417,21 +10417,22 @@ void perf_event_free_task(struct task_struct *task)
continue;
mutex_lock(&ctx->mutex);
-again:
- list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
- group_entry)
- perf_free_event(event, ctx);
+ raw_spin_lock_irq(&ctx->lock);
+ /*
+ * Destroy the task <-> ctx relation and mark the context dead.
+ *
+ * This is important because even though the task hasn't been
+ * exposed yet the context has been (through child_list).
+ */
+ RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
+ WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
+ put_task_struct(task); /* cannot be last */
+ raw_spin_unlock_irq(&ctx->lock);
- list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
- group_entry)
+ list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
perf_free_event(event, ctx);
- if (!list_empty(&ctx->pinned_groups) ||
- !list_empty(&ctx->flexible_groups))
- goto again;
-
mutex_unlock(&ctx->mutex);
-
put_ctx(ctx);
}
}
@@ -10469,7 +10470,12 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
}
/*
- * inherit a event from parent task to child task:
+ * Inherit a event from parent task to child task.
+ *
+ * Returns:
+ * - valid pointer on success
+ * - NULL for orphaned events
+ * - IS_ERR() on error
*/
static struct perf_event *
inherit_event(struct perf_event *parent_event,
@@ -10563,6 +10569,16 @@ inherit_event(struct perf_event *parent_event,
return child_event;
}
+/*
+ * Inherits an event group.
+ *
+ * This will quietly suppress orphaned events; !inherit_event() is not an error.
+ * This matches with perf_event_release_kernel() removing all child events.
+ *
+ * Returns:
+ * - 0 on success
+ * - <0 on error
+ */
static int inherit_group(struct perf_event *parent_event,
struct task_struct *parent,
struct perf_event_context *parent_ctx,
@@ -10577,6 +10593,11 @@ static int inherit_group(struct perf_event *parent_event,
child, NULL, child_ctx);
if (IS_ERR(leader))
return PTR_ERR(leader);
+ /*
+ * @leader can be NULL here because of is_orphaned_event(). In this
+ * case inherit_event() will create individual events, similar to what
+ * perf_group_detach() would do anyway.
+ */
list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
child_ctr = inherit_event(sub, parent, parent_ctx,
child, leader, child_ctx);
@@ -10586,6 +10607,17 @@ static int inherit_group(struct perf_event *parent_event,
return 0;
}
+/*
+ * Creates the child task context and tries to inherit the event-group.
+ *
+ * Clears @inherited_all on !attr.inherited or error. Note that we'll leave
+ * inherited_all set when we 'fail' to inherit an orphaned event; this is
+ * consistent with perf_event_release_kernel() removing all child events.
+ *
+ * Returns:
+ * - 0 on success
+ * - <0 on error
+ */
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
struct perf_event_context *parent_ctx,
@@ -10608,7 +10640,6 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
* First allocate and initialize a context for the
* child.
*/
-
child_ctx = alloc_perf_context(parent_ctx->pmu, child);
if (!child_ctx)
return -ENOMEM;
@@ -10670,7 +10701,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
ret = inherit_task_group(event, parent, parent_ctx,
child, ctxn, &inherited_all);
if (ret)
- break;
+ goto out_unlock;
}
/*
@@ -10686,7 +10717,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
ret = inherit_task_group(event, parent, parent_ctx,
child, ctxn, &inherited_all);
if (ret)
- break;
+ goto out_unlock;
}
raw_spin_lock_irqsave(&parent_ctx->lock, flags);
@@ -10714,6 +10745,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
}
raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
+out_unlock:
mutex_unlock(&parent_ctx->mutex);
perf_unpin_context(parent_ctx);
diff --git a/kernel/futex.c b/kernel/futex.c
index 229a744b1781..45858ec73941 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2815,7 +2815,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
{
struct hrtimer_sleeper timeout, *to = NULL;
struct rt_mutex_waiter rt_waiter;
- struct rt_mutex *pi_mutex = NULL;
struct futex_hash_bucket *hb;
union futex_key key2 = FUTEX_KEY_INIT;
struct futex_q q = futex_q_init;
@@ -2899,6 +2898,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr);
ret = fixup_pi_state_owner(uaddr2, &q, current);
+ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
+ rt_mutex_unlock(&q.pi_state->pi_mutex);
/*
* Drop the reference to the pi state which
* the requeue_pi() code acquired for us.
@@ -2907,6 +2908,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
spin_unlock(q.lock_ptr);
}
} else {
+ struct rt_mutex *pi_mutex;
+
/*
* We have been woken up by futex_unlock_pi(), a timeout, or a
* signal. futex_unlock_pi() will not destroy the lock_ptr nor
@@ -2930,18 +2933,19 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (res)
ret = (res < 0) ? res : 0;
+ /*
+ * If fixup_pi_state_owner() faulted and was unable to handle
+ * the fault, unlock the rt_mutex and return the fault to
+ * userspace.
+ */
+ if (ret && rt_mutex_owner(pi_mutex) == current)
+ rt_mutex_unlock(pi_mutex);
+
/* Unqueue and drop the lock. */
unqueue_me_pi(&q);
}
- /*
- * If fixup_pi_state_owner() faulted and was unable to handle the
- * fault, unlock the rt_mutex and return the fault to userspace.
- */
- if (ret == -EFAULT) {
- if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
- rt_mutex_unlock(pi_mutex);
- } else if (ret == -EINTR) {
+ if (ret == -EINTR) {
/*
* We've already been requeued, but cannot restart by calling
* futex_lock_pi() directly. We could restart this syscall, but
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 7bc24d477805..c65f7989f850 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -213,10 +213,9 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state)
*/
if (sem->count == 0)
break;
- if (signal_pending_state(state, current)) {
- ret = -EINTR;
- goto out;
- }
+ if (signal_pending_state(state, current))
+ goto out_nolock;
+
set_current_state(state);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
schedule();
@@ -224,12 +223,19 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state)
}
/* got the lock */
sem->count = -1;
-out:
list_del(&waiter.list);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
return ret;
+
+out_nolock:
+ list_del(&waiter.list);
+ if (!list_empty(&sem->wait_list))
+ __rwsem_do_wake(sem, 1);
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+
+ return -EINTR;
}
void __sched __down_write(struct rw_semaphore *sem)
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 06123234f118..07e85e5229da 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -247,11 +247,9 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(resource_size(res), SECTION_SIZE);
- lock_device_hotplug();
mem_hotplug_begin();
arch_remove_memory(align_start, align_size);
mem_hotplug_done();
- unlock_device_hotplug();
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
pgmap_radix_release(res);
@@ -364,11 +362,9 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
if (error)
goto err_pfn_remap;
- lock_device_hotplug();
mem_hotplug_begin();
error = arch_add_memory(nid, align_start, align_size, true);
mem_hotplug_done();
- unlock_device_hotplug();
if (error)
goto err_add_memory;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 99b2c33a9fbc..a2ce59015642 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -445,13 +445,13 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
*
* This function returns true if:
*
- * runtime / (deadline - t) > dl_runtime / dl_period ,
+ * runtime / (deadline - t) > dl_runtime / dl_deadline ,
*
* IOW we can't recycle current parameters.
*
- * Notice that the bandwidth check is done against the period. For
+ * Notice that the bandwidth check is done against the deadline. For
* task with deadline equal to period this is the same of using
- * dl_deadline instead of dl_period in the equation above.
+ * dl_period instead of dl_deadline in the equation above.
*/
static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
struct sched_dl_entity *pi_se, u64 t)
@@ -476,7 +476,7 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
* of anything below microseconds resolution is actually fiction
* (but still we want to give the user that illusion >;).
*/
- left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
+ left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
right = ((dl_se->deadline - t) >> DL_SCALE) *
(pi_se->dl_runtime >> DL_SCALE);
@@ -505,10 +505,15 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
}
}
+static inline u64 dl_next_period(struct sched_dl_entity *dl_se)
+{
+ return dl_se->deadline - dl_se->dl_deadline + dl_se->dl_period;
+}
+
/*
* If the entity depleted all its runtime, and if we want it to sleep
* while waiting for some new execution time to become available, we
- * set the bandwidth enforcement timer to the replenishment instant
+ * set the bandwidth replenishment timer to the replenishment instant
* and try to activate it.
*
* Notice that it is important for the caller to know if the timer
@@ -530,7 +535,7 @@ static int start_dl_timer(struct task_struct *p)
* that it is actually coming from rq->clock and not from
* hrtimer's time base reading.
*/
- act = ns_to_ktime(dl_se->deadline);
+ act = ns_to_ktime(dl_next_period(dl_se));
now = hrtimer_cb_get_time(timer);
delta = ktime_to_ns(now) - rq_clock(rq);
act = ktime_add_ns(act, delta);
@@ -638,6 +643,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
lockdep_unpin_lock(&rq->lock, rf.cookie);
rq = dl_task_offline_migration(rq, p);
rf.cookie = lockdep_pin_lock(&rq->lock);
+ update_rq_clock(rq);
/*
* Now that the task has been migrated to the new RQ and we
@@ -689,6 +695,37 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
timer->function = dl_task_timer;
}
+/*
+ * During the activation, CBS checks if it can reuse the current task's
+ * runtime and period. If the deadline of the task is in the past, CBS
+ * cannot use the runtime, and so it replenishes the task. This rule
+ * works fine for implicit deadline tasks (deadline == period), and the
+ * CBS was designed for implicit deadline tasks. However, a task with
+ * constrained deadline (deadine < period) might be awakened after the
+ * deadline, but before the next period. In this case, replenishing the
+ * task would allow it to run for runtime / deadline. As in this case
+ * deadline < period, CBS enables a task to run for more than the
+ * runtime / period. In a very loaded system, this can cause a domino
+ * effect, making other tasks miss their deadlines.
+ *
+ * To avoid this problem, in the activation of a constrained deadline
+ * task after the deadline but before the next period, throttle the
+ * task and set the replenishing timer to the begin of the next period,
+ * unless it is boosted.
+ */
+static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
+{
+ struct task_struct *p = dl_task_of(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se));
+
+ if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
+ dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
+ if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
+ return;
+ dl_se->dl_throttled = 1;
+ }
+}
+
static
int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
{
@@ -922,6 +959,11 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
__dequeue_dl_entity(dl_se);
}
+static inline bool dl_is_constrained(struct sched_dl_entity *dl_se)
+{
+ return dl_se->dl_deadline < dl_se->dl_period;
+}
+
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *pi_task = rt_mutex_get_top_task(p);
@@ -948,6 +990,15 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
}
/*
+ * Check if a constrained deadline task was activated
+ * after the deadline but before the next period.
+ * If that is the case, the task will be throttled and
+ * the replenishment timer will be set to the next period.
+ */
+ if (!p->dl.dl_throttled && dl_is_constrained(&p->dl))
+ dl_check_constrained_dl(&p->dl);
+
+ /*
* If p is throttled, we do nothing. In fact, if it exhausted
* its budget it needs a replenishment and, since it now is on
* its rq, the bandwidth timer callback (which clearly has not
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 7296b7308eca..f15fb2bdbc0d 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -169,7 +169,7 @@ static inline int calc_load_write_idx(void)
* If the folding window started, make sure we start writing in the
* next idle-delta.
*/
- if (!time_before(jiffies, calc_load_update))
+ if (!time_before(jiffies, READ_ONCE(calc_load_update)))
idx++;
return idx & 1;
@@ -202,8 +202,9 @@ void calc_load_exit_idle(void)
struct rq *this_rq = this_rq();
/*
- * If we're still before the sample window, we're done.
+ * If we're still before the pending sample window, we're done.
*/
+ this_rq->calc_load_update = READ_ONCE(calc_load_update);
if (time_before(jiffies, this_rq->calc_load_update))
return;
@@ -212,7 +213,6 @@ void calc_load_exit_idle(void)
* accounted through the nohz accounting, so skip the entire deal and
* sync up for the next window.
*/
- this_rq->calc_load_update = calc_load_update;
if (time_before(jiffies, this_rq->calc_load_update + 10))
this_rq->calc_load_update += LOAD_FREQ;
}
@@ -308,13 +308,15 @@ calc_load_n(unsigned long load, unsigned long exp,
*/
static void calc_global_nohz(void)
{
+ unsigned long sample_window;
long delta, active, n;
- if (!time_before(jiffies, calc_load_update + 10)) {
+ sample_window = READ_ONCE(calc_load_update);
+ if (!time_before(jiffies, sample_window + 10)) {
/*
* Catch-up, fold however many we are behind still
*/
- delta = jiffies - calc_load_update - 10;
+ delta = jiffies - sample_window - 10;
n = 1 + (delta / LOAD_FREQ);
active = atomic_long_read(&calc_load_tasks);
@@ -324,7 +326,7 @@ static void calc_global_nohz(void)
avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
- calc_load_update += n * LOAD_FREQ;
+ WRITE_ONCE(calc_load_update, sample_window + n * LOAD_FREQ);
}
/*
@@ -352,9 +354,11 @@ static inline void calc_global_nohz(void) { }
*/
void calc_global_load(unsigned long ticks)
{
+ unsigned long sample_window;
long active, delta;
- if (time_before(jiffies, calc_load_update + 10))
+ sample_window = READ_ONCE(calc_load_update);
+ if (time_before(jiffies, sample_window + 10))
return;
/*
@@ -371,7 +375,7 @@ void calc_global_load(unsigned long ticks)
avenrun[1] = calc_load(avenrun[1], EXP_5, active);
avenrun[2] = calc_load(avenrun[2], EXP_15, active);
- calc_load_update += LOAD_FREQ;
+ WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ);
/*
* In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 072cbc9b175d..c0168b7da1ea 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1507,6 +1507,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
struct timer_list *timer = &dwork->timer;
struct work_struct *work = &dwork->work;
+ WARN_ON_ONCE(!wq);
WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
timer->data != (unsigned long)dwork);
WARN_ON_ONCE(timer_pending(timer));