summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2016-10-30 12:42:58 -0400
committerDavid S. Miller <davem@davemloft.net>2016-10-30 12:42:58 -0400
commit27058af401e49d88a905df000dd26f443fcfa8ce (patch)
tree819f32113d3b8374b9fbf72e2202d4c4d4511a60 /kernel
parent357f4aae859b5d74554b0ccbb18556f1df4166c3 (diff)
parent2a26d99b251b8625d27aed14e97fc10707a3a81f (diff)
downloadlinux-27058af401e49d88a905df000dd26f443fcfa8ce.tar.bz2
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
Mostly simple overlapping changes. For example, David Ahern's adjacency list revamp in 'net-next' conflicted with an adjacency list traversal bug fix in 'net'. Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup.c75
-rw-r--r--kernel/cpu.c2
-rw-r--r--kernel/cpuset.c13
-rw-r--r--kernel/events/core.c23
-rw-r--r--kernel/events/uprobes.c6
-rw-r--r--kernel/fork.c7
-rw-r--r--kernel/irq/manage.c1
-rw-r--r--kernel/kcov.c9
-rw-r--r--kernel/power/suspend.c4
-rw-r--r--kernel/printk/printk.c4
-rw-r--r--kernel/ptrace.c16
-rw-r--r--kernel/rcu/tiny.c2
-rw-r--r--kernel/rcu/tree.c2
-rw-r--r--kernel/sched/core.c16
-rw-r--r--kernel/sched/debug.c3
-rw-r--r--kernel/sched/fair.c25
-rw-r--r--kernel/sched/wait.c10
-rw-r--r--kernel/softirq.c6
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/timer.c76
20 files changed, 192 insertions, 110 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 44066158f0d1..85bc9beb046d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -64,6 +64,9 @@
#include <linux/file.h>
#include <net/sock.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/cgroup.h>
+
/*
* pidlists linger the following amount before being destroyed. The goal
* is avoiding frequent destruction in the middle of consecutive read calls
@@ -1176,6 +1179,8 @@ static void cgroup_destroy_root(struct cgroup_root *root)
struct cgroup *cgrp = &root->cgrp;
struct cgrp_cset_link *link, *tmp_link;
+ trace_cgroup_destroy_root(root);
+
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
BUG_ON(atomic_read(&root->nr_cgrps));
@@ -1874,6 +1879,9 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
strcpy(root->release_agent_path, opts.release_agent);
spin_unlock(&release_agent_path_lock);
}
+
+ trace_cgroup_remount(root);
+
out_unlock:
kfree(opts.release_agent);
kfree(opts.name);
@@ -2031,6 +2039,8 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
if (ret)
goto destroy_root;
+ trace_cgroup_setup_root(root);
+
/*
* There must be no failure case after here, since rebinding takes
* care of subsystems' refcounts, which are explicitly dropped in
@@ -2315,22 +2325,18 @@ static struct file_system_type cgroup2_fs_type = {
.fs_flags = FS_USERNS_MOUNT,
};
-static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
- struct cgroup_namespace *ns)
+static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+ struct cgroup_namespace *ns)
{
struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
- int ret;
- ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
- if (ret < 0 || ret >= buflen)
- return NULL;
- return buf;
+ return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
}
-char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
- struct cgroup_namespace *ns)
+int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
+ struct cgroup_namespace *ns)
{
- char *ret;
+ int ret;
mutex_lock(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
@@ -2357,12 +2363,12 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns);
*
* Return value is the same as kernfs_path().
*/
-char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
+int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
{
struct cgroup_root *root;
struct cgroup *cgrp;
int hierarchy_id = 1;
- char *path = NULL;
+ int ret;
mutex_lock(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
@@ -2371,16 +2377,15 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
if (root) {
cgrp = task_cgroup_from_root(task, root);
- path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
+ ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
} else {
/* if no hierarchy exists, everyone is in "/" */
- if (strlcpy(buf, "/", buflen) < buflen)
- path = buf;
+ ret = strlcpy(buf, "/", buflen);
}
spin_unlock_irq(&css_set_lock);
mutex_unlock(&cgroup_mutex);
- return path;
+ return ret;
}
EXPORT_SYMBOL_GPL(task_cgroup_path);
@@ -2830,6 +2835,10 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);
cgroup_migrate_finish(&preloaded_csets);
+
+ if (!ret)
+ trace_cgroup_attach_task(dst_cgrp, leader, threadgroup);
+
return ret;
}
@@ -3611,6 +3620,8 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
mutex_lock(&cgroup_mutex);
ret = kernfs_rename(kn, new_parent, new_name_str);
+ if (!ret)
+ trace_cgroup_rename(cgrp);
mutex_unlock(&cgroup_mutex);
@@ -4381,6 +4392,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
if (task) {
ret = cgroup_migrate(task, false, to->root);
+ if (!ret)
+ trace_cgroup_transfer_tasks(to, task, false);
put_task_struct(task);
}
} while (task && !ret);
@@ -5046,6 +5059,8 @@ static void css_release_work_fn(struct work_struct *work)
ss->css_released(css);
} else {
/* cgroup release path */
+ trace_cgroup_release(cgrp);
+
cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
cgrp->id = -1;
@@ -5332,6 +5347,8 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
if (ret)
goto out_destroy;
+ trace_cgroup_mkdir(cgrp);
+
/* let's create and online css's */
kernfs_activate(kn);
@@ -5507,6 +5524,9 @@ static int cgroup_rmdir(struct kernfs_node *kn)
ret = cgroup_destroy_locked(cgrp);
+ if (!ret)
+ trace_cgroup_rmdir(cgrp);
+
cgroup_kn_unlock(kn);
return ret;
}
@@ -5743,7 +5763,7 @@ core_initcall(cgroup_wq_init);
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *tsk)
{
- char *buf, *path;
+ char *buf;
int retval;
struct cgroup_root *root;
@@ -5786,18 +5806,18 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
* " (deleted)" is appended to the cgroup path.
*/
if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
- path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
+ retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
current->nsproxy->cgroup_ns);
- if (!path) {
+ if (retval >= PATH_MAX)
retval = -ENAMETOOLONG;
+ if (retval < 0)
goto out_unlock;
- }
+
+ seq_puts(m, buf);
} else {
- path = "/";
+ seq_puts(m, "/");
}
- seq_puts(m, path);
-
if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
seq_puts(m, " (deleted)\n");
else
@@ -6062,8 +6082,9 @@ static void cgroup_release_agent(struct work_struct *work)
{
struct cgroup *cgrp =
container_of(work, struct cgroup, release_agent_work);
- char *pathbuf = NULL, *agentbuf = NULL, *path;
+ char *pathbuf = NULL, *agentbuf = NULL;
char *argv[3], *envp[3];
+ int ret;
mutex_lock(&cgroup_mutex);
@@ -6073,13 +6094,13 @@ static void cgroup_release_agent(struct work_struct *work)
goto out;
spin_lock_irq(&css_set_lock);
- path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
+ ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
spin_unlock_irq(&css_set_lock);
- if (!path)
+ if (ret < 0 || ret >= PATH_MAX)
goto out;
argv[0] = agentbuf;
- argv[1] = path;
+ argv[1] = pathbuf;
argv[2] = NULL;
/* minimal command environment */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5df20d6d1520..29de1a9352c0 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -228,7 +228,7 @@ static struct {
.wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
#ifdef CONFIG_DEBUG_LOCK_ALLOC
- .dep_map = {.name = "cpu_hotplug.lock" },
+ .dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map),
#endif
};
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 2b4c20ab5bbe..29f815d2ef7e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2715,7 +2715,7 @@ void __cpuset_memory_pressure_bump(void)
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *tsk)
{
- char *buf, *p;
+ char *buf;
struct cgroup_subsys_state *css;
int retval;
@@ -2724,14 +2724,15 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
if (!buf)
goto out;
- retval = -ENAMETOOLONG;
css = task_get_css(tsk, cpuset_cgrp_id);
- p = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
- current->nsproxy->cgroup_ns);
+ retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
css_put(css);
- if (!p)
+ if (retval >= PATH_MAX)
+ retval = -ENAMETOOLONG;
+ if (retval < 0)
goto out_free;
- seq_puts(m, p);
+ seq_puts(m, buf);
seq_putc(m, '\n');
retval = 0;
out_free:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c6e47e97b33f..0e292132efac 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1960,6 +1960,12 @@ void perf_event_disable(struct perf_event *event)
}
EXPORT_SYMBOL_GPL(perf_event_disable);
+void perf_event_disable_inatomic(struct perf_event *event)
+{
+ event->pending_disable = 1;
+ irq_work_queue(&event->pending);
+}
+
static void perf_set_shadow_time(struct perf_event *event,
struct perf_event_context *ctx,
u64 tstamp)
@@ -7075,8 +7081,8 @@ static int __perf_event_overflow(struct perf_event *event,
if (events && atomic_dec_and_test(&event->event_limit)) {
ret = 1;
event->pending_kill = POLL_HUP;
- event->pending_disable = 1;
- irq_work_queue(&event->pending);
+
+ perf_event_disable_inatomic(event);
}
READ_ONCE(event->overflow_handler)(event, data, regs);
@@ -8855,7 +8861,10 @@ EXPORT_SYMBOL_GPL(perf_pmu_register);
void perf_pmu_unregister(struct pmu *pmu)
{
+ int remove_device;
+
mutex_lock(&pmus_lock);
+ remove_device = pmu_bus_running;
list_del_rcu(&pmu->entry);
mutex_unlock(&pmus_lock);
@@ -8869,10 +8878,12 @@ void perf_pmu_unregister(struct pmu *pmu)
free_percpu(pmu->pmu_disable_count);
if (pmu->type >= PERF_TYPE_MAX)
idr_remove(&pmu_idr, pmu->type);
- if (pmu->nr_addr_filters)
- device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
- device_del(pmu->dev);
- put_device(pmu->dev);
+ if (remove_device) {
+ if (pmu->nr_addr_filters)
+ device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
+ device_del(pmu->dev);
+ put_device(pmu->dev);
+ }
free_pmu_context(pmu);
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index d4129bb05e5d..f9ec9add2164 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -300,7 +300,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
retry:
/* Read the page with vaddr into memory */
- ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
+ ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page,
+ &vma);
if (ret <= 0)
return ret;
@@ -1710,7 +1711,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
* but we treat this as a 'remote' access since it is
* essentially a kernel access to the memory.
*/
- result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
+ result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page,
+ NULL);
if (result < 0)
return result;
diff --git a/kernel/fork.c b/kernel/fork.c
index 6d42242485cb..623259fc794d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -547,7 +547,8 @@ free_tsk:
}
#ifdef CONFIG_MMU
-static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+static __latent_entropy int dup_mmap(struct mm_struct *mm,
+ struct mm_struct *oldmm)
{
struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
struct rb_node **rb_link, *rb_parent;
@@ -1441,7 +1442,8 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
*/
-static struct task_struct *copy_process(unsigned long clone_flags,
+static __latent_entropy struct task_struct *copy_process(
+ unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *child_tidptr,
@@ -1926,6 +1928,7 @@ long _do_fork(unsigned long clone_flags,
p = copy_process(clone_flags, stack_start, stack_size,
child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
+ add_latent_entropy();
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0c5f1a5db654..9c4d30483264 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -721,6 +721,7 @@ int irq_set_parent(int irq, int parent_irq)
irq_put_desc_unlock(desc, flags);
return 0;
}
+EXPORT_SYMBOL_GPL(irq_set_parent);
#endif
/*
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 8d44b3fea9d0..30e6d05aa5a9 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -53,8 +53,15 @@ void notrace __sanitizer_cov_trace_pc(void)
/*
* We are interested in code coverage as a function of a syscall inputs,
* so we ignore code executed in interrupts.
+ * The checks for whether we are in an interrupt are open-coded, because
+ * 1. We can't use in_interrupt() here, since it also returns true
+ * when we are inside local_bh_disable() section.
+ * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()),
+ * since that leads to slower generated code (three separate tests,
+ * one for each of the flags).
*/
- if (!t || in_interrupt())
+ if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET
+ | NMI_MASK)))
return;
mode = READ_ONCE(t->kcov_mode);
if (mode == KCOV_MODE_TRACE) {
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 1e7f5da648d9..6ccb08f57fcb 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -498,9 +498,9 @@ static int enter_state(suspend_state_t state)
#ifndef CONFIG_SUSPEND_SKIP_SYNC
trace_suspend_resume(TPS("sync_filesystems"), 0, true);
- printk(KERN_INFO "PM: Syncing filesystems ... ");
+ pr_info("PM: Syncing filesystems ... ");
sys_sync();
- printk("done.\n");
+ pr_cont("done.\n");
trace_suspend_resume(TPS("sync_filesystems"), 0, false);
#endif
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index d5e397315473..de08fc90baaf 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1769,6 +1769,10 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c
cont_flush();
}
+ /* Skip empty continuation lines that couldn't be added - they just flush */
+ if (!text_len && (lflags & LOG_CONT))
+ return 0;
+
/* If it doesn't end in a newline, try to buffer the current line */
if (!(lflags & LOG_NEWLINE)) {
if (cont_add(facility, level, lflags, text, text_len))
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 2a99027312a6..e6474f7272ec 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -537,7 +537,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst
int this_len, retval;
this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
- retval = access_process_vm(tsk, src, buf, this_len, 0);
+ retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE);
if (!retval) {
if (copied)
break;
@@ -564,7 +564,8 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
if (copy_from_user(buf, src, this_len))
return -EFAULT;
- retval = access_process_vm(tsk, dst, buf, this_len, 1);
+ retval = access_process_vm(tsk, dst, buf, this_len,
+ FOLL_FORCE | FOLL_WRITE);
if (!retval) {
if (copied)
break;
@@ -1127,7 +1128,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
unsigned long tmp;
int copied;
- copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0);
+ copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE);
if (copied != sizeof(tmp))
return -EIO;
return put_user(tmp, (unsigned long __user *)data);
@@ -1138,7 +1139,8 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
{
int copied;
- copied = access_process_vm(tsk, addr, &data, sizeof(data), 1);
+ copied = access_process_vm(tsk, addr, &data, sizeof(data),
+ FOLL_FORCE | FOLL_WRITE);
return (copied == sizeof(data)) ? 0 : -EIO;
}
@@ -1155,7 +1157,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
switch (request) {
case PTRACE_PEEKTEXT:
case PTRACE_PEEKDATA:
- ret = access_process_vm(child, addr, &word, sizeof(word), 0);
+ ret = access_process_vm(child, addr, &word, sizeof(word),
+ FOLL_FORCE);
if (ret != sizeof(word))
ret = -EIO;
else
@@ -1164,7 +1167,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
case PTRACE_POKETEXT:
case PTRACE_POKEDATA:
- ret = access_process_vm(child, addr, &data, sizeof(data), 1);
+ ret = access_process_vm(child, addr, &data, sizeof(data),
+ FOLL_FORCE | FOLL_WRITE);
ret = (ret != sizeof(data) ? -EIO : 0);
break;
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 944b1b491ed8..1898559e6b60 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -170,7 +170,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
false));
}
-static void rcu_process_callbacks(struct softirq_action *unused)
+static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
{
__rcu_process_callbacks(&rcu_sched_ctrlblk);
__rcu_process_callbacks(&rcu_bh_ctrlblk);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 7e2e03879c2e..69a5611a7e7c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3013,7 +3013,7 @@ __rcu_process_callbacks(struct rcu_state *rsp)
/*
* Do RCU core processing for the current CPU.
*/
-static void rcu_process_callbacks(struct softirq_action *unused)
+static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
{
struct rcu_state *rsp;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 94732d1ab00a..42d4027f9e26 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7515,11 +7515,27 @@ static struct kmem_cache *task_group_cache __read_mostly;
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
+#define WAIT_TABLE_BITS 8
+#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
+static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
+
+wait_queue_head_t *bit_waitqueue(void *word, int bit)
+{
+ const int shift = BITS_PER_LONG == 32 ? 5 : 6;
+ unsigned long val = (unsigned long)word << shift | bit;
+
+ return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
+}
+EXPORT_SYMBOL(bit_waitqueue);
+
void __init sched_init(void)
{
int i, j;
unsigned long alloc_size = 0, ptr;
+ for (i = 0; i < WAIT_TABLE_SIZE; i++)
+ init_waitqueue_head(bit_wait_table + i);
+
#ifdef CONFIG_FAIR_GROUP_SCHED
alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 13935886a471..fa178b62ea79 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -415,7 +415,8 @@ static char *task_group_path(struct task_group *tg)
if (autogroup_path(tg, group_path, PATH_MAX))
return group_path;
- return cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+ cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+ return group_path;
}
#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 502e95a6e927..c242944f5cbd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -690,7 +690,14 @@ void init_entity_runnable_average(struct sched_entity *se)
* will definitely be update (after enqueue).
*/
sa->period_contrib = 1023;
- sa->load_avg = scale_load_down(se->load.weight);
+ /*
+ * Tasks are intialized with full load to be seen as heavy tasks until
+ * they get a chance to stabilize to their real load level.
+ * Group entities are intialized with zero load to reflect the fact that
+ * nothing has been attached to the task group yet.
+ */
+ if (entity_is_task(se))
+ sa->load_avg = scale_load_down(se->load.weight);
sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
/*
* At this point, util_avg won't be used in select_task_rq_fair anyway
@@ -5471,13 +5478,18 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
*/
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
{
- struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
- u64 avg_idle = this_rq()->avg_idle;
- u64 avg_cost = this_sd->avg_scan_cost;
+ struct sched_domain *this_sd;
+ u64 avg_cost, avg_idle = this_rq()->avg_idle;
u64 time, cost;
s64 delta;
int cpu, wrap;
+ this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+ if (!this_sd)
+ return -1;
+
+ avg_cost = this_sd->avg_scan_cost;
+
/*
* Due to large variance we need a large fuzz factor; hackbench in
* particularly is sensitive here.
@@ -8522,7 +8534,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
* run_rebalance_domains is triggered when needed from the scheduler tick.
* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
*/
-static void run_rebalance_domains(struct softirq_action *h)
+static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
{
struct rq *this_rq = this_rq();
enum cpu_idle_type idle = this_rq->idle_balance ?
@@ -8827,7 +8839,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
struct sched_entity *se;
struct cfs_rq *cfs_rq;
- struct rq *rq;
int i;
tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8842,8 +8853,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
init_cfs_bandwidth(tg_cfs_bandwidth(tg));
for_each_possible_cpu(i) {
- rq = cpu_rq(i);
-
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
GFP_KERNEL, cpu_to_node(i));
if (!cfs_rq)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 4f7053579fe3..9453efe9b25a 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -480,16 +480,6 @@ void wake_up_bit(void *word, int bit)
}
EXPORT_SYMBOL(wake_up_bit);
-wait_queue_head_t *bit_waitqueue(void *word, int bit)
-{
- const int shift = BITS_PER_LONG == 32 ? 5 : 6;
- const struct zone *zone = page_zone(virt_to_page(word));
- unsigned long val = (unsigned long)word << shift | bit;
-
- return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
-}
-EXPORT_SYMBOL(bit_waitqueue);
-
/*
* Manipulate the atomic_t address to produce a better bit waitqueue table hash
* index (we're keying off bit -1, but that would produce a horrible hash
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 66762645f9e8..744fa611cae0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -58,7 +58,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
const char * const softirq_to_name[NR_SOFTIRQS] = {
- "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
+ "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
"TASKLET", "SCHED", "HRTIMER", "RCU"
};
@@ -496,7 +496,7 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
}
EXPORT_SYMBOL(__tasklet_hi_schedule_first);
-static void tasklet_action(struct softirq_action *a)
+static __latent_entropy void tasklet_action(struct softirq_action *a)
{
struct tasklet_struct *list;
@@ -532,7 +532,7 @@ static void tasklet_action(struct softirq_action *a)
}
}
-static void tasklet_hi_action(struct softirq_action *a)
+static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
{
struct tasklet_struct *list;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index c3aad685bbc0..12dd190634ab 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -542,7 +542,6 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
static int alarm_timer_create(struct k_itimer *new_timer)
{
enum alarmtimer_type type;
- struct alarm_base *base;
if (!alarmtimer_get_rtcdev())
return -ENOTSUPP;
@@ -551,7 +550,6 @@ static int alarm_timer_create(struct k_itimer *new_timer)
return -EPERM;
type = clock2alarm(new_timer->it_clock);
- base = &alarm_bases[type];
alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer);
return 0;
}
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 32bf6f75a8fe..c611c47de884 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -878,7 +878,7 @@ static inline struct timer_base *get_timer_base(u32 tflags)
#ifdef CONFIG_NO_HZ_COMMON
static inline struct timer_base *
-__get_target_base(struct timer_base *base, unsigned tflags)
+get_target_base(struct timer_base *base, unsigned tflags)
{
#ifdef CONFIG_SMP
if ((tflags & TIMER_PINNED) || !base->migration_enabled)
@@ -891,25 +891,27 @@ __get_target_base(struct timer_base *base, unsigned tflags)
static inline void forward_timer_base(struct timer_base *base)
{
+ unsigned long jnow = READ_ONCE(jiffies);
+
/*
* We only forward the base when it's idle and we have a delta between
* base clock and jiffies.
*/
- if (!base->is_idle || (long) (jiffies - base->clk) < 2)
+ if (!base->is_idle || (long) (jnow - base->clk) < 2)
return;
/*
* If the next expiry value is > jiffies, then we fast forward to
* jiffies otherwise we forward to the next expiry value.
*/
- if (time_after(base->next_expiry, jiffies))
- base->clk = jiffies;
+ if (time_after(base->next_expiry, jnow))
+ base->clk = jnow;
else
base->clk = base->next_expiry;
}
#else
static inline struct timer_base *
-__get_target_base(struct timer_base *base, unsigned tflags)
+get_target_base(struct timer_base *base, unsigned tflags)
{
return get_timer_this_cpu_base(tflags);
}
@@ -917,14 +919,6 @@ __get_target_base(struct timer_base *base, unsigned tflags)
static inline void forward_timer_base(struct timer_base *base) { }
#endif
-static inline struct timer_base *
-get_target_base(struct timer_base *base, unsigned tflags)
-{
- struct timer_base *target = __get_target_base(base, tflags);
-
- forward_timer_base(target);
- return target;
-}
/*
* We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
@@ -943,7 +937,14 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
{
for (;;) {
struct timer_base *base;
- u32 tf = timer->flags;
+ u32 tf;
+
+ /*
+ * We need to use READ_ONCE() here, otherwise the compiler
+ * might re-read @tf between the check for TIMER_MIGRATING
+ * and spin_lock().
+ */
+ tf = READ_ONCE(timer->flags);
if (!(tf & TIMER_MIGRATING)) {
base = get_timer_base(tf);
@@ -964,6 +965,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
unsigned long clk = 0, flags;
int ret = 0;
+ BUG_ON(!timer->function);
+
/*
* This is a common optimization triggered by the networking code - if
* the timer is re-modified to have the same timeout or ends up in the
@@ -972,13 +975,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
if (timer_pending(timer)) {
if (timer->expires == expires)
return 1;
+
/*
- * Take the current timer_jiffies of base, but without holding
- * the lock!
+ * We lock timer base and calculate the bucket index right
+ * here. If the timer ends up in the same bucket, then we
+ * just update the expiry time and avoid the whole
+ * dequeue/enqueue dance.
*/
- base = get_timer_base(timer->flags);
- clk = base->clk;
+ base = lock_timer_base(timer, &flags);
+ clk = base->clk;
idx = calc_wheel_index(expires, clk);
/*
@@ -988,14 +994,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
*/
if (idx == timer_get_idx(timer)) {
timer->expires = expires;
- return 1;
+ ret = 1;
+ goto out_unlock;
}
+ } else {
+ base = lock_timer_base(timer, &flags);
}
timer_stats_timer_set_start_info(timer);
- BUG_ON(!timer->function);
-
- base = lock_timer_base(timer, &flags);
ret = detach_if_pending(timer, base, false);
if (!ret && pending_only)
@@ -1025,12 +1031,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
}
}
+ /* Try to forward a stale timer base clock */
+ forward_timer_base(base);
+
timer->expires = expires;
/*
* If 'idx' was calculated above and the base time did not advance
- * between calculating 'idx' and taking the lock, only enqueue_timer()
- * and trigger_dyntick_cpu() is required. Otherwise we need to
- * (re)calculate the wheel index via internal_add_timer().
+ * between calculating 'idx' and possibly switching the base, only
+ * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise
+ * we need to (re)calculate the wheel index via
+ * internal_add_timer().
*/
if (idx != UINT_MAX && clk == base->clk) {
enqueue_timer(base, timer, idx);
@@ -1510,12 +1520,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
base->next_expiry = nextevt;
/*
- * We have a fresh next event. Check whether we can forward the base:
+ * We have a fresh next event. Check whether we can forward the
+ * base. We can only do that when @basej is past base->clk
+ * otherwise we might rewind base->clk.
*/
- if (time_after(nextevt, jiffies))
- base->clk = jiffies;
- else if (time_after(nextevt, base->clk))
- base->clk = nextevt;
+ if (time_after(basej, base->clk)) {
+ if (time_after(nextevt, basej))
+ base->clk = basej;
+ else if (time_after(nextevt, base->clk))
+ base->clk = nextevt;
+ }
if (time_before_eq(nextevt, basej)) {
expires = basem;
@@ -1633,7 +1647,7 @@ static inline void __run_timers(struct timer_base *base)
/*
* This function runs timers and the timer-tq in bottom half context.
*/
-static void run_timer_softirq(struct softirq_action *h)
+static __latent_entropy void run_timer_softirq(struct softirq_action *h)
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);