diff options
author | David S. Miller <davem@davemloft.net> | 2016-10-30 12:42:58 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2016-10-30 12:42:58 -0400 |
commit | 27058af401e49d88a905df000dd26f443fcfa8ce (patch) | |
tree | 819f32113d3b8374b9fbf72e2202d4c4d4511a60 /kernel | |
parent | 357f4aae859b5d74554b0ccbb18556f1df4166c3 (diff) | |
parent | 2a26d99b251b8625d27aed14e97fc10707a3a81f (diff) | |
download | linux-27058af401e49d88a905df000dd26f443fcfa8ce.tar.bz2 |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
Mostly simple overlapping changes.
For example, David Ahern's adjacency list revamp in 'net-next'
conflicted with an adjacency list traversal bug fix in 'net'.
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cgroup.c | 75 | ||||
-rw-r--r-- | kernel/cpu.c | 2 | ||||
-rw-r--r-- | kernel/cpuset.c | 13 | ||||
-rw-r--r-- | kernel/events/core.c | 23 | ||||
-rw-r--r-- | kernel/events/uprobes.c | 6 | ||||
-rw-r--r-- | kernel/fork.c | 7 | ||||
-rw-r--r-- | kernel/irq/manage.c | 1 | ||||
-rw-r--r-- | kernel/kcov.c | 9 | ||||
-rw-r--r-- | kernel/power/suspend.c | 4 | ||||
-rw-r--r-- | kernel/printk/printk.c | 4 | ||||
-rw-r--r-- | kernel/ptrace.c | 16 | ||||
-rw-r--r-- | kernel/rcu/tiny.c | 2 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 2 | ||||
-rw-r--r-- | kernel/sched/core.c | 16 | ||||
-rw-r--r-- | kernel/sched/debug.c | 3 | ||||
-rw-r--r-- | kernel/sched/fair.c | 25 | ||||
-rw-r--r-- | kernel/sched/wait.c | 10 | ||||
-rw-r--r-- | kernel/softirq.c | 6 | ||||
-rw-r--r-- | kernel/time/alarmtimer.c | 2 | ||||
-rw-r--r-- | kernel/time/timer.c | 76 |
20 files changed, 192 insertions, 110 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 44066158f0d1..85bc9beb046d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -64,6 +64,9 @@ #include <linux/file.h> #include <net/sock.h> +#define CREATE_TRACE_POINTS +#include <trace/events/cgroup.h> + /* * pidlists linger the following amount before being destroyed. The goal * is avoiding frequent destruction in the middle of consecutive read calls @@ -1176,6 +1179,8 @@ static void cgroup_destroy_root(struct cgroup_root *root) struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; + trace_cgroup_destroy_root(root); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); BUG_ON(atomic_read(&root->nr_cgrps)); @@ -1874,6 +1879,9 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) strcpy(root->release_agent_path, opts.release_agent); spin_unlock(&release_agent_path_lock); } + + trace_cgroup_remount(root); + out_unlock: kfree(opts.release_agent); kfree(opts.name); @@ -2031,6 +2039,8 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto destroy_root; + trace_cgroup_setup_root(root); + /* * There must be no failure case after here, since rebinding takes * care of subsystems' refcounts, which are explicitly dropped in @@ -2315,22 +2325,18 @@ static struct file_system_type cgroup2_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; -static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns) +static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) { struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); - int ret; - ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); - if (ret < 0 || ret >= buflen) - return NULL; - return buf; + return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); } -char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns) +int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) { - char *ret; + int ret; mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); @@ -2357,12 +2363,12 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); * * Return value is the same as kernfs_path(). */ -char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) +int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) { struct cgroup_root *root; struct cgroup *cgrp; int hierarchy_id = 1; - char *path = NULL; + int ret; mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); @@ -2371,16 +2377,15 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) if (root) { cgrp = task_cgroup_from_root(task, root); - path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); + ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); } else { /* if no hierarchy exists, everyone is in "/" */ - if (strlcpy(buf, "/", buflen) < buflen) - path = buf; + ret = strlcpy(buf, "/", buflen); } spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); - return path; + return ret; } EXPORT_SYMBOL_GPL(task_cgroup_path); @@ -2830,6 +2835,10 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); cgroup_migrate_finish(&preloaded_csets); + + if (!ret) + trace_cgroup_attach_task(dst_cgrp, leader, threadgroup); + return ret; } @@ -3611,6 +3620,8 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, mutex_lock(&cgroup_mutex); ret = kernfs_rename(kn, new_parent, new_name_str); + if (!ret) + trace_cgroup_rename(cgrp); mutex_unlock(&cgroup_mutex); @@ -4381,6 +4392,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) if (task) { ret = cgroup_migrate(task, false, to->root); + if (!ret) + trace_cgroup_transfer_tasks(to, task, false); put_task_struct(task); } } while (task && !ret); @@ -5046,6 +5059,8 @@ static void css_release_work_fn(struct work_struct *work) ss->css_released(css); } else { /* cgroup release path */ + trace_cgroup_release(cgrp); + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -5332,6 +5347,8 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; + trace_cgroup_mkdir(cgrp); + /* let's create and online css's */ kernfs_activate(kn); @@ -5507,6 +5524,9 @@ static int cgroup_rmdir(struct kernfs_node *kn) ret = cgroup_destroy_locked(cgrp); + if (!ret) + trace_cgroup_rmdir(cgrp); + cgroup_kn_unlock(kn); return ret; } @@ -5743,7 +5763,7 @@ core_initcall(cgroup_wq_init); int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk) { - char *buf, *path; + char *buf; int retval; struct cgroup_root *root; @@ -5786,18 +5806,18 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, * " (deleted)" is appended to the cgroup path. */ if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { - path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, + retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, current->nsproxy->cgroup_ns); - if (!path) { + if (retval >= PATH_MAX) retval = -ENAMETOOLONG; + if (retval < 0) goto out_unlock; - } + + seq_puts(m, buf); } else { - path = "/"; + seq_puts(m, "/"); } - seq_puts(m, path); - if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) seq_puts(m, " (deleted)\n"); else @@ -6062,8 +6082,9 @@ static void cgroup_release_agent(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, release_agent_work); - char *pathbuf = NULL, *agentbuf = NULL, *path; + char *pathbuf = NULL, *agentbuf = NULL; char *argv[3], *envp[3]; + int ret; mutex_lock(&cgroup_mutex); @@ -6073,13 +6094,13 @@ static void cgroup_release_agent(struct work_struct *work) goto out; spin_lock_irq(&css_set_lock); - path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); + ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); spin_unlock_irq(&css_set_lock); - if (!path) + if (ret < 0 || ret >= PATH_MAX) goto out; argv[0] = agentbuf; - argv[1] = path; + argv[1] = pathbuf; argv[2] = NULL; /* minimal command environment */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 5df20d6d1520..29de1a9352c0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -228,7 +228,7 @@ static struct { .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), #ifdef CONFIG_DEBUG_LOCK_ALLOC - .dep_map = {.name = "cpu_hotplug.lock" }, + .dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map), #endif }; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 2b4c20ab5bbe..29f815d2ef7e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2715,7 +2715,7 @@ void __cpuset_memory_pressure_bump(void) int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk) { - char *buf, *p; + char *buf; struct cgroup_subsys_state *css; int retval; @@ -2724,14 +2724,15 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, if (!buf) goto out; - retval = -ENAMETOOLONG; css = task_get_css(tsk, cpuset_cgrp_id); - p = cgroup_path_ns(css->cgroup, buf, PATH_MAX, - current->nsproxy->cgroup_ns); + retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); css_put(css); - if (!p) + if (retval >= PATH_MAX) + retval = -ENAMETOOLONG; + if (retval < 0) goto out_free; - seq_puts(m, p); + seq_puts(m, buf); seq_putc(m, '\n'); retval = 0; out_free: diff --git a/kernel/events/core.c b/kernel/events/core.c index c6e47e97b33f..0e292132efac 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1960,6 +1960,12 @@ void perf_event_disable(struct perf_event *event) } EXPORT_SYMBOL_GPL(perf_event_disable); +void perf_event_disable_inatomic(struct perf_event *event) +{ + event->pending_disable = 1; + irq_work_queue(&event->pending); +} + static void perf_set_shadow_time(struct perf_event *event, struct perf_event_context *ctx, u64 tstamp) @@ -7075,8 +7081,8 @@ static int __perf_event_overflow(struct perf_event *event, if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; - event->pending_disable = 1; - irq_work_queue(&event->pending); + + perf_event_disable_inatomic(event); } READ_ONCE(event->overflow_handler)(event, data, regs); @@ -8855,7 +8861,10 @@ EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { + int remove_device; + mutex_lock(&pmus_lock); + remove_device = pmu_bus_running; list_del_rcu(&pmu->entry); mutex_unlock(&pmus_lock); @@ -8869,10 +8878,12 @@ void perf_pmu_unregister(struct pmu *pmu) free_percpu(pmu->pmu_disable_count); if (pmu->type >= PERF_TYPE_MAX) idr_remove(&pmu_idr, pmu->type); - if (pmu->nr_addr_filters) - device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); - device_del(pmu->dev); - put_device(pmu->dev); + if (remove_device) { + if (pmu->nr_addr_filters) + device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); + device_del(pmu->dev); + put_device(pmu->dev); + } free_pmu_context(pmu); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d4129bb05e5d..f9ec9add2164 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -300,7 +300,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); + ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, + &vma); if (ret <= 0) return ret; @@ -1710,7 +1711,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * but we treat this as a 'remote' access since it is * essentially a kernel access to the memory. */ - result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL); + result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, + NULL); if (result < 0) return result; diff --git a/kernel/fork.c b/kernel/fork.c index 6d42242485cb..623259fc794d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -547,7 +547,8 @@ free_tsk: } #ifdef CONFIG_MMU -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +static __latent_entropy int dup_mmap(struct mm_struct *mm, + struct mm_struct *oldmm) { struct vm_area_struct *mpnt, *tmp, *prev, **pprev; struct rb_node **rb_link, *rb_parent; @@ -1441,7 +1442,8 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. */ -static struct task_struct *copy_process(unsigned long clone_flags, +static __latent_entropy struct task_struct *copy_process( + unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, int __user *child_tidptr, @@ -1926,6 +1928,7 @@ long _do_fork(unsigned long clone_flags, p = copy_process(clone_flags, stack_start, stack_size, child_tidptr, NULL, trace, tls, NUMA_NO_NODE); + add_latent_entropy(); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0c5f1a5db654..9c4d30483264 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -721,6 +721,7 @@ int irq_set_parent(int irq, int parent_irq) irq_put_desc_unlock(desc, flags); return 0; } +EXPORT_SYMBOL_GPL(irq_set_parent); #endif /* diff --git a/kernel/kcov.c b/kernel/kcov.c index 8d44b3fea9d0..30e6d05aa5a9 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -53,8 +53,15 @@ void notrace __sanitizer_cov_trace_pc(void) /* * We are interested in code coverage as a function of a syscall inputs, * so we ignore code executed in interrupts. + * The checks for whether we are in an interrupt are open-coded, because + * 1. We can't use in_interrupt() here, since it also returns true + * when we are inside local_bh_disable() section. + * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()), + * since that leads to slower generated code (three separate tests, + * one for each of the flags). */ - if (!t || in_interrupt()) + if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET + | NMI_MASK))) return; mode = READ_ONCE(t->kcov_mode); if (mode == KCOV_MODE_TRACE) { diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 1e7f5da648d9..6ccb08f57fcb 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -498,9 +498,9 @@ static int enter_state(suspend_state_t state) #ifndef CONFIG_SUSPEND_SKIP_SYNC trace_suspend_resume(TPS("sync_filesystems"), 0, true); - printk(KERN_INFO "PM: Syncing filesystems ... "); + pr_info("PM: Syncing filesystems ... "); sys_sync(); - printk("done.\n"); + pr_cont("done.\n"); trace_suspend_resume(TPS("sync_filesystems"), 0, false); #endif diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index d5e397315473..de08fc90baaf 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1769,6 +1769,10 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c cont_flush(); } + /* Skip empty continuation lines that couldn't be added - they just flush */ + if (!text_len && (lflags & LOG_CONT)) + return 0; + /* If it doesn't end in a newline, try to buffer the current line */ if (!(lflags & LOG_NEWLINE)) { if (cont_add(facility, level, lflags, text, text_len)) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2a99027312a6..e6474f7272ec 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -537,7 +537,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst int this_len, retval; this_len = (len > sizeof(buf)) ? sizeof(buf) : len; - retval = access_process_vm(tsk, src, buf, this_len, 0); + retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE); if (!retval) { if (copied) break; @@ -564,7 +564,8 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds this_len = (len > sizeof(buf)) ? sizeof(buf) : len; if (copy_from_user(buf, src, this_len)) return -EFAULT; - retval = access_process_vm(tsk, dst, buf, this_len, 1); + retval = access_process_vm(tsk, dst, buf, this_len, + FOLL_FORCE | FOLL_WRITE); if (!retval) { if (copied) break; @@ -1127,7 +1128,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, unsigned long tmp; int copied; - copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0); + copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) return -EIO; return put_user(tmp, (unsigned long __user *)data); @@ -1138,7 +1139,8 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, { int copied; - copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); + copied = access_process_vm(tsk, addr, &data, sizeof(data), + FOLL_FORCE | FOLL_WRITE); return (copied == sizeof(data)) ? 0 : -EIO; } @@ -1155,7 +1157,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, switch (request) { case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: - ret = access_process_vm(child, addr, &word, sizeof(word), 0); + ret = access_process_vm(child, addr, &word, sizeof(word), + FOLL_FORCE); if (ret != sizeof(word)) ret = -EIO; else @@ -1164,7 +1167,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, case PTRACE_POKETEXT: case PTRACE_POKEDATA: - ret = access_process_vm(child, addr, &data, sizeof(data), 1); + ret = access_process_vm(child, addr, &data, sizeof(data), + FOLL_FORCE | FOLL_WRITE); ret = (ret != sizeof(data) ? -EIO : 0); break; diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 944b1b491ed8..1898559e6b60 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -170,7 +170,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) false)); } -static void rcu_process_callbacks(struct softirq_action *unused) +static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) { __rcu_process_callbacks(&rcu_sched_ctrlblk); __rcu_process_callbacks(&rcu_bh_ctrlblk); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7e2e03879c2e..69a5611a7e7c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3013,7 +3013,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) /* * Do RCU core processing for the current CPU. */ -static void rcu_process_callbacks(struct softirq_action *unused) +static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) { struct rcu_state *rsp; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 94732d1ab00a..42d4027f9e26 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7515,11 +7515,27 @@ static struct kmem_cache *task_group_cache __read_mostly; DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); +#define WAIT_TABLE_BITS 8 +#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) +static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; + +wait_queue_head_t *bit_waitqueue(void *word, int bit) +{ + const int shift = BITS_PER_LONG == 32 ? 5 : 6; + unsigned long val = (unsigned long)word << shift | bit; + + return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); +} +EXPORT_SYMBOL(bit_waitqueue); + void __init sched_init(void) { int i, j; unsigned long alloc_size = 0, ptr; + for (i = 0; i < WAIT_TABLE_SIZE; i++) + init_waitqueue_head(bit_wait_table + i); + #ifdef CONFIG_FAIR_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 13935886a471..fa178b62ea79 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -415,7 +415,8 @@ static char *task_group_path(struct task_group *tg) if (autogroup_path(tg, group_path, PATH_MAX)) return group_path; - return cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + return group_path; } #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 502e95a6e927..c242944f5cbd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -690,7 +690,14 @@ void init_entity_runnable_average(struct sched_entity *se) * will definitely be update (after enqueue). */ sa->period_contrib = 1023; - sa->load_avg = scale_load_down(se->load.weight); + /* + * Tasks are intialized with full load to be seen as heavy tasks until + * they get a chance to stabilize to their real load level. + * Group entities are intialized with zero load to reflect the fact that + * nothing has been attached to the task group yet. + */ + if (entity_is_task(se)) + sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; /* * At this point, util_avg won't be used in select_task_rq_fair anyway @@ -5471,13 +5478,18 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd */ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { - struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); - u64 avg_idle = this_rq()->avg_idle; - u64 avg_cost = this_sd->avg_scan_cost; + struct sched_domain *this_sd; + u64 avg_cost, avg_idle = this_rq()->avg_idle; u64 time, cost; s64 delta; int cpu, wrap; + this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); + if (!this_sd) + return -1; + + avg_cost = this_sd->avg_scan_cost; + /* * Due to large variance we need a large fuzz factor; hackbench in * particularly is sensitive here. @@ -8522,7 +8534,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } * run_rebalance_domains is triggered when needed from the scheduler tick. * Also triggered for nohz idle balancing (with nohz_balancing_kick set). */ -static void run_rebalance_domains(struct softirq_action *h) +static __latent_entropy void run_rebalance_domains(struct softirq_action *h) { struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance ? @@ -8827,7 +8839,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct sched_entity *se; struct cfs_rq *cfs_rq; - struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8842,8 +8853,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(i) { - rq = cpu_rq(i); - cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 4f7053579fe3..9453efe9b25a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -480,16 +480,6 @@ void wake_up_bit(void *word, int bit) } EXPORT_SYMBOL(wake_up_bit); -wait_queue_head_t *bit_waitqueue(void *word, int bit) -{ - const int shift = BITS_PER_LONG == 32 ? 5 : 6; - const struct zone *zone = page_zone(virt_to_page(word)); - unsigned long val = (unsigned long)word << shift | bit; - - return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; -} -EXPORT_SYMBOL(bit_waitqueue); - /* * Manipulate the atomic_t address to produce a better bit waitqueue table hash * index (we're keying off bit -1, but that would produce a horrible hash diff --git a/kernel/softirq.c b/kernel/softirq.c index 66762645f9e8..744fa611cae0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -58,7 +58,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp DEFINE_PER_CPU(struct task_struct *, ksoftirqd); const char * const softirq_to_name[NR_SOFTIRQS] = { - "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", + "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL", "TASKLET", "SCHED", "HRTIMER", "RCU" }; @@ -496,7 +496,7 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t) } EXPORT_SYMBOL(__tasklet_hi_schedule_first); -static void tasklet_action(struct softirq_action *a) +static __latent_entropy void tasklet_action(struct softirq_action *a) { struct tasklet_struct *list; @@ -532,7 +532,7 @@ static void tasklet_action(struct softirq_action *a) } } -static void tasklet_hi_action(struct softirq_action *a) +static __latent_entropy void tasklet_hi_action(struct softirq_action *a) { struct tasklet_struct *list; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c3aad685bbc0..12dd190634ab 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -542,7 +542,6 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) static int alarm_timer_create(struct k_itimer *new_timer) { enum alarmtimer_type type; - struct alarm_base *base; if (!alarmtimer_get_rtcdev()) return -ENOTSUPP; @@ -551,7 +550,6 @@ static int alarm_timer_create(struct k_itimer *new_timer) return -EPERM; type = clock2alarm(new_timer->it_clock); - base = &alarm_bases[type]; alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); return 0; } diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 32bf6f75a8fe..c611c47de884 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -878,7 +878,7 @@ static inline struct timer_base *get_timer_base(u32 tflags) #ifdef CONFIG_NO_HZ_COMMON static inline struct timer_base * -__get_target_base(struct timer_base *base, unsigned tflags) +get_target_base(struct timer_base *base, unsigned tflags) { #ifdef CONFIG_SMP if ((tflags & TIMER_PINNED) || !base->migration_enabled) @@ -891,25 +891,27 @@ __get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { + unsigned long jnow = READ_ONCE(jiffies); + /* * We only forward the base when it's idle and we have a delta between * base clock and jiffies. */ - if (!base->is_idle || (long) (jiffies - base->clk) < 2) + if (!base->is_idle || (long) (jnow - base->clk) < 2) return; /* * If the next expiry value is > jiffies, then we fast forward to * jiffies otherwise we forward to the next expiry value. */ - if (time_after(base->next_expiry, jiffies)) - base->clk = jiffies; + if (time_after(base->next_expiry, jnow)) + base->clk = jnow; else base->clk = base->next_expiry; } #else static inline struct timer_base * -__get_target_base(struct timer_base *base, unsigned tflags) +get_target_base(struct timer_base *base, unsigned tflags) { return get_timer_this_cpu_base(tflags); } @@ -917,14 +919,6 @@ __get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { } #endif -static inline struct timer_base * -get_target_base(struct timer_base *base, unsigned tflags) -{ - struct timer_base *target = __get_target_base(base, tflags); - - forward_timer_base(target); - return target; -} /* * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means @@ -943,7 +937,14 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, { for (;;) { struct timer_base *base; - u32 tf = timer->flags; + u32 tf; + + /* + * We need to use READ_ONCE() here, otherwise the compiler + * might re-read @tf between the check for TIMER_MIGRATING + * and spin_lock(). + */ + tf = READ_ONCE(timer->flags); if (!(tf & TIMER_MIGRATING)) { base = get_timer_base(tf); @@ -964,6 +965,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) unsigned long clk = 0, flags; int ret = 0; + BUG_ON(!timer->function); + /* * This is a common optimization triggered by the networking code - if * the timer is re-modified to have the same timeout or ends up in the @@ -972,13 +975,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) if (timer_pending(timer)) { if (timer->expires == expires) return 1; + /* - * Take the current timer_jiffies of base, but without holding - * the lock! + * We lock timer base and calculate the bucket index right + * here. If the timer ends up in the same bucket, then we + * just update the expiry time and avoid the whole + * dequeue/enqueue dance. */ - base = get_timer_base(timer->flags); - clk = base->clk; + base = lock_timer_base(timer, &flags); + clk = base->clk; idx = calc_wheel_index(expires, clk); /* @@ -988,14 +994,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) */ if (idx == timer_get_idx(timer)) { timer->expires = expires; - return 1; + ret = 1; + goto out_unlock; } + } else { + base = lock_timer_base(timer, &flags); } timer_stats_timer_set_start_info(timer); - BUG_ON(!timer->function); - - base = lock_timer_base(timer, &flags); ret = detach_if_pending(timer, base, false); if (!ret && pending_only) @@ -1025,12 +1031,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) } } + /* Try to forward a stale timer base clock */ + forward_timer_base(base); + timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance - * between calculating 'idx' and taking the lock, only enqueue_timer() - * and trigger_dyntick_cpu() is required. Otherwise we need to - * (re)calculate the wheel index via internal_add_timer(). + * between calculating 'idx' and possibly switching the base, only + * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise + * we need to (re)calculate the wheel index via + * internal_add_timer(). */ if (idx != UINT_MAX && clk == base->clk) { enqueue_timer(base, timer, idx); @@ -1510,12 +1520,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); base->next_expiry = nextevt; /* - * We have a fresh next event. Check whether we can forward the base: + * We have a fresh next event. Check whether we can forward the + * base. We can only do that when @basej is past base->clk + * otherwise we might rewind base->clk. */ - if (time_after(nextevt, jiffies)) - base->clk = jiffies; - else if (time_after(nextevt, base->clk)) - base->clk = nextevt; + if (time_after(basej, base->clk)) { + if (time_after(nextevt, basej)) + base->clk = basej; + else if (time_after(nextevt, base->clk)) + base->clk = nextevt; + } if (time_before_eq(nextevt, basej)) { expires = basem; @@ -1633,7 +1647,7 @@ static inline void __run_timers(struct timer_base *base) /* * This function runs timers and the timer-tq in bottom half context. */ -static void run_timer_softirq(struct softirq_action *h) +static __latent_entropy void run_timer_softirq(struct softirq_action *h) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); |