summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2013-12-17 15:22:35 +0100
committerIngo Molnar <mingo@kernel.org>2013-12-17 15:22:35 +0100
commitffe732c2430c55074bebb172d33d909c662cd0e3 (patch)
treed39087d7b5d8caa505b23b6ea2abc2e33efa5be5 /kernel
parent40ea2b42d7c44386cf81d5636d574193da2c8df2 (diff)
parent757dfcaa41844595964f1220f1d33182dae49976 (diff)
downloadlinux-ffe732c2430c55074bebb172d33d909c662cd0e3.tar.bz2
Merge branch 'sched/urgent' into sched/core
Merge the latest batch of fixes before applying development patches. Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup.c35
-rw-r--r--kernel/cpuset.c8
-rw-r--r--kernel/events/core.c8
-rw-r--r--kernel/extable.c4
-rw-r--r--kernel/irq/pm.c2
-rw-r--r--kernel/rcu/tree_plugin.h4
-rw-r--r--kernel/sched/core.c10
-rw-r--r--kernel/sched/fair.c171
-rw-r--r--kernel/sched/rt.c14
-rw-r--r--kernel/time/tick-common.c15
-rw-r--r--kernel/time/tick-sched.c25
-rw-r--r--kernel/time/timekeeping.c2
-rw-r--r--kernel/timer.c5
-rw-r--r--kernel/trace/ftrace.c64
-rw-r--r--kernel/trace/trace_event_perf.c8
-rw-r--r--kernel/trace/trace_events.c3
-rw-r--r--kernel/trace/trace_syscalls.c10
-rw-r--r--kernel/workqueue.c50
18 files changed, 262 insertions, 176 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4c62513fe19f..8b729c278b64 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -90,6 +90,14 @@ static DEFINE_MUTEX(cgroup_mutex);
static DEFINE_MUTEX(cgroup_root_mutex);
/*
+ * cgroup destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions. Use a separate workqueue so that cgroup
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_destroy_wq;
+
+/*
* Generate an array of cgroup subsystem pointers. At boot time, this is
* populated with the built in subsystems, and modular subsystems are
* registered after that. The mutable section of this array is protected by
@@ -191,6 +199,7 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
+static int cgroup_file_release(struct inode *inode, struct file *file);
/**
* cgroup_css - obtain a cgroup's css for the specified subsystem
@@ -871,7 +880,7 @@ static void cgroup_free_rcu(struct rcu_head *head)
struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
- schedule_work(&cgrp->destroy_work);
+ queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
}
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -2421,7 +2430,7 @@ static const struct file_operations cgroup_seqfile_operations = {
.read = seq_read,
.write = cgroup_file_write,
.llseek = seq_lseek,
- .release = single_release,
+ .release = cgroup_file_release,
};
static int cgroup_file_open(struct inode *inode, struct file *file)
@@ -2482,6 +2491,8 @@ static int cgroup_file_release(struct inode *inode, struct file *file)
ret = cft->release(inode, file);
if (css->ss)
css_put(css);
+ if (file->f_op == &cgroup_seqfile_operations)
+ single_release(inode, file);
return ret;
}
@@ -4249,7 +4260,7 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
* css_put(). dput() requires process context which we don't have.
*/
INIT_WORK(&css->destroy_work, css_free_work_fn);
- schedule_work(&css->destroy_work);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
}
static void css_release(struct percpu_ref *ref)
@@ -4539,7 +4550,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
container_of(ref, struct cgroup_subsys_state, refcnt);
INIT_WORK(&css->destroy_work, css_killed_work_fn);
- schedule_work(&css->destroy_work);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
}
/**
@@ -5063,6 +5074,22 @@ out:
return err;
}
+static int __init cgroup_wq_init(void)
+{
+ /*
+ * There isn't much point in executing destruction path in
+ * parallel. Good chunk is serialized with cgroup_mutex anyway.
+ * Use 1 for @max_active.
+ *
+ * We would prefer to do this in cgroup_init() above, but that
+ * is called before init_workqueues(): so leave this until after.
+ */
+ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
+ BUG_ON(!cgroup_destroy_wq);
+ return 0;
+}
+core_initcall(cgroup_wq_init);
+
/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6bf981e13c43..4772034b4b17 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1033,8 +1033,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
need_loop = task_has_mempolicy(tsk) ||
!nodes_intersects(*newmems, tsk->mems_allowed);
- if (need_loop)
+ if (need_loop) {
+ local_irq_disable();
write_seqcount_begin(&tsk->mems_allowed_seq);
+ }
nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
@@ -1042,8 +1044,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
tsk->mems_allowed = *newmems;
- if (need_loop)
+ if (need_loop) {
write_seqcount_end(&tsk->mems_allowed_seq);
+ local_irq_enable();
+ }
task_unlock(tsk);
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d724e7757cd1..72348dc192c1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5680,11 +5680,6 @@ static void swevent_hlist_put(struct perf_event *event)
{
int cpu;
- if (event->cpu != -1) {
- swevent_hlist_put_cpu(event, event->cpu);
- return;
- }
-
for_each_possible_cpu(cpu)
swevent_hlist_put_cpu(event, cpu);
}
@@ -5718,9 +5713,6 @@ static int swevent_hlist_get(struct perf_event *event)
int err;
int cpu, failed_cpu;
- if (event->cpu != -1)
- return swevent_hlist_get_cpu(event, event->cpu);
-
get_online_cpus();
for_each_possible_cpu(cpu) {
err = swevent_hlist_get_cpu(event, cpu);
diff --git a/kernel/extable.c b/kernel/extable.c
index 832cb28105bb..763faf037ec1 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -61,7 +61,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
static inline int init_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_sinittext &&
- addr <= (unsigned long)_einittext)
+ addr < (unsigned long)_einittext)
return 1;
return 0;
}
@@ -69,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr)
int core_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_stext &&
- addr <= (unsigned long)_etext)
+ addr < (unsigned long)_etext)
return 1;
if (system_state == SYSTEM_BOOTING &&
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index cb228bf21760..abcd6ca86cb7 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -50,7 +50,7 @@ static void resume_irqs(bool want_early)
bool is_early = desc->action &&
desc->action->flags & IRQF_EARLY_RESUME;
- if (is_early != want_early)
+ if (!is_early && want_early)
continue;
raw_spin_lock_irqsave(&desc->lock, flags);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 6abb03dff5c0..08a765232432 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1632,7 +1632,7 @@ module_param(rcu_idle_gp_delay, int, 0644);
static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
module_param(rcu_idle_lazy_gp_delay, int, 0644);
-extern int tick_nohz_enabled;
+extern int tick_nohz_active;
/*
* Try to advance callbacks for all flavors of RCU on the current CPU, but
@@ -1729,7 +1729,7 @@ static void rcu_prepare_for_idle(int cpu)
int tne;
/* Handle nohz enablement switches conservatively. */
- tne = ACCESS_ONCE(tick_nohz_enabled);
+ tne = ACCESS_ONCE(tick_nohz_active);
if (tne != rdtp->tick_nohz_enabled_snap) {
if (rcu_cpu_has_callbacks(cpu, NULL))
invoke_rcu_core(); /* force nohz to see update. */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 25b377986547..b21a63ed5d62 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2663,6 +2663,7 @@ asmlinkage void __sched notrace preempt_schedule(void)
} while (need_resched());
}
EXPORT_SYMBOL(preempt_schedule);
+#endif /* CONFIG_PREEMPT */
/*
* this is the entry point to schedule() from kernel preemption
@@ -2696,8 +2697,6 @@ asmlinkage void __sched preempt_schedule_irq(void)
exception_exit(prev_state);
}
-#endif /* CONFIG_PREEMPT */
-
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
void *key)
{
@@ -4765,7 +4764,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
cpumask_clear_cpu(rq->cpu, old_rd->span);
/*
- * If we dont want to free the old_rt yet then
+ * If we dont want to free the old_rd yet then
* set old_rd to NULL to skip the freeing later
* in this function:
*/
@@ -4906,6 +4905,7 @@ DEFINE_PER_CPU(struct sched_domain *, sd_asym);
static void update_top_cache_domain(int cpu)
{
struct sched_domain *sd;
+ struct sched_domain *busy_sd = NULL;
int id = cpu;
int size = 1;
@@ -4913,8 +4913,9 @@ static void update_top_cache_domain(int cpu)
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
- rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
+ busy_sd = sd->parent; /* sd_busy */
}
+ rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
@@ -5115,6 +5116,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
* die on a /0 trap.
*/
sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
+ sg->sgp->power_orig = sg->sgp->power;
/*
* Make sure the first group of this domain contains the
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 49aa01f9d4fd..a9185f7c9446 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -178,59 +178,61 @@ void sched_init_granularity(void)
update_sysctl();
}
-#if BITS_PER_LONG == 32
-# define WMULT_CONST (~0UL)
-#else
-# define WMULT_CONST (1UL << 32)
-#endif
-
+#define WMULT_CONST (~0U)
#define WMULT_SHIFT 32
-/*
- * Shift right and round:
- */
-#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+static void __update_inv_weight(struct load_weight *lw)
+{
+ unsigned long w;
+
+ if (likely(lw->inv_weight))
+ return;
+
+ w = scale_load_down(lw->weight);
+
+ if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+ lw->inv_weight = 1;
+ else if (unlikely(!w))
+ lw->inv_weight = WMULT_CONST;
+ else
+ lw->inv_weight = WMULT_CONST / w;
+}
/*
- * delta *= weight / lw
+ * delta_exec * weight / lw.weight
+ * OR
+ * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
+ *
+ * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
+ * we're guaranteed shift stays positive because inv_weight is guaranteed to
+ * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
+ *
+ * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
+ * weight/lw.weight <= 1, and therefore our shift will also be positive.
*/
-static unsigned long
-calc_delta_mine(unsigned long delta_exec, unsigned long weight,
- struct load_weight *lw)
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
- u64 tmp;
+ u64 fact = scale_load_down(weight);
+ int shift = WMULT_SHIFT;
- /*
- * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
- * entities since MIN_SHARES = 2. Treat weight as 1 if less than
- * 2^SCHED_LOAD_RESOLUTION.
- */
- if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
- tmp = (u64)delta_exec * scale_load_down(weight);
- else
- tmp = (u64)delta_exec;
-
- if (!lw->inv_weight) {
- unsigned long w = scale_load_down(lw->weight);
+ __update_inv_weight(lw);
- if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
- lw->inv_weight = 1;
- else if (unlikely(!w))
- lw->inv_weight = WMULT_CONST;
- else
- lw->inv_weight = WMULT_CONST / w;
+ if (unlikely(fact >> 32)) {
+ while (fact >> 32) {
+ fact >>= 1;
+ shift--;
+ }
}
- /*
- * Check whether we'd overflow the 64-bit multiplication:
- */
- if (unlikely(tmp > WMULT_CONST))
- tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
- WMULT_SHIFT/2);
- else
- tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
+ /* hint to use a 32x32->64 mul */
+ fact = (u64)(u32)fact * lw->inv_weight;
+
+ while (fact >> 32) {
+ fact >>= 1;
+ shift--;
+ }
- return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
+ return mul_u64_u32_shr(delta_exec, fact, shift);
}
@@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
#endif /* CONFIG_FAIR_GROUP_SCHED */
static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
/**************************************************************
* Scheduling class tree data structure manipulation methods:
@@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
/*
* delta /= w
*/
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
if (unlikely(se->load.weight != NICE_0_LOAD))
- delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
+ delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
return delta;
}
@@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_load_add(&lw, se->load.weight);
load = &lw;
}
- slice = calc_delta_mine(slice, se->load.weight, load);
+ slice = __calc_delta(slice, se->load.weight, load);
}
return slice;
}
@@ -703,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p)
#endif
/*
- * Update the current task's runtime statistics. Skip current tasks that
- * are not in our scheduling class.
+ * Update the current task's runtime statistics.
*/
-static inline void
-__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
- unsigned long delta_exec)
-{
- unsigned long delta_exec_weighted;
-
- schedstat_set(curr->statistics.exec_max,
- max((u64)delta_exec, curr->statistics.exec_max));
-
- curr->sum_exec_runtime += delta_exec;
- schedstat_add(cfs_rq, exec_clock, delta_exec);
- delta_exec_weighted = calc_delta_fair(delta_exec, curr);
-
- curr->vruntime += delta_exec_weighted;
- update_min_vruntime(cfs_rq);
-}
-
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
u64 now = rq_clock_task(rq_of(cfs_rq));
- unsigned long delta_exec;
+ u64 delta_exec;
if (unlikely(!curr))
return;
- /*
- * Get the amount of time the current task was running
- * since the last time we changed load (this cannot
- * overflow on 32 bits):
- */
- delta_exec = (unsigned long)(now - curr->exec_start);
- if (!delta_exec)
+ delta_exec = now - curr->exec_start;
+ if (unlikely((s64)delta_exec <= 0))
return;
- __update_curr(cfs_rq, curr, delta_exec);
curr->exec_start = now;
+ schedstat_set(curr->statistics.exec_max,
+ max(delta_exec, curr->statistics.exec_max));
+
+ curr->sum_exec_runtime += delta_exec;
+ schedstat_add(cfs_rq, exec_clock, delta_exec);
+
+ curr->vruntime += calc_delta_fair(delta_exec, curr);
+ update_min_vruntime(cfs_rq);
+
if (entity_is_task(curr)) {
struct task_struct *curtask = task_of(curr);
@@ -3015,8 +3001,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
}
}
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
- unsigned long delta_exec)
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
cfs_rq->runtime_remaining -= delta_exec;
@@ -3034,7 +3019,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
}
static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
return;
@@ -3574,8 +3559,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
return rq_clock_task(rq_of(cfs_rq));
}
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
- unsigned long delta_exec) {}
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -5379,10 +5363,31 @@ void update_group_power(struct sched_domain *sd, int cpu)
*/
for_each_cpu(cpu, sched_group_cpus(sdg)) {
- struct sched_group *sg = cpu_rq(cpu)->sd->groups;
+ struct sched_group_power *sgp;
+ struct rq *rq = cpu_rq(cpu);
+
+ /*
+ * build_sched_domains() -> init_sched_groups_power()
+ * gets here before we've attached the domains to the
+ * runqueues.
+ *
+ * Use power_of(), which is set irrespective of domains
+ * in update_cpu_power().
+ *
+ * This avoids power/power_orig from being 0 and
+ * causing divide-by-zero issues on boot.
+ *
+ * Runtime updates will correct power_orig.
+ */
+ if (unlikely(!rq->sd)) {
+ power_orig += power_of(cpu);
+ power += power_of(cpu);
+ continue;
+ }
- power_orig += sg->sgp->power_orig;
- power += sg->sgp->power;
+ sgp = rq->sd->groups->sgp;
+ power_orig += sgp->power_orig;
+ power += sgp->power;
}
} else {
/*
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7d57275fc396..1c4065575fa2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -901,6 +901,13 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Change rq's cpupri only if rt_rq is the top queue.
+ */
+ if (&rq->rt != rt_rq)
+ return;
+#endif
if (rq->online && prio < prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
}
@@ -910,6 +917,13 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Change rq's cpupri only if rt_rq is the top queue.
+ */
+ if (&rq->rt != rt_rq)
+ return;
+#endif
if (rq->online && rt_rq->highest_prio.curr != prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 64522ecdfe0e..162b03ab0ad2 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -33,6 +33,21 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
*/
ktime_t tick_next_period;
ktime_t tick_period;
+
+/*
+ * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR
+ * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This
+ * variable has two functions:
+ *
+ * 1) Prevent a thundering herd issue of a gazillion of CPUs trying to grab the
+ * timekeeping lock all at once. Only the CPU which is assigned to do the
+ * update is handling it.
+ *
+ * 2) Hand off the duty in the NOHZ idle case by setting the value to
+ * TICK_DO_TIMER_NONE, i.e. a non existing CPU. So the next cpu which looks
+ * at it will take over and keep the time keeping alive. The handover
+ * procedure also covers cpu hotplug.
+ */
int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
/*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3612fc77f834..ea20f7d1ac2c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -361,8 +361,8 @@ void __init tick_nohz_init(void)
/*
* NO HZ enabled ?
*/
-int tick_nohz_enabled __read_mostly = 1;
-
+static int tick_nohz_enabled __read_mostly = 1;
+int tick_nohz_active __read_mostly;
/*
* Enable / Disable tickless mode
*/
@@ -465,7 +465,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
ktime_t now, idle;
- if (!tick_nohz_enabled)
+ if (!tick_nohz_active)
return -1;
now = ktime_get();
@@ -506,7 +506,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
ktime_t now, iowait;
- if (!tick_nohz_enabled)
+ if (!tick_nohz_active)
return -1;
now = ktime_get();
@@ -711,8 +711,10 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
return false;
}
- if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
+ if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
+ ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ };
return false;
+ }
if (need_resched())
return false;
@@ -799,11 +801,6 @@ void tick_nohz_idle_enter(void)
local_irq_disable();
ts = &__get_cpu_var(tick_cpu_sched);
- /*
- * set ts->inidle unconditionally. even if the system did not
- * switch to nohz mode the cpu frequency governers rely on the
- * update of the idle time accounting in tick_nohz_start_idle().
- */
ts->inidle = 1;
__tick_nohz_idle_enter(ts);
@@ -973,7 +970,7 @@ static void tick_nohz_switch_to_nohz(void)
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
ktime_t next;
- if (!tick_nohz_enabled)
+ if (!tick_nohz_active)
return;
local_irq_disable();
@@ -981,7 +978,7 @@ static void tick_nohz_switch_to_nohz(void)
local_irq_enable();
return;
}
-
+ tick_nohz_active = 1;
ts->nohz_mode = NOHZ_MODE_LOWRES;
/*
@@ -1139,8 +1136,10 @@ void tick_setup_sched_timer(void)
}
#ifdef CONFIG_NO_HZ_COMMON
- if (tick_nohz_enabled)
+ if (tick_nohz_enabled) {
ts->nohz_mode = NOHZ_MODE_HIGHRES;
+ tick_nohz_active = 1;
+ }
#endif
}
#endif /* HIGH_RES_TIMERS */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3abf53418b67..87b4f00284c9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1347,7 +1347,7 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
tk->xtime_nsec -= remainder;
tk->xtime_nsec += 1ULL << tk->shift;
tk->ntp_error += remainder << tk->ntp_error_shift;
-
+ tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift;
}
#else
#define old_vsyscall_fixup(tk)
diff --git a/kernel/timer.c b/kernel/timer.c
index 6582b82fa966..accfd241b9e5 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1518,9 +1518,8 @@ static int init_timers_cpu(int cpu)
/*
* The APs use this path later in boot
*/
- base = kmalloc_node(sizeof(*base),
- GFP_KERNEL | __GFP_ZERO,
- cpu_to_node(cpu));
+ base = kzalloc_node(sizeof(*base), GFP_KERNEL,
+ cpu_to_node(cpu));
if (!base)
return -ENOMEM;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 22fa55696760..0e9f9eaade2f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -367,9 +367,6 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
static int __register_ftrace_function(struct ftrace_ops *ops)
{
- if (unlikely(ftrace_disabled))
- return -ENODEV;
-
if (FTRACE_WARN_ON(ops == &global_ops))
return -EINVAL;
@@ -428,9 +425,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
{
int ret;
- if (ftrace_disabled)
- return -ENODEV;
-
if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
return -EBUSY;
@@ -2088,10 +2082,15 @@ static void ftrace_startup_enable(int command)
static int ftrace_startup(struct ftrace_ops *ops, int command)
{
bool hash_enable = true;
+ int ret;
if (unlikely(ftrace_disabled))
return -ENODEV;
+ ret = __register_ftrace_function(ops);
+ if (ret)
+ return ret;
+
ftrace_start_up++;
command |= FTRACE_UPDATE_CALLS;
@@ -2113,12 +2112,17 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
return 0;
}
-static void ftrace_shutdown(struct ftrace_ops *ops, int command)
+static int ftrace_shutdown(struct ftrace_ops *ops, int command)
{
bool hash_disable = true;
+ int ret;
if (unlikely(ftrace_disabled))
- return;
+ return -ENODEV;
+
+ ret = __unregister_ftrace_function(ops);
+ if (ret)
+ return ret;
ftrace_start_up--;
/*
@@ -2153,9 +2157,10 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command)
}
if (!command || !ftrace_enabled)
- return;
+ return 0;
ftrace_run_update_code(command);
+ return 0;
}
static void ftrace_startup_sysctl(void)
@@ -3060,16 +3065,13 @@ static void __enable_ftrace_function_probe(void)
if (i == FTRACE_FUNC_HASHSIZE)
return;
- ret = __register_ftrace_function(&trace_probe_ops);
- if (!ret)
- ret = ftrace_startup(&trace_probe_ops, 0);
+ ret = ftrace_startup(&trace_probe_ops, 0);
ftrace_probe_registered = 1;
}
static void __disable_ftrace_function_probe(void)
{
- int ret;
int i;
if (!ftrace_probe_registered)
@@ -3082,9 +3084,7 @@ static void __disable_ftrace_function_probe(void)
}
/* no more funcs left */
- ret = __unregister_ftrace_function(&trace_probe_ops);
- if (!ret)
- ftrace_shutdown(&trace_probe_ops, 0);
+ ftrace_shutdown(&trace_probe_ops, 0);
ftrace_probe_registered = 0;
}
@@ -4366,12 +4366,15 @@ core_initcall(ftrace_nodyn_init);
static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
static inline void ftrace_startup_enable(int command) { }
/* Keep as macros so we do not need to define the commands */
-# define ftrace_startup(ops, command) \
- ({ \
- (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
- 0; \
+# define ftrace_startup(ops, command) \
+ ({ \
+ int ___ret = __register_ftrace_function(ops); \
+ if (!___ret) \
+ (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
+ ___ret; \
})
-# define ftrace_shutdown(ops, command) do { } while (0)
+# define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops)
+
# define ftrace_startup_sysctl() do { } while (0)
# define ftrace_shutdown_sysctl() do { } while (0)
@@ -4780,9 +4783,7 @@ int register_ftrace_function(struct ftrace_ops *ops)
mutex_lock(&ftrace_lock);
- ret = __register_ftrace_function(ops);
- if (!ret)
- ret = ftrace_startup(ops, 0);
+ ret = ftrace_startup(ops, 0);
mutex_unlock(&ftrace_lock);
@@ -4801,9 +4802,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
int ret;
mutex_lock(&ftrace_lock);
- ret = __unregister_ftrace_function(ops);
- if (!ret)
- ftrace_shutdown(ops, 0);
+ ret = ftrace_shutdown(ops, 0);
mutex_unlock(&ftrace_lock);
return ret;
@@ -4997,6 +4996,13 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
return NOTIFY_DONE;
}
+/* Just a place holder for function graph */
+static struct ftrace_ops fgraph_ops __read_mostly = {
+ .func = ftrace_stub,
+ .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL |
+ FTRACE_OPS_FL_RECURSION_SAFE,
+};
+
int register_ftrace_graph(trace_func_graph_ret_t retfunc,
trace_func_graph_ent_t entryfunc)
{
@@ -5023,7 +5029,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
ftrace_graph_return = retfunc;
ftrace_graph_entry = entryfunc;
- ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
+ ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET);
out:
mutex_unlock(&ftrace_lock);
@@ -5040,7 +5046,7 @@ void unregister_ftrace_graph(void)
ftrace_graph_active--;
ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
ftrace_graph_entry = ftrace_graph_entry_stub;
- ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
+ ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET);
unregister_pm_notifier(&ftrace_suspend_notifier);
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 78e27e3b52ac..e854f420e033 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -24,6 +24,12 @@ static int total_ref_count;
static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
struct perf_event *p_event)
{
+ if (tp_event->perf_perm) {
+ int ret = tp_event->perf_perm(tp_event, p_event);
+ if (ret)
+ return ret;
+ }
+
/* The ftrace function trace is allowed only for root. */
if (ftrace_event_is_function(tp_event) &&
perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
@@ -173,7 +179,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
int perf_trace_init(struct perf_event *p_event)
{
struct ftrace_event_call *tp_event;
- int event_id = p_event->attr.config;
+ u64 event_id = p_event->attr.config;
int ret = -EINVAL;
mutex_lock(&event_mutex);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f919a2e21bf3..a11800ae96de 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2314,6 +2314,9 @@ int event_trace_del_tracer(struct trace_array *tr)
/* Disable any running events */
__ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);
+ /* Access to events are within rcu_read_lock_sched() */
+ synchronize_sched();
+
down_write(&trace_event_sem);
__trace_remove_event_dirs(tr);
debugfs_remove_recursive(tr->event_dir);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index e4b6d11bdf78..ea90eb5f6f17 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -431,11 +431,6 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file,
if (!tr->sys_refcount_enter)
unregister_trace_sys_enter(ftrace_syscall_enter, tr);
mutex_unlock(&syscall_trace_lock);
- /*
- * Callers expect the event to be completely disabled on
- * return, so wait for current handlers to finish.
- */
- synchronize_sched();
}
static int reg_event_syscall_exit(struct ftrace_event_file *file,
@@ -474,11 +469,6 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file,
if (!tr->sys_refcount_exit)
unregister_trace_sys_exit(ftrace_syscall_exit, tr);
mutex_unlock(&syscall_trace_lock);
- /*
- * Callers expect the event to be completely disabled on
- * return, so wait for current handlers to finish.
- */
- synchronize_sched();
}
static int __init init_syscall_trace(struct ftrace_event_call *call)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 987293d03ebc..c66912be990f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -305,6 +305,9 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
/* I: attributes used when instantiating standard unbound pools on demand */
static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
+/* I: attributes used when instantiating ordered pools on demand */
+static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
+
struct workqueue_struct *system_wq __read_mostly;
EXPORT_SYMBOL(system_wq);
struct workqueue_struct *system_highpri_wq __read_mostly;
@@ -518,14 +521,21 @@ static inline void debug_work_activate(struct work_struct *work) { }
static inline void debug_work_deactivate(struct work_struct *work) { }
#endif
-/* allocate ID and assign it to @pool */
+/**
+ * worker_pool_assign_id - allocate ID and assing it to @pool
+ * @pool: the pool pointer of interest
+ *
+ * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
+ * successfully, -errno on failure.
+ */
static int worker_pool_assign_id(struct worker_pool *pool)
{
int ret;
lockdep_assert_held(&wq_pool_mutex);
- ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL);
+ ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
+ GFP_KERNEL);
if (ret >= 0) {
pool->id = ret;
return 0;
@@ -1320,7 +1330,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
debug_work_activate(work);
- /* if dying, only works from the same workqueue are allowed */
+ /* if draining, only works from the same workqueue are allowed */
if (unlikely(wq->flags & __WQ_DRAINING) &&
WARN_ON_ONCE(!is_chained_work(wq)))
return;
@@ -1736,16 +1746,17 @@ static struct worker *create_worker(struct worker_pool *pool)
if (IS_ERR(worker->task))
goto fail;
+ set_user_nice(worker->task, pool->attrs->nice);
+
+ /* prevent userland from meddling with cpumask of workqueue workers */
+ worker->task->flags |= PF_NO_SETAFFINITY;
+
/*
* set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
* online CPUs. It'll be re-applied when any of the CPUs come up.
*/
- set_user_nice(worker->task, pool->attrs->nice);
set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
- /* prevent userland from meddling with cpumask of workqueue workers */
- worker->task->flags |= PF_NO_SETAFFINITY;
-
/*
* The caller is responsible for ensuring %POOL_DISASSOCIATED
* remains stable across this function. See the comments above the
@@ -4106,7 +4117,7 @@ out_unlock:
static int alloc_and_link_pwqs(struct workqueue_struct *wq)
{
bool highpri = wq->flags & WQ_HIGHPRI;
- int cpu;
+ int cpu, ret;
if (!(wq->flags & WQ_UNBOUND)) {
wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
@@ -4126,6 +4137,13 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
mutex_unlock(&wq->mutex);
}
return 0;
+ } else if (wq->flags & __WQ_ORDERED) {
+ ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
+ /* there should only be single pwq for ordering guarantee */
+ WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
+ wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
+ "ordering guarantee broken for workqueue %s\n", wq->name);
+ return ret;
} else {
return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
}
@@ -5009,10 +5027,6 @@ static int __init init_workqueues(void)
int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
int i, cpu;
- /* make sure we have enough bits for OFFQ pool ID */
- BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
- WORK_CPU_END * NR_STD_WORKER_POOLS);
-
WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
@@ -5051,13 +5065,23 @@ static int __init init_workqueues(void)
}
}
- /* create default unbound wq attrs */
+ /* create default unbound and ordered wq attrs */
for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
struct workqueue_attrs *attrs;
BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
attrs->nice = std_nice[i];
unbound_std_wq_attrs[i] = attrs;
+
+ /*
+ * An ordered wq should have only one pwq as ordering is
+ * guaranteed by max_active which is enforced by pwqs.
+ * Turn off NUMA so that dfl_pwq is used for all nodes.
+ */
+ BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
+ attrs->nice = std_nice[i];
+ attrs->no_numa = true;
+ ordered_wq_attrs[i] = attrs;
}
system_wq = alloc_workqueue("events", 0, 0);