summaryrefslogtreecommitdiffstats
path: root/kernel/sched/core.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-07-25 13:59:34 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2016-07-25 13:59:34 -0700
commitcca08cd66ce6cc37812b6b36986ba7eaabd33e0b (patch)
treef68966cff4e888f51bd18497c358662c14cab6a2 /kernel/sched/core.c
parent7e4dc77b2869a683fc43c0394fca5441816390ba (diff)
parent748c7201e622d1c24abb4f85072d2e74d12f295f (diff)
downloadlinux-cca08cd66ce6cc37812b6b36986ba7eaabd33e0b.tar.bz2
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: - introduce and use task_rcu_dereference()/try_get_task_struct() to fix and generalize task_struct handling (Oleg Nesterov) - do various per entity load tracking (PELT) fixes and optimizations (Peter Zijlstra) - cputime virt-steal time accounting enhancements/fixes (Wanpeng Li) - introduce consolidated cputime output file cpuacct.usage_all and related refactorings (Zhao Lei) - ... plus misc fixes and enhancements * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: sched/core: Panic on scheduling while atomic bugs if kernel.panic_on_warn is set sched/cpuacct: Introduce cpuacct.usage_all to show all CPU stats together sched/cpuacct: Use loop to consolidate code in cpuacct_stats_show() sched/cpuacct: Merge cpuacct_usage_index and cpuacct_stat_index enums sched/fair: Rework throttle_count sync sched/core: Fix sched_getaffinity() return value kerneldoc comment sched/fair: Reorder cgroup creation code sched/fair: Apply more PELT fixes sched/fair: Fix PELT integrity for new tasks sched/cgroup: Fix cpu_cgroup_fork() handling sched/fair: Fix PELT integrity for new groups sched/fair: Fix and optimize the fork() path sched/cputime: Add steal time support to full dynticks CPU time accounting sched/cputime: Fix prev steal time accouting during CPU hotplug KVM: Fix steal clock warp during guest CPU hotplug sched/debug: Always show 'nr_migrations' sched/fair: Use task_rcu_dereference() sched/api: Introduce task_rcu_dereference() and try_get_task_struct() sched/idle: Optimize the generic idle loop sched/fair: Fix the wrong throttled clock time for cfs_rq_clock_task()
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--kernel/sched/core.c114
1 files changed, 81 insertions, 33 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index af0ef74df23c..5c883fe8e440 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2342,11 +2342,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
__sched_fork(clone_flags, p);
/*
- * We mark the process as running here. This guarantees that
+ * We mark the process as NEW here. This guarantees that
* nobody will actually run it, and a signal or other external
* event cannot wake it up and insert it on the runqueue either.
*/
- p->state = TASK_RUNNING;
+ p->state = TASK_NEW;
/*
* Make sure we do not leak PI boosting priority to the child.
@@ -2383,8 +2383,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->sched_class = &fair_sched_class;
}
- if (p->sched_class->task_fork)
- p->sched_class->task_fork(p);
+ init_entity_runnable_average(&p->se);
/*
* The child is not yet in the pid-hash so no cgroup attach races,
@@ -2394,7 +2393,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
* Silence PROVE_RCU.
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
- set_task_cpu(p, cpu);
+ /*
+ * We're setting the cpu for the first time, we don't migrate,
+ * so use __set_task_cpu().
+ */
+ __set_task_cpu(p, cpu);
+ if (p->sched_class->task_fork)
+ p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
#ifdef CONFIG_SCHED_INFO
@@ -2526,16 +2531,18 @@ void wake_up_new_task(struct task_struct *p)
struct rq_flags rf;
struct rq *rq;
- /* Initialize new task's runnable average */
- init_entity_runnable_average(&p->se);
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+ p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
+ *
+ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
+ * as we're not fully set-up yet.
*/
- set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p, &rf);
post_init_entity_util_avg(&p->se);
@@ -3161,6 +3168,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
pr_cont("\n");
}
#endif
+ if (panic_on_warn)
+ panic("scheduling while atomic\n");
+
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
@@ -4752,7 +4762,8 @@ out_unlock:
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to hold the current cpu mask
*
- * Return: 0 on success. An error code otherwise.
+ * Return: size of CPU mask copied to user_mask_ptr on success. An
+ * error code otherwise.
*/
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
@@ -7233,7 +7244,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)
struct rq *rq = cpu_rq(cpu);
rq->calc_load_update = calc_load_update;
- account_reset_rq(rq);
update_max_interval();
}
@@ -7713,6 +7723,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
INIT_LIST_HEAD(&tg->children);
list_add_rcu(&tg->siblings, &parent->children);
spin_unlock_irqrestore(&task_group_lock, flags);
+
+ online_fair_sched_group(tg);
}
/* rcu callback to free various structures associated with a task group */
@@ -7741,27 +7753,9 @@ void sched_offline_group(struct task_group *tg)
spin_unlock_irqrestore(&task_group_lock, flags);
}
-/* change task's runqueue when it moves between groups.
- * The caller of this function should have put the task in its new group
- * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- * reflect its new group.
- */
-void sched_move_task(struct task_struct *tsk)
+static void sched_change_group(struct task_struct *tsk, int type)
{
struct task_group *tg;
- int queued, running;
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(tsk, &rf);
-
- running = task_current(rq, tsk);
- queued = task_on_rq_queued(tsk);
-
- if (queued)
- dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
- if (unlikely(running))
- put_prev_task(rq, tsk);
/*
* All callers are synchronized by task_rq_lock(); we do not use RCU
@@ -7774,11 +7768,37 @@ void sched_move_task(struct task_struct *tsk)
tsk->sched_task_group = tg;
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (tsk->sched_class->task_move_group)
- tsk->sched_class->task_move_group(tsk);
+ if (tsk->sched_class->task_change_group)
+ tsk->sched_class->task_change_group(tsk, type);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
+}
+
+/*
+ * Change task's runqueue when it moves between groups.
+ *
+ * The caller of this function should have put the task in its new group by
+ * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
+ * its new group.
+ */
+void sched_move_task(struct task_struct *tsk)
+{
+ int queued, running;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(tsk, &rf);
+
+ running = task_current(rq, tsk);
+ queued = task_on_rq_queued(tsk);
+
+ if (queued)
+ dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
+ if (unlikely(running))
+ put_prev_task(rq, tsk);
+
+ sched_change_group(tsk, TASK_MOVE_GROUP);
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
@@ -8206,15 +8226,27 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
sched_free_group(tg);
}
+/*
+ * This is called before wake_up_new_task(), therefore we really only
+ * have to set its group bits, all the other stuff does not apply.
+ */
static void cpu_cgroup_fork(struct task_struct *task)
{
- sched_move_task(task);
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(task, &rf);
+
+ sched_change_group(task, TASK_SET_GROUP);
+
+ task_rq_unlock(rq, task, &rf);
}
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
+ int ret = 0;
cgroup_taskset_for_each(task, css, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
@@ -8225,8 +8257,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
if (task->sched_class != &fair_sched_class)
return -EINVAL;
#endif
+ /*
+ * Serialize against wake_up_new_task() such that if its
+ * running, we're sure to observe its full state.
+ */
+ raw_spin_lock_irq(&task->pi_lock);
+ /*
+ * Avoid calling sched_move_task() before wake_up_new_task()
+ * has happened. This would lead to problems with PELT, due to
+ * move wanting to detach+attach while we're not attached yet.
+ */
+ if (task->state == TASK_NEW)
+ ret = -EINVAL;
+ raw_spin_unlock_irq(&task->pi_lock);
+
+ if (ret)
+ break;
}
- return 0;
+ return ret;
}
static void cpu_cgroup_attach(struct cgroup_taskset *tset)