From e3fca9e7cbfb72694a21c886fcdf9f059cfded9c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 11 Jun 2015 14:46:37 +0200
Subject: sched: Replace post_schedule with a balance callback list

Generalize the post_schedule() stuff into a balance callback list.
This allows us to more easily use it outside of schedule() and cross
sched_class.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124742.424032725@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/core.c     | 36 ++++++++++++++++++++++++------------
 kernel/sched/deadline.c | 21 +++++++++++----------
 kernel/sched/rt.c       | 25 +++++++++++--------------
 kernel/sched/sched.h    | 19 +++++++++++++++++--
 4 files changed, 63 insertions(+), 38 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 41942a5f3315..fa32bc09dadf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2277,23 +2277,35 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 #ifdef CONFIG_SMP
 
 /* rq->lock is NOT held, but preemption is disabled */
-static inline void post_schedule(struct rq *rq)
+static void __balance_callback(struct rq *rq)
 {
-	if (rq->post_schedule) {
-		unsigned long flags;
+	struct callback_head *head, *next;
+	void (*func)(struct rq *rq);
+	unsigned long flags;
 
-		raw_spin_lock_irqsave(&rq->lock, flags);
-		if (rq->curr->sched_class->post_schedule)
-			rq->curr->sched_class->post_schedule(rq);
-		raw_spin_unlock_irqrestore(&rq->lock, flags);
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	head = rq->balance_callback;
+	rq->balance_callback = NULL;
+	while (head) {
+		func = (void (*)(struct rq *))head->func;
+		next = head->next;
+		head->next = NULL;
+		head = next;
 
-		rq->post_schedule = 0;
+		func(rq);
 	}
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static inline void balance_callback(struct rq *rq)
+{
+	if (unlikely(rq->balance_callback))
+		__balance_callback(rq);
 }
 
 #else
 
-static inline void post_schedule(struct rq *rq)
+static inline void balance_callback(struct rq *rq)
 {
 }
 
@@ -2311,7 +2323,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	/* finish_task_switch() drops rq->lock and enables preemtion */
 	preempt_disable();
 	rq = finish_task_switch(prev);
-	post_schedule(rq);
+	balance_callback(rq);
 	preempt_enable();
 
 	if (current->set_child_tid)
@@ -2823,7 +2835,7 @@ static void __sched __schedule(void)
 	} else
 		raw_spin_unlock_irq(&rq->lock);
 
-	post_schedule(rq);
+	balance_callback(rq);
 }
 
 static inline void sched_submit_work(struct task_struct *tsk)
@@ -7219,7 +7231,7 @@ void __init sched_init(void)
 		rq->sd = NULL;
 		rq->rd = NULL;
 		rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
-		rq->post_schedule = 0;
+		rq->balance_callback = NULL;
 		rq->active_balance = 0;
 		rq->next_balance = jiffies;
 		rq->push_cpu = 0;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 7a08d590990e..d80523fb1de5 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -213,9 +213,16 @@ static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
 	return dl_task(prev);
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static DEFINE_PER_CPU(struct callback_head, dl_balance_head);
+
+static void push_dl_tasks(struct rq *);
+
+static inline void queue_push_tasks(struct rq *rq)
 {
-	rq->post_schedule = has_pushable_dl_tasks(rq);
+	if (!has_pushable_dl_tasks(rq))
+		return;
+
+	queue_balance_callback(rq, &per_cpu(dl_balance_head, rq->cpu), push_dl_tasks);
 }
 
 static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
@@ -296,7 +303,7 @@ static inline int pull_dl_task(struct rq *rq)
 	return 0;
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static inline void queue_push_tasks(struct rq *rq)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1126,7 +1133,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
 	if (hrtick_enabled(rq))
 		start_hrtick_dl(rq, p);
 
-	set_post_schedule(rq);
+	queue_push_tasks(rq);
 
 	return p;
 }
@@ -1544,11 +1551,6 @@ skip:
 	return ret;
 }
 
-static void post_schedule_dl(struct rq *rq)
-{
-	push_dl_tasks(rq);
-}
-
 /*
  * Since the task is not running and a reschedule is not going to happen
  * anytime soon on its runqueue, we try pushing it away now.
@@ -1784,7 +1786,6 @@ const struct sched_class dl_sched_class = {
 	.set_cpus_allowed       = set_cpus_allowed_dl,
 	.rq_online              = rq_online_dl,
 	.rq_offline             = rq_offline_dl,
-	.post_schedule		= post_schedule_dl,
 	.task_woken		= task_woken_dl,
 #endif
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7d7093c51f8d..4f3726fe1246 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -354,13 +354,16 @@ static inline int has_pushable_tasks(struct rq *rq)
 	return !plist_head_empty(&rq->rt.pushable_tasks);
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static DEFINE_PER_CPU(struct callback_head, rt_balance_head);
+
+static void push_rt_tasks(struct rq *);
+
+static inline void queue_push_tasks(struct rq *rq)
 {
-	/*
-	 * We detect this state here so that we can avoid taking the RQ
-	 * lock again later if there is no need to push
-	 */
-	rq->post_schedule = has_pushable_tasks(rq);
+	if (!has_pushable_tasks(rq))
+		return;
+
+	queue_balance_callback(rq, &per_cpu(rt_balance_head, rq->cpu), push_rt_tasks);
 }
 
 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -417,7 +420,7 @@ static inline int pull_rt_task(struct rq *this_rq)
 	return 0;
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static inline void queue_push_tasks(struct rq *rq)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1497,7 +1500,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
 	/* The running task is never eligible for pushing */
 	dequeue_pushable_task(rq, p);
 
-	set_post_schedule(rq);
+	queue_push_tasks(rq);
 
 	return p;
 }
@@ -2042,11 +2045,6 @@ skip:
 	return ret;
 }
 
-static void post_schedule_rt(struct rq *rq)
-{
-	push_rt_tasks(rq);
-}
-
 /*
  * If we are not running and we are not going to reschedule soon, we should
  * try to push tasks away now
@@ -2318,7 +2316,6 @@ const struct sched_class rt_sched_class = {
 	.set_cpus_allowed       = set_cpus_allowed_rt,
 	.rq_online              = rq_online_rt,
 	.rq_offline             = rq_offline_rt,
-	.post_schedule		= post_schedule_rt,
 	.task_woken		= task_woken_rt,
 	.switched_from		= switched_from_rt,
 #endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f10a445910c9..62949ab06bc2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -624,9 +624,10 @@ struct rq {
 	unsigned long cpu_capacity;
 	unsigned long cpu_capacity_orig;
 
+	struct callback_head *balance_callback;
+
 	unsigned char idle_balance;
 	/* For active balancing */
-	int post_schedule;
 	int active_balance;
 	int push_cpu;
 	struct cpu_stop_work active_balance_work;
@@ -767,6 +768,21 @@ extern int migrate_swap(struct task_struct *, struct task_struct *);
 
 #ifdef CONFIG_SMP
 
+static inline void
+queue_balance_callback(struct rq *rq,
+		       struct callback_head *head,
+		       void (*func)(struct rq *rq))
+{
+	lockdep_assert_held(&rq->lock);
+
+	if (unlikely(head->next))
+		return;
+
+	head->func = (void (*)(struct callback_head *))func;
+	head->next = rq->balance_callback;
+	rq->balance_callback = head;
+}
+
 extern void sched_ttwu_pending(void);
 
 #define rcu_dereference_check_sched_domain(p) \
@@ -1192,7 +1208,6 @@ struct sched_class {
 	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
 	void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
 
-	void (*post_schedule) (struct rq *this_rq);
 	void (*task_waking) (struct task_struct *task);
 	void (*task_woken) (struct rq *this_rq, struct task_struct *task);
 
-- 
cgit v1.2.3


From dbc7f069b93a249340e974d6e8f55656280d8701 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 11 Jun 2015 14:46:38 +0200
Subject: sched: Use replace normalize_task() with __sched_setscheduler()

Reduce duplicate logic; normalize_task() is a simplified version of
__sched_setscheduler(). Parametrize the difference and collapse.

This reduces the amount of check_class_changed() sites.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124742.532642391@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/core.c | 65 +++++++++++++++++++----------------------------------
 1 file changed, 23 insertions(+), 42 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fa32bc09dadf..b610ef9e522f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3438,7 +3438,7 @@ static bool dl_param_changed(struct task_struct *p,
 
 static int __sched_setscheduler(struct task_struct *p,
 				const struct sched_attr *attr,
-				bool user)
+				bool user, bool pi)
 {
 	int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
 		      MAX_RT_PRIO - 1 - attr->sched_priority;
@@ -3624,18 +3624,20 @@ change:
 	p->sched_reset_on_fork = reset_on_fork;
 	oldprio = p->prio;
 
-	/*
-	 * Take priority boosted tasks into account. If the new
-	 * effective priority is unchanged, we just store the new
-	 * normal parameters and do not touch the scheduler class and
-	 * the runqueue. This will be done when the task deboost
-	 * itself.
-	 */
-	new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
-	if (new_effective_prio == oldprio) {
-		__setscheduler_params(p, attr);
-		task_rq_unlock(rq, p, &flags);
-		return 0;
+	if (pi) {
+		/*
+		 * Take priority boosted tasks into account. If the new
+		 * effective priority is unchanged, we just store the new
+		 * normal parameters and do not touch the scheduler class and
+		 * the runqueue. This will be done when the task deboost
+		 * itself.
+		 */
+		new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+		if (new_effective_prio == oldprio) {
+			__setscheduler_params(p, attr);
+			task_rq_unlock(rq, p, &flags);
+			return 0;
+		}
 	}
 
 	queued = task_on_rq_queued(p);
@@ -3646,7 +3648,7 @@ change:
 		put_prev_task(rq, p);
 
 	prev_class = p->sched_class;
-	__setscheduler(rq, p, attr, true);
+	__setscheduler(rq, p, attr, pi);
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
@@ -3661,7 +3663,8 @@ change:
 	check_class_changed(rq, p, prev_class, oldprio);
 	task_rq_unlock(rq, p, &flags);
 
-	rt_mutex_adjust_pi(p);
+	if (pi)
+		rt_mutex_adjust_pi(p);
 
 	return 0;
 }
@@ -3682,7 +3685,7 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
 		attr.sched_policy = policy;
 	}
 
-	return __sched_setscheduler(p, &attr, check);
+	return __sched_setscheduler(p, &attr, check, true);
 }
 /**
  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
@@ -3703,7 +3706,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
 
 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
 {
-	return __sched_setscheduler(p, attr, true);
+	return __sched_setscheduler(p, attr, true, true);
 }
 EXPORT_SYMBOL_GPL(sched_setattr);
 
@@ -7361,32 +7364,12 @@ EXPORT_SYMBOL(___might_sleep);
 #endif
 
 #ifdef CONFIG_MAGIC_SYSRQ
-static void normalize_task(struct rq *rq, struct task_struct *p)
+void normalize_rt_tasks(void)
 {
-	const struct sched_class *prev_class = p->sched_class;
+	struct task_struct *g, *p;
 	struct sched_attr attr = {
 		.sched_policy = SCHED_NORMAL,
 	};
-	int old_prio = p->prio;
-	int queued;
-
-	queued = task_on_rq_queued(p);
-	if (queued)
-		dequeue_task(rq, p, 0);
-	__setscheduler(rq, p, &attr, false);
-	if (queued) {
-		enqueue_task(rq, p, 0);
-		resched_curr(rq);
-	}
-
-	check_class_changed(rq, p, prev_class, old_prio);
-}
-
-void normalize_rt_tasks(void)
-{
-	struct task_struct *g, *p;
-	unsigned long flags;
-	struct rq *rq;
 
 	read_lock(&tasklist_lock);
 	for_each_process_thread(g, p) {
@@ -7413,9 +7396,7 @@ void normalize_rt_tasks(void)
 			continue;
 		}
 
-		rq = task_rq_lock(p, &flags);
-		normalize_task(rq, p);
-		task_rq_unlock(rq, p, &flags);
+		__sched_setscheduler(p, &attr, false, false);
 	}
 	read_unlock(&tasklist_lock);
 }
-- 
cgit v1.2.3


From 4c9a4bc89a9cca8128bce67d6bc8870d6b7ee0b2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 11 Jun 2015 14:46:39 +0200
Subject: sched: Allow balance callbacks for check_class_changed()

In order to remove dropping rq->lock from the
switched_{to,from}()/prio_changed() sched_class methods, run the
balance callbacks after it.

We need to remove dropping rq->lock because its buggy,
suppose using sched_setattr()/sched_setscheduler() to change a running
task from FIFO to OTHER.

By the time we get to switched_from_rt() the task is already enqueued
on the cfs runqueues. If switched_from_rt() does pull_rt_task() and
drops rq->lock, load-balancing can come in and move our task @p to
another rq.

The subsequent switched_to_fair() still assumes @p is on @rq and bad
things will happen.

By using balance callbacks we delay the load-balancing operations
{rt,dl}x{push,pull} until we've done all the important work and the
task is fully set up.

Furthermore, the balance callbacks do not know about @p, therefore
they cannot get confused like this.

Reported-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Link: http://lkml.kernel.org/r/20150611124742.615343911@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/core.c | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b610ef9e522f..ef546e349e75 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1001,7 +1001,11 @@ inline int task_curr(const struct task_struct *p)
 }
 
 /*
- * Can drop rq->lock because from sched_class::switched_from() methods drop it.
+ * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
+ * use the balance_callback list if you want balancing.
+ *
+ * this means any call to check_class_changed() must be followed by a call to
+ * balance_callback().
  */
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 				       const struct sched_class *prev_class,
@@ -1010,7 +1014,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 	if (prev_class != p->sched_class) {
 		if (prev_class->switched_from)
 			prev_class->switched_from(rq, p);
-		/* Possble rq->lock 'hole'.  */
+
 		p->sched_class->switched_to(rq, p);
 	} else if (oldprio != p->prio || dl_task(p))
 		p->sched_class->prio_changed(rq, p, oldprio);
@@ -1491,8 +1495,12 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
-	if (p->sched_class->task_woken)
+	if (p->sched_class->task_woken) {
+		/*
+		 * XXX can drop rq->lock; most likely ok.
+		 */
 		p->sched_class->task_woken(rq, p);
+	}
 
 	if (rq->idle_stamp) {
 		u64 delta = rq_clock(rq) - rq->idle_stamp;
@@ -3100,7 +3108,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
+	preempt_disable(); /* avoid rq from going away on us */
 	__task_rq_unlock(rq);
+
+	balance_callback(rq);
+	preempt_enable();
 }
 #endif
 
@@ -3661,11 +3673,18 @@ change:
 	}
 
 	check_class_changed(rq, p, prev_class, oldprio);
+	preempt_disable(); /* avoid rq from going away on us */
 	task_rq_unlock(rq, p, &flags);
 
 	if (pi)
 		rt_mutex_adjust_pi(p);
 
+	/*
+	 * Run balance callbacks after we've adjusted the PI chain.
+	 */
+	balance_callback(rq);
+	preempt_enable();
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 8046d6806247088de5725eaf8a2580b29e50ac5a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 11 Jun 2015 14:46:40 +0200
Subject: sched,rt: Remove return value from pull_rt_task()

In order to be able to use pull_rt_task() from a callback, we need to
do away with the return value.

Since the return value indicates if we should reschedule, do this
inside the function. Since not all callers currently do this, this can
increase the number of reschedules due rt balancing.

Too many reschedules is not a correctness issues, too few are.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124742.679002000@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/rt.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4f3726fe1246..c702b48de9f0 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -260,7 +260,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 
 #ifdef CONFIG_SMP
 
-static int pull_rt_task(struct rq *this_rq);
+static void pull_rt_task(struct rq *this_rq);
 
 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 {
@@ -415,9 +415,8 @@ static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 	return false;
 }
 
-static inline int pull_rt_task(struct rq *this_rq)
+static inline void pull_rt_task(struct rq *this_rq)
 {
-	return 0;
 }
 
 static inline void queue_push_tasks(struct rq *rq)
@@ -1955,14 +1954,15 @@ static void push_irq_work_func(struct irq_work *work)
 }
 #endif /* HAVE_RT_PUSH_IPI */
 
-static int pull_rt_task(struct rq *this_rq)
+static void pull_rt_task(struct rq *this_rq)
 {
-	int this_cpu = this_rq->cpu, ret = 0, cpu;
+	int this_cpu = this_rq->cpu, cpu;
+	bool resched = false;
 	struct task_struct *p;
 	struct rq *src_rq;
 
 	if (likely(!rt_overloaded(this_rq)))
-		return 0;
+		return;
 
 	/*
 	 * Match the barrier from rt_set_overloaded; this guarantees that if we
@@ -1973,7 +1973,7 @@ static int pull_rt_task(struct rq *this_rq)
 #ifdef HAVE_RT_PUSH_IPI
 	if (sched_feat(RT_PUSH_IPI)) {
 		tell_cpu_to_push(this_rq);
-		return 0;
+		return;
 	}
 #endif
 
@@ -2026,7 +2026,7 @@ static int pull_rt_task(struct rq *this_rq)
 			if (p->prio < src_rq->curr->prio)
 				goto skip;
 
-			ret = 1;
+			resched = true;
 
 			deactivate_task(src_rq, p, 0);
 			set_task_cpu(p, this_cpu);
@@ -2042,7 +2042,8 @@ skip:
 		double_unlock_balance(this_rq, src_rq);
 	}
 
-	return ret;
+	if (resched)
+		resched_curr(this_rq);
 }
 
 /*
@@ -2138,8 +2139,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
 	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
 		return;
 
-	if (pull_rt_task(rq))
-		resched_curr(rq);
+	pull_rt_task(rq);
 }
 
 void __init init_sched_rt_class(void)
-- 
cgit v1.2.3


From fd7a4bed183523275279c9addbf42fce550c2e90 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 11 Jun 2015 14:46:41 +0200
Subject: sched, rt: Convert switched_{from, to}_rt() / prio_changed_rt() to
 balance callbacks

Remove the direct {push,pull} balancing operations from
switched_{from,to}_rt() / prio_changed_rt() and use the balance
callback queue.

Again, err on the side of too many reschedules; since too few is a
hard bug while too many is just annoying.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124742.766832367@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/rt.c | 35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c702b48de9f0..460f85888b74 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -354,16 +354,23 @@ static inline int has_pushable_tasks(struct rq *rq)
 	return !plist_head_empty(&rq->rt.pushable_tasks);
 }
 
-static DEFINE_PER_CPU(struct callback_head, rt_balance_head);
+static DEFINE_PER_CPU(struct callback_head, rt_push_head);
+static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
 
 static void push_rt_tasks(struct rq *);
+static void pull_rt_task(struct rq *);
 
 static inline void queue_push_tasks(struct rq *rq)
 {
 	if (!has_pushable_tasks(rq))
 		return;
 
-	queue_balance_callback(rq, &per_cpu(rt_balance_head, rq->cpu), push_rt_tasks);
+	queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
+}
+
+static inline void queue_pull_task(struct rq *rq)
+{
+	queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
 }
 
 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -2139,7 +2146,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
 	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
 		return;
 
-	pull_rt_task(rq);
+	queue_pull_task(rq);
 }
 
 void __init init_sched_rt_class(void)
@@ -2160,8 +2167,6 @@ void __init init_sched_rt_class(void)
  */
 static void switched_to_rt(struct rq *rq, struct task_struct *p)
 {
-	int check_resched = 1;
-
 	/*
 	 * If we are already running, then there's nothing
 	 * that needs to be done. But if we are not running
@@ -2171,13 +2176,12 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
 	 */
 	if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
-		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
-		    /* Don't resched if we changed runqueues */
-		    push_rt_task(rq) && rq != task_rq(p))
-			check_resched = 0;
-#endif /* CONFIG_SMP */
-		if (check_resched && p->prio < rq->curr->prio)
+		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
+			queue_push_tasks(rq);
+#else
+		if (p->prio < rq->curr->prio)
 			resched_curr(rq);
+#endif /* CONFIG_SMP */
 	}
 }
 
@@ -2198,14 +2202,13 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
 		 * may need to pull tasks to this runqueue.
 		 */
 		if (oldprio < p->prio)
-			pull_rt_task(rq);
+			queue_pull_task(rq);
+
 		/*
 		 * If there's a higher priority task waiting to run
-		 * then reschedule. Note, the above pull_rt_task
-		 * can release the rq lock and p could migrate.
-		 * Only reschedule if p is still on the same runqueue.
+		 * then reschedule.
 		 */
-		if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
+		if (p->prio > rq->rt.highest_prio.curr)
 			resched_curr(rq);
 #else
 		/* For UP simply resched on drop of prio */
-- 
cgit v1.2.3


From 0ea60c2054fc3b0c3eb68ac4f6884f3ee78d9925 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 11 Jun 2015 14:46:42 +0200
Subject: sched,dl: Remove return value from pull_dl_task()

In order to be able to use pull_dl_task() from a callback, we need to
do away with the return value.

Since the return value indicates if we should reschedule, do this
inside the function. Since not all callers currently do this, this can
increase the number of reschedules due rt balancing.

Too many reschedules is not a correctness issues, too few are.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124742.859398977@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/deadline.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d80523fb1de5..079c09251e90 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -298,9 +298,8 @@ static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
 	return false;
 }
 
-static inline int pull_dl_task(struct rq *rq)
+static inline void pull_dl_task(struct rq *rq)
 {
-	return 0;
 }
 
 static inline void queue_push_tasks(struct rq *rq)
@@ -1041,7 +1040,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
 	resched_curr(rq);
 }
 
-static int pull_dl_task(struct rq *this_rq);
+static void pull_dl_task(struct rq *this_rq);
 
 #endif /* CONFIG_SMP */
 
@@ -1472,15 +1471,16 @@ static void push_dl_tasks(struct rq *rq)
 		;
 }
 
-static int pull_dl_task(struct rq *this_rq)
+static void pull_dl_task(struct rq *this_rq)
 {
-	int this_cpu = this_rq->cpu, ret = 0, cpu;
+	int this_cpu = this_rq->cpu, cpu;
 	struct task_struct *p;
+	bool resched = false;
 	struct rq *src_rq;
 	u64 dmin = LONG_MAX;
 
 	if (likely(!dl_overloaded(this_rq)))
-		return 0;
+		return;
 
 	/*
 	 * Match the barrier from dl_set_overloaded; this guarantees that if we
@@ -1535,7 +1535,7 @@ static int pull_dl_task(struct rq *this_rq)
 					   src_rq->curr->dl.deadline))
 				goto skip;
 
-			ret = 1;
+			resched = true;
 
 			deactivate_task(src_rq, p, 0);
 			set_task_cpu(p, this_cpu);
@@ -1548,7 +1548,8 @@ skip:
 		double_unlock_balance(this_rq, src_rq);
 	}
 
-	return ret;
+	if (resched)
+		resched_curr(this_rq);
 }
 
 /*
@@ -1704,8 +1705,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
 	if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
 		return;
 
-	if (pull_dl_task(rq))
-		resched_curr(rq);
+	pull_dl_task(rq);
 }
 
 /*
-- 
cgit v1.2.3


From 9916e214998a4a363b152b637245e5c958067350 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 11 Jun 2015 14:46:43 +0200
Subject: sched, dl: Convert switched_{from, to}_dl() / prio_changed_dl() to
 balance callbacks

Remove the direct {push,pull} balancing operations from
switched_{from,to}_dl() / prio_changed_dl() and use the balance
callback queue.

Again, err on the side of too many reschedules; since too few is a
hard bug while too many is just annoying.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124742.968262663@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/deadline.c | 45 ++++++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 079c09251e90..69d9f509a582 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -213,16 +213,23 @@ static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
 	return dl_task(prev);
 }
 
-static DEFINE_PER_CPU(struct callback_head, dl_balance_head);
+static DEFINE_PER_CPU(struct callback_head, dl_push_head);
+static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
 
 static void push_dl_tasks(struct rq *);
+static void pull_dl_task(struct rq *);
 
 static inline void queue_push_tasks(struct rq *rq)
 {
 	if (!has_pushable_dl_tasks(rq))
 		return;
 
-	queue_balance_callback(rq, &per_cpu(dl_balance_head, rq->cpu), push_dl_tasks);
+	queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
+}
+
+static inline void queue_pull_task(struct rq *rq)
+{
+	queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
 }
 
 static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
@@ -305,6 +312,10 @@ static inline void pull_dl_task(struct rq *rq)
 static inline void queue_push_tasks(struct rq *rq)
 {
 }
+
+static inline void queue_pull_task(struct rq *rq)
+{
+}
 #endif /* CONFIG_SMP */
 
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
@@ -1040,8 +1051,6 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
 	resched_curr(rq);
 }
 
-static void pull_dl_task(struct rq *this_rq);
-
 #endif /* CONFIG_SMP */
 
 /*
@@ -1705,7 +1714,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
 	if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
 		return;
 
-	pull_dl_task(rq);
+	queue_pull_task(rq);
 }
 
 /*
@@ -1714,21 +1723,16 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
  */
 static void switched_to_dl(struct rq *rq, struct task_struct *p)
 {
-	int check_resched = 1;
-
 	if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
-		if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
-			push_dl_task(rq) && rq != task_rq(p))
-			/* Only reschedule if pushing failed */
-			check_resched = 0;
-#endif /* CONFIG_SMP */
-		if (check_resched) {
-			if (dl_task(rq->curr))
-				check_preempt_curr_dl(rq, p, 0);
-			else
-				resched_curr(rq);
-		}
+		if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
+			queue_push_tasks(rq);
+#else
+		if (dl_task(rq->curr))
+			check_preempt_curr_dl(rq, p, 0);
+		else
+			resched_curr(rq);
+#endif
 	}
 }
 
@@ -1748,15 +1752,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
 		 * or lowering its prio, so...
 		 */
 		if (!rq->dl.overloaded)
-			pull_dl_task(rq);
+			queue_pull_task(rq);
 
 		/*
 		 * If we now have a earlier deadline task than p,
 		 * then reschedule, provided p is still on this
 		 * runqueue.
 		 */
-		if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
-		    rq->curr == p)
+		if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
 			resched_curr(rq);
 #else
 		/*
-- 
cgit v1.2.3


From a649f237db18450de767d70f40a41d5dbd0291de Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 11 Jun 2015 14:46:49 +0200
Subject: sched,dl: Fix sched class hopping CBS hole

We still have a few pending issues with the deadline code, one of which
is that switching between scheduling classes can 'leak' CBS state.

Close the hole by retaining the current CBS state when leaving
SCHED_DEADLINE and unconditionally programming the deadline timer.
The timer will then reset the CBS state if the task is still
!SCHED_DEADLINE by the time it hits.

If the task left SCHED_DEADLINE it will not call task_dead_dl() and
we'll not cancel the hrtimer, leaving us a pending timer in free
space. Avoid this by giving the timer a task reference, this avoids
littering the task exit path for this rather uncommon case.

In order to do this, I had to move dl_task_offline_migration() below
the replenishment, such that the task_rq()->lock fully covers that.
While doing this, I noticed that it (was) buggy in assuming a task is
enqueued and or we need to enqueue the task now. Fixing this means
select_task_rq_dl() might encounter an offline rq -- look into that.

As a result this kills cancel_dl_timer() which included a rq->lock
break.

Fixes: 40767b0dc768 ("sched/deadline: Fix deadline parameter modification handling")
Cc: Wanpeng Li <wanpeng.li@linux.intel.com>
Cc: Luca Abeni <luca.abeni@unitn.it>
Cc: Juri Lelli <juri.lelli@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: Luca Abeni <luca.abeni@unitn.it>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124743.574192138@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/deadline.c | 152 +++++++++++++++++++++++++++---------------------
 1 file changed, 86 insertions(+), 66 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 69d9f509a582..6318f43971c9 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -234,7 +234,7 @@ static inline void queue_pull_task(struct rq *rq)
 
 static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
 
-static void dl_task_offline_migration(struct rq *rq, struct task_struct *p)
+static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
 {
 	struct rq *later_rq = NULL;
 	bool fallback = false;
@@ -268,14 +268,19 @@ static void dl_task_offline_migration(struct rq *rq, struct task_struct *p)
 		double_lock_balance(rq, later_rq);
 	}
 
+	/*
+	 * By now the task is replenished and enqueued; migrate it.
+	 */
 	deactivate_task(rq, p, 0);
 	set_task_cpu(p, later_rq->cpu);
-	activate_task(later_rq, p, ENQUEUE_REPLENISH);
+	activate_task(later_rq, p, 0);
 
 	if (!fallback)
 		resched_curr(later_rq);
 
-	double_unlock_balance(rq, later_rq);
+	double_unlock_balance(later_rq, rq);
+
+	return later_rq;
 }
 
 #else
@@ -515,22 +520,23 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
  * actually started or not (i.e., the replenishment instant is in
  * the future or in the past).
  */
-static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
+static int start_dl_timer(struct task_struct *p)
 {
-	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
-	struct rq *rq = rq_of_dl_rq(dl_rq);
+	struct sched_dl_entity *dl_se = &p->dl;
+	struct hrtimer *timer = &dl_se->dl_timer;
+	struct rq *rq = task_rq(p);
 	ktime_t now, act;
 	s64 delta;
 
-	if (boosted)
-		return 0;
+	lockdep_assert_held(&rq->lock);
+
 	/*
 	 * We want the timer to fire at the deadline, but considering
 	 * that it is actually coming from rq->clock and not from
 	 * hrtimer's time base reading.
 	 */
 	act = ns_to_ktime(dl_se->deadline);
-	now = hrtimer_cb_get_time(&dl_se->dl_timer);
+	now = hrtimer_cb_get_time(timer);
 	delta = ktime_to_ns(now) - rq_clock(rq);
 	act = ktime_add_ns(act, delta);
 
@@ -542,7 +548,19 @@ static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
 	if (ktime_us_delta(act, now) < 0)
 		return 0;
 
-	hrtimer_start(&dl_se->dl_timer, act, HRTIMER_MODE_ABS);
+	/*
+	 * !enqueued will guarantee another callback; even if one is already in
+	 * progress. This ensures a balanced {get,put}_task_struct().
+	 *
+	 * The race against __run_timer() clearing the enqueued state is
+	 * harmless because we're holding task_rq()->lock, therefore the timer
+	 * expiring after we've done the check will wait on its task_rq_lock()
+	 * and observe our state.
+	 */
+	if (!hrtimer_is_queued(timer)) {
+		get_task_struct(p);
+		hrtimer_start(timer, act, HRTIMER_MODE_ABS);
+	}
 
 	return 1;
 }
@@ -572,35 +590,40 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 	rq = task_rq_lock(p, &flags);
 
 	/*
-	 * We need to take care of several possible races here:
-	 *
-	 *   - the task might have changed its scheduling policy
-	 *     to something different than SCHED_DEADLINE
-	 *   - the task might have changed its reservation parameters
-	 *     (through sched_setattr())
-	 *   - the task might have been boosted by someone else and
-	 *     might be in the boosting/deboosting path
+	 * The task might have changed its scheduling policy to something
+	 * different than SCHED_DEADLINE (through switched_fromd_dl()).
+	 */
+	if (!dl_task(p)) {
+		__dl_clear_params(p);
+		goto unlock;
+	}
+
+	/*
+	 * This is possible if switched_from_dl() raced against a running
+	 * callback that took the above !dl_task() path and we've since then
+	 * switched back into SCHED_DEADLINE.
 	 *
-	 * In all this cases we bail out, as the task is already
-	 * in the runqueue or is going to be enqueued back anyway.
+	 * There's nothing to do except drop our task reference.
 	 */
-	if (!dl_task(p) || dl_se->dl_new ||
-	    dl_se->dl_boosted || !dl_se->dl_throttled)
+	if (dl_se->dl_new)
 		goto unlock;
 
-	sched_clock_tick();
-	update_rq_clock(rq);
+	/*
+	 * The task might have been boosted by someone else and might be in the
+	 * boosting/deboosting path, its not throttled.
+	 */
+	if (dl_se->dl_boosted)
+		goto unlock;
 
-#ifdef CONFIG_SMP
 	/*
-	 * If we find that the rq the task was on is no longer
-	 * available, we need to select a new rq.
+	 * Spurious timer due to start_dl_timer() race; or we already received
+	 * a replenishment from rt_mutex_setprio().
 	 */
-	if (unlikely(!rq->online)) {
-		dl_task_offline_migration(rq, p);
+	if (!dl_se->dl_throttled)
 		goto unlock;
-	}
-#endif
+
+	sched_clock_tick();
+	update_rq_clock(rq);
 
 	/*
 	 * If the throttle happened during sched-out; like:
@@ -626,17 +649,38 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 		check_preempt_curr_dl(rq, p, 0);
 	else
 		resched_curr(rq);
+
 #ifdef CONFIG_SMP
 	/*
-	 * Queueing this task back might have overloaded rq,
-	 * check if we need to kick someone away.
+	 * Perform balancing operations here; after the replenishments.  We
+	 * cannot drop rq->lock before this, otherwise the assertion in
+	 * start_dl_timer() about not missing updates is not true.
+	 *
+	 * If we find that the rq the task was on is no longer available, we
+	 * need to select a new rq.
+	 *
+	 * XXX figure out if select_task_rq_dl() deals with offline cpus.
+	 */
+	if (unlikely(!rq->online))
+		rq = dl_task_offline_migration(rq, p);
+
+	/*
+	 * Queueing this task back might have overloaded rq, check if we need
+	 * to kick someone away.
 	 */
 	if (has_pushable_dl_tasks(rq))
 		push_dl_task(rq);
 #endif
+
 unlock:
 	task_rq_unlock(rq, p, &flags);
 
+	/*
+	 * This can free the task_struct, including this hrtimer, do not touch
+	 * anything related to that after this.
+	 */
+	put_task_struct(p);
+
 	return HRTIMER_NORESTART;
 }
 
@@ -696,7 +740,7 @@ static void update_curr_dl(struct rq *rq)
 	if (dl_runtime_exceeded(rq, dl_se)) {
 		dl_se->dl_throttled = 1;
 		__dequeue_task_dl(rq, curr, 0);
-		if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
+		if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
 			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
 
 		if (!is_leftmost(curr, &rq->dl))
@@ -1178,7 +1222,6 @@ static void task_fork_dl(struct task_struct *p)
 
 static void task_dead_dl(struct task_struct *p)
 {
-	struct hrtimer *timer = &p->dl.dl_timer;
 	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 
 	/*
@@ -1188,8 +1231,6 @@ static void task_dead_dl(struct task_struct *p)
 	/* XXX we should retain the bw until 0-lag */
 	dl_b->total_bw -= p->dl.dl_bw;
 	raw_spin_unlock_irq(&dl_b->lock);
-
-	hrtimer_cancel(timer);
 }
 
 static void set_curr_task_dl(struct rq *rq)
@@ -1674,37 +1715,16 @@ void init_sched_dl_class(void)
 
 #endif /* CONFIG_SMP */
 
-/*
- *  Ensure p's dl_timer is cancelled. May drop rq->lock for a while.
- */
-static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
-{
-	struct hrtimer *dl_timer = &p->dl.dl_timer;
-
-	/* Nobody will change task's class if pi_lock is held */
-	lockdep_assert_held(&p->pi_lock);
-
-	if (hrtimer_active(dl_timer)) {
-		int ret = hrtimer_try_to_cancel(dl_timer);
-
-		if (unlikely(ret == -1)) {
-			/*
-			 * Note, p may migrate OR new deadline tasks
-			 * may appear in rq when we are unlocking it.
-			 * A caller of us must be fine with that.
-			 */
-			raw_spin_unlock(&rq->lock);
-			hrtimer_cancel(dl_timer);
-			raw_spin_lock(&rq->lock);
-		}
-	}
-}
-
 static void switched_from_dl(struct rq *rq, struct task_struct *p)
 {
-	/* XXX we should retain the bw until 0-lag */
-	cancel_dl_timer(rq, p);
-	__dl_clear_params(p);
+	/*
+	 * Start the deadline timer; if we switch back to dl before this we'll
+	 * continue consuming our current CBS slice. If we stay outside of
+	 * SCHED_DEADLINE until the deadline passes, the timer will reset the
+	 * task.
+	 */
+	if (!start_dl_timer(p))
+		__dl_clear_params(p);
 
 	/*
 	 * Since this might be the only -deadline task on the rq,
-- 
cgit v1.2.3


From 5cc389bcee088b72c8c34a01d596412cab4f3f78 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 11 Jun 2015 14:46:50 +0200
Subject: sched: Move code around

In preparation to reworking set_cpus_allowed_ptr() move some code
around. This also removes some superfluous #ifdefs and adds comments
to some #endifs.

   text    data     bss     dec     hex filename
12211532        1738144 1081344 15031020         e55aec defconfig-build/vmlinux.pre
12211532        1738144 1081344 15031020         e55aec defconfig-build/vmlinux.post

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124743.662086684@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/core.c | 364 +++++++++++++++++++++++++---------------------------
 1 file changed, 178 insertions(+), 186 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ef546e349e75..26637c9daef6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1046,6 +1046,180 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 }
 
 #ifdef CONFIG_SMP
+/*
+ * This is how migration works:
+ *
+ * 1) we invoke migration_cpu_stop() on the target CPU using
+ *    stop_one_cpu().
+ * 2) stopper starts to run (implicitly forcing the migrated thread
+ *    off the CPU)
+ * 3) it checks whether the migrated task is still in the wrong runqueue.
+ * 4) if it's in the wrong runqueue then the migration thread removes
+ *    it and puts it into the right queue.
+ * 5) stopper completes and stop_one_cpu() returns and the migration
+ *    is done.
+ */
+
+/*
+ * move_queued_task - move a queued task to new rq.
+ *
+ * Returns (locked) new rq. Old rq's lock is released.
+ */
+static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
+{
+	struct rq *rq = task_rq(p);
+
+	lockdep_assert_held(&rq->lock);
+
+	dequeue_task(rq, p, 0);
+	p->on_rq = TASK_ON_RQ_MIGRATING;
+	set_task_cpu(p, new_cpu);
+	raw_spin_unlock(&rq->lock);
+
+	rq = cpu_rq(new_cpu);
+
+	raw_spin_lock(&rq->lock);
+	BUG_ON(task_cpu(p) != new_cpu);
+	p->on_rq = TASK_ON_RQ_QUEUED;
+	enqueue_task(rq, p, 0);
+	check_preempt_curr(rq, p, 0);
+
+	return rq;
+}
+
+struct migration_arg {
+	struct task_struct *task;
+	int dest_cpu;
+};
+
+/*
+ * Move (not current) task off this cpu, onto dest cpu. We're doing
+ * this because either it can't run here any more (set_cpus_allowed()
+ * away from this CPU, or CPU going down), or because we're
+ * attempting to rebalance this task on exec (sched_exec).
+ *
+ * So we race with normal scheduler movements, but that's OK, as long
+ * as the task is no longer on this CPU.
+ *
+ * Returns non-zero if task was successfully migrated.
+ */
+static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
+{
+	struct rq *rq;
+	int ret = 0;
+
+	if (unlikely(!cpu_active(dest_cpu)))
+		return ret;
+
+	rq = cpu_rq(src_cpu);
+
+	raw_spin_lock(&p->pi_lock);
+	raw_spin_lock(&rq->lock);
+	/* Already moved. */
+	if (task_cpu(p) != src_cpu)
+		goto done;
+
+	/* Affinity changed (again). */
+	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+		goto fail;
+
+	/*
+	 * If we're not on a rq, the next wake-up will ensure we're
+	 * placed properly.
+	 */
+	if (task_on_rq_queued(p))
+		rq = move_queued_task(p, dest_cpu);
+done:
+	ret = 1;
+fail:
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock(&p->pi_lock);
+	return ret;
+}
+
+/*
+ * migration_cpu_stop - this will be executed by a highprio stopper thread
+ * and performs thread migration by bumping thread off CPU then
+ * 'pushing' onto another runqueue.
+ */
+static int migration_cpu_stop(void *data)
+{
+	struct migration_arg *arg = data;
+
+	/*
+	 * The original target cpu might have gone down and we might
+	 * be on another cpu but it doesn't matter.
+	 */
+	local_irq_disable();
+	/*
+	 * We need to explicitly wake pending tasks before running
+	 * __migrate_task() such that we will not miss enforcing cpus_allowed
+	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
+	 */
+	sched_ttwu_pending();
+	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
+	local_irq_enable();
+	return 0;
+}
+
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	if (p->sched_class->set_cpus_allowed)
+		p->sched_class->set_cpus_allowed(p, new_mask);
+
+	cpumask_copy(&p->cpus_allowed, new_mask);
+	p->nr_cpus_allowed = cpumask_weight(new_mask);
+}
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+	unsigned long flags;
+	struct rq *rq;
+	unsigned int dest_cpu;
+	int ret = 0;
+
+	rq = task_rq_lock(p, &flags);
+
+	if (cpumask_equal(&p->cpus_allowed, new_mask))
+		goto out;
+
+	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	do_set_cpus_allowed(p, new_mask);
+
+	/* Can the task run on the task's current CPU? If so, we're done */
+	if (cpumask_test_cpu(task_cpu(p), new_mask))
+		goto out;
+
+	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+	if (task_running(rq, p) || p->state == TASK_WAKING) {
+		struct migration_arg arg = { p, dest_cpu };
+		/* Need help from migration thread: drop lock and wait. */
+		task_rq_unlock(rq, p, &flags);
+		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+		tlb_migrate_finish(p->mm);
+		return 0;
+	} else if (task_on_rq_queued(p))
+		rq = move_queued_task(p, dest_cpu);
+out:
+	task_rq_unlock(rq, p, &flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
 #ifdef CONFIG_SCHED_DEBUG
@@ -1186,13 +1360,6 @@ out:
 	return ret;
 }
 
-struct migration_arg {
-	struct task_struct *task;
-	int dest_cpu;
-};
-
-static int migration_cpu_stop(void *data);
-
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
@@ -1325,9 +1492,7 @@ void kick_process(struct task_struct *p)
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(kick_process);
-#endif /* CONFIG_SMP */
 
-#ifdef CONFIG_SMP
 /*
  * ->cpus_allowed is protected by both rq->lock and p->pi_lock
  */
@@ -1432,7 +1597,7 @@ static void update_avg(u64 *avg, u64 sample)
 	s64 diff = sample - *avg;
 	*avg += diff >> 3;
 }
-#endif
+#endif /* CONFIG_SMP */
 
 static void
 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
@@ -4773,149 +4938,6 @@ out:
 }
 
 #ifdef CONFIG_SMP
-/*
- * move_queued_task - move a queued task to new rq.
- *
- * Returns (locked) new rq. Old rq's lock is released.
- */
-static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
-{
-	struct rq *rq = task_rq(p);
-
-	lockdep_assert_held(&rq->lock);
-
-	dequeue_task(rq, p, 0);
-	p->on_rq = TASK_ON_RQ_MIGRATING;
-	set_task_cpu(p, new_cpu);
-	raw_spin_unlock(&rq->lock);
-
-	rq = cpu_rq(new_cpu);
-
-	raw_spin_lock(&rq->lock);
-	BUG_ON(task_cpu(p) != new_cpu);
-	p->on_rq = TASK_ON_RQ_QUEUED;
-	enqueue_task(rq, p, 0);
-	check_preempt_curr(rq, p, 0);
-
-	return rq;
-}
-
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
-{
-	if (p->sched_class->set_cpus_allowed)
-		p->sched_class->set_cpus_allowed(p, new_mask);
-
-	cpumask_copy(&p->cpus_allowed, new_mask);
-	p->nr_cpus_allowed = cpumask_weight(new_mask);
-}
-
-/*
- * This is how migration works:
- *
- * 1) we invoke migration_cpu_stop() on the target CPU using
- *    stop_one_cpu().
- * 2) stopper starts to run (implicitly forcing the migrated thread
- *    off the CPU)
- * 3) it checks whether the migrated task is still in the wrong runqueue.
- * 4) if it's in the wrong runqueue then the migration thread removes
- *    it and puts it into the right queue.
- * 5) stopper completes and stop_one_cpu() returns and the migration
- *    is done.
- */
-
-/*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
- */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
-{
-	unsigned long flags;
-	struct rq *rq;
-	unsigned int dest_cpu;
-	int ret = 0;
-
-	rq = task_rq_lock(p, &flags);
-
-	if (cpumask_equal(&p->cpus_allowed, new_mask))
-		goto out;
-
-	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	do_set_cpus_allowed(p, new_mask);
-
-	/* Can the task run on the task's current CPU? If so, we're done */
-	if (cpumask_test_cpu(task_cpu(p), new_mask))
-		goto out;
-
-	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
-	if (task_running(rq, p) || p->state == TASK_WAKING) {
-		struct migration_arg arg = { p, dest_cpu };
-		/* Need help from migration thread: drop lock and wait. */
-		task_rq_unlock(rq, p, &flags);
-		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
-		tlb_migrate_finish(p->mm);
-		return 0;
-	} else if (task_on_rq_queued(p))
-		rq = move_queued_task(p, dest_cpu);
-out:
-	task_rq_unlock(rq, p, &flags);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
-
-/*
- * Move (not current) task off this cpu, onto dest cpu. We're doing
- * this because either it can't run here any more (set_cpus_allowed()
- * away from this CPU, or CPU going down), or because we're
- * attempting to rebalance this task on exec (sched_exec).
- *
- * So we race with normal scheduler movements, but that's OK, as long
- * as the task is no longer on this CPU.
- *
- * Returns non-zero if task was successfully migrated.
- */
-static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
-{
-	struct rq *rq;
-	int ret = 0;
-
-	if (unlikely(!cpu_active(dest_cpu)))
-		return ret;
-
-	rq = cpu_rq(src_cpu);
-
-	raw_spin_lock(&p->pi_lock);
-	raw_spin_lock(&rq->lock);
-	/* Already moved. */
-	if (task_cpu(p) != src_cpu)
-		goto done;
-
-	/* Affinity changed (again). */
-	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
-		goto fail;
-
-	/*
-	 * If we're not on a rq, the next wake-up will ensure we're
-	 * placed properly.
-	 */
-	if (task_on_rq_queued(p))
-		rq = move_queued_task(p, dest_cpu);
-done:
-	ret = 1;
-fail:
-	raw_spin_unlock(&rq->lock);
-	raw_spin_unlock(&p->pi_lock);
-	return ret;
-}
 
 #ifdef CONFIG_NUMA_BALANCING
 /* Migrate current task p to target_cpu */
@@ -4963,35 +4985,9 @@ void sched_setnuma(struct task_struct *p, int nid)
 		enqueue_task(rq, p, 0);
 	task_rq_unlock(rq, p, &flags);
 }
-#endif
-
-/*
- * migration_cpu_stop - this will be executed by a highprio stopper thread
- * and performs thread migration by bumping thread off CPU then
- * 'pushing' onto another runqueue.
- */
-static int migration_cpu_stop(void *data)
-{
-	struct migration_arg *arg = data;
-
-	/*
-	 * The original target cpu might have gone down and we might
-	 * be on another cpu but it doesn't matter.
-	 */
-	local_irq_disable();
-	/*
-	 * We need to explicitly wake pending tasks before running
-	 * __migrate_task() such that we will not miss enforcing cpus_allowed
-	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
-	 */
-	sched_ttwu_pending();
-	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
-	local_irq_enable();
-	return 0;
-}
+#endif /* CONFIG_NUMA_BALANCING */
 
 #ifdef CONFIG_HOTPLUG_CPU
-
 /*
  * Ensures that the idle task is using init_mm right before its cpu goes
  * offline.
@@ -5094,7 +5090,6 @@ static void migrate_tasks(unsigned int dead_cpu)
 
 	rq->stop = stop;
 }
-
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -5273,7 +5268,7 @@ static void register_sched_domain_sysctl(void)
 static void unregister_sched_domain_sysctl(void)
 {
 }
-#endif
+#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
 
 static void set_rq_online(struct rq *rq)
 {
@@ -5420,9 +5415,6 @@ static int __init migration_init(void)
 	return 0;
 }
 early_initcall(migration_init);
-#endif
-
-#ifdef CONFIG_SMP
 
 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
 
@@ -6648,7 +6640,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 			struct sched_group *sg;
 			struct sched_group_capacity *sgc;
 
-		       	sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+			sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
 					GFP_KERNEL, cpu_to_node(j));
 			if (!sd)
 				return -ENOMEM;
-- 
cgit v1.2.3


From 5e16bbc2fb4053755705da5dd3557bbc0e5ccef6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 11 Jun 2015 14:46:51 +0200
Subject: sched: Streamline the task migration locking a little

The whole migrate_task{,s}() locking seems a little shaky, there's a
lot of dropping an require happening. Pull the locking up into the
callers as far as possible to streamline the lot.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124743.755256708@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/core.c | 76 ++++++++++++++++++++++++-----------------------------
 1 file changed, 34 insertions(+), 42 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 26637c9daef6..1ddc129c5f66 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1065,10 +1065,8 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
  *
  * Returns (locked) new rq. Old rq's lock is released.
  */
-static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
+static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
 {
-	struct rq *rq = task_rq(p);
-
 	lockdep_assert_held(&rq->lock);
 
 	dequeue_task(rq, p, 0);
@@ -1100,41 +1098,19 @@ struct migration_arg {
  *
  * So we race with normal scheduler movements, but that's OK, as long
  * as the task is no longer on this CPU.
- *
- * Returns non-zero if task was successfully migrated.
  */
-static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
+static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
 {
-	struct rq *rq;
-	int ret = 0;
-
 	if (unlikely(!cpu_active(dest_cpu)))
-		return ret;
-
-	rq = cpu_rq(src_cpu);
-
-	raw_spin_lock(&p->pi_lock);
-	raw_spin_lock(&rq->lock);
-	/* Already moved. */
-	if (task_cpu(p) != src_cpu)
-		goto done;
+		return rq;
 
 	/* Affinity changed (again). */
 	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
-		goto fail;
+		return rq;
 
-	/*
-	 * If we're not on a rq, the next wake-up will ensure we're
-	 * placed properly.
-	 */
-	if (task_on_rq_queued(p))
-		rq = move_queued_task(p, dest_cpu);
-done:
-	ret = 1;
-fail:
-	raw_spin_unlock(&rq->lock);
-	raw_spin_unlock(&p->pi_lock);
-	return ret;
+	rq = move_queued_task(rq, p, dest_cpu);
+
+	return rq;
 }
 
 /*
@@ -1145,6 +1121,8 @@ fail:
 static int migration_cpu_stop(void *data)
 {
 	struct migration_arg *arg = data;
+	struct task_struct *p = arg->task;
+	struct rq *rq = this_rq();
 
 	/*
 	 * The original target cpu might have gone down and we might
@@ -1157,7 +1135,19 @@ static int migration_cpu_stop(void *data)
 	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
 	 */
 	sched_ttwu_pending();
-	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
+
+	raw_spin_lock(&p->pi_lock);
+	raw_spin_lock(&rq->lock);
+	/*
+	 * If task_rq(p) != rq, it cannot be migrated here, because we're
+	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
+	 * we're holding p->pi_lock.
+	 */
+	if (task_rq(p) == rq && task_on_rq_queued(p))
+		rq = __migrate_task(rq, p, arg->dest_cpu);
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock(&p->pi_lock);
+
 	local_irq_enable();
 	return 0;
 }
@@ -1212,7 +1202,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 		tlb_migrate_finish(p->mm);
 		return 0;
 	} else if (task_on_rq_queued(p))
-		rq = move_queued_task(p, dest_cpu);
+		rq = move_queued_task(rq, p, dest_cpu);
 out:
 	task_rq_unlock(rq, p, &flags);
 
@@ -5043,9 +5033,9 @@ static struct task_struct fake_task = {
  * there's no concurrency possible, we hold the required locks anyway
  * because of lock validation efforts.
  */
-static void migrate_tasks(unsigned int dead_cpu)
+static void migrate_tasks(struct rq *dead_rq)
 {
-	struct rq *rq = cpu_rq(dead_cpu);
+	struct rq *rq = dead_rq;
 	struct task_struct *next, *stop = rq->stop;
 	int dest_cpu;
 
@@ -5067,7 +5057,7 @@ static void migrate_tasks(unsigned int dead_cpu)
 	 */
 	update_rq_clock(rq);
 
-	for ( ; ; ) {
+	for (;;) {
 		/*
 		 * There's this thread running, bail when that's the only
 		 * remaining thread.
@@ -5080,12 +5070,14 @@ static void migrate_tasks(unsigned int dead_cpu)
 		next->sched_class->put_prev_task(rq, next);
 
 		/* Find suitable destination for @next, with force if needed. */
-		dest_cpu = select_fallback_rq(dead_cpu, next);
-		raw_spin_unlock(&rq->lock);
-
-		__migrate_task(next, dead_cpu, dest_cpu);
+		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
 
-		raw_spin_lock(&rq->lock);
+		rq = __migrate_task(rq, next, dest_cpu);
+		if (rq != dead_rq) {
+			raw_spin_unlock(&rq->lock);
+			rq = dead_rq;
+			raw_spin_lock(&rq->lock);
+		}
 	}
 
 	rq->stop = stop;
@@ -5337,7 +5329,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 			set_rq_offline(rq);
 		}
-		migrate_tasks(cpu);
+		migrate_tasks(rq);
 		BUG_ON(rq->nr_running != 1); /* the migration thread */
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 		break;
-- 
cgit v1.2.3


From cbce1a686700595de65ee363b9b3283ae85d8fc5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 11 Jun 2015 14:46:54 +0200
Subject: sched,lockdep: Employ lock pinning

Employ the new lockdep lock pinning annotation to ensure no
'accidental' lock-breaks happen with rq->lock.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124744.003233193@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/core.c     | 42 +++++++++++++++++++++++++++++++++++++++---
 kernel/sched/deadline.c |  8 ++++++++
 kernel/sched/fair.c     | 11 ++++++++---
 kernel/sched/rt.c       |  8 ++++++++
 kernel/sched/sched.h    | 10 ++++++++--
 5 files changed, 71 insertions(+), 8 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1ddc129c5f66..c74191aa4e6a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1201,8 +1201,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
 		tlb_migrate_finish(p->mm);
 		return 0;
-	} else if (task_on_rq_queued(p))
+	} else if (task_on_rq_queued(p)) {
+		/*
+		 * OK, since we're going to drop the lock immediately
+		 * afterwards anyway.
+		 */
+		lockdep_unpin_lock(&rq->lock);
 		rq = move_queued_task(rq, p, dest_cpu);
+		lockdep_pin_lock(&rq->lock);
+	}
 out:
 	task_rq_unlock(rq, p, &flags);
 
@@ -1562,6 +1569,8 @@ out:
 static inline
 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 {
+	lockdep_assert_held(&p->pi_lock);
+
 	if (p->nr_cpus_allowed > 1)
 		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
 
@@ -1652,9 +1661,12 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_woken) {
 		/*
-		 * XXX can drop rq->lock; most likely ok.
+		 * Our task @p is fully woken up and running; so its safe to
+		 * drop the rq->lock, hereafter rq is only used for statistics.
 		 */
+		lockdep_unpin_lock(&rq->lock);
 		p->sched_class->task_woken(rq, p);
+		lockdep_pin_lock(&rq->lock);
 	}
 
 	if (rq->idle_stamp) {
@@ -1674,6 +1686,8 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 static void
 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
 {
+	lockdep_assert_held(&rq->lock);
+
 #ifdef CONFIG_SMP
 	if (p->sched_contributes_to_load)
 		rq->nr_uninterruptible--;
@@ -1718,6 +1732,7 @@ void sched_ttwu_pending(void)
 		return;
 
 	raw_spin_lock_irqsave(&rq->lock, flags);
+	lockdep_pin_lock(&rq->lock);
 
 	while (llist) {
 		p = llist_entry(llist, struct task_struct, wake_entry);
@@ -1725,6 +1740,7 @@ void sched_ttwu_pending(void)
 		ttwu_do_activate(rq, p, 0);
 	}
 
+	lockdep_unpin_lock(&rq->lock);
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
@@ -1821,7 +1837,9 @@ static void ttwu_queue(struct task_struct *p, int cpu)
 #endif
 
 	raw_spin_lock(&rq->lock);
+	lockdep_pin_lock(&rq->lock);
 	ttwu_do_activate(rq, p, 0);
+	lockdep_unpin_lock(&rq->lock);
 	raw_spin_unlock(&rq->lock);
 }
 
@@ -1916,9 +1934,17 @@ static void try_to_wake_up_local(struct task_struct *p)
 	lockdep_assert_held(&rq->lock);
 
 	if (!raw_spin_trylock(&p->pi_lock)) {
+		/*
+		 * This is OK, because current is on_cpu, which avoids it being
+		 * picked for load-balance and preemption/IRQs are still
+		 * disabled avoiding further scheduler activity on it and we've
+		 * not yet picked a replacement task.
+		 */
+		lockdep_unpin_lock(&rq->lock);
 		raw_spin_unlock(&rq->lock);
 		raw_spin_lock(&p->pi_lock);
 		raw_spin_lock(&rq->lock);
+		lockdep_pin_lock(&rq->lock);
 	}
 
 	if (!(p->state & TASK_NORMAL))
@@ -2530,6 +2556,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 * of the scheduler it's an obvious special-case), so we
 	 * do an early lockdep release here:
 	 */
+	lockdep_unpin_lock(&rq->lock);
 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 
 	context_tracking_task_switch(prev, next);
@@ -2953,6 +2980,7 @@ static void __sched __schedule(void)
 	 */
 	smp_mb__before_spinlock();
 	raw_spin_lock_irq(&rq->lock);
+	lockdep_pin_lock(&rq->lock);
 
 	rq->clock_skip_update <<= 1; /* promote REQ to ACT */
 
@@ -2995,8 +3023,10 @@ static void __sched __schedule(void)
 
 		rq = context_switch(rq, prev, next); /* unlocks the rq */
 		cpu = cpu_of(rq);
-	} else
+	} else {
+		lockdep_unpin_lock(&rq->lock);
 		raw_spin_unlock_irq(&rq->lock);
+	}
 
 	balance_callback(rq);
 }
@@ -5065,6 +5095,11 @@ static void migrate_tasks(struct rq *dead_rq)
 		if (rq->nr_running == 1)
 			break;
 
+		/*
+		 * Ensure rq->lock covers the entire task selection
+		 * until the migration.
+		 */
+		lockdep_pin_lock(&rq->lock);
 		next = pick_next_task(rq, &fake_task);
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
@@ -5072,6 +5107,7 @@ static void migrate_tasks(struct rq *dead_rq)
 		/* Find suitable destination for @next, with force if needed. */
 		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
 
+		lockdep_unpin_lock(&rq->lock);
 		rq = __migrate_task(rq, next, dest_cpu);
 		if (rq != dead_rq) {
 			raw_spin_unlock(&rq->lock);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6318f43971c9..e8146415a688 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1151,7 +1151,15 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
 	dl_rq = &rq->dl;
 
 	if (need_pull_dl_task(rq, prev)) {
+		/*
+		 * This is OK, because current is on_cpu, which avoids it being
+		 * picked for load-balance and preemption/IRQs are still
+		 * disabled avoiding further scheduler activity on it and we're
+		 * being very careful to re-start the picking loop.
+		 */
+		lockdep_unpin_lock(&rq->lock);
 		pull_dl_task(rq);
+		lockdep_pin_lock(&rq->lock);
 		/*
 		 * pull_rt_task() can drop (and re-acquire) rq->lock; this
 		 * means a stop task can slip in, in which case we need to
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7210ae848909..509ef63d0d6f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5392,7 +5392,15 @@ simple:
 	return p;
 
 idle:
+	/*
+	 * This is OK, because current is on_cpu, which avoids it being picked
+	 * for load-balance and preemption/IRQs are still disabled avoiding
+	 * further scheduler activity on it and we're being very careful to
+	 * re-start the picking loop.
+	 */
+	lockdep_unpin_lock(&rq->lock);
 	new_tasks = idle_balance(rq);
+	lockdep_pin_lock(&rq->lock);
 	/*
 	 * Because idle_balance() releases (and re-acquires) rq->lock, it is
 	 * possible for any higher priority task to appear. In that case we
@@ -7426,9 +7434,6 @@ static int idle_balance(struct rq *this_rq)
 		goto out;
 	}
 
-	/*
-	 * Drop the rq->lock, but keep IRQ/preempt disabled.
-	 */
 	raw_spin_unlock(&this_rq->lock);
 
 	update_blocked_averages(this_cpu);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 460f85888b74..0d193a243e96 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1478,7 +1478,15 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
 	struct rt_rq *rt_rq = &rq->rt;
 
 	if (need_pull_rt_task(rq, prev)) {
+		/*
+		 * This is OK, because current is on_cpu, which avoids it being
+		 * picked for load-balance and preemption/IRQs are still
+		 * disabled avoiding further scheduler activity on it and we're
+		 * being very careful to re-start the picking loop.
+		 */
+		lockdep_unpin_lock(&rq->lock);
 		pull_rt_task(rq);
+		lockdep_pin_lock(&rq->lock);
 		/*
 		 * pull_rt_task() can drop (and re-acquire) rq->lock; this
 		 * means a dl or stop task can slip in, in which case we need
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 62949ab06bc2..ef02d11654cd 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1439,8 +1439,10 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
 	for (;;) {
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
-		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+			lockdep_pin_lock(&rq->lock);
 			return rq;
+		}
 		raw_spin_unlock(&rq->lock);
 
 		while (unlikely(task_on_rq_migrating(p)))
@@ -1477,8 +1479,10 @@ static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flag
 		 * If we observe the new cpu in task_rq_lock, the acquire will
 		 * pair with the WMB to ensure we must then also see migrating.
 		 */
-		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+			lockdep_pin_lock(&rq->lock);
 			return rq;
+		}
 		raw_spin_unlock(&rq->lock);
 		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 
@@ -1490,6 +1494,7 @@ static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flag
 static inline void __task_rq_unlock(struct rq *rq)
 	__releases(rq->lock)
 {
+	lockdep_unpin_lock(&rq->lock);
 	raw_spin_unlock(&rq->lock);
 }
 
@@ -1498,6 +1503,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
 	__releases(rq->lock)
 	__releases(p->pi_lock)
 {
+	lockdep_unpin_lock(&rq->lock);
 	raw_spin_unlock(&rq->lock);
 	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 }
-- 
cgit v1.2.3