From 7c80cfc99b7bfdc92cee26f8008859f326f4a37f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 6 May 2017 16:03:17 +0200
Subject: sched/fair: Clean up calc_cfs_shares()

For consistencies sake, we should have only a single reading of tg->shares.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 31 ++++++++++++-------------------
 1 file changed, 12 insertions(+), 19 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ba32e08a23..048fbfb75523 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2694,9 +2694,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 # ifdef CONFIG_SMP
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+static long calc_cfs_shares(struct cfs_rq *cfs_rq)
 {
-	long tg_weight, load, shares;
+	long tg_weight, tg_shares, load, shares;
+	struct task_group *tg = cfs_rq->tg;
+
+	tg_shares = READ_ONCE(tg->shares);
 
 	/*
 	 * This really should be: cfs_rq->avg.load_avg, but instead we use
@@ -2711,7 +2714,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 	tg_weight -= cfs_rq->tg_load_avg_contrib;
 	tg_weight += load;
 
-	shares = (tg->shares * load);
+	shares = (tg_shares * load);
 	if (tg_weight)
 		shares /= tg_weight;
 
@@ -2727,17 +2730,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 	 * case no task is runnable on a CPU MIN_SHARES=2 should be returned
 	 * instead of 0.
 	 */
-	if (shares < MIN_SHARES)
-		shares = MIN_SHARES;
-	if (shares > tg->shares)
-		shares = tg->shares;
-
-	return shares;
-}
-# else /* CONFIG_SMP */
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
-{
-	return tg->shares;
+	return clamp_t(long, shares, MIN_SHARES, tg_shares);
 }
 # endif /* CONFIG_SMP */
 
@@ -2762,7 +2755,6 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 static void update_cfs_shares(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = group_cfs_rq(se);
-	struct task_group *tg;
 	long shares;
 
 	if (!cfs_rq)
@@ -2771,13 +2763,14 @@ static void update_cfs_shares(struct sched_entity *se)
 	if (throttled_hierarchy(cfs_rq))
 		return;
 
-	tg = cfs_rq->tg;
-
 #ifndef CONFIG_SMP
-	if (likely(se->load.weight == tg->shares))
+	shares = READ_ONCE(cfs_rq->tg->shares);
+
+	if (likely(se->load.weight == shares))
 		return;
+#else
+	shares = calc_cfs_shares(cfs_rq);
 #endif
-	shares = calc_cfs_shares(cfs_rq, tg);
 
 	reweight_entity(cfs_rq_of(se), se, shares);
 }
-- 
cgit v1.2.3


From cef27403cbe98ebda0a32d43128dd60c309eb966 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 9 May 2017 11:04:07 +0200
Subject: sched/fair: Add comment to calc_cfs_shares()

Explain the magic equation in calc_cfs_shares() a bit better.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 048fbfb75523..dd565aeafc5a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2694,6 +2694,67 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 # ifdef CONFIG_SMP
+/*
+ * All this does is approximate the hierarchical proportion which includes that
+ * global sum we all love to hate.
+ *
+ * That is, the weight of a group entity, is the proportional share of the
+ * group weight based on the group runqueue weights. That is:
+ *
+ *                     tg->weight * grq->load.weight
+ *   ge->load.weight = -----------------------------               (1)
+ *			  \Sum grq->load.weight
+ *
+ * Now, because computing that sum is prohibitively expensive to compute (been
+ * there, done that) we approximate it with this average stuff. The average
+ * moves slower and therefore the approximation is cheaper and more stable.
+ *
+ * So instead of the above, we substitute:
+ *
+ *   grq->load.weight -> grq->avg.load_avg                         (2)
+ *
+ * which yields the following:
+ *
+ *                     tg->weight * grq->avg.load_avg
+ *   ge->load.weight = ------------------------------              (3)
+ *				tg->load_avg
+ *
+ * Where: tg->load_avg ~= \Sum grq->avg.load_avg
+ *
+ * That is shares_avg, and it is right (given the approximation (2)).
+ *
+ * The problem with it is that because the average is slow -- it was designed
+ * to be exactly that of course -- this leads to transients in boundary
+ * conditions. In specific, the case where the group was idle and we start the
+ * one task. It takes time for our CPU's grq->avg.load_avg to build up,
+ * yielding bad latency etc..
+ *
+ * Now, in that special case (1) reduces to:
+ *
+ *                     tg->weight * grq->load.weight
+ *   ge->load.weight = ----------------------------- = tg>weight   (4)
+ *			    grp->load.weight
+ *
+ * That is, the sum collapses because all other CPUs are idle; the UP scenario.
+ *
+ * So what we do is modify our approximation (3) to approach (4) in the (near)
+ * UP case, like:
+ *
+ *   ge->load.weight =
+ *
+ *              tg->weight * grq->load.weight
+ *     ---------------------------------------------------         (5)
+ *     tg->load_avg - grq->avg.load_avg + grq->load.weight
+ *
+ *
+ * And that is shares_weight and is icky. In the (near) UP case it approaches
+ * (4) while in the normal case it approaches (3). It consistently
+ * overestimates the ge->load.weight and therefore:
+ *
+ *   \Sum ge->load.weight >= tg->weight
+ *
+ * hence icky!
+ */
 static long calc_cfs_shares(struct cfs_rq *cfs_rq)
 {
 	long tg_weight, tg_shares, load, shares;
-- 
cgit v1.2.3


From 3d4b60d3e3dde6ea24e439000eb3b71078da81f1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 11 May 2017 18:16:06 +0200
Subject: sched/fair: Cure calc_cfs_shares() vs. reweight_entity()

Vincent reported that when running in a cgroup, his root
cfs_rq->avg.load_avg dropped to 0 on task idle.

This is because reweight_entity() will now immediately propagate the
weight change of the group entity to its cfs_rq, and as it happens,
our approxmation (5) for calc_cfs_shares() results in 0 when the group
is idle.

Avoid this by using the correct (3) as a lower bound on (5). This way
the empty cgroup will slowly decay instead of instantly drop to 0.

Reported-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dd565aeafc5a..63166a0ed854 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2763,11 +2763,10 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq)
 	tg_shares = READ_ONCE(tg->shares);
 
 	/*
-	 * This really should be: cfs_rq->avg.load_avg, but instead we use
-	 * cfs_rq->load.weight, which is its upper bound. This helps ramp up
-	 * the shares for small weight interactive tasks.
+	 * Because (5) drops to 0 when the cfs_rq is idle, we need to use (3)
+	 * as a lower bound.
 	 */
-	load = scale_load_down(cfs_rq->load.weight);
+	load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
 
 	tg_weight = atomic_long_read(&tg->load_avg);
 
-- 
cgit v1.2.3


From c7b50216818ef3dca14a52e3499750fbad2d9691 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 6 May 2017 16:42:08 +0200
Subject: sched/fair: Remove se->load.weight from se->avg.load_sum

Remove the load from the load_sum for sched_entities, basically
turning load_sum into runnable_sum.  This prepares for better
reweighting of group entities.

Since we now have different rules for computing load_avg, split
___update_load_avg() into two parts, ___update_load_sum() and
___update_load_avg().

So for se:

  ___update_load_sum(.weight = 1)
  ___upate_load_avg(.weight = se->load.weight)

and for cfs_rq:

  ___update_load_sum(.weight = cfs_rq->load.weight)
  ___upate_load_avg(.weight = 1)

Since the primary consumable is load_avg, most things will not be
affected. Only those few sites that initialize/modify load_sum need
attention.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 91 +++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 64 insertions(+), 27 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 63166a0ed854..3b5b82345774 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -731,7 +731,7 @@ void init_entity_runnable_average(struct sched_entity *se)
 	 */
 	if (entity_is_task(se))
 		sa->load_avg = scale_load_down(se->load.weight);
-	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
+	sa->load_sum = LOAD_AVG_MAX;
 	/*
 	 * At this point, util_avg won't be used in select_task_rq_fair anyway
 	 */
@@ -2025,7 +2025,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
 		delta = runtime - p->last_sum_exec_runtime;
 		*period = now - p->last_task_numa_placement;
 	} else {
-		delta = p->se.avg.load_sum / p->se.load.weight;
+		delta = p->se.avg.load_sum;
 		*period = LOAD_AVG_MAX;
 	}
 
@@ -3018,8 +3018,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
  */
 static __always_inline int
-___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
-		  unsigned long weight, int running, struct cfs_rq *cfs_rq)
+___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
+		   unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
 	u64 delta;
 
@@ -3065,39 +3065,80 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 	if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
 		return 0;
 
+	return 1;
+}
+
+static __always_inline void
+___update_load_avg(struct sched_avg *sa, unsigned long weight, struct cfs_rq *cfs_rq)
+{
+	u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+
 	/*
 	 * Step 2: update *_avg.
 	 */
-	sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
+	sa->load_avg = div_u64(weight * sa->load_sum, divider);
 	if (cfs_rq) {
 		cfs_rq->runnable_load_avg =
-			div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
+			div_u64(cfs_rq->runnable_load_sum, divider);
 	}
-	sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
+	sa->util_avg = sa->util_sum / divider;
+}
 
-	return 1;
+/*
+ * XXX we want to get rid of this helper and use the full load resolution.
+ */
+static inline long se_weight(struct sched_entity *se)
+{
+	return scale_load_down(se->load.weight);
 }
 
+/*
+ * sched_entity:
+ *
+ *   load_sum := runnable_sum
+ *   load_avg = se_weight(se) * runnable_avg
+ *
+ * cfq_rs:
+ *
+ *   load_sum = \Sum se_weight(se) * se->avg.load_sum
+ *   load_avg = \Sum se->avg.load_avg
+ */
+
 static int
 __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
 {
-	return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
+	if (___update_load_sum(now, cpu, &se->avg, 0, 0, NULL)) {
+		___update_load_avg(&se->avg, se_weight(se), NULL);
+		return 1;
+	}
+
+	return 0;
 }
 
 static int
 __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	return ___update_load_avg(now, cpu, &se->avg,
-				  se->on_rq * scale_load_down(se->load.weight),
-				  cfs_rq->curr == se, NULL);
+	if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq,
+				cfs_rq->curr == se, NULL)) {
+
+		___update_load_avg(&se->avg, se_weight(se), NULL);
+		return 1;
+	}
+
+	return 0;
 }
 
 static int
 __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
 {
-	return ___update_load_avg(now, cpu, &cfs_rq->avg,
-			scale_load_down(cfs_rq->load.weight),
-			cfs_rq->curr != NULL, cfs_rq);
+	if (___update_load_sum(now, cpu, &cfs_rq->avg,
+				scale_load_down(cfs_rq->load.weight),
+				cfs_rq->curr != NULL, cfs_rq)) {
+		___update_load_avg(&cfs_rq->avg, 1, cfs_rq);
+		return 1;
+	}
+
+	return 0;
 }
 
 /*
@@ -3267,7 +3308,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 	/* Set new sched_entity's load */
 	se->avg.load_avg = load;
-	se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+	se->avg.load_sum = LOAD_AVG_MAX;
 
 	/* Update parent cfs_rq load */
 	add_positive(&cfs_rq->avg.load_avg, delta);
@@ -3473,7 +3514,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 {
 	se->avg.last_update_time = cfs_rq->avg.last_update_time;
 	cfs_rq->avg.load_avg += se->avg.load_avg;
-	cfs_rq->avg.load_sum += se->avg.load_sum;
+	cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
 	cfs_rq->avg.util_avg += se->avg.util_avg;
 	cfs_rq->avg.util_sum += se->avg.util_sum;
 	set_tg_cfs_propagate(cfs_rq);
@@ -3493,7 +3534,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 {
 
 	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
-	sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+	sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
 	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
 	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
 	set_tg_cfs_propagate(cfs_rq);
@@ -3505,12 +3546,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 static inline void
 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	struct sched_avg *sa = &se->avg;
-
-	cfs_rq->runnable_load_avg += sa->load_avg;
-	cfs_rq->runnable_load_sum += sa->load_sum;
+	cfs_rq->runnable_load_avg += se->avg.load_avg;
+	cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum;
 
-	if (!sa->last_update_time) {
+	if (!se->avg.last_update_time) {
 		attach_entity_load_avg(cfs_rq, se);
 		update_tg_load_avg(cfs_rq, 0);
 	}
@@ -3520,10 +3559,8 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static inline void
 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	cfs_rq->runnable_load_avg =
-		max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
-	cfs_rq->runnable_load_sum =
-		max_t(s64,  cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
+	sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg);
+	sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum);
 }
 
 #ifndef CONFIG_64BIT
-- 
cgit v1.2.3


From 88c0616ee729067ecb412bed76ef4a8734ea5100 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 6 May 2017 17:32:43 +0200
Subject: sched/fair: Change update_load_avg() arguments

Most call sites of update_load_avg() already have cfs_rq_of(se)
available, pass it down instead of recomputing it.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3b5b82345774..a6c53580fea0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3480,9 +3480,8 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 #define SKIP_AGE_LOAD	0x2
 
 /* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int flags)
+static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
-	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	u64 now = cfs_rq_clock_task(cfs_rq);
 	struct rq *rq = rq_of(cfs_rq);
 	int cpu = cpu_of(rq);
@@ -3643,9 +3642,9 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 #define UPDATE_TG	0x0
 #define SKIP_AGE_LOAD	0x0
 
-static inline void update_load_avg(struct sched_entity *se, int not_used1)
+static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
 {
-	cfs_rq_util_change(cfs_rq_of(se));
+	cfs_rq_util_change(cfs_rq);
 }
 
 static inline void
@@ -3796,7 +3795,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 *     its group cfs_rq
 	 *   - Add its new weight to cfs_rq->load.weight
 	 */
-	update_load_avg(se, UPDATE_TG);
+	update_load_avg(cfs_rq, se, UPDATE_TG);
 	enqueue_entity_load_avg(cfs_rq, se);
 	update_cfs_shares(se);
 	account_entity_enqueue(cfs_rq, se);
@@ -3880,7 +3879,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 *   - For group entity, update its weight to reflect the new share
 	 *     of its group cfs_rq.
 	 */
-	update_load_avg(se, UPDATE_TG);
+	update_load_avg(cfs_rq, se, UPDATE_TG);
 	dequeue_entity_load_avg(cfs_rq, se);
 
 	update_stats_dequeue(cfs_rq, se, flags);
@@ -3968,7 +3967,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		 */
 		update_stats_wait_end(cfs_rq, se);
 		__dequeue_entity(cfs_rq, se);
-		update_load_avg(se, UPDATE_TG);
+		update_load_avg(cfs_rq, se, UPDATE_TG);
 	}
 
 	update_stats_curr_start(cfs_rq, se);
@@ -4070,7 +4069,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 		/* Put 'current' back into the tree. */
 		__enqueue_entity(cfs_rq, prev);
 		/* in !on_rq case, update occurred at dequeue */
-		update_load_avg(prev, 0);
+		update_load_avg(cfs_rq, prev, 0);
 	}
 	cfs_rq->curr = NULL;
 }
@@ -4086,7 +4085,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	/*
 	 * Ensure that runnable average is periodically updated.
 	 */
-	update_load_avg(curr, UPDATE_TG);
+	update_load_avg(cfs_rq, curr, UPDATE_TG);
 	update_cfs_shares(curr);
 
 #ifdef CONFIG_SCHED_HRTICK
@@ -5004,7 +5003,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
-		update_load_avg(se, UPDATE_TG);
+		update_load_avg(cfs_rq, se, UPDATE_TG);
 		update_cfs_shares(se);
 	}
 
@@ -5063,7 +5062,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
-		update_load_avg(se, UPDATE_TG);
+		update_load_avg(cfs_rq, se, UPDATE_TG);
 		update_cfs_shares(se);
 	}
 
@@ -7121,7 +7120,7 @@ static void update_blocked_averages(int cpu)
 		/* Propagate pending load changes to the parent, if any: */
 		se = cfs_rq->tg->se[cpu];
 		if (se && !skip_blocked_update(se))
-			update_load_avg(se, 0);
+			update_load_avg(cfs_rq_of(se), se, 0);
 
 		/*
 		 * There can be a lot of idle CPU cgroups.  Don't let fully
@@ -9295,7 +9294,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
-		update_load_avg(se, UPDATE_TG);
+		update_load_avg(cfs_rq, se, UPDATE_TG);
 	}
 }
 #else
@@ -9307,7 +9306,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 	/* Catch up with the cfs_rq and remove our load when we leave */
-	update_load_avg(se, 0);
+	update_load_avg(cfs_rq, se, 0);
 	detach_entity_load_avg(cfs_rq, se);
 	update_tg_load_avg(cfs_rq, false);
 	propagate_entity_cfs_rq(se);
@@ -9326,7 +9325,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
 #endif
 
 	/* Synchronize entity with its cfs_rq */
-	update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
+	update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
 	attach_entity_load_avg(cfs_rq, se);
 	update_tg_load_avg(cfs_rq, false);
 	propagate_entity_cfs_rq(se);
@@ -9610,7 +9609,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 		rq_lock_irqsave(rq, &rf);
 		update_rq_clock(rq);
 		for_each_sched_entity(se) {
-			update_load_avg(se, UPDATE_TG);
+			update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
 			update_cfs_shares(se);
 		}
 		rq_unlock_irqrestore(rq, &rf);
-- 
cgit v1.2.3


From b382a531b9fece229c8358a9fb79431cddcf93c2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 6 May 2017 17:37:03 +0200
Subject: sched/fair: Move enqueue migrate handling

Move the entity migrate handling from enqueue_entity_load_avg() to
update_load_avg(). This has two benefits:

 - {en,de}queue_entity_load_avg() will become purely about managing
   runnable_load

 - we can avoid a double update_tg_load_avg() and reduce pressure on
   the global tg->shares cacheline

The reason we do this is so that we can change update_cfs_shares() to
change both weight and (future) runnable_weight. For this to work we
need to have the cfs_rq averages up-to-date (which means having done
the attach), but we need the cfs_rq->avg.runnable_avg to not yet
include the se's contribution (since se->on_rq == 0).

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 70 +++++++++++++++++++++++++++--------------------------
 1 file changed, 36 insertions(+), 34 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a6c53580fea0..605b7c3e1ab8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3473,34 +3473,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 	return decayed || removed_load;
 }
 
-/*
- * Optional action to be done while updating the load average
- */
-#define UPDATE_TG	0x1
-#define SKIP_AGE_LOAD	0x2
-
-/* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
-{
-	u64 now = cfs_rq_clock_task(cfs_rq);
-	struct rq *rq = rq_of(cfs_rq);
-	int cpu = cpu_of(rq);
-	int decayed;
-
-	/*
-	 * Track task load average for carrying it to new CPU after migrated, and
-	 * track group sched_entity load average for task_h_load calc in migration
-	 */
-	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
-		__update_load_avg_se(now, cpu, cfs_rq, se);
-
-	decayed  = update_cfs_rq_load_avg(now, cfs_rq);
-	decayed |= propagate_entity_load_avg(se);
-
-	if (decayed && (flags & UPDATE_TG))
-		update_tg_load_avg(cfs_rq, 0);
-}
-
 /**
  * attach_entity_load_avg - attach this entity to its cfs_rq load avg
  * @cfs_rq: cfs_rq to attach to
@@ -3541,17 +3513,46 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 	cfs_rq_util_change(cfs_rq);
 }
 
+/*
+ * Optional action to be done while updating the load average
+ */
+#define UPDATE_TG	0x1
+#define SKIP_AGE_LOAD	0x2
+#define DO_ATTACH	0x4
+
+/* Update task and its cfs_rq load average */
+static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+	u64 now = cfs_rq_clock_task(cfs_rq);
+	struct rq *rq = rq_of(cfs_rq);
+	int cpu = cpu_of(rq);
+	int decayed;
+
+	/*
+	 * Track task load average for carrying it to new CPU after migrated, and
+	 * track group sched_entity load average for task_h_load calc in migration
+	 */
+	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
+		__update_load_avg_se(now, cpu, cfs_rq, se);
+
+	decayed  = update_cfs_rq_load_avg(now, cfs_rq);
+	decayed |= propagate_entity_load_avg(se);
+
+	if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
+
+		attach_entity_load_avg(cfs_rq, se);
+		update_tg_load_avg(cfs_rq, 0);
+
+	} else if (decayed && (flags & UPDATE_TG))
+		update_tg_load_avg(cfs_rq, 0);
+}
+
 /* Add the load generated by se into cfs_rq's load average */
 static inline void
 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	cfs_rq->runnable_load_avg += se->avg.load_avg;
 	cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum;
-
-	if (!se->avg.last_update_time) {
-		attach_entity_load_avg(cfs_rq, se);
-		update_tg_load_avg(cfs_rq, 0);
-	}
 }
 
 /* Remove the runnable load generated by se from cfs_rq's runnable load average */
@@ -3641,6 +3642,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 
 #define UPDATE_TG	0x0
 #define SKIP_AGE_LOAD	0x0
+#define DO_ATTACH	0x0
 
 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
 {
@@ -3795,7 +3797,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 *     its group cfs_rq
 	 *   - Add its new weight to cfs_rq->load.weight
 	 */
-	update_load_avg(cfs_rq, se, UPDATE_TG);
+	update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
 	enqueue_entity_load_avg(cfs_rq, se);
 	update_cfs_shares(se);
 	account_entity_enqueue(cfs_rq, se);
-- 
cgit v1.2.3


From b5b3e35f4149df72aaba612bba195fb2ec37b1b1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 24 Aug 2017 17:38:30 +0200
Subject: sched/fair: Rename {en,de}queue_entity_load_avg()

Since they're now purely about runnable_load, rename them.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 605b7c3e1ab8..fad77c8454e4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3549,7 +3549,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 
 /* Add the load generated by se into cfs_rq's load average */
 static inline void
-enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	cfs_rq->runnable_load_avg += se->avg.load_avg;
 	cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum;
@@ -3557,7 +3557,7 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 /* Remove the runnable load generated by se from cfs_rq's runnable load average */
 static inline void
-dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg);
 	sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum);
@@ -3650,9 +3650,9 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 }
 
 static inline void
-enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static inline void
-dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static inline void remove_entity_load_avg(struct sched_entity *se) {}
 
 static inline void
@@ -3798,7 +3798,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 *   - Add its new weight to cfs_rq->load.weight
 	 */
 	update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
-	enqueue_entity_load_avg(cfs_rq, se);
+	enqueue_runnable_load_avg(cfs_rq, se);
 	update_cfs_shares(se);
 	account_entity_enqueue(cfs_rq, se);
 
@@ -3882,7 +3882,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 *     of its group cfs_rq.
 	 */
 	update_load_avg(cfs_rq, se, UPDATE_TG);
-	dequeue_entity_load_avg(cfs_rq, se);
+	dequeue_runnable_load_avg(cfs_rq, se);
 
 	update_stats_dequeue(cfs_rq, se, flags);
 
-- 
cgit v1.2.3


From 8d5b9025f9b4500f828260dc62e8ffa823ce0d59 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 24 Aug 2017 17:45:35 +0200
Subject: sched/fair: Introduce {en,de}queue_load_avg()

Analogous to the existing {en,de}queue_runnable_load_avg() add helpers
for {en,de}queue_load_avg(). More users will follow.

Includes some code movement to avoid fwd declarations.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 156 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 86 insertions(+), 70 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fad77c8454e4..654d8e3d6047 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2692,6 +2692,90 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	cfs_rq->nr_running--;
 }
 
+/*
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do {                           \
+	typeof(_ptr) ptr = (_ptr);                              \
+	typeof(_val) val = (_val);                              \
+	typeof(*ptr) res, var = READ_ONCE(*ptr);                \
+								\
+	res = var + val;                                        \
+								\
+	if (val < 0 && res > var)                               \
+		res = 0;                                        \
+								\
+	WRITE_ONCE(*ptr, res);                                  \
+} while (0)
+
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do {				\
+	typeof(_ptr) ptr = (_ptr);				\
+	typeof(*ptr) val = (_val);				\
+	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
+	res = var - val;					\
+	if (res > var)						\
+		res = 0;					\
+	WRITE_ONCE(*ptr, res);					\
+} while (0)
+
+#ifdef CONFIG_SMP
+/*
+ * XXX we want to get rid of this helper and use the full load resolution.
+ */
+static inline long se_weight(struct sched_entity *se)
+{
+	return scale_load_down(se->load.weight);
+}
+
+static inline void
+enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	cfs_rq->runnable_load_avg += se->avg.load_avg;
+	cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum;
+}
+
+static inline void
+dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg);
+	sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum);
+}
+
+static inline void
+enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	cfs_rq->avg.load_avg += se->avg.load_avg;
+	cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
+}
+
+static inline void
+dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+	sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
+}
+#else
+static inline void
+enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+static inline void
+dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+static inline void
+enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+static inline void
+dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 # ifdef CONFIG_SMP
 /*
@@ -3084,14 +3168,6 @@ ___update_load_avg(struct sched_avg *sa, unsigned long weight, struct cfs_rq *cf
 	sa->util_avg = sa->util_sum / divider;
 }
 
-/*
- * XXX we want to get rid of this helper and use the full load resolution.
- */
-static inline long se_weight(struct sched_entity *se)
-{
-	return scale_load_down(se->load.weight);
-}
-
 /*
  * sched_entity:
  *
@@ -3141,26 +3217,6 @@ __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
 	return 0;
 }
 
-/*
- * Signed add and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define add_positive(_ptr, _val) do {                           \
-	typeof(_ptr) ptr = (_ptr);                              \
-	typeof(_val) val = (_val);                              \
-	typeof(*ptr) res, var = READ_ONCE(*ptr);                \
-								\
-	res = var + val;                                        \
-								\
-	if (val < 0 && res > var)                               \
-		res = 0;                                        \
-								\
-	WRITE_ONCE(*ptr, res);                                  \
-} while (0)
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /**
  * update_tg_load_avg - update the tg's load avg
@@ -3405,23 +3461,6 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
-/*
- * Unsigned subtract and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define sub_positive(_ptr, _val) do {				\
-	typeof(_ptr) ptr = (_ptr);				\
-	typeof(*ptr) val = (_val);				\
-	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
-	res = var - val;					\
-	if (res > var)						\
-		res = 0;					\
-	WRITE_ONCE(*ptr, res);					\
-} while (0)
-
 /**
  * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
  * @now: current time, as per cfs_rq_clock_task()
@@ -3484,8 +3523,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	se->avg.last_update_time = cfs_rq->avg.last_update_time;
-	cfs_rq->avg.load_avg += se->avg.load_avg;
-	cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
+	enqueue_load_avg(cfs_rq, se);
 	cfs_rq->avg.util_avg += se->avg.util_avg;
 	cfs_rq->avg.util_sum += se->avg.util_sum;
 	set_tg_cfs_propagate(cfs_rq);
@@ -3503,9 +3541,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
  */
 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-
-	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
-	sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
+	dequeue_load_avg(cfs_rq, se);
 	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
 	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
 	set_tg_cfs_propagate(cfs_rq);
@@ -3547,22 +3583,6 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 		update_tg_load_avg(cfs_rq, 0);
 }
 
-/* Add the load generated by se into cfs_rq's load average */
-static inline void
-enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-	cfs_rq->runnable_load_avg += se->avg.load_avg;
-	cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum;
-}
-
-/* Remove the runnable load generated by se from cfs_rq's runnable load average */
-static inline void
-dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-	sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg);
-	sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum);
-}
-
 #ifndef CONFIG_64BIT
 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
 {
@@ -3649,10 +3669,6 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 	cfs_rq_util_change(cfs_rq);
 }
 
-static inline void
-enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-static inline void
-dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static inline void remove_entity_load_avg(struct sched_entity *se) {}
 
 static inline void
-- 
cgit v1.2.3


From 840c5abca499a858619954dbcffc82110bb6e076 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 6 May 2017 16:11:34 +0200
Subject: sched/fair: More accurate reweight_entity()

When a (group) entity changes it's weight we should instantly change
its load_avg and propagate that change into the sums it is part of.
Because we use these values to predict future behaviour and are not
interested in its historical value.

Without this change, the change in load would need to propagate
through the average, by which time it could again have changed etc..
always chasing itself.

With this change, the cfs_rq load_avg sum will more accurately reflect
the current runnable and expected return of blocked load.

Reported-by: Paul Turner <pjt@google.com>
[josef: compile fix !SMP || !FAIR_GROUP]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 654d8e3d6047..750ae4dbf812 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2886,12 +2886,22 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 		if (cfs_rq->curr == se)
 			update_curr(cfs_rq);
 		account_entity_dequeue(cfs_rq, se);
+		dequeue_runnable_load_avg(cfs_rq, se);
 	}
+	dequeue_load_avg(cfs_rq, se);
 
 	update_load_set(&se->load, weight);
 
-	if (se->on_rq)
+#ifdef CONFIG_SMP
+	se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum,
+				   LOAD_AVG_MAX - 1024 + se->avg.period_contrib);
+#endif
+
+	enqueue_load_avg(cfs_rq, se);
+	if (se->on_rq) {
 		account_entity_enqueue(cfs_rq, se);
+		enqueue_runnable_load_avg(cfs_rq, se);
+	}
 }
 
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-- 
cgit v1.2.3


From 9059393e4ec1c8c6623a120b405ef2c90b968d80 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Wed, 17 May 2017 11:50:45 +0200
Subject: sched/fair: Use reweight_entity() for set_user_nice()

Now that we directly change load_avg and propagate that change into
the sums, sys_nice() and co should do the same, otherwise its possible
to confuse load accounting when we migrate near the weight change.

Fixes-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
[ Added changelog, fixed the call condition. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20170517095045.GA8420@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  | 22 ++++++++++++------
 kernel/sched/fair.c  | 63 ++++++++++++++++++++++++++++++----------------------
 kernel/sched/sched.h |  2 ++
 3 files changed, 54 insertions(+), 33 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d17c5da523a0..2288a145bf01 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -733,7 +733,7 @@ int tg_nop(struct task_group *tg, void *data)
 }
 #endif
 
-static void set_load_weight(struct task_struct *p)
+static void set_load_weight(struct task_struct *p, bool update_load)
 {
 	int prio = p->static_prio - MAX_RT_PRIO;
 	struct load_weight *load = &p->se.load;
@@ -747,8 +747,16 @@ static void set_load_weight(struct task_struct *p)
 		return;
 	}
 
-	load->weight = scale_load(sched_prio_to_weight[prio]);
-	load->inv_weight = sched_prio_to_wmult[prio];
+	/*
+	 * SCHED_OTHER tasks have to update their load when changing their
+	 * weight
+	 */
+	if (update_load && p->sched_class == &fair_sched_class) {
+		reweight_task(p, prio);
+	} else {
+		load->weight = scale_load(sched_prio_to_weight[prio]);
+		load->inv_weight = sched_prio_to_wmult[prio];
+	}
 }
 
 static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -2358,7 +2366,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 			p->static_prio = NICE_TO_PRIO(0);
 
 		p->prio = p->normal_prio = __normal_prio(p);
-		set_load_weight(p);
+		set_load_weight(p, false);
 
 		/*
 		 * We don't need the reset flag anymore after the fork. It has
@@ -3805,7 +3813,7 @@ void set_user_nice(struct task_struct *p, long nice)
 		put_prev_task(rq, p);
 
 	p->static_prio = NICE_TO_PRIO(nice);
-	set_load_weight(p);
+	set_load_weight(p, true);
 	old_prio = p->prio;
 	p->prio = effective_prio(p);
 	delta = p->prio - old_prio;
@@ -3962,7 +3970,7 @@ static void __setscheduler_params(struct task_struct *p,
 	 */
 	p->rt_priority = attr->sched_priority;
 	p->normal_prio = normal_prio(p);
-	set_load_weight(p);
+	set_load_weight(p, true);
 }
 
 /* Actually do priority change: must hold pi & rq lock. */
@@ -5933,7 +5941,7 @@ void __init sched_init(void)
 		atomic_set(&rq->nr_iowait, 0);
 	}
 
-	set_load_weight(&init_task);
+	set_load_weight(&init_task, false);
 
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 750ae4dbf812..d8c02b31498d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2776,6 +2776,43 @@ static inline void
 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
 #endif
 
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+			    unsigned long weight)
+{
+	if (se->on_rq) {
+		/* commit outstanding execution time */
+		if (cfs_rq->curr == se)
+			update_curr(cfs_rq);
+		account_entity_dequeue(cfs_rq, se);
+		dequeue_runnable_load_avg(cfs_rq, se);
+	}
+	dequeue_load_avg(cfs_rq, se);
+
+	update_load_set(&se->load, weight);
+
+#ifdef CONFIG_SMP
+	se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum,
+				   LOAD_AVG_MAX - 1024 + se->avg.period_contrib);
+#endif
+
+	enqueue_load_avg(cfs_rq, se);
+	if (se->on_rq) {
+		account_entity_enqueue(cfs_rq, se);
+		enqueue_runnable_load_avg(cfs_rq, se);
+	}
+}
+
+void reweight_task(struct task_struct *p, int prio)
+{
+	struct sched_entity *se = &p->se;
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	struct load_weight *load = &se->load;
+	unsigned long weight = scale_load(sched_prio_to_weight[prio]);
+
+	reweight_entity(cfs_rq, se, weight);
+	load->inv_weight = sched_prio_to_wmult[prio];
+}
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 # ifdef CONFIG_SMP
 /*
@@ -2878,32 +2915,6 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq)
 }
 # endif /* CONFIG_SMP */
 
-static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
-			    unsigned long weight)
-{
-	if (se->on_rq) {
-		/* commit outstanding execution time */
-		if (cfs_rq->curr == se)
-			update_curr(cfs_rq);
-		account_entity_dequeue(cfs_rq, se);
-		dequeue_runnable_load_avg(cfs_rq, se);
-	}
-	dequeue_load_avg(cfs_rq, se);
-
-	update_load_set(&se->load, weight);
-
-#ifdef CONFIG_SMP
-	se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum,
-				   LOAD_AVG_MAX - 1024 + se->avg.period_contrib);
-#endif
-
-	enqueue_load_avg(cfs_rq, se);
-	if (se->on_rq) {
-		account_entity_enqueue(cfs_rq, se);
-		enqueue_runnable_load_avg(cfs_rq, se);
-	}
-}
-
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 
 static void update_cfs_shares(struct sched_entity *se)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 14db76cd496f..a5d97460ee4e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1529,6 +1529,8 @@ extern void init_sched_dl_class(void);
 extern void init_sched_rt_class(void);
 extern void init_sched_fair_class(void);
 
+extern void reweight_task(struct task_struct *p, int prio);
+
 extern void resched_curr(struct rq *rq);
 extern void resched_cpu(int cpu);
 
-- 
cgit v1.2.3


From 2a2f5d4e44ed160a5ed822c94e04f918f9fbb487 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 8 May 2017 16:51:41 +0200
Subject: sched/fair: Rewrite cfs_rq->removed_*avg

Since on wakeup migration we don't hold the rq->lock for the old CPU
we cannot update its state. Instead we add the removed 'load' to an
atomic variable and have the next update on that CPU collect and
process it.

Currently we have 2 atomic variables; which already have the issue
that they can be read out-of-sync. Also, two atomic ops on a single
cacheline is already more expensive than an uncontended lock.

Since we want to add more, convert the thing over to an explicit
cacheline with a lock in.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/debug.c |  8 ++++----
 kernel/sched/fair.c  | 51 +++++++++++++++++++++++++++++++++++----------------
 kernel/sched/sched.h | 13 +++++++++----
 3 files changed, 48 insertions(+), 24 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2f93e4a2d9f6..2f22342c48ff 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -564,10 +564,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->runnable_load_avg);
 	SEQ_printf(m, "  .%-30s: %lu\n", "util_avg",
 			cfs_rq->avg.util_avg);
-	SEQ_printf(m, "  .%-30s: %ld\n", "removed_load_avg",
-			atomic_long_read(&cfs_rq->removed_load_avg));
-	SEQ_printf(m, "  .%-30s: %ld\n", "removed_util_avg",
-			atomic_long_read(&cfs_rq->removed_util_avg));
+	SEQ_printf(m, "  .%-30s: %ld\n", "removed.load_avg",
+			cfs_rq->removed.load_avg);
+	SEQ_printf(m, "  .%-30s: %ld\n", "removed.util_avg",
+			cfs_rq->removed.util_avg);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	SEQ_printf(m, "  .%-30s: %lu\n", "tg_load_avg_contrib",
 			cfs_rq->tg_load_avg_contrib);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d8c02b31498d..fe4a66b10480 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3501,36 +3501,47 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
 static inline int
 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 {
+	unsigned long removed_load = 0, removed_util = 0;
 	struct sched_avg *sa = &cfs_rq->avg;
-	int decayed, removed_load = 0, removed_util = 0;
+	int decayed = 0;
 
-	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
-		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
+	if (cfs_rq->removed.nr) {
+		unsigned long r;
+
+		raw_spin_lock(&cfs_rq->removed.lock);
+		swap(cfs_rq->removed.util_avg, removed_util);
+		swap(cfs_rq->removed.load_avg, removed_load);
+		cfs_rq->removed.nr = 0;
+		raw_spin_unlock(&cfs_rq->removed.lock);
+
+		/*
+		 * The LOAD_AVG_MAX for _sum is a slight over-estimate,
+		 * which is safe due to sub_positive() clipping at 0.
+		 */
+		r = removed_load;
 		sub_positive(&sa->load_avg, r);
 		sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
-		removed_load = 1;
-		set_tg_cfs_propagate(cfs_rq);
-	}
 
-	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
-		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
+		r = removed_util;
 		sub_positive(&sa->util_avg, r);
 		sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
-		removed_util = 1;
+
 		set_tg_cfs_propagate(cfs_rq);
+
+		decayed = 1;
 	}
 
-	decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
+	decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
 
 #ifndef CONFIG_64BIT
 	smp_wmb();
 	cfs_rq->load_last_update_time_copy = sa->last_update_time;
 #endif
 
-	if (decayed || removed_util)
+	if (decayed)
 		cfs_rq_util_change(cfs_rq);
 
-	return decayed || removed_load;
+	return decayed;
 }
 
 /**
@@ -3645,6 +3656,7 @@ void sync_entity_load_avg(struct sched_entity *se)
 void remove_entity_load_avg(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	unsigned long flags;
 
 	/*
 	 * tasks cannot exit without having gone through wake_up_new_task() ->
@@ -3654,11 +3666,19 @@ void remove_entity_load_avg(struct sched_entity *se)
 	 * Similarly for groups, they will have passed through
 	 * post_init_entity_util_avg() before unregister_sched_fair_group()
 	 * calls this.
+	 *
+	 * XXX in case entity_is_task(se) && task_of(se)->on_rq == MIGRATING
+	 * we could actually get the right time, since we're called with
+	 * rq->lock held, see detach_task().
 	 */
 
 	sync_entity_load_avg(se);
-	atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
-	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
+
+	raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
+	++cfs_rq->removed.nr;
+	cfs_rq->removed.util_avg	+= se->avg.util_avg;
+	cfs_rq->removed.load_avg	+= se->avg.load_avg;
+	raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
 }
 
 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
@@ -9449,8 +9469,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	cfs_rq->propagate_avg = 0;
 #endif
-	atomic_long_set(&cfs_rq->removed_load_avg, 0);
-	atomic_long_set(&cfs_rq->removed_util_avg, 0);
+	raw_spin_lock_init(&cfs_rq->removed.lock);
 #endif
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a5d97460ee4e..2fd350a12bb7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -445,14 +445,19 @@ struct cfs_rq {
 	struct sched_avg avg;
 	u64 runnable_load_sum;
 	unsigned long runnable_load_avg;
+#ifndef CONFIG_64BIT
+	u64 load_last_update_time_copy;
+#endif
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	unsigned long tg_load_avg_contrib;
 	unsigned long propagate_avg;
 #endif
-	atomic_long_t removed_load_avg, removed_util_avg;
-#ifndef CONFIG_64BIT
-	u64 load_last_update_time_copy;
-#endif
+	struct {
+		raw_spinlock_t	lock ____cacheline_aligned;
+		int		nr;
+		unsigned long	load_avg;
+		unsigned long	util_avg;
+	} removed;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/*
-- 
cgit v1.2.3


From 0e2d2aaaae52c247c047d14999b93486bdbd3431 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 8 May 2017 17:30:46 +0200
Subject: sched/fair: Rewrite PELT migration propagation

When an entity migrates in (or out) of a runqueue, we need to add (or
remove) its contribution from the entire PELT hierarchy, because even
non-runnable entities are included in the load average sums.

In order to do this we have some propagation logic that updates the
PELT tree, however the way it 'propagates' the runnable (or load)
change is (more or less):

                     tg->weight * grq->avg.load_avg
  ge->avg.load_avg = ------------------------------
                               tg->load_avg

But that is the expression for ge->weight, and per the definition of
load_avg:

  ge->avg.load_avg := ge->weight * ge->avg.runnable_avg

That destroys the runnable_avg (by setting it to 1) we wanted to
propagate.

Instead directly propagate runnable_sum.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/debug.c |   2 +
 kernel/sched/fair.c  | 186 +++++++++++++++++++++++++++++----------------------
 kernel/sched/sched.h |   9 +--
 3 files changed, 112 insertions(+), 85 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2f22342c48ff..2e039a81864c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -568,6 +568,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->removed.load_avg);
 	SEQ_printf(m, "  .%-30s: %ld\n", "removed.util_avg",
 			cfs_rq->removed.util_avg);
+	SEQ_printf(m, "  .%-30s: %ld\n", "removed.runnable_sum",
+			cfs_rq->removed.runnable_sum);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	SEQ_printf(m, "  .%-30s: %lu\n", "tg_load_avg_contrib",
 			cfs_rq->tg_load_avg_contrib);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fe4a66b10480..086a5d979720 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3319,11 +3319,77 @@ void set_task_rq_fair(struct sched_entity *se,
 	se->avg.last_update_time = n_last_update_time;
 }
 
-/* Take into account change of utilization of a child task group */
+
+/*
+ * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
+ * propagate its contribution. The key to this propagation is the invariant
+ * that for each group:
+ *
+ *   ge->avg == grq->avg						(1)
+ *
+ * _IFF_ we look at the pure running and runnable sums. Because they
+ * represent the very same entity, just at different points in the hierarchy.
+ *
+ *
+ * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and
+ * simply copies the running sum over.
+ *
+ * However, update_tg_cfs_runnable() is more complex. So we have:
+ *
+ *   ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg		(2)
+ *
+ * And since, like util, the runnable part should be directly transferable,
+ * the following would _appear_ to be the straight forward approach:
+ *
+ *   grq->avg.load_avg = grq->load.weight * grq->avg.running_avg	(3)
+ *
+ * And per (1) we have:
+ *
+ *   ge->avg.running_avg == grq->avg.running_avg
+ *
+ * Which gives:
+ *
+ *                      ge->load.weight * grq->avg.load_avg
+ *   ge->avg.load_avg = -----------------------------------		(4)
+ *                               grq->load.weight
+ *
+ * Except that is wrong!
+ *
+ * Because while for entities historical weight is not important and we
+ * really only care about our future and therefore can consider a pure
+ * runnable sum, runqueues can NOT do this.
+ *
+ * We specifically want runqueues to have a load_avg that includes
+ * historical weights. Those represent the blocked load, the load we expect
+ * to (shortly) return to us. This only works by keeping the weights as
+ * integral part of the sum. We therefore cannot decompose as per (3).
+ *
+ * OK, so what then?
+ *
+ *
+ * Another way to look at things is:
+ *
+ *   grq->avg.load_avg = \Sum se->avg.load_avg
+ *
+ * Therefore, per (2):
+ *
+ *   grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg
+ *
+ * And the very thing we're propagating is a change in that sum (someone
+ * joined/left). So we can easily know the runnable change, which would be, per
+ * (2) the already tracked se->load_avg divided by the corresponding
+ * se->weight.
+ *
+ * Basically (4) but in differential form:
+ *
+ *   d(runnable_avg) += se->avg.load_avg / se->load.weight
+ *								   (5)
+ *   ge->avg.load_avg += ge->load.weight * d(runnable_avg)
+ */
+
 static inline void
-update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
-	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
 	long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
 
 	/* Nothing to update */
@@ -3339,102 +3405,59 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
 }
 
-/* Take into account change of load of a child task group */
 static inline void
-update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
-	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
-	long delta, load = gcfs_rq->avg.load_avg;
+	long runnable_sum = gcfs_rq->prop_runnable_sum;
+	long load_avg;
+	s64 load_sum;
 
-	/*
-	 * If the load of group cfs_rq is null, the load of the
-	 * sched_entity will also be null so we can skip the formula
-	 */
-	if (load) {
-		long tg_load;
-
-		/* Get tg's load and ensure tg_load > 0 */
-		tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
-
-		/* Ensure tg_load >= load and updated with current load*/
-		tg_load -= gcfs_rq->tg_load_avg_contrib;
-		tg_load += load;
-
-		/*
-		 * We need to compute a correction term in the case that the
-		 * task group is consuming more CPU than a task of equal
-		 * weight. A task with a weight equals to tg->shares will have
-		 * a load less or equal to scale_load_down(tg->shares).
-		 * Similarly, the sched_entities that represent the task group
-		 * at parent level, can't have a load higher than
-		 * scale_load_down(tg->shares). And the Sum of sched_entities'
-		 * load must be <= scale_load_down(tg->shares).
-		 */
-		if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
-			/* scale gcfs_rq's load into tg's shares*/
-			load *= scale_load_down(gcfs_rq->tg->shares);
-			load /= tg_load;
-		}
-	}
+	if (!runnable_sum)
+		return;
 
-	delta = load - se->avg.load_avg;
+	gcfs_rq->prop_runnable_sum = 0;
 
-	/* Nothing to update */
-	if (!delta)
-		return;
+	load_sum = (s64)se_weight(se) * runnable_sum;
+	load_avg = div_s64(load_sum, LOAD_AVG_MAX);
 
-	/* Set new sched_entity's load */
-	se->avg.load_avg = load;
-	se->avg.load_sum = LOAD_AVG_MAX;
+	add_positive(&se->avg.load_sum, runnable_sum);
+	add_positive(&se->avg.load_avg, load_avg);
 
-	/* Update parent cfs_rq load */
-	add_positive(&cfs_rq->avg.load_avg, delta);
-	cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+	add_positive(&cfs_rq->avg.load_avg, load_avg);
+	add_positive(&cfs_rq->avg.load_sum, load_sum);
 
-	/*
-	 * If the sched_entity is already enqueued, we also have to update the
-	 * runnable load avg.
-	 */
 	if (se->on_rq) {
-		/* Update parent cfs_rq runnable_load_avg */
-		add_positive(&cfs_rq->runnable_load_avg, delta);
-		cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+		add_positive(&cfs_rq->runnable_load_avg, load_avg);
+		add_positive(&cfs_rq->runnable_load_sum, load_sum);
 	}
 }
 
-static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
 {
-	cfs_rq->propagate_avg = 1;
-}
-
-static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
-{
-	struct cfs_rq *cfs_rq = group_cfs_rq(se);
-
-	if (!cfs_rq->propagate_avg)
-		return 0;
-
-	cfs_rq->propagate_avg = 0;
-	return 1;
+	cfs_rq->propagate = 1;
+	cfs_rq->prop_runnable_sum += runnable_sum;
 }
 
 /* Update task and its cfs_rq load average */
 static inline int propagate_entity_load_avg(struct sched_entity *se)
 {
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *gcfs_rq;
 
 	if (entity_is_task(se))
 		return 0;
 
-	if (!test_and_clear_tg_cfs_propagate(se))
+	gcfs_rq = group_cfs_rq(se);
+	if (!gcfs_rq->propagate)
 		return 0;
 
+	gcfs_rq->propagate = 0;
+
 	cfs_rq = cfs_rq_of(se);
 
-	set_tg_cfs_propagate(cfs_rq);
+	add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
 
-	update_tg_cfs_util(cfs_rq, se);
-	update_tg_cfs_load(cfs_rq, se);
+	update_tg_cfs_util(cfs_rq, se, gcfs_rq);
+	update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
 
 	return 1;
 }
@@ -3458,7 +3481,7 @@ static inline bool skip_blocked_update(struct sched_entity *se)
 	 * If there is a pending propagation, we have to update the load and
 	 * the utilization of the sched_entity:
 	 */
-	if (gcfs_rq->propagate_avg)
+	if (gcfs_rq->propagate)
 		return false;
 
 	/*
@@ -3478,7 +3501,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
 	return 0;
 }
 
-static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
@@ -3501,7 +3524,7 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
 static inline int
 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 {
-	unsigned long removed_load = 0, removed_util = 0;
+	unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
 	struct sched_avg *sa = &cfs_rq->avg;
 	int decayed = 0;
 
@@ -3511,6 +3534,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 		raw_spin_lock(&cfs_rq->removed.lock);
 		swap(cfs_rq->removed.util_avg, removed_util);
 		swap(cfs_rq->removed.load_avg, removed_load);
+		swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
 		cfs_rq->removed.nr = 0;
 		raw_spin_unlock(&cfs_rq->removed.lock);
 
@@ -3526,7 +3550,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 		sub_positive(&sa->util_avg, r);
 		sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
 
-		set_tg_cfs_propagate(cfs_rq);
+		add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
 
 		decayed = 1;
 	}
@@ -3558,7 +3582,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 	enqueue_load_avg(cfs_rq, se);
 	cfs_rq->avg.util_avg += se->avg.util_avg;
 	cfs_rq->avg.util_sum += se->avg.util_sum;
-	set_tg_cfs_propagate(cfs_rq);
+
+	add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
 
 	cfs_rq_util_change(cfs_rq);
 }
@@ -3576,7 +3601,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 	dequeue_load_avg(cfs_rq, se);
 	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
 	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
-	set_tg_cfs_propagate(cfs_rq);
+
+	add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
 
 	cfs_rq_util_change(cfs_rq);
 }
@@ -3678,6 +3704,7 @@ void remove_entity_load_avg(struct sched_entity *se)
 	++cfs_rq->removed.nr;
 	cfs_rq->removed.util_avg	+= se->avg.util_avg;
 	cfs_rq->removed.load_avg	+= se->avg.load_avg;
+	cfs_rq->removed.runnable_sum	+= se->avg.load_sum; /* == runnable_sum */
 	raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
 }
 
@@ -9466,9 +9493,6 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 #endif
 #ifdef CONFIG_SMP
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	cfs_rq->propagate_avg = 0;
-#endif
 	raw_spin_lock_init(&cfs_rq->removed.lock);
 #endif
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2fd350a12bb7..5bcb86eb026b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -447,19 +447,20 @@ struct cfs_rq {
 	unsigned long runnable_load_avg;
 #ifndef CONFIG_64BIT
 	u64 load_last_update_time_copy;
-#endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	unsigned long tg_load_avg_contrib;
-	unsigned long propagate_avg;
 #endif
 	struct {
 		raw_spinlock_t	lock ____cacheline_aligned;
 		int		nr;
 		unsigned long	load_avg;
 		unsigned long	util_avg;
+		unsigned long	runnable_sum;
 	} removed;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	unsigned long tg_load_avg_contrib;
+	long propagate;
+	long prop_runnable_sum;
+
 	/*
 	 *   h_load = weight * f(tg)
 	 *
-- 
cgit v1.2.3


From 1ea6c46a23f1213d1972bfae220db5c165e27bba Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 6 May 2017 15:59:54 +0200
Subject: sched/fair: Propagate an effective runnable_load_avg

The load balancer uses runnable_load_avg as load indicator. For
!cgroup this is:

  runnable_load_avg = \Sum se->avg.load_avg ; where se->on_rq

That is, a direct sum of all runnable tasks on that runqueue. As
opposed to load_avg, which is a sum of all tasks on the runqueue,
which includes a blocked component.

However, in the cgroup case, this comes apart since the group entities
are always runnable, even if most of their constituent entities are
blocked.

Therefore introduce a runnable_weight which for task entities is the
same as the regular weight, but for group entities is a fraction of
the entity weight and represents the runnable part of the group
runqueue.

Then propagate this load through the PELT hierarchy to arrive at an
effective runnable load avgerage -- which we should not confuse with
the canonical runnable load average.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |   3 +
 kernel/sched/debug.c  |   8 ++-
 kernel/sched/fair.c   | 172 +++++++++++++++++++++++++++++++++-----------------
 kernel/sched/sched.h  |   3 +-
 4 files changed, 124 insertions(+), 62 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26a7df4e558c..bdd6ad6fcce1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -331,9 +331,11 @@ struct load_weight {
 struct sched_avg {
 	u64				last_update_time;
 	u64				load_sum;
+	u64				runnable_load_sum;
 	u32				util_sum;
 	u32				period_contrib;
 	unsigned long			load_avg;
+	unsigned long			runnable_load_avg;
 	unsigned long			util_avg;
 };
 
@@ -376,6 +378,7 @@ struct sched_statistics {
 struct sched_entity {
 	/* For load-balancing: */
 	struct load_weight		load;
+	unsigned long			runnable_weight;
 	struct rb_node			run_node;
 	struct list_head		group_node;
 	unsigned int			on_rq;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2e039a81864c..1ca0130ed4f9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -441,9 +441,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 		P_SCHEDSTAT(se->statistics.wait_count);
 	}
 	P(se->load.weight);
+	P(se->runnable_weight);
 #ifdef CONFIG_SMP
 	P(se->avg.load_avg);
 	P(se->avg.util_avg);
+	P(se->avg.runnable_load_avg);
 #endif
 
 #undef PN_SCHEDSTAT
@@ -558,10 +560,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
 	SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
 #ifdef CONFIG_SMP
+	SEQ_printf(m, "  .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight);
 	SEQ_printf(m, "  .%-30s: %lu\n", "load_avg",
 			cfs_rq->avg.load_avg);
 	SEQ_printf(m, "  .%-30s: %lu\n", "runnable_load_avg",
-			cfs_rq->runnable_load_avg);
+			cfs_rq->avg.runnable_load_avg);
 	SEQ_printf(m, "  .%-30s: %lu\n", "util_avg",
 			cfs_rq->avg.util_avg);
 	SEQ_printf(m, "  .%-30s: %ld\n", "removed.load_avg",
@@ -1006,10 +1009,13 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 		   "nr_involuntary_switches", (long long)p->nivcsw);
 
 	P(se.load.weight);
+	P(se.runnable_weight);
 #ifdef CONFIG_SMP
 	P(se.avg.load_sum);
+	P(se.avg.runnable_load_sum);
 	P(se.avg.util_sum);
 	P(se.avg.load_avg);
+	P(se.avg.runnable_load_avg);
 	P(se.avg.util_avg);
 	P(se.avg.last_update_time);
 #endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 086a5d979720..10d2000fca2d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -730,8 +730,9 @@ void init_entity_runnable_average(struct sched_entity *se)
 	 * nothing has been attached to the task group yet.
 	 */
 	if (entity_is_task(se))
-		sa->load_avg = scale_load_down(se->load.weight);
-	sa->load_sum = LOAD_AVG_MAX;
+		sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
+	sa->runnable_load_sum = sa->load_sum = LOAD_AVG_MAX;
+
 	/*
 	 * At this point, util_avg won't be used in select_task_rq_fair anyway
 	 */
@@ -2731,25 +2732,35 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 #ifdef CONFIG_SMP
 /*
- * XXX we want to get rid of this helper and use the full load resolution.
+ * XXX we want to get rid of these helpers and use the full load resolution.
  */
 static inline long se_weight(struct sched_entity *se)
 {
 	return scale_load_down(se->load.weight);
 }
 
+static inline long se_runnable(struct sched_entity *se)
+{
+	return scale_load_down(se->runnable_weight);
+}
+
 static inline void
 enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	cfs_rq->runnable_load_avg += se->avg.load_avg;
-	cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum;
+	cfs_rq->runnable_weight += se->runnable_weight;
+
+	cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
+	cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
 }
 
 static inline void
 dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg);
-	sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum);
+	cfs_rq->runnable_weight -= se->runnable_weight;
+
+	sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
+	sub_positive(&cfs_rq->avg.runnable_load_sum,
+		     se_runnable(se) * se->avg.runnable_load_sum);
 }
 
 static inline void
@@ -2777,7 +2788,7 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
 #endif
 
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
-			    unsigned long weight)
+			    unsigned long weight, unsigned long runnable)
 {
 	if (se->on_rq) {
 		/* commit outstanding execution time */
@@ -2788,11 +2799,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 	}
 	dequeue_load_avg(cfs_rq, se);
 
+	se->runnable_weight = runnable;
 	update_load_set(&se->load, weight);
 
 #ifdef CONFIG_SMP
-	se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum,
-				   LOAD_AVG_MAX - 1024 + se->avg.period_contrib);
+	do {
+		u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
+
+		se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
+		se->avg.runnable_load_avg =
+			div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
+	} while (0);
 #endif
 
 	enqueue_load_avg(cfs_rq, se);
@@ -2809,7 +2826,7 @@ void reweight_task(struct task_struct *p, int prio)
 	struct load_weight *load = &se->load;
 	unsigned long weight = scale_load(sched_prio_to_weight[prio]);
 
-	reweight_entity(cfs_rq, se, weight);
+	reweight_entity(cfs_rq, se, weight, weight);
 	load->inv_weight = sched_prio_to_wmult[prio];
 }
 
@@ -2917,31 +2934,45 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq)
 
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 
-static void update_cfs_shares(struct sched_entity *se)
+/*
+ * Recomputes the group entity based on the current state of its group
+ * runqueue.
+ */
+static void update_cfs_group(struct sched_entity *se)
 {
-	struct cfs_rq *cfs_rq = group_cfs_rq(se);
-	long shares;
+	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+	long shares, runnable;
 
-	if (!cfs_rq)
+	if (!gcfs_rq)
 		return;
 
-	if (throttled_hierarchy(cfs_rq))
+	if (throttled_hierarchy(gcfs_rq))
 		return;
 
 #ifndef CONFIG_SMP
-	shares = READ_ONCE(cfs_rq->tg->shares);
+	runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
 
 	if (likely(se->load.weight == shares))
 		return;
 #else
-	shares = calc_cfs_shares(cfs_rq);
+	shares = calc_cfs_shares(gcfs_rq);
+	/*
+	 * The hierarchical runnable load metric is the proportional part
+	 * of this group's runnable_load_avg / load_avg.
+	 *
+	 * Note: we need to deal with very sporadic 'runnable > load' cases
+	 * due to numerical instability.
+	 */
+	runnable = shares * gcfs_rq->avg.runnable_load_avg;
+	if (runnable)
+		runnable /= max(gcfs_rq->avg.load_avg, gcfs_rq->avg.runnable_load_avg);
 #endif
 
-	reweight_entity(cfs_rq_of(se), se, shares);
+	reweight_entity(cfs_rq_of(se), se, shares, runnable);
 }
 
 #else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct sched_entity *se)
+static inline void update_cfs_group(struct sched_entity *se)
 {
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -3050,7 +3081,7 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
  */
 static __always_inline u32
 accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
-	       unsigned long weight, int running, struct cfs_rq *cfs_rq)
+	       unsigned long load, unsigned long runnable, int running)
 {
 	unsigned long scale_freq, scale_cpu;
 	u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
@@ -3067,10 +3098,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
 	 */
 	if (periods) {
 		sa->load_sum = decay_load(sa->load_sum, periods);
-		if (cfs_rq) {
-			cfs_rq->runnable_load_sum =
-				decay_load(cfs_rq->runnable_load_sum, periods);
-		}
+		sa->runnable_load_sum =
+			decay_load(sa->runnable_load_sum, periods);
 		sa->util_sum = decay_load((u64)(sa->util_sum), periods);
 
 		/*
@@ -3083,11 +3112,10 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
 	sa->period_contrib = delta;
 
 	contrib = cap_scale(contrib, scale_freq);
-	if (weight) {
-		sa->load_sum += weight * contrib;
-		if (cfs_rq)
-			cfs_rq->runnable_load_sum += weight * contrib;
-	}
+	if (load)
+		sa->load_sum += load * contrib;
+	if (runnable)
+		sa->runnable_load_sum += runnable * contrib;
 	if (running)
 		sa->util_sum += contrib * scale_cpu;
 
@@ -3124,7 +3152,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
  */
 static __always_inline int
 ___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
-		   unsigned long weight, int running, struct cfs_rq *cfs_rq)
+		  unsigned long load, unsigned long runnable, int running)
 {
 	u64 delta;
 
@@ -3157,8 +3185,8 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
 	 * this happens during idle_balance() which calls
 	 * update_blocked_averages()
 	 */
-	if (!weight)
-		running = 0;
+	if (!load)
+		runnable = running = 0;
 
 	/*
 	 * Now we know we crossed measurement unit boundaries. The *_avg
@@ -3167,45 +3195,60 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
 	 * Step 1: accumulate *_sum since last_update_time. If we haven't
 	 * crossed period boundaries, finish.
 	 */
-	if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+	if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
 		return 0;
 
 	return 1;
 }
 
 static __always_inline void
-___update_load_avg(struct sched_avg *sa, unsigned long weight, struct cfs_rq *cfs_rq)
+___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
 {
 	u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
 
 	/*
 	 * Step 2: update *_avg.
 	 */
-	sa->load_avg = div_u64(weight * sa->load_sum, divider);
-	if (cfs_rq) {
-		cfs_rq->runnable_load_avg =
-			div_u64(cfs_rq->runnable_load_sum, divider);
-	}
+	sa->load_avg = div_u64(load * sa->load_sum, divider);
+	sa->runnable_load_avg =	div_u64(runnable * sa->runnable_load_sum, divider);
 	sa->util_avg = sa->util_sum / divider;
 }
 
 /*
  * sched_entity:
  *
+ *   task:
+ *     se_runnable() == se_weight()
+ *
+ *   group: [ see update_cfs_group() ]
+ *     se_weight()   = tg->weight * grq->load_avg / tg->load_avg
+ *     se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
+ *
  *   load_sum := runnable_sum
  *   load_avg = se_weight(se) * runnable_avg
  *
+ *   runnable_load_sum := runnable_sum
+ *   runnable_load_avg = se_runnable(se) * runnable_avg
+ *
+ * XXX collapse load_sum and runnable_load_sum
+ *
  * cfq_rs:
  *
  *   load_sum = \Sum se_weight(se) * se->avg.load_sum
  *   load_avg = \Sum se->avg.load_avg
+ *
+ *   runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
+ *   runnable_load_avg = \Sum se->avg.runable_load_avg
  */
 
 static int
 __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
 {
-	if (___update_load_sum(now, cpu, &se->avg, 0, 0, NULL)) {
-		___update_load_avg(&se->avg, se_weight(se), NULL);
+	if (entity_is_task(se))
+		se->runnable_weight = se->load.weight;
+
+	if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
+		___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
 		return 1;
 	}
 
@@ -3215,10 +3258,13 @@ __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
 static int
 __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq,
-				cfs_rq->curr == se, NULL)) {
+	if (entity_is_task(se))
+		se->runnable_weight = se->load.weight;
+
+	if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
+				cfs_rq->curr == se)) {
 
-		___update_load_avg(&se->avg, se_weight(se), NULL);
+		___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
 		return 1;
 	}
 
@@ -3230,8 +3276,10 @@ __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
 {
 	if (___update_load_sum(now, cpu, &cfs_rq->avg,
 				scale_load_down(cfs_rq->load.weight),
-				cfs_rq->curr != NULL, cfs_rq)) {
-		___update_load_avg(&cfs_rq->avg, 1, cfs_rq);
+				scale_load_down(cfs_rq->runnable_weight),
+				cfs_rq->curr != NULL)) {
+
+		___update_load_avg(&cfs_rq->avg, 1, 1);
 		return 1;
 	}
 
@@ -3409,8 +3457,8 @@ static inline void
 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
 	long runnable_sum = gcfs_rq->prop_runnable_sum;
-	long load_avg;
-	s64 load_sum;
+	long runnable_load_avg, load_avg;
+	s64 runnable_load_sum, load_sum;
 
 	if (!runnable_sum)
 		return;
@@ -3426,9 +3474,15 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
 	add_positive(&cfs_rq->avg.load_avg, load_avg);
 	add_positive(&cfs_rq->avg.load_sum, load_sum);
 
+	runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
+	runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
+
+	add_positive(&se->avg.runnable_load_sum, runnable_sum);
+	add_positive(&se->avg.runnable_load_avg, runnable_load_avg);
+
 	if (se->on_rq) {
-		add_positive(&cfs_rq->runnable_load_avg, load_avg);
-		add_positive(&cfs_rq->runnable_load_sum, load_sum);
+		add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg);
+		add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum);
 	}
 }
 
@@ -3710,7 +3764,7 @@ void remove_entity_load_avg(struct sched_entity *se)
 
 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
 {
-	return cfs_rq->runnable_load_avg;
+	return cfs_rq->avg.runnable_load_avg;
 }
 
 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
@@ -3882,8 +3936,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 *   - Add its new weight to cfs_rq->load.weight
 	 */
 	update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
+	update_cfs_group(se);
 	enqueue_runnable_load_avg(cfs_rq, se);
-	update_cfs_shares(se);
 	account_entity_enqueue(cfs_rq, se);
 
 	if (flags & ENQUEUE_WAKEUP)
@@ -3989,7 +4043,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	/* return excess runtime on last dequeue */
 	return_cfs_rq_runtime(cfs_rq);
 
-	update_cfs_shares(se);
+	update_cfs_group(se);
 
 	/*
 	 * Now advance min_vruntime if @se was the entity holding it back,
@@ -4172,7 +4226,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 * Ensure that runnable average is periodically updated.
 	 */
 	update_load_avg(cfs_rq, curr, UPDATE_TG);
-	update_cfs_shares(curr);
+	update_cfs_group(curr);
 
 #ifdef CONFIG_SCHED_HRTICK
 	/*
@@ -5090,7 +5144,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			break;
 
 		update_load_avg(cfs_rq, se, UPDATE_TG);
-		update_cfs_shares(se);
+		update_cfs_group(se);
 	}
 
 	if (!se)
@@ -5149,7 +5203,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			break;
 
 		update_load_avg(cfs_rq, se, UPDATE_TG);
-		update_cfs_shares(se);
+		update_cfs_group(se);
 	}
 
 	if (!se)
@@ -7174,7 +7228,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 	if (cfs_rq->avg.util_sum)
 		return false;
 
-	if (cfs_rq->runnable_load_sum)
+	if (cfs_rq->avg.runnable_load_sum)
 		return false;
 
 	return true;
@@ -9692,7 +9746,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 		update_rq_clock(rq);
 		for_each_sched_entity(se) {
 			update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
-			update_cfs_shares(se);
+			update_cfs_group(se);
 		}
 		rq_unlock_irqrestore(rq, &rf);
 	}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5bcb86eb026b..e83d1b8be611 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -418,6 +418,7 @@ struct cfs_bandwidth { };
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
 	struct load_weight load;
+	unsigned long runnable_weight;
 	unsigned int nr_running, h_nr_running;
 
 	u64 exec_clock;
@@ -443,8 +444,6 @@ struct cfs_rq {
 	 * CFS load tracking
 	 */
 	struct sched_avg avg;
-	u64 runnable_load_sum;
-	unsigned long runnable_load_avg;
 #ifndef CONFIG_64BIT
 	u64 load_last_update_time_copy;
 #endif
-- 
cgit v1.2.3


From 144d8487bc6e9b741895709cb46d4e19b748a725 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 11 May 2017 17:57:24 +0200
Subject: sched/fair: Implement synchonous PELT detach on load-balance migrate

Vincent wondered why his self migrating task had a roughly 50% dip in
load_avg when landing on the new CPU. This is because we uncondionally
take the asynchronous detatch_entity route, which can lead to the
attach on the new CPU still seeing the old CPU's contribution to
tg->load_avg, effectively halving the new CPU's shares.

While in general this is something we have to live with, there is the
special case of runnable migration where we can do better.

Tested-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 10d2000fca2d..92dbcc0fea46 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3746,10 +3746,6 @@ void remove_entity_load_avg(struct sched_entity *se)
 	 * Similarly for groups, they will have passed through
 	 * post_init_entity_util_avg() before unregister_sched_fair_group()
 	 * calls this.
-	 *
-	 * XXX in case entity_is_task(se) && task_of(se)->on_rq == MIGRATING
-	 * we could actually get the right time, since we're called with
-	 * rq->lock held, see detach_task().
 	 */
 
 	sync_entity_load_avg(se);
@@ -6292,6 +6288,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 	return new_cpu;
 }
 
+static void detach_entity_cfs_rq(struct sched_entity *se);
+
 /*
  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
  * cfs_rq_of(p) references at time of call are still valid and identify the
@@ -6325,14 +6323,25 @@ static void migrate_task_rq_fair(struct task_struct *p)
 		se->vruntime -= min_vruntime;
 	}
 
-	/*
-	 * We are supposed to update the task to "current" time, then its up to date
-	 * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
-	 * what current time is, so simply throw away the out-of-date time. This
-	 * will result in the wakee task is less decayed, but giving the wakee more
-	 * load sounds not bad.
-	 */
-	remove_entity_load_avg(&p->se);
+	if (p->on_rq == TASK_ON_RQ_MIGRATING) {
+		/*
+		 * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
+		 * rq->lock and can modify state directly.
+		 */
+		lockdep_assert_held(&task_rq(p)->lock);
+		detach_entity_cfs_rq(&p->se);
+
+	} else {
+		/*
+		 * We are supposed to update the task to "current" time, then
+		 * its up to date and ready to go to new CPU/cfs_rq. But we
+		 * have difficulty in getting what current time is, so simply
+		 * throw away the out-of-date time. This will result in the
+		 * wakee task is less decayed, but giving the wakee more load
+		 * sounds not bad.
+		 */
+		remove_entity_load_avg(&p->se);
+	}
 
 	/* Tell new CPU we are migrated */
 	p->se.avg.last_update_time = 0;
-- 
cgit v1.2.3


From f207934fb79d1af1de1a62b09d56a3a1914172c4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 12 May 2017 14:16:30 +0200
Subject: sched/fair: Align PELT windows between cfs_rq and its se

The PELT _sum values are a saw-tooth function, dropping on the decay
edge and then growing back up again during the window.

When these window-edges are not aligned between cfs_rq and se, we can
have the situation where, for example, on dequeue, the se decays
first.

Its _sum values will be small(er), while the cfs_rq _sum values will
still be on their way up. Because of this, the subtraction:
cfs_rq->avg._sum -= se->avg._sum will result in a positive value. This
will then, once the cfs_rq reaches an edge, translate into its _avg
value jumping up.

This is especially visible with the runnable_load bits, since they get
added/subtracted a lot.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 45 +++++++++++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 92dbcc0fea46..954b332cd899 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -716,13 +716,8 @@ void init_entity_runnable_average(struct sched_entity *se)
 {
 	struct sched_avg *sa = &se->avg;
 
-	sa->last_update_time = 0;
-	/*
-	 * sched_avg's period_contrib should be strictly less then 1024, so
-	 * we give it 1023 to make sure it is almost a period (1024us), and
-	 * will definitely be update (after enqueue).
-	 */
-	sa->period_contrib = 1023;
+	memset(sa, 0, sizeof(*sa));
+
 	/*
 	 * Tasks are intialized with full load to be seen as heavy tasks until
 	 * they get a chance to stabilize to their real load level.
@@ -731,13 +726,9 @@ void init_entity_runnable_average(struct sched_entity *se)
 	 */
 	if (entity_is_task(se))
 		sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
-	sa->runnable_load_sum = sa->load_sum = LOAD_AVG_MAX;
 
-	/*
-	 * At this point, util_avg won't be used in select_task_rq_fair anyway
-	 */
-	sa->util_avg = 0;
-	sa->util_sum = 0;
+	se->runnable_weight = se->load.weight;
+
 	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
 }
 
@@ -785,7 +776,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
 		} else {
 			sa->util_avg = cap;
 		}
-		sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
 	}
 
 	if (entity_is_task(se)) {
@@ -3632,7 +3622,34 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
  */
 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+	u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+
+	/*
+	 * When we attach the @se to the @cfs_rq, we must align the decay
+	 * window because without that, really weird and wonderful things can
+	 * happen.
+	 *
+	 * XXX illustrate
+	 */
 	se->avg.last_update_time = cfs_rq->avg.last_update_time;
+	se->avg.period_contrib = cfs_rq->avg.period_contrib;
+
+	/*
+	 * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
+	 * period_contrib. This isn't strictly correct, but since we're
+	 * entirely outside of the PELT hierarchy, nobody cares if we truncate
+	 * _sum a little.
+	 */
+	se->avg.util_sum = se->avg.util_avg * divider;
+
+	se->avg.load_sum = divider;
+	if (se_weight(se)) {
+		se->avg.load_sum =
+			div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
+	}
+
+	se->avg.runnable_load_sum = se->avg.load_sum;
+
 	enqueue_load_avg(cfs_rq, se);
 	cfs_rq->avg.util_avg += se->avg.util_avg;
 	cfs_rq->avg.util_sum += se->avg.util_sum;
-- 
cgit v1.2.3


From 9a2dd585b2c431ec1e5d46a9d9568291c7a534cc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 12 May 2017 14:18:10 +0200
Subject: sched/fair: Implement more accurate async detach

The problem with the overestimate is that it will subtract too big a
value from the load_sum, thereby pushing it down further than it ought
to go. Since runnable_load_avg is not subject to a similar 'force',
this results in the occasional 'runnable_load > load' situation.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 954b332cd899..67c39642a512 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3574,6 +3574,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 
 	if (cfs_rq->removed.nr) {
 		unsigned long r;
+		u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
 
 		raw_spin_lock(&cfs_rq->removed.lock);
 		swap(cfs_rq->removed.util_avg, removed_util);
@@ -3582,17 +3583,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 		cfs_rq->removed.nr = 0;
 		raw_spin_unlock(&cfs_rq->removed.lock);
 
-		/*
-		 * The LOAD_AVG_MAX for _sum is a slight over-estimate,
-		 * which is safe due to sub_positive() clipping at 0.
-		 */
 		r = removed_load;
 		sub_positive(&sa->load_avg, r);
-		sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
+		sub_positive(&sa->load_sum, r * divider);
 
 		r = removed_util;
 		sub_positive(&sa->util_avg, r);
-		sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
+		sub_positive(&sa->util_sum, r * divider);
 
 		add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
 
-- 
cgit v1.2.3


From 2c8e4dce7963d2bae02db95fce2691365630685c Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Thu, 3 Aug 2017 11:13:39 -0400
Subject: sched/fair: Calculate runnable_weight slightly differently

Our runnable_weight currently looks like this

runnable_weight = shares * runnable_load_avg / load_avg

The goal is to scale the runnable weight for the group based on its runnable to
load_avg ratio.  The problem with this is it biases us towards tasks that never
go to sleep.  Tasks that go to sleep are going to have their runnable_load_avg
decayed pretty hard, which will drastically reduce the runnable weight of groups
with interactive tasks.  To solve this imbalance we tweak this slightly, so in
the ideal case it is still the above, but in the interactive case it is

runnable_weight = shares * runnable_weight / load_weight

which will make the weight distribution fairer between interactive and
non-interactive groups.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kernel-team@fb.com
Cc: linux-kernel@vger.kernel.org
Cc: riel@redhat.com
Cc: tj@kernel.org
Link: http://lkml.kernel.org/r/1501773219-18774-2-git-send-email-jbacik@fb.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 45 +++++++++++++++++++++++++++++++++------------
 1 file changed, 33 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 67c39642a512..a62098ec9deb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2883,7 +2883,7 @@ void reweight_task(struct task_struct *p, int prio)
  *
  * hence icky!
  */
-static long calc_cfs_shares(struct cfs_rq *cfs_rq)
+static long calc_group_shares(struct cfs_rq *cfs_rq)
 {
 	long tg_weight, tg_shares, load, shares;
 	struct task_group *tg = cfs_rq->tg;
@@ -2920,6 +2920,36 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq)
 	 */
 	return clamp_t(long, shares, MIN_SHARES, tg_shares);
 }
+
+/*
+ * The runnable shares of this group are calculated as such
+ *
+ *          max(cfs_rq->avg.runnable_load_avg, cfs_rq->runnable_weight)
+ * shares * ------------------------------------------------------------
+ *               max(cfs_rq->avg.load_avg, cfs_rq->load.weight)
+ *
+ * We do this to keep the shares in line with expected load on the cfs_rq.
+ * Consider a cfs_rq that has several tasks wake up on this cfs_rq for the first
+ * time, it's runnable_load_avg is not going to be representative of the actual
+ * load this cfs_rq will now experience, which will bias us agaisnt this cfs_rq.
+ * The weight on the cfs_rq is the immediate effect of having new tasks
+ * enqueue'd onto it which should be used to calculate the new runnable shares.
+ * At the same time we need the actual load_avg to be the lower bounds for the
+ * calculation, to handle when our weight drops quickly from having entities
+ * dequeued.
+ */
+static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
+{
+	long load_avg = max(cfs_rq->avg.load_avg,
+			    scale_load_down(cfs_rq->load.weight));
+	long runnable = max(cfs_rq->avg.runnable_load_avg,
+			    scale_load_down(cfs_rq->runnable_weight));
+
+	runnable *= shares;
+	if (load_avg)
+		runnable /= load_avg;
+	return clamp_t(long, runnable, MIN_SHARES, shares);
+}
 # endif /* CONFIG_SMP */
 
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
@@ -2945,17 +2975,8 @@ static void update_cfs_group(struct sched_entity *se)
 	if (likely(se->load.weight == shares))
 		return;
 #else
-	shares = calc_cfs_shares(gcfs_rq);
-	/*
-	 * The hierarchical runnable load metric is the proportional part
-	 * of this group's runnable_load_avg / load_avg.
-	 *
-	 * Note: we need to deal with very sporadic 'runnable > load' cases
-	 * due to numerical instability.
-	 */
-	runnable = shares * gcfs_rq->avg.runnable_load_avg;
-	if (runnable)
-		runnable /= max(gcfs_rq->avg.load_avg, gcfs_rq->avg.runnable_load_avg);
+	shares   = calc_group_shares(gcfs_rq);
+	runnable = calc_group_runnable(gcfs_rq, shares);
 #endif
 
 	reweight_entity(cfs_rq_of(se), se, shares, runnable);
-- 
cgit v1.2.3


From 17de4ee04ca925590df036b112c1db8a778e14bf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 24 Aug 2017 13:06:35 +0200
Subject: sched/fair: Update calc_group_*() comments

I had a wee bit of trouble recalling how the calc_group_runnable()
stuff worked.. add hopefully better comments.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 66 +++++++++++++++++++++++++++++++++++------------------
 1 file changed, 44 insertions(+), 22 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a62098ec9deb..350dbec01523 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2860,7 +2860,7 @@ void reweight_task(struct task_struct *p, int prio)
  * Now, in that special case (1) reduces to:
  *
  *                     tg->weight * grq->load.weight
- *   ge->load.weight = ----------------------------- = tg>weight   (4)
+ *   ge->load.weight = ----------------------------- = tg->weight   (4)
  *			    grp->load.weight
  *
  * That is, the sum collapses because all other CPUs are idle; the UP scenario.
@@ -2874,6 +2874,18 @@ void reweight_task(struct task_struct *p, int prio)
  *     ---------------------------------------------------         (5)
  *     tg->load_avg - grq->avg.load_avg + grq->load.weight
  *
+ * But because grq->load.weight can drop to 0, resulting in a divide by zero,
+ * we need to use grq->avg.load_avg as its lower bound, which then gives:
+ *
+ *
+ *                     tg->weight * grq->load.weight
+ *   ge->load.weight = -----------------------------		   (6)
+ *				tg_load_avg'
+ *
+ * Where:
+ *
+ *   tg_load_avg' = tg->load_avg - grq->avg.load_avg +
+ *                  max(grq->load.weight, grq->avg.load_avg)
  *
  * And that is shares_weight and is icky. In the (near) UP case it approaches
  * (4) while in the normal case it approaches (3). It consistently
@@ -2890,10 +2902,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)
 
 	tg_shares = READ_ONCE(tg->shares);
 
-	/*
-	 * Because (5) drops to 0 when the cfs_rq is idle, we need to use (3)
-	 * as a lower bound.
-	 */
 	load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
 
 	tg_weight = atomic_long_read(&tg->load_avg);
@@ -2922,32 +2930,46 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)
 }
 
 /*
- * The runnable shares of this group are calculated as such
+ * This calculates the effective runnable weight for a group entity based on
+ * the group entity weight calculated above.
+ *
+ * Because of the above approximation (2), our group entity weight is
+ * an load_avg based ratio (3). This means that it includes blocked load and
+ * does not represent the runnable weight.
  *
- *          max(cfs_rq->avg.runnable_load_avg, cfs_rq->runnable_weight)
- * shares * ------------------------------------------------------------
- *               max(cfs_rq->avg.load_avg, cfs_rq->load.weight)
+ * Approximate the group entity's runnable weight per ratio from the group
+ * runqueue:
  *
- * We do this to keep the shares in line with expected load on the cfs_rq.
- * Consider a cfs_rq that has several tasks wake up on this cfs_rq for the first
- * time, it's runnable_load_avg is not going to be representative of the actual
- * load this cfs_rq will now experience, which will bias us agaisnt this cfs_rq.
- * The weight on the cfs_rq is the immediate effect of having new tasks
- * enqueue'd onto it which should be used to calculate the new runnable shares.
- * At the same time we need the actual load_avg to be the lower bounds for the
- * calculation, to handle when our weight drops quickly from having entities
- * dequeued.
+ *					     grq->avg.runnable_load_avg
+ *   ge->runnable_weight = ge->load.weight * -------------------------- (7)
+ *						 grq->avg.load_avg
+ *
+ * However, analogous to above, since the avg numbers are slow, this leads to
+ * transients in the from-idle case. Instead we use:
+ *
+ *   ge->runnable_weight = ge->load.weight *
+ *
+ *		max(grq->avg.runnable_load_avg, grq->runnable_weight)
+ *		-----------------------------------------------------	(8)
+ *		      max(grq->avg.load_avg, grq->load.weight)
+ *
+ * Where these max() serve both to use the 'instant' values to fix the slow
+ * from-idle and avoid the /0 on to-idle, similar to (6).
  */
 static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
 {
-	long load_avg = max(cfs_rq->avg.load_avg,
-			    scale_load_down(cfs_rq->load.weight));
-	long runnable = max(cfs_rq->avg.runnable_load_avg,
-			    scale_load_down(cfs_rq->runnable_weight));
+	long runnable, load_avg;
+
+	load_avg = max(cfs_rq->avg.load_avg,
+		       scale_load_down(cfs_rq->load.weight));
+
+	runnable = max(cfs_rq->avg.runnable_load_avg,
+		       scale_load_down(cfs_rq->runnable_weight));
 
 	runnable *= shares;
 	if (load_avg)
 		runnable /= load_avg;
+
 	return clamp_t(long, runnable, MIN_SHARES, shares);
 }
 # endif /* CONFIG_SMP */
-- 
cgit v1.2.3


From 62cb1188ed86a9cf082fd2f757d4dd9b54741f24 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 29 Aug 2017 15:07:54 +0200
Subject: sched/idle: Move quiet_vmstate() into the NOHZ code

quiet_vmstat() is an expensive function that only makes sense when we
go into NOHZ.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: aubrey.li@linux.intel.com
Cc: cl@linux.com
Cc: fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/idle.c      | 1 -
 kernel/time/tick-sched.c | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 257f4f0b4532..b2e8f0afbb42 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -219,7 +219,6 @@ static void do_idle(void)
 	 */
 
 	__current_set_polling();
-	quiet_vmstat();
 	tick_nohz_idle_enter();
 
 	while (!need_resched()) {
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c7a899c5ce64..7b258c59d78a 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -27,6 +27,7 @@
 #include <linux/irq_work.h>
 #include <linux/posix-timers.h>
 #include <linux/context_tracking.h>
+#include <linux/mm.h>
 
 #include <asm/irq_regs.h>
 
@@ -787,6 +788,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	if (!ts->tick_stopped) {
 		calc_load_nohz_start();
 		cpu_load_update_nohz_start();
+		quiet_vmstat();
 
 		ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
 		ts->tick_stopped = 1;
-- 
cgit v1.2.3


From 1d48b080bcce0a5e7d7aa2dbcdb35deefc188c3f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 29 Sep 2017 13:50:16 +0200
Subject: sched/debug: Rename task-state printing helpers

Steve requested better names for the new task-state helper functions.

So introduce the concept of task-state index for the printing and
rename __get_task_state() to task_state_index() and
__task_state_to_char() to task_index_to_char().

Requested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170929115016.pzlqc7ss3ccystyg@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 fs/proc/array.c                   |  2 +-
 include/linux/sched.h             |  6 +++---
 include/trace/events/sched.h      |  2 +-
 kernel/trace/trace_output.c       | 12 ++++++------
 kernel/trace/trace_sched_wakeup.c |  8 ++++----
 5 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 77a8eacbe032..1b5406ca8e4c 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -137,7 +137,7 @@ static const char * const task_state_array[] = {
 static inline const char *get_task_state(struct task_struct *tsk)
 {
 	BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array));
-	return task_state_array[__get_task_state(tsk)];
+	return task_state_array[task_state_index(tsk)];
 }
 
 static inline int get_task_umask(struct task_struct *tsk)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index bdd6ad6fcce1..33a01f4deb00 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1248,7 +1248,7 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
 #define TASK_REPORT_IDLE	(TASK_REPORT + 1)
 #define TASK_REPORT_MAX		(TASK_REPORT_IDLE << 1)
 
-static inline unsigned int __get_task_state(struct task_struct *tsk)
+static inline unsigned int task_state_index(struct task_struct *tsk)
 {
 	unsigned int tsk_state = READ_ONCE(tsk->state);
 	unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;
@@ -1261,7 +1261,7 @@ static inline unsigned int __get_task_state(struct task_struct *tsk)
 	return fls(state);
 }
 
-static inline char __task_state_to_char(unsigned int state)
+static inline char task_index_to_char(unsigned int state)
 {
 	static const char state_char[] = "RSDTtXZPI";
 
@@ -1272,7 +1272,7 @@ static inline char __task_state_to_char(unsigned int state)
 
 static inline char task_state_to_char(struct task_struct *tsk)
 {
-	return __task_state_to_char(__get_task_state(tsk));
+	return task_index_to_char(task_state_index(tsk));
 }
 
 /**
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 3c8b7f625670..fab74a12ca0f 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -117,7 +117,7 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
 	if (preempt)
 		return TASK_STATE_MAX;
 
-	return __get_task_state(p);
+	return task_state_index(p);
 }
 #endif /* CREATE_TRACE_POINTS */
 
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index c738e764e2a5..90db994ac900 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -921,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
 
 	trace_assign_type(field, iter->ent);
 
-	T = __task_state_to_char(field->next_state);
-	S = __task_state_to_char(field->prev_state);
+	T = task_index_to_char(field->next_state);
+	S = task_index_to_char(field->prev_state);
 	trace_find_cmdline(field->next_pid, comm);
 	trace_seq_printf(&iter->seq,
 			 " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
@@ -957,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
 	trace_assign_type(field, iter->ent);
 
 	if (!S)
-		S = __task_state_to_char(field->prev_state);
-	T = __task_state_to_char(field->next_state);
+		S = task_index_to_char(field->prev_state);
+	T = task_index_to_char(field->next_state);
 	trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
 			 field->prev_pid,
 			 field->prev_prio,
@@ -993,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
 	trace_assign_type(field, iter->ent);
 
 	if (!S)
-		S = __task_state_to_char(field->prev_state);
-	T = __task_state_to_char(field->next_state);
+		S = task_index_to_char(field->prev_state);
+	T = task_index_to_char(field->next_state);
 
 	SEQ_PUT_HEX_FIELD(s, field->prev_pid);
 	SEQ_PUT_HEX_FIELD(s, field->prev_prio);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0c331978b1a6..500f370d3bb1 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -397,10 +397,10 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry	= ring_buffer_event_data(event);
 	entry->prev_pid			= prev->pid;
 	entry->prev_prio		= prev->prio;
-	entry->prev_state		= __get_task_state(prev);
+	entry->prev_state		= task_state_index(prev);
 	entry->next_pid			= next->pid;
 	entry->next_prio		= next->prio;
-	entry->next_state		= __get_task_state(next);
+	entry->next_state		= task_state_index(next);
 	entry->next_cpu	= task_cpu(next);
 
 	if (!call_filter_check_discard(call, entry, buffer, event))
@@ -425,10 +425,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry	= ring_buffer_event_data(event);
 	entry->prev_pid			= curr->pid;
 	entry->prev_prio		= curr->prio;
-	entry->prev_state		= __get_task_state(curr);
+	entry->prev_state		= task_state_index(curr);
 	entry->next_pid			= wakee->pid;
 	entry->next_prio		= wakee->prio;
-	entry->next_state		= __get_task_state(wakee);
+	entry->next_state		= task_state_index(wakee);
 	entry->next_cpu			= task_cpu(wakee);
 
 	if (!call_filter_check_discard(call, entry, buffer, event))
-- 
cgit v1.2.3


From e964d3501b64d6930aaa4dd18955a8cd086ccb92 Mon Sep 17 00:00:00 2001
From: luca abeni <luca.abeni@santannapisa.it>
Date: Thu, 7 Sep 2017 12:09:28 +0200
Subject: sched/headers: Remove duplicate prototype of __dl_clear_params()

Signed-off-by: luca abeni <luca.abeni@santannapisa.it>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1504778971-13573-2-git-send-email-luca.abeni@santannapisa.it
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/sched.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e83d1b8be611..9d5aa18fc9bf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -255,7 +255,6 @@ extern int sched_dl_overflow(struct task_struct *p, int policy,
 extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
 extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
 extern bool __checkparam_dl(const struct sched_attr *attr);
-extern void __dl_clear_params(struct task_struct *p);
 extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
 extern int dl_task_can_attach(struct task_struct *p,
 			      const struct cpumask *cs_cpus_allowed);
-- 
cgit v1.2.3


From 295d6d5e373607729bcc8182c25afe964655714f Mon Sep 17 00:00:00 2001
From: Luca Abeni <luca.abeni@santannapisa.it>
Date: Thu, 7 Sep 2017 12:09:29 +0200
Subject: sched/deadline: Fix switching to -deadline

Fix a bug introduced in:

  72f9f3fdc928 ("sched/deadline: Remove dl_new from struct sched_dl_entity")

After that commit, when switching to -deadline if the scheduling
deadline of a task is in the past then switched_to_dl() calls
setup_new_entity() to properly initialize the scheduling deadline
and runtime.

The problem is that the task is enqueued _before_ having its parameters
initialized by setup_new_entity(), and this can cause problems.
For example, a task with its out-of-date deadline in the past will
potentially be enqueued as the highest priority one; however, its
adjusted deadline may not be the earliest one.

This patch fixes the problem by initializing the task's parameters before
enqueuing it.

Signed-off-by: luca abeni <luca.abeni@santannapisa.it>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1504778971-13573-3-git-send-email-luca.abeni@santannapisa.it
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0191ec7667c3..9d45354e4296 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1364,6 +1364,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
 		update_dl_entity(dl_se, pi_se);
 	} else if (flags & ENQUEUE_REPLENISH) {
 		replenish_dl_entity(dl_se, pi_se);
+	} else if ((flags & ENQUEUE_RESTORE) &&
+		  dl_time_before(dl_se->deadline,
+				 rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
+		setup_new_dl_entity(dl_se);
 	}
 
 	__enqueue_dl_entity(dl_se);
@@ -2255,13 +2259,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
 
 		return;
 	}
-	/*
-	 * If p is boosted we already updated its params in
-	 * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
-	 * p's deadline being now already after rq_clock(rq).
-	 */
-	if (dl_time_before(p->dl.deadline, rq_clock(rq)))
-		setup_new_dl_entity(&p->dl);
 
 	if (rq->curr != p) {
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3


From 8c0944cee7af55291df0b28e6e2eeac0930e93c9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 7 Sep 2017 12:09:30 +0200
Subject: sched/deadline: Rename __dl_clear() to __dl_sub()

__dl_sub() is more meaningful as a name, and is more consistent
with the naming of the dual function (__dl_add()).

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Luca Abeni <luca.abeni@santannapisa.it>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1504778971-13573-4-git-send-email-luca.abeni@santannapisa.it
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 10 +++++-----
 kernel/sched/sched.h    |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 9d45354e4296..8d1b946fa684 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -242,7 +242,7 @@ static void task_non_contending(struct task_struct *p)
 			if (p->state == TASK_DEAD)
 				sub_rq_bw(p->dl.dl_bw, &rq->dl);
 			raw_spin_lock(&dl_b->lock);
-			__dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+			__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
 			__dl_clear_params(p);
 			raw_spin_unlock(&dl_b->lock);
 		}
@@ -1209,7 +1209,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
 		}
 
 		raw_spin_lock(&dl_b->lock);
-		__dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+		__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
 		raw_spin_unlock(&dl_b->lock);
 		__dl_clear_params(p);
 
@@ -2170,7 +2170,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
 		 * until we complete the update.
 		 */
 		raw_spin_lock(&src_dl_b->lock);
-		__dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+		__dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
 		raw_spin_unlock(&src_dl_b->lock);
 	}
 
@@ -2448,7 +2448,7 @@ int sched_dl_overflow(struct task_struct *p, int policy,
 	if (dl_policy(policy) && !task_has_dl_policy(p) &&
 	    !__dl_overflow(dl_b, cpus, 0, new_bw)) {
 		if (hrtimer_active(&p->dl.inactive_timer))
-			__dl_clear(dl_b, p->dl.dl_bw, cpus);
+			__dl_sub(dl_b, p->dl.dl_bw, cpus);
 		__dl_add(dl_b, new_bw, cpus);
 		err = 0;
 	} else if (dl_policy(policy) && task_has_dl_policy(p) &&
@@ -2460,7 +2460,7 @@ int sched_dl_overflow(struct task_struct *p, int policy,
 		 * But this would require to set the task's "inactive
 		 * timer" when the task is not inactive.
 		 */
-		__dl_clear(dl_b, p->dl.dl_bw, cpus);
+		__dl_sub(dl_b, p->dl.dl_bw, cpus);
 		__dl_add(dl_b, new_bw, cpus);
 		dl_change_utilization(p, new_bw);
 		err = 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9d5aa18fc9bf..a81c9782e98c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -226,7 +226,7 @@ struct dl_bw {
 static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
 
 static inline
-void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
+void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
 {
 	dl_b->total_bw -= tsk_bw;
 	__dl_update(dl_b, (s32)tsk_bw / cpus);
-- 
cgit v1.2.3


From 799ba82de01e7543f6b2042e1a739f3a20255f23 Mon Sep 17 00:00:00 2001
From: luca abeni <luca.abeni@santannapisa.it>
Date: Thu, 7 Sep 2017 12:09:31 +0200
Subject: sched/deadline: Use C bitfields for the state flags

Ask the compiler to use a single bit for storing true / false values,
instead of wasting the size of a whole int value.
Tested with gcc 5.4.0 on x86_64, and the compiler produces the expected
Assembly (similar to the Assembly code generated when explicitly accessing
the bits with bitmasks, "&" and "|").

Signed-off-by: luca abeni <luca.abeni@santannapisa.it>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1504778971-13573-5-git-send-email-luca.abeni@santannapisa.it
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 33a01f4deb00..0f897dfc195e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -474,10 +474,10 @@ struct sched_dl_entity {
 	 * conditions between the inactive timer handler and the wakeup
 	 * code.
 	 */
-	int				dl_throttled;
-	int				dl_boosted;
-	int				dl_yielded;
-	int				dl_non_contending;
+	int				dl_throttled      : 1;
+	int				dl_boosted        : 1;
+	int				dl_yielded        : 1;
+	int				dl_non_contending : 1;
 
 	/*
 	 * Bandwidth enforcement timer. Each -deadline task has its
-- 
cgit v1.2.3


From ed4ad1ca08a53cf1a805478678d1e7ff0d2cf251 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 2 Oct 2017 14:50:33 +0200
Subject: sched/topology: Restore SD_PREFER_SIBLING on MC domains

The normal x86_topology on NHM+ machines degenerates because the MC
and CPU domains are of the same size, therefore MC inherits
SD_PREFER_SIBLING from CPU (which then gets taken out). The result is
that we'll spread tasks across the first NUMA level in order to
maximize cache utilization.

However, for the x86_numa_in_package_topology we loose the CPU domain,
and we'll not have SD_PREFER_SIBLING set anywhere, giving a distinct
difference in behaviour.

Commit:

  8e7fbcbc22c1 ("sched: Remove stale power aware scheduling remnants and dysfunctional knobs")

made a fail by not preserving the SD_PREFER_SIBLING for the !power_saving
case on both CPU and MC.

Then commit:

  6956dc568f34 ("sched/numa: Add SD_PERFER_SIBLING to CPU domain")

adds it back to the CPU but not MC.

Restore that now, such that we get consistent spreading behaviour wrt
L3 and NUMA.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/topology.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f1cf4f306a82..86e81f06d36b 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1157,6 +1157,7 @@ sd_init(struct sched_domain_topology_level *tl,
 		sd->smt_gain = 1178; /* ~15% */
 
 	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+		sd->flags |= SD_PREFER_SIBLING;
 		sd->imbalance_pct = 117;
 		sd->cache_nice_tries = 1;
 		sd->busy_idx = 2;
-- 
cgit v1.2.3


From 051f3ca02e46432c0965e8948f00c07d8a2f09c0 Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Date: Thu, 7 Sep 2017 02:20:05 -0500
Subject: sched/topology: Introduce NUMA identity node sched domain

On AMD Family17h-based (EPYC) system, a logical NUMA node can contain
upto 8 cores (16 threads) with the following topology.

             ----------------------------
         C0  | T0 T1 |    ||    | T0 T1 | C4
             --------|    ||    |--------
         C1  | T0 T1 | L3 || L3 | T0 T1 | C5
             --------|    ||    |--------
         C2  | T0 T1 | #0 || #1 | T0 T1 | C6
             --------|    ||    |--------
         C3  | T0 T1 |    ||    | T0 T1 | C7
             ----------------------------

Here, there are 2 last-level (L3) caches per logical NUMA node.
A socket can contain upto 4 NUMA nodes, and a system can support
upto 2 sockets. With full system configuration, current scheduler
creates 4 sched domains:

  domain0 SMT       (span a core)
  domain1 MC        (span a last-level-cache)
  domain2 NUMA      (span a socket: 4 nodes)
  domain3 NUMA      (span a system: 8 nodes)

Note that there is no domain to represent cpus spaning a logical
NUMA node.  With this hierarchy of sched domains, the scheduler does
not balance properly in the following cases:

Case1:

 When running 8 tasks, a properly balanced system should
 schedule a task per logical NUMA node. This is not the case for
 the current scheduler.

Case2:

 In some cases, threads are scheduled on the same cpu, while other
 cpus are idle. This results in run-to-run inconsistency. For example:

  taskset -c 0-7 sysbench --num-threads=8 --test=cpu \
                          --cpu-max-prime=100000 run

Total execution time ranges from 25.1s to 33.5s depending on threads
placement, where 25.1s is when all 8 threads are balanced properly
on 8 cpus.

Introducing NUMA identity node sched domain, which is based on how
SRAT/SLIT table define a logical NUMA node. This results in the following
hierarchy of sched domains on the same system described above.

  domain0 SMT       (span a core)
  domain1 MC        (span a last-level-cache)
  domain2 NODE      (span a logical NUMA node)
  domain3 NUMA      (span a socket: 4 nodes)
  domain4 NUMA      (span a system: 8 nodes)

This fixes the improper load balancing cases mentioned above.

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bp@suse.de
Link: http://lkml.kernel.org/r/1504768805-46716-1-git-send-email-suravee.suthikulpanit@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/topology.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 86e81f06d36b..f51d123f9fe1 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1332,6 +1332,10 @@ void sched_init_numa(void)
 	if (!sched_domains_numa_distance)
 		return;
 
+	/* Includes NUMA identity node at level 0. */
+	sched_domains_numa_distance[level++] = curr_distance;
+	sched_domains_numa_levels = level;
+
 	/*
 	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
 	 * unique distances in the node_distance() table.
@@ -1379,8 +1383,7 @@ void sched_init_numa(void)
 		return;
 
 	/*
-	 * 'level' contains the number of unique distances, excluding the
-	 * identity distance node_distance(i,i).
+	 * 'level' contains the number of unique distances
 	 *
 	 * The sched_domains_numa_distance[] array includes the actual distance
 	 * numbers.
@@ -1441,10 +1444,19 @@ void sched_init_numa(void)
 	for (i = 0; sched_domain_topology[i].mask; i++)
 		tl[i] = sched_domain_topology[i];
 
+	/*
+	 * Add the NUMA identity distance, aka single NODE.
+	 */
+	tl[i++] = (struct sched_domain_topology_level){
+		.mask = sd_numa_mask,
+		.numa_level = 0,
+		SD_INIT_NAME(NODE)
+	};
+
 	/*
 	 * .. and append 'j' levels of NUMA goodness.
 	 */
-	for (j = 0; j < level; i++, j++) {
+	for (j = 1; j < level; i++, j++) {
 		tl[i] = (struct sched_domain_topology_level){
 			.mask = sd_numa_mask,
 			.sd_flags = cpu_numa_flags,
-- 
cgit v1.2.3


From 93824900a2e242766f5fe6ae7697e3d7171aa234 Mon Sep 17 00:00:00 2001
From: Uladzislau Rezki <urezki@gmail.com>
Date: Wed, 13 Sep 2017 12:24:30 +0200
Subject: sched/fair: Search a task from the tail of the queue

As a first step this patch makes cfs_tasks list as MRU one.
It means, that when a next task is picked to run on physical
CPU it is moved to the front of the list.

Therefore, the cfs_tasks list is more or less sorted (except
woken tasks) starting from recently given CPU time tasks toward
tasks with max wait time in a run-queue, i.e. MRU list.

Second, as part of the load balance operation, this approach
starts detach_tasks()/detach_one_task() from the tail of the
queue instead of the head, giving some advantages:

 - tends to pick a task with highest wait time;

 - tasks located in the tail are less likely cache-hot,
   therefore the can_migrate_task() decision is higher.

hackbench illustrates slightly better performance. For example
doing 1000 samples and 40 groups on i5-3320M CPU, it shows below
figures:

 default: 0.657 avg
 patched: 0.646 avg

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Kirill Tkhai <tkhai@yandex.ru>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sonymobile.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Link: http://lkml.kernel.org/r/20170913102430.8985-2-urezki@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ac6602c5f36f..cc0bfb035df9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6628,10 +6628,7 @@ again:
 		set_next_entity(cfs_rq, se);
 	}
 
-	if (hrtick_enabled(rq))
-		hrtick_start_fair(rq, p);
-
-	return p;
+	goto done;
 simple:
 #endif
 
@@ -6645,6 +6642,16 @@ simple:
 
 	p = task_of(se);
 
+done: __maybe_unused
+#ifdef CONFIG_SMP
+	/*
+	 * Move the next running task to the front of
+	 * the list, so our cfs_tasks list becomes MRU
+	 * one.
+	 */
+	list_move(&p->se.group_node, &rq->cfs_tasks);
+#endif
+
 	if (hrtick_enabled(rq))
 		hrtick_start_fair(rq, p);
 
@@ -7080,11 +7087,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
  */
 static struct task_struct *detach_one_task(struct lb_env *env)
 {
-	struct task_struct *p, *n;
+	struct task_struct *p;
 
 	lockdep_assert_held(&env->src_rq->lock);
 
-	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+	list_for_each_entry_reverse(p,
+			&env->src_rq->cfs_tasks, se.group_node) {
 		if (!can_migrate_task(p, env))
 			continue;
 
@@ -7130,7 +7138,7 @@ static int detach_tasks(struct lb_env *env)
 		if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
 			break;
 
-		p = list_first_entry(tasks, struct task_struct, se.group_node);
+		p = list_last_entry(tasks, struct task_struct, se.group_node);
 
 		env->loop++;
 		/* We've more or less seen every task there is, call it quits */
@@ -7180,7 +7188,7 @@ static int detach_tasks(struct lb_env *env)
 
 		continue;
 next:
-		list_move_tail(&p->se.group_node, tasks);
+		list_move(&p->se.group_node, tasks);
 	}
 
 	/*
-- 
cgit v1.2.3


From ea16f0ea6c3dc9e1aa083bd3d1792ba02860526e Mon Sep 17 00:00:00 2001
From: Brendan Jackman <brendan.jackman@arm.com>
Date: Thu, 5 Oct 2017 11:55:51 +0100
Subject: sched/fair: Sync task util before slow-path wakeup

We use task_util() in find_idlest_group() via capacity_spare_wake().
This task_util() updated in wake_cap(). However wake_cap() is not the
only reason for ending up in find_idlest_group() - we could have been sent
there by wake_wide(). So explicitly sync the task util with prev_cpu
when we are about to head to find_idlest_group().

We could simply do this at the beginning of
select_task_rq_fair() (i.e. irrespective of whether we're heading to
select_idle_sibling() or find_idlest_group() & co), but I didn't want to
slow down the select_idle_sibling() path more than necessary.

Don't do this during fork balancing, we won't need the task_util and
we'd just clobber the last_update_time, which is supposed to be 0.

Signed-off-by: Brendan Jackman <brendan.jackman@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andres Oportus <andresoportus@google.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Link: http://lkml.kernel.org/r/20170808095519.10077-1-brendan.jackman@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cc0bfb035df9..c04a42556a59 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6263,8 +6263,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 			new_cpu = cpu;
 	}
 
+	if (sd && !(sd_flag & SD_BALANCE_FORK)) {
+		/*
+		 * We're going to need the task's util for capacity_spare_wake
+		 * in find_idlest_group. Sync it up to prev_cpu's
+		 * last_update_time.
+		 */
+		sync_entity_load_avg(&p->se);
+	}
+
 	if (!sd) {
- pick_cpu:
+pick_cpu:
 		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
 			new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 
-- 
cgit v1.2.3


From 583ffd99d7657755736d831bbc182612d1d2697d Mon Sep 17 00:00:00 2001
From: Brendan Jackman <brendan.jackman@arm.com>
Date: Thu, 5 Oct 2017 11:58:54 +0100
Subject: sched/fair: Force balancing on NOHZ balance if local group has
 capacity

The "goto force_balance" here is intended to mitigate the fact that
avg_load calculations can result in bad placement decisions when
priority is asymmetrical.

The original commit that adds it:

  fab476228ba3 ("sched: Force balancing on newidle balance if local group has capacity")

explains:

    Under certain situations, such as a niced down task (i.e. nice =
    -15) in the presence of nr_cpus NICE0 tasks, the niced task lands
    on a sched group and kicks away other tasks because of its large
    weight. This leads to sub-optimal utilization of the
    machine. Even though the sched group has capacity, it does not
    pull tasks because sds.this_load >> sds.max_load, and f_b_g()
    returns NULL.

A similar but inverted issue also affects ARM big.LITTLE (asymmetrical CPU
capacity) systems - consider 8 always-running, same-priority tasks on a
system with 4 "big" and 4 "little" CPUs. Suppose that 5 of them end up on
the "big" CPUs (which will be represented by one sched_group in the DIE
sched_domain) and 3 on the "little" (the other sched_group in DIE), leaving
one CPU unused. Because the "big" group has a higher group_capacity its
avg_load may not present an imbalance that would cause migrating a
task to the idle "little".

The force_balance case here solves the problem but currently only for
CPU_NEWLY_IDLE balances, which in theory might never happen on the
unused CPU. Including CPU_IDLE in the force_balance case means
there's an upper bound on the time before we can attempt to solve the
underutilization: after DIE's sd->balance_interval has passed the
next nohz balance kick will help us out.

Signed-off-by: Brendan Jackman <brendan.jackman@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170807163900.25180-1-brendan.jackman@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c04a42556a59..cf3e816db80f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8186,8 +8186,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	if (busiest->group_type == group_imbalanced)
 		goto force_balance;
 
-	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
-	if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
+	/*
+	 * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
+	 * capacities from resulting in underutilization due to avg_load.
+	 */
+	if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
 	    busiest->group_no_capacity)
 		goto force_balance;
 
-- 
cgit v1.2.3


From 18bd1b4bd53aba81d76d55e91a68310a227dc187 Mon Sep 17 00:00:00 2001
From: Brendan Jackman <brendan.jackman@arm.com>
Date: Thu, 5 Oct 2017 12:45:12 +0100
Subject: sched/fair: Move select_task_rq_fair() slow-path into its own
 function

In preparation for changes that would otherwise require adding a new
level of indentation to the while(sd) loop, create a new function
find_idlest_cpu() which contains this loop, and rename the existing
find_idlest_cpu() to find_idlest_group_cpu().

Code inside the while(sd) loop is unchanged. @new_cpu is added as a
variable in the new function, with the same initial value as the
@new_cpu in select_task_rq_fair().

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Brendan Jackman <brendan.jackman@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171005114516.18617-2-brendan.jackman@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 83 +++++++++++++++++++++++++++++++----------------------
 1 file changed, 48 insertions(+), 35 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cf3e816db80f..dd4f25305604 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5859,10 +5859,10 @@ skip_spare:
 }
 
 /*
- * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
  */
 static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 {
 	unsigned long load, min_load = ULONG_MAX;
 	unsigned int min_exit_latency = UINT_MAX;
@@ -5911,6 +5911,50 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
 }
 
+static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
+				  int cpu, int prev_cpu, int sd_flag)
+{
+	int new_cpu = prev_cpu;
+
+	while (sd) {
+		struct sched_group *group;
+		struct sched_domain *tmp;
+		int weight;
+
+		if (!(sd->flags & sd_flag)) {
+			sd = sd->child;
+			continue;
+		}
+
+		group = find_idlest_group(sd, p, cpu, sd_flag);
+		if (!group) {
+			sd = sd->child;
+			continue;
+		}
+
+		new_cpu = find_idlest_group_cpu(group, p, cpu);
+		if (new_cpu == -1 || new_cpu == cpu) {
+			/* Now try balancing at a lower domain level of cpu */
+			sd = sd->child;
+			continue;
+		}
+
+		/* Now try balancing at a lower domain level of new_cpu */
+		cpu = new_cpu;
+		weight = sd->span_weight;
+		sd = NULL;
+		for_each_domain(cpu, tmp) {
+			if (weight <= tmp->span_weight)
+				break;
+			if (tmp->flags & sd_flag)
+				sd = tmp;
+		}
+		/* while loop will break here if sd == NULL */
+	}
+
+	return new_cpu;
+}
+
 #ifdef CONFIG_SCHED_SMT
 
 static inline void set_idle_cores(int cpu, int val)
@@ -6277,39 +6321,8 @@ pick_cpu:
 		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
 			new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 
-	} else while (sd) {
-		struct sched_group *group;
-		int weight;
-
-		if (!(sd->flags & sd_flag)) {
-			sd = sd->child;
-			continue;
-		}
-
-		group = find_idlest_group(sd, p, cpu, sd_flag);
-		if (!group) {
-			sd = sd->child;
-			continue;
-		}
-
-		new_cpu = find_idlest_cpu(group, p, cpu);
-		if (new_cpu == -1 || new_cpu == cpu) {
-			/* Now try balancing at a lower domain level of cpu */
-			sd = sd->child;
-			continue;
-		}
-
-		/* Now try balancing at a lower domain level of new_cpu */
-		cpu = new_cpu;
-		weight = sd->span_weight;
-		sd = NULL;
-		for_each_domain(cpu, tmp) {
-			if (weight <= tmp->span_weight)
-				break;
-			if (tmp->flags & sd_flag)
-				sd = tmp;
-		}
-		/* while loop will break here if sd == NULL */
+	} else {
+		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
 	}
 	rcu_read_unlock();
 
-- 
cgit v1.2.3


From e90381eaecf6d59c60fe396838e0e99789531419 Mon Sep 17 00:00:00 2001
From: Brendan Jackman <brendan.jackman@arm.com>
Date: Thu, 5 Oct 2017 12:45:13 +0100
Subject: sched/fair: Remove unnecessary comparison with -1

Since commit:

  83a0a96a5f26 ("sched/fair: Leverage the idle state info when choosing the "idlest" cpu")

find_idlest_group_cpu() (formerly find_idlest_cpu) no longer returns -1,
so we can simplify the checking of the return value in find_idlest_cpu().

Signed-off-by: Brendan Jackman <brendan.jackman@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171005114516.18617-3-brendan.jackman@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dd4f25305604..bdd28fbad7a3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5933,7 +5933,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 		}
 
 		new_cpu = find_idlest_group_cpu(group, p, cpu);
-		if (new_cpu == -1 || new_cpu == cpu) {
+		if (new_cpu == cpu) {
 			/* Now try balancing at a lower domain level of cpu */
 			sd = sd->child;
 			continue;
-- 
cgit v1.2.3


From 0d10ab952e99f3e9f374898e93f45452b81e5711 Mon Sep 17 00:00:00 2001
From: Brendan Jackman <brendan.jackman@arm.com>
Date: Thu, 5 Oct 2017 12:45:14 +0100
Subject: sched/fair: Fix find_idlest_group() when local group is not allowed

When the local group is not allowed we do not modify this_*_load from
their initial value of 0. That means that the load checks at the end
of find_idlest_group cause us to incorrectly return NULL. Fixing the
initial values to ULONG_MAX means we will instead return the idlest
remote group in that case.

Signed-off-by: Brendan Jackman <brendan.jackman@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171005114516.18617-4-brendan.jackman@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bdd28fbad7a3..cca1835efa7b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5737,8 +5737,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 {
 	struct sched_group *idlest = NULL, *group = sd->groups;
 	struct sched_group *most_spare_sg = NULL;
-	unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
-	unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
+	unsigned long min_runnable_load = ULONG_MAX;
+	unsigned long this_runnable_load = ULONG_MAX;
+	unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
 	unsigned long most_spare = 0, this_spare = 0;
 	int load_idx = sd->forkexec_idx;
 	int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
-- 
cgit v1.2.3


From 6fee85ccbc76e8aeba43dc120c5fa3c5409a4e2c Mon Sep 17 00:00:00 2001
From: Brendan Jackman <brendan.jackman@arm.com>
Date: Thu, 5 Oct 2017 12:45:15 +0100
Subject: sched/fair: Fix usage of find_idlest_group() when no groups are
 allowed

When 'p' is not allowed on any of the CPUs in the sched_domain, we
currently return NULL from find_idlest_group(), and pointlessly
continue the search on lower sched_domain levels (where 'p' is also not
allowed) before returning prev_cpu regardless (as we have not updated
new_cpu).

Add an explicit check for this case, and add a comment to
find_idlest_group(). Now when find_idlest_group() returns NULL, it always
means that the local group is allowed and idlest.

Signed-off-by: Brendan Jackman <brendan.jackman@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171005114516.18617-5-brendan.jackman@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cca1835efa7b..ed80d6bd76c8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5730,6 +5730,8 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
 /*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
+ *
+ * Assumes p is allowed on at least one CPU in sd.
  */
 static struct sched_group *
 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
@@ -5917,6 +5919,9 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 {
 	int new_cpu = prev_cpu;
 
+	if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
+		return prev_cpu;
+
 	while (sd) {
 		struct sched_group *group;
 		struct sched_domain *tmp;
-- 
cgit v1.2.3


From 93f50f90247e3e926bbe9830df089c64a5cec236 Mon Sep 17 00:00:00 2001
From: Brendan Jackman <brendan.jackman@arm.com>
Date: Thu, 5 Oct 2017 12:45:16 +0100
Subject: sched/fair: Fix usage of find_idlest_group() when the local group is
 idlest

find_idlest_group() returns NULL when the local group is idlest. The
caller then continues the find_idlest_group() search at a lower level
of the current CPU's sched_domain hierarchy. find_idlest_group_cpu() is
not consulted and, crucially, @new_cpu is not updated. This means the
search is pointless and we return @prev_cpu from select_task_rq_fair().

This is fixed by initialising @new_cpu to @cpu instead of @prev_cpu.

Signed-off-by: Brendan Jackman <brendan.jackman@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171005114516.18617-6-brendan.jackman@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ed80d6bd76c8..56f343b8e749 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5917,7 +5917,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
 static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
 				  int cpu, int prev_cpu, int sd_flag)
 {
-	int new_cpu = prev_cpu;
+	int new_cpu = cpu;
 
 	if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
 		return prev_cpu;
-- 
cgit v1.2.3


From ff0d4a9dc16b1f4c954f6407c233ab848bdfe8b0 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 4 Oct 2017 17:49:00 +0200
Subject: sched/rt: Add a helper to test for a RT task

This helper returns true if a task has elevated priority which is true
for RT tasks (SCHED_RR and SCHED_FIFO) and also for SCHED_DEADLINE.
A task which runs at RT priority due to PI-boosting is not considered as
one with elevated priority.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Jens Axboe <axboe@fb.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171004154901.26904-1-bigeasy@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/rt.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index f93329aba31a..133001627ba1 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -17,6 +17,17 @@ static inline int rt_task(struct task_struct *p)
 	return rt_prio(p->prio);
 }
 
+static inline bool task_is_realtime(struct task_struct *tsk)
+{
+	int policy = tsk->policy;
+
+	if (policy == SCHED_FIFO || policy == SCHED_RR)
+		return true;
+	if (policy == SCHED_DEADLINE)
+		return true;
+	return false;
+}
+
 #ifdef CONFIG_RT_MUTEXES
 /*
  * Must hold either p->pi_lock or task_rq(p)->lock.
-- 
cgit v1.2.3


From 36436440cd19f59f5be12a1b181d299af2725140 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 4 Oct 2017 17:49:01 +0200
Subject: block/ioprio: Use a helper to check for RT prio

A side-effect to the old code is that now SCHED_DEADLINE is also
recognized.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Jens Axboe <axboe@fb.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171004154901.26904-2-bigeasy@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/ioprio.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 8c1239020d79..2f19aab84a4a 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -2,6 +2,7 @@
 #define IOPRIO_H
 
 #include <linux/sched.h>
+#include <linux/sched/rt.h>
 #include <linux/iocontext.h>
 
 /*
@@ -62,7 +63,7 @@ static inline int task_nice_ioclass(struct task_struct *task)
 {
 	if (task->policy == SCHED_IDLE)
 		return IOPRIO_CLASS_IDLE;
-	else if (task->policy == SCHED_FIFO || task->policy == SCHED_RR)
+	else if (task_is_realtime(task))
 		return IOPRIO_CLASS_RT;
 	else
 		return IOPRIO_CLASS_BE;
-- 
cgit v1.2.3


From 4bdced5c9a2922521e325896a7bbbf0132c94e56 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 6 Oct 2017 14:05:04 -0400
Subject: sched/rt: Simplify the IPI based RT balancing logic

When a CPU lowers its priority (schedules out a high priority task for a
lower priority one), a check is made to see if any other CPU has overloaded
RT tasks (more than one). It checks the rto_mask to determine this and if so
it will request to pull one of those tasks to itself if the non running RT
task is of higher priority than the new priority of the next task to run on
the current CPU.

When we deal with large number of CPUs, the original pull logic suffered
from large lock contention on a single CPU run queue, which caused a huge
latency across all CPUs. This was caused by only having one CPU having
overloaded RT tasks and a bunch of other CPUs lowering their priority. To
solve this issue, commit:

  b6366f048e0c ("sched/rt: Use IPI to trigger RT task push migration instead of pulling")

changed the way to request a pull. Instead of grabbing the lock of the
overloaded CPU's runqueue, it simply sent an IPI to that CPU to do the work.

Although the IPI logic worked very well in removing the large latency build
up, it still could suffer from a large number of IPIs being sent to a single
CPU. On a 80 CPU box, I measured over 200us of processing IPIs. Worse yet,
when I tested this on a 120 CPU box, with a stress test that had lots of
RT tasks scheduling on all CPUs, it actually triggered the hard lockup
detector! One CPU had so many IPIs sent to it, and due to the restart
mechanism that is triggered when the source run queue has a priority status
change, the CPU spent minutes! processing the IPIs.

Thinking about this further, I realized there's no reason for each run queue
to send its own IPI. As all CPUs with overloaded tasks must be scanned
regardless if there's one or many CPUs lowering their priority, because
there's no current way to find the CPU with the highest priority task that
can schedule to one of these CPUs, there really only needs to be one IPI
being sent around at a time.

This greatly simplifies the code!

The new approach is to have each root domain have its own irq work, as the
rto_mask is per root domain. The root domain has the following fields
attached to it:

  rto_push_work	 - the irq work to process each CPU set in rto_mask
  rto_lock	 - the lock to protect some of the other rto fields
  rto_loop_start - an atomic that keeps contention down on rto_lock
		    the first CPU scheduling in a lower priority task
		    is the one to kick off the process.
  rto_loop_next	 - an atomic that gets incremented for each CPU that
		    schedules in a lower priority task.
  rto_loop	 - a variable protected by rto_lock that is used to
		    compare against rto_loop_next
  rto_cpu	 - The cpu to send the next IPI to, also protected by
		    the rto_lock.

When a CPU schedules in a lower priority task and wants to make sure
overloaded CPUs know about it. It increments the rto_loop_next. Then it
atomically sets rto_loop_start with a cmpxchg. If the old value is not "0",
then it is done, as another CPU is kicking off the IPI loop. If the old
value is "0", then it will take the rto_lock to synchronize with a possible
IPI being sent around to the overloaded CPUs.

If rto_cpu is greater than or equal to nr_cpu_ids, then there's either no
IPI being sent around, or one is about to finish. Then rto_cpu is set to the
first CPU in rto_mask and an IPI is sent to that CPU. If there's no CPUs set
in rto_mask, then there's nothing to be done.

When the CPU receives the IPI, it will first try to push any RT tasks that is
queued on the CPU but can't run because a higher priority RT task is
currently running on that CPU.

Then it takes the rto_lock and looks for the next CPU in the rto_mask. If it
finds one, it simply sends an IPI to that CPU and the process continues.

If there's no more CPUs in the rto_mask, then rto_loop is compared with
rto_loop_next. If they match, everything is done and the process is over. If
they do not match, then a CPU scheduled in a lower priority task as the IPI
was being passed around, and the process needs to start again. The first CPU
in rto_mask is sent the IPI.

This change removes this duplication of work in the IPI logic, and greatly
lowers the latency caused by the IPIs. This removed the lockup happening on
the 120 CPU machine. It also simplifies the code tremendously. What else
could anyone ask for?

Thanks to Peter Zijlstra for simplifying the rto_loop_start atomic logic and
supplying me with the rto_start_trylock() and rto_start_unlock() helper
functions.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Clark Williams <williams@redhat.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott Wood <swood@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170424114732.1aac6dc4@gandalf.local.home
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/rt.c       | 316 ++++++++++++++++++------------------------------
 kernel/sched/sched.h    |  24 ++--
 kernel/sched/topology.c |   6 +
 3 files changed, 138 insertions(+), 208 deletions(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 0af5ca9e3e3f..fda27991699a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -73,10 +73,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 	raw_spin_unlock(&rt_b->rt_runtime_lock);
 }
 
-#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI)
-static void push_irq_work_func(struct irq_work *work);
-#endif
-
 void init_rt_rq(struct rt_rq *rt_rq)
 {
 	struct rt_prio_array *array;
@@ -96,13 +92,6 @@ void init_rt_rq(struct rt_rq *rt_rq)
 	rt_rq->rt_nr_migratory = 0;
 	rt_rq->overloaded = 0;
 	plist_head_init(&rt_rq->pushable_tasks);
-
-#ifdef HAVE_RT_PUSH_IPI
-	rt_rq->push_flags = 0;
-	rt_rq->push_cpu = nr_cpu_ids;
-	raw_spin_lock_init(&rt_rq->push_lock);
-	init_irq_work(&rt_rq->push_work, push_irq_work_func);
-#endif
 #endif /* CONFIG_SMP */
 	/* We start is dequeued state, because no RT tasks are queued */
 	rt_rq->rt_queued = 0;
@@ -1875,241 +1864,166 @@ static void push_rt_tasks(struct rq *rq)
 }
 
 #ifdef HAVE_RT_PUSH_IPI
+
 /*
- * The search for the next cpu always starts at rq->cpu and ends
- * when we reach rq->cpu again. It will never return rq->cpu.
- * This returns the next cpu to check, or nr_cpu_ids if the loop
- * is complete.
+ * When a high priority task schedules out from a CPU and a lower priority
+ * task is scheduled in, a check is made to see if there's any RT tasks
+ * on other CPUs that are waiting to run because a higher priority RT task
+ * is currently running on its CPU. In this case, the CPU with multiple RT
+ * tasks queued on it (overloaded) needs to be notified that a CPU has opened
+ * up that may be able to run one of its non-running queued RT tasks.
+ *
+ * All CPUs with overloaded RT tasks need to be notified as there is currently
+ * no way to know which of these CPUs have the highest priority task waiting
+ * to run. Instead of trying to take a spinlock on each of these CPUs,
+ * which has shown to cause large latency when done on machines with many
+ * CPUs, sending an IPI to the CPUs to have them push off the overloaded
+ * RT tasks waiting to run.
+ *
+ * Just sending an IPI to each of the CPUs is also an issue, as on large
+ * count CPU machines, this can cause an IPI storm on a CPU, especially
+ * if its the only CPU with multiple RT tasks queued, and a large number
+ * of CPUs scheduling a lower priority task at the same time.
+ *
+ * Each root domain has its own irq work function that can iterate over
+ * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
+ * tassk must be checked if there's one or many CPUs that are lowering
+ * their priority, there's a single irq work iterator that will try to
+ * push off RT tasks that are waiting to run.
+ *
+ * When a CPU schedules a lower priority task, it will kick off the
+ * irq work iterator that will jump to each CPU with overloaded RT tasks.
+ * As it only takes the first CPU that schedules a lower priority task
+ * to start the process, the rto_start variable is incremented and if
+ * the atomic result is one, then that CPU will try to take the rto_lock.
+ * This prevents high contention on the lock as the process handles all
+ * CPUs scheduling lower priority tasks.
+ *
+ * All CPUs that are scheduling a lower priority task will increment the
+ * rt_loop_next variable. This will make sure that the irq work iterator
+ * checks all RT overloaded CPUs whenever a CPU schedules a new lower
+ * priority task, even if the iterator is in the middle of a scan. Incrementing
+ * the rt_loop_next will cause the iterator to perform another scan.
  *
- * rq->rt.push_cpu holds the last cpu returned by this function,
- * or if this is the first instance, it must hold rq->cpu.
  */
 static int rto_next_cpu(struct rq *rq)
 {
-	int prev_cpu = rq->rt.push_cpu;
+	struct root_domain *rd = rq->rd;
+	int next;
 	int cpu;
 
-	cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
-
 	/*
-	 * If the previous cpu is less than the rq's CPU, then it already
-	 * passed the end of the mask, and has started from the beginning.
-	 * We end if the next CPU is greater or equal to rq's CPU.
+	 * When starting the IPI RT pushing, the rto_cpu is set to -1,
+	 * rt_next_cpu() will simply return the first CPU found in
+	 * the rto_mask.
+	 *
+	 * If rto_next_cpu() is called with rto_cpu is a valid cpu, it
+	 * will return the next CPU found in the rto_mask.
+	 *
+	 * If there are no more CPUs left in the rto_mask, then a check is made
+	 * against rto_loop and rto_loop_next. rto_loop is only updated with
+	 * the rto_lock held, but any CPU may increment the rto_loop_next
+	 * without any locking.
 	 */
-	if (prev_cpu < rq->cpu) {
-		if (cpu >= rq->cpu)
-			return nr_cpu_ids;
+	for (;;) {
 
-	} else if (cpu >= nr_cpu_ids) {
-		/*
-		 * We passed the end of the mask, start at the beginning.
-		 * If the result is greater or equal to the rq's CPU, then
-		 * the loop is finished.
-		 */
-		cpu = cpumask_first(rq->rd->rto_mask);
-		if (cpu >= rq->cpu)
-			return nr_cpu_ids;
-	}
-	rq->rt.push_cpu = cpu;
+		/* When rto_cpu is -1 this acts like cpumask_first() */
+		cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
 
-	/* Return cpu to let the caller know if the loop is finished or not */
-	return cpu;
-}
+		rd->rto_cpu = cpu;
 
-static int find_next_push_cpu(struct rq *rq)
-{
-	struct rq *next_rq;
-	int cpu;
+		if (cpu < nr_cpu_ids)
+			return cpu;
 
-	while (1) {
-		cpu = rto_next_cpu(rq);
-		if (cpu >= nr_cpu_ids)
-			break;
-		next_rq = cpu_rq(cpu);
+		rd->rto_cpu = -1;
+
+		/*
+		 * ACQUIRE ensures we see the @rto_mask changes
+		 * made prior to the @next value observed.
+		 *
+		 * Matches WMB in rt_set_overload().
+		 */
+		next = atomic_read_acquire(&rd->rto_loop_next);
 
-		/* Make sure the next rq can push to this rq */
-		if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
+		if (rd->rto_loop == next)
 			break;
+
+		rd->rto_loop = next;
 	}
 
-	return cpu;
+	return -1;
 }
 
-#define RT_PUSH_IPI_EXECUTING		1
-#define RT_PUSH_IPI_RESTART		2
+static inline bool rto_start_trylock(atomic_t *v)
+{
+	return !atomic_cmpxchg_acquire(v, 0, 1);
+}
 
-/*
- * When a high priority task schedules out from a CPU and a lower priority
- * task is scheduled in, a check is made to see if there's any RT tasks
- * on other CPUs that are waiting to run because a higher priority RT task
- * is currently running on its CPU. In this case, the CPU with multiple RT
- * tasks queued on it (overloaded) needs to be notified that a CPU has opened
- * up that may be able to run one of its non-running queued RT tasks.
- *
- * On large CPU boxes, there's the case that several CPUs could schedule
- * a lower priority task at the same time, in which case it will look for
- * any overloaded CPUs that it could pull a task from. To do this, the runqueue
- * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
- * for a single overloaded CPU's runqueue lock can produce a large latency.
- * (This has actually been observed on large boxes running cyclictest).
- * Instead of taking the runqueue lock of the overloaded CPU, each of the
- * CPUs that scheduled a lower priority task simply sends an IPI to the
- * overloaded CPU. An IPI is much cheaper than taking an runqueue lock with
- * lots of contention. The overloaded CPU will look to push its non-running
- * RT task off, and if it does, it can then ignore the other IPIs coming
- * in, and just pass those IPIs off to any other overloaded CPU.
- *
- * When a CPU schedules a lower priority task, it only sends an IPI to
- * the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
- * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
- * RT overloaded tasks, would cause 100 IPIs to go out at once.
- *
- * The overloaded RT CPU, when receiving an IPI, will try to push off its
- * overloaded RT tasks and then send an IPI to the next CPU that has
- * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
- * have completed. Just because a CPU may have pushed off its own overloaded
- * RT task does not mean it should stop sending the IPI around to other
- * overloaded CPUs. There may be another RT task waiting to run on one of
- * those CPUs that are of higher priority than the one that was just
- * pushed.
- *
- * An optimization that could possibly be made is to make a CPU array similar
- * to the cpupri array mask of all running RT tasks, but for the overloaded
- * case, then the IPI could be sent to only the CPU with the highest priority
- * RT task waiting, and that CPU could send off further IPIs to the CPU with
- * the next highest waiting task. Since the overloaded case is much less likely
- * to happen, the complexity of this implementation may not be worth it.
- * Instead, just send an IPI around to all overloaded CPUs.
- *
- * The rq->rt.push_flags holds the status of the IPI that is going around.
- * A run queue can only send out a single IPI at a time. The possible flags
- * for rq->rt.push_flags are:
- *
- *    (None or zero):		No IPI is going around for the current rq
- *    RT_PUSH_IPI_EXECUTING:	An IPI for the rq is being passed around
- *    RT_PUSH_IPI_RESTART:	The priority of the running task for the rq
- *				has changed, and the IPI should restart
- *				circulating the overloaded CPUs again.
- *
- * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
- * before sending to the next CPU.
- *
- * Instead of having all CPUs that schedule a lower priority task send
- * an IPI to the same "first" CPU in the RT overload mask, they send it
- * to the next overloaded CPU after their own CPU. This helps distribute
- * the work when there's more than one overloaded CPU and multiple CPUs
- * scheduling in lower priority tasks.
- *
- * When a rq schedules a lower priority task than what was currently
- * running, the next CPU with overloaded RT tasks is examined first.
- * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower
- * priority task, it will send an IPI first to CPU 5, then CPU 5 will
- * send to CPU 1 if it is still overloaded. CPU 1 will clear the
- * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
- *
- * The first CPU to notice IPI_RESTART is set, will clear that flag and then
- * send an IPI to the next overloaded CPU after the rq->cpu and not the next
- * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3
- * schedules a lower priority task, and the IPI_RESTART gets set while the
- * handling is being done on CPU 5, it will clear the flag and send it back to
- * CPU 4 instead of CPU 1.
- *
- * Note, the above logic can be disabled by turning off the sched_feature
- * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
- * taken by the CPU requesting a pull and the waiting RT task will be pulled
- * by that CPU. This may be fine for machines with few CPUs.
- */
-static void tell_cpu_to_push(struct rq *rq)
+static inline void rto_start_unlock(atomic_t *v)
 {
-	int cpu;
+	atomic_set_release(v, 0);
+}
 
-	if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
-		raw_spin_lock(&rq->rt.push_lock);
-		/* Make sure it's still executing */
-		if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
-			/*
-			 * Tell the IPI to restart the loop as things have
-			 * changed since it started.
-			 */
-			rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
-			raw_spin_unlock(&rq->rt.push_lock);
-			return;
-		}
-		raw_spin_unlock(&rq->rt.push_lock);
-	}
+static void tell_cpu_to_push(struct rq *rq)
+{
+	int cpu = -1;
 
-	/* When here, there's no IPI going around */
+	/* Keep the loop going if the IPI is currently active */
+	atomic_inc(&rq->rd->rto_loop_next);
 
-	rq->rt.push_cpu = rq->cpu;
-	cpu = find_next_push_cpu(rq);
-	if (cpu >= nr_cpu_ids)
+	/* Only one CPU can initiate a loop at a time */
+	if (!rto_start_trylock(&rq->rd->rto_loop_start))
 		return;
 
-	rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
+	raw_spin_lock(&rq->rd->rto_lock);
+
+	/*
+	 * The rto_cpu is updated under the lock, if it has a valid cpu
+	 * then the IPI is still running and will continue due to the
+	 * update to loop_next, and nothing needs to be done here.
+	 * Otherwise it is finishing up and an ipi needs to be sent.
+	 */
+	if (rq->rd->rto_cpu < 0)
+		cpu = rto_next_cpu(rq);
 
-	irq_work_queue_on(&rq->rt.push_work, cpu);
+	raw_spin_unlock(&rq->rd->rto_lock);
+
+	rto_start_unlock(&rq->rd->rto_loop_start);
+
+	if (cpu >= 0)
+		irq_work_queue_on(&rq->rd->rto_push_work, cpu);
 }
 
 /* Called from hardirq context */
-static void try_to_push_tasks(void *arg)
+void rto_push_irq_work_func(struct irq_work *work)
 {
-	struct rt_rq *rt_rq = arg;
-	struct rq *rq, *src_rq;
-	int this_cpu;
+	struct rq *rq;
 	int cpu;
 
-	this_cpu = rt_rq->push_cpu;
+	rq = this_rq();
 
-	/* Paranoid check */
-	BUG_ON(this_cpu != smp_processor_id());
-
-	rq = cpu_rq(this_cpu);
-	src_rq = rq_of_rt_rq(rt_rq);
-
-again:
+	/*
+	 * We do not need to grab the lock to check for has_pushable_tasks.
+	 * When it gets updated, a check is made if a push is possible.
+	 */
 	if (has_pushable_tasks(rq)) {
 		raw_spin_lock(&rq->lock);
-		push_rt_task(rq);
+		push_rt_tasks(rq);
 		raw_spin_unlock(&rq->lock);
 	}
 
-	/* Pass the IPI to the next rt overloaded queue */
-	raw_spin_lock(&rt_rq->push_lock);
-	/*
-	 * If the source queue changed since the IPI went out,
-	 * we need to restart the search from that CPU again.
-	 */
-	if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
-		rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
-		rt_rq->push_cpu = src_rq->cpu;
-	}
+	raw_spin_lock(&rq->rd->rto_lock);
 
-	cpu = find_next_push_cpu(src_rq);
+	/* Pass the IPI to the next rt overloaded queue */
+	cpu = rto_next_cpu(rq);
 
-	if (cpu >= nr_cpu_ids)
-		rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
-	raw_spin_unlock(&rt_rq->push_lock);
+	raw_spin_unlock(&rq->rd->rto_lock);
 
-	if (cpu >= nr_cpu_ids)
+	if (cpu < 0)
 		return;
 
-	/*
-	 * It is possible that a restart caused this CPU to be
-	 * chosen again. Don't bother with an IPI, just see if we
-	 * have more to push.
-	 */
-	if (unlikely(cpu == rq->cpu))
-		goto again;
-
 	/* Try the next RT overloaded CPU */
-	irq_work_queue_on(&rt_rq->push_work, cpu);
-}
-
-static void push_irq_work_func(struct irq_work *work)
-{
-	struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
-
-	try_to_push_tasks(rt_rq);
+	irq_work_queue_on(&rq->rd->rto_push_work, cpu);
 }
 #endif /* HAVE_RT_PUSH_IPI */
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a81c9782e98c..8aa24b41f652 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -505,7 +505,7 @@ static inline int rt_bandwidth_enabled(void)
 }
 
 /* RT IPI pull logic requires IRQ_WORK */
-#ifdef CONFIG_IRQ_WORK
+#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
 # define HAVE_RT_PUSH_IPI
 #endif
 
@@ -527,12 +527,6 @@ struct rt_rq {
 	unsigned long rt_nr_total;
 	int overloaded;
 	struct plist_head pushable_tasks;
-#ifdef HAVE_RT_PUSH_IPI
-	int push_flags;
-	int push_cpu;
-	struct irq_work push_work;
-	raw_spinlock_t push_lock;
-#endif
 #endif /* CONFIG_SMP */
 	int rt_queued;
 
@@ -641,6 +635,19 @@ struct root_domain {
 	struct dl_bw dl_bw;
 	struct cpudl cpudl;
 
+#ifdef HAVE_RT_PUSH_IPI
+	/*
+	 * For IPI pull requests, loop across the rto_mask.
+	 */
+	struct irq_work rto_push_work;
+	raw_spinlock_t rto_lock;
+	/* These are only updated and read within rto_lock */
+	int rto_loop;
+	int rto_cpu;
+	/* These atomics are updated outside of a lock */
+	atomic_t rto_loop_next;
+	atomic_t rto_loop_start;
+#endif
 	/*
 	 * The "RT overload" flag: it gets set if a CPU has more than
 	 * one runnable RT task.
@@ -658,6 +665,9 @@ extern void init_defrootdomain(void);
 extern int sched_init_domains(const struct cpumask *cpu_map);
 extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
 
+#ifdef HAVE_RT_PUSH_IPI
+extern void rto_push_irq_work_func(struct irq_work *work);
+#endif
 #endif /* CONFIG_SMP */
 
 /*
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f51d123f9fe1..e50450c2fed8 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -268,6 +268,12 @@ static int init_rootdomain(struct root_domain *rd)
 	if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
 		goto free_dlo_mask;
 
+#ifdef HAVE_RT_PUSH_IPI
+	rd->rto_cpu = -1;
+	raw_spin_lock_init(&rd->rto_lock);
+	init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
+#endif
+
 	init_dl_bw(&rd->dl_bw);
 	if (cpudl_init(&rd->cpudl) != 0)
 		goto free_rto_mask;
-- 
cgit v1.2.3


From 6be53520ad8f87dc748f381e4e8e6a846a4b466b Mon Sep 17 00:00:00 2001
From: Dou Liyang <douly.fnst@cn.fujitsu.com>
Date: Mon, 9 Oct 2017 17:03:33 +0800
Subject: x86/tsc: Append the 'tsc=' description for the 'tsc=unstable' boot
 parameter

Commit:

  8309f86cd41e ("x86/tsc: Provide 'tsc=unstable' boot parameter")

added a new 'tsc=unstable' parameter, but didn't document it.

Document it.

Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <corbet@lwn.net>
Cc: <pasha.tatashin@oracle.com>
Cc: <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1507539813-11420-1-git-send-email-douly.fnst@cn.fujitsu.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 05496622b4ef..6b99c8b77c59 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4194,6 +4194,9 @@
 			Used to run time disable IRQ_TIME_ACCOUNTING on any
 			platforms where RDTSC is slow and this accounting
 			can add overhead.
+			[x86] unstable: mark the TSC clocksource as unstable, this
+			marks the TSC unconditionally unstable at bootup and
+			avoids any further wobbles once the TSC watchdog notices.
 
 	turbografx.map[2|3]=	[HW,JOY]
 			TurboGraFX parallel port interface
-- 
cgit v1.2.3


From e22cdc3fc5991956146b9856d36b4971fe54dcd6 Mon Sep 17 00:00:00 2001
From: Rakib Mullick <rakib.mullick@gmail.com>
Date: Mon, 23 Oct 2017 19:01:54 +0600
Subject: sched/isolcpus: Fix "isolcpus=" boot parameter handling when
 !CONFIG_CPUMASK_OFFSTACK

cpulist_parse() uses nr_cpumask_bits as a limit to parse the
passed buffer from kernel commandline. What nr_cpumask_bits
represents varies depending upon the CONFIG_CPUMASK_OFFSTACK option:

 - If CONFIG_CPUMASK_OFFSTACK=n, then nr_cpumask_bits is the same as
   NR_CPUS, which might not represent the # of CPUs that really exist
   (default 64). So, there's a chance of a gap between nr_cpu_ids
   and NR_CPUS, which ultimately lead towards invalid cpulist_parse()
   operation. For example, if isolcpus=9 is passed on an 8 cpu
   system (CONFIG_CPUMASK_OFFSTACK=n) it doesn't show the error
   that it's supposed to.

This patch fixes this bug by finding the last CPU of the passed
isolcpus= list and checking it against nr_cpu_ids.

It also fixes the error message where the nr_cpu_ids should be
nr_cpu_ids-1, since CPU numbering starts from 0.

Signed-off-by: Rakib Mullick <rakib.mullick@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: adobriyan@gmail.com
Cc: akpm@linux-foundation.org
Cc: longman@redhat.com
Cc: mka@chromium.org
Cc: tj@kernel.org
Link: http://lkml.kernel.org/r/20171023130154.9050-1-rakib.mullick@gmail.com
[ Enhanced the changelog and the kernel message. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>

 include/linux/cpumask.h |   16 ++++++++++++++++
 kernel/sched/topology.c |    4 ++--
 2 files changed, 18 insertions(+), 2 deletions(-)
---
 include/linux/cpumask.h | 16 ++++++++++++++++
 kernel/sched/topology.c |  4 ++--
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index cd415b733c2a..63661de67ad4 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -130,6 +130,11 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
 	return 0;
 }
 
+static inline unsigned int cpumask_last(const struct cpumask *srcp)
+{
+	return 0;
+}
+
 /* Valid inputs for n are -1 and 0. */
 static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
 {
@@ -178,6 +183,17 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
 	return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits);
 }
 
+/**
+ * cpumask_last - get the last CPU in a cpumask
+ * @srcp:	- the cpumask pointer
+ *
+ * Returns	>= nr_cpumask_bits if no CPUs set.
+ */
+static inline unsigned int cpumask_last(const struct cpumask *srcp)
+{
+	return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits);
+}
+
 unsigned int cpumask_next(int n, const struct cpumask *srcp);
 
 /**
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index e50450c2fed8..e3d31b0880dc 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -476,8 +476,8 @@ static int __init isolated_cpu_setup(char *str)
 
 	alloc_bootmem_cpumask_var(&cpu_isolated_map);
 	ret = cpulist_parse(str, cpu_isolated_map);
-	if (ret) {
-		pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids);
+	if (ret || cpumask_last(cpu_isolated_map) >= nr_cpu_ids) {
+		pr_err("sched: Error, all isolcpus= values must be between 0 and %u - ignoring them.\n", nr_cpu_ids-1);
 		return 0;
 	}
 	return 1;
-- 
cgit v1.2.3


From 54b933c6c954a8b7b0c2b40a1c4d3f7279d11e22 Mon Sep 17 00:00:00 2001
From: Cheng Jian <cj.chengjian@huawei.com>
Date: Wed, 25 Oct 2017 19:28:27 +0800
Subject: sched/idle: Micro-optimize the idle loop

Move the loop-invariant calculation of 'cpu' in do_idle() out of the loop body,
because the current CPU is always constant.

This improves the generated code both on x86-64 and ARM64:

x86-64:

Before patch (execution in loop):
	864:       0f ae e8                lfence
	867:       65 8b 05 c2 38 f1 7e    mov %gs:0x7ef138c2(%rip),%eax
	86e:       89 c0                   mov %eax,%eax
	870:       48 0f a3 05 68 19 08    bt  %rax,0x1081968(%rip)
	877:	   01

After patch (execution in loop):
	872:       0f ae e8                lfence
	875:       4c 0f a3 25 63 19 08    bt  %r12,0x1081963(%rip)
	87c:       01

ARM64:

Before patch (execution in loop):
	c58:       d5033d9f        dsb     ld
	c5c:       d538d080        mrs     x0, tpidr_el1
	c60:       b8606a61        ldr     w1, [x19,x0]
	c64:       1100fc20        add     w0, w1, #0x3f
	c68:       7100003f        cmp     w1, #0x0
	c6c:       1a81b000        csel    w0, w0, w1, lt
	c70:       13067c00        asr     w0, w0, #6
	c74:       93407c00        sxtw    x0, w0
	c78:       f8607a80        ldr     x0, [x20,x0,lsl #3]
	c7c:       9ac12401        lsr     x1, x0, x1
	c80:       36000581        tbz     w1, #0, d30 <do_idle+0x128>

After patch (execution in loop):
	c84:       d5033d9f        dsb     ld
	c88:       f9400260        ldr     x0, [x19]
	c8c:       ea14001f        tst     x0, x20
	c90:       54000580        b.eq    d40 <do_idle+0x138>

Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
[ Rewrote the title and the changelog. ]
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: huawei.libin@huawei.com
Cc: xiexiuqi@huawei.com
Link: http://lkml.kernel.org/r/1508930907-107755-1-git-send-email-cj.chengjian@huawei.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/idle.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index b2e8f0afbb42..7dae9eb8c042 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -209,6 +209,7 @@ exit_idle:
  */
 static void do_idle(void)
 {
+	int cpu = smp_processor_id();
 	/*
 	 * If the arch has a polling bit, we maintain an invariant:
 	 *
@@ -225,7 +226,7 @@ static void do_idle(void)
 		check_pgt_cache();
 		rmb();
 
-		if (cpu_is_offline(smp_processor_id())) {
+		if (cpu_is_offline(cpu)) {
 			cpuhp_report_idle_dead();
 			arch_cpu_idle_dead();
 		}
-- 
cgit v1.2.3


From 7863406143d8bbbbda07a61285c5f4c217908dfd Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Oct 2017 04:42:28 +0200
Subject: sched/isolation: Move housekeeping related code to its own file

The housekeeping code is currently tied to the NOHZ code. As we are
planning to make housekeeping independent from it, start with moving
the relevant code to its own file.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1509072159-31808-2-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/net/ethernet/tile/tilegx.c |  2 +-
 include/linux/sched/isolation.h    | 56 ++++++++++++++++++++++++++++++++++++++
 include/linux/tick.h               | 37 -------------------------
 init/main.c                        |  2 ++
 kernel/rcu/tree_plugin.h           |  1 +
 kernel/rcu/update.c                |  1 +
 kernel/sched/Makefile              |  1 +
 kernel/sched/core.c                |  1 +
 kernel/sched/fair.c                |  1 +
 kernel/sched/isolation.c           | 33 ++++++++++++++++++++++
 kernel/time/tick-sched.c           | 18 ------------
 kernel/watchdog.c                  |  1 +
 12 files changed, 98 insertions(+), 56 deletions(-)
 create mode 100644 include/linux/sched/isolation.h
 create mode 100644 kernel/sched/isolation.c

diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
index c00102b8145a..27a3272dcd48 100644
--- a/drivers/net/ethernet/tile/tilegx.c
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -40,7 +40,7 @@
 #include <linux/tcp.h>
 #include <linux/net_tstamp.h>
 #include <linux/ptp_clock_kernel.h>
-#include <linux/tick.h>
+#include <linux/sched/isolation.h>
 
 #include <asm/checksum.h>
 #include <asm/homecache.h>
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
new file mode 100644
index 000000000000..b7cfbc46286c
--- /dev/null
+++ b/include/linux/sched/isolation.h
@@ -0,0 +1,56 @@
+#ifndef _LINUX_SCHED_ISOLATION_H
+#define _LINUX_SCHED_ISOLATION_H
+
+#include <linux/cpumask.h>
+#include <linux/init.h>
+#include <linux/tick.h>
+
+#ifdef CONFIG_NO_HZ_FULL
+extern cpumask_var_t housekeeping_mask;
+
+static inline int housekeeping_any_cpu(void)
+{
+	return cpumask_any_and(housekeeping_mask, cpu_online_mask);
+}
+
+extern void __init housekeeping_init(void);
+
+#else
+
+static inline int housekeeping_any_cpu(void)
+{
+	return smp_processor_id();
+}
+
+static inline void housekeeping_init(void) { }
+#endif /* CONFIG_NO_HZ_FULL */
+
+
+static inline const struct cpumask *housekeeping_cpumask(void)
+{
+#ifdef CONFIG_NO_HZ_FULL
+	if (tick_nohz_full_enabled())
+		return housekeeping_mask;
+#endif
+	return cpu_possible_mask;
+}
+
+static inline bool is_housekeeping_cpu(int cpu)
+{
+#ifdef CONFIG_NO_HZ_FULL
+	if (tick_nohz_full_enabled())
+		return cpumask_test_cpu(cpu, housekeeping_mask);
+#endif
+	return true;
+}
+
+static inline void housekeeping_affine(struct task_struct *t)
+{
+#ifdef CONFIG_NO_HZ_FULL
+	if (tick_nohz_full_enabled())
+		set_cpus_allowed_ptr(t, housekeeping_mask);
+
+#endif
+}
+
+#endif /* _LINUX_SCHED_ISOLATION_H */
diff --git a/include/linux/tick.h b/include/linux/tick.h
index fe01e68bf520..68afc09aa8ac 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -137,7 +137,6 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
 #ifdef CONFIG_NO_HZ_FULL
 extern bool tick_nohz_full_running;
 extern cpumask_var_t tick_nohz_full_mask;
-extern cpumask_var_t housekeeping_mask;
 
 static inline bool tick_nohz_full_enabled(void)
 {
@@ -161,11 +160,6 @@ static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask)
 		cpumask_or(mask, mask, tick_nohz_full_mask);
 }
 
-static inline int housekeeping_any_cpu(void)
-{
-	return cpumask_any_and(housekeeping_mask, cpu_online_mask);
-}
-
 extern void tick_nohz_dep_set(enum tick_dep_bits bit);
 extern void tick_nohz_dep_clear(enum tick_dep_bits bit);
 extern void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit);
@@ -235,10 +229,6 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal,
 extern void tick_nohz_full_kick_cpu(int cpu);
 extern void __tick_nohz_task_switch(void);
 #else
-static inline int housekeeping_any_cpu(void)
-{
-	return smp_processor_id();
-}
 static inline bool tick_nohz_full_enabled(void) { return false; }
 static inline bool tick_nohz_full_cpu(int cpu) { return false; }
 static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { }
@@ -260,33 +250,6 @@ static inline void tick_nohz_full_kick_cpu(int cpu) { }
 static inline void __tick_nohz_task_switch(void) { }
 #endif
 
-static inline const struct cpumask *housekeeping_cpumask(void)
-{
-#ifdef CONFIG_NO_HZ_FULL
-	if (tick_nohz_full_enabled())
-		return housekeeping_mask;
-#endif
-	return cpu_possible_mask;
-}
-
-static inline bool is_housekeeping_cpu(int cpu)
-{
-#ifdef CONFIG_NO_HZ_FULL
-	if (tick_nohz_full_enabled())
-		return cpumask_test_cpu(cpu, housekeeping_mask);
-#endif
-	return true;
-}
-
-static inline void housekeeping_affine(struct task_struct *t)
-{
-#ifdef CONFIG_NO_HZ_FULL
-	if (tick_nohz_full_enabled())
-		set_cpus_allowed_ptr(t, housekeeping_mask);
-
-#endif
-}
-
 static inline void tick_nohz_task_switch(void)
 {
 	if (tick_nohz_full_enabled())
diff --git a/init/main.c b/init/main.c
index 0ee9c6866ada..4610c99ae306 100644
--- a/init/main.c
+++ b/init/main.c
@@ -46,6 +46,7 @@
 #include <linux/cgroup.h>
 #include <linux/efi.h>
 #include <linux/tick.h>
+#include <linux/sched/isolation.h>
 #include <linux/interrupt.h>
 #include <linux/taskstats_kern.h>
 #include <linux/delayacct.h>
@@ -606,6 +607,7 @@ asmlinkage __visible void __init start_kernel(void)
 	early_irq_init();
 	init_IRQ();
 	tick_init();
+	housekeeping_init();
 	rcu_init_nohz();
 	init_timers();
 	hrtimers_init();
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index e012b9be777e..c7632f51c5a0 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -29,6 +29,7 @@
 #include <linux/oom.h>
 #include <linux/sched/debug.h>
 #include <linux/smpboot.h>
+#include <linux/sched/isolation.h>
 #include <uapi/linux/sched/types.h>
 #include "../time/tick-internal.h"
 
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 5033b66d2753..79abeb0ca329 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -51,6 +51,7 @@
 #include <linux/kthread.h>
 #include <linux/tick.h>
 #include <linux/rcupdate_wait.h>
+#include <linux/sched/isolation.h>
 
 #define CREATE_TRACE_POINTS
 
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 78f54932ea1d..871d43d73375 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -26,3 +26,4 @@ obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
 obj-$(CONFIG_CPU_FREQ) += cpufreq.o
 obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
 obj-$(CONFIG_MEMBARRIER) += membarrier.o
+obj-$(CONFIG_NO_HZ_FULL) += isolation.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2288a145bf01..ad188acb7636 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -26,6 +26,7 @@
 #include <linux/profile.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include <linux/sched/isolation.h>
 
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 56f343b8e749..591481db8c6a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -32,6 +32,7 @@
 #include <linux/mempolicy.h>
 #include <linux/migrate.h>
 #include <linux/task_work.h>
+#include <linux/sched/isolation.h>
 
 #include <trace/events/sched.h>
 
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
new file mode 100644
index 000000000000..3589252ed476
--- /dev/null
+++ b/kernel/sched/isolation.c
@@ -0,0 +1,33 @@
+/*
+ *  Housekeeping management. Manage the targets for routine code that can run on
+ *  any CPU: unbound workqueues, timers, kthreads and any offloadable work.
+ *
+ * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
+ *
+ */
+
+#include <linux/sched/isolation.h>
+#include <linux/tick.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+
+cpumask_var_t housekeeping_mask;
+
+void __init housekeeping_init(void)
+{
+	if (!tick_nohz_full_enabled())
+		return;
+
+	if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
+		WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
+		cpumask_clear(tick_nohz_full_mask);
+		tick_nohz_full_running = false;
+		return;
+	}
+
+	cpumask_andnot(housekeeping_mask,
+		       cpu_possible_mask, tick_nohz_full_mask);
+
+	/* We need at least one CPU to handle housekeeping work */
+	WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
+}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7b258c59d78a..27d7d522ac4e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -166,7 +166,6 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
 
 #ifdef CONFIG_NO_HZ_FULL
 cpumask_var_t tick_nohz_full_mask;
-cpumask_var_t housekeeping_mask;
 bool tick_nohz_full_running;
 static atomic_t tick_dep_mask;
 
@@ -438,13 +437,6 @@ void __init tick_nohz_init(void)
 			return;
 	}
 
-	if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
-		WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
-		cpumask_clear(tick_nohz_full_mask);
-		tick_nohz_full_running = false;
-		return;
-	}
-
 	/*
 	 * Full dynticks uses irq work to drive the tick rescheduling on safe
 	 * locking contexts. But then we need irq work to raise its own
@@ -453,7 +445,6 @@ void __init tick_nohz_init(void)
 	if (!arch_irq_work_has_interrupt()) {
 		pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n");
 		cpumask_clear(tick_nohz_full_mask);
-		cpumask_copy(housekeeping_mask, cpu_possible_mask);
 		tick_nohz_full_running = false;
 		return;
 	}
@@ -466,9 +457,6 @@ void __init tick_nohz_init(void)
 		cpumask_clear_cpu(cpu, tick_nohz_full_mask);
 	}
 
-	cpumask_andnot(housekeeping_mask,
-		       cpu_possible_mask, tick_nohz_full_mask);
-
 	for_each_cpu(cpu, tick_nohz_full_mask)
 		context_tracking_cpu_set(cpu);
 
@@ -478,12 +466,6 @@ void __init tick_nohz_init(void)
 	WARN_ON(ret < 0);
 	pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
 		cpumask_pr_args(tick_nohz_full_mask));
-
-	/*
-	 * We need at least one CPU to handle housekeeping work such
-	 * as timekeeping, unbound timers, workqueues, ...
-	 */
-	WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
 }
 #endif
 
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 6bcb854909c0..3c44dbaa4b9f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,6 +24,7 @@
 #include <linux/workqueue.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/isolation.h>
 
 #include <asm/irq_regs.h>
 #include <linux/kvm_para.h>
-- 
cgit v1.2.3


From 13316b31fdaaa45f06793eb7992588359ba6ab9f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Oct 2017 04:42:29 +0200
Subject: sched/isolation, watchdog: Use housekeeping_cpumask() instead of
 ad-hoc version

While trying to disable the watchog on nohz_full CPUs, the watchdog
implements an ad-hoc version of housekeeping_cpumask(). Lets replace
those re-invented lines.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1509072159-31808-3-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/watchdog.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 3c44dbaa4b9f..562652c9c815 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -774,15 +774,10 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 
 void __init lockup_detector_init(void)
 {
-#ifdef CONFIG_NO_HZ_FULL
-	if (tick_nohz_full_enabled()) {
+	if (tick_nohz_full_enabled())
 		pr_info("Disabling watchdog on nohz_full cores by default\n");
-		cpumask_copy(&watchdog_cpumask, housekeeping_mask);
-	} else
-		cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
-#else
-	cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
-#endif
+
+	cpumask_copy(&watchdog_cpumask, housekeeping_cpumask());
 
 	if (!watchdog_nmi_probe())
 		nmi_watchdog_available = true;
-- 
cgit v1.2.3


From 9f0ca2d97ef0b5e966be2cfef26c7c094ec14e41 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Oct 2017 04:42:30 +0200
Subject: sched/isolation: Provide a dynamic off-case to housekeeping_any_cpu()

housekeeping_any_cpu() doesn't handle correctly the case where
CONFIG_NO_HZ_FULL=y and no CPU is in nohz_full mode. So far no caller
needs this but let's prepare to avoid any future surprise.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1509072159-31808-4-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/isolation.h | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index b7cfbc46286c..040df04fa78a 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -7,25 +7,20 @@
 
 #ifdef CONFIG_NO_HZ_FULL
 extern cpumask_var_t housekeeping_mask;
-
-static inline int housekeeping_any_cpu(void)
-{
-	return cpumask_any_and(housekeeping_mask, cpu_online_mask);
-}
-
 extern void __init housekeeping_init(void);
-
 #else
+static inline void housekeeping_init(void) { }
+#endif /* CONFIG_NO_HZ_FULL */
 
 static inline int housekeeping_any_cpu(void)
 {
+#ifdef CONFIG_NO_HZ_FULL
+	if (tick_nohz_full_enabled())
+		return cpumask_any_and(housekeeping_mask, cpu_online_mask);
+#endif
 	return smp_processor_id();
 }
 
-static inline void housekeeping_init(void) { }
-#endif /* CONFIG_NO_HZ_FULL */
-
-
 static inline const struct cpumask *housekeeping_cpumask(void)
 {
 #ifdef CONFIG_NO_HZ_FULL
-- 
cgit v1.2.3


From 7e56a1cf4b28f5739526877b8dbad623fae2e4e7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Oct 2017 04:42:31 +0200
Subject: sched/isolation: Make the housekeeping cpumask private

Nobody needs to access this detail. housekeeping_cpumask() already
takes care of it.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1509072159-31808-5-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/isolation.h | 31 ++++++++++---------------------
 kernel/sched/isolation.c        | 36 +++++++++++++++++++++++++++++++++++-
 2 files changed, 45 insertions(+), 22 deletions(-)

diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index 040df04fa78a..ed935ffc6ffa 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -6,46 +6,35 @@
 #include <linux/tick.h>
 
 #ifdef CONFIG_NO_HZ_FULL
-extern cpumask_var_t housekeeping_mask;
+extern int housekeeping_any_cpu(void);
+extern const struct cpumask *housekeeping_cpumask(void);
+extern void housekeeping_affine(struct task_struct *t);
+extern bool housekeeping_test_cpu(int cpu);
 extern void __init housekeeping_init(void);
+
 #else
-static inline void housekeeping_init(void) { }
-#endif /* CONFIG_NO_HZ_FULL */
 
 static inline int housekeeping_any_cpu(void)
 {
-#ifdef CONFIG_NO_HZ_FULL
-	if (tick_nohz_full_enabled())
-		return cpumask_any_and(housekeeping_mask, cpu_online_mask);
-#endif
 	return smp_processor_id();
 }
 
 static inline const struct cpumask *housekeeping_cpumask(void)
 {
-#ifdef CONFIG_NO_HZ_FULL
-	if (tick_nohz_full_enabled())
-		return housekeeping_mask;
-#endif
 	return cpu_possible_mask;
 }
 
+static inline void housekeeping_affine(struct task_struct *t) { }
+static inline void housekeeping_init(void) { }
+#endif /* CONFIG_NO_HZ_FULL */
+
 static inline bool is_housekeeping_cpu(int cpu)
 {
 #ifdef CONFIG_NO_HZ_FULL
 	if (tick_nohz_full_enabled())
-		return cpumask_test_cpu(cpu, housekeeping_mask);
+		return housekeeping_test_cpu(cpu);
 #endif
 	return true;
 }
 
-static inline void housekeeping_affine(struct task_struct *t)
-{
-#ifdef CONFIG_NO_HZ_FULL
-	if (tick_nohz_full_enabled())
-		set_cpus_allowed_ptr(t, housekeeping_mask);
-
-#endif
-}
-
 #endif /* _LINUX_SCHED_ISOLATION_H */
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 3589252ed476..16445097eb25 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -11,7 +11,41 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 
-cpumask_var_t housekeeping_mask;
+static cpumask_var_t housekeeping_mask;
+
+int housekeeping_any_cpu(void)
+{
+	if (tick_nohz_full_enabled())
+		return cpumask_any_and(housekeeping_mask, cpu_online_mask);
+
+	return smp_processor_id();
+}
+EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
+
+const struct cpumask *housekeeping_cpumask(void)
+{
+	if (tick_nohz_full_enabled())
+		return housekeeping_mask;
+
+	return cpu_possible_mask;
+}
+EXPORT_SYMBOL_GPL(housekeeping_cpumask);
+
+void housekeeping_affine(struct task_struct *t)
+{
+	if (tick_nohz_full_enabled())
+		set_cpus_allowed_ptr(t, housekeeping_mask);
+}
+EXPORT_SYMBOL_GPL(housekeeping_affine);
+
+bool housekeeping_test_cpu(int cpu)
+{
+	if (tick_nohz_full_enabled())
+		return cpumask_test_cpu(cpu, housekeeping_mask);
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
 
 void __init housekeeping_init(void)
 {
-- 
cgit v1.2.3


From e179f5a04ba46ee5c5439480c2bfd68c358168b7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Oct 2017 04:42:32 +0200
Subject: sched/isolation: Use its own static key

Housekeeping code still depends on the nohz_full static key. Since we want
to decouple housekeeping from NOHZ, let's create a housekeeping specific
static key.

It's mostly relevant for calls to is_housekeeping_cpu() from the scheduler.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1509072159-31808-6-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/isolation.h |  3 ++-
 kernel/sched/isolation.c        | 13 +++++++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index ed935ffc6ffa..194c586fbb12 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -6,6 +6,7 @@
 #include <linux/tick.h>
 
 #ifdef CONFIG_NO_HZ_FULL
+DECLARE_STATIC_KEY_FALSE(housekeeping_overriden);
 extern int housekeeping_any_cpu(void);
 extern const struct cpumask *housekeeping_cpumask(void);
 extern void housekeeping_affine(struct task_struct *t);
@@ -31,7 +32,7 @@ static inline void housekeeping_init(void) { }
 static inline bool is_housekeeping_cpu(int cpu)
 {
 #ifdef CONFIG_NO_HZ_FULL
-	if (tick_nohz_full_enabled())
+	if (static_branch_unlikely(&housekeeping_overriden))
 		return housekeeping_test_cpu(cpu);
 #endif
 	return true;
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 16445097eb25..bb8ba19a0235 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -10,12 +10,15 @@
 #include <linux/tick.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/static_key.h>
 
+DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
+EXPORT_SYMBOL_GPL(housekeeping_overriden);
 static cpumask_var_t housekeeping_mask;
 
 int housekeeping_any_cpu(void)
 {
-	if (tick_nohz_full_enabled())
+	if (static_branch_unlikely(&housekeeping_overriden))
 		return cpumask_any_and(housekeeping_mask, cpu_online_mask);
 
 	return smp_processor_id();
@@ -24,7 +27,7 @@ EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
 
 const struct cpumask *housekeeping_cpumask(void)
 {
-	if (tick_nohz_full_enabled())
+	if (static_branch_unlikely(&housekeeping_overriden))
 		return housekeeping_mask;
 
 	return cpu_possible_mask;
@@ -33,14 +36,14 @@ EXPORT_SYMBOL_GPL(housekeeping_cpumask);
 
 void housekeeping_affine(struct task_struct *t)
 {
-	if (tick_nohz_full_enabled())
+	if (static_branch_unlikely(&housekeeping_overriden))
 		set_cpus_allowed_ptr(t, housekeeping_mask);
 }
 EXPORT_SYMBOL_GPL(housekeeping_affine);
 
 bool housekeeping_test_cpu(int cpu)
 {
-	if (tick_nohz_full_enabled())
+	if (static_branch_unlikely(&housekeeping_overriden))
 		return cpumask_test_cpu(cpu, housekeeping_mask);
 
 	return true;
@@ -62,6 +65,8 @@ void __init housekeeping_init(void)
 	cpumask_andnot(housekeeping_mask,
 		       cpu_possible_mask, tick_nohz_full_mask);
 
+	static_branch_enable(&housekeeping_overriden);
+
 	/* We need at least one CPU to handle housekeeping work */
 	WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
 }
-- 
cgit v1.2.3


From 204c083a009378dfa751175b5fcddc75988bab6c Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Oct 2017 04:42:33 +0200
Subject: sched/isolation: Rename is_housekeeping_cpu() to housekeeping_cpu()

Fit it into the housekeeping_*() namespace.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1509072159-31808-7-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/isolation.h | 2 +-
 kernel/sched/core.c             | 6 +++---
 kernel/sched/fair.c             | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index 194c586fbb12..ad0f5d986a2e 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -29,7 +29,7 @@ static inline void housekeeping_affine(struct task_struct *t) { }
 static inline void housekeeping_init(void) { }
 #endif /* CONFIG_NO_HZ_FULL */
 
-static inline bool is_housekeeping_cpu(int cpu)
+static inline bool housekeeping_cpu(int cpu)
 {
 #ifdef CONFIG_NO_HZ_FULL
 	if (static_branch_unlikely(&housekeeping_overriden))
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ad188acb7636..d0fb448dd43a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -527,7 +527,7 @@ int get_nohz_timer_target(void)
 	int i, cpu = smp_processor_id();
 	struct sched_domain *sd;
 
-	if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
+	if (!idle_cpu(cpu) && housekeeping_cpu(cpu))
 		return cpu;
 
 	rcu_read_lock();
@@ -536,14 +536,14 @@ int get_nohz_timer_target(void)
 			if (cpu == i)
 				continue;
 
-			if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
+			if (!idle_cpu(i) && housekeeping_cpu(i)) {
 				cpu = i;
 				goto unlock;
 			}
 		}
 	}
 
-	if (!is_housekeeping_cpu(cpu))
+	if (!housekeeping_cpu(cpu))
 		cpu = housekeeping_any_cpu();
 unlock:
 	rcu_read_unlock();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 591481db8c6a..cdece8f967f0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9027,7 +9027,7 @@ void nohz_balance_enter_idle(int cpu)
 		return;
 
 	/* Spare idle load balancing on CPUs that don't want to be disturbed: */
-	if (!is_housekeeping_cpu(cpu))
+	if (!housekeeping_cpu(cpu))
 		return;
 
 	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
-- 
cgit v1.2.3


From 5c4991e24c69737bd41fc2737b1e3980abbf73f9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Oct 2017 04:42:34 +0200
Subject: sched/isolation: Split out new CONFIG_CPU_ISOLATION=y config from
 CONFIG_NO_HZ_FULL

Split the housekeeping config from CONFIG_NO_HZ_FULL. This way we finally
separate the isolation code from NOHZ.

Although a dependency to CONFIG_NO_HZ_FULL remains for now, while the
housekeeping code still deals with NOHZ internals.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1509072159-31808-8-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/isolation.h | 6 +++---
 init/Kconfig                    | 8 ++++++++
 kernel/sched/Makefile           | 2 +-
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index ad0f5d986a2e..93ac2367a520 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -5,7 +5,7 @@
 #include <linux/init.h>
 #include <linux/tick.h>
 
-#ifdef CONFIG_NO_HZ_FULL
+#ifdef CONFIG_CPU_ISOLATION
 DECLARE_STATIC_KEY_FALSE(housekeeping_overriden);
 extern int housekeeping_any_cpu(void);
 extern const struct cpumask *housekeeping_cpumask(void);
@@ -27,11 +27,11 @@ static inline const struct cpumask *housekeeping_cpumask(void)
 
 static inline void housekeeping_affine(struct task_struct *t) { }
 static inline void housekeeping_init(void) { }
-#endif /* CONFIG_NO_HZ_FULL */
+#endif /* CONFIG_CPU_ISOLATION */
 
 static inline bool housekeeping_cpu(int cpu)
 {
-#ifdef CONFIG_NO_HZ_FULL
+#ifdef CONFIG_CPU_ISOLATION
 	if (static_branch_unlikely(&housekeeping_overriden))
 		return housekeeping_test_cpu(cpu);
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index 78cb2461012e..6f52e6f4bd9d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -472,6 +472,14 @@ config TASK_IO_ACCOUNTING
 
 endmenu # "CPU/Task time and stats accounting"
 
+config CPU_ISOLATION
+	bool "CPU isolation"
+	depends on NO_HZ_FULL
+	help
+	  Make sure that CPUs running critical tasks are not disturbed by
+	  any source of "noise" such as unbound workqueues, timers, kthreads...
+	  Unbound jobs get offloaded to housekeeping CPUs.
+
 source "kernel/rcu/Kconfig"
 
 config BUILD_BIN2C
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 871d43d73375..dbe61302c352 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -26,4 +26,4 @@ obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
 obj-$(CONFIG_CPU_FREQ) += cpufreq.o
 obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
 obj-$(CONFIG_MEMBARRIER) += membarrier.o
-obj-$(CONFIG_NO_HZ_FULL) += isolation.o
+obj-$(CONFIG_CPU_ISOLATION) += isolation.o
-- 
cgit v1.2.3


From de201559df872f83d0c08fb4effe3efd28e6cbc8 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Oct 2017 04:42:35 +0200
Subject: sched/isolation: Introduce housekeeping flags

Before we implement isolcpus under housekeeping, we need the isolation
features to be more finegrained. For example some people want NOHZ_FULL
without the full scheduler isolation, others want full scheduler
isolation without NOHZ_FULL.

So let's cut all these isolation features piecewise, at the risk of
overcutting it right now. We can still merge some flags later if they
always make sense together.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1509072159-31808-9-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/net/ethernet/tile/tilegx.c |  4 ++--
 include/linux/sched/isolation.h    | 26 +++++++++++++++++---------
 kernel/rcu/tree_plugin.h           |  2 +-
 kernel/rcu/update.c                |  2 +-
 kernel/sched/core.c                |  8 ++++----
 kernel/sched/fair.c                |  2 +-
 kernel/sched/isolation.c           | 26 +++++++++++++++-----------
 kernel/watchdog.c                  |  3 ++-
 8 files changed, 43 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
index 27a3272dcd48..b3e5816a4678 100644
--- a/drivers/net/ethernet/tile/tilegx.c
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -2270,8 +2270,8 @@ static int __init tile_net_init_module(void)
 		tile_net_dev_init(name, mac);
 
 	if (!network_cpus_init())
-		cpumask_and(&network_cpus_map, housekeeping_cpumask(),
-			    cpu_online_mask);
+		cpumask_and(&network_cpus_map,
+			    housekeeping_cpumask(HK_FLAG_MISC), cpu_online_mask);
 
 	return 0;
 }
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index 93ac2367a520..9bb753eece3b 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -5,35 +5,43 @@
 #include <linux/init.h>
 #include <linux/tick.h>
 
+enum hk_flags {
+	HK_FLAG_TIMER		= 1,
+	HK_FLAG_RCU		= (1 << 1),
+	HK_FLAG_MISC		= (1 << 2),
+	HK_FLAG_SCHED		= (1 << 3),
+};
+
 #ifdef CONFIG_CPU_ISOLATION
 DECLARE_STATIC_KEY_FALSE(housekeeping_overriden);
-extern int housekeeping_any_cpu(void);
-extern const struct cpumask *housekeeping_cpumask(void);
-extern void housekeeping_affine(struct task_struct *t);
-extern bool housekeeping_test_cpu(int cpu);
+extern int housekeeping_any_cpu(enum hk_flags flags);
+extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags);
+extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags);
+extern bool housekeeping_test_cpu(int cpu, enum hk_flags flags);
 extern void __init housekeeping_init(void);
 
 #else
 
-static inline int housekeeping_any_cpu(void)
+static inline int housekeeping_any_cpu(enum hk_flags flags)
 {
 	return smp_processor_id();
 }
 
-static inline const struct cpumask *housekeeping_cpumask(void)
+static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
 {
 	return cpu_possible_mask;
 }
 
-static inline void housekeeping_affine(struct task_struct *t) { }
+static inline void housekeeping_affine(struct task_struct *t,
+				       enum hk_flags flags) { }
 static inline void housekeeping_init(void) { }
 #endif /* CONFIG_CPU_ISOLATION */
 
-static inline bool housekeeping_cpu(int cpu)
+static inline bool housekeeping_cpu(int cpu, enum hk_flags flags)
 {
 #ifdef CONFIG_CPU_ISOLATION
 	if (static_branch_unlikely(&housekeeping_overriden))
-		return housekeeping_test_cpu(cpu);
+		return housekeeping_test_cpu(cpu, flags);
 #endif
 	return true;
 }
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c7632f51c5a0..34125d23e58a 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2584,7 +2584,7 @@ static void rcu_bind_gp_kthread(void)
 
 	if (!tick_nohz_full_enabled())
 		return;
-	housekeeping_affine(current);
+	housekeeping_affine(current, HK_FLAG_RCU);
 }
 
 /* Record the current task on dyntick-idle entry. */
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 79abeb0ca329..e3e60efaafee 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -719,7 +719,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
 	LIST_HEAD(rcu_tasks_holdouts);
 
 	/* Run on housekeeping CPUs by default.  Sysadm can move if desired. */
-	housekeeping_affine(current);
+	housekeeping_affine(current, HK_FLAG_RCU);
 
 	/*
 	 * Each pass through the following loop makes one check for
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d0fb448dd43a..2210c0203e51 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -527,7 +527,7 @@ int get_nohz_timer_target(void)
 	int i, cpu = smp_processor_id();
 	struct sched_domain *sd;
 
-	if (!idle_cpu(cpu) && housekeeping_cpu(cpu))
+	if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
 		return cpu;
 
 	rcu_read_lock();
@@ -536,15 +536,15 @@ int get_nohz_timer_target(void)
 			if (cpu == i)
 				continue;
 
-			if (!idle_cpu(i) && housekeeping_cpu(i)) {
+			if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
 				cpu = i;
 				goto unlock;
 			}
 		}
 	}
 
-	if (!housekeeping_cpu(cpu))
-		cpu = housekeeping_any_cpu();
+	if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
+		cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
 unlock:
 	rcu_read_unlock();
 	return cpu;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cdece8f967f0..f755de86a813 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9027,7 +9027,7 @@ void nohz_balance_enter_idle(int cpu)
 		return;
 
 	/* Spare idle load balancing on CPUs that don't want to be disturbed: */
-	if (!housekeeping_cpu(cpu))
+	if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
 		return;
 
 	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index bb8ba19a0235..37a138a780ff 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -15,37 +15,39 @@
 DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
 EXPORT_SYMBOL_GPL(housekeeping_overriden);
 static cpumask_var_t housekeeping_mask;
+static unsigned int housekeeping_flags;
 
-int housekeeping_any_cpu(void)
+int housekeeping_any_cpu(enum hk_flags flags)
 {
 	if (static_branch_unlikely(&housekeeping_overriden))
-		return cpumask_any_and(housekeeping_mask, cpu_online_mask);
-
+		if (housekeeping_flags & flags)
+			return cpumask_any_and(housekeeping_mask, cpu_online_mask);
 	return smp_processor_id();
 }
 EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
 
-const struct cpumask *housekeeping_cpumask(void)
+const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
 {
 	if (static_branch_unlikely(&housekeeping_overriden))
-		return housekeeping_mask;
-
+		if (housekeeping_flags & flags)
+			return housekeeping_mask;
 	return cpu_possible_mask;
 }
 EXPORT_SYMBOL_GPL(housekeeping_cpumask);
 
-void housekeeping_affine(struct task_struct *t)
+void housekeeping_affine(struct task_struct *t, enum hk_flags flags)
 {
 	if (static_branch_unlikely(&housekeeping_overriden))
-		set_cpus_allowed_ptr(t, housekeeping_mask);
+		if (housekeeping_flags & flags)
+			set_cpus_allowed_ptr(t, housekeeping_mask);
 }
 EXPORT_SYMBOL_GPL(housekeeping_affine);
 
-bool housekeeping_test_cpu(int cpu)
+bool housekeeping_test_cpu(int cpu, enum hk_flags flags)
 {
 	if (static_branch_unlikely(&housekeeping_overriden))
-		return cpumask_test_cpu(cpu, housekeeping_mask);
-
+		if (housekeeping_flags & flags)
+			return cpumask_test_cpu(cpu, housekeeping_mask);
 	return true;
 }
 EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
@@ -65,6 +67,8 @@ void __init housekeeping_init(void)
 	cpumask_andnot(housekeeping_mask,
 		       cpu_possible_mask, tick_nohz_full_mask);
 
+	housekeeping_flags = HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
+
 	static_branch_enable(&housekeeping_overriden);
 
 	/* We need at least one CPU to handle housekeeping work */
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 562652c9c815..e9e2ebb3db29 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -777,7 +777,8 @@ void __init lockup_detector_init(void)
 	if (tick_nohz_full_enabled())
 		pr_info("Disabling watchdog on nohz_full cores by default\n");
 
-	cpumask_copy(&watchdog_cpumask, housekeeping_cpumask());
+	cpumask_copy(&watchdog_cpumask,
+		     housekeeping_cpumask(HK_FLAG_TIMER));
 
 	if (!watchdog_nmi_probe())
 		nmi_watchdog_available = true;
-- 
cgit v1.2.3


From 6f1982fedd59856bcc42a9b521be4c3ffd2f60a7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Oct 2017 04:42:36 +0200
Subject: sched/isolation: Handle the nohz_full= parameter

We want to centralize the isolation management, done by the housekeeping
subsystem. Therefore we need to handle the nohz_full= parameter from
there.

Since nohz_full= so far has involved unbound timers, watchdog, RCU
and tilegx NAPI isolation, we keep that default behaviour.

nohz_full= will be deprecated in the future. We want to control
the isolation features from the isolcpus= parameter.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1509072159-31808-10-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/isolation.h |  1 +
 include/linux/tick.h            |  2 ++
 init/Kconfig                    |  1 -
 kernel/sched/isolation.c        | 42 +++++++++++++++++++++++++++++------------
 kernel/time/tick-sched.c        | 13 +++----------
 5 files changed, 36 insertions(+), 23 deletions(-)

diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index 9bb753eece3b..e53cfa96e91e 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -10,6 +10,7 @@ enum hk_flags {
 	HK_FLAG_RCU		= (1 << 1),
 	HK_FLAG_MISC		= (1 << 2),
 	HK_FLAG_SCHED		= (1 << 3),
+	HK_FLAG_TICK		= (1 << 4),
 };
 
 #ifdef CONFIG_CPU_ISOLATION
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 68afc09aa8ac..e2a163a9f96c 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -228,6 +228,7 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal,
 
 extern void tick_nohz_full_kick_cpu(int cpu);
 extern void __tick_nohz_task_switch(void);
+extern void __init tick_nohz_full_setup(cpumask_var_t cpumask);
 #else
 static inline bool tick_nohz_full_enabled(void) { return false; }
 static inline bool tick_nohz_full_cpu(int cpu) { return false; }
@@ -248,6 +249,7 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal,
 
 static inline void tick_nohz_full_kick_cpu(int cpu) { }
 static inline void __tick_nohz_task_switch(void) { }
+static inline void tick_nohz_full_setup(cpumask_var_t cpumask) { }
 #endif
 
 static inline void tick_nohz_task_switch(void)
diff --git a/init/Kconfig b/init/Kconfig
index 6f52e6f4bd9d..f8564df58d0a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -474,7 +474,6 @@ endmenu # "CPU/Task time and stats accounting"
 
 config CPU_ISOLATION
 	bool "CPU isolation"
-	depends on NO_HZ_FULL
 	help
 	  Make sure that CPUs running critical tasks are not disturbed by
 	  any source of "noise" such as unbound workqueues, timers, kthreads...
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 37a138a780ff..1f61e440358d 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -54,23 +54,41 @@ EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
 
 void __init housekeeping_init(void)
 {
-	if (!tick_nohz_full_enabled())
+	if (!housekeeping_flags)
 		return;
 
-	if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
-		WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
-		cpumask_clear(tick_nohz_full_mask);
-		tick_nohz_full_running = false;
-		return;
+	static_branch_enable(&housekeeping_overriden);
+
+	/* We need at least one CPU to handle housekeeping work */
+	WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+static int __init housekeeping_nohz_full_setup(char *str)
+{
+	cpumask_var_t non_housekeeping_mask;
+
+	alloc_bootmem_cpumask_var(&non_housekeeping_mask);
+	if (cpulist_parse(str, non_housekeeping_mask) < 0) {
+		pr_warn("Housekeeping: Incorrect nohz_full cpumask\n");
+		free_bootmem_cpumask_var(non_housekeeping_mask);
+		return 0;
 	}
 
-	cpumask_andnot(housekeeping_mask,
-		       cpu_possible_mask, tick_nohz_full_mask);
+	alloc_bootmem_cpumask_var(&housekeeping_mask);
+	cpumask_andnot(housekeeping_mask, cpu_possible_mask, non_housekeeping_mask);
 
-	housekeeping_flags = HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
+	if (cpumask_empty(housekeeping_mask))
+		cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
 
-	static_branch_enable(&housekeeping_overriden);
+	housekeeping_flags = HK_FLAG_TICK | HK_FLAG_TIMER |
+				HK_FLAG_RCU | HK_FLAG_MISC;
 
-	/* We need at least one CPU to handle housekeeping work */
-	WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
+	tick_nohz_full_setup(non_housekeeping_mask);
+
+	free_bootmem_cpumask_var(non_housekeeping_mask);
+
+	return 1;
 }
+__setup("nohz_full=", housekeeping_nohz_full_setup);
+#endif
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 27d7d522ac4e..69f3dbe38984 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -385,20 +385,13 @@ out:
 	local_irq_restore(flags);
 }
 
-/* Parse the boot-time nohz CPU list from the kernel parameters. */
-static int __init tick_nohz_full_setup(char *str)
+/* Get the boot-time nohz CPU list from the kernel parameters. */
+void __init tick_nohz_full_setup(cpumask_var_t cpumask)
 {
 	alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
-	if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
-		pr_warn("NO_HZ: Incorrect nohz_full cpumask\n");
-		free_bootmem_cpumask_var(tick_nohz_full_mask);
-		return 1;
-	}
+	cpumask_copy(tick_nohz_full_mask, cpumask);
 	tick_nohz_full_running = true;
-
-	return 1;
 }
-__setup("nohz_full=", tick_nohz_full_setup);
 
 static int tick_nohz_cpu_down(unsigned int cpu)
 {
-- 
cgit v1.2.3


From edb9382175c3ebdced8ffdb3e0f20052ad9fdbe9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Oct 2017 04:42:37 +0200
Subject: sched/isolation: Move isolcpus= handling to the housekeeping code

We want to centralize the isolation features, to be done by the housekeeping
subsystem and scheduler domain isolation is a significant part of it.

No intended behaviour change, we just reuse the housekeeping cpumask
and core code.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1509072159-31808-11-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/base/cpu.c              | 11 ++++++-
 include/linux/sched.h           |  2 --
 include/linux/sched/isolation.h |  1 +
 kernel/cgroup/cpuset.c          | 15 ++++------
 kernel/sched/core.c             | 16 +----------
 kernel/sched/isolation.c        | 63 ++++++++++++++++++++++++++++++++---------
 kernel/sched/topology.c         | 24 ++++------------
 7 files changed, 73 insertions(+), 59 deletions(-)

diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 321cd7b4d817..a73ab95558f5 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -18,6 +18,7 @@
 #include <linux/cpufeature.h>
 #include <linux/tick.h>
 #include <linux/pm_qos.h>
+#include <linux/sched/isolation.h>
 
 #include "base.h"
 
@@ -271,8 +272,16 @@ static ssize_t print_cpus_isolated(struct device *dev,
 				  struct device_attribute *attr, char *buf)
 {
 	int n = 0, len = PAGE_SIZE-2;
+	cpumask_var_t isolated;
 
-	n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(cpu_isolated_map));
+	if (!alloc_cpumask_var(&isolated, GFP_KERNEL))
+		return -ENOMEM;
+
+	cpumask_andnot(isolated, cpu_possible_mask,
+		       housekeeping_cpumask(HK_FLAG_DOMAIN));
+	n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(isolated));
+
+	free_cpumask_var(isolated);
 
 	return n;
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0f897dfc195e..1b0cc0d6df8d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -165,8 +165,6 @@ struct task_group;
 /* Task command name length: */
 #define TASK_COMM_LEN			16
 
-extern cpumask_var_t			cpu_isolated_map;
-
 extern void scheduler_tick(void);
 
 #define	MAX_SCHEDULE_TIMEOUT		LONG_MAX
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index e53cfa96e91e..d849431c8060 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -11,6 +11,7 @@ enum hk_flags {
 	HK_FLAG_MISC		= (1 << 2),
 	HK_FLAG_SCHED		= (1 << 3),
 	HK_FLAG_TICK		= (1 << 4),
+	HK_FLAG_DOMAIN		= (1 << 5),
 };
 
 #ifdef CONFIG_CPU_ISOLATION
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4657e2924ecb..f7efa7b4d825 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -57,7 +57,7 @@
 #include <linux/backing-dev.h>
 #include <linux/sort.h>
 #include <linux/oom.h>
-
+#include <linux/sched/isolation.h>
 #include <linux/uaccess.h>
 #include <linux/atomic.h>
 #include <linux/mutex.h>
@@ -656,7 +656,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	int csn;		/* how many cpuset ptrs in csa so far */
 	int i, j, k;		/* indices for partition finding loops */
 	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
-	cpumask_var_t non_isolated_cpus;  /* load balanced CPUs */
 	struct sched_domain_attr *dattr;  /* attributes for custom domains */
 	int ndoms = 0;		/* number of sched domains in result */
 	int nslot;		/* next empty doms[] struct cpumask slot */
@@ -666,10 +665,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	dattr = NULL;
 	csa = NULL;
 
-	if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
-		goto done;
-	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
-
 	/* Special case for the 99% of systems with one, full, sched domain */
 	if (is_sched_load_balance(&top_cpuset)) {
 		ndoms = 1;
@@ -683,7 +678,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
 			update_domain_attr_tree(dattr, &top_cpuset);
 		}
 		cpumask_and(doms[0], top_cpuset.effective_cpus,
-				     non_isolated_cpus);
+			    housekeeping_cpumask(HK_FLAG_DOMAIN));
 
 		goto done;
 	}
@@ -707,7 +702,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
 		 */
 		if (!cpumask_empty(cp->cpus_allowed) &&
 		    !(is_sched_load_balance(cp) &&
-		      cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
+		      cpumask_intersects(cp->cpus_allowed,
+					 housekeeping_cpumask(HK_FLAG_DOMAIN))))
 			continue;
 
 		if (is_sched_load_balance(cp))
@@ -789,7 +785,7 @@ restart:
 
 			if (apn == b->pn) {
 				cpumask_or(dp, dp, b->effective_cpus);
-				cpumask_and(dp, dp, non_isolated_cpus);
+				cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
 				if (dattr)
 					update_domain_attr_tree(dattr + nslot, b);
 
@@ -802,7 +798,6 @@ restart:
 	BUG_ON(nslot != ndoms);
 
 done:
-	free_cpumask_var(non_isolated_cpus);
 	kfree(csa);
 
 	/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2210c0203e51..1a55c842bfbc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -84,9 +84,6 @@ __read_mostly int scheduler_running;
  */
 int sysctl_sched_rt_runtime = 950000;
 
-/* CPUs with isolated domains */
-cpumask_var_t cpu_isolated_map;
-
 /*
  * __task_rq_lock - lock the rq @p resides on.
  */
@@ -5735,10 +5732,6 @@ static inline void sched_init_smt(void) { }
 
 void __init sched_init_smp(void)
 {
-	cpumask_var_t non_isolated_cpus;
-
-	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
-
 	sched_init_numa();
 
 	/*
@@ -5748,16 +5741,12 @@ void __init sched_init_smp(void)
 	 */
 	mutex_lock(&sched_domains_mutex);
 	sched_init_domains(cpu_active_mask);
-	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
-	if (cpumask_empty(non_isolated_cpus))
-		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
 	mutex_unlock(&sched_domains_mutex);
 
 	/* Move init over to a non-isolated CPU */
-	if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
+	if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
 		BUG();
 	sched_init_granularity();
-	free_cpumask_var(non_isolated_cpus);
 
 	init_sched_rt_class();
 	init_sched_dl_class();
@@ -5961,9 +5950,6 @@ void __init sched_init(void)
 	calc_load_update = jiffies + LOAD_FREQ;
 
 #ifdef CONFIG_SMP
-	/* May be allocated at isolcpus cmdline parse time */
-	if (cpu_isolated_map == NULL)
-		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 	idle_thread_set_boot_cpu();
 	set_cpu_rq_start_time(smp_processor_id());
 #endif
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 1f61e440358d..8f666bc5abe8 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -63,32 +63,69 @@ void __init housekeeping_init(void)
 	WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
 }
 
-#ifdef CONFIG_NO_HZ_FULL
-static int __init housekeeping_nohz_full_setup(char *str)
+static int __init housekeeping_setup(char *str, enum hk_flags flags)
 {
 	cpumask_var_t non_housekeeping_mask;
+	int err;
 
 	alloc_bootmem_cpumask_var(&non_housekeeping_mask);
-	if (cpulist_parse(str, non_housekeeping_mask) < 0) {
-		pr_warn("Housekeeping: Incorrect nohz_full cpumask\n");
+	err = cpulist_parse(str, non_housekeeping_mask);
+	if (err < 0 || cpumask_last(non_housekeeping_mask) >= nr_cpu_ids) {
+		pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n");
 		free_bootmem_cpumask_var(non_housekeeping_mask);
 		return 0;
 	}
 
-	alloc_bootmem_cpumask_var(&housekeeping_mask);
-	cpumask_andnot(housekeeping_mask, cpu_possible_mask, non_housekeeping_mask);
-
-	if (cpumask_empty(housekeeping_mask))
-		cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
+	if (!housekeeping_flags) {
+		alloc_bootmem_cpumask_var(&housekeeping_mask);
+		cpumask_andnot(housekeeping_mask,
+			       cpu_possible_mask, non_housekeeping_mask);
+		if (cpumask_empty(housekeeping_mask))
+			cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
+	} else {
+		cpumask_var_t tmp;
+
+		alloc_bootmem_cpumask_var(&tmp);
+		cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);
+		if (!cpumask_equal(tmp, housekeeping_mask)) {
+			pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
+			free_bootmem_cpumask_var(tmp);
+			free_bootmem_cpumask_var(non_housekeeping_mask);
+			return 0;
+		}
+		free_bootmem_cpumask_var(tmp);
+	}
 
-	housekeeping_flags = HK_FLAG_TICK | HK_FLAG_TIMER |
-				HK_FLAG_RCU | HK_FLAG_MISC;
+	if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
+		if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+			tick_nohz_full_setup(non_housekeeping_mask);
+		} else {
+			pr_warn("Housekeeping: nohz unsupported."
+				" Build with CONFIG_NO_HZ_FULL\n");
+			free_bootmem_cpumask_var(non_housekeeping_mask);
+			return 0;
+		}
+	}
 
-	tick_nohz_full_setup(non_housekeeping_mask);
+	housekeeping_flags |= flags;
 
 	free_bootmem_cpumask_var(non_housekeeping_mask);
 
 	return 1;
 }
+
+static int __init housekeeping_nohz_full_setup(char *str)
+{
+	unsigned int flags;
+
+	flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
+
+	return housekeeping_setup(str, flags);
+}
 __setup("nohz_full=", housekeeping_nohz_full_setup);
-#endif
+
+static int __init housekeeping_isolcpus_setup(char *str)
+{
+	return housekeeping_setup(str, HK_FLAG_DOMAIN);
+}
+__setup("isolcpus=", housekeeping_isolcpus_setup);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index e3d31b0880dc..2e6b9126ffdc 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -3,6 +3,7 @@
  */
 #include <linux/sched.h>
 #include <linux/mutex.h>
+#include <linux/sched/isolation.h>
 
 #include "sched.h"
 
@@ -469,21 +470,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 	update_top_cache_domain(cpu);
 }
 
-/* Setup the mask of CPUs configured for isolated domains */
-static int __init isolated_cpu_setup(char *str)
-{
-	int ret;
-
-	alloc_bootmem_cpumask_var(&cpu_isolated_map);
-	ret = cpulist_parse(str, cpu_isolated_map);
-	if (ret || cpumask_last(cpu_isolated_map) >= nr_cpu_ids) {
-		pr_err("sched: Error, all isolcpus= values must be between 0 and %u - ignoring them.\n", nr_cpu_ids-1);
-		return 0;
-	}
-	return 1;
-}
-__setup("isolcpus=", isolated_cpu_setup);
-
 struct s_data {
 	struct sched_domain ** __percpu sd;
 	struct root_domain	*rd;
@@ -1792,7 +1778,7 @@ int sched_init_domains(const struct cpumask *cpu_map)
 	doms_cur = alloc_sched_domains(ndoms_cur);
 	if (!doms_cur)
 		doms_cur = &fallback_doms;
-	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
+	cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
 	err = build_sched_domains(doms_cur[0], NULL);
 	register_sched_domain_sysctl();
 
@@ -1875,7 +1861,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 		doms_new = alloc_sched_domains(1);
 		if (doms_new) {
 			n = 1;
-			cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
+			cpumask_and(doms_new[0], cpu_active_mask,
+				    housekeeping_cpumask(HK_FLAG_DOMAIN));
 		}
 	} else {
 		n = ndoms_new;
@@ -1898,7 +1885,8 @@ match1:
 	if (!doms_new) {
 		n = 0;
 		doms_new = &fallback_doms;
-		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
+		cpumask_and(doms_new[0], cpu_active_mask,
+			    housekeeping_cpumask(HK_FLAG_DOMAIN));
 	}
 
 	/* Build new domains: */
-- 
cgit v1.2.3


From 150dfee95feb913876ced5cc559a342384e8ea97 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Oct 2017 04:42:38 +0200
Subject: sched/isolation: Add basic isolcpus flags

Add flags to control NOHZ and domain isolation from "isolcpus=", in
order to centralize the isolation features to a common interface. Domain
isolation remains the default so not to break the existing isolcpus
boot paramater behaviour.

Further flags in the future may include 0hz (1hz tick offload) and timers,
workqueue, RCU, kthread, watchdog, likely all merged together in a
common flag ("async"?). In any case, this will have to be modifiable by
cpusets.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1509072159-31808-12-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/isolation.c | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 8f666bc5abe8..b71b436f59f2 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -11,6 +11,7 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/static_key.h>
+#include <linux/ctype.h>
 
 DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
 EXPORT_SYMBOL_GPL(housekeeping_overriden);
@@ -126,6 +127,29 @@ __setup("nohz_full=", housekeeping_nohz_full_setup);
 
 static int __init housekeeping_isolcpus_setup(char *str)
 {
-	return housekeeping_setup(str, HK_FLAG_DOMAIN);
+	unsigned int flags = 0;
+
+	while (isalpha(*str)) {
+		if (!strncmp(str, "nohz,", 5)) {
+			str += 5;
+			flags |= HK_FLAG_TICK;
+			continue;
+		}
+
+		if (!strncmp(str, "domain,", 7)) {
+			str += 7;
+			flags |= HK_FLAG_DOMAIN;
+			continue;
+		}
+
+		pr_warn("isolcpus: Error, unknown flag\n");
+		return 0;
+	}
+
+	/* Default behaviour for isolcpus without flags */
+	if (!flags)
+		flags |= HK_FLAG_DOMAIN;
+
+	return housekeeping_setup(str, flags);
 }
 __setup("isolcpus=", housekeeping_isolcpus_setup);
-- 
cgit v1.2.3


From b0d40d2b22fe48cfcbbfb137fd198be0a1cd8a85 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 31 Oct 2017 04:18:34 +0100
Subject: sched/isolation: Document isolcpus= boot parameter flags, mark it
 deprecated

Document the latest updates on the isolcpus= boot option. While at it,
let's also fix the details about the preferred way to isolate a set of
CPUs from the scheduler general domains. Cpusets offer a much better
interface to achieve that.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1509419914-16179-1-git-send-email-frederic@kernel.org
[ Clarified the text some more, marked the boot option deprecated. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.txt | 39 ++++++++++++++++---------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6b99c8b77c59..17eb0234d5be 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1727,20 +1727,33 @@
 	isapnp=		[ISAPNP]
 			Format: <RDP>,<reset>,<pci_scan>,<verbosity>
 
-	isolcpus=	[KNL,SMP] Isolate CPUs from the general scheduler.
-			The argument is a cpu list, as described above.
+	isolcpus=	[KNL,SMP] Isolate a given set of CPUs from disturbance.
+			[Deprecated - use cpusets instead]
+			Format: [flag-list,]<cpu-list>
+
+			Specify one or more CPUs to isolate from disturbances
+			specified in the flag list (default: domain):
+
+			nohz
+			  Disable the tick when a single task runs.
+			domain
+			  Isolate from the general SMP balancing and scheduling
+			  algorithms. Note that performing domain isolation this way
+			  is irreversible: it's not possible to bring back a CPU to
+			  the domains once isolated through isolcpus. It's strongly
+			  advised to use cpusets instead to disable scheduler load
+			  balancing through the "cpuset.sched_load_balance" file.
+			  It offers a much more flexible interface where CPUs can
+			  move in and out of an isolated set anytime.
+
+			  You can move a process onto or off an "isolated" CPU via
+			  the CPU affinity syscalls or cpuset.
+			  <cpu number> begins at 0 and the maximum value is
+			  "number of CPUs in system - 1".
+
+			The format of <cpu-list> is described above.
+
 
-			This option can be used to specify one or more CPUs
-			to isolate from the general SMP balancing and scheduling
-			algorithms. You can move a process onto or off an
-			"isolated" CPU via the CPU affinity syscalls or cpuset.
-			<cpu number> begins at 0 and the maximum value is
-			"number of CPUs in system - 1".
-
-			This option is the preferred way to isolate CPUs. The
-			alternative -- manually setting the CPU mask of all
-			tasks in the system -- can cause problems and
-			suboptimal load balancer performance.
 
 	iucv=		[HW,NET]
 
-- 
cgit v1.2.3


From a9903f04e0a4ea522d959c2f287cdf0ab029e324 Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <mka@chromium.org>
Date: Mon, 30 Oct 2017 11:08:16 -0700
Subject: sched/sysctl: Fix attributes of some extern declarations

The definition of sysctl_sched_migration_cost, sysctl_sched_nr_migrate
and sysctl_sched_time_avg includes the attribute const_debug. This
attribute is not part of the extern declaration of these variables in
include/linux/sched/sysctl.h, while it is in kernel/sched/sched.h,
and as a result Clang generates warnings like this:

  kernel/sched/sched.h:1618:33: warning: section attribute is specified on redeclared variable [-Wsection]
  extern const_debug unsigned int sysctl_sched_time_avg;
                                ^
  ./include/linux/sched/sysctl.h:42:21: note: previous declaration is here
  extern unsigned int sysctl_sched_time_avg;

The header only declares the variables when CONFIG_SCHED_DEBUG is defined,
therefore it is not necessary to duplicate the definition of const_debug.
Instead we can use the attribute __read_mostly, which is the expansion of
const_debug when CONFIG_SCHED_DEBUG=y is set.

Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
Reviewed-by: Nick Desaulniers <nick.desaulniers@gmail.com>
Cc: Douglas Anderson <dianders@chromium.org>
Cc: Guenter Roeck <groeck@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Shile Zhang <shile.zhang@nokia.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171030180816.170850-1-mka@chromium.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/sysctl.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 0f5ecd4d298e..d34c823f3d36 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -37,9 +37,9 @@ extern unsigned int sysctl_numa_balancing_scan_period_max;
 extern unsigned int sysctl_numa_balancing_scan_size;
 
 #ifdef CONFIG_SCHED_DEBUG
-extern unsigned int sysctl_sched_migration_cost;
-extern unsigned int sysctl_sched_nr_migrate;
-extern unsigned int sysctl_sched_time_avg;
+extern __read_mostly unsigned int sysctl_sched_migration_cost;
+extern __read_mostly unsigned int sysctl_sched_nr_migrate;
+extern __read_mostly unsigned int sysctl_sched_time_avg;
 
 int sched_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *length,
-- 
cgit v1.2.3


From 765cc3a4b224e22bf524fabe40284a524f37cdd0 Mon Sep 17 00:00:00 2001
From: Patrick Bellasi <patrick.bellasi@arm.com>
Date: Wed, 8 Nov 2017 18:41:01 +0000
Subject: sched/core: Optimize sched_feat() for !CONFIG_SCHED_DEBUG builds

When the kernel is compiled with !CONFIG_SCHED_DEBUG support, we expect that
all SCHED_FEAT are turned into compile time constants being propagated
to support compiler optimizations.

Specifically, we expect that code blocks like this:

   if (sched_feat(FEATURE_NAME) [&& <other_conditions>]) {
	/* FEATURE CODE */
   }

are turned into dead-code in case FEATURE_NAME defaults to FALSE, and thus
being removed by the compiler from the finale image.

For this mechanism to properly work it's required for the compiler to
have full access, from each translation unit, to whatever is the value
defined by the sched_feat macro. This macro is defined as:

   #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))

and thus, the compiler can optimize that code only if the value of
sysctl_sched_features is visible within each translation unit.

Since:

   029632fbb ("sched: Make separate sched*.c translation units")

the scheduler code has been split into separate translation units
however the definition of sysctl_sched_features is part of
kernel/sched/core.c while, for all the other scheduler modules, it is
visible only via kernel/sched/sched.h as an:

   extern const_debug unsigned int sysctl_sched_features

Unfortunately, an extern reference does not allow the compiler to apply
constants propagation. Thus, on !CONFIG_SCHED_DEBUG kernel we still end up
with code to load a memory reference and (eventually) doing an unconditional
jump of a chunk of code.

This mechanism is unavoidable when sched_features can be turned on and off at
run-time. However, this is not the case for "production" kernels compiled with
!CONFIG_SCHED_DEBUG. In this case, sysctl_sched_features is just a constant value
which cannot be changed at run-time and thus memory loads and jumps can be
avoided altogether.

This patch fixes the case of !CONFIG_SCHED_DEBUG kernel by declaring a local version
of the sysctl_sched_features constant for each translation unit. This will
ultimately allow the compiler to perform constants propagation and dead-code
pruning.

Tests have been done, with !CONFIG_SCHED_DEBUG on a v4.14-rc8 with and without
the patch, by running 30 iterations of:

   perf bench sched messaging --pipe --thread --group 4 --loop 50000

on a 40 cores Intel(R) Xeon(R) CPU E5-2690 v2 @ 3.00GHz using the
powersave governor to rule out variations due to frequency scaling.

Statistics on the reported completion time:

                   count     mean       std     min       99%     max
  v4.14-rc8         30.0  15.7831  0.176032  15.442  16.01226  16.014
  v4.14-rc8+patch   30.0  15.5033  0.189681  15.232  15.93938  15.962

... show a 1.8% speedup on average completion time and 0.5% speedup in the
99 percentile.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Signed-off-by: Chris Redpath <chris.redpath@arm.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Reviewed-by: Brendan Jackman <brendan.jackman@arm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Link: http://lkml.kernel.org/r/20171108184101.16006-1-patrick.bellasi@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  |  9 ++++++---
 kernel/sched/sched.h | 25 ++++++++++++++++++++++---
 2 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1a55c842bfbc..a39a08113003 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -43,18 +43,21 @@
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
+#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
 /*
  * Debugging: various feature bits
+ *
+ * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
+ * sysctl_sched_features, defined in sched.h, to allow constants propagation
+ * at compile time and compiler optimization based on features default.
  */
-
 #define SCHED_FEAT(name, enabled)	\
 	(1UL << __SCHED_FEAT_##name) * enabled |
-
 const_debug unsigned int sysctl_sched_features =
 #include "features.h"
 	0;
-
 #undef SCHED_FEAT
+#endif
 
 /*
  * Number of tasks to iterate in a single balance run.
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 58787e3631c7..45ab0bf564e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1233,8 +1233,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 # define const_debug const
 #endif
 
-extern const_debug unsigned int sysctl_sched_features;
-
 #define SCHED_FEAT(name, enabled)	\
 	__SCHED_FEAT_##name ,
 
@@ -1246,6 +1244,13 @@ enum {
 #undef SCHED_FEAT
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
+
+/*
+ * To support run-time toggling of sched features, all the translation units
+ * (but core.c) reference the sysctl_sched_features defined in core.c.
+ */
+extern const_debug unsigned int sysctl_sched_features;
+
 #define SCHED_FEAT(name, enabled)					\
 static __always_inline bool static_branch_##name(struct static_key *key) \
 {									\
@@ -1253,13 +1258,27 @@ static __always_inline bool static_branch_##name(struct static_key *key) \
 }
 
 #include "features.h"
-
 #undef SCHED_FEAT
 
 extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
+
 #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
+
+/*
+ * Each translation unit has its own copy of sysctl_sched_features to allow
+ * constants propagation at compile time and compiler optimization based on
+ * features default.
+ */
+#define SCHED_FEAT(name, enabled)	\
+	(1UL << __SCHED_FEAT_##name) * enabled |
+static const_debug __maybe_unused unsigned int sysctl_sched_features =
+#include "features.h"
+	0;
+#undef SCHED_FEAT
+
 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
+
 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
 
 extern struct static_key_false sched_numa_balancing;
-- 
cgit v1.2.3