From 6d299f1b53b84e2665f402d9bcc494800aba6386 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Mon, 12 May 2008 21:21:14 +0200
Subject: sched: fix SCHED_OTHER balance iterator to include all tasks

The currently logic inadvertently skips the last task on the run-queue,
resulting in missed balance opportunities.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: David Bahi <dbahi@novell.com>
CC: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched_fair.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 08ae848b71d4..1fe4c65a8170 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1275,23 +1275,18 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
 	struct task_struct *p = NULL;
 	struct sched_entity *se;
 
-	if (next == &cfs_rq->tasks)
-		return NULL;
-
-	/* Skip over entities that are not tasks */
-	do {
+	while (next != &cfs_rq->tasks) {
 		se = list_entry(next, struct sched_entity, group_node);
 		next = next->next;
-	} while (next != &cfs_rq->tasks && !entity_is_task(se));
 
-	if (next == &cfs_rq->tasks)
-		return NULL;
+		/* Skip over entities that are not tasks */
+		if (entity_is_task(se)) {
+			p = task_of(se);
+			break;
+		}
+	}
 
 	cfs_rq->balance_iterator = next;
-
-	if (entity_is_task(se))
-		p = task_of(se);
-
 	return p;
 }
 
-- 
cgit v1.2.3


From a7be37ac8e1565e00880531f4e2aff421a21c803 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:11 +0200
Subject: sched: revert the revert of: weight calculations

Try again..

initial commit: 8f1bc385cfbab474db6c27b5af1e439614f3025c
revert: f9305d4a0968201b2818dbed0dc8cb0d4ee7aeb3

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c          |   9 ++---
 kernel/sched_fair.c     | 105 +++++++++++++++++++++++++++++++++---------------
 kernel/sched_features.h |   1 +
 3 files changed, 76 insertions(+), 39 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index c51d9fae8cd8..f653af684fb3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1342,6 +1342,9 @@ static void __resched_task(struct task_struct *p, int tif_bit)
  */
 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
 
+/*
+ * delta *= weight / lw
+ */
 static unsigned long
 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 		struct load_weight *lw)
@@ -1369,12 +1372,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
 }
 
-static inline unsigned long
-calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
-{
-	return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
-}
-
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
 	lw->weight += inc;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 1fe4c65a8170..496500988ce5 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -333,6 +333,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 }
 #endif
 
+/*
+ * delta *= w / rw
+ */
+static inline unsigned long
+calc_delta_weight(unsigned long delta, struct sched_entity *se)
+{
+	for_each_sched_entity(se) {
+		delta = calc_delta_mine(delta,
+				se->load.weight, &cfs_rq_of(se)->load);
+	}
+
+	return delta;
+}
+
+/*
+ * delta *= rw / w
+ */
+static inline unsigned long
+calc_delta_fair(unsigned long delta, struct sched_entity *se)
+{
+	for_each_sched_entity(se) {
+		delta = calc_delta_mine(delta,
+				cfs_rq_of(se)->load.weight, &se->load);
+	}
+
+	return delta;
+}
+
 /*
  * The idea is to set a period in which each task runs once.
  *
@@ -362,47 +390,54 @@ static u64 __sched_period(unsigned long nr_running)
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	u64 slice = __sched_period(cfs_rq->nr_running);
-
-	for_each_sched_entity(se) {
-		cfs_rq = cfs_rq_of(se);
-
-		slice *= se->load.weight;
-		do_div(slice, cfs_rq->load.weight);
-	}
-
-
-	return slice;
+	return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
 }
 
 /*
  * We calculate the vruntime slice of a to be inserted task
  *
- * vs = s/w = p/rw
+ * vs = s*rw/w = p
  */
 static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	unsigned long nr_running = cfs_rq->nr_running;
-	unsigned long weight;
-	u64 vslice;
 
 	if (!se->on_rq)
 		nr_running++;
 
-	vslice = __sched_period(nr_running);
+	return __sched_period(nr_running);
+}
+
+/*
+ * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
+ * that it favours >=0 over <0.
+ *
+ *   -20         |
+ *               |
+ *     0 --------+-------
+ *             .'
+ *    19     .'
+ *
+ */
+static unsigned long
+calc_delta_asym(unsigned long delta, struct sched_entity *se)
+{
+	struct load_weight lw = {
+		.weight = NICE_0_LOAD,
+		.inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
+	};
 
 	for_each_sched_entity(se) {
-		cfs_rq = cfs_rq_of(se);
+		struct load_weight *se_lw = &se->load;
 
-		weight = cfs_rq->load.weight;
-		if (!se->on_rq)
-			weight += se->load.weight;
+		if (se->load.weight < NICE_0_LOAD)
+			se_lw = &lw;
 
-		vslice *= NICE_0_LOAD;
-		do_div(vslice, weight);
+		delta = calc_delta_mine(delta,
+				cfs_rq_of(se)->load.weight, se_lw);
 	}
 
-	return vslice;
+	return delta;
 }
 
 /*
@@ -419,11 +454,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 
 	curr->sum_exec_runtime += delta_exec;
 	schedstat_add(cfs_rq, exec_clock, delta_exec);
-	delta_exec_weighted = delta_exec;
-	if (unlikely(curr->load.weight != NICE_0_LOAD)) {
-		delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
-							&curr->load);
-	}
+	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
 	curr->vruntime += delta_exec_weighted;
 }
 
@@ -609,8 +640,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
-		if (sched_feat(NEW_FAIR_SLEEPERS))
-			vruntime -= sysctl_sched_latency;
+		if (sched_feat(NEW_FAIR_SLEEPERS)) {
+			unsigned long thresh = sysctl_sched_latency;
+
+			/*
+			 * convert the sleeper threshold into virtual time
+			 */
+			if (sched_feat(NORMALIZED_SLEEPER))
+				thresh = calc_delta_fair(thresh, se);
+
+			vruntime -= thresh;
+		}
 
 		/* ensure we never gain time by being placed backwards. */
 		vruntime = max_vruntime(se->vruntime, vruntime);
@@ -1111,11 +1151,10 @@ static unsigned long wakeup_gran(struct sched_entity *se)
 	unsigned long gran = sysctl_sched_wakeup_granularity;
 
 	/*
-	 * More easily preempt - nice tasks, while not making
-	 * it harder for + nice tasks.
+	 * More easily preempt - nice tasks, while not making it harder for
+	 * + nice tasks.
 	 */
-	if (unlikely(se->load.weight > NICE_0_LOAD))
-		gran = calc_delta_fair(gran, &se->load);
+	gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
 
 	return gran;
 }
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 62b39ca92ebd..afa549166d8d 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,4 +1,5 @@
 SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
+SCHED_FEAT(NORMALIZED_SLEEPER, 1)
 SCHED_FEAT(WAKEUP_PREEMPT, 1)
 SCHED_FEAT(START_DEBIT, 1)
 SCHED_FEAT(AFFINE_WAKEUPS, 1)
-- 
cgit v1.2.3


From c9c294a630e28eec5f2865f028ecfc58d45c0a5a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:12 +0200
Subject: sched: fix calc_delta_asym()

calc_delta_asym() is supposed to do the same as calc_delta_fair() except
linearly shrink the result for negative nice processes - this causes them
to have a smaller preemption threshold so that they are more easily preempted.

The problem is that for task groups se->load.weight is the per cpu share of
the actual task group weight; take that into account.

Also provide a debug switch to disable the asymmetry (which I still don't
like - but it does greatly benefit some workloads)

This would explain the interactivity issues reported against group scheduling.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c     | 28 +++++++++++++++++++++++++++-
 kernel/sched_features.h |  1 +
 2 files changed, 28 insertions(+), 1 deletion(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 496500988ce5..2268e634812b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -430,6 +430,29 @@ calc_delta_asym(unsigned long delta, struct sched_entity *se)
 	for_each_sched_entity(se) {
 		struct load_weight *se_lw = &se->load;
 
+#ifdef CONFIG_FAIR_SCHED_GROUP
+		struct cfs_rq *cfs_rq = se->my_q;
+		struct task_group *tg = NULL
+
+		if (cfs_rq)
+			tg = cfs_rq->tg;
+
+		if (tg && tg->shares < NICE_0_LOAD) {
+			/*
+			 * scale shares to what it would have been had
+			 * tg->weight been NICE_0_LOAD:
+			 *
+			 *   weight = 1024 * shares / tg->weight
+			 */
+			lw.weight *= se->load.weight;
+			lw.weight /= tg->shares;
+
+			lw.inv_weight = 0;
+
+			se_lw = &lw;
+		} else
+#endif
+
 		if (se->load.weight < NICE_0_LOAD)
 			se_lw = &lw;
 
@@ -1154,7 +1177,10 @@ static unsigned long wakeup_gran(struct sched_entity *se)
 	 * More easily preempt - nice tasks, while not making it harder for
 	 * + nice tasks.
 	 */
-	gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
+	if (sched_feat(ASYM_GRAN))
+		gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
+	else
+		gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
 
 	return gran;
 }
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index afa549166d8d..04123af2e678 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -7,3 +7,4 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1)
 SCHED_FEAT(SYNC_WAKEUPS, 1)
 SCHED_FEAT(HRTICK, 1)
 SCHED_FEAT(DOUBLE_TICK, 0)
+SCHED_FEAT(ASYM_GRAN, 1)
-- 
cgit v1.2.3


From ced8aa16e1db55c33c507174c1b1f9e107445865 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:13 +0200
Subject: sched: fix calc_delta_asym, #2

Ok, so why are we in this mess, it was:

  1/w

but now we mixed that rw in the mix like:

 rw/w

rw being \Sum w suggests: fiddling w, we should also fiddle rw, humm?

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 2268e634812b..2e197b8e43f1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -429,6 +429,7 @@ calc_delta_asym(unsigned long delta, struct sched_entity *se)
 
 	for_each_sched_entity(se) {
 		struct load_weight *se_lw = &se->load;
+		unsigned long rw = cfs_rq_of(se)->load.weight;
 
 #ifdef CONFIG_FAIR_SCHED_GROUP
 		struct cfs_rq *cfs_rq = se->my_q;
@@ -450,14 +451,16 @@ calc_delta_asym(unsigned long delta, struct sched_entity *se)
 			lw.inv_weight = 0;
 
 			se_lw = &lw;
+			rw += lw.weight - se->load.weight;
 		} else
 #endif
 
-		if (se->load.weight < NICE_0_LOAD)
+		if (se->load.weight < NICE_0_LOAD) {
 			se_lw = &lw;
+			rw += NICE_0_LOAD - se->load.weight;
+		}
 
-		delta = calc_delta_mine(delta,
-				cfs_rq_of(se)->load.weight, se_lw);
+		delta = calc_delta_mine(delta, rw, se_lw);
 	}
 
 	return delta;
-- 
cgit v1.2.3


From c09595f63bb1909c5dc4dca288f4fe818561b5f3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:14 +0200
Subject: sched: revert revert of: fair-group: SMP-nice for group scheduling

Try again..

Initial commit: 18d95a2832c1392a2d63227a7a6d433cb9f2037e
Revert: 6363ca57c76b7b83639ca8c83fc285fa26a7880e

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |   1 +
 kernel/sched.c        | 430 ++++++++++++++++++++++++++++++++++++++++++++++----
 kernel/sched_debug.c  |   5 +
 kernel/sched_fair.c   | 124 +++++++++------
 kernel/sched_rt.c     |   4 +
 5 files changed, 489 insertions(+), 75 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index eaf821072dbd..97a58b622ee1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -765,6 +765,7 @@ struct sched_domain {
 	struct sched_domain *child;	/* bottom domain must be null terminated */
 	struct sched_group *groups;	/* the balancing groups of the domain */
 	cpumask_t span;			/* span of all CPUs in this domain */
+	int first_cpu;			/* cache of the first cpu in this domain */
 	unsigned long min_interval;	/* Minimum balance interval ms */
 	unsigned long max_interval;	/* Maximum balance interval ms */
 	unsigned int busy_factor;	/* less balancing by factor if busy */
diff --git a/kernel/sched.c b/kernel/sched.c
index f653af684fb3..874b6da15430 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -403,6 +403,43 @@ struct cfs_rq {
 	 */
 	struct list_head leaf_cfs_rq_list;
 	struct task_group *tg;	/* group that "owns" this runqueue */
+
+#ifdef CONFIG_SMP
+	unsigned long task_weight;
+	unsigned long shares;
+	/*
+	 * We need space to build a sched_domain wide view of the full task
+	 * group tree, in order to avoid depending on dynamic memory allocation
+	 * during the load balancing we place this in the per cpu task group
+	 * hierarchy. This limits the load balancing to one instance per cpu,
+	 * but more should not be needed anyway.
+	 */
+	struct aggregate_struct {
+		/*
+		 *   load = weight(cpus) * f(tg)
+		 *
+		 * Where f(tg) is the recursive weight fraction assigned to
+		 * this group.
+		 */
+		unsigned long load;
+
+		/*
+		 * part of the group weight distributed to this span.
+		 */
+		unsigned long shares;
+
+		/*
+		 * The sum of all runqueue weights within this span.
+		 */
+		unsigned long rq_weight;
+
+		/*
+		 * Weight contributed by tasks; this is the part we can
+		 * influence by moving tasks around.
+		 */
+		unsigned long task_weight;
+	} aggregate;
+#endif
 #endif
 };
 
@@ -1484,6 +1521,326 @@ static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
 static unsigned long cpu_avg_load_per_task(int cpu);
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/*
+ * Group load balancing.
+ *
+ * We calculate a few balance domain wide aggregate numbers; load and weight.
+ * Given the pictures below, and assuming each item has equal weight:
+ *
+ *         root          1 - thread
+ *         / | \         A - group
+ *        A  1  B
+ *       /|\   / \
+ *      C 2 D 3   4
+ *      |   |
+ *      5   6
+ *
+ * load:
+ *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
+ *    which equals 1/9-th of the total load.
+ *
+ * shares:
+ *    The weight of this group on the selected cpus.
+ *
+ * rq_weight:
+ *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
+ *    B would get 2.
+ *
+ * task_weight:
+ *    Part of the rq_weight contributed by tasks; all groups except B would
+ *    get 1, B gets 2.
+ */
+
+static inline struct aggregate_struct *
+aggregate(struct task_group *tg, struct sched_domain *sd)
+{
+	return &tg->cfs_rq[sd->first_cpu]->aggregate;
+}
+
+typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
+
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ */
+static
+void aggregate_walk_tree(aggregate_func down, aggregate_func up,
+			 struct sched_domain *sd)
+{
+	struct task_group *parent, *child;
+
+	rcu_read_lock();
+	parent = &root_task_group;
+down:
+	(*down)(parent, sd);
+	list_for_each_entry_rcu(child, &parent->children, siblings) {
+		parent = child;
+		goto down;
+
+up:
+		continue;
+	}
+	(*up)(parent, sd);
+
+	child = parent;
+	parent = parent->parent;
+	if (parent)
+		goto up;
+	rcu_read_unlock();
+}
+
+/*
+ * Calculate the aggregate runqueue weight.
+ */
+static
+void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long rq_weight = 0;
+	unsigned long task_weight = 0;
+	int i;
+
+	for_each_cpu_mask(i, sd->span) {
+		rq_weight += tg->cfs_rq[i]->load.weight;
+		task_weight += tg->cfs_rq[i]->task_weight;
+	}
+
+	aggregate(tg, sd)->rq_weight = rq_weight;
+	aggregate(tg, sd)->task_weight = task_weight;
+}
+
+/*
+ * Compute the weight of this group on the given cpus.
+ */
+static
+void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long shares = 0;
+	int i;
+
+	for_each_cpu_mask(i, sd->span)
+		shares += tg->cfs_rq[i]->shares;
+
+	if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
+		shares = tg->shares;
+
+	aggregate(tg, sd)->shares = shares;
+}
+
+/*
+ * Compute the load fraction assigned to this group, relies on the aggregate
+ * weight and this group's parent's load, i.e. top-down.
+ */
+static
+void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long load;
+
+	if (!tg->parent) {
+		int i;
+
+		load = 0;
+		for_each_cpu_mask(i, sd->span)
+			load += cpu_rq(i)->load.weight;
+
+	} else {
+		load = aggregate(tg->parent, sd)->load;
+
+		/*
+		 * shares is our weight in the parent's rq so
+		 * shares/parent->rq_weight gives our fraction of the load
+		 */
+		load *= aggregate(tg, sd)->shares;
+		load /= aggregate(tg->parent, sd)->rq_weight + 1;
+	}
+
+	aggregate(tg, sd)->load = load;
+}
+
+static void __set_se_shares(struct sched_entity *se, unsigned long shares);
+
+/*
+ * Calculate and set the cpu's group shares.
+ */
+static void
+__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
+			  int tcpu)
+{
+	int boost = 0;
+	unsigned long shares;
+	unsigned long rq_weight;
+
+	if (!tg->se[tcpu])
+		return;
+
+	rq_weight = tg->cfs_rq[tcpu]->load.weight;
+
+	/*
+	 * If there are currently no tasks on the cpu pretend there is one of
+	 * average load so that when a new task gets to run here it will not
+	 * get delayed by group starvation.
+	 */
+	if (!rq_weight) {
+		boost = 1;
+		rq_weight = NICE_0_LOAD;
+	}
+
+	/*
+	 *           \Sum shares * rq_weight
+	 * shares =  -----------------------
+	 *               \Sum rq_weight
+	 *
+	 */
+	shares = aggregate(tg, sd)->shares * rq_weight;
+	shares /= aggregate(tg, sd)->rq_weight + 1;
+
+	/*
+	 * record the actual number of shares, not the boosted amount.
+	 */
+	tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
+
+	if (shares < MIN_SHARES)
+		shares = MIN_SHARES;
+	else if (shares > MAX_SHARES)
+		shares = MAX_SHARES;
+
+	__set_se_shares(tg->se[tcpu], shares);
+}
+
+/*
+ * Re-adjust the weights on the cpu the task came from and on the cpu the
+ * task went to.
+ */
+static void
+__move_group_shares(struct task_group *tg, struct sched_domain *sd,
+		    int scpu, int dcpu)
+{
+	unsigned long shares;
+
+	shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
+
+	__update_group_shares_cpu(tg, sd, scpu);
+	__update_group_shares_cpu(tg, sd, dcpu);
+
+	/*
+	 * ensure we never loose shares due to rounding errors in the
+	 * above redistribution.
+	 */
+	shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
+	if (shares)
+		tg->cfs_rq[dcpu]->shares += shares;
+}
+
+/*
+ * Because changing a group's shares changes the weight of the super-group
+ * we need to walk up the tree and change all shares until we hit the root.
+ */
+static void
+move_group_shares(struct task_group *tg, struct sched_domain *sd,
+		  int scpu, int dcpu)
+{
+	while (tg) {
+		__move_group_shares(tg, sd, scpu, dcpu);
+		tg = tg->parent;
+	}
+}
+
+static
+void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long shares = aggregate(tg, sd)->shares;
+	int i;
+
+	for_each_cpu_mask(i, sd->span) {
+		struct rq *rq = cpu_rq(i);
+		unsigned long flags;
+
+		spin_lock_irqsave(&rq->lock, flags);
+		__update_group_shares_cpu(tg, sd, i);
+		spin_unlock_irqrestore(&rq->lock, flags);
+	}
+
+	aggregate_group_shares(tg, sd);
+
+	/*
+	 * ensure we never loose shares due to rounding errors in the
+	 * above redistribution.
+	 */
+	shares -= aggregate(tg, sd)->shares;
+	if (shares) {
+		tg->cfs_rq[sd->first_cpu]->shares += shares;
+		aggregate(tg, sd)->shares += shares;
+	}
+}
+
+/*
+ * Calculate the accumulative weight and recursive load of each task group
+ * while walking down the tree.
+ */
+static
+void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
+{
+	aggregate_group_weight(tg, sd);
+	aggregate_group_shares(tg, sd);
+	aggregate_group_load(tg, sd);
+}
+
+/*
+ * Rebalance the cpu shares while walking back up the tree.
+ */
+static
+void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
+{
+	aggregate_group_set_shares(tg, sd);
+}
+
+static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
+
+static void __init init_aggregate(void)
+{
+	int i;
+
+	for_each_possible_cpu(i)
+		spin_lock_init(&per_cpu(aggregate_lock, i));
+}
+
+static int get_aggregate(struct sched_domain *sd)
+{
+	if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
+		return 0;
+
+	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
+	return 1;
+}
+
+static void put_aggregate(struct sched_domain *sd)
+{
+	spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
+}
+
+static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+{
+	cfs_rq->shares = shares;
+}
+
+#else
+
+static inline void init_aggregate(void)
+{
+}
+
+static inline int get_aggregate(struct sched_domain *sd)
+{
+	return 0;
+}
+
+static inline void put_aggregate(struct sched_domain *sd)
+{
+}
+#endif
+
 #endif
 
 #include "sched_stats.h"
@@ -1498,26 +1855,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 #define for_each_class(class) \
    for (class = sched_class_highest; class; class = class->next)
 
-static inline void inc_load(struct rq *rq, const struct task_struct *p)
-{
-	update_load_add(&rq->load, p->se.load.weight);
-}
-
-static inline void dec_load(struct rq *rq, const struct task_struct *p)
-{
-	update_load_sub(&rq->load, p->se.load.weight);
-}
-
-static void inc_nr_running(struct task_struct *p, struct rq *rq)
+static void inc_nr_running(struct rq *rq)
 {
 	rq->nr_running++;
-	inc_load(rq, p);
 }
 
-static void dec_nr_running(struct task_struct *p, struct rq *rq)
+static void dec_nr_running(struct rq *rq)
 {
 	rq->nr_running--;
-	dec_load(rq, p);
 }
 
 static void set_load_weight(struct task_struct *p)
@@ -1609,7 +1954,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 		rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, wakeup);
-	inc_nr_running(p, rq);
+	inc_nr_running(rq);
 }
 
 /*
@@ -1621,7 +1966,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 		rq->nr_uninterruptible++;
 
 	dequeue_task(rq, p, sleep);
-	dec_nr_running(p, rq);
+	dec_nr_running(rq);
 }
 
 /**
@@ -2274,7 +2619,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		 * management (if any):
 		 */
 		p->sched_class->task_new(rq, p);
-		inc_nr_running(p, rq);
+		inc_nr_running(rq);
 	}
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
@@ -3265,9 +3610,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
+	int unlock_aggregate;
 
 	cpus_setall(*cpus);
 
+	unlock_aggregate = get_aggregate(sd);
+
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3383,8 +3731,9 @@ redo:
 
 	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		return -1;
-	return ld_moved;
+		ld_moved = -1;
+
+	goto out;
 
 out_balanced:
 	schedstat_inc(sd, lb_balanced[idle]);
@@ -3399,8 +3748,13 @@ out_one_pinned:
 
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		return -1;
-	return 0;
+		ld_moved = -1;
+	else
+		ld_moved = 0;
+out:
+	if (unlock_aggregate)
+		put_aggregate(sd);
+	return ld_moved;
 }
 
 /*
@@ -4588,10 +4942,8 @@ void set_user_nice(struct task_struct *p, long nice)
 		goto out_unlock;
 	}
 	on_rq = p->se.on_rq;
-	if (on_rq) {
+	if (on_rq)
 		dequeue_task(rq, p, 0);
-		dec_load(rq, p);
-	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -4601,7 +4953,6 @@ void set_user_nice(struct task_struct *p, long nice)
 
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
-		inc_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -7016,6 +7367,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			SD_INIT(sd, ALLNODES);
 			set_domain_attribute(sd, attr);
 			sd->span = *cpu_map;
+			sd->first_cpu = first_cpu(sd->span);
 			cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
 			p = sd;
 			sd_allnodes = 1;
@@ -7026,6 +7378,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, NODE);
 		set_domain_attribute(sd, attr);
 		sched_domain_node_span(cpu_to_node(i), &sd->span);
+		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7037,6 +7390,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, CPU);
 		set_domain_attribute(sd, attr);
 		sd->span = *nodemask;
+		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7048,6 +7402,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, MC);
 		set_domain_attribute(sd, attr);
 		sd->span = cpu_coregroup_map(i);
+		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7060,6 +7415,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, SIBLING);
 		set_domain_attribute(sd, attr);
 		sd->span = per_cpu(cpu_sibling_map, i);
+		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7757,6 +8113,7 @@ void __init sched_init(void)
 	}
 
 #ifdef CONFIG_SMP
+	init_aggregate();
 	init_defrootdomain();
 #endif
 
@@ -8322,14 +8679,11 @@ void sched_move_task(struct task_struct *tsk)
 #endif /* CONFIG_GROUP_SCHED */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
+static void __set_se_shares(struct sched_entity *se, unsigned long shares)
 {
 	struct cfs_rq *cfs_rq = se->cfs_rq;
-	struct rq *rq = cfs_rq->rq;
 	int on_rq;
 
-	spin_lock_irq(&rq->lock);
-
 	on_rq = se->on_rq;
 	if (on_rq)
 		dequeue_entity(cfs_rq, se, 0);
@@ -8339,8 +8693,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
 
 	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
+}
 
-	spin_unlock_irq(&rq->lock);
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
+{
+	struct cfs_rq *cfs_rq = se->cfs_rq;
+	struct rq *rq = cfs_rq->rq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rq->lock, flags);
+	__set_se_shares(se, shares);
+	spin_unlock_irqrestore(&rq->lock, flags);
 }
 
 static DEFINE_MUTEX(shares_mutex);
@@ -8379,8 +8742,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * w/o tripping rebalance_share or load_balance_fair.
 	 */
 	tg->shares = shares;
-	for_each_possible_cpu(i)
+	for_each_possible_cpu(i) {
+		/*
+		 * force a rebalance
+		 */
+		cfs_rq_set_shares(tg->cfs_rq[i], 0);
 		set_se_shares(tg->se[i], shares);
+	}
 
 	/*
 	 * Enable load balance activity on this group, by inserting it back on
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 8e077b9c91cb..04394ccac88d 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -167,6 +167,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #endif
 	SEQ_printf(m, "  .%-30s: %ld\n", "nr_spread_over",
 			cfs_rq->nr_spread_over);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_SMP
+	SEQ_printf(m, "  .%-30s: %lu\n", "shares", cfs_rq->shares);
+#endif
+#endif
 }
 
 void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 2e197b8e43f1..183388c4dead 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -567,10 +567,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+	cfs_rq->task_weight += weight;
+}
+#else
+static inline void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+}
+#endif
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
+	if (!parent_entity(se))
+		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
+	if (entity_is_task(se))
+		add_cfs_task_weight(cfs_rq, se->load.weight);
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
 	list_add(&se->group_node, &cfs_rq->tasks);
@@ -580,6 +597,10 @@ static void
 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_sub(&cfs_rq->load, se->load.weight);
+	if (!parent_entity(se))
+		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
+	if (entity_is_task(se))
+		add_cfs_task_weight(cfs_rq, -se->load.weight);
 	cfs_rq->nr_running--;
 	se->on_rq = 0;
 	list_del_init(&se->group_node);
@@ -1372,75 +1393,90 @@ static struct task_struct *load_balance_next_fair(void *arg)
 	return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+static unsigned long
+__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		unsigned long max_load_move, struct sched_domain *sd,
+		enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
+		struct cfs_rq *cfs_rq)
 {
-	struct sched_entity *curr;
-	struct task_struct *p;
-
-	if (!cfs_rq->nr_running || !first_fair(cfs_rq))
-		return MAX_PRIO;
-
-	curr = cfs_rq->curr;
-	if (!curr)
-		curr = __pick_next_entity(cfs_rq);
+	struct rq_iterator cfs_rq_iterator;
 
-	p = task_of(curr);
+	cfs_rq_iterator.start = load_balance_start_fair;
+	cfs_rq_iterator.next = load_balance_next_fair;
+	cfs_rq_iterator.arg = cfs_rq;
 
-	return p->prio;
+	return balance_tasks(this_rq, this_cpu, busiest,
+			max_load_move, sd, idle, all_pinned,
+			this_best_prio, &cfs_rq_iterator);
 }
-#endif
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
 		  int *all_pinned, int *this_best_prio)
 {
-	struct cfs_rq *busy_cfs_rq;
 	long rem_load_move = max_load_move;
-	struct rq_iterator cfs_rq_iterator;
-
-	cfs_rq_iterator.start = load_balance_start_fair;
-	cfs_rq_iterator.next = load_balance_next_fair;
+	int busiest_cpu = cpu_of(busiest);
+	struct task_group *tg;
 
-	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
-#ifdef CONFIG_FAIR_GROUP_SCHED
-		struct cfs_rq *this_cfs_rq;
+	rcu_read_lock();
+	list_for_each_entry(tg, &task_groups, list) {
 		long imbalance;
-		unsigned long maxload;
+		unsigned long this_weight, busiest_weight;
+		long rem_load, max_load, moved_load;
+
+		/*
+		 * empty group
+		 */
+		if (!aggregate(tg, sd)->task_weight)
+			continue;
+
+		rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
+		rem_load /= aggregate(tg, sd)->load + 1;
+
+		this_weight = tg->cfs_rq[this_cpu]->task_weight;
+		busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
 
-		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+		imbalance = (busiest_weight - this_weight) / 2;
 
-		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
-		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
-		if (imbalance <= 0)
+		if (imbalance < 0)
+			imbalance = busiest_weight;
+
+		max_load = max(rem_load, imbalance);
+		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
+				max_load, sd, idle, all_pinned, this_best_prio,
+				tg->cfs_rq[busiest_cpu]);
+
+		if (!moved_load)
 			continue;
 
-		/* Don't pull more than imbalance/2 */
-		imbalance /= 2;
-		maxload = min(rem_load_move, imbalance);
+		move_group_shares(tg, sd, busiest_cpu, this_cpu);
 
-		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
-#else
-# define maxload rem_load_move
-#endif
-		/*
-		 * pass busy_cfs_rq argument into
-		 * load_balance_[start|next]_fair iterators
-		 */
-		cfs_rq_iterator.arg = busy_cfs_rq;
-		rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
-					       maxload, sd, idle, all_pinned,
-					       this_best_prio,
-					       &cfs_rq_iterator);
+		moved_load *= aggregate(tg, sd)->load;
+		moved_load /= aggregate(tg, sd)->rq_weight + 1;
 
-		if (rem_load_move <= 0)
+		rem_load_move -= moved_load;
+		if (rem_load_move < 0)
 			break;
 	}
+	rcu_read_unlock();
 
 	return max_load_move - rem_load_move;
 }
+#else
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		  unsigned long max_load_move,
+		  struct sched_domain *sd, enum cpu_idle_type idle,
+		  int *all_pinned, int *this_best_prio)
+{
+	return __load_balance_fair(this_rq, this_cpu, busiest,
+			max_load_move, sd, idle, all_pinned,
+			this_best_prio, &busiest->cfs);
+}
+#endif
 
 static int
 move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 6b4a6b5a4167..765932d0399d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -670,6 +670,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 		rt_se->timeout = 0;
 
 	enqueue_rt_entity(rt_se);
+
+	inc_cpu_load(rq, p->se.load.weight);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -678,6 +680,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 
 	update_curr_rt(rq);
 	dequeue_rt_entity(rt_se);
+
+	dec_cpu_load(rq, p->se.load.weight);
 }
 
 /*
-- 
cgit v1.2.3


From 103638d95ba5b0c53c8d9c0cb581156ccc8513ee Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:16 +0200
Subject: sched: fix wakeup granularity and buddy granularity

Uncouple buddy selection from wakeup granularity.

The initial idea was that buddies could run ahead as far as a normal task
can - do this by measuring a pair 'slice' just as we do for a normal task.

This means we can drop the wakeup_granularity back to 5ms.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      |  1 +
 kernel/sched_fair.c | 15 +++++++--------
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index eb3454c410fa..7d282c52bd42 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -375,6 +375,7 @@ struct cfs_rq {
 
 	u64 exec_clock;
 	u64 min_vruntime;
+	u64 pair_start;
 
 	struct rb_root tasks_timeline;
 	struct rb_node *rb_leftmost;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 183388c4dead..509092af0330 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
 
 /*
  * SCHED_OTHER wake-up granularity.
- * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
@@ -813,17 +813,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
-
 static struct sched_entity *
 pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	if (!cfs_rq->next)
-		return se;
+	struct rq *rq = rq_of(cfs_rq);
+	u64 pair_slice = rq->clock - cfs_rq->pair_start;
 
-	if (wakeup_preempt_entity(cfs_rq->next, se) != 0)
+	if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
+		cfs_rq->pair_start = rq->clock;
 		return se;
+	}
 
 	return cfs_rq->next;
 }
-- 
cgit v1.2.3


From b6a86c746f5b708012809958462234d19e9c8177 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:18 +0200
Subject: sched: fix sched_domain aggregation

Keeping the aggregate on the first cpu of the sched domain has two problems:
 - it could collide between different sched domains on different cpus
 - it could slow things down because of the remote accesses

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |   1 -
 kernel/sched.c        | 113 ++++++++++++++++++++++++--------------------------
 kernel/sched_fair.c   |  12 +++---
 3 files changed, 60 insertions(+), 66 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 97a58b622ee1..eaf821072dbd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -765,7 +765,6 @@ struct sched_domain {
 	struct sched_domain *child;	/* bottom domain must be null terminated */
 	struct sched_group *groups;	/* the balancing groups of the domain */
 	cpumask_t span;			/* span of all CPUs in this domain */
-	int first_cpu;			/* cache of the first cpu in this domain */
 	unsigned long min_interval;	/* Minimum balance interval ms */
 	unsigned long max_interval;	/* Maximum balance interval ms */
 	unsigned int busy_factor;	/* less balancing by factor if busy */
diff --git a/kernel/sched.c b/kernel/sched.c
index 7d282c52bd42..160d3c209b8f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1480,12 +1480,12 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
  */
 
 static inline struct aggregate_struct *
-aggregate(struct task_group *tg, struct sched_domain *sd)
+aggregate(struct task_group *tg, int cpu)
 {
-	return &tg->cfs_rq[sd->first_cpu]->aggregate;
+	return &tg->cfs_rq[cpu]->aggregate;
 }
 
-typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
+typedef void (*aggregate_func)(struct task_group *, int, struct sched_domain *);
 
 /*
  * Iterate the full tree, calling @down when first entering a node and @up when
@@ -1493,14 +1493,14 @@ typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
  */
 static
 void aggregate_walk_tree(aggregate_func down, aggregate_func up,
-			 struct sched_domain *sd)
+			 int cpu, struct sched_domain *sd)
 {
 	struct task_group *parent, *child;
 
 	rcu_read_lock();
 	parent = &root_task_group;
 down:
-	(*down)(parent, sd);
+	(*down)(parent, cpu, sd);
 	list_for_each_entry_rcu(child, &parent->children, siblings) {
 		parent = child;
 		goto down;
@@ -1508,7 +1508,7 @@ down:
 up:
 		continue;
 	}
-	(*up)(parent, sd);
+	(*up)(parent, cpu, sd);
 
 	child = parent;
 	parent = parent->parent;
@@ -1520,8 +1520,8 @@ up:
 /*
  * Calculate the aggregate runqueue weight.
  */
-static
-void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
 	unsigned long rq_weight = 0;
 	unsigned long task_weight = 0;
@@ -1532,15 +1532,15 @@ void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
 		task_weight += tg->cfs_rq[i]->task_weight;
 	}
 
-	aggregate(tg, sd)->rq_weight = rq_weight;
-	aggregate(tg, sd)->task_weight = task_weight;
+	aggregate(tg, cpu)->rq_weight = rq_weight;
+	aggregate(tg, cpu)->task_weight = task_weight;
 }
 
 /*
  * Compute the weight of this group on the given cpus.
  */
-static
-void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
 	unsigned long shares = 0;
 	int i;
@@ -1548,18 +1548,18 @@ void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
 	for_each_cpu_mask(i, sd->span)
 		shares += tg->cfs_rq[i]->shares;
 
-	if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
+	if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares)
 		shares = tg->shares;
 
-	aggregate(tg, sd)->shares = shares;
+	aggregate(tg, cpu)->shares = shares;
 }
 
 /*
  * Compute the load fraction assigned to this group, relies on the aggregate
  * weight and this group's parent's load, i.e. top-down.
  */
-static
-void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_group_load(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
 	unsigned long load;
 
@@ -1571,17 +1571,17 @@ void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
 			load += cpu_rq(i)->load.weight;
 
 	} else {
-		load = aggregate(tg->parent, sd)->load;
+		load = aggregate(tg->parent, cpu)->load;
 
 		/*
 		 * shares is our weight in the parent's rq so
 		 * shares/parent->rq_weight gives our fraction of the load
 		 */
-		load *= aggregate(tg, sd)->shares;
-		load /= aggregate(tg->parent, sd)->rq_weight + 1;
+		load *= aggregate(tg, cpu)->shares;
+		load /= aggregate(tg->parent, cpu)->rq_weight + 1;
 	}
 
-	aggregate(tg, sd)->load = load;
+	aggregate(tg, cpu)->load = load;
 }
 
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
@@ -1590,8 +1590,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  * Calculate and set the cpu's group shares.
  */
 static void
-__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
-			  int tcpu)
+__update_group_shares_cpu(struct task_group *tg, int cpu,
+			  struct sched_domain *sd, int tcpu)
 {
 	int boost = 0;
 	unsigned long shares;
@@ -1618,8 +1618,8 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
 	 *               \Sum rq_weight
 	 *
 	 */
-	shares = aggregate(tg, sd)->shares * rq_weight;
-	shares /= aggregate(tg, sd)->rq_weight + 1;
+	shares = aggregate(tg, cpu)->shares * rq_weight;
+	shares /= aggregate(tg, cpu)->rq_weight + 1;
 
 	/*
 	 * record the actual number of shares, not the boosted amount.
@@ -1639,15 +1639,15 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
  * task went to.
  */
 static void
-__move_group_shares(struct task_group *tg, struct sched_domain *sd,
+__move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
 		    int scpu, int dcpu)
 {
 	unsigned long shares;
 
 	shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
 
-	__update_group_shares_cpu(tg, sd, scpu);
-	__update_group_shares_cpu(tg, sd, dcpu);
+	__update_group_shares_cpu(tg, cpu, sd, scpu);
+	__update_group_shares_cpu(tg, cpu, sd, dcpu);
 
 	/*
 	 * ensure we never loose shares due to rounding errors in the
@@ -1663,19 +1663,19 @@ __move_group_shares(struct task_group *tg, struct sched_domain *sd,
  * we need to walk up the tree and change all shares until we hit the root.
  */
 static void
-move_group_shares(struct task_group *tg, struct sched_domain *sd,
+move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
 		  int scpu, int dcpu)
 {
 	while (tg) {
-		__move_group_shares(tg, sd, scpu, dcpu);
+		__move_group_shares(tg, cpu, sd, scpu, dcpu);
 		tg = tg->parent;
 	}
 }
 
-static
-void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	unsigned long shares = aggregate(tg, sd)->shares;
+	unsigned long shares = aggregate(tg, cpu)->shares;
 	int i;
 
 	for_each_cpu_mask(i, sd->span) {
@@ -1683,20 +1683,20 @@ void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
 		unsigned long flags;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		__update_group_shares_cpu(tg, sd, i);
+		__update_group_shares_cpu(tg, cpu, sd, i);
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
 
-	aggregate_group_shares(tg, sd);
+	aggregate_group_shares(tg, cpu, sd);
 
 	/*
 	 * ensure we never loose shares due to rounding errors in the
 	 * above redistribution.
 	 */
-	shares -= aggregate(tg, sd)->shares;
+	shares -= aggregate(tg, cpu)->shares;
 	if (shares) {
-		tg->cfs_rq[sd->first_cpu]->shares += shares;
-		aggregate(tg, sd)->shares += shares;
+		tg->cfs_rq[cpu]->shares += shares;
+		aggregate(tg, cpu)->shares += shares;
 	}
 }
 
@@ -1704,21 +1704,21 @@ void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
  * Calculate the accumulative weight and recursive load of each task group
  * while walking down the tree.
  */
-static
-void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_get_down(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	aggregate_group_weight(tg, sd);
-	aggregate_group_shares(tg, sd);
-	aggregate_group_load(tg, sd);
+	aggregate_group_weight(tg, cpu, sd);
+	aggregate_group_shares(tg, cpu, sd);
+	aggregate_group_load(tg, cpu, sd);
 }
 
 /*
  * Rebalance the cpu shares while walking back up the tree.
  */
-static
-void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	aggregate_group_set_shares(tg, sd);
+	aggregate_group_set_shares(tg, cpu, sd);
 }
 
 static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
@@ -1731,18 +1731,18 @@ static void __init init_aggregate(void)
 		spin_lock_init(&per_cpu(aggregate_lock, i));
 }
 
-static int get_aggregate(struct sched_domain *sd)
+static int get_aggregate(int cpu, struct sched_domain *sd)
 {
-	if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
+	if (!spin_trylock(&per_cpu(aggregate_lock, cpu)))
 		return 0;
 
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
+	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, cpu, sd);
 	return 1;
 }
 
-static void put_aggregate(struct sched_domain *sd)
+static void put_aggregate(int cpu, struct sched_domain *sd)
 {
-	spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
+	spin_unlock(&per_cpu(aggregate_lock, cpu));
 }
 
 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
@@ -1756,12 +1756,12 @@ static inline void init_aggregate(void)
 {
 }
 
-static inline int get_aggregate(struct sched_domain *sd)
+static inline int get_aggregate(int cpu, struct sched_domain *sd)
 {
 	return 0;
 }
 
-static inline void put_aggregate(struct sched_domain *sd)
+static inline void put_aggregate(int cpu, struct sched_domain *sd)
 {
 }
 #endif
@@ -3539,7 +3539,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
 	cpus_setall(*cpus);
 
-	unlock_aggregate = get_aggregate(sd);
+	unlock_aggregate = get_aggregate(this_cpu, sd);
 
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
@@ -3678,7 +3678,7 @@ out_one_pinned:
 		ld_moved = 0;
 out:
 	if (unlock_aggregate)
-		put_aggregate(sd);
+		put_aggregate(this_cpu, sd);
 	return ld_moved;
 }
 
@@ -7292,7 +7292,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			SD_INIT(sd, ALLNODES);
 			set_domain_attribute(sd, attr);
 			sd->span = *cpu_map;
-			sd->first_cpu = first_cpu(sd->span);
 			cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
 			p = sd;
 			sd_allnodes = 1;
@@ -7303,7 +7302,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, NODE);
 		set_domain_attribute(sd, attr);
 		sched_domain_node_span(cpu_to_node(i), &sd->span);
-		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7315,7 +7313,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, CPU);
 		set_domain_attribute(sd, attr);
 		sd->span = *nodemask;
-		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7327,7 +7324,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, MC);
 		set_domain_attribute(sd, attr);
 		sd->span = cpu_coregroup_map(i);
-		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7340,7 +7336,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, SIBLING);
 		set_domain_attribute(sd, attr);
 		sd->span = per_cpu(cpu_sibling_map, i);
-		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 509092af0330..40cf24ab4de8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1429,11 +1429,11 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		/*
 		 * empty group
 		 */
-		if (!aggregate(tg, sd)->task_weight)
+		if (!aggregate(tg, this_cpu)->task_weight)
 			continue;
 
-		rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
-		rem_load /= aggregate(tg, sd)->load + 1;
+		rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight;
+		rem_load /= aggregate(tg, this_cpu)->load + 1;
 
 		this_weight = tg->cfs_rq[this_cpu]->task_weight;
 		busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
@@ -1451,10 +1451,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		if (!moved_load)
 			continue;
 
-		move_group_shares(tg, sd, busiest_cpu, this_cpu);
+		move_group_shares(tg, this_cpu, sd, busiest_cpu, this_cpu);
 
-		moved_load *= aggregate(tg, sd)->load;
-		moved_load /= aggregate(tg, sd)->rq_weight + 1;
+		moved_load *= aggregate(tg, this_cpu)->load;
+		moved_load /= aggregate(tg, this_cpu)->rq_weight + 1;
 
 		rem_load_move -= moved_load;
 		if (rem_load_move < 0)
-- 
cgit v1.2.3


From 53fecd8ae1900fb571086f54f664051004665b55 Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Date: Fri, 27 Jun 2008 13:41:20 +0200
Subject: sched: kill task_group balancing

The idea was to balance groups until we've reached the global goal, however
Vatsa rightly pointed out that we might never reach that goal this way -
hence take out this logic.

[ the initial rationale for this 'feature' was to promote max concurrency
  within a group - it does not however affect fairness ]

Reported-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 40cf24ab4de8..b10c0d61a2a9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1422,9 +1422,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
 	rcu_read_lock();
 	list_for_each_entry(tg, &task_groups, list) {
-		long imbalance;
-		unsigned long this_weight, busiest_weight;
-		long rem_load, max_load, moved_load;
+		long rem_load, moved_load;
 
 		/*
 		 * empty group
@@ -1435,17 +1433,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight;
 		rem_load /= aggregate(tg, this_cpu)->load + 1;
 
-		this_weight = tg->cfs_rq[this_cpu]->task_weight;
-		busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
-
-		imbalance = (busiest_weight - this_weight) / 2;
-
-		if (imbalance < 0)
-			imbalance = busiest_weight;
-
-		max_load = max(rem_load, imbalance);
 		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
-				max_load, sd, idle, all_pinned, this_best_prio,
+				rem_load, sd, idle, all_pinned, this_best_prio,
 				tg->cfs_rq[busiest_cpu]);
 
 		if (!moved_load)
-- 
cgit v1.2.3


From a25b5aca8740ea99d5e18dfc71235a52b685dcf7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:22 +0200
Subject: sched: no need to aggregate task_weight

We only need to know the task_weight of the busiest rq - nothing to do
if there are no tasks there.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 16 +---------------
 kernel/sched_fair.c |  2 +-
 2 files changed, 2 insertions(+), 16 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 28229c5d4983..716cfc8e099e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -433,12 +433,6 @@ struct cfs_rq {
 		 * The sum of all runqueue weights within this span.
 		 */
 		unsigned long rq_weight;
-
-		/*
-		 * Weight contributed by tasks; this is the part we can
-		 * influence by moving tasks around.
-		 */
-		unsigned long task_weight;
 	} aggregate;
 #endif
 #endif
@@ -1473,10 +1467,6 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
  * rq_weight:
  *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
  *    B would get 2.
- *
- * task_weight:
- *    Part of the rq_weight contributed by tasks; all groups except B would
- *    get 1, B gets 2.
  */
 
 static inline struct aggregate_struct *
@@ -1524,16 +1514,12 @@ static void
 aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
 	unsigned long rq_weight = 0;
-	unsigned long task_weight = 0;
 	int i;
 
-	for_each_cpu_mask(i, sd->span) {
+	for_each_cpu_mask(i, sd->span)
 		rq_weight += tg->cfs_rq[i]->load.weight;
-		task_weight += tg->cfs_rq[i]->task_weight;
-	}
 
 	aggregate(tg, cpu)->rq_weight = rq_weight;
-	aggregate(tg, cpu)->task_weight = task_weight;
 }
 
 /*
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index b10c0d61a2a9..03b9fbd9d648 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1427,7 +1427,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		/*
 		 * empty group
 		 */
-		if (!aggregate(tg, this_cpu)->task_weight)
+		if (!tg->cfs_rq[busiest_cpu]->task_weight)
 			continue;
 
 		rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight;
-- 
cgit v1.2.3


From c8cba857b4997d5b00451d01474638f6a153f713 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:23 +0200
Subject: sched: simplify the group load balancer

While thinking about the previous patch - I realized that using per domain
aggregate load values in load_balance_fair() is wrong. We should use the
load value for that CPU.

By not needing per domain hierarchical load values we don't need to store
per domain aggregate shares, which greatly simplifies all the math.

It basically falls apart in two separate computations:
 - per domain update of the shares
 - per CPU update of the hierarchical load

Also get rid of the move_group_shares() stuff - just re-compute the shares
again after a successful load balance.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 286 ++++++++++++----------------------------------------
 kernel/sched_fair.c |  15 +--
 2 files changed, 72 insertions(+), 229 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 716cfc8e099e..f864b751fd19 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -406,34 +406,23 @@ struct cfs_rq {
 	struct task_group *tg;	/* group that "owns" this runqueue */
 
 #ifdef CONFIG_SMP
-	unsigned long task_weight;
-	unsigned long shares;
 	/*
-	 * We need space to build a sched_domain wide view of the full task
-	 * group tree, in order to avoid depending on dynamic memory allocation
-	 * during the load balancing we place this in the per cpu task group
-	 * hierarchy. This limits the load balancing to one instance per cpu,
-	 * but more should not be needed anyway.
+	 * the part of load.weight contributed by tasks
 	 */
-	struct aggregate_struct {
-		/*
-		 *   load = weight(cpus) * f(tg)
-		 *
-		 * Where f(tg) is the recursive weight fraction assigned to
-		 * this group.
-		 */
-		unsigned long load;
+	unsigned long task_weight;
 
-		/*
-		 * part of the group weight distributed to this span.
-		 */
-		unsigned long shares;
+	/*
+	 *   h_load = weight * f(tg)
+	 *
+	 * Where f(tg) is the recursive weight fraction assigned to
+	 * this group.
+	 */
+	unsigned long h_load;
 
-		/*
-		 * The sum of all runqueue weights within this span.
-		 */
-		unsigned long rq_weight;
-	} aggregate;
+	/*
+	 * this cpu's part of tg->shares
+	 */
+	unsigned long shares;
 #endif
 #endif
 };
@@ -1443,47 +1432,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-/*
- * Group load balancing.
- *
- * We calculate a few balance domain wide aggregate numbers; load and weight.
- * Given the pictures below, and assuming each item has equal weight:
- *
- *         root          1 - thread
- *         / | \         A - group
- *        A  1  B
- *       /|\   / \
- *      C 2 D 3   4
- *      |   |
- *      5   6
- *
- * load:
- *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
- *    which equals 1/9-th of the total load.
- *
- * shares:
- *    The weight of this group on the selected cpus.
- *
- * rq_weight:
- *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
- *    B would get 2.
- */
-
-static inline struct aggregate_struct *
-aggregate(struct task_group *tg, int cpu)
-{
-	return &tg->cfs_rq[cpu]->aggregate;
-}
-
-typedef void (*aggregate_func)(struct task_group *, int, struct sched_domain *);
+typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
 
 /*
  * Iterate the full tree, calling @down when first entering a node and @up when
  * leaving it for the final time.
  */
-static
-void aggregate_walk_tree(aggregate_func down, aggregate_func up,
-			 int cpu, struct sched_domain *sd)
+static void
+walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
 {
 	struct task_group *parent, *child;
 
@@ -1507,72 +1463,6 @@ up:
 	rcu_read_unlock();
 }
 
-/*
- * Calculate the aggregate runqueue weight.
- */
-static void
-aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long rq_weight = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span)
-		rq_weight += tg->cfs_rq[i]->load.weight;
-
-	aggregate(tg, cpu)->rq_weight = rq_weight;
-}
-
-/*
- * Compute the weight of this group on the given cpus.
- */
-static void
-aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long shares = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span)
-		shares += tg->cfs_rq[i]->shares;
-
-	if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares)
-		shares = tg->shares;
-
-	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
-		shares = tg->shares;
-
-	aggregate(tg, cpu)->shares = shares;
-}
-
-/*
- * Compute the load fraction assigned to this group, relies on the aggregate
- * weight and this group's parent's load, i.e. top-down.
- */
-static void
-aggregate_group_load(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long load;
-
-	if (!tg->parent) {
-		int i;
-
-		load = 0;
-		for_each_cpu_mask(i, sd->span)
-			load += cpu_rq(i)->load.weight;
-
-	} else {
-		load = aggregate(tg->parent, cpu)->load;
-
-		/*
-		 * shares is our weight in the parent's rq so
-		 * shares/parent->rq_weight gives our fraction of the load
-		 */
-		load *= aggregate(tg, cpu)->shares;
-		load /= aggregate(tg->parent, cpu)->rq_weight + 1;
-	}
-
-	aggregate(tg, cpu)->load = load;
-}
-
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
 /*
@@ -1580,16 +1470,16 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  */
 static void
 __update_group_shares_cpu(struct task_group *tg, int cpu,
-			  struct sched_domain *sd, int tcpu)
+			  unsigned long sd_shares, unsigned long sd_rq_weight)
 {
 	int boost = 0;
 	unsigned long shares;
 	unsigned long rq_weight;
 
-	if (!tg->se[tcpu])
+	if (!tg->se[cpu])
 		return;
 
-	rq_weight = tg->cfs_rq[tcpu]->load.weight;
+	rq_weight = tg->cfs_rq[cpu]->load.weight;
 
 	/*
 	 * If there are currently no tasks on the cpu pretend there is one of
@@ -1601,124 +1491,97 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
 		rq_weight = NICE_0_LOAD;
 	}
 
+	if (unlikely(rq_weight > sd_rq_weight))
+		rq_weight = sd_rq_weight;
+
 	/*
 	 *           \Sum shares * rq_weight
 	 * shares =  -----------------------
 	 *               \Sum rq_weight
 	 *
 	 */
-	shares = aggregate(tg, cpu)->shares * rq_weight;
-	shares /= aggregate(tg, cpu)->rq_weight + 1;
+	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
 
 	/*
 	 * record the actual number of shares, not the boosted amount.
 	 */
-	tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
+	tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
 
 	if (shares < MIN_SHARES)
 		shares = MIN_SHARES;
 	else if (shares > MAX_SHARES)
 		shares = MAX_SHARES;
 
-	__set_se_shares(tg->se[tcpu], shares);
+	__set_se_shares(tg->se[cpu], shares);
 }
 
 /*
- * Re-adjust the weights on the cpu the task came from and on the cpu the
- * task went to.
+ * Re-compute the task group their per cpu shares over the given domain.
+ * This needs to be done in a bottom-up fashion because the rq weight of a
+ * parent group depends on the shares of its child groups.
  */
 static void
-__move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
-		    int scpu, int dcpu)
+tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	__update_group_shares_cpu(tg, cpu, sd, scpu);
-	__update_group_shares_cpu(tg, cpu, sd, dcpu);
-}
+	unsigned long rq_weight = 0;
+	unsigned long shares = 0;
+	int i;
 
-/*
- * Because changing a group's shares changes the weight of the super-group
- * we need to walk up the tree and change all shares until we hit the root.
- */
-static void
-move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
-		  int scpu, int dcpu)
-{
-	while (tg) {
-		__move_group_shares(tg, cpu, sd, scpu, dcpu);
-		tg = tg->parent;
+	for_each_cpu_mask(i, sd->span) {
+		rq_weight += tg->cfs_rq[i]->load.weight;
+		shares += tg->cfs_rq[i]->shares;
 	}
-}
 
-static void
-aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	int i;
+	if ((!shares && rq_weight) || shares > tg->shares)
+		shares = tg->shares;
+
+	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
+		shares = tg->shares;
 
 	for_each_cpu_mask(i, sd->span) {
 		struct rq *rq = cpu_rq(i);
 		unsigned long flags;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		__update_group_shares_cpu(tg, cpu, sd, i);
+		__update_group_shares_cpu(tg, i, shares, rq_weight);
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
-
-	aggregate_group_shares(tg, cpu, sd);
 }
 
 /*
- * Calculate the accumulative weight and recursive load of each task group
- * while walking down the tree.
+ * Compute the cpu's hierarchical load factor for each task group.
+ * This needs to be done in a top-down fashion because the load of a child
+ * group is a fraction of its parents load.
  */
 static void
-aggregate_get_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	aggregate_group_weight(tg, cpu, sd);
-	aggregate_group_shares(tg, cpu, sd);
-	aggregate_group_load(tg, cpu, sd);
-}
-
-/*
- * Rebalance the cpu shares while walking back up the tree.
- */
-static void
-aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	aggregate_group_set_shares(tg, cpu, sd);
-}
-
-static void
-aggregate_get_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-}
-
-static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
+	unsigned long load;
 
-static void __init init_aggregate(void)
-{
-	int i;
+	if (!tg->parent) {
+		load = cpu_rq(cpu)->load.weight;
+	} else {
+		load = tg->parent->cfs_rq[cpu]->h_load;
+		load *= tg->cfs_rq[cpu]->shares;
+		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
+	}
 
-	for_each_possible_cpu(i)
-		spin_lock_init(&per_cpu(aggregate_lock, i));
+	tg->cfs_rq[cpu]->h_load = load;
 }
 
-static int get_aggregate(int cpu, struct sched_domain *sd)
+static void
+tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	if (!spin_trylock(&per_cpu(aggregate_lock, cpu)))
-		return 0;
-
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, cpu, sd);
-	return 1;
 }
 
-static void update_aggregate(int cpu, struct sched_domain *sd)
+static void update_shares(struct sched_domain *sd)
 {
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_nop, cpu, sd);
+	walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
 }
 
-static void put_aggregate(int cpu, struct sched_domain *sd)
+static void update_h_load(int cpu)
 {
-	spin_unlock(&per_cpu(aggregate_lock, cpu));
+	walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
 }
 
 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
@@ -1728,22 +1591,10 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 
 #else
 
-static inline void init_aggregate(void)
-{
-}
-
-static inline int get_aggregate(int cpu, struct sched_domain *sd)
-{
-	return 0;
-}
-
-static inline void update_aggregate(int cpu, struct sched_domain *sd)
+static inline void update_shares(struct sched_domain *sd)
 {
 }
 
-static inline void put_aggregate(int cpu, struct sched_domain *sd)
-{
-}
 #endif
 
 #endif
@@ -2172,12 +2023,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 	int load_idx = sd->forkexec_idx;
 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
-	/*
-	 * now that we have both rqs locked the rq weight won't change
-	 * anymore - so update the stats.
-	 */
-	update_aggregate(this_cpu, sd);
-
 	do {
 		unsigned long load, avg_load;
 		int local_group;
@@ -3521,12 +3366,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
-	int unlock_aggregate;
 
 	cpus_setall(*cpus);
 
-	unlock_aggregate = get_aggregate(this_cpu, sd);
-
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3540,6 +3382,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
+	update_shares(sd);
 	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
 				   cpus, balance);
 
@@ -3663,8 +3506,8 @@ out_one_pinned:
 	else
 		ld_moved = 0;
 out:
-	if (unlock_aggregate)
-		put_aggregate(this_cpu, sd);
+	if (ld_moved)
+		update_shares(sd);
 	return ld_moved;
 }
 
@@ -8019,7 +7862,6 @@ void __init sched_init(void)
 	}
 
 #ifdef CONFIG_SMP
-	init_aggregate();
 	init_defrootdomain();
 #endif
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 03b9fbd9d648..7b8d664d6f22 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1421,17 +1421,20 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	struct task_group *tg;
 
 	rcu_read_lock();
+	update_h_load(busiest_cpu);
+
 	list_for_each_entry(tg, &task_groups, list) {
+		struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
 		long rem_load, moved_load;
 
 		/*
 		 * empty group
 		 */
-		if (!tg->cfs_rq[busiest_cpu]->task_weight)
+		if (!busiest_cfs_rq->task_weight)
 			continue;
 
-		rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight;
-		rem_load /= aggregate(tg, this_cpu)->load + 1;
+		rem_load = rem_load_move * busiest_cfs_rq->load.weight;
+		rem_load /= busiest_cfs_rq->h_load + 1;
 
 		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
 				rem_load, sd, idle, all_pinned, this_best_prio,
@@ -1440,10 +1443,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		if (!moved_load)
 			continue;
 
-		move_group_shares(tg, this_cpu, sd, busiest_cpu, this_cpu);
-
-		moved_load *= aggregate(tg, this_cpu)->load;
-		moved_load /= aggregate(tg, this_cpu)->rq_weight + 1;
+		moved_load *= busiest_cfs_rq->h_load;
+		moved_load /= busiest_cfs_rq->load.weight + 1;
 
 		rem_load_move -= moved_load;
 		if (rem_load_move < 0)
-- 
cgit v1.2.3


From bb3469ac9b50f14ad6eba129ca0ad4fd033097a0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:27 +0200
Subject: sched: hierarchical load vs affine wakeups

With hierarchical grouping we can't just compare task weight to rq weight - we
need to scale the weight appropriately.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 7b8d664d6f22..865cb53a7ccf 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1073,6 +1073,25 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 
 static const struct sched_class fair_sched_class;
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static unsigned long task_h_load(struct task_struct *p)
+{
+	unsigned long h_load = p->se.load.weight;
+	struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+
+	update_h_load(task_cpu(p));
+
+	h_load = calc_delta_mine(h_load, cfs_rq->h_load, &cfs_rq->load);
+
+	return h_load;
+}
+#else
+static unsigned long task_h_load(struct task_struct *p)
+{
+	return p->se.load.weight;
+}
+#endif
+
 static int
 wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	    struct task_struct *p, int prev_cpu, int this_cpu, int sync,
@@ -1093,9 +1112,9 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	 * of the current CPU:
 	 */
 	if (sync)
-		tl -= current->se.load.weight;
+		tl -= task_h_load(current);
 
-	balanced = 100*(tl + p->se.load.weight) <= imbalance*load;
+	balanced = 100*(tl + task_h_load(p)) <= imbalance*load;
 
 	/*
 	 * If the currently running task will sleep within
-- 
cgit v1.2.3


From 42a3ac7d5cee89849448b41b86faeb86f98e92f6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:29 +0200
Subject: sched: fix load scaling in group balancing

doing the load balance will change cfs_rq->load.weight (that's the whole point)
but since that's part of the scale factor, we'll scale back with a different
amount.

Weight getting smaller would result in an inflated moved_load which causes
it to stop balancing too soon.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 865cb53a7ccf..734e4c556fcb 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1444,6 +1444,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
 	list_for_each_entry(tg, &task_groups, list) {
 		struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
+		unsigned long busiest_h_load = busiest_cfs_rq->h_load;
+		unsigned long busiest_weight = busiest_cfs_rq->load.weight;
 		long rem_load, moved_load;
 
 		/*
@@ -1452,8 +1454,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		if (!busiest_cfs_rq->task_weight)
 			continue;
 
-		rem_load = rem_load_move * busiest_cfs_rq->load.weight;
-		rem_load /= busiest_cfs_rq->h_load + 1;
+		rem_load = rem_load_move * busiest_weight;
+		rem_load /= busiest_h_load + 1;
 
 		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
 				rem_load, sd, idle, all_pinned, this_best_prio,
@@ -1462,8 +1464,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		if (!moved_load)
 			continue;
 
-		moved_load *= busiest_cfs_rq->h_load;
-		moved_load /= busiest_cfs_rq->load.weight + 1;
+		moved_load *= busiest_h_load;
+		moved_load /= busiest_weight + 1;
 
 		rem_load_move -= moved_load;
 		if (rem_load_move < 0)
-- 
cgit v1.2.3


From 4be9daaa1b33701f011f4117f22dc1e45a3e6e34 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:30 +0200
Subject: sched: fix task_h_load()

Currently task_h_load() computes the load of a task and uses that to either
subtract it from the total, or add to it.

However, removing or adding a task need not have any effect on the total load
at all. Imagine adding a task to a group that is local to one cpu - in that
case the total load of that cpu is unaffected.

So properly compute addition/removal:

 s_i = S * rw_i / \Sum_j rw_j
 s'_i = S * (rw_i + wl) / (\Sum_j rw_j + wg)

then s'_i - s_i gives the change in load.

Where s_i is the shares for cpu i, S the group weight, rw_i the runqueue weight
for that cpu, wl the weight we add (subtract) and wg the weight contribution to
the runqueue.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 49 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 40 insertions(+), 9 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 734e4c556fcb..a1694441f8b7 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1074,22 +1074,53 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 static const struct sched_class fair_sched_class;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static unsigned long task_h_load(struct task_struct *p)
+static unsigned long effective_load(struct task_group *tg, long wl, int cpu)
 {
-	unsigned long h_load = p->se.load.weight;
-	struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+	struct sched_entity *se = tg->se[cpu];
+	long wg = wl;
 
-	update_h_load(task_cpu(p));
+	for_each_sched_entity(se) {
+#define D(n) (likely(n) ? (n) : 1)
+
+		long S, Srw, rw, s, sn;
+
+		S = se->my_q->tg->shares;
+		s = se->my_q->shares;
+		rw = se->my_q->load.weight;
 
-	h_load = calc_delta_mine(h_load, cfs_rq->h_load, &cfs_rq->load);
+		Srw = S * rw / D(s);
+		sn = S * (rw + wl) / D(Srw + wg);
+
+		wl = sn - s;
+		wg = 0;
+#undef D
+	}
 
-	return h_load;
+	return wl;
 }
+
+static unsigned long task_load_sub(struct task_struct *p)
+{
+	return effective_load(task_group(p), -(long)p->se.load.weight, task_cpu(p));
+}
+
+static unsigned long task_load_add(struct task_struct *p, int cpu)
+{
+	return effective_load(task_group(p), p->se.load.weight, cpu);
+}
+
 #else
-static unsigned long task_h_load(struct task_struct *p)
+
+static unsigned long task_load_sub(struct task_struct *p)
+{
+	return -p->se.load.weight;
+}
+
+static unsigned long task_load_add(struct task_struct *p, int cpu)
 {
 	return p->se.load.weight;
 }
+
 #endif
 
 static int
@@ -1112,9 +1143,9 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	 * of the current CPU:
 	 */
 	if (sync)
-		tl -= task_h_load(current);
+		tl += task_load_sub(current);
 
-	balanced = 100*(tl + task_h_load(p)) <= imbalance*load;
+	balanced = 100*(tl + task_load_add(p, this_cpu)) <= imbalance*load;
 
 	/*
 	 * If the currently running task will sleep within
-- 
cgit v1.2.3


From cb5ef42a03a13f95a9ea94e6cda4f7a47497871f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:32 +0200
Subject: sched: optimize effective_load()

s_i = S * rw_i / \Sum_j rw_j

 -> \Sum_j rw_j = S * rw_i / s_i

 -> s'_i = S * (rw_i + w) / (\Sum_j rw_j + w)

delta s = s' - s = S * (rw + w) / ((S * rw / s) + w)
        = s * (S * (rw + w) / (S * rw + s * w) - 1)

 a = S*(rw+w), b = S*rw + s*w

delta s = s * (a-b) / b

IOW, trade one divide for two multiplies

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a1694441f8b7..0d197be3e3e9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1082,16 +1082,16 @@ static unsigned long effective_load(struct task_group *tg, long wl, int cpu)
 	for_each_sched_entity(se) {
 #define D(n) (likely(n) ? (n) : 1)
 
-		long S, Srw, rw, s, sn;
+		long S, rw, s, a, b;
 
 		S = se->my_q->tg->shares;
 		s = se->my_q->shares;
 		rw = se->my_q->load.weight;
 
-		Srw = S * rw / D(s);
-		sn = S * (rw + wl) / D(Srw + wg);
+		a = S*(rw + wl);
+		b = S*rw + s*wg;
 
-		wl = sn - s;
+		wl = s*(a-b)/D(b);
 		wg = 0;
 #undef D
 	}
-- 
cgit v1.2.3


From 243e0e7b7d3b54749ece2e879ecd7e2a11874443 Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Date: Fri, 27 Jun 2008 13:41:36 +0200
Subject: sched: fix mult overflow

It was observed these mults can overflow.

Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0d197be3e3e9..26ebe180cdea 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1477,7 +1477,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
 		unsigned long busiest_h_load = busiest_cfs_rq->h_load;
 		unsigned long busiest_weight = busiest_cfs_rq->load.weight;
-		long rem_load, moved_load;
+		u64 rem_load, moved_load;
 
 		/*
 		 * empty group
@@ -1485,8 +1485,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		if (!busiest_cfs_rq->task_weight)
 			continue;
 
-		rem_load = rem_load_move * busiest_weight;
-		rem_load /= busiest_h_load + 1;
+		rem_load = (u64)rem_load_move * busiest_weight;
+		rem_load = div_u64(rem_load, busiest_h_load + 1);
 
 		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
 				rem_load, sd, idle, all_pinned, this_best_prio,
@@ -1496,7 +1496,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 			continue;
 
 		moved_load *= busiest_h_load;
-		moved_load /= busiest_weight + 1;
+		moved_load = div_u64(moved_load, busiest_weight + 1);
 
 		rem_load_move -= moved_load;
 		if (rem_load_move < 0)
-- 
cgit v1.2.3


From 83378269a5fad98f562ebc0f09c349575e6cbfe1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:37 +0200
Subject: sched: correct wakeup weight calculations

rw_i = {2, 4, 1, 0}
s_i = {2/7, 4/7, 1/7, 0}

wakeup on cpu0, weight=1

rw'_i = {3, 4, 1, 0}
s'_i = {3/8, 4/8, 1/8, 0}

s_0 = S * rw_0 / \Sum rw_j ->
  \Sum rw_j = S*rw_0/s_0 = 1*2*7/2 = 7 (correct)

s'_0 = S * (rw_0 + 1) / (\Sum rw_j + 1) =
       1 * (2+1) / (7+1) = 3/8 (correct

so we find that adding 1 to cpu0 gains 5/56 in weight
if say the other cpu were, cpu1, we'd also have to calculate its 4/56 loss

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      |  4 ++++
 kernel/sched_fair.c | 48 ++++++++++++++++++++++++++----------------------
 2 files changed, 30 insertions(+), 22 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 62db0891025a..01d3e51b7116 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -365,6 +365,10 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 #else
 
 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+static inline struct task_group *task_group(struct task_struct *p)
+{
+	return NULL;
+}
 
 #endif	/* CONFIG_GROUP_SCHED */
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 26ebe180cdea..bed2f71e63d9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1074,10 +1074,10 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 static const struct sched_class fair_sched_class;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static unsigned long effective_load(struct task_group *tg, long wl, int cpu)
+static unsigned long effective_load(struct task_group *tg, int cpu,
+		unsigned long wl, unsigned long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
-	long wg = wl;
 
 	for_each_sched_entity(se) {
 #define D(n) (likely(n) ? (n) : 1)
@@ -1092,6 +1092,13 @@ static unsigned long effective_load(struct task_group *tg, long wl, int cpu)
 		b = S*rw + s*wg;
 
 		wl = s*(a-b)/D(b);
+		/*
+		 * Assume the group is already running and will
+		 * thus already be accounted for in the weight.
+		 *
+		 * That is, moving shares between CPUs, does not
+		 * alter the group weight.
+		 */
 		wg = 0;
 #undef D
 	}
@@ -1099,26 +1106,12 @@ static unsigned long effective_load(struct task_group *tg, long wl, int cpu)
 	return wl;
 }
 
-static unsigned long task_load_sub(struct task_struct *p)
-{
-	return effective_load(task_group(p), -(long)p->se.load.weight, task_cpu(p));
-}
-
-static unsigned long task_load_add(struct task_struct *p, int cpu)
-{
-	return effective_load(task_group(p), p->se.load.weight, cpu);
-}
-
 #else
 
-static unsigned long task_load_sub(struct task_struct *p)
+static inline unsigned long effective_load(struct task_group *tg, int cpu,
+		unsigned long wl, unsigned long wg)
 {
-	return -p->se.load.weight;
-}
-
-static unsigned long task_load_add(struct task_struct *p, int cpu)
-{
-	return p->se.load.weight;
+	return wl;
 }
 
 #endif
@@ -1130,8 +1123,10 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	    unsigned int imbalance)
 {
 	struct task_struct *curr = this_rq->curr;
+	struct task_group *tg;
 	unsigned long tl = this_load;
 	unsigned long tl_per_task;
+	unsigned long weight;
 	int balanced;
 
 	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
@@ -1142,10 +1137,19 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	 * effect of the currently running task from the load
 	 * of the current CPU:
 	 */
-	if (sync)
-		tl += task_load_sub(current);
+	if (sync) {
+		tg = task_group(current);
+		weight = current->se.load.weight;
+
+		tl += effective_load(tg, this_cpu, -weight, -weight);
+		load += effective_load(tg, prev_cpu, 0, -weight);
+	}
+
+	tg = task_group(p);
+	weight = p->se.load.weight;
 
-	balanced = 100*(tl + task_load_add(p, this_cpu)) <= imbalance*load;
+	balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+		imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
 
 	/*
 	 * If the currently running task will sleep within
-- 
cgit v1.2.3


From f1d239f73200a5803a89e5929fb3abc1596b7589 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:38 +0200
Subject: sched: incremental effective_load()

Increase the accuracy of the effective_load values.

Not only consider the current increment (as per the attempted wakeup), but
also consider the delta between when we last adjusted the shares and the
current situation.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      |  6 ++++++
 kernel/sched_fair.c | 18 +++++++++++++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 01d3e51b7116..7613f69f0978 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -427,6 +427,11 @@ struct cfs_rq {
 	 * this cpu's part of tg->shares
 	 */
 	unsigned long shares;
+
+	/*
+	 * load.weight at the time we set shares
+	 */
+	unsigned long rq_weight;
 #endif
 #endif
 };
@@ -1527,6 +1532,7 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
 	 * record the actual number of shares, not the boosted amount.
 	 */
 	tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+	tg->cfs_rq[cpu]->rq_weight = rq_weight;
 
 	if (shares < MIN_SHARES)
 		shares = MIN_SHARES;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index bed2f71e63d9..e87f1a52f625 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1074,10 +1074,22 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 static const struct sched_class fair_sched_class;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static unsigned long effective_load(struct task_group *tg, int cpu,
-		unsigned long wl, unsigned long wg)
+static long effective_load(struct task_group *tg, int cpu,
+		long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
+	long more_w;
+
+	if (!tg->parent)
+		return wl;
+
+	/*
+	 * Instead of using this increment, also add the difference
+	 * between when the shares were last updated and now.
+	 */
+	more_w = se->my_q->load.weight - se->my_q->rq_weight;
+	wl += more_w;
+	wg += more_w;
 
 	for_each_sched_entity(se) {
 #define D(n) (likely(n) ? (n) : 1)
@@ -1086,7 +1098,7 @@ static unsigned long effective_load(struct task_group *tg, int cpu,
 
 		S = se->my_q->tg->shares;
 		s = se->my_q->shares;
-		rw = se->my_q->load.weight;
+		rw = se->my_q->rq_weight;
 
 		a = S*(rw + wl);
 		b = S*rw + s*wg;
-- 
cgit v1.2.3


From f5bfb7d9ff73d72ee4f2f4830a6f0c9088d00f92 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:39 +0200
Subject: sched: bias effective_load() error towards failing wake_affine().

Measurement shows that the difference between cgroup:/ and cgroup:/foo
wake_affine() results is that the latter succeeds significantly more.

Therefore bias the calculations towards failing the test.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c     | 28 ++++++++++++++++++++++++++++
 kernel/sched_features.h |  1 +
 2 files changed, 29 insertions(+)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e87f1a52f625..9bcc0030a58b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1074,6 +1074,27 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 static const struct sched_class fair_sched_class;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * effective_load() calculates the load change as seen from the root_task_group
+ *
+ * Adding load to a group doesn't make a group heavier, but can cause movement
+ * of group shares between cpus. Assuming the shares were perfectly aligned one
+ * can calculate the shift in shares.
+ *
+ * The problem is that perfectly aligning the shares is rather expensive, hence
+ * we try to avoid doing that too often - see update_shares(), which ratelimits
+ * this change.
+ *
+ * We compensate this by not only taking the current delta into account, but
+ * also considering the delta between when the shares were last adjusted and
+ * now.
+ *
+ * We still saw a performance dip, some tracing learned us that between
+ * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
+ * significantly. Therefore try to bias the error in direction of failing
+ * the affine wakeup.
+ *
+ */
 static long effective_load(struct task_group *tg, int cpu,
 		long wl, long wg)
 {
@@ -1083,6 +1104,13 @@ static long effective_load(struct task_group *tg, int cpu,
 	if (!tg->parent)
 		return wl;
 
+	/*
+	 * By not taking the decrease of shares on the other cpu into
+	 * account our error leans towards reducing the affine wakeups.
+	 */
+	if (!wl && sched_feat(ASYM_EFF_LOAD))
+		return wl;
+
 	/*
 	 * Instead of using this increment, also add the difference
 	 * between when the shares were last updated and now.
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 7d616d2a2a3f..862b06bd560a 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -10,3 +10,4 @@ SCHED_FEAT(DOUBLE_TICK, 0)
 SCHED_FEAT(ASYM_GRAN, 1)
 SCHED_FEAT(LB_BIAS, 0)
 SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
+SCHED_FEAT(ASYM_EFF_LOAD, 1)
-- 
cgit v1.2.3


From 55e12e5e7b1d7e7c05a4be10cb5fd092c039aa78 Mon Sep 17 00:00:00 2001
From: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Date: Tue, 24 Jun 2008 23:39:43 +0530
Subject: sched: make sched_{rt,fair}.c ifdefs more readable

Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c |  6 +++---
 kernel/sched_rt.c   | 10 +++++-----
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 9bcc0030a58b..2e43d4a748c3 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -921,7 +921,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 		hrtick_start(rq, delta, requeue);
 	}
 }
-#else
+#else /* !CONFIG_SCHED_HRTICK */
 static inline void
 hrtick_start_fair(struct rq *rq, struct task_struct *p)
 {
@@ -1062,7 +1062,7 @@ static int wake_idle(int cpu, struct task_struct *p)
 	}
 	return cpu;
 }
-#else
+#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
 static inline int wake_idle(int cpu, struct task_struct *p)
 {
 	return cpu;
@@ -1586,7 +1586,7 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
 	return 0;
 }
-#endif
+#endif /* CONFIG_SMP */
 
 /*
  * scheduler tick hitting a task of our scheduling class:
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 765932d0399d..47ceac9e8552 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -161,7 +161,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 	return &rt_rq->tg->rt_bandwidth;
 }
 
-#else
+#else /* !CONFIG_RT_GROUP_SCHED */
 
 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 {
@@ -226,7 +226,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 	return &def_rt_bandwidth;
 }
 
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
 
 #ifdef CONFIG_SMP
 static int do_balance_runtime(struct rt_rq *rt_rq)
@@ -374,12 +374,12 @@ static int balance_runtime(struct rt_rq *rt_rq)
 
 	return more;
 }
-#else
+#else /* !CONFIG_SMP */
 static inline int balance_runtime(struct rt_rq *rt_rq)
 {
 	return 0;
 }
-#endif
+#endif /* CONFIG_SMP */
 
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 {
@@ -1472,4 +1472,4 @@ static void print_rt_stats(struct seq_file *m, int cpu)
 		print_rt_rq(m, cpu, rt_rq);
 	rcu_read_unlock();
 }
-#endif
+#endif /* CONFIG_SCHED_DEBUG */
-- 
cgit v1.2.3


From 2087a1ad822cd3a68b73338457047fcc54da726b Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Fri, 27 Jun 2008 14:30:00 -0600
Subject: sched: add avg-overlap support to RT tasks

We have the notion of tracking process-coupling (a.k.a. buddy-wake) via
the p->se.last_wake / p->se.avg_overlap facilities, but it is only used
for cfs to cfs interactions.  There is no reason why an rt to cfs
interaction cannot share in establishing a relationhip in a similar
manner.

Because PREEMPT_RT runs many kernel threads as FIFO priority, we often
times have heavy interaction between RT threads waking CFS applications.
This patch offers a substantial boost (50-60%+) in perfomance under those
circumstances.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Cc: npiggin@suse.de
Cc: rostedt@goodmis.org
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 14 ++++++++++++++
 kernel/sched_fair.c | 21 ++-------------------
 2 files changed, 16 insertions(+), 19 deletions(-)

(limited to 'kernel/sched_fair.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index d99aeabeb72f..bbc40c3a0657 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1693,6 +1693,12 @@ static void set_load_weight(struct task_struct *p)
 	p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
 }
 
+static void update_avg(u64 *avg, u64 sample)
+{
+	s64 diff = sample - *avg;
+	*avg += diff >> 3;
+}
+
 static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
 {
 	sched_info_queued(p);
@@ -1702,6 +1708,12 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
 {
+	if (sleep && p->se.last_wakeup) {
+		update_avg(&p->se.avg_overlap,
+			   p->se.sum_exec_runtime - p->se.last_wakeup);
+		p->se.last_wakeup = 0;
+	}
+
 	p->sched_class->dequeue_task(rq, p, sleep);
 	p->se.on_rq = 0;
 }
@@ -2313,6 +2325,8 @@ out_running:
 		p->sched_class->task_wake_up(rq, p);
 #endif
 out:
+	current->se.last_wakeup = current->se.sum_exec_runtime;
+
 	task_rq_unlock(rq, &flags);
 
 	return success;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 2e43d4a748c3..f2aa987027d6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -726,21 +726,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 		__enqueue_entity(cfs_rq, se);
 }
 
-static void update_avg(u64 *avg, u64 sample)
-{
-	s64 diff = sample - *avg;
-	*avg += diff >> 3;
-}
-
-static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-	if (!se->last_wakeup)
-		return;
-
-	update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
-	se->last_wakeup = 0;
-}
-
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 {
@@ -751,7 +736,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 
 	update_stats_dequeue(cfs_rq, se);
 	if (sleep) {
-		update_avg_stats(cfs_rq, se);
 #ifdef CONFIG_SCHEDSTATS
 		if (entity_is_task(se)) {
 			struct task_struct *tsk = task_of(se);
@@ -1196,9 +1180,9 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	 * a reasonable amount of time then attract this newly
 	 * woken task:
 	 */
-	if (sync && balanced && curr->sched_class == &fair_sched_class) {
+	if (sync && balanced) {
 		if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
-				p->se.avg_overlap < sysctl_sched_migration_cost)
+		    p->se.avg_overlap < sysctl_sched_migration_cost)
 			return 1;
 	}
 
@@ -1359,7 +1343,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 		return;
 	}
 
-	se->last_wakeup = se->sum_exec_runtime;
 	if (unlikely(se == pse))
 		return;
 
-- 
cgit v1.2.3