From a555e9d86ee384d9d3cb3310a57aed33f7e053d4 Mon Sep 17 00:00:00 2001
From: Cheng Jian <cj.chengjian@huawei.com>
Date: Thu, 7 Dec 2017 21:30:43 +0800
Subject: sched/fair: Remove unused 'curr' parameter from wakeup_gran

The first parameter of wakeup_gran(), 'curr', is unnecessary now.

Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: huawei.libin@huawei.com
Cc: xiexiuqi@huawei.com
Link: http://lkml.kernel.org/r/1512653443-179848-1-git-send-email-cj.chengjian@huawei.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2fe3aa853e4d..2915c0d95107 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6449,8 +6449,7 @@ static void task_dead_fair(struct task_struct *p)
 }
 #endif /* CONFIG_SMP */
 
-static unsigned long
-wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
+static unsigned long wakeup_gran(struct sched_entity *se)
 {
 	unsigned long gran = sysctl_sched_wakeup_granularity;
 
@@ -6492,7 +6491,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
 	if (vdiff <= 0)
 		return -1;
 
-	gran = wakeup_gran(curr, se);
+	gran = wakeup_gran(se);
 	if (vdiff > gran)
 		return 1;
 
-- 
cgit v1.2.3


From f01415fdbfe83380c2dfcf90b7b26042f88963aa Mon Sep 17 00:00:00 2001
From: Patrick Bellasi <patrick.bellasi@arm.com>
Date: Tue, 5 Dec 2017 17:10:15 +0000
Subject: sched/fair: Use 'unsigned long' for utilization, consistently

Utilization and capacity are tracked as 'unsigned long', however some
functions using them return an 'int' which is ultimately assigned back to
'unsigned long' variables.

Since there is not scope on using a different and signed type,
consolidate the signature of functions returning utilization to always
use the native type.

This change improves code consistency, and it also benefits
code paths where utilizations should be clamped by avoiding
further type conversions or ugly type casts.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Chris Redpath <chris.redpath@arm.com>
Reviewed-by: Brendan Jackman <brendan.jackman@arm.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Todd Kjos <tkjos@android.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Link: http://lkml.kernel.org/r/20171205171018.9203-2-patrick.bellasi@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2915c0d95107..de43bd80a98f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5765,8 +5765,8 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 	return affine;
 }
 
-static inline int task_util(struct task_struct *p);
-static int cpu_util_wake(int cpu, struct task_struct *p);
+static inline unsigned long task_util(struct task_struct *p);
+static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
 
 static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
 {
@@ -6247,7 +6247,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
  * capacity_orig) as it useful for predicting the capacity required after task
  * migrations (scheduler-driven DVFS).
  */
-static int cpu_util(int cpu)
+static unsigned long cpu_util(int cpu)
 {
 	unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
 	unsigned long capacity = capacity_orig_of(cpu);
@@ -6255,7 +6255,7 @@ static int cpu_util(int cpu)
 	return (util >= capacity) ? capacity : util;
 }
 
-static inline int task_util(struct task_struct *p)
+static inline unsigned long task_util(struct task_struct *p)
 {
 	return p->se.avg.util_avg;
 }
@@ -6264,7 +6264,7 @@ static inline int task_util(struct task_struct *p)
  * cpu_util_wake: Compute cpu utilization with any contributions from
  * the waking task p removed.
  */
-static int cpu_util_wake(int cpu, struct task_struct *p)
+static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
 {
 	unsigned long util, capacity;
 
-- 
cgit v1.2.3


From f453ae2200b0d1b7abc0c3794ce088899ac7a2af Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Thu, 14 Dec 2017 13:21:58 -0800
Subject: sched/fair: Consider RT/IRQ pressure in capacity_spare_wake()

capacity_spare_wake() in the slow path influences choice of idlest groups,
as we search for groups with maximum spare capacity. In scenarios where
RT pressure is high, a sub optimal group can be chosen and hurt
performance of the task being woken up.

Fix this by using capacity_of() instead of capacity_orig_of() in capacity_spare_wake().

Tests results from improvements with this change are below. More tests
were also done by myself and Matt Fleming to ensure no degradation in
different benchmarks.

1) Rohit ran barrier.c test (details below) with following improvements:
------------------------------------------------------------------------
This was Rohit's original use case for a patch he posted at [1] however
from his recent tests he showed my patch can replace his slow path
changes [1] and there's no need to selectively scan/skip CPUs in
find_idlest_group_cpu in the slow path to get the improvement he sees.

barrier.c (open_mp code) as a micro-benchmark. It does a number of
iterations and barrier sync at the end of each for loop.

Here barrier,c is running in along with ping on CPU 0 and 1 as:
'ping -l 10000 -q -s 10 -f hostX'

barrier.c can be found at:
http://www.spinics.net/lists/kernel/msg2506955.html

Following are the results for the iterations per second with this
micro-benchmark (higher is better), on a 44 core, 2 socket 88 Threads
Intel x86 machine:
+--------+------------------+---------------------------+
|Threads | Without patch    | With patch                |
|        |                  |                           |
+--------+--------+---------+-----------------+---------+
|        | Mean   | Std Dev | Mean            | Std Dev |
+--------+--------+---------+-----------------+---------+
|1       | 539.36 | 60.16   | 572.54 (+6.15%) | 40.95   |
|2       | 481.01 | 19.32   | 530.64 (+10.32%)| 56.16   |
|4       | 474.78 | 22.28   | 479.46 (+0.99%) | 18.89   |
|8       | 450.06 | 24.91   | 447.82 (-0.50%) | 12.36   |
|16      | 436.99 | 22.57   | 441.88 (+1.12%) | 7.39    |
|32      | 388.28 | 55.59   | 429.4  (+10.59%)| 31.14   |
|64      | 314.62 | 6.33    | 311.81 (-0.89%) | 11.99   |
+--------+--------+---------+-----------------+---------+

2) ping+hackbench test on bare-metal sever (by Rohit)
-----------------------------------------------------
Here hackbench is running in threaded mode along
with, running ping on CPU 0 and 1 as:
'ping -l 10000 -q -s 10 -f hostX'

This test is running on 2 socket, 20 core and 40 threads Intel x86
machine:
Number of loops is 10000 and runtime is in seconds (Lower is better).

+--------------+-----------------+--------------------------+
|Task Groups   | Without patch   |  With patch              |
|              +-------+---------+----------------+---------+
|(Groups of 40)| Mean  | Std Dev |  Mean          | Std Dev |
+--------------+-------+---------+----------------+---------+
|1             | 0.851 | 0.007   |  0.828 (+2.77%)| 0.032   |
|2             | 1.083 | 0.203   |  1.087 (-0.37%)| 0.246   |
|4             | 1.601 | 0.051   |  1.611 (-0.62%)| 0.055   |
|8             | 2.837 | 0.060   |  2.827 (+0.35%)| 0.031   |
|16            | 5.139 | 0.133   |  5.107 (+0.63%)| 0.085   |
|25            | 7.569 | 0.142   |  7.503 (+0.88%)| 0.143   |
+--------------+-------+---------+----------------+---------+

[1] https://patchwork.kernel.org/patch/9991635/

Matt Fleming also ran several different hackbench tests and cyclic test
to santiy-check that the patch doesn't harm other usecases.

Tested-by: Matt Fleming <matt@codeblueprint.co.uk>
Tested-by: Rohit Jain <rohit.k.jain@oracle.com>
Signed-off-by: Joel Fernandes <joelaf@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Atish Patra <atish.patra@oracle.com>
Cc: Brendan Jackman <brendan.jackman@arm.com>
Cc: Chris Redpath <Chris.Redpath@arm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Ramussen <morten.rasmussen@arm.com>
Cc: Patrick Bellasi <patrick.bellasi@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Saravana Kannan <skannan@quicinc.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steve Muckle <smuckle@google.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vikram Mulukutla <markivx@codeaurora.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Link: http://lkml.kernel.org/r/20171214212158.188190-1-joelaf@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index de43bd80a98f..6e775ac39eb4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5770,7 +5770,7 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
 
 static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
 {
-	return capacity_orig_of(cpu) - cpu_util_wake(cpu, p);
+	return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
 }
 
 /*
-- 
cgit v1.2.3


From 18cec7e0ddd5e28b7722f7049d715873373be3e9 Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Fri, 15 Dec 2017 07:39:44 -0800
Subject: sched/fair: Remove impossible condition from find_idlest_group_cpu()

find_idlest_group_cpu() goes through CPUs of a group previous selected by
find_idlest_group(). find_idlest_group() returns NULL if the local group is the
selected one and doesn't execute find_idlest_group_cpu if the group to which
'cpu' belongs to is chosen. So we're always guaranteed to call
find_idlest_group_cpu() with a group to which 'cpu' is non-local.

This makes one of the conditions in find_idlest_group_cpu() an impossible one,
which we can get rid off.

Signed-off-by: Joel Fernandes <joelaf@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Brendan Jackman <brendan.jackman@arm.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Android Kernel <kernel-team@android.com>
Cc: Atish Patra <atish.patra@oracle.com>
Cc: Chris Redpath <Chris.Redpath@arm.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: EAS Dev <eas-dev@lists.linaro.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Ramussen <morten.rasmussen@arm.com>
Cc: Patrick Bellasi <patrick.bellasi@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Rohit Jain <rohit.k.jain@oracle.com>
Cc: Saravana Kannan <skannan@quicinc.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steve Muckle <smuckle@google.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vikram Mulukutla <markivx@codeaurora.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Link: http://lkml.kernel.org/r/20171215153944.220146-3-joelaf@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6e775ac39eb4..3e7606d3ad0f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5950,7 +5950,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
 			}
 		} else if (shallowest_idle_cpu == -1) {
 			load = weighted_cpuload(cpu_rq(i));
-			if (load < min_load || (load == min_load && i == this_cpu)) {
+			if (load < min_load) {
 				min_load = load;
 				least_loaded_cpu = i;
 			}
-- 
cgit v1.2.3


From 9783be2c0e90bbaceec3c471c4fb017bff7293ba Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Fri, 15 Dec 2017 07:39:43 -0800
Subject: sched/fair: Correct obsolete comment about cpufreq_update_util()

Since the remote cpufreq callback work, the cpufreq_update_util() call can happen
from remote CPUs. The comment about local CPUs is thus obsolete. Update it
accordingly.

Signed-off-by: Joel Fernandes <joelaf@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Android Kernel <kernel-team@android.com>
Cc: Atish Patra <atish.patra@oracle.com>
Cc: Chris Redpath <Chris.Redpath@arm.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: EAS Dev <eas-dev@lists.linaro.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Ramussen <morten.rasmussen@arm.com>
Cc: Patrick Bellasi <patrick.bellasi@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Rohit Jain <rohit.k.jain@oracle.com>
Cc: Saravana Kannan <skannan@quicinc.com>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Steve Muckle <smuckle@google.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vikram Mulukutla <markivx@codeaurora.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Link: http://lkml.kernel.org/r/20171215153944.220146-2-joelaf@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3e7606d3ad0f..59e66a5848d0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3020,9 +3020,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
 		/*
 		 * There are a few boundary cases this might miss but it should
 		 * get called often enough that that should (hopefully) not be
-		 * a real problem -- added to that it only calls on the local
-		 * CPU, so if we enqueue remotely we'll miss an update, but
-		 * the next tick/schedule should update.
+		 * a real problem.
 		 *
 		 * It will not get called when we go idle, because the idle
 		 * thread is a different class (!fair), nor will the utilization
-- 
cgit v1.2.3


From 7332dec055f2457c386032f7e9b2991eb05c2a0a Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Tue, 19 Dec 2017 08:59:47 +0000
Subject: sched/fair: Only immediately migrate tasks due to interrupts if prev
 and target CPUs share cache

If waking from an idle CPU due to an interrupt then it's possible that
the waker task will be pulled to wake on the current CPU. Unfortunately,
depending on the type of interrupt and IRQ configuration, there may not
be a strong relationship between the CPU an interrupt was delivered on
and the CPU a task was running on. For example, the interrupts could all
be delivered to CPUs on one particular node due to the machine topology
or IRQ affinity configuration. Another example is an interrupt for an IO
completion which can be delivered to any CPU where there is no guarantee
the data is either cache hot or even local.

This patch was motivated by the observation that an IO workload was
being pulled cross-node on a frequent basis when IO completed.  From a
wakeup latency perspective, it's still useful to know that an idle CPU is
immediately available for use but lets only consider an automatic migration
if the CPUs share cache to limit damage due to NUMA migrations. Migrations
may still occur if wake_affine_weight determines it's appropriate.

These are the throughput results for dbench running on ext4 comparing
4.15-rc3 and this patch on a 2-socket machine where interrupts due to IO
completions can happen on any CPU.

                          4.15.0-rc3             4.15.0-rc3
                             vanilla            lessmigrate
Hmean     1        854.64 (   0.00%)      865.01 (   1.21%)
Hmean     2       1229.60 (   0.00%)     1274.44 (   3.65%)
Hmean     4       1591.81 (   0.00%)     1628.08 (   2.28%)
Hmean     8       1845.04 (   0.00%)     1831.80 (  -0.72%)
Hmean     16      2038.61 (   0.00%)     2091.44 (   2.59%)
Hmean     32      2327.19 (   0.00%)     2430.29 (   4.43%)
Hmean     64      2570.61 (   0.00%)     2568.54 (  -0.08%)
Hmean     128     2481.89 (   0.00%)     2499.28 (   0.70%)
Stddev    1         14.31 (   0.00%)        5.35 (  62.65%)
Stddev    2         21.29 (   0.00%)       11.09 (  47.92%)
Stddev    4          7.22 (   0.00%)        6.80 (   5.92%)
Stddev    8         26.70 (   0.00%)        9.41 (  64.76%)
Stddev    16        22.40 (   0.00%)       20.01 (  10.70%)
Stddev    32        45.13 (   0.00%)       44.74 (   0.85%)
Stddev    64        93.10 (   0.00%)       93.18 (  -0.09%)
Stddev    128      184.28 (   0.00%)      177.85 (   3.49%)

Note the small increase in throughput for low thread counts but also
note that the standard deviation for each sample during the test run is
lower. The throughput figures for dbench can be misleading so the benchmark
is actually modified to time the latency of the processing of one load
file with many samples taken. The difference in latency is

                           4.15.0-rc3             4.15.0-rc3
                              vanilla            lessmigrate
Amean      1         21.71 (   0.00%)       21.47 (   1.08%)
Amean      2         30.89 (   0.00%)       29.58 (   4.26%)
Amean      4         47.54 (   0.00%)       46.61 (   1.97%)
Amean      8         82.71 (   0.00%)       82.81 (  -0.12%)
Amean      16       149.45 (   0.00%)      145.01 (   2.97%)
Amean      32       265.49 (   0.00%)      248.43 (   6.42%)
Amean      64       463.23 (   0.00%)      463.55 (  -0.07%)
Amean      128      933.97 (   0.00%)      935.50 (  -0.16%)
Stddev     1          1.58 (   0.00%)        1.54 (   2.26%)
Stddev     2          2.84 (   0.00%)        2.95 (  -4.15%)
Stddev     4          6.78 (   0.00%)        6.85 (  -0.99%)
Stddev     8         16.85 (   0.00%)       16.37 (   2.85%)
Stddev     16        41.59 (   0.00%)       41.04 (   1.32%)
Stddev     32       111.05 (   0.00%)      105.11 (   5.35%)
Stddev     64       285.94 (   0.00%)      288.01 (  -0.72%)
Stddev     128      803.39 (   0.00%)      809.73 (  -0.79%)

It's a small improvement which is not surprising given that migrations that
migrate to a different node as not that common. However, it is noticeable
in the CPU migration statistics which are reduced by 24%.

There was a query for v1 of this patch about NAS so here are the results
for C-class using MPI for parallelisation on the same machine

nas-mpi
                      4.15.0-rc3             4.15.0-rc3
                         vanilla                  noirq
Time cg.C       24.25 (   0.00%)       23.17 (   4.45%)
Time ep.C        8.22 (   0.00%)        8.29 (  -0.85%)
Time ft.C       22.67 (   0.00%)       20.34 (  10.28%)
Time is.C        1.42 (   0.00%)        1.47 (  -3.52%)
Time lu.C       55.62 (   0.00%)       54.81 (   1.46%)
Time mg.C        7.93 (   0.00%)        7.91 (   0.25%)

          4.15.0-rc3  4.15.0-rc3
             vanilla  noirq-v1r1
User         3799.96     3748.34
System        672.10      626.15
Elapsed        91.91       79.49

lu.C sees a small gain, ft.C a large gain and ep.C and is.C see small
regressions but in terms of absolute time, the difference is small and
likely within run-to-run variance. System CPU usage is slightly reduced.

schbench from Facebook was also requested. This is a bit of a mixed bag but
it's important to note that this workload should not be heavily impacted
by wakeups from interrupt context.

                                 4.15.0-rc3             4.15.0-rc3
                                    vanilla             noirq-v1r1
Lat 50.00th-qrtle-1        41.00 (   0.00%)       41.00 (   0.00%)
Lat 75.00th-qrtle-1        42.00 (   0.00%)       42.00 (   0.00%)
Lat 90.00th-qrtle-1        43.00 (   0.00%)       44.00 (  -2.33%)
Lat 95.00th-qrtle-1        44.00 (   0.00%)       46.00 (  -4.55%)
Lat 99.00th-qrtle-1        57.00 (   0.00%)       58.00 (  -1.75%)
Lat 99.50th-qrtle-1        59.00 (   0.00%)       59.00 (   0.00%)
Lat 99.90th-qrtle-1        67.00 (   0.00%)       78.00 ( -16.42%)
Lat 50.00th-qrtle-2        40.00 (   0.00%)       51.00 ( -27.50%)
Lat 75.00th-qrtle-2        45.00 (   0.00%)       56.00 ( -24.44%)
Lat 90.00th-qrtle-2        53.00 (   0.00%)       59.00 ( -11.32%)
Lat 95.00th-qrtle-2        57.00 (   0.00%)       61.00 (  -7.02%)
Lat 99.00th-qrtle-2        67.00 (   0.00%)       71.00 (  -5.97%)
Lat 99.50th-qrtle-2        69.00 (   0.00%)       74.00 (  -7.25%)
Lat 99.90th-qrtle-2        83.00 (   0.00%)       77.00 (   7.23%)
Lat 50.00th-qrtle-4        51.00 (   0.00%)       51.00 (   0.00%)
Lat 75.00th-qrtle-4        57.00 (   0.00%)       56.00 (   1.75%)
Lat 90.00th-qrtle-4        60.00 (   0.00%)       59.00 (   1.67%)
Lat 95.00th-qrtle-4        62.00 (   0.00%)       62.00 (   0.00%)
Lat 99.00th-qrtle-4        73.00 (   0.00%)       72.00 (   1.37%)
Lat 99.50th-qrtle-4        76.00 (   0.00%)       74.00 (   2.63%)
Lat 99.90th-qrtle-4        85.00 (   0.00%)       78.00 (   8.24%)
Lat 50.00th-qrtle-8        54.00 (   0.00%)       58.00 (  -7.41%)
Lat 75.00th-qrtle-8        59.00 (   0.00%)       62.00 (  -5.08%)
Lat 90.00th-qrtle-8        65.00 (   0.00%)       66.00 (  -1.54%)
Lat 95.00th-qrtle-8        67.00 (   0.00%)       70.00 (  -4.48%)
Lat 99.00th-qrtle-8        78.00 (   0.00%)       79.00 (  -1.28%)
Lat 99.50th-qrtle-8        81.00 (   0.00%)       80.00 (   1.23%)
Lat 99.90th-qrtle-8       116.00 (   0.00%)       83.00 (  28.45%)
Lat 50.00th-qrtle-16       65.00 (   0.00%)       64.00 (   1.54%)
Lat 75.00th-qrtle-16       77.00 (   0.00%)       71.00 (   7.79%)
Lat 90.00th-qrtle-16       83.00 (   0.00%)       82.00 (   1.20%)
Lat 95.00th-qrtle-16       87.00 (   0.00%)       87.00 (   0.00%)
Lat 99.00th-qrtle-16       95.00 (   0.00%)       96.00 (  -1.05%)
Lat 99.50th-qrtle-16       99.00 (   0.00%)      103.00 (  -4.04%)
Lat 99.90th-qrtle-16      104.00 (   0.00%)      122.00 ( -17.31%)
Lat 50.00th-qrtle-32       71.00 (   0.00%)       73.00 (  -2.82%)
Lat 75.00th-qrtle-32       91.00 (   0.00%)       92.00 (  -1.10%)
Lat 90.00th-qrtle-32      108.00 (   0.00%)      107.00 (   0.93%)
Lat 95.00th-qrtle-32      118.00 (   0.00%)      115.00 (   2.54%)
Lat 99.00th-qrtle-32      134.00 (   0.00%)      129.00 (   3.73%)
Lat 99.50th-qrtle-32      138.00 (   0.00%)      133.00 (   3.62%)
Lat 99.90th-qrtle-32      149.00 (   0.00%)      146.00 (   2.01%)
Lat 50.00th-qrtle-39       83.00 (   0.00%)       81.00 (   2.41%)
Lat 75.00th-qrtle-39      105.00 (   0.00%)      102.00 (   2.86%)
Lat 90.00th-qrtle-39      120.00 (   0.00%)      119.00 (   0.83%)
Lat 95.00th-qrtle-39      129.00 (   0.00%)      128.00 (   0.78%)
Lat 99.00th-qrtle-39      153.00 (   0.00%)      149.00 (   2.61%)
Lat 99.50th-qrtle-39      166.00 (   0.00%)      156.00 (   6.02%)
Lat 99.90th-qrtle-39    12304.00 (   0.00%)    12848.00 (  -4.42%)

When heavily loaded (e.g. 99.50th-qrtle-39 indicates 39 threads), there
are small gains in many cases. Otherwise it depends on the quartile used
where it can be bad -- e.g. 75.00th-qrtle-2. However, even these results
are probably a co-incidence. For this workload, much depends on what node
the threads get placed on and their relative locality and not wakeups from
interrupt context. A larger component on how it behaves would be automatic
NUMA balancing where a fault incurred to measure locality would be a much
larger contributer to latency than the wakeup path.

This is the results from an almost identical machine that happened to run
the same test.  They only differ in terms of storage which is irrelevant
for this test.

                                 4.15.0-rc3             4.15.0-rc3
                                    vanilla             noirq-v1r1
Lat 50.00th-qrtle-1        41.00 (   0.00%)       41.00 (   0.00%)
Lat 75.00th-qrtle-1        42.00 (   0.00%)       42.00 (   0.00%)
Lat 90.00th-qrtle-1        44.00 (   0.00%)       43.00 (   2.27%)
Lat 95.00th-qrtle-1        53.00 (   0.00%)       45.00 (  15.09%)
Lat 99.00th-qrtle-1        59.00 (   0.00%)       58.00 (   1.69%)
Lat 99.50th-qrtle-1        60.00 (   0.00%)       59.00 (   1.67%)
Lat 99.90th-qrtle-1        86.00 (   0.00%)       61.00 (  29.07%)
Lat 50.00th-qrtle-2        52.00 (   0.00%)       41.00 (  21.15%)
Lat 75.00th-qrtle-2        57.00 (   0.00%)       46.00 (  19.30%)
Lat 90.00th-qrtle-2        60.00 (   0.00%)       53.00 (  11.67%)
Lat 95.00th-qrtle-2        62.00 (   0.00%)       57.00 (   8.06%)
Lat 99.00th-qrtle-2        73.00 (   0.00%)       68.00 (   6.85%)
Lat 99.50th-qrtle-2        74.00 (   0.00%)       71.00 (   4.05%)
Lat 99.90th-qrtle-2        90.00 (   0.00%)       75.00 (  16.67%)
Lat 50.00th-qrtle-4        57.00 (   0.00%)       52.00 (   8.77%)
Lat 75.00th-qrtle-4        60.00 (   0.00%)       58.00 (   3.33%)
Lat 90.00th-qrtle-4        62.00 (   0.00%)       62.00 (   0.00%)
Lat 95.00th-qrtle-4        65.00 (   0.00%)       65.00 (   0.00%)
Lat 99.00th-qrtle-4        76.00 (   0.00%)       75.00 (   1.32%)
Lat 99.50th-qrtle-4        77.00 (   0.00%)       77.00 (   0.00%)
Lat 99.90th-qrtle-4        87.00 (   0.00%)       81.00 (   6.90%)
Lat 50.00th-qrtle-8        59.00 (   0.00%)       57.00 (   3.39%)
Lat 75.00th-qrtle-8        63.00 (   0.00%)       62.00 (   1.59%)
Lat 90.00th-qrtle-8        66.00 (   0.00%)       67.00 (  -1.52%)
Lat 95.00th-qrtle-8        68.00 (   0.00%)       70.00 (  -2.94%)
Lat 99.00th-qrtle-8        79.00 (   0.00%)       80.00 (  -1.27%)
Lat 99.50th-qrtle-8        80.00 (   0.00%)       84.00 (  -5.00%)
Lat 99.90th-qrtle-8        84.00 (   0.00%)       90.00 (  -7.14%)
Lat 50.00th-qrtle-16       65.00 (   0.00%)       65.00 (   0.00%)
Lat 75.00th-qrtle-16       77.00 (   0.00%)       75.00 (   2.60%)
Lat 90.00th-qrtle-16       84.00 (   0.00%)       83.00 (   1.19%)
Lat 95.00th-qrtle-16       88.00 (   0.00%)       87.00 (   1.14%)
Lat 99.00th-qrtle-16       97.00 (   0.00%)       96.00 (   1.03%)
Lat 99.50th-qrtle-16      100.00 (   0.00%)      104.00 (  -4.00%)
Lat 99.90th-qrtle-16      110.00 (   0.00%)      126.00 ( -14.55%)
Lat 50.00th-qrtle-32       70.00 (   0.00%)       71.00 (  -1.43%)
Lat 75.00th-qrtle-32       92.00 (   0.00%)       94.00 (  -2.17%)
Lat 90.00th-qrtle-32      110.00 (   0.00%)      110.00 (   0.00%)
Lat 95.00th-qrtle-32      121.00 (   0.00%)      118.00 (   2.48%)
Lat 99.00th-qrtle-32      135.00 (   0.00%)      137.00 (  -1.48%)
Lat 99.50th-qrtle-32      140.00 (   0.00%)      146.00 (  -4.29%)
Lat 99.90th-qrtle-32      150.00 (   0.00%)      160.00 (  -6.67%)
Lat 50.00th-qrtle-39       80.00 (   0.00%)       71.00 (  11.25%)
Lat 75.00th-qrtle-39      102.00 (   0.00%)       91.00 (  10.78%)
Lat 90.00th-qrtle-39      118.00 (   0.00%)      108.00 (   8.47%)
Lat 95.00th-qrtle-39      128.00 (   0.00%)      117.00 (   8.59%)
Lat 99.00th-qrtle-39      149.00 (   0.00%)      133.00 (  10.74%)
Lat 99.50th-qrtle-39      160.00 (   0.00%)      139.00 (  13.12%)
Lat 99.90th-qrtle-39    13808.00 (   0.00%)     4920.00 (  64.37%)

Despite being nearly identical, it showed a variety of major gains so
I'm not convinced that heavy emphasis should be placed on this particular
workload in terms of evaluating this particular patch. Further evidence of
this is the fact that testing on a UMA machine showed small gains/losses
even though the patch should be a no-op on UMA.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171219085947.13136-2-mgorman@techsingularity.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 59e66a5848d0..9fec992410f7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5687,8 +5687,8 @@ static int wake_wide(struct task_struct *p)
  * soonest. For the purpose of speed we only consider the waking and previous
  * CPU.
  *
- * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
- *			will be) idle.
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is
+ *			cache-affine and is (or	will be) idle.
  *
  * wake_affine_weight() - considers the weight to reflect the average
  *			  scheduling latency of the CPUs. This seems to work
@@ -5699,7 +5699,13 @@ static bool
 wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
 		 int this_cpu, int prev_cpu, int sync)
 {
-	if (idle_cpu(this_cpu))
+	/*
+	 * If this_cpu is idle, it implies the wakeup is from interrupt
+	 * context. Only allow the move if cache is shared. Otherwise an
+	 * interrupt intensive workload could force all tasks onto one
+	 * node depending on the IO topology or IRQ affinity settings.
+	 */
+	if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
 		return true;
 
 	if (sync && cpu_rq(this_cpu)->nr_running == 1)
-- 
cgit v1.2.3


From 7673c8a4c75d1cac2cd47156b9768f462683a09d Mon Sep 17 00:00:00 2001
From: Juri Lelli <juri.lelli@arm.com>
Date: Mon, 4 Dec 2017 11:23:23 +0100
Subject: sched/cpufreq: Remove arch_scale_freq_capacity()'s 'sd' parameter

The 'sd' parameter is never used in arch_scale_freq_capacity() (and it's hard to
see where information coming from scheduling domains might help doing
frequency invariance scaling).

Remove it; also in anticipation of moving arch_scale_freq_capacity()
outside CONFIG_SMP.

Signed-off-by: Juri Lelli <juri.lelli@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: alessio.balsini@arm.com
Cc: bristot@redhat.com
Cc: claudio@evidence.eu.com
Cc: dietmar.eggemann@arm.com
Cc: joelaf@google.com
Cc: juri.lelli@redhat.com
Cc: luca.abeni@santannapisa.it
Cc: mathieu.poirier@linaro.org
Cc: morten.rasmussen@arm.com
Cc: patrick.bellasi@arm.com
Cc: rjw@rjwysocki.net
Cc: rostedt@goodmis.org
Cc: tkjos@android.com
Cc: tommaso.cucinotta@santannapisa.it
Cc: vincent.guittot@linaro.org
Cc: viresh.kumar@linaro.org
Link: http://lkml.kernel.org/r/20171204102325.5110-7-juri.lelli@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/arch_topology.h | 2 +-
 kernel/sched/fair.c           | 2 +-
 kernel/sched/sched.h          | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index 304511267c82..2b709416de05 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -27,7 +27,7 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity);
 DECLARE_PER_CPU(unsigned long, freq_scale);
 
 static inline
-unsigned long topology_get_freq_scale(struct sched_domain *sd, int cpu)
+unsigned long topology_get_freq_scale(int cpu)
 {
 	return per_cpu(freq_scale, cpu);
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9fec992410f7..14859757bff0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3120,7 +3120,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
 	u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
 	u64 periods;
 
-	scale_freq = arch_scale_freq_capacity(NULL, cpu);
+	scale_freq = arch_scale_freq_capacity(cpu);
 	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
 
 	delta += sa->period_contrib;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c5197338ac47..b7100192ecd3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1675,7 +1675,7 @@ extern void sched_avg_update(struct rq *rq);
 
 #ifndef arch_scale_freq_capacity
 static __always_inline
-unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
+unsigned long arch_scale_freq_capacity(int cpu)
 {
 	return SCHED_CAPACITY_SCALE;
 }
@@ -1694,7 +1694,7 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
 
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 {
-	rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
+	rq->rt_avg += rt_delta * arch_scale_freq_capacity(cpu_of(rq));
 	sched_avg_update(rq);
 }
 #else
-- 
cgit v1.2.3


From 07881166a892fa4908ac4924660a7793f75d6544 Mon Sep 17 00:00:00 2001
From: Juri Lelli <juri.lelli@arm.com>
Date: Mon, 4 Dec 2017 11:23:25 +0100
Subject: sched/deadline: Make bandwidth enforcement scale-invariant

Apply frequency and CPU scale-invariance correction factor to bandwidth
enforcement (similar to what we already do to fair utilization tracking).

Each delta_exec gets scaled considering current frequency and maximum
CPU capacity; which means that the reservation runtime parameter (that
need to be specified profiling the task execution at max frequency on
biggest capacity core) gets thus scaled accordingly.

Signed-off-by: Juri Lelli <juri.lelli@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Claudio Scordino <claudio@evidence.eu.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luca Abeni <luca.abeni@santannapisa.it>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: alessio.balsini@arm.com
Cc: bristot@redhat.com
Cc: dietmar.eggemann@arm.com
Cc: joelaf@google.com
Cc: juri.lelli@redhat.com
Cc: mathieu.poirier@linaro.org
Cc: morten.rasmussen@arm.com
Cc: patrick.bellasi@arm.com
Cc: rjw@rjwysocki.net
Cc: rostedt@goodmis.org
Cc: tkjos@android.com
Cc: tommaso.cucinotta@santannapisa.it
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/20171204102325.5110-9-juri.lelli@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 26 ++++++++++++++++++++++----
 kernel/sched/fair.c     |  2 --
 kernel/sched/sched.h    |  2 ++
 3 files changed, 24 insertions(+), 6 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 54a0dc1424a9..9bb0e0c412ec 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1151,7 +1151,8 @@ static void update_curr_dl(struct rq *rq)
 {
 	struct task_struct *curr = rq->curr;
 	struct sched_dl_entity *dl_se = &curr->dl;
-	u64 delta_exec;
+	u64 delta_exec, scaled_delta_exec;
+	int cpu = cpu_of(rq);
 
 	if (!dl_task(curr) || !on_dl_rq(dl_se))
 		return;
@@ -1185,9 +1186,26 @@ static void update_curr_dl(struct rq *rq)
 	if (dl_entity_is_special(dl_se))
 		return;
 
-	if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM))
-		delta_exec = grub_reclaim(delta_exec, rq, &curr->dl);
-	dl_se->runtime -= delta_exec;
+	/*
+	 * For tasks that participate in GRUB, we implement GRUB-PA: the
+	 * spare reclaimed bandwidth is used to clock down frequency.
+	 *
+	 * For the others, we still need to scale reservation parameters
+	 * according to current frequency and CPU maximum capacity.
+	 */
+	if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) {
+		scaled_delta_exec = grub_reclaim(delta_exec,
+						 rq,
+						 &curr->dl);
+	} else {
+		unsigned long scale_freq = arch_scale_freq_capacity(cpu);
+		unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+		scaled_delta_exec = cap_scale(delta_exec, scale_freq);
+		scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu);
+	}
+
+	dl_se->runtime -= scaled_delta_exec;
 
 throttle:
 	if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 14859757bff0..1070803cb423 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3089,8 +3089,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
 	return c1 + c2 + c3;
 }
 
-#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
-
 /*
  * Accumulate the three separate parts of the sum; d1 the remainder
  * of the last (incomplete) period, d2 the span of full periods and d3
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e122c89bdbdd..2e95505e23c6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -156,6 +156,8 @@ static inline int task_has_dl_policy(struct task_struct *p)
 	return dl_policy(p->policy);
 }
 
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
 /*
  * !! For sched_setattr_nocheck() (kernel) only !!
  *
-- 
cgit v1.2.3