Diffstat (limited to 'kernel/sched/fair.c')
 kernel/sched/fair.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+), 0 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0af3bed3521d..7e5712a0e61b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -925,6 +925,71 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
 		group->faults_cpu[task_faults_idx(nid, 1)];
 }
 
+/* Handle placement on systems where not all nodes are directly connected. */
+static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
+					int maxdist, bool task)
+{
+	unsigned long score = 0;
+	int node;
+
+	/*
+	 * All nodes are directly connected, and the same distance
+	 * from each other. No need for fancy placement algorithms.
+	 */
+	if (sched_numa_topology_type == NUMA_DIRECT)
+		return 0;
+
+	/*
+	 * This code is called for each node, introducing N^2 complexity,
+	 * which should be ok given the number of nodes rarely exceeds 8.
+	 */
+	for_each_online_node(node) {
+		unsigned long faults;
+		int dist = node_distance(nid, node);
+
+		/*
+		 * The furthest away nodes in the system are not interesting
+		 * for placement; nid was already counted.
+		 */
+		if (dist == sched_max_numa_distance || node == nid)
+			continue;
+
+		/*
+		 * On systems with a backplane NUMA topology, compare groups
+		 * of nodes, and move tasks towards the group with the most
+		 * memory accesses. When comparing two nodes at distance
+		 * "hoplimit", only nodes closer by than "hoplimit" are part
+		 * of each group. Skip other nodes.
+		 */
+		if (sched_numa_topology_type == NUMA_BACKPLANE &&
+					dist > maxdist)
+			continue;
+
+		/* Add up the faults from nearby nodes. */
+		if (task)
+			faults = task_faults(p, node);
+		else
+			faults = group_faults(p, node);
+
+		/*
+		 * On systems with a glueless mesh NUMA topology, there are
+		 * no fixed "groups of nodes". Instead, nodes that are not
+		 * directly connected bounce traffic through intermediate
+		 * nodes; a numa_group can occupy any set of nodes.
+		 * The further away a node is, the less the faults count.
+		 * This seems to result in good task placement.
+		 */
+		if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+			faults *= (sched_max_numa_distance - dist);
+			faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
+		}
+
+		score += faults;
+	}
+
+	return score;
+}
+
 /*
  * These return the fraction of accesses done by a particular task, or
  * task group, on a particular numa node. The group weight is given a
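
The glueless-mesh branch above de-weights faults linearly with distance. Below is a standalone sketch, not part of the patch, of that arithmetic; it assumes the ACPI SLIT convention where LOCAL_DISTANCE is 10, and the maximum distance of 40 is made up for illustration.

/*
 * Standalone model of the glueless-mesh fault scaling (illustration
 * only). LOCAL_DISTANCE follows the ACPI SLIT convention; all other
 * numbers are hypothetical.
 */
#include <stdio.h>

#define LOCAL_DISTANCE	10	/* a node's distance to itself */

static unsigned long scale_faults(unsigned long faults, int dist,
				  int max_dist)
{
	/* Nearby nodes keep most of their faults, distant ones keep few. */
	faults *= (max_dist - dist);
	faults /= (max_dist - LOCAL_DISTANCE);
	return faults;
}

int main(void)
{
	int max_dist = 40;	/* hypothetical sched_max_numa_distance */
	int dists[] = { 10, 20, 30 };

	for (int i = 0; i < 3; i++)
		printf("dist %d: 1000 faults -> %lu\n", dists[i],
		       scale_faults(1000, dists[i], max_dist));
	/* Prints 1000, 666 and 333: integer division, monotonic in dist. */
	return 0;
}

A node at sched_max_numa_distance would scale to zero, which is why the loop skips such nodes before doing any work.
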
@@ -945,6 +1010,8 @@ static inline unsigned long task_weight(struct task_struct *p, int nid,
 		return 0;
 
 	faults = task_faults(p, nid);
+	faults += score_nearby_nodes(p, nid, dist, true);
+
 	return 1000 * faults / total_faults;
 }
 
@@ -962,6 +1029,8 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
 		return 0;
 
 	faults = group_faults(p, nid);
+	faults += score_nearby_nodes(p, nid, dist, false);
+
 	return 1000 * faults / total_faults;
 }
 
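
The two hunks above make task_weight() and group_weight() fold the nearby-node score into the same per-mille normalization. Here is a minimal userspace model of that calculation, with hypothetical fault counts standing in for the kernel's decaying NUMA fault statistics:

/*
 * Sketch only: node_faults and nearby_score stand in for
 * task_faults()/group_faults() and score_nearby_nodes().
 */
static unsigned long weight_sketch(unsigned long node_faults,
				   unsigned long nearby_score,
				   unsigned long total_faults)
{
	unsigned long faults;

	if (!total_faults)	/* mirrors the early "return 0" above */
		return 0;

	faults = node_faults + nearby_score;

	/* Scale to 0..1000 so weights of different nodes compare directly. */
	return 1000 * faults / total_faults;
}

For example, a node holding 300 of 1200 total faults with a nearby score of 150 weighs 1000 * 450 / 1200 = 375.
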
@@ -1374,6 +1443,11 @@ static int task_numa_migrate(struct task_struct *p)
 			continue;
 
 		dist = node_distance(env.src_nid, env.dst_nid);
+		if (sched_numa_topology_type == NUMA_BACKPLANE &&
+					dist != env.dist) {
+			taskweight = task_weight(p, env.src_nid, dist);
+			groupweight = group_weight(p, env.src_nid, dist);
+		}
 
 		/* Only consider nodes where both task and groups benefit */
 		taskimp = task_weight(p, nid, dist) - taskweight;
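
On backplane topologies the weights depend on the distance ("hoplimit") they were computed at, so task_numa_migrate() recomputes the source weights whenever the candidate destination lies at a different distance than env.dist. A simplified, illustrative-only model of why both sides of the comparison must use the same distance:

/*
 * Illustration only: if the two weights are taken at different
 * distances, the larger "nearby node" set on one side skews the
 * improvement estimate. All numbers are hypothetical.
 */
#include <stdio.h>

/* stand-in for task_weight(p, nid, dist) */
static long weight(long node_faults, long nearby_score, long total)
{
	return 1000 * (node_faults + nearby_score) / total;
}

int main(void)
{
	long total = 2000;

	/* Same distance on both sides: the comparison is fair. */
	long fair = weight(600, 200, total) - weight(500, 200, total);

	/* A larger hoplimit on one side admits more nearby faults. */
	long skewed = weight(600, 200, total) - weight(500, 450, total);

	/* Prints 50 and -75: the mismatch can even flip the sign. */
	printf("fair imp %ld, skewed imp %ld\n", fair, skewed);
	return 0;
}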