diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cpuset.c | 2 | ||||
-rw-r--r-- | kernel/sched.c | 963 | ||||
-rw-r--r-- | kernel/sched_fair.c | 32 |
3 files changed, 318 insertions, 679 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 33eee16addb8..2bb8c2e98fff 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1159,7 +1159,7 @@ int current_cpuset_is_being_rebound(void) static int update_relax_domain_level(struct cpuset *cs, s64 val) { #ifdef CONFIG_SMP - if (val < -1 || val >= SD_LV_MAX) + if (val < -1 || val >= sched_domain_level_max) return -EINVAL; #endif diff --git a/kernel/sched.c b/kernel/sched.c index cd597c7442a3..0cfe0310ed5d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -231,7 +231,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) #endif /* - * sched_domains_mutex serializes calls to arch_init_sched_domains, + * sched_domains_mutex serializes calls to init_sched_domains, * detach_destroy_domains and partition_sched_domains. */ static DEFINE_MUTEX(sched_domains_mutex); @@ -420,6 +420,7 @@ struct rt_rq { */ struct root_domain { atomic_t refcount; + struct rcu_head rcu; cpumask_var_t span; cpumask_var_t online; @@ -578,7 +579,7 @@ static inline int cpu_of(struct rq *rq) #define rcu_dereference_check_sched_domain(p) \ rcu_dereference_check((p), \ - rcu_read_lock_sched_held() || \ + rcu_read_lock_held() || \ lockdep_is_held(&sched_domains_mutex)) /* @@ -6450,6 +6451,8 @@ early_initcall(migration_init); #ifdef CONFIG_SMP +static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ + #ifdef CONFIG_SCHED_DEBUG static __read_mostly int sched_domain_debug_enabled; @@ -6545,7 +6548,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, static void sched_domain_debug(struct sched_domain *sd, int cpu) { - cpumask_var_t groupmask; int level = 0; if (!sched_domain_debug_enabled) @@ -6558,20 +6560,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); - if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { - printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); - return; - } - for (;;) { - if (sched_domain_debug_one(sd, cpu, level, groupmask)) + if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) break; level++; sd = sd->parent; if (!sd) break; } - free_cpumask_var(groupmask); } #else /* !CONFIG_SCHED_DEBUG */ # define sched_domain_debug(sd, cpu) do { } while (0) @@ -6628,12 +6624,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 1; } -static void free_rootdomain(struct root_domain *rd) +static void free_rootdomain(struct rcu_head *rcu) { - synchronize_sched(); + struct root_domain *rd = container_of(rcu, struct root_domain, rcu); cpupri_cleanup(&rd->cpupri); - free_cpumask_var(rd->rto_mask); free_cpumask_var(rd->online); free_cpumask_var(rd->span); @@ -6674,7 +6669,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) raw_spin_unlock_irqrestore(&rq->lock, flags); if (old_rd) - free_rootdomain(old_rd); + call_rcu_sched(&old_rd->rcu, free_rootdomain); } static int init_rootdomain(struct root_domain *rd) @@ -6725,6 +6720,25 @@ static struct root_domain *alloc_rootdomain(void) return rd; } +static void free_sched_domain(struct rcu_head *rcu) +{ + struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); + if (atomic_dec_and_test(&sd->groups->ref)) + kfree(sd->groups); + kfree(sd); +} + +static void destroy_sched_domain(struct sched_domain *sd, int cpu) +{ + call_rcu(&sd->rcu, free_sched_domain); +} + +static void destroy_sched_domains(struct sched_domain *sd, int cpu) +{ + for (; sd; sd = sd->parent) + destroy_sched_domain(sd, cpu); +} + /* * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. @@ -6735,9 +6749,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; - for (tmp = sd; tmp; tmp = tmp->parent) - tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); - /* Remove the sched domains which do not contribute to scheduling. */ for (tmp = sd; tmp; ) { struct sched_domain *parent = tmp->parent; @@ -6748,12 +6759,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) tmp->parent = parent->parent; if (parent->parent) parent->parent->child = tmp; + destroy_sched_domain(parent, cpu); } else tmp = tmp->parent; } if (sd && sd_degenerate(sd)) { + tmp = sd; sd = sd->parent; + destroy_sched_domain(tmp, cpu); if (sd) sd->child = NULL; } @@ -6761,7 +6775,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) sched_domain_debug(sd, cpu); rq_attach_root(rq, rd); + tmp = rq->sd; rcu_assign_pointer(rq->sd, sd); + destroy_sched_domains(tmp, cpu); } /* cpus with isolated domains */ @@ -6777,56 +6793,6 @@ static int __init isolated_cpu_setup(char *str) __setup("isolcpus=", isolated_cpu_setup); -/* - * init_sched_build_groups takes the cpumask we wish to span, and a pointer - * to a function which identifies what group(along with sched group) a CPU - * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids - * (due to the fact that we keep track of groups covered with a struct cpumask). - * - * init_sched_build_groups will build a circular linked list of the groups - * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_power to 0. - */ -static void -init_sched_build_groups(const struct cpumask *span, - const struct cpumask *cpu_map, - int (*group_fn)(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, - struct cpumask *tmpmask), - struct cpumask *covered, struct cpumask *tmpmask) -{ - struct sched_group *first = NULL, *last = NULL; - int i; - - cpumask_clear(covered); - - for_each_cpu(i, span) { - struct sched_group *sg; - int group = group_fn(i, cpu_map, &sg, tmpmask); - int j; - - if (cpumask_test_cpu(i, covered)) - continue; - - cpumask_clear(sched_group_cpus(sg)); - sg->cpu_power = 0; - - for_each_cpu(j, span) { - if (group_fn(j, cpu_map, NULL, tmpmask) != group) - continue; - - cpumask_set_cpu(j, covered); - cpumask_set_cpu(j, sched_group_cpus(sg)); - } - if (!first) - first = sg; - if (last) - last->next = sg; - last = sg; - } - last->next = first; -} - #define SD_NODES_PER_DOMAIN 16 #ifdef CONFIG_NUMA @@ -6897,311 +6863,125 @@ static void sched_domain_node_span(int node, struct cpumask *span) cpumask_or(span, span, cpumask_of_node(next_node)); } } + +static const struct cpumask *cpu_node_mask(int cpu) +{ + lockdep_assert_held(&sched_domains_mutex); + + sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); + + return sched_domains_tmpmask; +} + +static const struct cpumask *cpu_allnodes_mask(int cpu) +{ + return cpu_possible_mask; +} #endif /* CONFIG_NUMA */ -int sched_smt_power_savings = 0, sched_mc_power_savings = 0; +static const struct cpumask *cpu_cpu_mask(int cpu) +{ + return cpumask_of_node(cpu_to_node(cpu)); +} -/* - * The cpus mask in sched_group and sched_domain hangs off the end. - * - * ( See the the comments in include/linux/sched.h:struct sched_group - * and struct sched_domain. ) - */ -struct static_sched_group { - struct sched_group sg; - DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); -}; +int sched_smt_power_savings = 0, sched_mc_power_savings = 0; -struct static_sched_domain { - struct sched_domain sd; - DECLARE_BITMAP(span, CONFIG_NR_CPUS); +struct sd_data { + struct sched_domain **__percpu sd; + struct sched_group **__percpu sg; }; struct s_data { -#ifdef CONFIG_NUMA - int sd_allnodes; - cpumask_var_t domainspan; - cpumask_var_t covered; - cpumask_var_t notcovered; -#endif - cpumask_var_t nodemask; - cpumask_var_t this_sibling_map; - cpumask_var_t this_core_map; - cpumask_var_t this_book_map; - cpumask_var_t send_covered; - cpumask_var_t tmpmask; - struct sched_group **sched_group_nodes; + struct sched_domain ** __percpu sd; struct root_domain *rd; }; enum s_alloc { - sa_sched_groups = 0, sa_rootdomain, - sa_tmpmask, - sa_send_covered, - sa_this_book_map, - sa_this_core_map, - sa_this_sibling_map, - sa_nodemask, - sa_sched_group_nodes, -#ifdef CONFIG_NUMA - sa_notcovered, - sa_covered, - sa_domainspan, -#endif + sa_sd, + sa_sd_storage, sa_none, }; -/* - * SMT sched-domains: - */ -#ifdef CONFIG_SCHED_SMT -static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_groups); +struct sched_domain_topology_level; -static int -cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *unused) -{ - if (sg) - *sg = &per_cpu(sched_groups, cpu).sg; - return cpu; -} -#endif /* CONFIG_SCHED_SMT */ +typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); +typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); -/* - * multi-core sched-domains: - */ -#ifdef CONFIG_SCHED_MC -static DEFINE_PER_CPU(struct static_sched_domain, core_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); - -static int -cpu_to_core_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *mask) -{ - int group; -#ifdef CONFIG_SCHED_SMT - cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); - group = cpumask_first(mask); -#else - group = cpu; -#endif - if (sg) - *sg = &per_cpu(sched_group_core, group).sg; - return group; -} -#endif /* CONFIG_SCHED_MC */ +struct sched_domain_topology_level { + sched_domain_init_f init; + sched_domain_mask_f mask; + struct sd_data data; +}; /* - * book sched-domains: + * Assumes the sched_domain tree is fully constructed */ -#ifdef CONFIG_SCHED_BOOK -static DEFINE_PER_CPU(struct static_sched_domain, book_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); - -static int -cpu_to_book_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *mask) +static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) { - int group = cpu; -#ifdef CONFIG_SCHED_MC - cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); - group = cpumask_first(mask); -#elif defined(CONFIG_SCHED_SMT) - cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); - group = cpumask_first(mask); -#endif - if (sg) - *sg = &per_cpu(sched_group_book, group).sg; - return group; -} -#endif /* CONFIG_SCHED_BOOK */ + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); + struct sched_domain *child = sd->child; -static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); + if (child) + cpu = cpumask_first(sched_domain_span(child)); -static int -cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *mask) -{ - int group; -#ifdef CONFIG_SCHED_BOOK - cpumask_and(mask, cpu_book_mask(cpu), cpu_map); - group = cpumask_first(mask); -#elif defined(CONFIG_SCHED_MC) - cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); - group = cpumask_first(mask); -#elif defined(CONFIG_SCHED_SMT) - cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); - group = cpumask_first(mask); -#else - group = cpu; -#endif if (sg) - *sg = &per_cpu(sched_group_phys, group).sg; - return group; + *sg = *per_cpu_ptr(sdd->sg, cpu); + + return cpu; } -#ifdef CONFIG_NUMA /* - * The init_sched_build_groups can't handle what we want to do with node - * groups, so roll our own. Now each node has its own list of groups which - * gets dynamically allocated. + * build_sched_groups takes the cpumask we wish to span, and a pointer + * to a function which identifies what group(along with sched group) a CPU + * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids + * (due to the fact that we keep track of groups covered with a struct cpumask). + * + * build_sched_groups will build a circular linked list of the groups + * covered by the given span, and will set each group's ->cpumask correctly, + * and ->cpu_power to 0. */ -static DEFINE_PER_CPU(struct static_sched_domain, node_domains); -static struct sched_group ***sched_group_nodes_bycpu; - -static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); - -static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, - struct cpumask *nodemask) -{ - int group; - - cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); - group = cpumask_first(nodemask); - - if (sg) - *sg = &per_cpu(sched_group_allnodes, group).sg; - return group; -} - -static void init_numa_sched_groups_power(struct sched_group *group_head) -{ - struct sched_group *sg = group_head; - int j; - - if (!sg) - return; - do { - for_each_cpu(j, sched_group_cpus(sg)) { - struct sched_domain *sd; - - sd = &per_cpu(phys_domains, j).sd; - if (j != group_first_cpu(sd->groups)) { - /* - * Only add "power" once for each - * physical package. - */ - continue; - } - - sg->cpu_power += sd->groups->cpu_power; - } - sg = sg->next; - } while (sg != group_head); -} - -static int build_numa_sched_groups(struct s_data *d, - const struct cpumask *cpu_map, int num) +static void +build_sched_groups(struct sched_domain *sd) { - struct sched_domain *sd; - struct sched_group *sg, *prev; - int n, j; - - cpumask_clear(d->covered); - cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); - if (cpumask_empty(d->nodemask)) { - d->sched_group_nodes[num] = NULL; - goto out; - } - - sched_domain_node_span(num, d->domainspan); - cpumask_and(d->domainspan, d->domainspan, cpu_map); - - sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, num); - if (!sg) { - printk(KERN_WARNING "Can not alloc domain group for node %d\n", - num); - return -ENOMEM; - } - d->sched_group_nodes[num] = sg; - - for_each_cpu(j, d->nodemask) { - sd = &per_cpu(node_domains, j).sd; - sd->groups = sg; - } + struct sched_group *first = NULL, *last = NULL; + struct sd_data *sdd = sd->private; + const struct cpumask *span = sched_domain_span(sd); + struct cpumask *covered; + int i; - sg->cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), d->nodemask); - sg->next = sg; - cpumask_or(d->covered, d->covered, d->nodemask); + lockdep_assert_held(&sched_domains_mutex); + covered = sched_domains_tmpmask; - prev = sg; - for (j = 0; j < nr_node_ids; j++) { - n = (num + j) % nr_node_ids; - cpumask_complement(d->notcovered, d->covered); - cpumask_and(d->tmpmask, d->notcovered, cpu_map); - cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); - if (cpumask_empty(d->tmpmask)) - break; - cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); - if (cpumask_empty(d->tmpmask)) - continue; - sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, num); - if (!sg) { - printk(KERN_WARNING - "Can not alloc domain group for node %d\n", j); - return -ENOMEM; - } - sg->cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), d->tmpmask); - sg->next = prev->next; - cpumask_or(d->covered, d->covered, d->tmpmask); - prev->next = sg; - prev = sg; - } -out: - return 0; -} -#endif /* CONFIG_NUMA */ - -#ifdef CONFIG_NUMA -/* Free memory allocated for various sched_group structures */ -static void free_sched_groups(const struct cpumask *cpu_map, - struct cpumask *nodemask) -{ - int cpu, i; + cpumask_clear(covered); - for_each_cpu(cpu, cpu_map) { - struct sched_group **sched_group_nodes - = sched_group_nodes_bycpu[cpu]; + for_each_cpu(i, span) { + struct sched_group *sg; + int group = get_group(i, sdd, &sg); + int j; - if (!sched_group_nodes) + if (cpumask_test_cpu(i, covered)) continue; - for (i = 0; i < nr_node_ids; i++) { - struct sched_group *oldsg, *sg = sched_group_nodes[i]; + cpumask_clear(sched_group_cpus(sg)); + sg->cpu_power = 0; - cpumask_and(nodemask, cpumask_of_node(i), cpu_map); - if (cpumask_empty(nodemask)) + for_each_cpu(j, span) { + if (get_group(j, sdd, NULL) != group) continue; - if (sg == NULL) - continue; - sg = sg->next; -next_sg: - oldsg = sg; - sg = sg->next; - kfree(oldsg); - if (oldsg != sched_group_nodes[i]) - goto next_sg; + cpumask_set_cpu(j, covered); + cpumask_set_cpu(j, sched_group_cpus(sg)); } - kfree(sched_group_nodes); - sched_group_nodes_bycpu[cpu] = NULL; + + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; } + last->next = first; } -#else /* !CONFIG_NUMA */ -static void free_sched_groups(const struct cpumask *cpu_map, - struct cpumask *nodemask) -{ -} -#endif /* CONFIG_NUMA */ /* * Initialize sched groups cpu_power. @@ -7215,11 +6995,6 @@ static void free_sched_groups(const struct cpumask *cpu_map, */ static void init_sched_groups_power(int cpu, struct sched_domain *sd) { - struct sched_domain *child; - struct sched_group *group; - long power; - int weight; - WARN_ON(!sd || !sd->groups); if (cpu != group_first_cpu(sd->groups)) @@ -7227,36 +7002,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); - child = sd->child; - - sd->groups->cpu_power = 0; - - if (!child) { - power = SCHED_LOAD_SCALE; - weight = cpumask_weight(sched_domain_span(sd)); - /* - * SMT siblings share the power of a single core. - * Usually multiple threads get a better yield out of - * that one core than a single thread would have, - * reflect that in sd->smt_gain. - */ - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { - power *= sd->smt_gain; - power /= weight; - power >>= SCHED_LOAD_SHIFT; - } - sd->groups->cpu_power += power; - return; - } - - /* - * Add cpu_power of each child group to this groups cpu_power. - */ - group = child->groups; - do { - sd->groups->cpu_power += group->cpu_power; - group = group->next; - } while (group != child->groups); + update_group_power(sd, cpu); } /* @@ -7270,15 +7016,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) # define SD_INIT_NAME(sd, type) do { } while (0) #endif -#define SD_INIT(sd, type) sd_init_##type(sd) - -#define SD_INIT_FUNC(type) \ -static noinline void sd_init_##type(struct sched_domain *sd) \ -{ \ - memset(sd, 0, sizeof(*sd)); \ - *sd = SD_##type##_INIT; \ - sd->level = SD_LV_##type; \ - SD_INIT_NAME(sd, type); \ +#define SD_INIT_FUNC(type) \ +static noinline struct sched_domain * \ +sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ +{ \ + struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ + *sd = SD_##type##_INIT; \ + SD_INIT_NAME(sd, type); \ + sd->private = &tl->data; \ + return sd; \ } SD_INIT_FUNC(CPU) @@ -7297,13 +7043,14 @@ SD_INIT_FUNC(CPU) #endif static int default_relax_domain_level = -1; +int sched_domain_level_max; static int __init setup_relax_domain_level(char *str) { unsigned long val; val = simple_strtoul(str, NULL, 0); - if (val < SD_LV_MAX) + if (val < sched_domain_level_max) default_relax_domain_level = val; return 1; @@ -7331,37 +7078,20 @@ static void set_domain_attribute(struct sched_domain *sd, } } +static void __sdt_free(const struct cpumask *cpu_map); +static int __sdt_alloc(const struct cpumask *cpu_map); + static void __free_domain_allocs(struct s_data *d, enum s_alloc what, const struct cpumask *cpu_map) { switch (what) { - case sa_sched_groups: - free_sched_groups(cpu_map, d->tmpmask); /* fall through */ - d->sched_group_nodes = NULL; case sa_rootdomain: - free_rootdomain(d->rd); /* fall through */ - case sa_tmpmask: - free_cpumask_var(d->tmpmask); /* fall through */ - case sa_send_covered: - free_cpumask_var(d->send_covered); /* fall through */ - case sa_this_book_map: - free_cpumask_var(d->this_book_map); /* fall through */ - case sa_this_core_map: - free_cpumask_var(d->this_core_map); /* fall through */ - case sa_this_sibling_map: - free_cpumask_var(d->this_sibling_map); /* fall through */ - case sa_nodemask: - free_cpumask_var(d->nodemask); /* fall through */ - case sa_sched_group_nodes: -#ifdef CONFIG_NUMA - kfree(d->sched_group_nodes); /* fall through */ - case sa_notcovered: - free_cpumask_var(d->notcovered); /* fall through */ - case sa_covered: - free_cpumask_var(d->covered); /* fall through */ - case sa_domainspan: - free_cpumask_var(d->domainspan); /* fall through */ -#endif + if (!atomic_read(&d->rd->refcount)) + free_rootdomain(&d->rd->rcu); /* fall through */ + case sa_sd: + free_percpu(d->sd); /* fall through */ + case sa_sd_storage: + __sdt_free(cpu_map); /* fall through */ case sa_none: break; } @@ -7370,308 +7100,212 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) { -#ifdef CONFIG_NUMA - if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) - return sa_none; - if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) - return sa_domainspan; - if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) - return sa_covered; - /* Allocate the per-node list of sched groups */ - d->sched_group_nodes = kcalloc(nr_node_ids, - sizeof(struct sched_group *), GFP_KERNEL); - if (!d->sched_group_nodes) { - printk(KERN_WARNING "Can not alloc sched group node list\n"); - return sa_notcovered; - } - sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; -#endif - if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) - return sa_sched_group_nodes; - if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) - return sa_nodemask; - if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) - return sa_this_sibling_map; - if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) - return sa_this_core_map; - if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) - return sa_this_book_map; - if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) - return sa_send_covered; + memset(d, 0, sizeof(*d)); + + if (__sdt_alloc(cpu_map)) + return sa_sd_storage; + d->sd = alloc_percpu(struct sched_domain *); + if (!d->sd) + return sa_sd_storage; d->rd = alloc_rootdomain(); - if (!d->rd) { - printk(KERN_WARNING "Cannot alloc root domain\n"); - return sa_tmpmask; - } + if (!d->rd) + return sa_sd; return sa_rootdomain; } -static struct sched_domain *__build_numa_sched_domains(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) +/* + * NULL the sd_data elements we've used to build the sched_domain and + * sched_group structure so that the subsequent __free_domain_allocs() + * will not free the data we're using. + */ +static void claim_allocations(int cpu, struct sched_domain *sd) { - struct sched_domain *sd = NULL; -#ifdef CONFIG_NUMA - struct sched_domain *parent; - - d->sd_allnodes = 0; - if (cpumask_weight(cpu_map) > - SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { - sd = &per_cpu(allnodes_domains, i).sd; - SD_INIT(sd, ALLNODES); - set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), cpu_map); - cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); - d->sd_allnodes = 1; - } - parent = sd; - - sd = &per_cpu(node_domains, i).sd; - SD_INIT(sd, NODE); - set_domain_attribute(sd, attr); - sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); - sd->parent = parent; - if (parent) - parent->child = sd; - cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); -#endif - return sd; -} + struct sd_data *sdd = sd->private; + struct sched_group *sg = sd->groups; -static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int i) -{ - struct sched_domain *sd; - sd = &per_cpu(phys_domains, i).sd; - SD_INIT(sd, CPU); - set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), d->nodemask); - sd->parent = parent; - if (parent) - parent->child = sd; - cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); - return sd; -} + WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); + *per_cpu_ptr(sdd->sd, cpu) = NULL; -static struct sched_domain *__build_book_sched_domain(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int i) -{ - struct sched_domain *sd = parent; -#ifdef CONFIG_SCHED_BOOK - sd = &per_cpu(book_domains, i).sd; - SD_INIT(sd, BOOK); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); - sd->parent = parent; - parent->child = sd; - cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); -#endif - return sd; + if (cpu == cpumask_first(sched_group_cpus(sg))) { + WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg); + *per_cpu_ptr(sdd->sg, cpu) = NULL; + } } -static struct sched_domain *__build_mc_sched_domain(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int i) +#ifdef CONFIG_SCHED_SMT +static const struct cpumask *cpu_smt_mask(int cpu) { - struct sched_domain *sd = parent; -#ifdef CONFIG_SCHED_MC - sd = &per_cpu(core_domains, i).sd; - SD_INIT(sd, MC); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); - sd->parent = parent; - parent->child = sd; - cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); -#endif - return sd; + return topology_thread_cpumask(cpu); } - -static struct sched_domain *__build_smt_sched_domain(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int i) -{ - struct sched_domain *sd = parent; -#ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i).sd; - SD_INIT(sd, SIBLING); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); - sd->parent = parent; - parent->child = sd; - cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); #endif - return sd; -} -static void build_sched_groups(struct s_data *d, enum sched_domain_level l, - const struct cpumask *cpu_map, int cpu) -{ - switch (l) { +/* + * Topology list, bottom-up. + */ +static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_SMT - case SD_LV_SIBLING: /* set up CPU (sibling) groups */ - cpumask_and(d->this_sibling_map, cpu_map, - topology_thread_cpumask(cpu)); - if (cpu == cpumask_first(d->this_sibling_map)) - init_sched_build_groups(d->this_sibling_map, cpu_map, - &cpu_to_cpu_group, - d->send_covered, d->tmpmask); - break; + { sd_init_SIBLING, cpu_smt_mask, }, #endif #ifdef CONFIG_SCHED_MC - case SD_LV_MC: /* set up multi-core groups */ - cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); - if (cpu == cpumask_first(d->this_core_map)) - init_sched_build_groups(d->this_core_map, cpu_map, - &cpu_to_core_group, - d->send_covered, d->tmpmask); - break; + { sd_init_MC, cpu_coregroup_mask, }, #endif #ifdef CONFIG_SCHED_BOOK - case SD_LV_BOOK: /* set up book groups */ - cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); - if (cpu == cpumask_first(d->this_book_map)) - init_sched_build_groups(d->this_book_map, cpu_map, - &cpu_to_book_group, - d->send_covered, d->tmpmask); - break; + { sd_init_BOOK, cpu_book_mask, }, #endif - case SD_LV_CPU: /* set up physical groups */ - cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); - if (!cpumask_empty(d->nodemask)) - init_sched_build_groups(d->nodemask, cpu_map, - &cpu_to_phys_group, - d->send_covered, d->tmpmask); - break; + { sd_init_CPU, cpu_cpu_mask, }, #ifdef CONFIG_NUMA - case SD_LV_ALLNODES: - init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, - d->send_covered, d->tmpmask); - break; + { sd_init_NODE, cpu_node_mask, }, + { sd_init_ALLNODES, cpu_allnodes_mask, }, #endif - default: - break; + { NULL, }, +}; + +static struct sched_domain_topology_level *sched_domain_topology = default_topology; + +static int __sdt_alloc(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + int j; + + for (tl = sched_domain_topology; tl->init; tl++) { + struct sd_data *sdd = &tl->data; + + sdd->sd = alloc_percpu(struct sched_domain *); + if (!sdd->sd) + return -ENOMEM; + + sdd->sg = alloc_percpu(struct sched_group *); + if (!sdd->sg) + return -ENOMEM; + + for_each_cpu(j, cpu_map) { + struct sched_domain *sd; + struct sched_group *sg; + + sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sd) + return -ENOMEM; + + *per_cpu_ptr(sdd->sd, j) = sd; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sg) + return -ENOMEM; + + *per_cpu_ptr(sdd->sg, j) = sg; + } + } + + return 0; +} + +static void __sdt_free(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + int j; + + for (tl = sched_domain_topology; tl->init; tl++) { + struct sd_data *sdd = &tl->data; + + for_each_cpu(j, cpu_map) { + kfree(*per_cpu_ptr(sdd->sd, j)); + kfree(*per_cpu_ptr(sdd->sg, j)); + } + free_percpu(sdd->sd); + free_percpu(sdd->sg); + } +} + +struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, + struct s_data *d, const struct cpumask *cpu_map, + struct sched_domain_attr *attr, struct sched_domain *child, + int cpu) +{ + struct sched_domain *sd = tl->init(tl, cpu); + if (!sd) + return child; + + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); + if (child) { + sd->level = child->level + 1; + sched_domain_level_max = max(sched_domain_level_max, sd->level); + child->parent = sd; } + sd->child = child; + + return sd; } /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ -static int __build_sched_domains(const struct cpumask *cpu_map, - struct sched_domain_attr *attr) +static int build_sched_domains(const struct cpumask *cpu_map, + struct sched_domain_attr *attr) { enum s_alloc alloc_state = sa_none; - struct s_data d; struct sched_domain *sd; - int i; -#ifdef CONFIG_NUMA - d.sd_allnodes = 0; -#endif + struct s_data d; + int i, ret = -ENOMEM; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) goto error; - alloc_state = sa_sched_groups; - - /* - * Set up domains for cpus specified by the cpu_map. - */ - for_each_cpu(i, cpu_map) { - cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), - cpu_map); - - sd = __build_numa_sched_domains(&d, cpu_map, attr, i); - sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); - sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); - sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); - sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); - } + /* Set up domains for cpus specified by the cpu_map. */ for_each_cpu(i, cpu_map) { - build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); - build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); - build_sched_groups(&d, SD_LV_MC, cpu_map, i); - } + struct sched_domain_topology_level *tl; - /* Set up physical groups */ - for (i = 0; i < nr_node_ids; i++) - build_sched_groups(&d, SD_LV_CPU, cpu_map, i); - -#ifdef CONFIG_NUMA - /* Set up node groups */ - if (d.sd_allnodes) - build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); + sd = NULL; + for (tl = sched_domain_topology; tl->init; tl++) + sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); - for (i = 0; i < nr_node_ids; i++) - if (build_numa_sched_groups(&d, cpu_map, i)) - goto error; -#endif + while (sd->child) + sd = sd->child; - /* Calculate CPU power for physical packages and nodes */ -#ifdef CONFIG_SCHED_SMT - for_each_cpu(i, cpu_map) { - sd = &per_cpu(cpu_domains, i).sd; - init_sched_groups_power(i, sd); + *per_cpu_ptr(d.sd, i) = sd; } -#endif -#ifdef CONFIG_SCHED_MC - for_each_cpu(i, cpu_map) { - sd = &per_cpu(core_domains, i).sd; - init_sched_groups_power(i, sd); - } -#endif -#ifdef CONFIG_SCHED_BOOK - for_each_cpu(i, cpu_map) { - sd = &per_cpu(book_domains, i).sd; - init_sched_groups_power(i, sd); - } -#endif + /* Build the groups for the domains */ for_each_cpu(i, cpu_map) { - sd = &per_cpu(phys_domains, i).sd; - init_sched_groups_power(i, sd); - } + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + sd->span_weight = cpumask_weight(sched_domain_span(sd)); + get_group(i, sd->private, &sd->groups); + atomic_inc(&sd->groups->ref); -#ifdef CONFIG_NUMA - for (i = 0; i < nr_node_ids; i++) - init_numa_sched_groups_power(d.sched_group_nodes[i]); + if (i != cpumask_first(sched_domain_span(sd))) + continue; - if (d.sd_allnodes) { - struct sched_group *sg; + build_sched_groups(sd); + } + } + + /* Calculate CPU power for physical packages and nodes */ + for (i = nr_cpumask_bits-1; i >= 0; i--) { + if (!cpumask_test_cpu(i, cpu_map)) + continue; - cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, - d.tmpmask); - init_numa_sched_groups_power(sg); + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + claim_allocations(i, sd); + init_sched_groups_power(i, sd); + } } -#endif /* Attach the domains */ + rcu_read_lock(); for_each_cpu(i, cpu_map) { -#ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i).sd; -#elif defined(CONFIG_SCHED_MC) - sd = &per_cpu(core_domains, i).sd; -#elif defined(CONFIG_SCHED_BOOK) - sd = &per_cpu(book_domains, i).sd; -#else - sd = &per_cpu(phys_domains, i).sd; -#endif + sd = *per_cpu_ptr(d.sd, i); cpu_attach_domain(sd, d.rd, i); } + rcu_read_unlock(); - d.sched_group_nodes = NULL; /* don't free this we still need it */ - __free_domain_allocs(&d, sa_tmpmask, cpu_map); - return 0; - + ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); - return -ENOMEM; -} - -static int build_sched_domains(const struct cpumask *cpu_map) -{ - return __build_sched_domains(cpu_map, NULL); + return ret; } static cpumask_var_t *doms_cur; /* current sched domains */ @@ -7726,7 +7360,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) * For now this just excludes isolated cpus, but could be used to * exclude other special cases in the future. */ -static int arch_init_sched_domains(const struct cpumask *cpu_map) +static int init_sched_domains(const struct cpumask *cpu_map) { int err; @@ -7737,32 +7371,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map) doms_cur = &fallback_doms; cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); dattr_cur = NULL; - err = build_sched_domains(doms_cur[0]); + err = build_sched_domains(doms_cur[0], NULL); register_sched_domain_sysctl(); return err; } -static void arch_destroy_sched_domains(const struct cpumask *cpu_map, - struct cpumask *tmpmask) -{ - free_sched_groups(cpu_map, tmpmask); -} - /* * Detach sched domains from a group of cpus specified in cpu_map * These cpus will now be attached to the NULL domain */ static void detach_destroy_domains(const struct cpumask *cpu_map) { - /* Save because hotplug lock held. */ - static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); int i; + rcu_read_lock(); for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); - synchronize_sched(); - arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); + rcu_read_unlock(); } /* handle null as "default" */ @@ -7851,8 +7477,7 @@ match1: goto match2; } /* no match - add a new doms_new */ - __build_sched_domains(doms_new[i], - dattr_new ? dattr_new + i : NULL); + build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); match2: ; } @@ -7871,7 +7496,7 @@ match2: } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -static void arch_reinit_sched_domains(void) +static void reinit_sched_domains(void) { get_online_cpus(); @@ -7904,7 +7529,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) else sched_mc_power_savings = level; - arch_reinit_sched_domains(); + reinit_sched_domains(); return count; } @@ -8023,14 +7648,9 @@ void __init sched_init_smp(void) alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); -#if defined(CONFIG_NUMA) - sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), - GFP_KERNEL); - BUG_ON(sched_group_nodes_bycpu == NULL); -#endif get_online_cpus(); mutex_lock(&sched_domains_mutex); - arch_init_sched_domains(cpu_active_mask); + init_sched_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if (cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); @@ -8337,6 +7957,7 @@ void __init sched_init(void) /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP + zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); #ifdef CONFIG_NO_HZ zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 054cebb81f7b..87445931a179 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1639,6 +1639,7 @@ static int select_idle_sibling(struct task_struct *p, int target) /* * Otherwise, iterate the domains and find an elegible idle cpu. */ + rcu_read_lock(); for_each_domain(target, sd) { if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) break; @@ -1658,6 +1659,7 @@ static int select_idle_sibling(struct task_struct *p, int target) cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) break; } + rcu_read_unlock(); return target; } @@ -1690,6 +1692,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) new_cpu = prev_cpu; } + rcu_read_lock(); for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) continue; @@ -1740,9 +1743,10 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) if (affine_sd) { if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) - return select_idle_sibling(p, cpu); - else - return select_idle_sibling(p, prev_cpu); + prev_cpu = cpu; + + new_cpu = select_idle_sibling(p, prev_cpu); + goto unlock; } while (sd) { @@ -1783,6 +1787,8 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) } /* while loop will break here if sd == NULL */ } +unlock: + rcu_read_unlock(); return new_cpu; } @@ -2662,7 +2668,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) /* * Only siblings can have significantly less than SCHED_LOAD_SCALE */ - if (sd->level != SD_LV_SIBLING) + if (!(sd->flags & SD_SHARE_CPUPOWER)) return 0; /* @@ -3479,6 +3485,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) raw_spin_unlock(&this_rq->lock); update_shares(this_cpu); + rcu_read_lock(); for_each_domain(this_cpu, sd) { unsigned long interval; int balance = 1; @@ -3500,6 +3507,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) break; } } + rcu_read_unlock(); raw_spin_lock(&this_rq->lock); @@ -3548,6 +3556,7 @@ static int active_load_balance_cpu_stop(void *data) double_lock_balance(busiest_rq, target_rq); /* Search for an sd spanning us and the target CPU. */ + rcu_read_lock(); for_each_domain(target_cpu, sd) { if ((sd->flags & SD_LOAD_BALANCE) && cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) @@ -3563,6 +3572,7 @@ static int active_load_balance_cpu_stop(void *data) else schedstat_inc(sd, alb_failed); } + rcu_read_unlock(); double_unlock_balance(busiest_rq, target_rq); out_unlock: busiest_rq->active_balance = 0; @@ -3689,6 +3699,7 @@ static int find_new_ilb(int cpu) { struct sched_domain *sd; struct sched_group *ilb_group; + int ilb = nr_cpu_ids; /* * Have idle load balancer selection from semi-idle packages only @@ -3704,20 +3715,25 @@ static int find_new_ilb(int cpu) if (cpumask_weight(nohz.idle_cpus_mask) < 2) goto out_done; + rcu_read_lock(); for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { ilb_group = sd->groups; do { - if (is_semi_idle_group(ilb_group)) - return cpumask_first(nohz.grp_idle_mask); + if (is_semi_idle_group(ilb_group)) { + ilb = cpumask_first(nohz.grp_idle_mask); + goto unlock; + } ilb_group = ilb_group->next; } while (ilb_group != sd->groups); } +unlock: + rcu_read_unlock(); out_done: - return nr_cpu_ids; + return ilb; } #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ static inline int find_new_ilb(int call_cpu) @@ -3862,6 +3878,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) update_shares(cpu); + rcu_read_lock(); for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) continue; @@ -3907,6 +3924,7 @@ out: if (!balance) break; } + rcu_read_unlock(); /* * next_balance will be updated only when there is a need. |