From 76b67ed9dce69a6a329cdd66f94af1787f417b62 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 27 Jun 2006 02:53:41 -0700 Subject: [PATCH] node hotplug: register cpu: remove node struct With Goto-san's patch, we can add new pgdat/node at runtime. I'm now considering node-hot-add with cpu + memory on ACPI. I found acpi container, which describes node, could evaluate cpu before memory. This means cpu-hot-add occurs before memory hot add. In most part, cpu-hot-add doesn't depend on node hot add. But register_cpu(), which creates symbolic link from node to cpu, requires that node should be onlined before register_cpu(). When a node is onlined, its pgdat should be there. This patch-set holds off creating symbolic link from node to cpu until node is onlined. This removes node arguments from register_cpu(). Now, register_cpu() requires 'struct node' as its argument. But the array of struct node is now unified in driver/base/node.c now (By Goto's node hotplug patch). We can get struct node in generic way. So, this argument is not necessary now. This patch also guarantees add cpu under node only when node is onlined. It is necessary for node-hot-add vs. cpu-hot-add patch following this. Moreover, register_cpu calculates cpu->node_id by cpu_to_node() without regard to its 'struct node *root' argument. This patch removes it. Also modify callers of register_cpu()/unregister_cpu, whose args are changed by register-cpu-remove-node-struct patch. [Brice.Goglin@ens-lyon.org: fix it] Signed-off-by: KAMEZAWA Hiroyuki Cc: Yasunori Goto Cc: Ashok Raj Cc: Dave Hansen Signed-off-by: Brice Goglin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/cpu.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'drivers/base/cpu.c') diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index dd712b24ec91..3972d8ac9786 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "base.h" @@ -57,13 +58,12 @@ static void __devinit register_cpu_control(struct cpu *cpu) { sysdev_create_file(&cpu->sysdev, &attr_online); } -void unregister_cpu(struct cpu *cpu, struct node *root) +void unregister_cpu(struct cpu *cpu) { int logical_cpu = cpu->sysdev.id; - if (root) - sysfs_remove_link(&root->sysdev.kobj, - kobject_name(&cpu->sysdev.kobj)); + unregister_cpu_under_node(logical_cpu, cpu_to_node(logical_cpu)); + sysdev_remove_file(&cpu->sysdev, &attr_online); sysdev_unregister(&cpu->sysdev); @@ -109,23 +109,21 @@ static SYSDEV_ATTR(crash_notes, 0400, show_crash_notes, NULL); * * Initialize and register the CPU device. */ -int __devinit register_cpu(struct cpu *cpu, int num, struct node *root) +int __devinit register_cpu(struct cpu *cpu, int num) { int error; - cpu->node_id = cpu_to_node(num); cpu->sysdev.id = num; cpu->sysdev.cls = &cpu_sysdev_class; error = sysdev_register(&cpu->sysdev); - if (!error && root) - error = sysfs_create_link(&root->sysdev.kobj, - &cpu->sysdev.kobj, - kobject_name(&cpu->sysdev.kobj)); + if (!error && !cpu->no_control) register_cpu_control(cpu); if (!error) cpu_sys_devices[num] = &cpu->sysdev; + if (!error) + register_cpu_under_node(num, cpu_to_node(num)); #ifdef CONFIG_KEXEC if (!error) -- cgit v1.2.3 From 5c45bf279d378d436ce45825c0f136696c7b6109 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Tue, 27 Jun 2006 02:54:42 -0700 Subject: [PATCH] sched: mc/smt power savings sched policy sysfs entries 'sched_mc_power_savings' and 'sched_smt_power_savings' in /sys/devices/system/cpu/ control the MC/SMT power savings policy for the scheduler. Based on the values (1-enable, 0-disable) for these controls, sched groups cpu power will be determined for different domains. When power savings policy is enabled and under light load conditions, scheduler will minimize the physical packages/cpu cores carrying the load and thus conserving power(with a perf impact based on the workload characteristics... see OLS 2005 CMP kernel scheduler paper for more details..) Signed-off-by: Suresh Siddha Cc: Ingo Molnar Cc: Nick Piggin Cc: Con Kolivas Cc: "Chen, Kenneth W" Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/smpboot.c | 8 +- arch/x86_64/kernel/smpboot.c | 8 +- drivers/base/cpu.c | 10 +- include/asm-i386/topology.h | 5 + include/asm-ia64/topology.h | 1 + include/asm-powerpc/topology.h | 5 + include/asm-sparc64/topology.h | 3 + include/asm-x86_64/topology.h | 2 + include/linux/sched.h | 10 ++ include/linux/topology.h | 3 +- kernel/sched.c | 240 ++++++++++++++++++++++++++++++++++++----- 11 files changed, 262 insertions(+), 33 deletions(-) (limited to 'drivers/base/cpu.c') diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index ab5275beddf7..89e7315e539c 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -448,10 +448,12 @@ cpumask_t cpu_coregroup_map(int cpu) struct cpuinfo_x86 *c = cpu_data + cpu; /* * For perf, we return last level cache shared map. - * TBD: when power saving sched policy is added, we will return - * cpu_core_map when power saving policy is enabled + * And for power savings, we return cpu_core_map */ - return c->llc_shared_map; + if (sched_mc_power_savings || sched_smt_power_savings) + return cpu_core_map[cpu]; + else + return c->llc_shared_map; } /* representing cpus for which sibling maps can be computed */ diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 4e9755179ecf..540c0ccbcccc 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -455,10 +455,12 @@ cpumask_t cpu_coregroup_map(int cpu) struct cpuinfo_x86 *c = cpu_data + cpu; /* * For perf, we return last level cache shared map. - * TBD: when power saving sched policy is added, we will return - * cpu_core_map when power saving policy is enabled + * And for power savings, we return cpu_core_map */ - return c->llc_shared_map; + if (sched_mc_power_savings || sched_smt_power_savings) + return cpu_core_map[cpu]; + else + return c->llc_shared_map; } /* representing cpus for which sibling maps can be computed */ diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 3972d8ac9786..4bef76a2f3f2 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -143,5 +143,13 @@ EXPORT_SYMBOL_GPL(get_cpu_sysdev); int __init cpu_dev_init(void) { - return sysdev_class_register(&cpu_sysdev_class); + int err; + + err = sysdev_class_register(&cpu_sysdev_class); +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + if (!err) + err = sched_create_sysfs_power_savings_entries(&cpu_sysdev_class); +#endif + + return err; } diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h index aa4185ee81fb..6adbd9b1ae88 100644 --- a/include/asm-i386/topology.h +++ b/include/asm-i386/topology.h @@ -112,4 +112,9 @@ extern unsigned long node_remap_size[]; extern cpumask_t cpu_coregroup_map(int cpu); +#ifdef CONFIG_SMP +#define mc_capable() (boot_cpu_data.x86_max_cores > 1) +#define smt_capable() (smp_num_siblings > 1) +#endif + #endif /* _ASM_I386_TOPOLOGY_H */ diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h index 616b5ed2aa72..937c21257523 100644 --- a/include/asm-ia64/topology.h +++ b/include/asm-ia64/topology.h @@ -112,6 +112,7 @@ void build_cpu_to_node_map(void); #define topology_core_id(cpu) (cpu_data(cpu)->core_id) #define topology_core_siblings(cpu) (cpu_core_map[cpu]) #define topology_thread_siblings(cpu) (cpu_sibling_map[cpu]) +#define smt_capable() (smp_num_siblings > 1) #endif #include diff --git a/include/asm-powerpc/topology.h b/include/asm-powerpc/topology.h index 92f3e5507d22..bbc3844b086f 100644 --- a/include/asm-powerpc/topology.h +++ b/include/asm-powerpc/topology.h @@ -93,5 +93,10 @@ static inline void sysfs_remove_device_from_node(struct sys_device *dev, #endif /* CONFIG_NUMA */ +#ifdef CONFIG_SMP +#include +#define smt_capable() (cpu_has_feature(CPU_FTR_SMT)) +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_TOPOLOGY_H */ diff --git a/include/asm-sparc64/topology.h b/include/asm-sparc64/topology.h index 0e234e201bd6..98a6c613589d 100644 --- a/include/asm-sparc64/topology.h +++ b/include/asm-sparc64/topology.h @@ -1,6 +1,9 @@ #ifndef _ASM_SPARC64_TOPOLOGY_H #define _ASM_SPARC64_TOPOLOGY_H +#include +#define smt_capable() (tlb_type == hypervisor) + #include #endif /* _ASM_SPARC64_TOPOLOGY_H */ diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h index c4e46e7fa7ba..6e7a2e976b04 100644 --- a/include/asm-x86_64/topology.h +++ b/include/asm-x86_64/topology.h @@ -59,6 +59,8 @@ extern int __node_distance(int, int); #define topology_core_id(cpu) (cpu_data[cpu].cpu_core_id) #define topology_core_siblings(cpu) (cpu_core_map[cpu]) #define topology_thread_siblings(cpu) (cpu_sibling_map[cpu]) +#define mc_capable() (boot_cpu_data.x86_max_cores > 1) +#define smt_capable() (smp_num_siblings > 1) #endif #include diff --git a/include/linux/sched.h b/include/linux/sched.h index ab8ffc54423a..0bc81a151e50 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -570,6 +570,11 @@ enum idle_type #define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */ #define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */ #define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */ +#define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */ + +#define BALANCE_FOR_POWER ((sched_mc_power_savings || sched_smt_power_savings) \ + ? SD_POWERSAVINGS_BALANCE : 0) + struct sched_group { struct sched_group *next; /* Must be a circular list */ @@ -1412,6 +1417,11 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm) extern long sched_setaffinity(pid_t pid, cpumask_t new_mask); extern long sched_getaffinity(pid_t pid, cpumask_t *mask); +#include +extern int sched_mc_power_savings, sched_smt_power_savings; +extern struct sysdev_attribute attr_sched_mc_power_savings, attr_sched_smt_power_savings; +extern int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls); + extern void normalize_rt_tasks(void); #ifdef CONFIG_PM diff --git a/include/linux/topology.h b/include/linux/topology.h index a305ae2e44b6..ec1eca85290a 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -134,7 +134,8 @@ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ - | SD_WAKE_AFFINE, \ + | SD_WAKE_AFFINE \ + | BALANCE_FOR_POWER, \ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ diff --git a/kernel/sched.c b/kernel/sched.c index 122b75584a13..54fa282657cc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1162,6 +1162,11 @@ static int sched_balance_self(int cpu, int flag) struct sched_domain *tmp, *sd = NULL; for_each_domain(cpu, tmp) { + /* + * If power savings logic is enabled for a domain, stop there. + */ + if (tmp->flags & SD_POWERSAVINGS_BALANCE) + break; if (tmp->flags & flag) sd = tmp; } @@ -2082,6 +2087,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long busiest_load_per_task, busiest_nr_running; unsigned long this_load_per_task, this_nr_running; int load_idx; +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + int power_savings_balance = 1; + unsigned long leader_nr_running = 0, min_load_per_task = 0; + unsigned long min_nr_running = ULONG_MAX; + struct sched_group *group_min = NULL, *group_leader = NULL; +#endif max_load = this_load = total_load = total_pwr = 0; busiest_load_per_task = busiest_nr_running = 0; @@ -2094,7 +2105,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, load_idx = sd->idle_idx; do { - unsigned long load; + unsigned long load, group_capacity; int local_group; int i; unsigned long sum_nr_running, sum_weighted_load; @@ -2127,18 +2138,76 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, /* Adjust by relative CPU power of the group */ avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + group_capacity = group->cpu_power / SCHED_LOAD_SCALE; + if (local_group) { this_load = avg_load; this = group; this_nr_running = sum_nr_running; this_load_per_task = sum_weighted_load; } else if (avg_load > max_load && - sum_nr_running > group->cpu_power / SCHED_LOAD_SCALE) { + sum_nr_running > group_capacity) { max_load = avg_load; busiest = group; busiest_nr_running = sum_nr_running; busiest_load_per_task = sum_weighted_load; } + +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + /* + * Busy processors will not participate in power savings + * balance. + */ + if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + goto group_next; + + /* + * If the local group is idle or completely loaded + * no need to do power savings balance at this domain + */ + if (local_group && (this_nr_running >= group_capacity || + !this_nr_running)) + power_savings_balance = 0; + + /* + * If a group is already running at full capacity or idle, + * don't include that group in power savings calculations + */ + if (!power_savings_balance || sum_nr_running >= group_capacity + || !sum_nr_running) + goto group_next; + + /* + * Calculate the group which has the least non-idle load. + * This is the group from where we need to pick up the load + * for saving power + */ + if ((sum_nr_running < min_nr_running) || + (sum_nr_running == min_nr_running && + first_cpu(group->cpumask) < + first_cpu(group_min->cpumask))) { + group_min = group; + min_nr_running = sum_nr_running; + min_load_per_task = sum_weighted_load / + sum_nr_running; + } + + /* + * Calculate the group which is almost near its + * capacity but still has some space to pick up some load + * from other group and save more power + */ + if (sum_nr_running <= group_capacity - 1) + if (sum_nr_running > leader_nr_running || + (sum_nr_running == leader_nr_running && + first_cpu(group->cpumask) > + first_cpu(group_leader->cpumask))) { + group_leader = group; + leader_nr_running = sum_nr_running; + } + +group_next: +#endif group = group->next; } while (group != sd->groups); @@ -2247,7 +2316,16 @@ small_imbalance: return busiest; out_balanced: +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + goto ret; + if (this == group_leader && group_leader != group_min) { + *imbalance = min_load_per_task; + return group_min; + } +ret: +#endif *imbalance = 0; return NULL; } @@ -2300,7 +2378,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, int active_balance = 0; int sd_idle = 0; - if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) + if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && + !sched_smt_power_savings) sd_idle = 1; schedstat_inc(sd, lb_cnt[idle]); @@ -2389,7 +2468,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, sd->balance_interval *= 2; } - if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) + if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && + !sched_smt_power_savings) return -1; return nr_moved; @@ -2404,7 +2484,7 @@ out_one_pinned: (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) return -1; return 0; } @@ -2425,7 +2505,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, int nr_moved = 0; int sd_idle = 0; - if (sd->flags & SD_SHARE_CPUPOWER) + if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) sd_idle = 1; schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); @@ -2466,7 +2546,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, out_balanced: schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) return -1; sd->nr_balance_failed = 0; return 0; @@ -5732,6 +5812,7 @@ static cpumask_t sched_domain_node_span(int node) } #endif +int sched_smt_power_savings = 0, sched_mc_power_savings = 0; /* * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we * can switch it on easily if needed. @@ -6113,37 +6194,72 @@ static int build_sched_domains(const cpumask_t *cpu_map) #endif /* Calculate CPU power for physical packages and nodes */ +#ifdef CONFIG_SCHED_SMT for_each_cpu_mask(i, *cpu_map) { - int power; struct sched_domain *sd; -#ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); - power = SCHED_LOAD_SCALE; - sd->groups->cpu_power = power; + sd->groups->cpu_power = SCHED_LOAD_SCALE; + } #endif #ifdef CONFIG_SCHED_MC + for_each_cpu_mask(i, *cpu_map) { + int power; + struct sched_domain *sd; sd = &per_cpu(core_domains, i); - power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) + if (sched_smt_power_savings) + power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); + else + power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) * SCHED_LOAD_SCALE / 10; sd->groups->cpu_power = power; + } +#endif + for_each_cpu_mask(i, *cpu_map) { + struct sched_domain *sd; +#ifdef CONFIG_SCHED_MC sd = &per_cpu(phys_domains, i); + if (i != first_cpu(sd->groups->cpumask)) + continue; - /* - * This has to be < 2 * SCHED_LOAD_SCALE - * Lets keep it SCHED_LOAD_SCALE, so that - * while calculating NUMA group's cpu_power - * we can simply do - * numa_group->cpu_power += phys_group->cpu_power; - * - * See "only add power once for each physical pkg" - * comment below - */ - sd->groups->cpu_power = SCHED_LOAD_SCALE; + sd->groups->cpu_power = 0; + if (sched_mc_power_savings || sched_smt_power_savings) { + int j; + + for_each_cpu_mask(j, sd->groups->cpumask) { + struct sched_domain *sd1; + sd1 = &per_cpu(core_domains, j); + /* + * for each core we will add once + * to the group in physical domain + */ + if (j != first_cpu(sd1->groups->cpumask)) + continue; + + if (sched_smt_power_savings) + sd->groups->cpu_power += sd1->groups->cpu_power; + else + sd->groups->cpu_power += SCHED_LOAD_SCALE; + } + } else + /* + * This has to be < 2 * SCHED_LOAD_SCALE + * Lets keep it SCHED_LOAD_SCALE, so that + * while calculating NUMA group's cpu_power + * we can simply do + * numa_group->cpu_power += phys_group->cpu_power; + * + * See "only add power once for each physical pkg" + * comment below + */ + sd->groups->cpu_power = SCHED_LOAD_SCALE; #else + int power; sd = &per_cpu(phys_domains, i); - power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * - (cpus_weight(sd->groups->cpumask)-1) / 10; + if (sched_smt_power_savings) + power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); + else + power = SCHED_LOAD_SCALE; sd->groups->cpu_power = power; #endif } @@ -6244,6 +6360,80 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) return err; } +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +int arch_reinit_sched_domains(void) +{ + int err; + + lock_cpu_hotplug(); + detach_destroy_domains(&cpu_online_map); + err = arch_init_sched_domains(&cpu_online_map); + unlock_cpu_hotplug(); + + return err; +} + +static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) +{ + int ret; + + if (buf[0] != '0' && buf[0] != '1') + return -EINVAL; + + if (smt) + sched_smt_power_savings = (buf[0] == '1'); + else + sched_mc_power_savings = (buf[0] == '1'); + + ret = arch_reinit_sched_domains(); + + return ret ? ret : count; +} + +int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) +{ + int err = 0; +#ifdef CONFIG_SCHED_SMT + if (smt_capable()) + err = sysfs_create_file(&cls->kset.kobj, + &attr_sched_smt_power_savings.attr); +#endif +#ifdef CONFIG_SCHED_MC + if (!err && mc_capable()) + err = sysfs_create_file(&cls->kset.kobj, + &attr_sched_mc_power_savings.attr); +#endif + return err; +} +#endif + +#ifdef CONFIG_SCHED_MC +static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) +{ + return sprintf(page, "%u\n", sched_mc_power_savings); +} +static ssize_t sched_mc_power_savings_store(struct sys_device *dev, const char *buf, size_t count) +{ + return sched_power_savings_store(buf, count, 0); +} +SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, + sched_mc_power_savings_store); +#endif + +#ifdef CONFIG_SCHED_SMT +static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) +{ + return sprintf(page, "%u\n", sched_smt_power_savings); +} +static ssize_t sched_smt_power_savings_store(struct sys_device *dev, const char *buf, size_t count) +{ + return sched_power_savings_store(buf, count, 1); +} +SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, + sched_smt_power_savings_store); +#endif + + #ifdef CONFIG_HOTPLUG_CPU /* * Force a reinitialization of the sched domains hierarchy. The domains -- cgit v1.2.3