From aa1a43262ad5df010768f69530fa179ff81651d3 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:22 +0100 Subject: PM: EM: Fix inefficient states detection Currently, a debug message is printed if an inefficient state is detected in the Energy Model. Unfortunately, it won't detect if the first state is inefficient or if two successive states are. Fix this behavior. Fixes: 27871f7a8a34 (PM: Introduce an Energy Model management framework) Signed-off-by: Vincent Donnefort Reviewed-by: Quentin Perret Reviewed-by: Lukasz Luba Reviewed-by: Matthias Kaehlcke Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/power/energy_model.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index a332ccd829e2..97e62469a6b3 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -107,8 +107,7 @@ static void em_debug_remove_pd(struct device *dev) {} static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, int nr_states, struct em_data_callback *cb) { - unsigned long opp_eff, prev_opp_eff = ULONG_MAX; - unsigned long power, freq, prev_freq = 0; + unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX; struct em_perf_state *table; int i, ret; u64 fmax; @@ -153,27 +152,21 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, table[i].power = power; table[i].frequency = prev_freq = freq; - - /* - * The hertz/watts efficiency ratio should decrease as the - * frequency grows on sane platforms. But this isn't always - * true in practice so warn the user if a higher OPP is more - * power efficient than a lower one. 
- */ - opp_eff = freq / power; - if (opp_eff >= prev_opp_eff) - dev_dbg(dev, "EM: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state%d\n", - i, i - 1); - prev_opp_eff = opp_eff; } /* Compute the cost of each performance state. */ fmax = (u64) table[nr_states - 1].frequency; - for (i = 0; i < nr_states; i++) { + for (i = nr_states - 1; i >= 0; i--) { unsigned long power_res = em_scale_power(table[i].power); table[i].cost = div64_u64(fmax * power_res, table[i].frequency); + if (table[i].cost >= prev_cost) { + dev_dbg(dev, "EM: OPP:%lu is inefficient\n", + table[i].frequency); + } else { + prev_cost = table[i].cost; + } } pd->table = table; -- cgit v1.2.3 From c8ed99533dbc0fcc1142671ec80acb33045d2999 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:23 +0100 Subject: PM: EM: Mark inefficient states Some SoCs, such as the sd855, have OPPs within the same performance domain whose cost is higher than others with a higher frequency. Even though those OPPs are interesting from a cooling perspective, it makes no sense to use them when the device can run at full capacity. Those OPPs handicap the performance domain when choosing the most energy-efficient CPU, and they waste energy. They are inefficient. Hence, add support for such OPPs to the Energy Model. The table can now be read skipping inefficient performance states (and by extension, inefficient OPPs). Signed-off-by: Vincent Donnefort Reviewed-by: Matthias Kaehlcke Reviewed-by: Lukasz Luba Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 12 ++++++++++++ kernel/power/energy_model.c | 4 +++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 39dcadd492b5..3641ca4acf04 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -17,13 +17,25 @@ * device). It can be a total power: static and dynamic.
* @cost: The cost coefficient associated with this level, used during * energy calculation. Equal to: power * max_frequency / frequency + * @flags: see "em_perf_state flags" description below. */ struct em_perf_state { unsigned long frequency; unsigned long power; unsigned long cost; + unsigned long flags; }; +/* + * em_perf_state flags: + * + * EM_PERF_STATE_INEFFICIENT: The performance state is inefficient. There is + * in this em_perf_domain, another performance state with a higher frequency + * but a lower or equal power cost. Such inefficient states are ignored when + * using em_pd_get_efficient_*() functions. + */ +#define EM_PERF_STATE_INEFFICIENT BIT(0) + /** * struct em_perf_domain - Performance domain * @table: List of performance states, in ascending order diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 97e62469a6b3..6d8438347535 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -2,7 +2,7 @@ /* * Energy Model of devices * - * Copyright (c) 2018-2020, Arm ltd. + * Copyright (c) 2018-2021, Arm ltd. * Written by: Quentin Perret, Arm ltd. * Improvements provided by: Lukasz Luba, Arm ltd. 
*/ @@ -42,6 +42,7 @@ static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd) debugfs_create_ulong("frequency", 0444, d, &ps->frequency); debugfs_create_ulong("power", 0444, d, &ps->power); debugfs_create_ulong("cost", 0444, d, &ps->cost); + debugfs_create_ulong("inefficient", 0444, d, &ps->flags); } static int em_debug_cpus_show(struct seq_file *s, void *unused) @@ -162,6 +163,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, table[i].cost = div64_u64(fmax * power_res, table[i].frequency); if (table[i].cost >= prev_cost) { + table[i].flags = EM_PERF_STATE_INEFFICIENT; dev_dbg(dev, "EM: OPP:%lu is inefficient\n", table[i].frequency); } else { -- cgit v1.2.3 From 88f7a89560f6d0fc7803a8933637488f14e0a098 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:24 +0100 Subject: PM: EM: Extend em_perf_domain with a flag field Merge the current "milliwatts" option into a "flag" field. This intends to prepare the extension of this structure for inefficient states support in the Energy Model. Signed-off-by: Vincent Donnefort Reviewed-by: Lukasz Luba Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 13 ++++++++++--- kernel/power/energy_model.c | 6 ++++-- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 3641ca4acf04..671440371a95 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -40,8 +40,7 @@ struct em_perf_state { * struct em_perf_domain - Performance domain * @table: List of performance states, in ascending order * @nr_perf_states: Number of performance states - * @milliwatts: Flag indicating the power values are in milli-Watts - * or some other scale. + * @flags: See "em_perf_domain flags" * @cpus: Cpumask covering the CPUs of the domain. 
It's here * for performance reasons to avoid potential cache * misses during energy calculations in the scheduler @@ -56,10 +55,18 @@ struct em_perf_state { struct em_perf_domain { struct em_perf_state *table; int nr_perf_states; - int milliwatts; + unsigned long flags; unsigned long cpus[]; }; +/* + * em_perf_domain flags: + * + * EM_PERF_DOMAIN_MILLIWATTS: The power values are in milli-Watts or some + * other scale. + */ +#define EM_PERF_DOMAIN_MILLIWATTS BIT(0) + #define em_span_cpus(em) (to_cpumask((em)->cpus)) #ifdef CONFIG_ENERGY_MODEL diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 6d8438347535..3a7d1573b214 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -56,7 +56,8 @@ DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); static int em_debug_units_show(struct seq_file *s, void *unused) { struct em_perf_domain *pd = s->private; - char *units = pd->milliwatts ? "milliWatts" : "bogoWatts"; + char *units = (pd->flags & EM_PERF_DOMAIN_MILLIWATTS) ? + "milliWatts" : "bogoWatts"; seq_printf(s, "%s\n", units); @@ -330,7 +331,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, if (ret) goto unlock; - dev->em_pd->milliwatts = milliwatts; + if (milliwatts) + dev->em_pd->flags |= EM_PERF_DOMAIN_MILLIWATTS; em_debug_create_pd(dev); dev_info(dev, "EM: created perf domain\n"); -- cgit v1.2.3 From 8354eb9eb3ddb4a8d0857648a470beffcc9d8639 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:25 +0100 Subject: PM: EM: Allow skipping inefficient states The new performance domain flag EM_PERF_DOMAIN_SKIP_INEFFICIENCIES allows inefficient states to be left out when estimating energy consumption. This is intended to let the Energy Model know that CPUFreq itself will skip inefficiencies, so such states no longer need to be part of the estimation. Signed-off-by: Vincent Donnefort Reviewed-by: Lukasz Luba Acked-by: Viresh Kumar Signed-off-by: Rafael J.
Wysocki --- include/linux/energy_model.h | 43 +++++++++++++++++++++++++++++++++++++------ kernel/power/energy_model.c | 13 +++++++++++++ 2 files changed, 50 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 671440371a95..6377adc3b78d 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -64,8 +64,12 @@ struct em_perf_domain { * * EM_PERF_DOMAIN_MILLIWATTS: The power values are in milli-Watts or some * other scale. + * + * EM_PERF_DOMAIN_SKIP_INEFFICIENCIES: Skip inefficient states when estimating + * energy consumption. */ #define EM_PERF_DOMAIN_MILLIWATTS BIT(0) +#define EM_PERF_DOMAIN_SKIP_INEFFICIENCIES BIT(1) #define em_span_cpus(em) (to_cpumask((em)->cpus)) @@ -120,6 +124,37 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, bool milliwatts); void em_dev_unregister_perf_domain(struct device *dev); +/** + * em_pd_get_efficient_state() - Get an efficient performance state from the EM + * @pd : Performance domain for which we want an efficient frequency + * @freq : Frequency to map with the EM + * + * It is called from the scheduler code quite frequently and as a consequence + * doesn't implement any check. + * + * Return: An efficient performance state, high enough to meet @freq + * requirement. 
+ */ +static inline +struct em_perf_state *em_pd_get_efficient_state(struct em_perf_domain *pd, + unsigned long freq) +{ + struct em_perf_state *ps; + int i; + + for (i = 0; i < pd->nr_perf_states; i++) { + ps = &pd->table[i]; + if (ps->frequency >= freq) { + if (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES && + ps->flags & EM_PERF_STATE_INEFFICIENT) + continue; + break; + } + } + + return ps; +} + /** * em_cpu_energy() - Estimates the energy consumed by the CPUs of a * performance domain @@ -142,7 +177,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, { unsigned long freq, scale_cpu; struct em_perf_state *ps; - int i, cpu; + int cpu; if (!sum_util) return 0; @@ -167,11 +202,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * Find the lowest performance state of the Energy Model above the * requested frequency. */ - for (i = 0; i < pd->nr_perf_states; i++) { - ps = &pd->table[i]; - if (ps->frequency >= freq) - break; - } + ps = em_pd_get_efficient_state(pd, freq); /* * The capacity of a CPU in the domain at the performance state (ps) diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 3a7d1573b214..d353ef29e37f 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -65,6 +65,17 @@ static int em_debug_units_show(struct seq_file *s, void *unused) } DEFINE_SHOW_ATTRIBUTE(em_debug_units); +static int em_debug_skip_inefficiencies_show(struct seq_file *s, void *unused) +{ + struct em_perf_domain *pd = s->private; + int enabled = (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES) ? 
1 : 0; + + seq_printf(s, "%d\n", enabled); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(em_debug_skip_inefficiencies); + static void em_debug_create_pd(struct device *dev) { struct dentry *d; @@ -78,6 +89,8 @@ static void em_debug_create_pd(struct device *dev) &em_debug_cpus_fops); debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops); + debugfs_create_file("skip-inefficiencies", 0444, d, dev->em_pd, + &em_debug_skip_inefficiencies_fops); /* Create a sub-directory for each performance state */ for (i = 0; i < dev->em_pd->nr_perf_states; i++) -- cgit v1.2.3 From e458716a92b57f854deb89bb40aa3554c2b6205e Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:30 +0100 Subject: PM: EM: Mark inefficiencies in CPUFreq The Energy Model has a 1:1 mapping between OPPs and performance states (em_perf_state). If a CPUFreq driver registers an Energy Model, inefficiencies found by the latter can be applied to CPUFreq. Signed-off-by: Vincent Donnefort Acked-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- kernel/power/energy_model.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'kernel') diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index d353ef29e37f..0153b0ca7b23 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -10,6 +10,7 @@ #define pr_fmt(fmt) "energy_model: " fmt #include +#include #include #include #include @@ -231,6 +232,43 @@ static int em_create_pd(struct device *dev, int nr_states, return 0; } +static void em_cpufreq_update_efficiencies(struct device *dev) +{ + struct em_perf_domain *pd = dev->em_pd; + struct em_perf_state *table; + struct cpufreq_policy *policy; + int found = 0; + int i; + + if (!_is_cpu_device(dev) || !pd) + return; + + policy = cpufreq_cpu_get(cpumask_first(em_span_cpus(pd))); + if (!policy) { + dev_warn(dev, "EM: Access to CPUFreq policy failed"); + return; + } + + table = pd->table; + + for (i = 0; i < pd->nr_perf_states; i++) { + if (!(table[i].flags & EM_PERF_STATE_INEFFICIENT)) + continue; + + if (!cpufreq_table_set_inefficient(policy, table[i].frequency)) + found++; + } + + if (!found) + return; + + /* + * Efficiencies have been installed in CPUFreq, inefficient frequencies + * will be skipped. The EM can do the same. + */ + pd->flags |= EM_PERF_DOMAIN_SKIP_INEFFICIENCIES; +} + /** * em_pd_get() - Return the performance domain for a device * @dev : Device to find the performance domain for @@ -347,6 +385,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, if (milliwatts) dev->em_pd->flags |= EM_PERF_DOMAIN_MILLIWATTS; + em_cpufreq_update_efficiencies(dev); + em_debug_create_pd(dev); dev_info(dev, "EM: created perf domain\n"); -- cgit v1.2.3