From c214f124161d446b340597e7c968e0a2dc149142 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 9 Nov 2021 19:57:10 +0000 Subject: arch_topology: Introduce thermal pressure update function The thermal pressure is a mechanism which is used for providing information about reduced CPU performance to the scheduler. Usually code has to convert the value from frequency units into capacity units, which are understandable by the scheduler. Create a common conversion code which can be just used via a handy API. Internally, the topology_update_thermal_pressure() operates on frequency in MHz and max CPU frequency is taken from 'freq_factor' (per-cpu). Signed-off-by: Lukasz Luba Reviewed-by: Thara Gopinath Signed-off-by: Viresh Kumar --- drivers/base/arch_topology.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) (limited to 'drivers/base') diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index ff16a36a908b..a3ae0b4eea09 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -22,6 +22,7 @@ static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data); static struct cpumask scale_freq_counters_mask; static bool scale_freq_invariant; +static DEFINE_PER_CPU(u32, freq_factor) = 1; static bool supports_scale_freq_counters(const struct cpumask *cpus) { @@ -165,6 +166,47 @@ void topology_set_thermal_pressure(const struct cpumask *cpus, } EXPORT_SYMBOL_GPL(topology_set_thermal_pressure); +/** + * topology_update_thermal_pressure() - Update thermal pressure for CPUs + * @cpus : The related CPUs for which capacity has been reduced + * @capped_freq : The maximum allowed frequency that CPUs can run at + * + * Update the value of thermal pressure for all @cpus in the mask. The + * cpumask should include all (online+offline) affected CPUs, to avoid + * operating on stale data when hot-plug is used for some CPUs. The + * @capped_freq reflects the currently allowed max CPUs frequency due to + * thermal capping. It might be also a boost frequency value, which is bigger + * than the internal 'freq_factor' max frequency. In such case the pressure + * value should simply be removed, since this is an indication that there is + * no thermal throttling. The @capped_freq must be provided in kHz. + */ +void topology_update_thermal_pressure(const struct cpumask *cpus, + unsigned long capped_freq) +{ + unsigned long max_capacity, capacity; + u32 max_freq; + int cpu; + + cpu = cpumask_first(cpus); + max_capacity = arch_scale_cpu_capacity(cpu); + max_freq = per_cpu(freq_factor, cpu); + + /* Convert to MHz scale which is used in 'freq_factor' */ + capped_freq /= 1000; + + /* + * Handle properly the boost frequencies, which should simply clean + * the thermal pressure value. + */ + if (max_freq <= capped_freq) + capacity = max_capacity; + else + capacity = mult_frac(max_capacity, capped_freq, max_freq); + + arch_set_thermal_pressure(cpus, max_capacity - capacity); +} +EXPORT_SYMBOL_GPL(topology_update_thermal_pressure); + static ssize_t cpu_capacity_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -217,7 +259,6 @@ static void update_topology_flags_workfn(struct work_struct *work) update_topology = 0; } -static DEFINE_PER_CPU(u32, freq_factor) = 1; static u32 *raw_capacity; static int free_raw_capacity(void) -- cgit v1.2.3 From 7e97b3dc2556743dd02612c92a8de7026e8d7dc9 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 9 Nov 2021 19:57:14 +0000 Subject: arch_topology: Remove unused topology_set_thermal_pressure() and related There is no need of this function (and related) since code has been converted to use the new arch_update_thermal_pressure() API. The old code can be removed. Signed-off-by: Lukasz Luba Signed-off-by: Viresh Kumar --- arch/arm/include/asm/topology.h | 1 - arch/arm64/include/asm/topology.h | 1 - drivers/base/arch_topology.c | 17 +++++------------ include/linux/arch_topology.h | 3 --- include/linux/sched/topology.h | 7 ------- init/Kconfig | 2 +- 6 files changed, 6 insertions(+), 25 deletions(-) (limited to 'drivers/base') diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index f1eafacc9a30..c7d2510e5a78 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -23,7 +23,6 @@ /* Replace task scheduler's default thermal pressure API */ #define arch_scale_thermal_pressure topology_get_thermal_pressure -#define arch_set_thermal_pressure topology_set_thermal_pressure #define arch_update_thermal_pressure topology_update_thermal_pressure #else diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 7a421cbc99ed..f386b90a79c8 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -32,7 +32,6 @@ void update_freq_counters_refs(void); /* Replace task scheduler's default thermal pressure API */ #define arch_scale_thermal_pressure topology_get_thermal_pressure -#define arch_set_thermal_pressure topology_set_thermal_pressure #define arch_update_thermal_pressure topology_update_thermal_pressure #include diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index a3ae0b4eea09..976154140f0b 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -156,16 +156,6 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity) DEFINE_PER_CPU(unsigned long, thermal_pressure); -void topology_set_thermal_pressure(const struct cpumask *cpus, - unsigned long th_pressure) -{ - int cpu; - - for_each_cpu(cpu, cpus) - WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); -} -EXPORT_SYMBOL_GPL(topology_set_thermal_pressure); - /** * topology_update_thermal_pressure() - Update thermal pressure for CPUs * @cpus : The related CPUs for which capacity has been reduced @@ -183,7 +173,7 @@ EXPORT_SYMBOL_GPL(topology_set_thermal_pressure); void topology_update_thermal_pressure(const struct cpumask *cpus, unsigned long capped_freq) { - unsigned long max_capacity, capacity; + unsigned long max_capacity, capacity, th_pressure; u32 max_freq; int cpu; @@ -203,7 +193,10 @@ void topology_update_thermal_pressure(const struct cpumask *cpus, else capacity = mult_frac(max_capacity, capped_freq, max_freq); - arch_set_thermal_pressure(cpus, max_capacity - capacity); + th_pressure = max_capacity - capacity; + + for_each_cpu(cpu, cpus) + WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); } EXPORT_SYMBOL_GPL(topology_update_thermal_pressure); diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index ace1e5dcf773..cce6136b300a 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -56,9 +56,6 @@ static inline unsigned long topology_get_thermal_pressure(int cpu) return per_cpu(thermal_pressure, cpu); } -void topology_set_thermal_pressure(const struct cpumask *cpus, - unsigned long th_pressure); - void topology_update_thermal_pressure(const struct cpumask *cpus, unsigned long capped_freq); diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 6e89a8e43aa7..8054641c0a7b 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -266,13 +266,6 @@ unsigned long arch_scale_thermal_pressure(int cpu) } #endif -#ifndef arch_set_thermal_pressure -static __always_inline -void arch_set_thermal_pressure(const struct cpumask *cpus, - unsigned long th_pressure) -{ } -#endif - #ifndef arch_update_thermal_pressure static __always_inline void arch_update_thermal_pressure(const struct cpumask *cpus, diff --git a/init/Kconfig b/init/Kconfig index 036b750e8d8a..086fd11348e0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -550,7 +550,7 @@ config SCHED_THERMAL_PRESSURE i.e. put less load on throttled CPUs than on non/less throttled ones. This requires the architecture to implement - arch_set_thermal_pressure() and arch_scale_thermal_pressure(). + arch_update_thermal_pressure() and arch_scale_thermal_pressure(). config BSD_PROCESS_ACCT bool "BSD Process Accounting" -- cgit v1.2.3 From c24efa6732788f0be22cdf5d2aedd5e3117e983f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 7 Dec 2021 19:54:32 +0100 Subject: PM: runtime: Capture device status before disabling runtime PM In some cases (for example, during system-wide suspend and resume of devices) it is useful to know whether or not runtime PM has ever been enabled for a given device and, if so, what the runtime PM status of it had been right before runtime PM was disabled for it last time. For this reason, introduce a new struct dev_pm_info field called last_status that will be used for capturing the runtime PM status of the device when its power.disable_depth counter changes from 0 to 1. The new field will be set to RPM_INVALID to start with and whenever power.disable_depth changes from 1 to 0, so it will be valid only when runtime PM of the device is currently disabled, but it has been enabled at least once. Immediately use power.last_status in rpm_resume() to make it handle the case when PM runtime is disabled for the device, but its runtime PM status is RPM_ACTIVE more consistently. Namely, make it return 1 if power.last_status is also equal to RPM_ACTIVE in that case (the idea being that if the status was RPM_ACTIVE last time when power.disable_depth was changing from 0 to 1 and it is still RPM_ACTIVE, it can be assumed to reflect what happened to the device last time when it was using runtime PM) and -EACCES otherwise. Update the documentation to provide a description of last_status and change the description of pm_runtime_resume() in it to reflect the new behavior of rpm_active(). While at it, rearrange the code in pm_runtime_enable() to be more straightforward and replace the WARN() macro in it with a pr_warn() invocation which is less disruptive. Link: https://lore.kernel.org/linux-pm/20211026222626.39222-1-ulf.hansson@linaro.org/t/#u Reviewed-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- Documentation/power/runtime_pm.rst | 14 ++++++++---- drivers/base/power/runtime.c | 45 +++++++++++++++++++++----------------- include/linux/pm.h | 2 ++ 3 files changed, 37 insertions(+), 24 deletions(-) (limited to 'drivers/base') diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst index d6bf84f061f4..65b86e487afe 100644 --- a/Documentation/power/runtime_pm.rst +++ b/Documentation/power/runtime_pm.rst @@ -265,6 +265,10 @@ defined in include/linux/pm.h: RPM_SUSPENDED, which means that each device is initially regarded by the PM core as 'suspended', regardless of its real hardware status + `enum rpm_status last_status;` + - the last runtime PM status of the device captured before disabling runtime + PM for it (invalid initially and when disable_depth is 0) + `unsigned int runtime_auto;` - if set, indicates that the user space has allowed the device driver to power manage the device at run time via the /sys/devices/.../power/control @@ -333,10 +337,12 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h: `int pm_runtime_resume(struct device *dev);` - execute the subsystem-level resume callback for the device; returns 0 on - success, 1 if the device's runtime PM status was already 'active' or - error code on failure, where -EAGAIN means it may be safe to attempt to - resume the device again in future, but 'power.runtime_error' should be - checked additionally, and -EACCES means that 'power.disable_depth' is + success, 1 if the device's runtime PM status is already 'active' (also if + 'power.disable_depth' is nonzero, but the status was 'active' when it was + changing from 0 to 1) or error code on failure, where -EAGAIN means it may + be safe to attempt to resume the device again in future, but + 'power.runtime_error' should be checked additionally, and -EACCES means + that the callback could not be run, because 'power.disable_depth' was different from 0 `int pm_runtime_resume_and_get(struct device *dev);` diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index d504cd4ab3cb..844b6b811610 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -742,13 +742,15 @@ static int rpm_resume(struct device *dev, int rpmflags) trace_rpm_resume_rcuidle(dev, rpmflags); repeat: - if (dev->power.runtime_error) + if (dev->power.runtime_error) { retval = -EINVAL; - else if (dev->power.disable_depth == 1 && dev->power.is_suspended - && dev->power.runtime_status == RPM_ACTIVE) - retval = 1; - else if (dev->power.disable_depth > 0) - retval = -EACCES; + } else if (dev->power.disable_depth > 0) { + if (dev->power.runtime_status == RPM_ACTIVE && + dev->power.last_status == RPM_ACTIVE) + retval = 1; + else + retval = -EACCES; + } if (retval) goto out; @@ -1410,8 +1412,10 @@ void __pm_runtime_disable(struct device *dev, bool check_resume) /* Update time accounting before disabling PM-runtime. */ update_pm_runtime_accounting(dev); - if (!dev->power.disable_depth++) + if (!dev->power.disable_depth++) { __pm_runtime_barrier(dev); + dev->power.last_status = dev->power.runtime_status; + } out: spin_unlock_irq(&dev->power.lock); @@ -1428,23 +1432,23 @@ void pm_runtime_enable(struct device *dev) spin_lock_irqsave(&dev->power.lock, flags); - if (dev->power.disable_depth > 0) { - dev->power.disable_depth--; - - /* About to enable runtime pm, set accounting_timestamp to now */ - if (!dev->power.disable_depth) - dev->power.accounting_timestamp = ktime_get_mono_fast_ns(); - } else { + if (!dev->power.disable_depth) { dev_warn(dev, "Unbalanced %s!\n", __func__); + goto out; } - WARN(!dev->power.disable_depth && - dev->power.runtime_status == RPM_SUSPENDED && - !dev->power.ignore_children && - atomic_read(&dev->power.child_count) > 0, - "Enabling runtime PM for inactive device (%s) with active children\n", - dev_name(dev)); + if (--dev->power.disable_depth > 0) + goto out; + dev->power.last_status = RPM_INVALID; + dev->power.accounting_timestamp = ktime_get_mono_fast_ns(); + + if (dev->power.runtime_status == RPM_SUSPENDED && + !dev->power.ignore_children && + atomic_read(&dev->power.child_count) > 0) + dev_warn(dev, "Enabling runtime PM for inactive device with active children\n"); + +out: spin_unlock_irqrestore(&dev->power.lock, flags); } EXPORT_SYMBOL_GPL(pm_runtime_enable); @@ -1640,6 +1644,7 @@ EXPORT_SYMBOL_GPL(__pm_runtime_use_autosuspend); void pm_runtime_init(struct device *dev) { dev->power.runtime_status = RPM_SUSPENDED; + dev->power.last_status = RPM_INVALID; dev->power.idle_notification = false; dev->power.disable_depth = 1; diff --git a/include/linux/pm.h b/include/linux/pm.h index fc9691cb01b4..e1e9402180b9 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -521,6 +521,7 @@ const struct dev_pm_ops __maybe_unused name = { \ */ enum rpm_status { + RPM_INVALID = -1, RPM_ACTIVE = 0, RPM_RESUMING, RPM_SUSPENDED, @@ -634,6 +635,7 @@ struct dev_pm_info { unsigned int links_count; enum rpm_request request; enum rpm_status runtime_status; + enum rpm_status last_status; int runtime_error; int autosuspend_delay; u64 last_busy; -- cgit v1.2.3 From d1579e61192e0e686faa4208500ef4c3b529b16c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 10 Dec 2021 17:10:13 +0100 Subject: PM: runtime: Add safety net to supplier device release Because refcount_dec_not_one() returns true if the target refcount becomes saturated, it is generally unsafe to use its return value as a loop termination condition, but that is what happens when a device link's supplier device is released during runtime PM suspend operations and on device link removal. To address this, introduce pm_runtime_release_supplier() to be used in the above cases which will check the supplier device's runtime PM usage counter in addition to the refcount_dec_not_one() return value, so the loop can be terminated in case the rpm_active refcount value becomes invalid, and update the code in question to use it as appropriate. This change is not expected to have any visible functional impact. Reported-by: Peter Zijlstra Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman Acked-by: Peter Zijlstra (Intel) --- drivers/base/core.c | 3 +-- drivers/base/power/runtime.c | 41 ++++++++++++++++++++++++++++++----------- include/linux/pm_runtime.h | 3 +++ 3 files changed, 34 insertions(+), 13 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/core.c b/drivers/base/core.c index fd034d742447..b191bd17de89 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -485,8 +485,7 @@ static void device_link_release_fn(struct work_struct *work) /* Ensure that all references to the link object have been dropped. */ device_link_synchronize_removal(); - while (refcount_dec_not_one(&link->rpm_active)) - pm_runtime_put(link->supplier); + pm_runtime_release_supplier(link, true); put_device(link->consumer); put_device(link->supplier); diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 844b6b811610..f1b856ea8e10 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -305,19 +305,40 @@ static int rpm_get_suppliers(struct device *dev) return 0; } +/** + * pm_runtime_release_supplier - Drop references to device link's supplier. + * @link: Target device link. + * @check_idle: Whether or not to check if the supplier device is idle. + * + * Drop all runtime PM references associated with @link to its supplier device + * and if @check_idle is set, check if that device is idle (and so it can be + * suspended). + */ +void pm_runtime_release_supplier(struct device_link *link, bool check_idle) +{ + struct device *supplier = link->supplier; + + /* + * The additional power.usage_count check is a safety net in case + * the rpm_active refcount becomes saturated, in which case + * refcount_dec_not_one() would return true forever, but it is not + * strictly necessary. + */ + while (refcount_dec_not_one(&link->rpm_active) && + atomic_read(&supplier->power.usage_count) > 0) + pm_runtime_put_noidle(supplier); + + if (check_idle) + pm_request_idle(supplier); +} + static void __rpm_put_suppliers(struct device *dev, bool try_to_suspend) { struct device_link *link; list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, - device_links_read_lock_held()) { - - while (refcount_dec_not_one(&link->rpm_active)) - pm_runtime_put_noidle(link->supplier); - - if (try_to_suspend) - pm_request_idle(link->supplier); - } + device_links_read_lock_held()) + pm_runtime_release_supplier(link, try_to_suspend); } static void rpm_put_suppliers(struct device *dev) @@ -1777,9 +1798,7 @@ void pm_runtime_drop_link(struct device_link *link) return; pm_runtime_drop_link_count(link->consumer); - - while (refcount_dec_not_one(&link->rpm_active)) - pm_runtime_put(link->supplier); + pm_runtime_release_supplier(link, true); } static bool pm_runtime_need_not_resume(struct device *dev) diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index eddd66d426ca..016de5776b6d 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -58,6 +58,7 @@ extern void pm_runtime_get_suppliers(struct device *dev); extern void pm_runtime_put_suppliers(struct device *dev); extern void pm_runtime_new_link(struct device *dev); extern void pm_runtime_drop_link(struct device_link *link); +extern void pm_runtime_release_supplier(struct device_link *link, bool check_idle); extern int devm_pm_runtime_enable(struct device *dev); @@ -283,6 +284,8 @@ static inline void pm_runtime_get_suppliers(struct device *dev) {} static inline void pm_runtime_put_suppliers(struct device *dev) {} static inline void pm_runtime_new_link(struct device *dev) {} static inline void pm_runtime_drop_link(struct device_link *link) {} +static inline void pm_runtime_release_supplier(struct device_link *link, + bool check_idle) {} #endif /* !CONFIG_PM */ -- cgit v1.2.3 From 50a4606655582f310b3f07c9492af9a72b40003b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 17 Dec 2021 20:16:02 +0100 Subject: PM: runtime: Simplify locking in pm_runtime_put_suppliers() Notice that pm_runtime_put_suppliers() cannot be called with disabled interrupts, because it may sleep (due to the device links read locking in the non-SRCU case), and so it can use spin_lock_irq() and spin_unlock_irq() for the locking. Update the function accordingly and while at it move the "put" local variable in it into the inner block where it is used. This change is not expected to have any visible functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson --- drivers/base/power/runtime.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index f1b856ea8e10..2f3cce17219b 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1748,8 +1748,6 @@ void pm_runtime_get_suppliers(struct device *dev) void pm_runtime_put_suppliers(struct device *dev) { struct device_link *link; - unsigned long flags; - bool put; int idx; idx = device_links_read_lock(); @@ -1757,11 +1755,17 @@ void pm_runtime_put_suppliers(struct device *dev) list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) if (link->supplier_preactivated) { + bool put; + link->supplier_preactivated = false; - spin_lock_irqsave(&dev->power.lock, flags); + + spin_lock_irq(&dev->power.lock); + put = pm_runtime_status_suspended(dev) && refcount_dec_not_one(&link->rpm_active); - spin_unlock_irqrestore(&dev->power.lock, flags); + + spin_unlock_irq(&dev->power.lock); + if (put) pm_runtime_put(link->supplier); } -- cgit v1.2.3