From 8e2ddb03643eb9d0bc4926946d7ce0d308eef0a5 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 15 Nov 2016 13:53:20 +0530 Subject: cpufreq: schedutil: Avoid indented labels Switch to the more common practice of writing labels. Suggested-by: Peter Zijlstra Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 69e06898997d..8c4e1652e895 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -454,17 +454,17 @@ static int sugov_init(struct cpufreq_policy *policy) if (ret) goto fail; - out: +out: mutex_unlock(&global_tunables_lock); cpufreq_enable_fast_switch(policy); return 0; - fail: +fail: policy->governor_data = NULL; sugov_tunables_free(tunables); - free_sg_policy: +free_sg_policy: mutex_unlock(&global_tunables_lock); sugov_policy_free(sg_policy); -- cgit v1.2.3 From 4a71ce4348bb61740d411822357061f8bf870f4c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 15 Nov 2016 13:53:21 +0530 Subject: cpufreq: schedutil: enable fast switch earlier The fast_switch_enabled flag will be used by both sugov_policy_alloc() and sugov_policy_free() with a later patch. Prepare for that by moving the calls to enable and disable it to the beginning of sugov_init() and end of sugov_exit(). Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 8c4e1652e895..68f21bb6bd44 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -416,9 +416,13 @@ static int sugov_init(struct cpufreq_policy *policy) if (policy->governor_data) return -EBUSY; + cpufreq_enable_fast_switch(policy); + sg_policy = sugov_policy_alloc(policy); - if (!sg_policy) - return -ENOMEM; + if (!sg_policy) { + ret = -ENOMEM; + goto disable_fast_switch; + } mutex_lock(&global_tunables_lock); @@ -456,8 +460,6 @@ static int sugov_init(struct cpufreq_policy *policy) out: mutex_unlock(&global_tunables_lock); - - cpufreq_enable_fast_switch(policy); return 0; fail: @@ -468,6 +470,10 @@ free_sg_policy: mutex_unlock(&global_tunables_lock); sugov_policy_free(sg_policy); + +disable_fast_switch: + cpufreq_disable_fast_switch(policy); + pr_err("initialization failed (error %d)\n", ret); return ret; } @@ -478,8 +484,6 @@ static void sugov_exit(struct cpufreq_policy *policy) struct sugov_tunables *tunables = sg_policy->tunables; unsigned int count; - cpufreq_disable_fast_switch(policy); - mutex_lock(&global_tunables_lock); count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); @@ -490,6 +494,7 @@ static void sugov_exit(struct cpufreq_policy *policy) mutex_unlock(&global_tunables_lock); sugov_policy_free(sg_policy); + cpufreq_disable_fast_switch(policy); } static int sugov_start(struct cpufreq_policy *policy) -- cgit v1.2.3 From 02a7b1ee3baa15a98b541d8cfd156bbe1a091c20 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 15 Nov 2016 13:53:22 +0530 Subject: cpufreq: schedutil: move slow path from workqueue to SCHED_FIFO task If slow path frequency changes are conducted in a SCHED_OTHER context then they may be delayed for some amount of time, including indefinitely, when real time or deadline activity is taking place. 
Move the slow path to a real time kernel thread. In the future the thread should be made SCHED_DEADLINE. The RT priority is arbitrarily set to 50 for now. Hackbench results on ARM Exynos, dual core A15 platform for 10 iterations: $ hackbench -s 100 -l 100 -g 10 -f 20 Before After --------------------------------- 1.808 1.603 1.847 1.251 2.229 1.590 1.952 1.600 1.947 1.257 1.925 1.627 2.694 1.620 1.258 1.621 1.919 1.632 1.250 1.240 Average: 1.8829 1.5041 Based on initial work by Steve Muckle. Signed-off-by: Steve Muckle Signed-off-by: Viresh Kumar Acked-by: Peter Zijlstra (Intel) Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 85 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 78 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 68f21bb6bd44..f165ba0f0766 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -12,11 +12,14 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include "sched.h" +#define SUGOV_KTHREAD_PRIORITY 50 + struct sugov_tunables { struct gov_attr_set attr_set; unsigned int rate_limit_us; @@ -35,8 +38,10 @@ struct sugov_policy { /* The next fields are only needed if fast switch cannot be used. */ struct irq_work irq_work; - struct work_struct work; + struct kthread_work work; struct mutex work_lock; + struct kthread_worker worker; + struct task_struct *thread; bool work_in_progress; bool need_freq_update; @@ -291,7 +296,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, raw_spin_unlock(&sg_policy->update_lock); } -static void sugov_work(struct work_struct *work) +static void sugov_work(struct kthread_work *work) { struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); @@ -308,7 +313,21 @@ static void sugov_irq_work(struct irq_work *irq_work) struct sugov_policy *sg_policy; sg_policy = container_of(irq_work, struct sugov_policy, irq_work); - schedule_work_on(smp_processor_id(), &sg_policy->work); + + /* + * For Real Time and Deadline tasks, schedutil governor shoots the + * frequency to maximum. And special care must be taken to ensure that + * this kthread doesn't result in that. + * + * This is (mostly) guaranteed by the work_in_progress flag. The flag is + * updated only at the end of the sugov_work() and before that schedutil + * rejects all other frequency scaling requests. + * + * Though there is a very rare case where the RT thread yields right + * after the work_in_progress flag is cleared. The effects of that are + * neglected for now. 
+ */ + kthread_queue_work(&sg_policy->worker, &sg_policy->work); } /************************** sysfs interface ************************/ @@ -372,7 +391,6 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) sg_policy->policy = policy; init_irq_work(&sg_policy->irq_work, sugov_irq_work); - INIT_WORK(&sg_policy->work, sugov_work); mutex_init(&sg_policy->work_lock); raw_spin_lock_init(&sg_policy->update_lock); return sg_policy; @@ -384,6 +402,51 @@ static void sugov_policy_free(struct sugov_policy *sg_policy) kfree(sg_policy); } +static int sugov_kthread_create(struct sugov_policy *sg_policy) +{ + struct task_struct *thread; + struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 }; + struct cpufreq_policy *policy = sg_policy->policy; + int ret; + + /* kthread only required for slow path */ + if (policy->fast_switch_enabled) + return 0; + + kthread_init_work(&sg_policy->work, sugov_work); + kthread_init_worker(&sg_policy->worker); + thread = kthread_create(kthread_worker_fn, &sg_policy->worker, + "sugov:%d", + cpumask_first(policy->related_cpus)); + if (IS_ERR(thread)) { + pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread)); + return PTR_ERR(thread); + } + + ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, ¶m); + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + return ret; + } + + sg_policy->thread = thread; + kthread_bind_mask(thread, policy->related_cpus); + wake_up_process(thread); + + return 0; +} + +static void sugov_kthread_stop(struct sugov_policy *sg_policy) +{ + /* kthread only required for slow path */ + if (sg_policy->policy->fast_switch_enabled) + return; + + kthread_flush_worker(&sg_policy->worker); + kthread_stop(sg_policy->thread); +} + static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy) { struct sugov_tunables *tunables; @@ -424,12 +487,16 @@ static int sugov_init(struct cpufreq_policy *policy) goto disable_fast_switch; } + ret = sugov_kthread_create(sg_policy); + if (ret) + goto free_sg_policy; + mutex_lock(&global_tunables_lock); if (global_tunables) { if (WARN_ON(have_governor_per_policy())) { ret = -EINVAL; - goto free_sg_policy; + goto stop_kthread; } policy->governor_data = sg_policy; sg_policy->tunables = global_tunables; @@ -441,7 +508,7 @@ static int sugov_init(struct cpufreq_policy *policy) tunables = sugov_tunables_alloc(sg_policy); if (!tunables) { ret = -ENOMEM; - goto free_sg_policy; + goto stop_kthread; } tunables->rate_limit_us = LATENCY_MULTIPLIER; @@ -466,6 +533,9 @@ fail: policy->governor_data = NULL; sugov_tunables_free(tunables); +stop_kthread: + sugov_kthread_stop(sg_policy); + free_sg_policy: mutex_unlock(&global_tunables_lock); @@ -493,6 +563,7 @@ static void sugov_exit(struct cpufreq_policy *policy) mutex_unlock(&global_tunables_lock); + sugov_kthread_stop(sg_policy); sugov_policy_free(sg_policy); cpufreq_disable_fast_switch(policy); } @@ -541,7 +612,7 @@ static void sugov_stop(struct cpufreq_policy *policy) synchronize_sched(); irq_work_sync(&sg_policy->irq_work); - cancel_work_sync(&sg_policy->work); + kthread_cancel_work_sync(&sg_policy->work); } static void sugov_limits(struct cpufreq_policy *policy) -- cgit v1.2.3 From 21ef57297b15a49b0c4dd4e7135c1a08e9a29a1c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 15 Nov 2016 13:53:23 +0530 Subject: cpufreq: schedutil: irq-work and mutex are only used in slow path Execute the irq-work specific initialization/exit code only when the fast path isn't available. 
Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index f165ba0f0766..42a220e78f00 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -390,15 +390,12 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) return NULL; sg_policy->policy = policy; - init_irq_work(&sg_policy->irq_work, sugov_irq_work); - mutex_init(&sg_policy->work_lock); raw_spin_lock_init(&sg_policy->update_lock); return sg_policy; } static void sugov_policy_free(struct sugov_policy *sg_policy) { - mutex_destroy(&sg_policy->work_lock); kfree(sg_policy); } @@ -432,6 +429,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) sg_policy->thread = thread; kthread_bind_mask(thread, policy->related_cpus); + init_irq_work(&sg_policy->irq_work, sugov_irq_work); + mutex_init(&sg_policy->work_lock); + wake_up_process(thread); return 0; @@ -445,6 +445,7 @@ static void sugov_kthread_stop(struct sugov_policy *sg_policy) kthread_flush_worker(&sg_policy->worker); kthread_stop(sg_policy->thread); + mutex_destroy(&sg_policy->work_lock); } static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy) @@ -611,8 +612,10 @@ static void sugov_stop(struct cpufreq_policy *policy) synchronize_sched(); - irq_work_sync(&sg_policy->irq_work); - kthread_cancel_work_sync(&sg_policy->work); + if (!policy->fast_switch_enabled) { + irq_work_sync(&sg_policy->irq_work); + kthread_cancel_work_sync(&sg_policy->work); + } } static void sugov_limits(struct cpufreq_policy *policy) -- cgit v1.2.3 From 406e79385f3223d82272cf2be86bc95cd000a258 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 21 Nov 2016 22:45:40 +0100 Subject: PM / sleep: System sleep state selection interface rework There are systems in which the platform doesn't support any special sleep states, so suspend-to-idle (PM_SUSPEND_FREEZE) is the only available system sleep state. However, some user space frameworks only use the "mem" and (sometimes) "standby" sleep state labels, so the users of those systems need to modify user space in order to be able to use system suspend at all and that may be a pain in practice. Commit 0399d4db3edf (PM / sleep: Introduce command line argument for sleep state enumeration) attempted to address this problem by adding a command line argument to change the meaning of the "mem" string in /sys/power/state to make it trigger suspend-to-idle (instead of suspend-to-RAM). However, there also are systems in which the platform does support special sleep states, but suspend-to-idle is the preferred one anyway (it may even save more energy than the platform-provided sleep states in some cases) and the above commit doesn't help in those cases. For this reason, rework the system sleep state selection interface again (but preserve backwards compatibility). Namely, add a new sysfs file, /sys/power/mem_sleep, that will control the system suspend mode triggered by writing "mem" to /sys/power/state (in analogy with what /sys/power/disk does for hibernation). Make it select suspend-to-RAM ("deep" sleep) by default (if supported) and fall back to suspend-to-idle ("s2idle") otherwise and add a new command line argument, mem_sleep_default, allowing that default to be overridden if need be.
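From user space, the resulting interface can be exercised along these lines (a minimal illustrative C sketch, not part of the patch; the write_sysfs() helper is hypothetical, "s2idle" is an arbitrary example choice, and error handling is kept to a bare minimum):

#include <stdio.h>

/* Hypothetical helper: write a short string to a sysfs attribute. */
static int write_sysfs(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	if (fputs(val, f) == EOF) {
		fclose(f);
		return -1;
	}
	return fclose(f);
}

int main(void)
{
	/* Make "mem" mean suspend-to-idle on subsequent suspends... */
	if (write_sysfs("/sys/power/mem_sleep", "s2idle"))
		return 1;

	/* ...then trigger the suspend itself. */
	return write_sysfs("/sys/power/state", "mem") ? 1 : 0;
}

Existing tools that only ever write "mem" to /sys/power/state keep working unmodified; only the mode selection step above is new.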
At the same time, drop the relative_sleep_states command line argument that doesn't make sense any more. Signed-off-by: Rafael J. Wysocki Tested-by: Mario Limonciello --- Documentation/ABI/testing/sysfs-power | 45 ++++++++++-------- Documentation/kernel-parameters.txt | 13 +++--- Documentation/power/states.txt | 60 +++++++++++++++--------- kernel/power/main.c | 88 +++++++++++++++++++++++++++++++++-- kernel/power/power.h | 6 ++- kernel/power/suspend.c | 69 ++++++++++++++++----------- 6 files changed, 201 insertions(+), 80 deletions(-) (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index 50b368d490b5..f523e5a3ac33 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -7,30 +7,35 @@ Description: subsystem. What: /sys/power/state -Date: May 2014 +Date: November 2016 Contact: Rafael J. Wysocki Description: The /sys/power/state file controls system sleep states. Reading from this file returns the available sleep state - labels, which may be "mem", "standby", "freeze" and "disk" - (hibernation). The meanings of the first three labels depend on - the relative_sleep_states command line argument as follows: - 1) relative_sleep_states = 1 - "mem", "standby", "freeze" represent non-hibernation sleep - states from the deepest ("mem", always present) to the - shallowest ("freeze"). "standby" and "freeze" may or may - not be present depending on the capabilities of the - platform. "freeze" can only be present if "standby" is - present. - 2) relative_sleep_states = 0 (default) - "mem" - "suspend-to-RAM", present if supported. - "standby" - "power-on suspend", present if supported. - "freeze" - "suspend-to-idle", always present. - - Writing to this file one of these strings causes the system to - transition into the corresponding state, if available. See - Documentation/power/states.txt for a description of what - "suspend-to-RAM", "power-on suspend" and "suspend-to-idle" mean. + labels, which may be "mem" (suspend), "standby" (power-on + suspend), "freeze" (suspend-to-idle) and "disk" (hibernation). + + Writing one of the above strings to this file causes the system + to transition into the corresponding state, if available. + + See Documentation/power/states.txt for more information. + +What: /sys/power/mem_sleep +Date: November 2016 +Contact: Rafael J. Wysocki +Description: + The /sys/power/mem_sleep file controls the operating mode of + system suspend. Reading from it returns the available modes + as "s2idle" (always present), "shallow" and "deep" (present if + supported). The mode that will be used on subsequent attempts + to suspend the system (by writing "mem" to the /sys/power/state + file described above) is enclosed in square brackets. + + Writing one of the above strings to this file causes the mode + represented by it to be used on subsequent attempts to suspend + the system. + + See Documentation/power/states.txt for more information. What: /sys/power/disk Date: September 2006 diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 37babf91f2cb..4131e169f97a 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2325,6 +2325,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. memory contents and reserves bad memory regions that are detected. 
+ mem_sleep_default= [SUSPEND] Default system suspend mode: + s2idle - Suspend-To-Idle + shallow - Power-On Suspend or equivalent (if supported) + deep - Suspend-To-RAM or equivalent (if supported) + See Documentation/power/states.txt. + meye.*= [HW] Set MotionEye Camera parameters See Documentation/video4linux/meye.txt. @@ -3668,13 +3674,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted. [KNL, SMP] Set scheduler's default relax_domain_level. See Documentation/cgroup-v1/cpusets.txt. - relative_sleep_states= - [SUSPEND] Use sleep state labeling where the deepest - state available other than hibernation is always "mem". - Format: { "0" | "1" } - 0 -- Traditional sleep state labels. - 1 -- Relative sleep state labels. - reserve= [KNL,BUGS] Force the kernel to ignore some iomem area reservetop= [X86-32] diff --git a/Documentation/power/states.txt b/Documentation/power/states.txt index 50f3ef9177c1..008ecb588317 100644 --- a/Documentation/power/states.txt +++ b/Documentation/power/states.txt @@ -8,25 +8,41 @@ for each state. The states are represented by strings that can be read or written to the /sys/power/state file. Those strings may be "mem", "standby", "freeze" and -"disk", where the last one always represents hibernation (Suspend-To-Disk) and -the meaning of the remaining ones depends on the relative_sleep_states command -line argument. - -For relative_sleep_states=1, the strings "mem", "standby" and "freeze" label the -available non-hibernation sleep states from the deepest to the shallowest, -respectively. In that case, "mem" is always present in /sys/power/state, -because there is at least one non-hibernation sleep state in every system. If -the given system supports two non-hibernation sleep states, "standby" is present -in /sys/power/state in addition to "mem". If the system supports three -non-hibernation sleep states, "freeze" will be present in /sys/power/state in -addition to "mem" and "standby". - -For relative_sleep_states=0, which is the default, the following descriptions -apply. - -state: Suspend-To-Idle +"disk", where the last three always represent Power-On Suspend (if supported), +Suspend-To-Idle and hibernation (Suspend-To-Disk), respectively. + +The meaning of the "mem" string is controlled by the /sys/power/mem_sleep file. +It contains strings representing the available modes of system suspend that may +be triggered by writing "mem" to /sys/power/state. These modes are "s2idle" +(Suspend-To-Idle), "shallow" (Power-On Suspend) and "deep" (Suspend-To-RAM). +The "s2idle" mode is always available, while the other ones are only available +if supported by the platform (if not supported, the strings representing them +are not present in /sys/power/mem_sleep). The string representing the suspend +mode to be used subsequently is enclosed in square brackets. Writing one of +the other strings present in /sys/power/mem_sleep to it causes the suspend mode +to be used subsequently to change to the one represented by that string. + +Consequently, there are two ways to cause the system to go into the +Suspend-To-Idle sleep state. The first one is to write "freeze" directly to +/sys/power/state. The second one is to write "s2idle" to /sys/power/mem_sleep +and then to write "mem" to /sys/power/state. Similarly, there are two ways +to cause the system to go into the Power-On Suspend sleep state (the strings to +write to the control files in that case are "standby" or "shallow" and "mem", +respectively) if that state is supported by the platform.
In turn, there is +only one way to cause the system to go into the Suspend-To-RAM state (write +"deep" into /sys/power/mem_sleep and "mem" into /sys/power/state). + +The default suspend mode (ie. the one to be used without writing anything into +/sys/power/mem_sleep) is either "deep" (if Suspend-To-RAM is supported) or +"s2idle", but it can be overridden by the value of the "mem_sleep_default" +parameter in the kernel command line. + +The properties of all of the sleep states are described below. + + +State: Suspend-To-Idle ACPI state: S0 -Label: "freeze" +Label: "s2idle" ("freeze") This state is a generic, pure software, light-weight, system sleep state. It allows more energy to be saved relative to runtime idle by freezing user @@ -35,13 +51,13 @@ lower-power than available at run time), such that the processors can spend more time in their idle states. This state can be used for platforms without Power-On Suspend/Suspend-to-RAM -support, or it can be used in addition to Suspend-to-RAM (memory sleep) -to provide reduced resume latency. It is always supported. +support, or it can be used in addition to Suspend-to-RAM to provide reduced +resume latency. It is always supported. State: Standby / Power-On Suspend ACPI State: S1 -Label: "standby" +Label: "shallow" ("standby") This state, if supported, offers moderate, though real, power savings, while providing a relatively low-latency transition back to a working system. No @@ -58,7 +74,7 @@ state. State: Suspend-to-RAM ACPI State: S3 -Label: "mem" +Label: "deep" This state, if supported, offers significant power savings as everything in the system is put into a low-power state, except for memory, which should be placed diff --git a/kernel/power/main.c b/kernel/power/main.c index 281a697fd458..d401c21136d1 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -78,6 +78,78 @@ static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, power_attr(pm_async); +#ifdef CONFIG_SUSPEND +static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + char *s = buf; + suspend_state_t i; + + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) + if (mem_sleep_states[i]) { + const char *label = mem_sleep_states[i]; + + if (mem_sleep_current == i) + s += sprintf(s, "[%s] ", label); + else + s += sprintf(s, "%s ", label); + } + + /* Convert the last space to a newline if needed. */ + if (s != buf) + *(s-1) = '\n'; + + return (s - buf); +} + +static suspend_state_t decode_suspend_state(const char *buf, size_t n) +{ + suspend_state_t state; + char *p; + int len; + + p = memchr(buf, '\n', n); + len = p ? p - buf : n; + + for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { + const char *label = mem_sleep_states[state]; + + if (label && len == strlen(label) && !strncmp(buf, label, len)) + return state; + } + + return PM_SUSPEND_ON; +} + +static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + suspend_state_t state; + int error; + + error = pm_autosleep_lock(); + if (error) + return error; + + if (pm_autosleep_state() > PM_SUSPEND_ON) { + error = -EBUSY; + goto out; + } + + state = decode_suspend_state(buf, n); + if (state < PM_SUSPEND_MAX && state > PM_SUSPEND_ON) + mem_sleep_current = state; + else + error = -EINVAL; + + out: + pm_autosleep_unlock(); + return error ? 
error : n; +} + +power_attr(mem_sleep); +#endif /* CONFIG_SUSPEND */ + #ifdef CONFIG_PM_DEBUG int pm_test_level = TEST_NONE; @@ -368,12 +440,16 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, } state = decode_state(buf, n); - if (state < PM_SUSPEND_MAX) + if (state < PM_SUSPEND_MAX) { + if (state == PM_SUSPEND_MEM) + state = mem_sleep_current; + error = pm_suspend(state); - else if (state == PM_SUSPEND_MAX) + } else if (state == PM_SUSPEND_MAX) { error = hibernate(); - else + } else { error = -EINVAL; + } out: pm_autosleep_unlock(); @@ -485,6 +561,9 @@ static ssize_t autosleep_store(struct kobject *kobj, && strcmp(buf, "off") && strcmp(buf, "off\n")) return -EINVAL; + if (state == PM_SUSPEND_MEM) + state = mem_sleep_current; + error = pm_autosleep_set_state(state); return error ? error : n; } @@ -602,6 +681,9 @@ static struct attribute * g[] = { #ifdef CONFIG_PM_SLEEP &pm_async_attr.attr, &wakeup_count_attr.attr, +#ifdef CONFIG_SUSPEND + &mem_sleep_attr.attr, +#endif #ifdef CONFIG_PM_AUTOSLEEP &autosleep_attr.attr, #endif diff --git a/kernel/power/power.h b/kernel/power/power.h index 56d1d0dedf76..1dfa0da827d3 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -189,11 +189,15 @@ extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *); #ifdef CONFIG_SUSPEND /* kernel/power/suspend.c */ -extern const char *pm_labels[]; +extern const char * const pm_labels[]; extern const char *pm_states[]; +extern const char *mem_sleep_states[]; +extern suspend_state_t mem_sleep_current; extern int suspend_devices_and_enter(suspend_state_t state); #else /* !CONFIG_SUSPEND */ +#define mem_sleep_current PM_SUSPEND_ON + static inline int suspend_devices_and_enter(suspend_state_t state) { return -ENOSYS; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6ccb08f57fcb..15e6baef5c73 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -32,8 +32,21 @@ #include "power.h" -const char *pm_labels[] = { "mem", "standby", "freeze", NULL }; +const char * const pm_labels[] = { + [PM_SUSPEND_FREEZE] = "freeze", + [PM_SUSPEND_STANDBY] = "standby", + [PM_SUSPEND_MEM] = "mem", +}; const char *pm_states[PM_SUSPEND_MAX]; +static const char * const mem_sleep_labels[] = { + [PM_SUSPEND_FREEZE] = "s2idle", + [PM_SUSPEND_STANDBY] = "shallow", + [PM_SUSPEND_MEM] = "deep", +}; +const char *mem_sleep_states[PM_SUSPEND_MAX]; + +suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE; +static suspend_state_t mem_sleep_default = PM_SUSPEND_MEM; unsigned int pm_suspend_global_flags; EXPORT_SYMBOL_GPL(pm_suspend_global_flags); @@ -110,30 +123,32 @@ static bool valid_state(suspend_state_t state) return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); } -/* - * If this is set, the "mem" label always corresponds to the deepest sleep state - * available, the "standby" label corresponds to the second deepest sleep state - * available (if any), and the "freeze" label corresponds to the remaining - * available sleep state (if there is one). - */ -static bool relative_states; - void __init pm_states_init(void) { + /* "mem" and "freeze" are always present in /sys/power/state. */ + pm_states[PM_SUSPEND_MEM] = pm_labels[PM_SUSPEND_MEM]; + pm_states[PM_SUSPEND_FREEZE] = pm_labels[PM_SUSPEND_FREEZE]; /* - * freeze state should be supported even without any suspend_ops, - * initialize pm_states accordingly here + * Suspend-to-idle should be supported even without any suspend_ops, + * initialize mem_sleep_states[] accordingly here. 
*/ - pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2]; + mem_sleep_states[PM_SUSPEND_FREEZE] = mem_sleep_labels[PM_SUSPEND_FREEZE]; } -static int __init sleep_states_setup(char *str) +static int __init mem_sleep_default_setup(char *str) { - relative_states = !strncmp(str, "1", 1); + suspend_state_t state; + + for (state = PM_SUSPEND_FREEZE; state <= PM_SUSPEND_MEM; state++) + if (mem_sleep_labels[state] && + !strcmp(str, mem_sleep_labels[state])) { + mem_sleep_default = state; + break; + } + return 1; } - -__setup("relative_sleep_states=", sleep_states_setup); +__setup("mem_sleep_default=", mem_sleep_default_setup); /** * suspend_set_ops - Set the global suspend method table. @@ -141,21 +156,21 @@ __setup("relative_sleep_states=", sleep_states_setup); */ void suspend_set_ops(const struct platform_suspend_ops *ops) { - suspend_state_t i; - int j = 0; - lock_system_sleep(); suspend_ops = ops; - for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) - if (valid_state(i)) { - pm_states[i] = pm_labels[j++]; - } else if (!relative_states) { - pm_states[i] = NULL; - j++; - } - pm_states[PM_SUSPEND_FREEZE] = pm_labels[j]; + if (valid_state(PM_SUSPEND_STANDBY)) { + mem_sleep_states[PM_SUSPEND_STANDBY] = mem_sleep_labels[PM_SUSPEND_STANDBY]; + pm_states[PM_SUSPEND_STANDBY] = pm_labels[PM_SUSPEND_STANDBY]; + if (mem_sleep_default == PM_SUSPEND_STANDBY) + mem_sleep_current = PM_SUSPEND_STANDBY; + } + if (valid_state(PM_SUSPEND_MEM)) { + mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM]; + if (mem_sleep_default == PM_SUSPEND_MEM) + mem_sleep_current = PM_SUSPEND_MEM; + } unlock_system_sleep(); } -- cgit v1.2.3 From 08b98d3291652bdcd1029a059e39fbcae5ad93e2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 17 Nov 2016 03:28:53 +0100 Subject: PM / sleep / ACPI: Use the ACPI_FADT_LOW_POWER_S0 flag Modify the ACPI system sleep support setup code to select suspend-to-idle as the default system sleep state if the ACPI_FADT_LOW_POWER_S0 flag is set in the FADT and the default sleep state was not selected from the kernel command line. Signed-off-by: Rafael J. Wysocki Tested-by: Mario Limonciello --- Documentation/power/states.txt | 4 +++- drivers/acpi/sleep.c | 8 ++++++++ include/linux/suspend.h | 2 ++ kernel/power/suspend.c | 4 ++-- 4 files changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/power/states.txt b/Documentation/power/states.txt index 008ecb588317..8a39ce45d8a0 100644 --- a/Documentation/power/states.txt +++ b/Documentation/power/states.txt @@ -35,7 +35,9 @@ only one way to cause the system to go into the Suspend-To-RAM state (write The default suspend mode (ie. the one to be used without writing anything into /sys/power/mem_sleep) is either "deep" (if Suspend-To-RAM is supported) or "s2idle", but it can be overridden by the value of the "mem_sleep_default" -parameter in the kernel command line. +parameter in the kernel command line. On some ACPI-based systems, depending on +the information in the FADT, the default may be "s2idle" even if Suspend-To-RAM +is supported. The properties of all of the sleep states are described below. 
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index deb0ff78eba8..ce1855fd584b 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -691,6 +691,14 @@ static void acpi_sleep_suspend_setup(void) if (acpi_sleep_state_supported(i)) sleep_states[i] = 1; + /* + * Use suspend-to-idle by default if ACPI_FADT_LOW_POWER_S0 is set and + * the default suspend mode was not selected from the command line. + */ + if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0 && + mem_sleep_default > PM_SUSPEND_MEM) + mem_sleep_default = PM_SUSPEND_FREEZE; + suspend_set_ops(old_suspend_ordering ? &acpi_suspend_ops_old : &acpi_suspend_ops); freeze_set_ops(&acpi_freeze_ops); diff --git a/include/linux/suspend.h b/include/linux/suspend.h index d9718378a8be..0c729c3c8549 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -194,6 +194,8 @@ struct platform_freeze_ops { }; #ifdef CONFIG_SUSPEND +extern suspend_state_t mem_sleep_default; + /** * suspend_set_ops - set platform dependent suspend operations * @ops: The new suspend operations to set. diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 15e6baef5c73..f67ceb7768b8 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -46,7 +46,7 @@ static const char * const mem_sleep_labels[] = { const char *mem_sleep_states[PM_SUSPEND_MAX]; suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE; -static suspend_state_t mem_sleep_default = PM_SUSPEND_MEM; +suspend_state_t mem_sleep_default = PM_SUSPEND_MAX; unsigned int pm_suspend_global_flags; EXPORT_SYMBOL_GPL(pm_suspend_global_flags); @@ -168,7 +168,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops) } if (valid_state(PM_SUSPEND_MEM)) { mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM]; - if (mem_sleep_default == PM_SUSPEND_MEM) + if (mem_sleep_default >= PM_SUSPEND_MEM) mem_sleep_current = PM_SUSPEND_MEM; } -- cgit v1.2.3 From d06e622d3d9206e6a2cc45a0f9a3256da8773ff4 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 24 Nov 2016 13:51:11 +0530 Subject: cpufreq: schedutil: Rectify comment in sugov_irq_work() function This patch rectifies a comment present in sugov_irq_work() function to follow proper grammar. Suggested-by: Ingo Molnar Signed-off-by: Viresh Kumar Acked-by: Ingo Molnar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 42a220e78f00..fd4659313640 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -315,15 +315,15 @@ static void sugov_irq_work(struct irq_work *irq_work) sg_policy = container_of(irq_work, struct sugov_policy, irq_work); /* - * For Real Time and Deadline tasks, schedutil governor shoots the - * frequency to maximum. And special care must be taken to ensure that - * this kthread doesn't result in that. + * For RT and deadline tasks, the schedutil governor shoots the + * frequency to maximum. Special care must be taken to ensure that this + * kthread doesn't result in the same behavior. * * This is (mostly) guaranteed by the work_in_progress flag. The flag is - * updated only at the end of the sugov_work() and before that schedutil - * rejects all other frequency scaling requests. + * updated only at the end of the sugov_work() function and before that + * the schedutil governor rejects all other frequency scaling requests. 
* - * Though there is a very rare case where the RT thread yields right + * There is a very rare case though, where the RT thread yields right * after the work_in_progress flag is cleared. The effects of that are * neglected for now. */ -- cgit v1.2.3 From bb8313b603eb8fd52de48a079bfcd72dcab2ef1e Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Mon, 28 Nov 2016 23:03:04 -0800 Subject: cpuidle: Allow enforcing deepest idle state selection When idle injection is used to cap power, we need to override the governor's choice of idle states. For this reason, make it possible to enforce selection of the deepest idle state by setting a flag on a given CPU, so as to achieve the maximum potential power draw reduction. Signed-off-by: Jacob Pan [ rjw: Subject & changelog ] Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle.c | 13 ++++++++++++- include/linux/cpuidle.h | 7 ++++++- kernel/sched/idle.c | 13 ++++++++----- 3 files changed, 26 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index c73207abb5a4..afc005b917fe 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -97,7 +97,17 @@ static int find_deepest_state(struct cpuidle_driver *drv, return ret; } -#ifdef CONFIG_SUSPEND +/* Set the current cpu to use the deepest idle state, override governors */ +void cpuidle_use_deepest_state(bool enable) +{ + struct cpuidle_device *dev; + + preempt_disable(); + dev = cpuidle_get_device(); + dev->use_deepest_state = enable; + preempt_enable(); +} + /** * cpuidle_find_deepest_state - Find the deepest available idle state. * @drv: cpuidle driver for the given CPU. @@ -109,6 +119,7 @@ int cpuidle_find_deepest_state(struct cpuidle_driver *drv, return find_deepest_state(drv, dev, UINT_MAX, 0, false); } +#ifdef CONFIG_SUSPEND static void enter_freeze_proper(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index) { diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 15deea449edc..da346f2817a8 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -74,6 +74,7 @@ struct cpuidle_driver_kobj; struct cpuidle_device { unsigned int registered:1; unsigned int enabled:1; + unsigned int use_deepest_state:1; unsigned int cpu; int last_residency; @@ -192,11 +193,12 @@ static inline struct cpuidle_driver *cpuidle_get_cpu_driver( static inline struct cpuidle_device *cpuidle_get_device(void) {return NULL; } #endif -#if defined(CONFIG_CPU_IDLE) && defined(CONFIG_SUSPEND) +#ifdef CONFIG_CPU_IDLE extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev); extern int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev); +extern void cpuidle_use_deepest_state(bool enable); #else static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev) @@ -204,6 +206,9 @@ static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev) {return -ENODEV; } +static inline void cpuidle_use_deepest_state(bool enable) +{ +} #endif /* kernel/sched/idle.c */ diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 1d8718d5300d..513e4dfeeae7 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -164,11 +164,14 @@ static void cpuidle_idle_call(void) * timekeeping to prevent timer interrupts from kicking us out of idle * until a proper wakeup interrupt happens.
*/ - if (idle_should_freeze()) { - entered_state = cpuidle_enter_freeze(drv, dev); - if (entered_state > 0) { - local_irq_enable(); - goto exit_idle; + + if (idle_should_freeze() || dev->use_deepest_state) { + if (idle_should_freeze()) { + entered_state = cpuidle_enter_freeze(drv, dev); + if (entered_state > 0) { + local_irq_enable(); + goto exit_idle; + } } next_state = cpuidle_find_deepest_state(drv, dev); -- cgit v1.2.3 From c1de45ca831acee9b72c9320dde447edafadb43f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Nov 2016 23:03:05 -0800 Subject: sched/idle: Add support for tasks that inject idle Idle injection drivers such as Intel powerclamp and ACPI PAD drivers use realtime tasks to take control of CPU then inject idle. There are two issues with this approach: 1. Low efficiency: injected idle task is treated as busy so sched ticks do not stop during injected idle period, the result of these unwanted wakeups can be ~20% loss in power savings. 2. Idle accounting: injected idle time is presented to user as busy. This patch addresses the issues by introducing a new PF_IDLE flag which allows any given task to be treated as idle task while the flag is set. Therefore, idle injection tasks can run through the normal flow of NOHZ idle enter/exit to get the correct accounting as well as tick stop when possible. The implication is that idle task is then no longer limited to PID == 0. Acked-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Jacob Pan Signed-off-by: Rafael J. Wysocki --- include/linux/cpu.h | 2 + include/linux/sched.h | 3 +- kernel/fork.c | 2 +- kernel/sched/core.c | 1 + kernel/sched/idle.c | 162 +++++++++++++++++++++++++++++++------------------- 5 files changed, 107 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index b886dc17f2f3..ac0efae38072 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -245,6 +245,8 @@ void arch_cpu_idle_dead(void); int cpu_report_state(int cpu); int cpu_check_up_prepare(int cpu); void cpu_set_state_online(int cpu); +void play_idle(unsigned long duration_ms); + #ifdef CONFIG_HOTPLUG_CPU bool cpu_wait_death(unsigned int cpu, int seconds); bool cpu_report_death(void); diff --git a/include/linux/sched.h b/include/linux/sched.h index 348f51b0ec92..114c7fcb6af6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2254,6 +2254,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, /* * Per process flags */ +#define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ @@ -2609,7 +2610,7 @@ extern struct task_struct *idle_task(int cpu); */ static inline bool is_idle_task(const struct task_struct *p) { - return p->pid == 0; + return !!(p->flags & PF_IDLE); } extern struct task_struct *curr_task(int cpu); extern void ia64_set_curr_task(int cpu, struct task_struct *p); diff --git a/kernel/fork.c b/kernel/fork.c index 623259fc794d..5074b2f0827b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1537,7 +1537,7 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ - p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); + p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE); p->flags |= PF_FORKNOEXEC; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); diff --git 
a/kernel/sched/core.c b/kernel/sched/core.c index 94732d1ab00a..63b3a8a49884 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5285,6 +5285,7 @@ void init_idle(struct task_struct *idle, int cpu) __sched_fork(0, idle); idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); + idle->flags |= PF_IDLE; kasan_unpoison_task_stack(idle); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 513e4dfeeae7..6a4bae0a649d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -205,76 +205,65 @@ exit_idle: * * Called with polling cleared. */ -static void cpu_idle_loop(void) +static void do_idle(void) { - int cpu = smp_processor_id(); - - while (1) { - /* - * If the arch has a polling bit, we maintain an invariant: - * - * Our polling bit is clear if we're not scheduled (i.e. if - * rq->curr != rq->idle). This means that, if rq->idle has - * the polling bit set, then setting need_resched is - * guaranteed to cause the cpu to reschedule. - */ - - __current_set_polling(); - quiet_vmstat(); - tick_nohz_idle_enter(); + /* + * If the arch has a polling bit, we maintain an invariant: + * + * Our polling bit is clear if we're not scheduled (i.e. if rq->curr != + * rq->idle). This means that, if rq->idle has the polling bit set, + * then setting need_resched is guaranteed to cause the CPU to + * reschedule. + */ - while (!need_resched()) { - check_pgt_cache(); - rmb(); + __current_set_polling(); + tick_nohz_idle_enter(); - if (cpu_is_offline(cpu)) { - cpuhp_report_idle_dead(); - arch_cpu_idle_dead(); - } + while (!need_resched()) { + check_pgt_cache(); + rmb(); - local_irq_disable(); - arch_cpu_idle_enter(); - - /* - * In poll mode we reenable interrupts and spin. - * - * Also if we detected in the wakeup from idle - * path that the tick broadcast device expired - * for us, we don't want to go deep idle as we - * know that the IPI is going to arrive right - * away - */ - if (cpu_idle_force_poll || tick_check_broadcast_expired()) - cpu_idle_poll(); - else - cpuidle_idle_call(); - - arch_cpu_idle_exit(); + if (cpu_is_offline(smp_processor_id())) { + cpuhp_report_idle_dead(); + arch_cpu_idle_dead(); } - /* - * Since we fell out of the loop above, we know - * TIF_NEED_RESCHED must be set, propagate it into - * PREEMPT_NEED_RESCHED. - * - * This is required because for polling idle loops we will - * not have had an IPI to fold the state for us. - */ - preempt_set_need_resched(); - tick_nohz_idle_exit(); - __current_clr_polling(); + local_irq_disable(); + arch_cpu_idle_enter(); /* - * We promise to call sched_ttwu_pending and reschedule - * if need_resched is set while polling is set. That - * means that clearing polling needs to be visible - * before doing these things. + * In poll mode we reenable interrupts and spin. Also if we + * detected in the wakeup from idle path that the tick + * broadcast device expired for us, we don't want to go deep + * idle as we know that the IPI is going to arrive right away. */ - smp_mb__after_atomic(); - - sched_ttwu_pending(); - schedule_preempt_disabled(); + if (cpu_idle_force_poll || tick_check_broadcast_expired()) + cpu_idle_poll(); + else + cpuidle_idle_call(); + arch_cpu_idle_exit(); } + + /* + * Since we fell out of the loop above, we know TIF_NEED_RESCHED must + * be set, propagate it into PREEMPT_NEED_RESCHED. + * + * This is required because for polling idle loops we will not have had + * an IPI to fold the state for us. 
+ */ + preempt_set_need_resched(); + tick_nohz_idle_exit(); + __current_clr_polling(); + + /* + * We promise to call sched_ttwu_pending() and reschedule if + * need_resched() is set while polling is set. That means that clearing + * polling needs to be visible before doing these things. + */ + smp_mb__after_atomic(); + + sched_ttwu_pending(); + schedule_preempt_disabled(); } bool cpu_in_idle(unsigned long pc) @@ -283,6 +272,56 @@ bool cpu_in_idle(unsigned long pc) pc < (unsigned long)__cpuidle_text_end; } +struct idle_timer { + struct hrtimer timer; + int done; +}; + +static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer) +{ + struct idle_timer *it = container_of(timer, struct idle_timer, timer); + + WRITE_ONCE(it->done, 1); + set_tsk_need_resched(current); + + return HRTIMER_NORESTART; +} + +void play_idle(unsigned long duration_ms) +{ + struct idle_timer it; + + /* + * Only FIFO tasks can disable the tick since they don't need the forced + * preemption. + */ + WARN_ON_ONCE(current->policy != SCHED_FIFO); + WARN_ON_ONCE(current->nr_cpus_allowed != 1); + WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); + WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); + WARN_ON_ONCE(!duration_ms); + + rcu_sleep_check(); + preempt_disable(); + current->flags |= PF_IDLE; + cpuidle_use_deepest_state(true); + + it.done = 0; + hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + it.timer.function = idle_inject_timer_fn; + hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED); + + while (!READ_ONCE(it.done)) + do_idle(); + + cpuidle_use_deepest_state(false); + current->flags &= ~PF_IDLE; + + preempt_fold_need_resched(); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(play_idle); + void cpu_startup_entry(enum cpuhp_state state) { /* @@ -302,5 +341,6 @@ void cpu_startup_entry(enum cpuhp_state state) #endif arch_cpu_idle_prepare(); cpuhp_online_idle(state); - cpu_idle_loop(); + while (1) + do_idle(); } -- cgit v1.2.3
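A natural consumer of the last two patches is an idle injection driver (powerclamp-style) calling play_idle() from its own per-CPU kernel thread. The sketch below is illustrative only and is not taken from any driver in the tree; idle_inject_fn() and start_idle_injector() are hypothetical names, and the setup simply mirrors the constraints play_idle() WARNs about (a SCHED_FIFO kernel thread bound to a single CPU):

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/cpu.h>

/* Inject a burst of forced idle each time the driver wakes this thread. */
static int idle_inject_fn(void *data)
{
	while (!kthread_should_stop()) {
		play_idle(10);		/* forced idle duration, in ms */

		set_current_state(TASK_INTERRUPTIBLE);
		if (!kthread_should_stop())
			schedule();
		__set_current_state(TASK_RUNNING);
	}
	return 0;
}

static struct task_struct *start_idle_injector(unsigned int cpu)
{
	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
	struct task_struct *thread;

	/*
	 * kthread_create_on_cpu() binds the thread to @cpu and sets
	 * PF_NO_SETAFFINITY, so the affinity checks in play_idle() are
	 * satisfied once the thread is also made SCHED_FIFO.
	 */
	thread = kthread_create_on_cpu(idle_inject_fn, NULL, cpu,
				       "idle_inject/%u");
	if (IS_ERR(thread))
		return thread;

	sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
	wake_up_process(thread);

	return thread;
}

With this in place, whatever logic caps power wakes the injector threads periodically; for the duration of each play_idle() call the CPU runs with PF_IDLE set and the deepest available idle state enforced, so the time is accounted as idle and the tick can stop, which is exactly what the PF_IDLE and cpuidle_use_deepest_state() machinery above provides.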