summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRafael J. Wysocki <rafael.j.wysocki@intel.com>2017-08-21 01:50:20 +0200
committerRafael J. Wysocki <rafael.j.wysocki@intel.com>2017-08-21 01:50:20 +0200
commit57ccaf33845491ac7ee41796511cec8dcd49777e (patch)
treef45043d506b4ce4e93aaf059c87538558c8f90fe
parentc587c79f90632df59c61383c6abebb2e07a81911 (diff)
parentd77d4888cb8458b098accd4d7555c0f7f6399c4e (diff)
downloadlinux-57ccaf33845491ac7ee41796511cec8dcd49777e.tar.bz2
Merge back intel_pstate material for v4.14.
-rw-r--r--Documentation/admin-guide/pm/intel_pstate.rst61
-rw-r--r--drivers/cpufreq/intel_pstate.c316
2 files changed, 28 insertions, 349 deletions
diff --git a/Documentation/admin-guide/pm/intel_pstate.rst b/Documentation/admin-guide/pm/intel_pstate.rst
index 1d6249825efc..d2b6fda3d67b 100644
--- a/Documentation/admin-guide/pm/intel_pstate.rst
+++ b/Documentation/admin-guide/pm/intel_pstate.rst
@@ -167,35 +167,17 @@ is set.
``powersave``
.............
-Without HWP, this P-state selection algorithm generally depends on the
-processor model and/or the system profile setting in the ACPI tables and there
-are two variants of it.
-
-One of them is used with processors from the Atom line and (regardless of the
-processor model) on platforms with the system profile in the ACPI tables set to
-"mobile" (laptops mostly), "tablet", "appliance PC", "desktop", or
-"workstation". It is also used with processors supporting the HWP feature if
-that feature has not been enabled (that is, with the ``intel_pstate=no_hwp``
-argument in the kernel command line). It is similar to the algorithm
+Without HWP, this P-state selection algorithm is similar to the algorithm
implemented by the generic ``schedutil`` scaling governor except that the
utilization metric used by it is based on numbers coming from feedback
registers of the CPU. It generally selects P-states proportional to the
-current CPU utilization, so it is referred to as the "proportional" algorithm.
-
-The second variant of the ``powersave`` P-state selection algorithm, used in all
-of the other cases (generally, on processors from the Core line, so it is
-referred to as the "Core" algorithm), is based on the values read from the APERF
-and MPERF feedback registers and the previously requested target P-state.
-It does not really take CPU utilization into account explicitly, but as a rule
-it causes the CPU P-state to ramp up very quickly in response to increased
-utilization which is generally desirable in server environments.
-
-Regardless of the variant, this algorithm is run by the driver's utilization
-update callback for the given CPU when it is invoked by the CPU scheduler, but
-not more often than every 10 ms (that can be tweaked via ``debugfs`` in `this
-particular case <Tuning Interface in debugfs_>`_). Like in the ``performance``
-case, the hardware configuration is not touched if the new P-state turns out to
-be the same as the current one.
+current CPU utilization.
+
+This algorithm is run by the driver's utilization update callback for the
+given CPU when it is invoked by the CPU scheduler, but not more often than
+every 10 ms. Like in the ``performance`` case, the hardware configuration
+is not touched if the new P-state turns out to be the same as the current
+one.
This is the default P-state selection algorithm if the
:c:macro:`CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE` kernel configuration option
@@ -720,34 +702,7 @@ P-state is called, the ``ftrace`` filter can be set to
gnome-shell-3409 [001] ..s. 2537.650850: intel_pstate_set_pstate <-intel_pstate_timer_func
<idle>-0 [000] ..s. 2537.654843: intel_pstate_set_pstate <-intel_pstate_timer_func
-Tuning Interface in ``debugfs``
--------------------------------
-
-The ``powersave`` algorithm provided by ``intel_pstate`` for `the Core line of
-processors in the active mode <powersave_>`_ is based on a `PID controller`_
-whose parameters were chosen to address a number of different use cases at the
-same time. However, it still is possible to fine-tune it to a specific workload
-and the ``debugfs`` interface under ``/sys/kernel/debug/pstate_snb/`` is
-provided for this purpose. [Note that the ``pstate_snb`` directory will be
-present only if the specific P-state selection algorithm matching the interface
-in it actually is in use.]
-
-The following files present in that directory can be used to modify the PID
-controller parameters at run time:
-
-| ``deadband``
-| ``d_gain_pct``
-| ``i_gain_pct``
-| ``p_gain_pct``
-| ``sample_rate_ms``
-| ``setpoint``
-
-Note, however, that achieving desirable results this way generally requires
-expert-level understanding of the power vs performance tradeoff, so extra care
-is recommended when attempting to do that.
-
.. _LCEU2015: http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf
.. _SDM: http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-system-programming-manual-325384.html
.. _ACPI specification: http://www.uefi.org/sites/default/files/resources/ACPI_6_1.pdf
-.. _PID controller: https://en.wikipedia.org/wiki/PID_controller
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 65ee4fcace1f..d5a61f45a00c 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -37,8 +37,7 @@
#include <asm/cpufeature.h>
#include <asm/intel-family.h>
-#define INTEL_PSTATE_DEFAULT_SAMPLING_INTERVAL (10 * NSEC_PER_MSEC)
-#define INTEL_PSTATE_HWP_SAMPLING_INTERVAL (50 * NSEC_PER_MSEC)
+#define INTEL_PSTATE_SAMPLING_INTERVAL (10 * NSEC_PER_MSEC)
#define INTEL_CPUFREQ_TRANSITION_LATENCY 20000
#define INTEL_CPUFREQ_TRANSITION_DELAY 500
@@ -173,28 +172,6 @@ struct vid_data {
};
/**
- * struct _pid - Stores PID data
- * @setpoint: Target set point for busyness or performance
- * @integral: Storage for accumulated error values
- * @p_gain: PID proportional gain
- * @i_gain: PID integral gain
- * @d_gain: PID derivative gain
- * @deadband: PID deadband
- * @last_err: Last error storage for integral part of PID calculation
- *
- * Stores PID coefficients and last error for PID controller.
- */
-struct _pid {
- int setpoint;
- int32_t integral;
- int32_t p_gain;
- int32_t i_gain;
- int32_t d_gain;
- int deadband;
- int32_t last_err;
-};
-
-/**
* struct global_params - Global parameters, mostly tunable via sysfs.
* @no_turbo: Whether or not to use turbo P-states.
* @turbo_disabled: Whether or not turbo P-states are available at all,
@@ -223,7 +200,6 @@ struct global_params {
* @last_update: Time of the last update.
* @pstate: Stores P state limits for this CPU
* @vid: Stores VID limits for this CPU
- * @pid: Stores PID parameters for this CPU
* @last_sample_time: Last Sample time
* @aperf_mperf_shift: Number of clock cycles after aperf, mperf is incremented
* This shift is a multiplier to mperf delta to
@@ -258,7 +234,6 @@ struct cpudata {
struct pstate_data pstate;
struct vid_data vid;
- struct _pid pid;
u64 last_update;
u64 last_sample_time;
@@ -284,28 +259,6 @@ struct cpudata {
static struct cpudata **all_cpu_data;
/**
- * struct pstate_adjust_policy - Stores static PID configuration data
- * @sample_rate_ms: PID calculation sample rate in ms
- * @sample_rate_ns: Sample rate calculation in ns
- * @deadband: PID deadband
- * @setpoint: PID Setpoint
- * @p_gain_pct: PID proportional gain
- * @i_gain_pct: PID integral gain
- * @d_gain_pct: PID derivative gain
- *
- * Stores per CPU model static PID configuration data.
- */
-struct pstate_adjust_policy {
- int sample_rate_ms;
- s64 sample_rate_ns;
- int deadband;
- int setpoint;
- int p_gain_pct;
- int d_gain_pct;
- int i_gain_pct;
-};
-
-/**
* struct pstate_funcs - Per CPU model specific callbacks
* @get_max: Callback to get maximum non turbo effective P state
* @get_max_physical: Callback to get maximum non turbo physical P state
@@ -314,7 +267,6 @@ struct pstate_adjust_policy {
* @get_scaling: Callback to get frequency scaling factor
* @get_val: Callback to convert P state to actual MSR write value
* @get_vid: Callback to get VID data for Atom platforms
- * @update_util: Active mode utilization update callback.
*
* Core and Atom CPU models have different way to get P State limits. This
* structure is used to store those callbacks.
@@ -328,20 +280,9 @@ struct pstate_funcs {
int (*get_aperf_mperf_shift)(void);
u64 (*get_val)(struct cpudata*, int pstate);
void (*get_vid)(struct cpudata *);
- void (*update_util)(struct update_util_data *data, u64 time,
- unsigned int flags);
};
static struct pstate_funcs pstate_funcs __read_mostly;
-static struct pstate_adjust_policy pid_params __read_mostly = {
- .sample_rate_ms = 10,
- .sample_rate_ns = 10 * NSEC_PER_MSEC,
- .deadband = 0,
- .setpoint = 97,
- .p_gain_pct = 20,
- .d_gain_pct = 0,
- .i_gain_pct = 0,
-};
static int hwp_active __read_mostly;
static bool per_cpu_limits __read_mostly;
@@ -509,56 +450,6 @@ static inline void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
}
#endif
-static signed int pid_calc(struct _pid *pid, int32_t busy)
-{
- signed int result;
- int32_t pterm, dterm, fp_error;
- int32_t integral_limit;
-
- fp_error = pid->setpoint - busy;
-
- if (abs(fp_error) <= pid->deadband)
- return 0;
-
- pterm = mul_fp(pid->p_gain, fp_error);
-
- pid->integral += fp_error;
-
- /*
- * We limit the integral here so that it will never
- * get higher than 30. This prevents it from becoming
- * too large an input over long periods of time and allows
- * it to get factored out sooner.
- *
- * The value of 30 was chosen through experimentation.
- */
- integral_limit = int_tofp(30);
- if (pid->integral > integral_limit)
- pid->integral = integral_limit;
- if (pid->integral < -integral_limit)
- pid->integral = -integral_limit;
-
- dterm = mul_fp(pid->d_gain, fp_error - pid->last_err);
- pid->last_err = fp_error;
-
- result = pterm + mul_fp(pid->integral, pid->i_gain) + dterm;
- result = result + (1 << (FRAC_BITS-1));
- return (signed int)fp_toint(result);
-}
-
-static inline void intel_pstate_pid_reset(struct cpudata *cpu)
-{
- struct _pid *pid = &cpu->pid;
-
- pid->p_gain = percent_fp(pid_params.p_gain_pct);
- pid->d_gain = percent_fp(pid_params.d_gain_pct);
- pid->i_gain = percent_fp(pid_params.i_gain_pct);
- pid->setpoint = int_tofp(pid_params.setpoint);
- pid->last_err = pid->setpoint - int_tofp(100);
- pid->deadband = int_tofp(pid_params.deadband);
- pid->integral = 0;
-}
-
static inline void update_turbo_state(void)
{
u64 misc_en;
@@ -911,82 +802,6 @@ static void intel_pstate_update_policies(void)
cpufreq_update_policy(cpu);
}
-/************************** debugfs begin ************************/
-static int pid_param_set(void *data, u64 val)
-{
- unsigned int cpu;
-
- *(u32 *)data = val;
- pid_params.sample_rate_ns = pid_params.sample_rate_ms * NSEC_PER_MSEC;
- for_each_possible_cpu(cpu)
- if (all_cpu_data[cpu])
- intel_pstate_pid_reset(all_cpu_data[cpu]);
-
- return 0;
-}
-
-static int pid_param_get(void *data, u64 *val)
-{
- *val = *(u32 *)data;
- return 0;
-}
-DEFINE_SIMPLE_ATTRIBUTE(fops_pid_param, pid_param_get, pid_param_set, "%llu\n");
-
-static struct dentry *debugfs_parent;
-
-struct pid_param {
- char *name;
- void *value;
- struct dentry *dentry;
-};
-
-static struct pid_param pid_files[] = {
- {"sample_rate_ms", &pid_params.sample_rate_ms, },
- {"d_gain_pct", &pid_params.d_gain_pct, },
- {"i_gain_pct", &pid_params.i_gain_pct, },
- {"deadband", &pid_params.deadband, },
- {"setpoint", &pid_params.setpoint, },
- {"p_gain_pct", &pid_params.p_gain_pct, },
- {NULL, NULL, }
-};
-
-static void intel_pstate_debug_expose_params(void)
-{
- int i;
-
- debugfs_parent = debugfs_create_dir("pstate_snb", NULL);
- if (IS_ERR_OR_NULL(debugfs_parent))
- return;
-
- for (i = 0; pid_files[i].name; i++) {
- struct dentry *dentry;
-
- dentry = debugfs_create_file(pid_files[i].name, 0660,
- debugfs_parent, pid_files[i].value,
- &fops_pid_param);
- if (!IS_ERR(dentry))
- pid_files[i].dentry = dentry;
- }
-}
-
-static void intel_pstate_debug_hide_params(void)
-{
- int i;
-
- if (IS_ERR_OR_NULL(debugfs_parent))
- return;
-
- for (i = 0; pid_files[i].name; i++) {
- debugfs_remove(pid_files[i].dentry);
- pid_files[i].dentry = NULL;
- }
-
- debugfs_remove(debugfs_parent);
- debugfs_parent = NULL;
-}
-
-/************************** debugfs end ************************/
-
/************************** sysfs begin ************************/
#define show_one(file_name, object) \
static ssize_t show_##file_name \
@@ -1622,7 +1437,7 @@ static inline int32_t get_avg_pstate(struct cpudata *cpu)
cpu->sample.core_avg_perf);
}
-static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu)
+static inline int32_t get_target_pstate(struct cpudata *cpu)
{
struct sample *sample = &cpu->sample;
int32_t busy_frac, boost;
@@ -1660,44 +1475,6 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu)
return target;
}
-static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
-{
- int32_t perf_scaled, max_pstate, current_pstate, sample_ratio;
- u64 duration_ns;
-
- /*
- * perf_scaled is the ratio of the average P-state during the last
- * sampling period to the P-state requested last time (in percent).
- *
- * That measures the system's response to the previous P-state
- * selection.
- */
- max_pstate = cpu->pstate.max_pstate_physical;
- current_pstate = cpu->pstate.current_pstate;
- perf_scaled = mul_ext_fp(cpu->sample.core_avg_perf,
- div_fp(100 * max_pstate, current_pstate));
-
- /*
- * Since our utilization update callback will not run unless we are
- * in C0, check if the actual elapsed time is significantly greater (3x)
- * than our sample interval. If it is, then we were idle for a long
- * enough period of time to adjust our performance metric.
- */
- duration_ns = cpu->sample.time - cpu->last_sample_time;
- if ((s64)duration_ns > pid_params.sample_rate_ns * 3) {
- sample_ratio = div_fp(pid_params.sample_rate_ns, duration_ns);
- perf_scaled = mul_fp(perf_scaled, sample_ratio);
- } else {
- sample_ratio = div_fp(100 * (cpu->sample.mperf << cpu->aperf_mperf_shift),
- cpu->sample.tsc);
- if (sample_ratio < int_tofp(1))
- perf_scaled = 0;
- }
-
- cpu->sample.busy_scaled = perf_scaled;
- return cpu->pstate.current_pstate - pid_calc(&cpu->pid, perf_scaled);
-}
-
static int intel_pstate_prepare_request(struct cpudata *cpu, int pstate)
{
int max_pstate = intel_pstate_get_base_pstate(cpu);
@@ -1717,13 +1494,15 @@ static void intel_pstate_update_pstate(struct cpudata *cpu, int pstate)
wrmsrl(MSR_IA32_PERF_CTL, pstate_funcs.get_val(cpu, pstate));
}
-static void intel_pstate_adjust_pstate(struct cpudata *cpu, int target_pstate)
+static void intel_pstate_adjust_pstate(struct cpudata *cpu)
{
int from = cpu->pstate.current_pstate;
struct sample *sample;
+ int target_pstate;
update_turbo_state();
+ target_pstate = get_target_pstate(cpu);
target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
trace_cpu_frequency(target_pstate * cpu->pstate.scaling, cpu->cpu);
intel_pstate_update_pstate(cpu, target_pstate);
@@ -1740,23 +1519,6 @@ static void intel_pstate_adjust_pstate(struct cpudata *cpu, int target_pstate)
fp_toint(cpu->iowait_boost * 100));
}
-static void intel_pstate_update_util_pid(struct update_util_data *data,
- u64 time, unsigned int flags)
-{
- struct cpudata *cpu = container_of(data, struct cpudata, update_util);
- u64 delta_ns = time - cpu->sample.time;
-
- if ((s64)delta_ns < pid_params.sample_rate_ns)
- return;
-
- if (intel_pstate_sample(cpu, time)) {
- int target_pstate;
-
- target_pstate = get_target_pstate_use_performance(cpu);
- intel_pstate_adjust_pstate(cpu, target_pstate);
- }
-}
-
static void intel_pstate_update_util(struct update_util_data *data, u64 time,
unsigned int flags)
{
@@ -1765,6 +1527,15 @@ static void intel_pstate_update_util(struct update_util_data *data, u64 time,
if (flags & SCHED_CPUFREQ_IOWAIT) {
cpu->iowait_boost = int_tofp(1);
+ cpu->last_update = time;
+ /*
+ * If the busy metric was 100% at the last sample, the P-state was
+ * already at its maximum, so skip the overhead of recomputing it.
+ */
+ if (fp_toint(cpu->sample.busy_scaled) == 100)
+ return;
+
+ goto set_pstate;
} else if (cpu->iowait_boost) {
/* Clear iowait_boost if the CPU may have been idle. */
delta_ns = time - cpu->last_update;
@@ -1773,15 +1544,12 @@ static void intel_pstate_update_util(struct update_util_data *data, u64 time,
}
cpu->last_update = time;
delta_ns = time - cpu->sample.time;
- if ((s64)delta_ns < INTEL_PSTATE_DEFAULT_SAMPLING_INTERVAL)
+ if ((s64)delta_ns < INTEL_PSTATE_SAMPLING_INTERVAL)
return;
- if (intel_pstate_sample(cpu, time)) {
- int target_pstate;
-
- target_pstate = get_target_pstate_use_cpu_load(cpu);
- intel_pstate_adjust_pstate(cpu, target_pstate);
- }
+set_pstate:
+ if (intel_pstate_sample(cpu, time))
+ intel_pstate_adjust_pstate(cpu);
}
static struct pstate_funcs core_funcs = {
@@ -1791,7 +1559,6 @@ static struct pstate_funcs core_funcs = {
.get_turbo = core_get_turbo_pstate,
.get_scaling = core_get_scaling,
.get_val = core_get_val,
- .update_util = intel_pstate_update_util_pid,
};
static const struct pstate_funcs silvermont_funcs = {
@@ -1802,7 +1569,6 @@ static const struct pstate_funcs silvermont_funcs = {
.get_val = atom_get_val,
.get_scaling = silvermont_get_scaling,
.get_vid = atom_get_vid,
- .update_util = intel_pstate_update_util,
};
static const struct pstate_funcs airmont_funcs = {
@@ -1813,7 +1579,6 @@ static const struct pstate_funcs airmont_funcs = {
.get_val = atom_get_val,
.get_scaling = airmont_get_scaling,
.get_vid = atom_get_vid,
- .update_util = intel_pstate_update_util,
};
static const struct pstate_funcs knl_funcs = {
@@ -1824,7 +1589,6 @@ static const struct pstate_funcs knl_funcs = {
.get_aperf_mperf_shift = knl_get_aperf_mperf_shift,
.get_scaling = core_get_scaling,
.get_val = core_get_val,
- .update_util = intel_pstate_update_util_pid,
};
static const struct pstate_funcs bxt_funcs = {
@@ -1834,7 +1598,6 @@ static const struct pstate_funcs bxt_funcs = {
.get_turbo = core_get_turbo_pstate,
.get_scaling = core_get_scaling,
.get_val = core_get_val,
- .update_util = intel_pstate_update_util,
};
#define ICPU(model, policy) \
@@ -1878,8 +1641,6 @@ static const struct x86_cpu_id intel_pstate_cpu_ee_disable_ids[] = {
{}
};
-static bool pid_in_use(void);
-
static int intel_pstate_init_cpu(unsigned int cpunum)
{
struct cpudata *cpu;
@@ -1910,8 +1671,6 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
intel_pstate_disable_ee(cpunum);
intel_pstate_hwp_enable(cpu);
- } else if (pid_in_use()) {
- intel_pstate_pid_reset(cpu);
}
intel_pstate_get_cpu_pstates(cpu);
@@ -1934,7 +1693,7 @@ static void intel_pstate_set_update_util_hook(unsigned int cpu_num)
/* Prevent intel_pstate_update_util() from using stale data. */
cpu->sample.time = 0;
cpufreq_add_update_util_hook(cpu_num, &cpu->update_util,
- pstate_funcs.update_util);
+ intel_pstate_update_util);
cpu->update_util_set = true;
}
@@ -2261,12 +2020,6 @@ static struct cpufreq_driver intel_cpufreq = {
static struct cpufreq_driver *default_driver = &intel_pstate;
-static bool pid_in_use(void)
-{
- return intel_pstate_driver == &intel_pstate &&
- pstate_funcs.update_util == intel_pstate_update_util_pid;
-}
-
static void intel_pstate_driver_cleanup(void)
{
unsigned int cpu;
@@ -2301,9 +2054,6 @@ static int intel_pstate_register_driver(struct cpufreq_driver *driver)
global.min_perf_pct = min_perf_pct_min();
- if (pid_in_use())
- intel_pstate_debug_expose_params();
-
return 0;
}
@@ -2312,9 +2062,6 @@ static int intel_pstate_unregister_driver(void)
if (hwp_active)
return -EBUSY;
- if (pid_in_use())
- intel_pstate_debug_hide_params();
-
cpufreq_unregister_driver(intel_pstate_driver);
intel_pstate_driver_cleanup();
@@ -2382,24 +2129,6 @@ static int __init intel_pstate_msrs_not_valid(void)
return 0;
}
-#ifdef CONFIG_ACPI
-static void intel_pstate_use_acpi_profile(void)
-{
- switch (acpi_gbl_FADT.preferred_profile) {
- case PM_MOBILE:
- case PM_TABLET:
- case PM_APPLIANCE_PC:
- case PM_DESKTOP:
- case PM_WORKSTATION:
- pstate_funcs.update_util = intel_pstate_update_util;
- }
-}
-#else
-static void intel_pstate_use_acpi_profile(void)
-{
-}
-#endif
-
static void __init copy_cpu_funcs(struct pstate_funcs *funcs)
{
pstate_funcs.get_max = funcs->get_max;
@@ -2409,10 +2138,7 @@ static void __init copy_cpu_funcs(struct pstate_funcs *funcs)
pstate_funcs.get_scaling = funcs->get_scaling;
pstate_funcs.get_val = funcs->get_val;
pstate_funcs.get_vid = funcs->get_vid;
- pstate_funcs.update_util = funcs->update_util;
pstate_funcs.get_aperf_mperf_shift = funcs->get_aperf_mperf_shift;
-
- intel_pstate_use_acpi_profile();
}
#ifdef CONFIG_ACPI
@@ -2556,9 +2282,7 @@ static int __init intel_pstate_init(void)
if (x86_match_cpu(hwp_support_ids)) {
copy_cpu_funcs(&core_funcs);
- if (no_hwp) {
- pstate_funcs.update_util = intel_pstate_update_util;
- } else {
+ if (!no_hwp) {
hwp_active++;
intel_pstate.attr = hwp_cpufreq_attrs;
goto hwp_cpu_matched;