summaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel/cpu/aperfmperf.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel/cpu/aperfmperf.c')
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c480
1 files changed, 390 insertions, 90 deletions
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index 9ca008f9e9b1..1f60a2b27936 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -6,146 +6,446 @@
* Copyright (C) 2017 Intel Corp.
* Author: Len Brown <len.brown@intel.com>
*/
-
+#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
-#include <linux/cpufreq.h>
-#include <linux/smp.h>
-#include <linux/sched/isolation.h>
#include <linux/rcupdate.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/topology.h>
+#include <linux/smp.h>
+#include <linux/syscore_ops.h>
+
+#include <asm/cpu.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
#include "cpu.h"
-struct aperfmperf_sample {
- unsigned int khz;
- atomic_t scfpending;
- ktime_t time;
- u64 aperf;
- u64 mperf;
+struct aperfmperf {
+ seqcount_t seq;
+ unsigned long last_update;
+ u64 acnt;
+ u64 mcnt;
+ u64 aperf;
+ u64 mperf;
};
-static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
+ .seq = SEQCNT_ZERO(cpu_samples.seq)
+};
-#define APERFMPERF_CACHE_THRESHOLD_MS 10
-#define APERFMPERF_REFRESH_DELAY_MS 10
-#define APERFMPERF_STALE_THRESHOLD_MS 1000
+static void init_counter_refs(void)
+{
+ u64 aperf, mperf;
+
+ rdmsrl(MSR_IA32_APERF, aperf);
+ rdmsrl(MSR_IA32_MPERF, mperf);
+ this_cpu_write(cpu_samples.aperf, aperf);
+ this_cpu_write(cpu_samples.mperf, mperf);
+}
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
- * aperfmperf_snapshot_khz()
- * On the current CPU, snapshot APERF, MPERF, and jiffies
- * unless we already did it within 10ms
- * calculate kHz, save snapshot
+ * APERF/MPERF frequency ratio computation.
+ *
+ * The scheduler wants to do frequency invariant accounting and needs a <1
+ * ratio to account for the 'current' frequency, corresponding to
+ * freq_curr / freq_max.
+ *
+ * Since the frequency freq_curr on x86 is controlled by micro-controller and
+ * our P-state setting is little more than a request/hint, we need to observe
+ * the effective frequency 'BusyMHz', i.e. the average frequency over a time
+ * interval after discarding idle time. This is given by:
+ *
+ * BusyMHz = delta_APERF / delta_MPERF * freq_base
+ *
+ * where freq_base is the max non-turbo P-state.
+ *
+ * The freq_max term has to be set to a somewhat arbitrary value, because we
+ * can't know which turbo states will be available at a given point in time:
+ * it all depends on the thermal headroom of the entire package. We set it to
+ * the turbo level with 4 cores active.
+ *
+ * Benchmarks show that's a good compromise between the 1C turbo ratio
+ * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
+ * which would ignore the entire turbo range (a conspicuous part, making
+ * freq_curr/freq_max always maxed out).
+ *
+ * An exception to the heuristic above is the Atom uarch, where we choose the
+ * highest turbo level for freq_max since Atom's are generally oriented towards
+ * power efficiency.
+ *
+ * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
+ * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
*/
-static void aperfmperf_snapshot_khz(void *dummy)
+
+DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);
+
+static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
+static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
+
+void arch_set_max_freq_ratio(bool turbo_disabled)
{
- u64 aperf, aperf_delta;
- u64 mperf, mperf_delta;
- struct aperfmperf_sample *s = this_cpu_ptr(&samples);
- unsigned long flags;
+ arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
+ arch_turbo_freq_ratio;
+}
+EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
- local_irq_save(flags);
- rdmsrl(MSR_IA32_APERF, aperf);
- rdmsrl(MSR_IA32_MPERF, mperf);
- local_irq_restore(flags);
+static bool __init turbo_disabled(void)
+{
+ u64 misc_en;
+ int err;
+
+ err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
+ if (err)
+ return false;
+
+ return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
+}
+
+static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+ int err;
+
+ err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
+ if (err)
+ return false;
+
+ err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */
+ *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */
+
+ return true;
+}
+
+#define X86_MATCH(model) \
+ X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \
+ INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
+
+static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
+ X86_MATCH(XEON_PHI_KNL),
+ X86_MATCH(XEON_PHI_KNM),
+ {}
+};
+
+static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
+ X86_MATCH(SKYLAKE_X),
+ {}
+};
+
+static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
+ X86_MATCH(ATOM_GOLDMONT),
+ X86_MATCH(ATOM_GOLDMONT_D),
+ X86_MATCH(ATOM_GOLDMONT_PLUS),
+ {}
+};
+
+static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
+ int num_delta_fratio)
+{
+ int fratio, delta_fratio, found;
+ int err, i;
+ u64 msr;
+
+ err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+ if (err)
+ return false;
+
+ fratio = (msr >> 8) & 0xFF;
+ i = 16;
+ found = 0;
+ do {
+ if (found >= num_delta_fratio) {
+ *turbo_freq = fratio;
+ return true;
+ }
+
+ delta_fratio = (msr >> (i + 5)) & 0x7;
+
+ if (delta_fratio) {
+ found += 1;
+ fratio -= delta_fratio;
+ }
+
+ i += 8;
+ } while (i < 64);
+
+ return true;
+}
+
+static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
+{
+ u64 ratios, counts;
+ u32 group_size;
+ int err, i;
+
+ err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
+ if (err)
+ return false;
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
+ if (err)
+ return false;
+
+ for (i = 0; i < 64; i += 8) {
+ group_size = (counts >> i) & 0xFF;
+ if (group_size >= size) {
+ *turbo_freq = (ratios >> i) & 0xFF;
+ return true;
+ }
+ }
+
+ return false;
+}
- aperf_delta = aperf - s->aperf;
- mperf_delta = mperf - s->mperf;
+static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+ u64 msr;
+ int err;
+
+ err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+ *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */
+
+ /* The CPU may have less than 4 cores */
+ if (!*turbo_freq)
+ *turbo_freq = msr & 0xFF; /* 1C turbo */
+
+ return true;
+}
+
+static bool __init intel_set_max_freq_ratio(void)
+{
+ u64 base_freq, turbo_freq;
+ u64 turbo_ratio;
+ if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
+ goto out;
+
+ if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
+ skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+ goto out;
+
+ if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
+ knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+ goto out;
+
+ if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
+ skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
+ goto out;
+
+ if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
+ goto out;
+
+ return false;
+
+out:
/*
- * There is no architectural guarantee that MPERF
- * increments faster than we can read it.
+ * Some hypervisors advertise X86_FEATURE_APERFMPERF
+ * but then fill all MSR's with zeroes.
+ * Some CPUs have turbo boost but don't declare any turbo ratio
+ * in MSR_TURBO_RATIO_LIMIT.
*/
- if (mperf_delta == 0)
- return;
+ if (!base_freq || !turbo_freq) {
+ pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
+ return false;
+ }
- s->time = ktime_get();
- s->aperf = aperf;
- s->mperf = mperf;
- s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
- atomic_set_release(&s->scfpending, 0);
+ turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
+ if (!turbo_ratio) {
+ pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
+ return false;
+ }
+
+ arch_turbo_freq_ratio = turbo_ratio;
+ arch_set_max_freq_ratio(turbo_disabled());
+
+ return true;
}
-static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait)
+#ifdef CONFIG_PM_SLEEP
+static struct syscore_ops freq_invariance_syscore_ops = {
+ .resume = init_counter_refs,
+};
+
+static void register_freq_invariance_syscore_ops(void)
{
- s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu));
- struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu);
+ register_syscore_ops(&freq_invariance_syscore_ops);
+}
+#else
+static inline void register_freq_invariance_syscore_ops(void) {}
+#endif
- /* Don't bother re-computing within the cache threshold time. */
- if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
- return true;
+static void freq_invariance_enable(void)
+{
+ if (static_branch_unlikely(&arch_scale_freq_key)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+ static_branch_enable(&arch_scale_freq_key);
+ register_freq_invariance_syscore_ops();
+ pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
+}
+
+void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
+{
+ arch_turbo_freq_ratio = ratio;
+ arch_set_max_freq_ratio(turbo_disabled);
+ freq_invariance_enable();
+}
+
+static void __init bp_init_freq_invariance(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return;
- if (!atomic_xchg(&s->scfpending, 1) || wait)
- smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait);
+ if (intel_set_max_freq_ratio())
+ freq_invariance_enable();
+}
- /* Return false if the previous iteration was too long ago. */
- return time_delta <= APERFMPERF_STALE_THRESHOLD_MS;
+static void disable_freq_invariance_workfn(struct work_struct *work)
+{
+ static_branch_disable(&arch_scale_freq_key);
}
-unsigned int aperfmperf_get_khz(int cpu)
+static DECLARE_WORK(disable_freq_invariance_work,
+ disable_freq_invariance_workfn);
+
+DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
+
+static void scale_freq_tick(u64 acnt, u64 mcnt)
{
- if (!cpu_khz)
- return 0;
+ u64 freq_scale;
- if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
- return 0;
+ if (!arch_scale_freq_invariant())
+ return;
- if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
- return 0;
+ if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
+ goto error;
- if (rcu_is_idle_cpu(cpu))
- return 0; /* Idle CPUs are completely uninteresting. */
+ if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
+ goto error;
- aperfmperf_snapshot_cpu(cpu, ktime_get(), true);
- return per_cpu(samples.khz, cpu);
+ freq_scale = div64_u64(acnt, mcnt);
+ if (!freq_scale)
+ goto error;
+
+ if (freq_scale > SCHED_CAPACITY_SCALE)
+ freq_scale = SCHED_CAPACITY_SCALE;
+
+ this_cpu_write(arch_freq_scale, freq_scale);
+ return;
+
+error:
+ pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
+ schedule_work(&disable_freq_invariance_work);
}
+#else
+static inline void bp_init_freq_invariance(void) { }
+static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
+#endif /* CONFIG_X86_64 && CONFIG_SMP */
-void arch_freq_prepare_all(void)
+void arch_scale_freq_tick(void)
{
- ktime_t now = ktime_get();
- bool wait = false;
- int cpu;
+ struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
+ u64 acnt, mcnt, aperf, mperf;
- if (!cpu_khz)
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
return;
- if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
- return;
+ rdmsrl(MSR_IA32_APERF, aperf);
+ rdmsrl(MSR_IA32_MPERF, mperf);
+ acnt = aperf - s->aperf;
+ mcnt = mperf - s->mperf;
- for_each_online_cpu(cpu) {
- if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
- continue;
- if (rcu_is_idle_cpu(cpu))
- continue; /* Idle CPUs are completely uninteresting. */
- if (!aperfmperf_snapshot_cpu(cpu, now, false))
- wait = true;
- }
+ s->aperf = aperf;
+ s->mperf = mperf;
+
+ raw_write_seqcount_begin(&s->seq);
+ s->last_update = jiffies;
+ s->acnt = acnt;
+ s->mcnt = mcnt;
+ raw_write_seqcount_end(&s->seq);
- if (wait)
- msleep(APERFMPERF_REFRESH_DELAY_MS);
+ scale_freq_tick(acnt, mcnt);
}
+/*
+ * Discard samples older than the define maximum sample age of 20ms. There
+ * is no point in sending IPIs in such a case. If the scheduler tick was
+ * not running then the CPU is either idle or isolated.
+ */
+#define MAX_SAMPLE_AGE ((unsigned long)HZ / 50)
+
unsigned int arch_freq_get_on_cpu(int cpu)
{
- struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu);
+ struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
+ unsigned int seq, freq;
+ unsigned long last;
+ u64 acnt, mcnt;
- if (!cpu_khz)
- return 0;
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ goto fallback;
- if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
- return 0;
+ do {
+ seq = raw_read_seqcount_begin(&s->seq);
+ last = s->last_update;
+ acnt = s->acnt;
+ mcnt = s->mcnt;
+ } while (read_seqcount_retry(&s->seq, seq));
- if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
- return 0;
+ /*
+ * Bail on invalid count and when the last update was too long ago,
+ * which covers idle and NOHZ full CPUs.
+ */
+ if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
+ goto fallback;
+
+ return div64_u64((cpu_khz * acnt), mcnt);
+
+fallback:
+ freq = cpufreq_quick_get(cpu);
+ return freq ? freq : cpu_khz;
+}
- if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true))
- return per_cpu(samples.khz, cpu);
+static int __init bp_init_aperfmperf(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ return 0;
- msleep(APERFMPERF_REFRESH_DELAY_MS);
- atomic_set(&s->scfpending, 1);
- smp_mb(); /* ->scfpending before smp_call_function_single(). */
- smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
+ init_counter_refs();
+ bp_init_freq_invariance();
+ return 0;
+}
+early_initcall(bp_init_aperfmperf);
- return per_cpu(samples.khz, cpu);
+void ap_init_aperfmperf(void)
+{
+ if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ init_counter_refs();
}