summaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel/cpu
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel/cpu')
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c480
-rw-r--r--arch/x86/kernel/cpu/bugs.c7
-rw-r--r--arch/x86/kernel/cpu/common.c105
-rw-r--r--arch/x86/kernel/cpu/cpu.h5
-rw-r--r--arch/x86/kernel/cpu/intel.c117
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c32
-rw-r--r--arch/x86/kernel/cpu/mce/apei.c8
-rw-r--r--arch/x86/kernel/cpu/mce/core.c10
-rw-r--r--arch/x86/kernel/cpu/mce/severity.c110
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c6
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c59
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c10
-rw-r--r--arch/x86/kernel/cpu/proc.c11
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c14
-rw-r--r--arch/x86/kernel/cpu/scattered.c1
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.c113
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.h2
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c13
-rw-r--r--arch/x86/kernel/cpu/tsx.c104
19 files changed, 848 insertions, 359 deletions
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index 9ca008f9e9b1..1f60a2b27936 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -6,146 +6,446 @@
* Copyright (C) 2017 Intel Corp.
* Author: Len Brown <len.brown@intel.com>
*/
-
+#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
-#include <linux/cpufreq.h>
-#include <linux/smp.h>
-#include <linux/sched/isolation.h>
#include <linux/rcupdate.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/topology.h>
+#include <linux/smp.h>
+#include <linux/syscore_ops.h>
+
+#include <asm/cpu.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
#include "cpu.h"
-struct aperfmperf_sample {
- unsigned int khz;
- atomic_t scfpending;
- ktime_t time;
- u64 aperf;
- u64 mperf;
+struct aperfmperf {
+ seqcount_t seq;
+ unsigned long last_update;
+ u64 acnt;
+ u64 mcnt;
+ u64 aperf;
+ u64 mperf;
};
-static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
+ .seq = SEQCNT_ZERO(cpu_samples.seq)
+};
-#define APERFMPERF_CACHE_THRESHOLD_MS 10
-#define APERFMPERF_REFRESH_DELAY_MS 10
-#define APERFMPERF_STALE_THRESHOLD_MS 1000
+static void init_counter_refs(void)
+{
+ u64 aperf, mperf;
+
+ rdmsrl(MSR_IA32_APERF, aperf);
+ rdmsrl(MSR_IA32_MPERF, mperf);
+ this_cpu_write(cpu_samples.aperf, aperf);
+ this_cpu_write(cpu_samples.mperf, mperf);
+}
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
- * aperfmperf_snapshot_khz()
- * On the current CPU, snapshot APERF, MPERF, and jiffies
- * unless we already did it within 10ms
- * calculate kHz, save snapshot
+ * APERF/MPERF frequency ratio computation.
+ *
+ * The scheduler wants to do frequency invariant accounting and needs a <1
+ * ratio to account for the 'current' frequency, corresponding to
+ * freq_curr / freq_max.
+ *
+ * Since the frequency freq_curr on x86 is controlled by micro-controller and
+ * our P-state setting is little more than a request/hint, we need to observe
+ * the effective frequency 'BusyMHz', i.e. the average frequency over a time
+ * interval after discarding idle time. This is given by:
+ *
+ * BusyMHz = delta_APERF / delta_MPERF * freq_base
+ *
+ * where freq_base is the max non-turbo P-state.
+ *
+ * The freq_max term has to be set to a somewhat arbitrary value, because we
+ * can't know which turbo states will be available at a given point in time:
+ * it all depends on the thermal headroom of the entire package. We set it to
+ * the turbo level with 4 cores active.
+ *
+ * Benchmarks show that's a good compromise between the 1C turbo ratio
+ * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
+ * which would ignore the entire turbo range (a conspicuous part, making
+ * freq_curr/freq_max always maxed out).
+ *
+ * An exception to the heuristic above is the Atom uarch, where we choose the
+ * highest turbo level for freq_max since Atom's are generally oriented towards
+ * power efficiency.
+ *
+ * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
+ * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
*/
-static void aperfmperf_snapshot_khz(void *dummy)
+
+DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);
+
+static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
+static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
+
+void arch_set_max_freq_ratio(bool turbo_disabled)
{
- u64 aperf, aperf_delta;
- u64 mperf, mperf_delta;
- struct aperfmperf_sample *s = this_cpu_ptr(&samples);
- unsigned long flags;
+ arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
+ arch_turbo_freq_ratio;
+}
+EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
- local_irq_save(flags);
- rdmsrl(MSR_IA32_APERF, aperf);
- rdmsrl(MSR_IA32_MPERF, mperf);
- local_irq_restore(flags);
+static bool __init turbo_disabled(void)
+{
+ u64 misc_en;
+ int err;
+
+ err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
+ if (err)
+ return false;
+
+ return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
+}
+
+static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+ int err;
+
+ err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
+ if (err)
+ return false;
+
+ err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */
+ *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */
+
+ return true;
+}
+
+#define X86_MATCH(model) \
+ X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \
+ INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
+
+static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
+ X86_MATCH(XEON_PHI_KNL),
+ X86_MATCH(XEON_PHI_KNM),
+ {}
+};
+
+static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
+ X86_MATCH(SKYLAKE_X),
+ {}
+};
+
+static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
+ X86_MATCH(ATOM_GOLDMONT),
+ X86_MATCH(ATOM_GOLDMONT_D),
+ X86_MATCH(ATOM_GOLDMONT_PLUS),
+ {}
+};
+
+static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
+ int num_delta_fratio)
+{
+ int fratio, delta_fratio, found;
+ int err, i;
+ u64 msr;
+
+ err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+ if (err)
+ return false;
+
+ fratio = (msr >> 8) & 0xFF;
+ i = 16;
+ found = 0;
+ do {
+ if (found >= num_delta_fratio) {
+ *turbo_freq = fratio;
+ return true;
+ }
+
+ delta_fratio = (msr >> (i + 5)) & 0x7;
+
+ if (delta_fratio) {
+ found += 1;
+ fratio -= delta_fratio;
+ }
+
+ i += 8;
+ } while (i < 64);
+
+ return true;
+}
+
+static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
+{
+ u64 ratios, counts;
+ u32 group_size;
+ int err, i;
+
+ err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
+ if (err)
+ return false;
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
+ if (err)
+ return false;
+
+ for (i = 0; i < 64; i += 8) {
+ group_size = (counts >> i) & 0xFF;
+ if (group_size >= size) {
+ *turbo_freq = (ratios >> i) & 0xFF;
+ return true;
+ }
+ }
+
+ return false;
+}
- aperf_delta = aperf - s->aperf;
- mperf_delta = mperf - s->mperf;
+static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+ u64 msr;
+ int err;
+
+ err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+ *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */
+
+ /* The CPU may have less than 4 cores */
+ if (!*turbo_freq)
+ *turbo_freq = msr & 0xFF; /* 1C turbo */
+
+ return true;
+}
+
+static bool __init intel_set_max_freq_ratio(void)
+{
+ u64 base_freq, turbo_freq;
+ u64 turbo_ratio;
+ if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
+ goto out;
+
+ if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
+ skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+ goto out;
+
+ if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
+ knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+ goto out;
+
+ if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
+ skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
+ goto out;
+
+ if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
+ goto out;
+
+ return false;
+
+out:
/*
- * There is no architectural guarantee that MPERF
- * increments faster than we can read it.
+ * Some hypervisors advertise X86_FEATURE_APERFMPERF
+ * but then fill all MSR's with zeroes.
+ * Some CPUs have turbo boost but don't declare any turbo ratio
+ * in MSR_TURBO_RATIO_LIMIT.
*/
- if (mperf_delta == 0)
- return;
+ if (!base_freq || !turbo_freq) {
+ pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
+ return false;
+ }
- s->time = ktime_get();
- s->aperf = aperf;
- s->mperf = mperf;
- s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
- atomic_set_release(&s->scfpending, 0);
+ turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
+ if (!turbo_ratio) {
+ pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
+ return false;
+ }
+
+ arch_turbo_freq_ratio = turbo_ratio;
+ arch_set_max_freq_ratio(turbo_disabled());
+
+ return true;
}
-static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait)
+#ifdef CONFIG_PM_SLEEP
+static struct syscore_ops freq_invariance_syscore_ops = {
+ .resume = init_counter_refs,
+};
+
+static void register_freq_invariance_syscore_ops(void)
{
- s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu));
- struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu);
+ register_syscore_ops(&freq_invariance_syscore_ops);
+}
+#else
+static inline void register_freq_invariance_syscore_ops(void) {}
+#endif
- /* Don't bother re-computing within the cache threshold time. */
- if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
- return true;
+static void freq_invariance_enable(void)
+{
+ if (static_branch_unlikely(&arch_scale_freq_key)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+ static_branch_enable(&arch_scale_freq_key);
+ register_freq_invariance_syscore_ops();
+ pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
+}
+
+void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
+{
+ arch_turbo_freq_ratio = ratio;
+ arch_set_max_freq_ratio(turbo_disabled);
+ freq_invariance_enable();
+}
+
+static void __init bp_init_freq_invariance(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return;
- if (!atomic_xchg(&s->scfpending, 1) || wait)
- smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait);
+ if (intel_set_max_freq_ratio())
+ freq_invariance_enable();
+}
- /* Return false if the previous iteration was too long ago. */
- return time_delta <= APERFMPERF_STALE_THRESHOLD_MS;
+static void disable_freq_invariance_workfn(struct work_struct *work)
+{
+ static_branch_disable(&arch_scale_freq_key);
}
-unsigned int aperfmperf_get_khz(int cpu)
+static DECLARE_WORK(disable_freq_invariance_work,
+ disable_freq_invariance_workfn);
+
+DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
+
+static void scale_freq_tick(u64 acnt, u64 mcnt)
{
- if (!cpu_khz)
- return 0;
+ u64 freq_scale;
- if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
- return 0;
+ if (!arch_scale_freq_invariant())
+ return;
- if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
- return 0;
+ if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
+ goto error;
- if (rcu_is_idle_cpu(cpu))
- return 0; /* Idle CPUs are completely uninteresting. */
+ if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
+ goto error;
- aperfmperf_snapshot_cpu(cpu, ktime_get(), true);
- return per_cpu(samples.khz, cpu);
+ freq_scale = div64_u64(acnt, mcnt);
+ if (!freq_scale)
+ goto error;
+
+ if (freq_scale > SCHED_CAPACITY_SCALE)
+ freq_scale = SCHED_CAPACITY_SCALE;
+
+ this_cpu_write(arch_freq_scale, freq_scale);
+ return;
+
+error:
+ pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
+ schedule_work(&disable_freq_invariance_work);
}
+#else
+static inline void bp_init_freq_invariance(void) { }
+static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
+#endif /* CONFIG_X86_64 && CONFIG_SMP */
-void arch_freq_prepare_all(void)
+void arch_scale_freq_tick(void)
{
- ktime_t now = ktime_get();
- bool wait = false;
- int cpu;
+ struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
+ u64 acnt, mcnt, aperf, mperf;
- if (!cpu_khz)
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
return;
- if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
- return;
+ rdmsrl(MSR_IA32_APERF, aperf);
+ rdmsrl(MSR_IA32_MPERF, mperf);
+ acnt = aperf - s->aperf;
+ mcnt = mperf - s->mperf;
- for_each_online_cpu(cpu) {
- if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
- continue;
- if (rcu_is_idle_cpu(cpu))
- continue; /* Idle CPUs are completely uninteresting. */
- if (!aperfmperf_snapshot_cpu(cpu, now, false))
- wait = true;
- }
+ s->aperf = aperf;
+ s->mperf = mperf;
+
+ raw_write_seqcount_begin(&s->seq);
+ s->last_update = jiffies;
+ s->acnt = acnt;
+ s->mcnt = mcnt;
+ raw_write_seqcount_end(&s->seq);
- if (wait)
- msleep(APERFMPERF_REFRESH_DELAY_MS);
+ scale_freq_tick(acnt, mcnt);
}
+/*
+ * Discard samples older than the define maximum sample age of 20ms. There
+ * is no point in sending IPIs in such a case. If the scheduler tick was
+ * not running then the CPU is either idle or isolated.
+ */
+#define MAX_SAMPLE_AGE ((unsigned long)HZ / 50)
+
unsigned int arch_freq_get_on_cpu(int cpu)
{
- struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu);
+ struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
+ unsigned int seq, freq;
+ unsigned long last;
+ u64 acnt, mcnt;
- if (!cpu_khz)
- return 0;
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ goto fallback;
- if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
- return 0;
+ do {
+ seq = raw_read_seqcount_begin(&s->seq);
+ last = s->last_update;
+ acnt = s->acnt;
+ mcnt = s->mcnt;
+ } while (read_seqcount_retry(&s->seq, seq));
- if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
- return 0;
+ /*
+ * Bail on invalid count and when the last update was too long ago,
+ * which covers idle and NOHZ full CPUs.
+ */
+ if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
+ goto fallback;
+
+ return div64_u64((cpu_khz * acnt), mcnt);
+
+fallback:
+ freq = cpufreq_quick_get(cpu);
+ return freq ? freq : cpu_khz;
+}
- if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true))
- return per_cpu(samples.khz, cpu);
+static int __init bp_init_aperfmperf(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ return 0;
- msleep(APERFMPERF_REFRESH_DELAY_MS);
- atomic_set(&s->scfpending, 1);
- smp_mb(); /* ->scfpending before smp_call_function_single(). */
- smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
+ init_counter_refs();
+ bp_init_freq_invariance();
+ return 0;
+}
+early_initcall(bp_init_aperfmperf);
- return per_cpu(samples.khz, cpu);
+void ap_init_aperfmperf(void)
+{
+ if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ init_counter_refs();
}
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 6296e1ebed1d..d879a6c93609 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -446,6 +446,13 @@ void update_srbds_msr(void)
if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED)
return;
+ /*
+ * A MDS_NO CPU for which SRBDS mitigation is not needed due to TSX
+ * being disabled and it hasn't received the SRBDS MSR microcode.
+ */
+ if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL))
+ return;
+
rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
switch (srbds_mitigation) {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index ed4417500700..2e9142797c99 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -60,6 +60,7 @@
#include <asm/uv/uv.h>
#include <asm/sigframe.h>
#include <asm/traps.h>
+#include <asm/sev.h>
#include "cpu.h"
@@ -298,13 +299,6 @@ static int __init cachesize_setup(char *str)
}
__setup("cachesize=", cachesize_setup);
-static int __init x86_sep_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_SEP);
- return 1;
-}
-__setup("nosep", x86_sep_setup);
-
/* Standard macro to see if a specific flag is changeable */
static inline int flag_is_changeable_p(u32 flag)
{
@@ -376,26 +370,12 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
}
#endif
-static __init int setup_disable_smep(char *arg)
-{
- setup_clear_cpu_cap(X86_FEATURE_SMEP);
- return 1;
-}
-__setup("nosmep", setup_disable_smep);
-
static __always_inline void setup_smep(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_SMEP))
cr4_set_bits(X86_CR4_SMEP);
}
-static __init int setup_disable_smap(char *arg)
-{
- setup_clear_cpu_cap(X86_FEATURE_SMAP);
- return 1;
-}
-__setup("nosmap", setup_disable_smap);
-
static __always_inline void setup_smap(struct cpuinfo_x86 *c)
{
unsigned long eflags = native_save_fl();
@@ -403,14 +383,8 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
/* This should have been cleared long ago */
BUG_ON(eflags & X86_EFLAGS_AC);
- if (cpu_has(c, X86_FEATURE_SMAP)) {
-#ifdef CONFIG_X86_SMAP
+ if (cpu_has(c, X86_FEATURE_SMAP))
cr4_set_bits(X86_CR4_SMAP);
-#else
- clear_cpu_cap(c, X86_FEATURE_SMAP);
- cr4_clear_bits(X86_CR4_SMAP);
-#endif
- }
}
static __always_inline void setup_umip(struct cpuinfo_x86 *c)
@@ -1368,8 +1342,8 @@ static void detect_nopl(void)
static void __init cpu_parse_early_param(void)
{
char arg[128];
- char *argptr = arg;
- int arglen, res, bit;
+ char *argptr = arg, *opt;
+ int arglen, taint = 0;
#ifdef CONFIG_X86_32
if (cmdline_find_option_bool(boot_command_line, "no387"))
@@ -1397,21 +1371,61 @@ static void __init cpu_parse_early_param(void)
return;
pr_info("Clearing CPUID bits:");
- do {
- res = get_option(&argptr, &bit);
- if (res == 0 || res == 3)
- break;
- /* If the argument was too long, the last bit may be cut off */
- if (res == 1 && arglen >= sizeof(arg))
- break;
+ while (argptr) {
+ bool found __maybe_unused = false;
+ unsigned int bit;
- if (bit >= 0 && bit < NCAPINTS * 32) {
- pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit));
+ opt = strsep(&argptr, ",");
+
+ /*
+ * Handle naked numbers first for feature flags which don't
+ * have names.
+ */
+ if (!kstrtouint(opt, 10, &bit)) {
+ if (bit < NCAPINTS * 32) {
+
+#ifdef CONFIG_X86_FEATURE_NAMES
+ /* empty-string, i.e., ""-defined feature flags */
+ if (!x86_cap_flags[bit])
+ pr_cont(" " X86_CAP_FMT_NUM, x86_cap_flag_num(bit));
+ else
+#endif
+ pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit));
+
+ setup_clear_cpu_cap(bit);
+ taint++;
+ }
+ /*
+ * The assumption is that there are no feature names with only
+ * numbers in the name thus go to the next argument.
+ */
+ continue;
+ }
+
+#ifdef CONFIG_X86_FEATURE_NAMES
+ for (bit = 0; bit < 32 * NCAPINTS; bit++) {
+ if (!x86_cap_flag(bit))
+ continue;
+
+ if (strcmp(x86_cap_flag(bit), opt))
+ continue;
+
+ pr_cont(" %s", opt);
setup_clear_cpu_cap(bit);
+ taint++;
+ found = true;
+ break;
}
- } while (res == 2);
+
+ if (!found)
+ pr_cont(" (unknown: %s)", opt);
+#endif
+ }
pr_cont("\n");
+
+ if (taint)
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
}
/*
@@ -1855,15 +1869,9 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
validate_apic_and_package_id(c);
x86_spec_ctrl_setup_ap();
update_srbds_msr();
-}
-static __init int setup_noclflush(char *arg)
-{
- setup_clear_cpu_cap(X86_FEATURE_CLFLUSH);
- setup_clear_cpu_cap(X86_FEATURE_CLFLUSHOPT);
- return 1;
+ tsx_ap_init();
}
-__setup("noclflush", setup_noclflush);
void print_cpu_info(struct cpuinfo_x86 *c)
{
@@ -2124,6 +2132,9 @@ void cpu_init_exception_handling(void)
load_TR_desc();
+ /* GHCB needs to be setup to handle #VC. */
+ setup_ghcb();
+
/* Finally load the IDT */
load_current_idt();
}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index ee6f23f7587d..2a8e584fc991 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -55,11 +55,10 @@ enum tsx_ctrl_states {
extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state;
extern void __init tsx_init(void);
-extern void tsx_enable(void);
-extern void tsx_disable(void);
-extern void tsx_clear_cpuid(void);
+void tsx_ap_init(void);
#else
static inline void tsx_init(void) { }
+static inline void tsx_ap_init(void) { }
#endif /* CONFIG_CPU_SUP_INTEL */
extern void get_cpu_cap(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 8321c43554a1..fd5dead8371c 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -7,10 +7,13 @@
#include <linux/smp.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
+#include <linux/semaphore.h>
#include <linux/thread_info.h>
#include <linux/init.h>
#include <linux/uaccess.h>
+#include <linux/workqueue.h>
#include <linux/delay.h>
+#include <linux/cpuhotplug.h>
#include <asm/cpufeature.h>
#include <asm/msr.h>
@@ -91,7 +94,7 @@ static bool ring3mwait_disabled __read_mostly;
static int __init ring3mwait_disable(char *__unused)
{
ring3mwait_disabled = true;
- return 0;
+ return 1;
}
__setup("ring3mwait=disable", ring3mwait_disable);
@@ -181,6 +184,38 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
return false;
}
+int intel_cpu_collect_info(struct ucode_cpu_info *uci)
+{
+ unsigned int val[2];
+ unsigned int family, model;
+ struct cpu_signature csig = { 0 };
+ unsigned int eax, ebx, ecx, edx;
+
+ memset(uci, 0, sizeof(*uci));
+
+ eax = 0x00000001;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ csig.sig = eax;
+
+ family = x86_family(eax);
+ model = x86_model(eax);
+
+ if (model >= 5 || family > 6) {
+ /* get processor flags from MSR 0x17 */
+ native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+ csig.pf = 1 << ((val[1] >> 18) & 7);
+ }
+
+ csig.rev = intel_get_microcode_revision();
+
+ uci->cpu_sig = csig;
+ uci->valid = 1;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(intel_cpu_collect_info);
+
static void early_init_intel(struct cpuinfo_x86 *c)
{
u64 misc_enable;
@@ -717,13 +752,6 @@ static void init_intel(struct cpuinfo_x86 *c)
init_intel_misc_features(c);
- if (tsx_ctrl_state == TSX_CTRL_ENABLE)
- tsx_enable();
- else if (tsx_ctrl_state == TSX_CTRL_DISABLE)
- tsx_disable();
- else if (tsx_ctrl_state == TSX_CTRL_RTM_ALWAYS_ABORT)
- tsx_clear_cpuid();
-
split_lock_init();
bus_lock_init();
@@ -1006,6 +1034,8 @@ static const struct {
static struct ratelimit_state bld_ratelimit;
+static DEFINE_SEMAPHORE(buslock_sem);
+
static inline bool match_option(const char *arg, int arglen, const char *opt)
{
int len = strlen(opt), ratelimit;
@@ -1116,18 +1146,52 @@ static void split_lock_init(void)
split_lock_verify_msr(sld_state != sld_off);
}
+static void __split_lock_reenable(struct work_struct *work)
+{
+ sld_update_msr(true);
+ up(&buslock_sem);
+}
+
+/*
+ * If a CPU goes offline with pending delayed work to re-enable split lock
+ * detection then the delayed work will be executed on some other CPU. That
+ * handles releasing the buslock_sem, but because it executes on a
+ * different CPU probably won't re-enable split lock detection. This is a
+ * problem on HT systems since the sibling CPU on the same core may then be
+ * left running with split lock detection disabled.
+ *
+ * Unconditionally re-enable detection here.
+ */
+static int splitlock_cpu_offline(unsigned int cpu)
+{
+ sld_update_msr(true);
+
+ return 0;
+}
+
+static DECLARE_DELAYED_WORK(split_lock_reenable, __split_lock_reenable);
+
static void split_lock_warn(unsigned long ip)
{
- pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
- current->comm, current->pid, ip);
+ int cpu;
- /*
- * Disable the split lock detection for this task so it can make
- * progress and set TIF_SLD so the detection is re-enabled via
- * switch_to_sld() when the task is scheduled out.
- */
+ if (!current->reported_split_lock)
+ pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
+ current->comm, current->pid, ip);
+ current->reported_split_lock = 1;
+
+ /* misery factor #1, sleep 10ms before trying to execute split lock */
+ if (msleep_interruptible(10) > 0)
+ return;
+ /* Misery factor #2, only allow one buslocked disabled core at a time */
+ if (down_interruptible(&buslock_sem) == -EINTR)
+ return;
+ cpu = get_cpu();
+ schedule_delayed_work_on(cpu, &split_lock_reenable, 2);
+
+ /* Disable split lock detection on this CPU to make progress */
sld_update_msr(false);
- set_tsk_thread_flag(current, TIF_SLD);
+ put_cpu();
}
bool handle_guest_split_lock(unsigned long ip)
@@ -1201,18 +1265,6 @@ void handle_bus_lock(struct pt_regs *regs)
}
/*
- * This function is called only when switching between tasks with
- * different split-lock detection modes. It sets the MSR for the
- * mode of the new task. This is right most of the time, but since
- * the MSR is shared by hyperthreads on a physical core there can
- * be glitches when the two threads need different modes.
- */
-void switch_to_sld(unsigned long tifn)
-{
- sld_update_msr(!(tifn & _TIF_SLD));
-}
-
-/*
* Bits in the IA32_CORE_CAPABILITIES are not architectural, so they should
* only be trusted if it is confirmed that a CPU model implements a
* specific feature at a particular bit position.
@@ -1237,6 +1289,7 @@ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 1),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, 1),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 1),
+ X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, 1),
{}
};
@@ -1281,10 +1334,14 @@ static void sld_state_show(void)
pr_info("disabled\n");
break;
case sld_warn:
- if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n");
- else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "x86/splitlock", NULL, splitlock_cpu_offline) < 0)
+ pr_warn("No splitlock CPU offline handler\n");
+ } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
pr_info("#DB: warning on user-space bus_locks\n");
+ }
break;
case sld_fatal:
if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 1940d305db1c..1c87501e0fa3 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -1294,10 +1294,23 @@ out_free:
kfree(bank);
}
+static void __threshold_remove_device(struct threshold_bank **bp)
+{
+ unsigned int bank, numbanks = this_cpu_read(mce_num_banks);
+
+ for (bank = 0; bank < numbanks; bank++) {
+ if (!bp[bank])
+ continue;
+
+ threshold_remove_bank(bp[bank]);
+ bp[bank] = NULL;
+ }
+ kfree(bp);
+}
+
int mce_threshold_remove_device(unsigned int cpu)
{
struct threshold_bank **bp = this_cpu_read(threshold_banks);
- unsigned int bank, numbanks = this_cpu_read(mce_num_banks);
if (!bp)
return 0;
@@ -1308,13 +1321,7 @@ int mce_threshold_remove_device(unsigned int cpu)
*/
this_cpu_write(threshold_banks, NULL);
- for (bank = 0; bank < numbanks; bank++) {
- if (bp[bank]) {
- threshold_remove_bank(bp[bank]);
- bp[bank] = NULL;
- }
- }
- kfree(bp);
+ __threshold_remove_device(bp);
return 0;
}
@@ -1351,15 +1358,14 @@ int mce_threshold_create_device(unsigned int cpu)
if (!(this_cpu_read(bank_map) & (1 << bank)))
continue;
err = threshold_create_bank(bp, cpu, bank);
- if (err)
- goto out_err;
+ if (err) {
+ __threshold_remove_device(bp);
+ return err;
+ }
}
this_cpu_write(threshold_banks, bp);
if (thresholding_irq_en)
mce_threshold_vector = amd_threshold_interrupt;
return 0;
-out_err:
- mce_threshold_remove_device(cpu);
- return err;
}
diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
index 0e3ae64d3b76..717192915f28 100644
--- a/arch/x86/kernel/cpu/mce/apei.c
+++ b/arch/x86/kernel/cpu/mce/apei.c
@@ -177,16 +177,14 @@ retry:
/* no more record */
if (*record_id == APEI_ERST_INVALID_RECORD_ID)
goto out;
- rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd));
+ rc = erst_read_record(*record_id, &rcd.hdr, sizeof(rcd), sizeof(rcd),
+ &CPER_CREATOR_MCE);
/* someone else has cleared the record, try next one */
if (rc == -ENOENT)
goto retry;
else if (rc < 0)
goto out;
- /* try to skip other type records in storage */
- else if (rc != sizeof(rcd) ||
- !guid_equal(&rcd.hdr.creator_id, &CPER_CREATOR_MCE))
- goto retry;
+
memcpy(m, &rcd.mce, sizeof(*m));
rc = sizeof(*m);
out:
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 981496e6bc0e..2c8ec5c71712 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -69,7 +69,9 @@ DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
struct mce_bank {
u64 ctl; /* subevents to enable */
- bool init; /* initialise bank? */
+
+ __u64 init : 1, /* initialise bank? */
+ __reserved_1 : 63;
};
static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
@@ -579,7 +581,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
pfn = mce->addr >> PAGE_SHIFT;
if (!memory_failure(pfn, 0)) {
- set_mce_nospec(pfn, whole_page(mce));
+ set_mce_nospec(pfn);
mce->kflags |= MCE_HANDLED_UC;
}
@@ -1316,7 +1318,7 @@ static void kill_me_maybe(struct callback_head *cb)
ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
if (!ret) {
- set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
+ set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
sync_core();
return;
}
@@ -1342,7 +1344,7 @@ static void kill_me_never(struct callback_head *cb)
p->mce_count = 0;
pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0))
- set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
+ set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
}
static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index 1add86935349..00483d1c27e4 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -301,85 +301,65 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs)
}
}
-static __always_inline int mce_severity_amd_smca(struct mce *m, enum context err_ctx)
+/* See AMD PPR(s) section Machine Check Error Handling. */
+static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)
{
- u64 mcx_cfg;
+ char *panic_msg = NULL;
+ int ret;
/*
- * We need to look at the following bits:
- * - "succor" bit (data poisoning support), and
- * - TCC bit (Task Context Corrupt)
- * in MCi_STATUS to determine error severity.
+ * Default return value: Action required, the error must be handled
+ * immediately.
*/
- if (!mce_flags.succor)
- return MCE_PANIC_SEVERITY;
-
- mcx_cfg = mce_rdmsrl(MSR_AMD64_SMCA_MCx_CONFIG(m->bank));
-
- /* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */
- if ((mcx_cfg & MCI_CONFIG_MCAX) &&
- (m->status & MCI_STATUS_TCC) &&
- (err_ctx == IN_KERNEL))
- return MCE_PANIC_SEVERITY;
-
- /* ...otherwise invoke hwpoison handler. */
- return MCE_AR_SEVERITY;
-}
-
-/*
- * See AMD Error Scope Hierarchy table in a newer BKDG. For example
- * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
- */
-static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)
-{
- enum context ctx = error_context(m, regs);
+ ret = MCE_AR_SEVERITY;
/* Processor Context Corrupt, no need to fumble too much, die! */
- if (m->status & MCI_STATUS_PCC)
- return MCE_PANIC_SEVERITY;
-
- if (m->status & MCI_STATUS_UC) {
-
- if (ctx == IN_KERNEL)
- return MCE_PANIC_SEVERITY;
+ if (m->status & MCI_STATUS_PCC) {
+ panic_msg = "Processor Context Corrupt";
+ ret = MCE_PANIC_SEVERITY;
+ goto out;
+ }
- /*
- * On older systems where overflow_recov flag is not present, we
- * should simply panic if an error overflow occurs. If
- * overflow_recov flag is present and set, then software can try
- * to at least kill process to prolong system operation.
- */
- if (mce_flags.overflow_recov) {
- if (mce_flags.smca)
- return mce_severity_amd_smca(m, ctx);
-
- /* kill current process */
- return MCE_AR_SEVERITY;
- } else {
- /* at least one error was not logged */
- if (m->status & MCI_STATUS_OVER)
- return MCE_PANIC_SEVERITY;
- }
-
- /*
- * For any other case, return MCE_UC_SEVERITY so that we log the
- * error and exit #MC handler.
- */
- return MCE_UC_SEVERITY;
+ if (m->status & MCI_STATUS_DEFERRED) {
+ ret = MCE_DEFERRED_SEVERITY;
+ goto out;
}
/*
- * deferred error: poll handler catches these and adds to mce_ring so
- * memory-failure can take recovery actions.
+ * If the UC bit is not set, the system either corrected or deferred
+ * the error. No action will be required after logging the error.
*/
- if (m->status & MCI_STATUS_DEFERRED)
- return MCE_DEFERRED_SEVERITY;
+ if (!(m->status & MCI_STATUS_UC)) {
+ ret = MCE_KEEP_SEVERITY;
+ goto out;
+ }
/*
- * corrected error: poll handler catches these and passes responsibility
- * of decoding the error to EDAC
+ * On MCA overflow, without the MCA overflow recovery feature the
+ * system will not be able to recover, panic.
*/
- return MCE_KEEP_SEVERITY;
+ if ((m->status & MCI_STATUS_OVER) && !mce_flags.overflow_recov) {
+ panic_msg = "Overflowed uncorrected error without MCA Overflow Recovery";
+ ret = MCE_PANIC_SEVERITY;
+ goto out;
+ }
+
+ if (!mce_flags.succor) {
+ panic_msg = "Uncorrected error without MCA Recovery";
+ ret = MCE_PANIC_SEVERITY;
+ goto out;
+ }
+
+ if (error_context(m, regs) == IN_KERNEL) {
+ panic_msg = "Uncorrected unrecoverable error in kernel context";
+ ret = MCE_PANIC_SEVERITY;
+ }
+
+out:
+ if (msg && panic_msg)
+ *msg = panic_msg;
+
+ return ret;
}
static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index f955d25076ba..239ff5fcec6a 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -758,9 +758,9 @@ static struct subsys_interface mc_cpu_interface = {
};
/**
- * mc_bp_resume - Update boot CPU microcode during resume.
+ * microcode_bsp_resume - Update boot CPU microcode during resume.
*/
-static void mc_bp_resume(void)
+void microcode_bsp_resume(void)
{
int cpu = smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
@@ -772,7 +772,7 @@ static void mc_bp_resume(void)
}
static struct syscore_ops mc_syscore_ops = {
- .resume = mc_bp_resume,
+ .resume = microcode_bsp_resume,
};
static int mc_cpu_starting(unsigned int cpu)
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index d28a9f8f3fec..025c8f0cd948 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -45,20 +45,6 @@ static struct microcode_intel *intel_ucode_patch;
/* last level cache size per core */
static int llc_size_per_core;
-static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
- unsigned int s2, unsigned int p2)
-{
- if (s1 != s2)
- return false;
-
- /* Processor flags are either both 0 ... */
- if (!p1 && !p2)
- return true;
-
- /* ... or they intersect. */
- return p1 & p2;
-}
-
/*
* Returns 1 if update has been found, 0 otherwise.
*/
@@ -69,7 +55,7 @@ static int find_matching_signature(void *mc, unsigned int csig, int cpf)
struct extended_signature *ext_sig;
int i;
- if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
+ if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
return 1;
/* Look for ext. headers: */
@@ -80,7 +66,7 @@ static int find_matching_signature(void *mc, unsigned int csig, int cpf)
ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
for (i = 0; i < ext_hdr->count; i++) {
- if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
+ if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
return 1;
ext_sig++;
}
@@ -342,37 +328,6 @@ next:
return patch;
}
-static int collect_cpu_info_early(struct ucode_cpu_info *uci)
-{
- unsigned int val[2];
- unsigned int family, model;
- struct cpu_signature csig = { 0 };
- unsigned int eax, ebx, ecx, edx;
-
- memset(uci, 0, sizeof(*uci));
-
- eax = 0x00000001;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
- csig.sig = eax;
-
- family = x86_family(eax);
- model = x86_model(eax);
-
- if ((model >= 5) || (family > 6)) {
- /* get processor flags from MSR 0x17 */
- native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
- csig.pf = 1 << ((val[1] >> 18) & 7);
- }
-
- csig.rev = intel_get_microcode_revision();
-
- uci->cpu_sig = csig;
- uci->valid = 1;
-
- return 0;
-}
-
static void show_saved_mc(void)
{
#ifdef DEBUG
@@ -386,7 +341,7 @@ static void show_saved_mc(void)
return;
}
- collect_cpu_info_early(&uci);
+ intel_cpu_collect_info(&uci);
sig = uci.cpu_sig.sig;
pf = uci.cpu_sig.pf;
@@ -502,7 +457,7 @@ void show_ucode_info_early(void)
struct ucode_cpu_info uci;
if (delay_ucode_info) {
- collect_cpu_info_early(&uci);
+ intel_cpu_collect_info(&uci);
print_ucode_info(&uci, current_mc_date);
delay_ucode_info = 0;
}
@@ -604,7 +559,7 @@ int __init save_microcode_in_initrd_intel(void)
if (!(cp.data && cp.size))
return 0;
- collect_cpu_info_early(&uci);
+ intel_cpu_collect_info(&uci);
scan_microcode(cp.data, cp.size, &uci, true);
@@ -637,7 +592,7 @@ static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci)
if (!(cp.data && cp.size))
return NULL;
- collect_cpu_info_early(uci);
+ intel_cpu_collect_info(uci);
return scan_microcode(cp.data, cp.size, uci, false);
}
@@ -712,7 +667,7 @@ void reload_ucode_intel(void)
struct microcode_intel *p;
struct ucode_cpu_info uci;
- collect_cpu_info_early(&uci);
+ intel_cpu_collect_info(&uci);
p = find_patch(&uci);
if (!p)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 4b67094215bb..831613959a92 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -337,14 +337,6 @@ static void __init ms_hyperv_init_platform(void)
swiotlb_unencrypted_base = ms_hyperv.shared_gpa_boundary;
#endif
}
-
-#ifdef CONFIG_SWIOTLB
- /*
- * Enable swiotlb force mode in Isolation VM to
- * use swiotlb bounce buffer for dma transaction.
- */
- swiotlb_force = SWIOTLB_FORCE;
-#endif
/* Isolation VMs are unenlightened SEV-based VMs, thus this check: */
if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
if (hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE)
@@ -465,6 +457,8 @@ static void __init ms_hyperv_init_platform(void)
*/
if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT))
mark_tsc_unstable("running on Hyper-V");
+
+ hardlockup_detector_disable();
}
static bool __init ms_hyperv_x2apic_available(void)
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 4eec8889b0ff..099b6f0d96bd 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -84,14 +84,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
if (cpu_has(c, X86_FEATURE_TSC)) {
- unsigned int freq = aperfmperf_get_khz(cpu);
-
- if (!freq)
- freq = cpufreq_quick_get(cpu);
- if (!freq)
- freq = cpu_khz;
- seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
- freq / 1000, (freq % 1000));
+ unsigned int freq = arch_freq_get_on_cpu(cpu);
+
+ seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000));
}
/* Cache size */
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 83f901e2c2df..f276aff521e8 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -341,14 +341,14 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
/* Check whether cpus belong to parent ctrl group */
cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
- if (cpumask_weight(tmpmask)) {
+ if (!cpumask_empty(tmpmask)) {
rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n");
return -EINVAL;
}
/* Check whether cpus are dropped from this group */
cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
- if (cpumask_weight(tmpmask)) {
+ if (!cpumask_empty(tmpmask)) {
/* Give any dropped cpus to parent rdtgroup */
cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
update_closid_rmid(tmpmask, prgrp);
@@ -359,7 +359,7 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
* and update per-cpu rmid
*/
cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
- if (cpumask_weight(tmpmask)) {
+ if (!cpumask_empty(tmpmask)) {
head = &prgrp->mon.crdtgrp_list;
list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
if (crgrp == rdtgrp)
@@ -394,7 +394,7 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
/* Check whether cpus are dropped from this group */
cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
- if (cpumask_weight(tmpmask)) {
+ if (!cpumask_empty(tmpmask)) {
/* Can't drop from default group */
if (rdtgrp == &rdtgroup_default) {
rdt_last_cmd_puts("Can't drop CPUs from default group\n");
@@ -413,12 +413,12 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
* and update per-cpu closid/rmid.
*/
cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
- if (cpumask_weight(tmpmask)) {
+ if (!cpumask_empty(tmpmask)) {
list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
if (r == rdtgrp)
continue;
cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
- if (cpumask_weight(tmpmask1))
+ if (!cpumask_empty(tmpmask1))
cpumask_rdtgrp_clear(r, tmpmask1);
}
update_closid_rmid(tmpmask, rdtgrp);
@@ -488,7 +488,7 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
/* check that user didn't specify any offline cpus */
cpumask_andnot(tmpmask, newmask, cpu_online_mask);
- if (cpumask_weight(tmpmask)) {
+ if (!cpumask_empty(tmpmask)) {
ret = -EINVAL;
rdt_last_cmd_puts("Can only assign online CPUs\n");
goto unlock;
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 4143b1e4c5c6..dbaa8326d6f2 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -43,6 +43,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
{ X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
+ { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 },
{ 0, 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 7c63a1911fae..3c24e6124d95 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -12,6 +12,92 @@
#include "encls.h"
#include "sgx.h"
+#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd))
+/*
+ * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to
+ * determine the page index associated with the first PCMD entry
+ * within a PCMD page.
+ */
+#define PCMD_FIRST_MASK GENMASK(4, 0)
+
+/**
+ * reclaimer_writing_to_pcmd() - Query if any enclave page associated with
+ * a PCMD page is in process of being reclaimed.
+ * @encl: Enclave to which PCMD page belongs
+ * @start_addr: Address of enclave page using first entry within the PCMD page
+ *
+ * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is
+ * stored. The PCMD data of a reclaimed enclave page contains enough
+ * information for the processor to verify the page at the time
+ * it is loaded back into the Enclave Page Cache (EPC).
+ *
+ * The backing storage to which enclave pages are reclaimed is laid out as
+ * follows:
+ * Encrypted enclave pages:SECS page:PCMD pages
+ *
+ * Each PCMD page contains the PCMD metadata of
+ * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages.
+ *
+ * A PCMD page can only be truncated if it is (a) empty, and (b) not in the
+ * process of getting data (and thus soon being non-empty). (b) is tested with
+ * a check if an enclave page sharing the PCMD page is in the process of being
+ * reclaimed.
+ *
+ * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it
+ * intends to reclaim that enclave page - it means that the PCMD page
+ * associated with that enclave page is about to get some data and thus
+ * even if the PCMD page is empty, it should not be truncated.
+ *
+ * Context: Enclave mutex (&sgx_encl->lock) must be held.
+ * Return: 1 if the reclaimer is about to write to the PCMD page
+ * 0 if the reclaimer has no intention to write to the PCMD page
+ */
+static int reclaimer_writing_to_pcmd(struct sgx_encl *encl,
+ unsigned long start_addr)
+{
+ int reclaimed = 0;
+ int i;
+
+ /*
+ * PCMD_FIRST_MASK is based on number of PCMD entries within
+ * PCMD page being 32.
+ */
+ BUILD_BUG_ON(PCMDS_PER_PAGE != 32);
+
+ for (i = 0; i < PCMDS_PER_PAGE; i++) {
+ struct sgx_encl_page *entry;
+ unsigned long addr;
+
+ addr = start_addr + i * PAGE_SIZE;
+
+ /*
+ * Stop when reaching the SECS page - it does not
+ * have a page_array entry and its reclaim is
+ * started and completed with enclave mutex held so
+ * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED
+ * flag.
+ */
+ if (addr == encl->base + encl->size)
+ break;
+
+ entry = xa_load(&encl->page_array, PFN_DOWN(addr));
+ if (!entry)
+ continue;
+
+ /*
+ * VA page slot ID uses same bit as the flag so it is important
+ * to ensure that the page is not already in backing store.
+ */
+ if (entry->epc_page &&
+ (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) {
+ reclaimed = 1;
+ break;
+ }
+ }
+
+ return reclaimed;
+}
+
/*
* Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's
* follow right after the EPC data in the backing storage. In addition to the
@@ -47,6 +133,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
struct sgx_encl *encl = encl_page->encl;
pgoff_t page_index, page_pcmd_off;
+ unsigned long pcmd_first_page;
struct sgx_pageinfo pginfo;
struct sgx_backing b;
bool pcmd_page_empty;
@@ -58,6 +145,11 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
else
page_index = PFN_DOWN(encl->size);
+ /*
+ * Address of enclave page using the first entry within the PCMD page.
+ */
+ pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base;
+
page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
ret = sgx_encl_get_backing(encl, page_index, &b);
@@ -84,6 +176,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
}
memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
+ set_page_dirty(b.pcmd);
/*
* The area for the PCMD in the page was zeroed above. Check if the
@@ -94,12 +187,20 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
kunmap_atomic(pcmd_page);
kunmap_atomic((void *)(unsigned long)pginfo.contents);
- sgx_encl_put_backing(&b, false);
+ get_page(b.pcmd);
+ sgx_encl_put_backing(&b);
sgx_encl_truncate_backing_page(encl, page_index);
- if (pcmd_page_empty)
+ if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) {
sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
+ pcmd_page = kmap_atomic(b.pcmd);
+ if (memchr_inv(pcmd_page, 0, PAGE_SIZE))
+ pr_warn("PCMD page not empty after truncate.\n");
+ kunmap_atomic(pcmd_page);
+ }
+
+ put_page(b.pcmd);
return ret;
}
@@ -645,15 +746,9 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
/**
* sgx_encl_put_backing() - Unpin the backing storage
* @backing: data for accessing backing storage for the page
- * @do_write: mark pages dirty
*/
-void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write)
+void sgx_encl_put_backing(struct sgx_backing *backing)
{
- if (do_write) {
- set_page_dirty(backing->pcmd);
- set_page_dirty(backing->contents);
- }
-
put_page(backing->pcmd);
put_page(backing->contents);
}
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
index fec43ca65065..d44e7372151f 100644
--- a/arch/x86/kernel/cpu/sgx/encl.h
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -107,7 +107,7 @@ void sgx_encl_release(struct kref *ref);
int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm);
int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
struct sgx_backing *backing);
-void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write);
+void sgx_encl_put_backing(struct sgx_backing *backing);
int sgx_encl_test_and_clear_young(struct mm_struct *mm,
struct sgx_encl_page *page);
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 8e4bc6453d26..ab4ec54bbdd9 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -191,6 +191,8 @@ static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot,
backing->pcmd_offset;
ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot);
+ set_page_dirty(backing->pcmd);
+ set_page_dirty(backing->contents);
kunmap_atomic((void *)(unsigned long)(pginfo.metadata -
backing->pcmd_offset));
@@ -308,6 +310,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
sgx_encl_ewb(epc_page, backing);
encl_page->epc_page = NULL;
encl->secs_child_cnt--;
+ sgx_encl_put_backing(backing);
if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) {
ret = sgx_encl_get_backing(encl, PFN_DOWN(encl->size),
@@ -320,7 +323,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
sgx_encl_free_epc_page(encl->secs.epc_page);
encl->secs.epc_page = NULL;
- sgx_encl_put_backing(&secs_backing, true);
+ sgx_encl_put_backing(&secs_backing);
}
out:
@@ -379,11 +382,14 @@ static void sgx_reclaim_pages(void)
goto skip;
page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
+
+ mutex_lock(&encl_page->encl->lock);
ret = sgx_encl_get_backing(encl_page->encl, page_index, &backing[i]);
- if (ret)
+ if (ret) {
+ mutex_unlock(&encl_page->encl->lock);
goto skip;
+ }
- mutex_lock(&encl_page->encl->lock);
encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED;
mutex_unlock(&encl_page->encl->lock);
continue;
@@ -411,7 +417,6 @@ skip:
encl_page = epc_page->owner;
sgx_reclaimer_write(epc_page, &backing[i]);
- sgx_encl_put_backing(&backing[i], true);
kref_put(&encl_page->encl->refcount, sgx_encl_release);
epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
index 9c7a5f049292..ec7bbac3a9f2 100644
--- a/arch/x86/kernel/cpu/tsx.c
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -19,7 +19,7 @@
enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED;
-void tsx_disable(void)
+static void tsx_disable(void)
{
u64 tsx;
@@ -39,7 +39,7 @@ void tsx_disable(void)
wrmsrl(MSR_IA32_TSX_CTRL, tsx);
}
-void tsx_enable(void)
+static void tsx_enable(void)
{
u64 tsx;
@@ -58,7 +58,7 @@ void tsx_enable(void)
wrmsrl(MSR_IA32_TSX_CTRL, tsx);
}
-static bool __init tsx_ctrl_is_supported(void)
+static bool tsx_ctrl_is_supported(void)
{
u64 ia32_cap = x86_read_arch_cap_msr();
@@ -84,7 +84,45 @@ static enum tsx_ctrl_states x86_get_tsx_auto_mode(void)
return TSX_CTRL_ENABLE;
}
-void tsx_clear_cpuid(void)
+/*
+ * Disabling TSX is not a trivial business.
+ *
+ * First of all, there's a CPUID bit: X86_FEATURE_RTM_ALWAYS_ABORT
+ * which says that TSX is practically disabled (all transactions are
+ * aborted by default). When that bit is set, the kernel unconditionally
+ * disables TSX.
+ *
+ * In order to do that, however, it needs to dance a bit:
+ *
+ * 1. The first method to disable it is through MSR_TSX_FORCE_ABORT and
+ * the MSR is present only when *two* CPUID bits are set:
+ *
+ * - X86_FEATURE_RTM_ALWAYS_ABORT
+ * - X86_FEATURE_TSX_FORCE_ABORT
+ *
+ * 2. The second method is for CPUs which do not have the above-mentioned
+ * MSR: those use a different MSR - MSR_IA32_TSX_CTRL and disable TSX
+ * through that one. Those CPUs can also have the initially mentioned
+ * CPUID bit X86_FEATURE_RTM_ALWAYS_ABORT set and for those the same strategy
+ * applies: TSX gets disabled unconditionally.
+ *
+ * When either of the two methods are present, the kernel disables TSX and
+ * clears the respective RTM and HLE feature flags.
+ *
+ * An additional twist in the whole thing presents late microcode loading
+ * which, when done, may cause for the X86_FEATURE_RTM_ALWAYS_ABORT CPUID
+ * bit to be set after the update.
+ *
+ * A subsequent hotplug operation on any logical CPU except the BSP will
+ * cause for the supported CPUID feature bits to get re-detected and, if
+ * RTM and HLE get cleared all of a sudden, but, userspace did consult
+ * them before the update, then funny explosions will happen. Long story
+ * short: the kernel doesn't modify CPUID feature bits after booting.
+ *
+ * That's why, this function's call in init_intel() doesn't clear the
+ * feature flags.
+ */
+static void tsx_clear_cpuid(void)
{
u64 msr;
@@ -97,6 +135,39 @@ void tsx_clear_cpuid(void)
rdmsrl(MSR_TSX_FORCE_ABORT, msr);
msr |= MSR_TFA_TSX_CPUID_CLEAR;
wrmsrl(MSR_TSX_FORCE_ABORT, msr);
+ } else if (tsx_ctrl_is_supported()) {
+ rdmsrl(MSR_IA32_TSX_CTRL, msr);
+ msr |= TSX_CTRL_CPUID_CLEAR;
+ wrmsrl(MSR_IA32_TSX_CTRL, msr);
+ }
+}
+
+/*
+ * Disable TSX development mode
+ *
+ * When the microcode released in Feb 2022 is applied, TSX will be disabled by
+ * default on some processors. MSR 0x122 (TSX_CTRL) and MSR 0x123
+ * (IA32_MCU_OPT_CTRL) can be used to re-enable TSX for development, doing so is
+ * not recommended for production deployments. In particular, applying MD_CLEAR
+ * flows for mitigation of the Intel TSX Asynchronous Abort (TAA) transient
+ * execution attack may not be effective on these processors when Intel TSX is
+ * enabled with updated microcode.
+ */
+static void tsx_dev_mode_disable(void)
+{
+ u64 mcu_opt_ctrl;
+
+ /* Check if RTM_ALLOW exists */
+ if (!boot_cpu_has_bug(X86_BUG_TAA) || !tsx_ctrl_is_supported() ||
+ !cpu_feature_enabled(X86_FEATURE_SRBDS_CTRL))
+ return;
+
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl);
+
+ if (mcu_opt_ctrl & RTM_ALLOW) {
+ mcu_opt_ctrl &= ~RTM_ALLOW;
+ wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl);
+ setup_force_cpu_cap(X86_FEATURE_RTM_ALWAYS_ABORT);
}
}
@@ -105,14 +176,14 @@ void __init tsx_init(void)
char arg[5] = {};
int ret;
+ tsx_dev_mode_disable();
+
/*
- * Hardware will always abort a TSX transaction if both CPUID bits
- * RTM_ALWAYS_ABORT and TSX_FORCE_ABORT are set. In this case, it is
- * better not to enumerate CPUID.RTM and CPUID.HLE bits. Clear them
- * here.
+ * Hardware will always abort a TSX transaction when the CPUID bit
+ * RTM_ALWAYS_ABORT is set. In this case, it is better not to enumerate
+ * CPUID.RTM and CPUID.HLE bits. Clear them here.
*/
- if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) &&
- boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) {
+ if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT)) {
tsx_ctrl_state = TSX_CTRL_RTM_ALWAYS_ABORT;
tsx_clear_cpuid();
setup_clear_cpu_cap(X86_FEATURE_RTM);
@@ -175,3 +246,16 @@ void __init tsx_init(void)
setup_force_cpu_cap(X86_FEATURE_HLE);
}
}
+
+void tsx_ap_init(void)
+{
+ tsx_dev_mode_disable();
+
+ if (tsx_ctrl_state == TSX_CTRL_ENABLE)
+ tsx_enable();
+ else if (tsx_ctrl_state == TSX_CTRL_DISABLE)
+ tsx_disable();
+ else if (tsx_ctrl_state == TSX_CTRL_RTM_ALWAYS_ABORT)
+ /* See comment over that function for more details. */
+ tsx_clear_cpuid();
+}