From e8824890249355656968d8846908a313fe231f11 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 24 Apr 2020 12:37:54 -0700
Subject: x86/delay: Preparatory code cleanup

The naming conventions in the delay code are confusing at best.

All delay variants use a loops argument and/or variable which originates
from the original delay_loop() implementation. But all variants except
delay_loop() are based on TSC cycles.

Rename the argument to cycles and make it type u64 to avoid these weird
expansions to u64 in the functions.

Rename MWAITX_MAX_LOOPS to MWAITX_MAX_WAIT_CYCLES for the same reason
and fixup the comment of delay_mwaitx() as well.

Mark the delay_fn function pointer __ro_after_init and fixup the comment
for it.

No functional change and preparation for the upcoming TPAUSE based delay
variant.

[ Kyung Min Park: Added __init to use_tsc_delay() ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Kyung Min Park <kyung.min.park@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/1587757076-30337-2-git-send-email-kyung.min.park@intel.com
---
 arch/x86/include/asm/delay.h |  3 ++-
 arch/x86/include/asm/mwait.h |  2 +-
 arch/x86/lib/delay.c         | 45 +++++++++++++++++++++++---------------------
 3 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h
index de9e7841f953..9aa38de7bd72 100644
--- a/arch/x86/include/asm/delay.h
+++ b/arch/x86/include/asm/delay.h
@@ -3,8 +3,9 @@
 #define _ASM_X86_DELAY_H
 
 #include <asm/param.h>
+#include <linux/init.h>
 
-void use_tsc_delay(void);
+void __init use_tsc_delay(void);
 void use_mwaitx_delay(void);
 
 #endif /* _ASM_X86_DELAY_H */
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index b809f117f3f4..a43b35b35049 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -20,7 +20,7 @@
 #define MWAIT_ECX_INTERRUPT_BREAK	0x1
 
 #define MWAITX_ECX_TIMER_ENABLE		BIT(1)
-#define MWAITX_MAX_LOOPS		((u32)-1)
+#define MWAITX_MAX_WAIT_CYCLES		UINT_MAX
 #define MWAITX_DISABLE_CSTATES		0xf0
 
 u32 get_umwait_control_msr(void);
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index c126571e5e2e..887d52d5a7cc 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -27,9 +27,19 @@
 # include <asm/smp.h>
 #endif
 
+static void delay_loop(u64 __loops);
+
+/*
+ * Calibration and selection of the delay mechanism happens only once
+ * during boot.
+ */
+static void (*delay_fn)(u64) __ro_after_init = delay_loop;
+
 /* simple loop based delay: */
-static void delay_loop(unsigned long loops)
+static void delay_loop(u64 __loops)
 {
+	unsigned long loops = (unsigned long)__loops;
+
 	asm volatile(
 		"	test %0,%0	\n"
 		"	jz 3f		\n"
@@ -49,9 +59,9 @@ static void delay_loop(unsigned long loops)
 }
 
 /* TSC based delay: */
-static void delay_tsc(unsigned long __loops)
+static void delay_tsc(u64 cycles)
 {
-	u64 bclock, now, loops = __loops;
+	u64 bclock, now;
 	int cpu;
 
 	preempt_disable();
@@ -59,7 +69,7 @@
 	bclock = rdtsc_ordered();
 	for (;;) {
 		now = rdtsc_ordered();
-		if ((now - bclock) >= loops)
+		if ((now - bclock) >= cycles)
 			break;
 
 		/* Allow RT tasks to run */
@@ -77,7 +87,7 @@
 		 * counter for this CPU.
 		 */
 		if (unlikely(cpu != smp_processor_id())) {
-			loops -= (now - bclock);
+			cycles -= (now - bclock);
 			cpu = smp_processor_id();
 			bclock = rdtsc_ordered();
 		}
@@ -87,24 +97,24 @@
 
 /*
  * On some AMD platforms, MWAITX has a configurable 32-bit timer, that
- * counts with TSC frequency. The input value is the loop of the
- * counter, it will exit when the timer expires.
+ * counts with TSC frequency. The input value is the number of TSC cycles
+ * to wait. MWAITX will also exit when the timer expires.
  */
-static void delay_mwaitx(unsigned long __loops)
+static void delay_mwaitx(u64 cycles)
 {
-	u64 start, end, delay, loops = __loops;
+	u64 start, end, delay;
 
 	/*
 	 * Timer value of 0 causes MWAITX to wait indefinitely, unless there
 	 * is a store on the memory monitored by MONITORX.
 	 */
-	if (loops == 0)
+	if (!cycles)
 		return;
 
 	start = rdtsc_ordered();
 
 	for (;;) {
-		delay = min_t(u64, MWAITX_MAX_LOOPS, loops);
+		delay = min_t(u64, MWAITX_MAX_WAIT_CYCLES, cycles);
 
 		/*
 		 * Use cpu_tss_rw as a cacheline-aligned, seldomly
@@ -121,22 +131,15 @@
 
 		end = rdtsc_ordered();
 
-		if (loops <= end - start)
+		if (cycles <= end - start)
 			break;
 
-		loops -= end - start;
-
+		cycles -= end - start;
 		start = end;
 	}
 }
 
-/*
- * Since we calibrate only once at boot, this
- * function should be set once at boot and not changed
- */
-static void (*delay_fn)(unsigned long) = delay_loop;
-
-void use_tsc_delay(void)
+void __init use_tsc_delay(void)
 {
 	if (delay_fn == delay_loop)
 		delay_fn = delay_tsc;
-- cgit v1.2.3
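[ Illustration, not part of the series: the TSC wait pattern that delay_tsc()
  implements, as minimal user-space C. It assumes a constant, invariant TSC
  and omits the preemption and CPU-migration rebalancing the kernel version
  needs; the function name is invented for the sketch. ]

#include <stdint.h>
#include <x86intrin.h>          /* __rdtsc(), _mm_pause() */

/* Busy-wait for at least 'cycles' TSC cycles. */
static void delay_cycles(uint64_t cycles)
{
        uint64_t start = __rdtsc();

        /* Unsigned subtraction stays correct even if the counter wraps. */
        while (__rdtsc() - start < cycles)
                _mm_pause();    /* like the kernel's rep_nop(): ease off the pipeline */
}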
From 46f90c7aad62be1af76588108c730d826308a801 Mon Sep 17 00:00:00 2001
From: Kyung Min Park <kyung.min.park@intel.com>
Date: Fri, 24 Apr 2020 12:37:55 -0700
Subject: x86/delay: Refactor delay_mwaitx() for TPAUSE support

Refactor code to make it easier to add a new model specific function to
delay for a number of cycles.

No functional change.

Co-developed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Kyung Min Park <kyung.min.park@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Link: https://lkml.kernel.org/r/1587757076-30337-3-git-send-email-kyung.min.park@intel.com
---
 arch/x86/lib/delay.c | 48 ++++++++++++++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 887d52d5a7cc..fe91dc171cf8 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -34,6 +34,7 @@ static void delay_loop(u64 __loops);
  * during boot.
  */
 static void (*delay_fn)(u64) __ro_after_init = delay_loop;
+static void (*delay_halt_fn)(u64 start, u64 cycles) __ro_after_init;
 
 /* simple loop based delay: */
 static void delay_loop(u64 __loops)
@@ -100,9 +101,33 @@ static void delay_tsc(u64 cycles)
  * counts with TSC frequency. The input value is the number of TSC cycles
  * to wait. MWAITX will also exit when the timer expires.
  */
-static void delay_mwaitx(u64 cycles)
+static void delay_halt_mwaitx(u64 unused, u64 cycles)
 {
-	u64 start, end, delay;
+	u64 delay;
+
+	delay = min_t(u64, MWAITX_MAX_WAIT_CYCLES, cycles);
+	/*
+	 * Use cpu_tss_rw as a cacheline-aligned, seldomly accessed per-cpu
+	 * variable as the monitor target.
+	 */
+	__monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
+
+	/*
+	 * AMD, like Intel, supports the EAX hint and EAX=0xf means, do not
+	 * enter any deep C-state and we use it here in delay() to minimize
+	 * wakeup latency.
+	 */
+	__mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE);
+}
+
+/*
+ * Call a vendor specific function to delay for a given amount of time. Because
+ * these functions may return earlier than requested, check for actual elapsed
+ * time and call again until done.
+ */
+static void delay_halt(u64 __cycles)
+{
+	u64 start, end, cycles = __cycles;
 
 	/*
 	 * Timer value of 0 causes MWAITX to wait indefinitely, unless there
@@ -114,21 +139,7 @@
 	start = rdtsc_ordered();
 
 	for (;;) {
-		delay = min_t(u64, MWAITX_MAX_WAIT_CYCLES, cycles);
-
-		/*
-		 * Use cpu_tss_rw as a cacheline-aligned, seldomly
-		 * accessed per-cpu variable as the monitor target.
-		 */
-		__monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
-
-		/*
-		 * AMD, like Intel's MWAIT version, supports the EAX hint and
-		 * EAX=0xf0 means, do not enter any deep C-state and we use it
-		 * here in delay() to minimize wakeup latency.
-		 */
-		__mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE);
-
+		delay_halt_fn(start, cycles);
 		end = rdtsc_ordered();
 
 		if (cycles <= end - start)
@@ -147,7 +158,8 @@ void __init use_tsc_delay(void)
 
 void use_mwaitx_delay(void)
 {
-	delay_fn = delay_mwaitx;
+	delay_halt_fn = delay_halt_mwaitx;
+	delay_fn = delay_halt;
 }
 
 int read_current_timer(unsigned long *timer_val)
-- cgit v1.2.3
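[ Illustration, not part of the series: the split introduced above -- a
  generic retry loop around a vendor-specific halt hook -- as self-contained
  user-space C. halt_stub() stands in for __mwaitx()/__tpause(), which may
  return before the requested time has passed; all names are invented. ]

#include <stdint.h>
#include <x86intrin.h>

/* Stand-in for a hardware wait that may end early (interrupt, timer cap). */
static void halt_stub(uint64_t start, uint64_t cycles)
{
        (void)start;
        (void)cycles;
        _mm_pause();
}

/* Selected once at boot in the kernel; fixed here for the sketch. */
static void (*halt_fn)(uint64_t start, uint64_t cycles) = halt_stub;

/* Re-arm the hook until the full cycle count has actually elapsed. */
static void delay_halt(uint64_t cycles)
{
        uint64_t start = __rdtsc();

        for (;;) {
                halt_fn(start, cycles);         /* may return early */
                uint64_t end = __rdtsc();

                if (cycles <= end - start)
                        break;
                cycles -= end - start;          /* only the remainder is left */
                start = end;
        }
}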
From cec5f268cd02d25d2d74807843d8ae0292fe0fb7 Mon Sep 17 00:00:00 2001
From: Kyung Min Park <kyung.min.park@intel.com>
Date: Fri, 24 Apr 2020 12:37:56 -0700
Subject: x86/delay: Introduce TPAUSE delay

TPAUSE instructs the processor to enter an implementation-dependent
optimized state. The instruction execution wakes up when the time-stamp
counter reaches or exceeds the implicit EDX:EAX 64-bit input value. The
instruction execution also wakes up due to the expiration of the
operating system time-limit or by an external interrupt or exceptions
such as a debug exception or a machine check exception.

TPAUSE offers a choice of two lower power states:
 1. Light-weight power/performance optimized state C0.1
 2. Improved power/performance optimized state C0.2

This way, it can save power with low wake-up latency in comparison to
spinloop based delay. The selection between the two is governed by the
input register.

TPAUSE is available on processors with X86_FEATURE_WAITPKG.

Co-developed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Kyung Min Park <kyung.min.park@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Link: https://lkml.kernel.org/r/1587757076-30337-4-git-send-email-kyung.min.park@intel.com
---
 arch/x86/Kconfig.assembler   |  4 ++++
 arch/x86/include/asm/delay.h |  1 +
 arch/x86/include/asm/mwait.h | 22 ++++++++++++++++++++++
 arch/x86/kernel/time.c       |  3 +++
 arch/x86/lib/delay.c         | 27 +++++++++++++++++++++++++++
 5 files changed, 57 insertions(+)

diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler
index 13de0db38d4e..26b8c08e2fc4 100644
--- a/arch/x86/Kconfig.assembler
+++ b/arch/x86/Kconfig.assembler
@@ -15,3 +15,7 @@ config AS_SHA256_NI
 	def_bool $(as-instr,sha256msg1 %xmm0$(comma)%xmm1)
 	help
 	  Supported by binutils >= 2.24 and LLVM integrated assembler
+config AS_TPAUSE
+	def_bool $(as-instr,tpause %ecx)
+	help
+	  Supported by binutils >= 2.31.1 and LLVM integrated assembler >= V7
diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h
index 9aa38de7bd72..630891d25819 100644
--- a/arch/x86/include/asm/delay.h
+++ b/arch/x86/include/asm/delay.h
@@ -6,6 +6,7 @@
 #include <linux/init.h>
 
 void __init use_tsc_delay(void);
+void __init use_tpause_delay(void);
 void use_mwaitx_delay(void);
 
 #endif /* _ASM_X86_DELAY_H */
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index a43b35b35049..73d997aa2966 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -22,6 +22,8 @@
 #define MWAITX_ECX_TIMER_ENABLE		BIT(1)
 #define MWAITX_MAX_WAIT_CYCLES		UINT_MAX
 #define MWAITX_DISABLE_CSTATES		0xf0
+#define TPAUSE_C01_STATE		1
+#define TPAUSE_C02_STATE		0
 
 u32 get_umwait_control_msr(void);
 
@@ -122,4 +124,24 @@ static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
 	current_clr_polling();
 }
 
+/*
+ * Caller can specify whether to enter C0.1 (low latency, less
+ * power saving) or C0.2 state (saves more power, but longer wakeup
+ * latency). This may be overridden by the IA32_UMWAIT_CONTROL MSR
+ * which can force requests for C0.2 to be downgraded to C0.1.
+ */
+static inline void __tpause(u32 ecx, u32 edx, u32 eax)
+{
+	/* "tpause %ecx, %edx, %eax;" */
+	#ifdef CONFIG_AS_TPAUSE
+	asm volatile("tpause %%ecx\n"
+		     :
+		     : "c"(ecx), "d"(edx), "a"(eax));
+	#else
+	asm volatile(".byte 0x66, 0x0f, 0xae, 0xf1\t\n"
+		     :
+		     : "c"(ecx), "d"(edx), "a"(eax));
+	#endif
+}
+
 #endif /* _ASM_X86_MWAIT_H */
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 106e7f87f534..371a6b348e44 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -103,6 +103,9 @@ static __init void x86_late_time_init(void)
 	 */
 	x86_init.irqs.intr_mode_init();
 	tsc_init();
+
+	if (static_cpu_has(X86_FEATURE_WAITPKG))
+		use_tpause_delay();
 }
 
 /*
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index fe91dc171cf8..65d15df6212d 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -96,6 +96,27 @@ static void delay_tsc(u64 cycles)
 	preempt_enable();
 }
 
+/*
+ * On Intel the TPAUSE instruction waits until any of:
+ * 1) the TSC counter exceeds the value provided in EDX:EAX
+ * 2) global timeout in IA32_UMWAIT_CONTROL is exceeded
+ * 3) an external interrupt occurs
+ */
+static void delay_halt_tpause(u64 start, u64 cycles)
+{
+	u64 until = start + cycles;
+	u32 eax, edx;
+
+	eax = lower_32_bits(until);
+	edx = upper_32_bits(until);
+
+	/*
+	 * Hard code the deeper (C0.2) sleep state because exit latency is
+	 * small compared to the "microseconds" that usleep() will delay.
+	 */
+	__tpause(TPAUSE_C02_STATE, edx, eax);
+}
+
 /*
  * On some AMD platforms, MWAITX has a configurable 32-bit timer, that
  * counts with TSC frequency. The input value is the number of TSC cycles
@@ -156,6 +177,12 @@ void __init use_tsc_delay(void)
 		delay_fn = delay_tsc;
 }
 
+void __init use_tpause_delay(void)
+{
+	delay_halt_fn = delay_halt_tpause;
+	delay_fn = delay_halt;
+}
+
 void use_mwaitx_delay(void)
 {
 	delay_halt_fn = delay_halt_mwaitx;
-- cgit v1.2.3
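[ Illustration, not part of the series: user-space TPAUSE via the compiler
  intrinsic. GCC (>= 9) and recent Clang expose it as _tpause() when built
  with -mwaitpkg; a real program should first check the WAITPKG CPUID bit.
  Control value 0 requests C0.2 and 1 requests C0.1, mirroring
  TPAUSE_C02_STATE/TPAUSE_C01_STATE above; the function name is invented. ]

#include <stdint.h>
#include <x86intrin.h>          /* __rdtsc(); _tpause() with -mwaitpkg */

static void tpause_delay(uint64_t cycles)
{
        /* TPAUSE waits on an absolute EDX:EAX deadline; the intrinsic
           takes it as a single 64-bit value. */
        uint64_t until = __rdtsc() + cycles;

        /* 0 = C0.2 (deeper sleep). The wait may end early on an interrupt
           or the IA32_UMWAIT_CONTROL limit, so a robust caller loops the
           way delay_halt() above does. */
        _tpause(0, until);
}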
From bd35c77e32e4359580207891c0f7a438ad4b42df Mon Sep 17 00:00:00 2001
From: Krzysztof Piecuch <piecuch@protonmail.com>
Date: Thu, 23 Jan 2020 16:09:26 +0000
Subject: x86/tsc: Add tsc_early_khz command line parameter

Changing base clock frequency directly impacts TSC Hz but not CPUID.16h
value. An overclocked CPU supporting CPUID.16h and with partial CPUID.15h
support will set TSC KHZ according to "best guess" given by CPUID.16h
relying on tsc_refine_calibration_work to give better numbers later.

tsc_refine_calibration_work will refuse to do its work when the outcome
is off the early TSC KHZ value by more than 1% which is certain to happen
on an overclocked system.

Fix this by adding a tsc_early_khz command line parameter that makes the
kernel skip early TSC calibration and use the given value instead.

This allows the user to provide the expected TSC frequency that is closer
to reality than the one reported by the hardware, enabling
tsc_refine_calibration_work to do meaningful error checking.

[ tglx: Made the variable __initdata as it's only used on init and
  removed the error checking in the argument parser because kstrto*()
  only stores to the variable if the string is valid ]

Signed-off-by: Krzysztof Piecuch <piecuch@protonmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/O2CpIOrqLZHgNRkfjRpz_LGqnc1ix_seNIiOCvHY4RHoulOVRo6kMXKuLOfBVTi0SMMevg6Go1uZ_cL9fLYtYdTRNH78ChaFaZyG3VAyYz8=@protonmail.com
---
 Documentation/admin-guide/kernel-parameters.txt |  6 ++++++
 arch/x86/kernel/tsc.c                           | 12 +++++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 7bc83f3d9bdf..409ee4ae467c 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5067,6 +5067,12 @@
 			interruptions from clocksource watchdog are not
 			acceptable).
 
+	tsc_early_khz=	[X86] Skip early TSC calibration and use the given
+			value instead. Useful when the early TSC frequency discovery
+			procedure is not reliable, such as on overclocked systems
+			with CPUID.16h support and partial CPUID.15h support.
+			Format: <unsigned int>
+
 	tsx=		[X86] Control Transactional Synchronization
 			Extensions (TSX) feature in Intel processors that
 			support TSX control.
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index fdd4c1078632..49d925043171 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -41,6 +41,7 @@ EXPORT_SYMBOL(tsc_khz);
  * TSC can be unstable due to cpufreq or due to unsynced TSCs
  */
 static int __read_mostly tsc_unstable;
+static unsigned int __initdata tsc_early_khz;
 
 static DEFINE_STATIC_KEY_FALSE(__use_tsc);
 
@@ -59,6 +60,12 @@ struct cyc2ns {
 
 static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
 
+static int __init tsc_early_khz_setup(char *buf)
+{
+	return kstrtouint(buf, 0, &tsc_early_khz);
+}
+early_param("tsc_early_khz", tsc_early_khz_setup);
+
 __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
 {
 	int seq, idx;
@@ -1412,7 +1419,10 @@ static bool __init determine_cpu_tsc_frequencies(bool early)
 
 	if (early) {
 		cpu_khz = x86_platform.calibrate_cpu();
-		tsc_khz = x86_platform.calibrate_tsc();
+		if (tsc_early_khz)
+			tsc_khz = tsc_early_khz;
+		else
+			tsc_khz = x86_platform.calibrate_tsc();
 	} else {
 		/* We should not be here with non-native cpu calibration */
 		WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu);
-- cgit v1.2.3
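[ Usage illustration with made-up numbers: a CPU with a 3.6 GHz nominal
  frequency overclocked to 4.2 GHz still reports the nominal value via
  CPUID.16h, so the refined calibration result is off by far more than 1%
  and gets rejected. Booting with

        tsc_early_khz=4200000

  on the kernel command line seeds the early calibration with the real
  frequency in kHz, and the refined value then passes the 1% sanity check. ]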