From e8824890249355656968d8846908a313fe231f11 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Apr 2020 12:37:54 -0700 Subject: x86/delay: Preparatory code cleanup The naming conventions in the delay code are confusing at best. All delay variants use a loops argument and/or variable which originates from the original delay_loop() implementation. But all variants except delay_loop() are based on TSC cycles. Rename the argument to cycles and make it type u64 to avoid these weird expansions to u64 in the functions. Rename MWAITX_MAX_LOOPS to MWAITX_MAX_WAIT_CYCLES for the same reason and fix up the comment of delay_mwaitx() as well. Mark the delay_fn function pointer __ro_after_init and fix up the comment for it. No functional change and preparation for the upcoming TPAUSE based delay variant. [ Kyung Min Park: Added __init to use_tsc_delay() ] Signed-off-by: Thomas Gleixner Signed-off-by: Kyung Min Park Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1587757076-30337-2-git-send-email-kyung.min.park@intel.com --- arch/x86/lib/delay.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) (limited to 'arch/x86/lib') diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index c126571e5e2e..887d52d5a7cc 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -27,9 +27,19 @@ # include #endif +static void delay_loop(u64 __loops); + +/* + * Calibration and selection of the delay mechanism happens only once + * during boot. 
+ */ +static void (*delay_fn)(u64) __ro_after_init = delay_loop; + /* simple loop based delay: */ -static void delay_loop(unsigned long loops) +static void delay_loop(u64 __loops) { + unsigned long loops = (unsigned long)__loops; + asm volatile( " test %0,%0 \n" " jz 3f \n" @@ -49,9 +59,9 @@ static void delay_loop(unsigned long loops) } /* TSC based delay: */ -static void delay_tsc(unsigned long __loops) +static void delay_tsc(u64 cycles) { - u64 bclock, now, loops = __loops; + u64 bclock, now; int cpu; preempt_disable(); @@ -59,7 +69,7 @@ static void delay_tsc(unsigned long __loops) bclock = rdtsc_ordered(); for (;;) { now = rdtsc_ordered(); - if ((now - bclock) >= loops) + if ((now - bclock) >= cycles) break; /* Allow RT tasks to run */ @@ -77,7 +87,7 @@ static void delay_tsc(unsigned long __loops) * counter for this CPU. */ if (unlikely(cpu != smp_processor_id())) { - loops -= (now - bclock); + cycles -= (now - bclock); cpu = smp_processor_id(); bclock = rdtsc_ordered(); } @@ -87,24 +97,24 @@ static void delay_tsc(unsigned long __loops) /* * On some AMD platforms, MWAITX has a configurable 32-bit timer, that - * counts with TSC frequency. The input value is the loop of the - * counter, it will exit when the timer expires. + * counts with TSC frequency. The input value is the number of TSC cycles + * to wait. MWAITX will also exit when the timer expires. */ -static void delay_mwaitx(unsigned long __loops) +static void delay_mwaitx(u64 cycles) { - u64 start, end, delay, loops = __loops; + u64 start, end, delay; /* * Timer value of 0 causes MWAITX to wait indefinitely, unless there * is a store on the memory monitored by MONITORX. 
*/ - if (loops == 0) + if (!cycles) return; start = rdtsc_ordered(); for (;;) { - delay = min_t(u64, MWAITX_MAX_LOOPS, loops); + delay = min_t(u64, MWAITX_MAX_WAIT_CYCLES, cycles); /* * Use cpu_tss_rw as a cacheline-aligned, seldomly @@ -121,22 +131,15 @@ static void delay_mwaitx(unsigned long __loops) end = rdtsc_ordered(); - if (loops <= end - start) + if (cycles <= end - start) break; - loops -= end - start; - + cycles -= end - start; start = end; } } -/* - * Since we calibrate only once at boot, this - * function should be set once at boot and not changed - */ -static void (*delay_fn)(unsigned long) = delay_loop; - -void use_tsc_delay(void) +void __init use_tsc_delay(void) { if (delay_fn == delay_loop) delay_fn = delay_tsc; -- cgit v1.2.3 From 46f90c7aad62be1af76588108c730d826308a801 Mon Sep 17 00:00:00 2001 From: Kyung Min Park Date: Fri, 24 Apr 2020 12:37:55 -0700 Subject: x86/delay: Refactor delay_mwaitx() for TPAUSE support Refactor code to make it easier to add a new model specific function to delay for a number of cycles. No functional change. Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Signed-off-by: Kyung Min Park Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/1587757076-30337-3-git-send-email-kyung.min.park@intel.com --- arch/x86/lib/delay.c | 48 ++++++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 18 deletions(-) (limited to 'arch/x86/lib') diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 887d52d5a7cc..fe91dc171cf8 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -34,6 +34,7 @@ static void delay_loop(u64 __loops); * during boot. */ static void (*delay_fn)(u64) __ro_after_init = delay_loop; +static void (*delay_halt_fn)(u64 start, u64 cycles) __ro_after_init; /* simple loop based delay: */ static void delay_loop(u64 __loops) @@ -100,9 +101,33 @@ static void delay_tsc(u64 cycles) * counts with TSC frequency. 
The input value is the number of TSC cycles * to wait. MWAITX will also exit when the timer expires. */ -static void delay_mwaitx(u64 cycles) +static void delay_halt_mwaitx(u64 unused, u64 cycles) { - u64 start, end, delay; + u64 delay; + + delay = min_t(u64, MWAITX_MAX_WAIT_CYCLES, cycles); + /* + * Use cpu_tss_rw as a cacheline-aligned, seldomly accessed per-cpu + * variable as the monitor target. + */ + __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0); + + /* + * AMD, like Intel, supports the EAX hint and EAX=0xf means, do not + * enter any deep C-state and we use it here in delay() to minimize + * wakeup latency. + */ + __mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE); +} + +/* + * Call a vendor specific function to delay for a given amount of time. Because + * these functions may return earlier than requested, check for actual elapsed + * time and call again until done. + */ +static void delay_halt(u64 __cycles) +{ + u64 start, end, cycles = __cycles; /* * Timer value of 0 causes MWAITX to wait indefinitely, unless there @@ -114,21 +139,7 @@ static void delay_mwaitx(u64 cycles) start = rdtsc_ordered(); for (;;) { - delay = min_t(u64, MWAITX_MAX_WAIT_CYCLES, cycles); - - /* - * Use cpu_tss_rw as a cacheline-aligned, seldomly - * accessed per-cpu variable as the monitor target. - */ - __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0); - - /* - * AMD, like Intel's MWAIT version, supports the EAX hint and - * EAX=0xf0 means, do not enter any deep C-state and we use it - * here in delay() to minimize wakeup latency. 
- */ - __mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE); - + delay_halt_fn(start, cycles); end = rdtsc_ordered(); if (cycles <= end - start) @@ -147,7 +158,8 @@ void __init use_tsc_delay(void) void use_mwaitx_delay(void) { - delay_fn = delay_mwaitx; + delay_halt_fn = delay_halt_mwaitx; + delay_fn = delay_halt; } int read_current_timer(unsigned long *timer_val) -- cgit v1.2.3 From cec5f268cd02d25d2d74807843d8ae0292fe0fb7 Mon Sep 17 00:00:00 2001 From: Kyung Min Park Date: Fri, 24 Apr 2020 12:37:56 -0700 Subject: x86/delay: Introduce TPAUSE delay TPAUSE instructs the processor to enter an implementation-dependent optimized state. The instruction execution wakes up when the time-stamp counter reaches or exceeds the implicit EDX:EAX 64-bit input value. The instruction execution also wakes up due to the expiration of the operating system time-limit or by an external interrupt or exceptions such as a debug exception or a machine check exception. TPAUSE offers a choice of two lower power states: 1. Light-weight power/performance optimized state C0.1 2. Improved power/performance optimized state C0.2 This way, it can save power with low wake-up latency in comparison to spinloop based delay. The selection between the two is governed by the input register. TPAUSE is available on processors with X86_FEATURE_WAITPKG. 
Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Signed-off-by: Kyung Min Park Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/1587757076-30337-4-git-send-email-kyung.min.park@intel.com --- arch/x86/Kconfig.assembler | 4 ++++ arch/x86/include/asm/delay.h | 1 + arch/x86/include/asm/mwait.h | 22 ++++++++++++++++++++++ arch/x86/kernel/time.c | 3 +++ arch/x86/lib/delay.c | 27 +++++++++++++++++++++++++++ 5 files changed, 57 insertions(+) (limited to 'arch/x86/lib') diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler index 13de0db38d4e..26b8c08e2fc4 100644 --- a/arch/x86/Kconfig.assembler +++ b/arch/x86/Kconfig.assembler @@ -15,3 +15,7 @@ config AS_SHA256_NI def_bool $(as-instr,sha256msg1 %xmm0$(comma)%xmm1) help Supported by binutils >= 2.24 and LLVM integrated assembler +config AS_TPAUSE + def_bool $(as-instr,tpause %ecx) + help + Supported by binutils >= 2.31.1 and LLVM integrated assembler >= V7 diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h index 9aa38de7bd72..630891d25819 100644 --- a/arch/x86/include/asm/delay.h +++ b/arch/x86/include/asm/delay.h @@ -6,6 +6,7 @@ #include void __init use_tsc_delay(void); +void __init use_tpause_delay(void); void use_mwaitx_delay(void); #endif /* _ASM_X86_DELAY_H */ diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index a43b35b35049..73d997aa2966 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -22,6 +22,8 @@ #define MWAITX_ECX_TIMER_ENABLE BIT(1) #define MWAITX_MAX_WAIT_CYCLES UINT_MAX #define MWAITX_DISABLE_CSTATES 0xf0 +#define TPAUSE_C01_STATE 1 +#define TPAUSE_C02_STATE 0 u32 get_umwait_control_msr(void); @@ -122,4 +124,24 @@ static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) current_clr_polling(); } +/* + * Caller can specify whether to enter C0.1 (low latency, less + * power saving) or C0.2 state (saves more power, but longer wakeup + * latency). 
This may be overridden by the IA32_UMWAIT_CONTROL MSR + * which can force requests for C0.2 to be downgraded to C0.1. + */ +static inline void __tpause(u32 ecx, u32 edx, u32 eax) +{ + /* "tpause %ecx, %edx, %eax;" */ + #ifdef CONFIG_AS_TPAUSE + asm volatile("tpause %%ecx\n" + : + : "c"(ecx), "d"(edx), "a"(eax)); + #else + asm volatile(".byte 0x66, 0x0f, 0xae, 0xf1\t\n" + : + : "c"(ecx), "d"(edx), "a"(eax)); + #endif +} + #endif /* _ASM_X86_MWAIT_H */ diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 106e7f87f534..371a6b348e44 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -103,6 +103,9 @@ static __init void x86_late_time_init(void) */ x86_init.irqs.intr_mode_init(); tsc_init(); + + if (static_cpu_has(X86_FEATURE_WAITPKG)) + use_tpause_delay(); } /* diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index fe91dc171cf8..65d15df6212d 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -96,6 +96,27 @@ static void delay_tsc(u64 cycles) preempt_enable(); } +/* + * On Intel the TPAUSE instruction waits until any of: + * 1) the TSC counter exceeds the value provided in EDX:EAX + * 2) global timeout in IA32_UMWAIT_CONTROL is exceeded + * 3) an external interrupt occurs + */ +static void delay_halt_tpause(u64 start, u64 cycles) +{ + u64 until = start + cycles; + u32 eax, edx; + + eax = lower_32_bits(until); + edx = upper_32_bits(until); + + /* + * Hard code the deeper (C0.2) sleep state because exit latency is + * small compared to the "microseconds" that usleep() will delay. + */ + __tpause(TPAUSE_C02_STATE, edx, eax); +} + /* * On some AMD platforms, MWAITX has a configurable 32-bit timer, that * counts with TSC frequency. 
The input value is the number of TSC cycles @@ -156,6 +177,12 @@ void __init use_tsc_delay(void) delay_fn = delay_tsc; } +void __init use_tpause_delay(void) +{ + delay_halt_fn = delay_halt_tpause; + delay_fn = delay_halt; +} + void use_mwaitx_delay(void) { delay_halt_fn = delay_halt_mwaitx; -- cgit v1.2.3