From e32d045cd4ba06b59878323e434bad010e78e658 Mon Sep 17 00:00:00 2001 From: Rajneesh Bhardwaj Date: Thu, 6 Jun 2019 06:54:19 +0530 Subject: x86/cpu: Add Ice Lake NNPI to Intel family Add the CPUID model number of Ice Lake Neural Network Processor for Deep Learning Inference (ICL-NNPI) to the Intel family list. Ice Lake NNPI uses model number 0x9D and this will be documented in a future version of Intel Software Development Manual. Signed-off-by: Rajneesh Bhardwaj Signed-off-by: Thomas Gleixner Cc: bp@suse.de Cc: Borislav Petkov Cc: Dave Hansen Cc: Andy Shevchenko Cc: "H. Peter Anvin" Cc: Kan Liang Cc: Peter Zijlstra Cc: platform-driver-x86@vger.kernel.org Cc: Qiuxu Zhuo Cc: Srinivas Pandruvada Cc: Len Brown Cc: Linux PM Link: https://lkml.kernel.org/r/20190606012419.13250-1-rajneesh.bhardwaj@linux.intel.com --- arch/x86/include/asm/intel-family.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 9f15384c504a..087de5d3b93a 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -53,6 +53,7 @@ #define INTEL_FAM6_CANNONLAKE_MOBILE 0x66 #define INTEL_FAM6_ICELAKE_MOBILE 0x7E +#define INTEL_FAM6_ICELAKE_NNPI 0x9D /* "Small Core" Processors (Atom) */ -- cgit v1.2.3 From cbb99c0f588737ec98c333558922ce47e9a95827 Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Wed, 5 Jun 2019 15:02:52 -0700 Subject: x86/cpufeatures: Add FDP_EXCPTN_ONLY and ZERO_FCS_FDS Add the CPUID enumeration for Intel's de-feature bits to accommodate passing these de-features through to kvm guests. These de-features are (from SDM vol 1, section 8.1.8): - X86_FEATURE_FDP_EXCPTN_ONLY: If CPUID.(EAX=07H,ECX=0H):EBX[bit 6] = 1, the data pointer (FDP) is updated only for the x87 non-control instructions that incur unmasked x87 exceptions. - X86_FEATURE_ZERO_FCS_FDS: If CPUID.(EAX=07H,ECX=0H):EBX[bit 13] = 1, the processor deprecates FCS and FDS; it saves each as 0000H. Signed-off-by: Aaron Lewis Signed-off-by: Borislav Petkov Reviewed-by: Jim Mattson Cc: Fenghua Yu Cc: Frederic Weisbecker Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Konrad Rzeszutek Wilk Cc: marcorr@google.com Cc: Peter Feiner Cc: pshier@google.com Cc: Robert Hoo Cc: Thomas Gleixner Cc: Thomas Lendacky Cc: x86-ml Link: https://lkml.kernel.org/r/20190605220252.103406-1-aaronlewis@google.com --- arch/x86/include/asm/cpufeatures.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 75f27ee2c263..1017b9c7dfe0 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -239,12 +239,14 @@ #define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ #define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ #define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ +#define X86_FEATURE_FDP_EXCPTN_ONLY ( 9*32+ 6) /* "" FPU data pointer updated only on x87 exceptions */ #define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ #define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ #define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */ #define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ #define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ #define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ +#define X86_FEATURE_ZERO_FCS_FDS ( 9*32+13) /* "" Zero out FPU CS and FPU DS */ #define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ #define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ #define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ -- cgit v1.2.3 From acec0ce081de0c36459eea91647faf99296445a3 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 19 Jun 2019 18:51:09 +0200 Subject: x86/cpufeatures: Combine word 11 and 12 into a new scattered features word MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's a waste for the four X86_FEATURE_CQM_* feature bits to occupy two whole feature bits words. To better utilize feature words, re-define word 11 to host scattered features and move the four X86_FEATURE_CQM_* features into Linux defined word 11. More scattered features can be added in word 11 in the future. Rename leaf 11 in cpuid_leafs to CPUID_LNX_4 to reflect it's a Linux-defined leaf. Rename leaf 12 as CPUID_DUMMY which will be replaced by a meaningful name in the next patch when CPUID.7.1:EAX occupies world 12. Maximum number of RMID and cache occupancy scale are retrieved from CPUID.0xf.1 after scattered CQM features are enumerated. Carve out the code into a separate function. KVM doesn't support resctrl now. So it's safe to move the X86_FEATURE_CQM_* features to scattered features word 11 for KVM. Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Cc: Aaron Lewis Cc: Andy Lutomirski Cc: Babu Moger Cc: "Chang S. Bae" Cc: "Sean J Christopherson" Cc: Frederic Weisbecker Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Juergen Gross Cc: Konrad Rzeszutek Wilk Cc: kvm ML Cc: Masahiro Yamada Cc: Masami Hiramatsu Cc: Nadav Amit Cc: Paolo Bonzini Cc: Pavel Tatashin Cc: Peter Feiner Cc: "Peter Zijlstra (Intel)" Cc: "Radim Krčmář" Cc: "Rafael J. 
Wysocki" Cc: Ravi V Shankar Cc: Sherry Hurwitz Cc: Thomas Gleixner Cc: Thomas Lendacky Cc: x86 Link: https://lkml.kernel.org/r/1560794416-217638-2-git-send-email-fenghua.yu@intel.com --- arch/x86/include/asm/cpufeature.h | 4 ++-- arch/x86/include/asm/cpufeatures.h | 17 ++++++++++------- arch/x86/kernel/cpu/common.c | 38 +++++++++++++++----------------------- arch/x86/kernel/cpu/cpuid-deps.c | 3 +++ arch/x86/kernel/cpu/scattered.c | 4 ++++ arch/x86/kvm/cpuid.h | 2 -- 6 files changed, 34 insertions(+), 34 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 1d337c51f7e6..403f70c2e431 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -22,8 +22,8 @@ enum cpuid_leafs CPUID_LNX_3, CPUID_7_0_EBX, CPUID_D_1_EAX, - CPUID_F_0_EDX, - CPUID_F_1_EDX, + CPUID_LNX_4, + CPUID_DUMMY, CPUID_8000_0008_EBX, CPUID_6_EAX, CPUID_8000_000A_EDX, diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 1017b9c7dfe0..be858b86023a 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -271,13 +271,16 @@ #define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */ #define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */ -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (EDX), word 11 */ -#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ - -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (EDX), word 12 */ -#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring */ -#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ -#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ +/* + * Extended auxiliary flags: Linux defined - for features scattered in various + * CPUID levels like 0xf, etc. + * + * Reuse free bits when adding new feature flags! 
+ */ +#define X86_FEATURE_CQM_LLC (11*32+ 0) /* LLC QoS if 1 */ +#define X86_FEATURE_CQM_OCCUP_LLC (11*32+ 1) /* LLC occupancy monitoring */ +#define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* LLC Total MBM monitoring */ +#define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index fe6ed9696467..efb114298cfb 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -803,33 +803,25 @@ static void init_speculation_control(struct cpuinfo_x86 *c) static void init_cqm(struct cpuinfo_x86 *c) { - u32 eax, ebx, ecx, edx; - - /* Additional Intel-defined flags: level 0x0000000F */ - if (c->cpuid_level >= 0x0000000F) { + if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { + c->x86_cache_max_rmid = -1; + c->x86_cache_occ_scale = -1; + return; + } - /* QoS sub-leaf, EAX=0Fh, ECX=0 */ - cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx); - c->x86_capability[CPUID_F_0_EDX] = edx; + /* will be overridden if occupancy monitoring exists */ + c->x86_cache_max_rmid = cpuid_ebx(0xf); - if (cpu_has(c, X86_FEATURE_CQM_LLC)) { - /* will be overridden if occupancy monitoring exists */ - c->x86_cache_max_rmid = ebx; + if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || + cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || + cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { + u32 eax, ebx, ecx, edx; - /* QoS sub-leaf, EAX=0Fh, ECX=1 */ - cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx); - c->x86_capability[CPUID_F_1_EDX] = edx; + /* QoS sub-leaf, EAX=0Fh, ECX=1 */ + cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx); - if ((cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) || - ((cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL)) || - (cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)))) { - c->x86_cache_max_rmid = ecx; - c->x86_cache_occ_scale = ebx; - } - } else { - c->x86_cache_max_rmid = -1; - c->x86_cache_occ_scale = -1; - } + c->x86_cache_max_rmid = ecx; + c->x86_cache_occ_scale = ebx; } } diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 2c0bd38a44ab..fa07a224e7b9 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -59,6 +59,9 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, + { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, {} }; diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 94aa1c72ca98..adf9b71386ef 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -26,6 +26,10 @@ struct cpuid_bit { static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, + { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, + { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, + { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 }, { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 9a327d5b6d1f..d78a61408243 100644 --- 
a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -47,8 +47,6 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX}, [CPUID_7_0_EBX] = { 7, 0, CPUID_EBX}, [CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX}, - [CPUID_F_0_EDX] = { 0xf, 0, CPUID_EDX}, - [CPUID_F_1_EDX] = { 0xf, 1, CPUID_EDX}, [CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX}, [CPUID_6_EAX] = { 6, 0, CPUID_EAX}, [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX}, -- cgit v1.2.3 From b302e4b176d00e1cbc80148c5d0aee36751f7480 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 17 Jun 2019 11:00:16 -0700 Subject: x86/cpufeatures: Enumerate the new AVX512 BFLOAT16 instructions AVX512 BFLOAT16 instructions support 16-bit BFLOAT16 floating-point format (BF16) for deep learning optimization. BF16 is a short version of 32-bit single-precision floating-point format (FP32) and has several advantages over 16-bit half-precision floating-point format (FP16). BF16 keeps FP32 accumulation after multiplication without loss of precision, offers more than enough range for deep learning training tasks, and doesn't need to handle hardware exception. AVX512 BFLOAT16 instructions are enumerated in CPUID.7.1:EAX[bit 5] AVX512_BF16. CPUID.7.1:EAX contains only feature bits. Reuse the currently empty word 12 as a pure features word to hold the feature bits including AVX512_BF16. Detailed information of the CPUID bit and AVX512 BFLOAT16 instructions can be found in the latest Intel Architecture Instruction Set Extensions and Future Features Programming Reference. [ bp: Check CPUID(7) subleaf validity before accessing subleaf 1. ] Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Cc: "Chang S. Bae" Cc: Frederic Weisbecker Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Masahiro Yamada Cc: Michael Ellerman Cc: Nadav Amit Cc: Paolo Bonzini Cc: Pavel Tatashin Cc: Peter Feiner Cc: Radim Krcmar Cc: "Rafael J. 
Wysocki" Cc: "Ravi V Shankar" Cc: Robert Hoo Cc: "Sean J Christopherson" Cc: Thomas Gleixner Cc: Thomas Lendacky Cc: x86 Link: https://lkml.kernel.org/r/1560794416-217638-3-git-send-email-fenghua.yu@intel.com --- arch/x86/include/asm/cpufeature.h | 2 +- arch/x86/include/asm/cpufeatures.h | 3 +++ arch/x86/kernel/cpu/common.c | 6 ++++++ arch/x86/kernel/cpu/cpuid-deps.c | 1 + 4 files changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 403f70c2e431..58acda503817 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -23,7 +23,7 @@ enum cpuid_leafs CPUID_7_0_EBX, CPUID_D_1_EAX, CPUID_LNX_4, - CPUID_DUMMY, + CPUID_7_1_EAX, CPUID_8000_0008_EBX, CPUID_6_EAX, CPUID_8000_000A_EDX, diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index be858b86023a..8ecd9fac97c3 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -282,6 +282,9 @@ #define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* LLC Total MBM monitoring */ #define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */ +/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ +#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ + /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index efb114298cfb..dad20bc891d5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -847,6 +847,12 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_7_0_EBX] = ebx; c->x86_capability[CPUID_7_ECX] = ecx; c->x86_capability[CPUID_7_EDX] = edx; + + /* Check valid sub-leaf index before accessing it */ + if (eax >= 1) { + cpuid_count(0x00000007, 1, &eax, &ebx, &ecx, &edx); + c->x86_capability[CPUID_7_1_EAX] = eax; + } } /* Extended state features: level 0x0000000d */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index fa07a224e7b9..a444028d8145 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -62,6 +62,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC }, { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, {} }; -- cgit v1.2.3 From 8b71340d702ec5d587443b38a852671c4fb6a723 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 8 May 2019 03:02:20 -0700 Subject: x86/fsgsbase/64: Add intrinsics for FSGSBASE instructions [ luto: Rename the variables from FS and GS to FSBASE and GSBASE and make safe to include on 32-bit kernels. ] Signed-off-by: Andi Kleen Signed-off-by: Andy Lutomirski Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Reviewed-by: Andi Kleen Cc: Ravi Shankar Cc: H. 
Peter Anvin Link: https://lkml.kernel.org/r/1557309753-24073-6-git-send-email-chang.seok.bae@intel.com --- arch/x86/include/asm/fsgsbase.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h index bca4c743de77..fdd1177499b4 100644 --- a/arch/x86/include/asm/fsgsbase.h +++ b/arch/x86/include/asm/fsgsbase.h @@ -19,6 +19,36 @@ extern unsigned long x86_gsbase_read_task(struct task_struct *task); extern void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase); extern void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase); +/* Must be protected by X86_FEATURE_FSGSBASE check. */ + +static __always_inline unsigned long rdfsbase(void) +{ + unsigned long fsbase; + + asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory"); + + return fsbase; +} + +static __always_inline unsigned long rdgsbase(void) +{ + unsigned long gsbase; + + asm volatile("rdgsbase %0" : "=r" (gsbase) :: "memory"); + + return gsbase; +} + +static __always_inline void wrfsbase(unsigned long fsbase) +{ + asm volatile("wrfsbase %0" :: "r" (fsbase) : "memory"); +} + +static __always_inline void wrgsbase(unsigned long gsbase) +{ + asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory"); +} + /* Helper functions for reading/writing FS/GS base */ static inline unsigned long x86_fsbase_read_cpu(void) -- cgit v1.2.3 From a86b4625138d39e97b4cc254fc9c4bb9e1dc4542 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Wed, 8 May 2019 03:02:21 -0700 Subject: x86/fsgsbase/64: Enable FSGSBASE instructions in helper functions Add cpu feature conditional FSGSBASE access to the relevant helper functions. That allows to accelerate certain FS/GS base operations in subsequent changes. Note, that while possible, the user space entry/exit GSBASE operations are not going to use the new FSGSBASE instructions. The reason is that it would require additional storage for the user space value which adds more complexity to the low level code and experiments have shown marginal benefit. This may be revisited later but for now the SWAPGS based handling in the entry code is preserved except for the paranoid entry/exit code. To preserve the SWAPGS entry mechanism introduce __[rd|wr]gsbase_inactive() helpers. Note, for Xen PV, paravirt hooks can be added later as they might allow a very efficient but different implementation. [ tglx: Massaged changelog ] Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Andi Kleen Cc: Ravi Shankar Cc: Andrew Cooper Cc: H. 
Peter Anvin Link: https://lkml.kernel.org/r/1557309753-24073-7-git-send-email-chang.seok.bae@intel.com --- arch/x86/include/asm/fsgsbase.h | 27 ++++++++--------- arch/x86/kernel/process_64.c | 66 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 15 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h index fdd1177499b4..aefd53767a5d 100644 --- a/arch/x86/include/asm/fsgsbase.h +++ b/arch/x86/include/asm/fsgsbase.h @@ -49,35 +49,32 @@ static __always_inline void wrgsbase(unsigned long gsbase) asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory"); } +#include + /* Helper functions for reading/writing FS/GS base */ static inline unsigned long x86_fsbase_read_cpu(void) { unsigned long fsbase; - rdmsrl(MSR_FS_BASE, fsbase); + if (static_cpu_has(X86_FEATURE_FSGSBASE)) + fsbase = rdfsbase(); + else + rdmsrl(MSR_FS_BASE, fsbase); return fsbase; } -static inline unsigned long x86_gsbase_read_cpu_inactive(void) -{ - unsigned long gsbase; - - rdmsrl(MSR_KERNEL_GS_BASE, gsbase); - - return gsbase; -} - static inline void x86_fsbase_write_cpu(unsigned long fsbase) { - wrmsrl(MSR_FS_BASE, fsbase); + if (static_cpu_has(X86_FEATURE_FSGSBASE)) + wrfsbase(fsbase); + else + wrmsrl(MSR_FS_BASE, fsbase); } -static inline void x86_gsbase_write_cpu_inactive(unsigned long gsbase) -{ - wrmsrl(MSR_KERNEL_GS_BASE, gsbase); -} +extern unsigned long x86_gsbase_read_cpu_inactive(void); +extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase); #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 250e4c4ac6d9..c34ee0f72378 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -161,6 +161,40 @@ enum which_selector { GS }; +/* + * Out of line to be protected from kprobes. It is not used on Xen + * paravirt. When paravirt support is needed, it needs to be renamed + * with native_ prefix. + */ +static noinline unsigned long __rdgsbase_inactive(void) +{ + unsigned long gsbase; + + lockdep_assert_irqs_disabled(); + + native_swapgs(); + gsbase = rdgsbase(); + native_swapgs(); + + return gsbase; +} +NOKPROBE_SYMBOL(__rdgsbase_inactive); + +/* + * Out of line to be protected from kprobes. It is not used on Xen + * paravirt. When paravirt support is needed, it needs to be renamed + * with native_ prefix. + */ +static noinline void __wrgsbase_inactive(unsigned long gsbase) +{ + lockdep_assert_irqs_disabled(); + + native_swapgs(); + wrgsbase(gsbase); + native_swapgs(); +} +NOKPROBE_SYMBOL(__wrgsbase_inactive); + /* * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are * not available. The goal is to be reasonably fast on non-FSGSBASE systems. @@ -339,6 +373,38 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task, return base; } +unsigned long x86_gsbase_read_cpu_inactive(void) +{ + unsigned long gsbase; + + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + unsigned long flags; + + /* Interrupts are disabled here. */ + local_irq_save(flags); + gsbase = __rdgsbase_inactive(); + local_irq_restore(flags); + } else { + rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + } + + return gsbase; +} + +void x86_gsbase_write_cpu_inactive(unsigned long gsbase) +{ + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + unsigned long flags; + + /* Interrupts are disabled here. 
*/ + local_irq_save(flags); + __wrgsbase_inactive(gsbase); + local_irq_restore(flags); + } else { + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + } +} + unsigned long x86_fsbase_read_task(struct task_struct *task) { unsigned long fsbase; -- cgit v1.2.3 From 79e1932fa3cedd731ddbd6af111fe4db8ca109ae Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Wed, 8 May 2019 03:02:26 -0700 Subject: x86/entry/64: Introduce the FIND_PERCPU_BASE macro GSBASE is used to find per-CPU data in the kernel. But when GSBASE is unknown, the per-CPU base can be found from the per_cpu_offset table with a CPU NR. The CPU NR is extracted from the limit field of the CPUNODE entry in GDT, or by the RDPID instruction. This is a prerequisite for using FSGSBASE in the low level entry code. Also, add the GAS-compatible RDPID macro as binutils 2.21 do not support it. Support is added in version 2.27. [ tglx: Massaged changelog ] Suggested-by: H. Peter Anvin Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Andi Kleen Cc: Ravi Shankar Cc: Dave Hansen Link: https://lkml.kernel.org/r/1557309753-24073-12-git-send-email-chang.seok.bae@intel.com --- arch/x86/entry/calling.h | 34 ++++++++++++++++++++++++++++++++++ arch/x86/include/asm/inst.h | 15 +++++++++++++++ 2 files changed, 49 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index efb0d1b1f15f..9a524360ae2e 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -6,6 +6,7 @@ #include #include #include +#include /* @@ -345,6 +346,39 @@ For 32-bit we have the following conventions - kernel is built with #endif .endm +#ifdef CONFIG_SMP + +/* + * CPU/node NR is loaded from the limit (size) field of a special segment + * descriptor entry in GDT. + */ +.macro LOAD_CPU_AND_NODE_SEG_LIMIT reg:req + movq $__CPUNODE_SEG, \reg + lsl \reg, \reg +.endm + +/* + * Fetch the per-CPU GSBASE value for this processor and put it in @reg. + * We normally use %gs for accessing per-CPU data, but we are setting up + * %gs here and obviously can not use %gs itself to access per-CPU data. + */ +.macro GET_PERCPU_BASE reg:req + ALTERNATIVE \ + "LOAD_CPU_AND_NODE_SEG_LIMIT \reg", \ + "RDPID \reg", \ + X86_FEATURE_RDPID + andq $VDSO_CPUNODE_MASK, \reg + movq __per_cpu_offset(, \reg, 8), \reg +.endm + +#else + +.macro GET_PERCPU_BASE reg:req + movq pcpu_unit_offsets(%rip), \reg +.endm + +#endif /* CONFIG_SMP */ + /* * This does 'call enter_from_user_mode' unless we can avoid it based on * kernel config or using the static jump infrastructure. diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h index f5a796da07f8..d063841a17e3 100644 --- a/arch/x86/include/asm/inst.h +++ b/arch/x86/include/asm/inst.h @@ -306,6 +306,21 @@ .endif MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2 .endm + +.macro RDPID opd + REG_TYPE rdpid_opd_type \opd + .if rdpid_opd_type == REG_TYPE_R64 + R64_NUM rdpid_opd \opd + .else + R32_NUM rdpid_opd \opd + .endif + .byte 0xf3 + .if rdpid_opd > 7 + PFX_REX rdpid_opd 0 + .endif + .byte 0x0f, 0xc7 + MODRM 0xc0 rdpid_opd 0x7 +.endm #endif #endif -- cgit v1.2.3 From f987c955c74501c9295a81372c7d363cbe07c8a6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 8 May 2019 03:02:32 -0700 Subject: x86/elf: Enumerate kernel FSGSBASE capability in AT_HWCAP2 The kernel needs to explicitly enable FSGSBASE. So, the application needs to know if it can safely use these instructions. 
Just looking at the CPUID bit is not enough because it may be running in a kernel that does not enable the instructions. One way for the application would be to just try and catch the SIGILL. But that is difficult to do in libraries which may not want to overwrite the signal handlers of the main application. Enumerate the enabled FSGSBASE capability in bit 1 of AT_HWCAP2 in the ELF aux vector. AT_HWCAP2 is already used by PPC for similar purposes. The application can access it open coded or by using the getauxval() function in newer versions of glibc. [ tglx: Massaged changelog ] Signed-off-by: Andi Kleen Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Ravi Shankar Cc: H. Peter Anvin Link: https://lkml.kernel.org/r/1557309753-24073-18-git-send-email-chang.seok.bae@intel.com --- arch/x86/include/uapi/asm/hwcap2.h | 3 +++ arch/x86/kernel/cpu/common.c | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h index 6ebaae90e207..c5ce54e749f6 100644 --- a/arch/x86/include/uapi/asm/hwcap2.h +++ b/arch/x86/include/uapi/asm/hwcap2.h @@ -5,4 +5,7 @@ /* MONITOR/MWAIT enabled in Ring 3 */ #define HWCAP2_RING3MWAIT (1 << 0) +/* Kernel allows FSGSBASE instructions available in Ring 3 */ +#define HWCAP2_FSGSBASE BIT(1) + #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 1305f16b6105..637c9117d5ae 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1387,8 +1387,10 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_umip(c); /* Enable FSGSBASE instructions if available. */ - if (cpu_has(c, X86_FEATURE_FSGSBASE)) + if (cpu_has(c, X86_FEATURE_FSGSBASE)) { cr4_set_bits(X86_CR4_FSGSBASE); + elf_hwcap2 |= HWCAP2_FSGSBASE; + } /* * The vendor-specific functions might have changed features. -- cgit v1.2.3 From 0a05fa67e62c73baad2a7d0fa20e8f96896c1093 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 17 Jun 2019 14:55:37 +0300 Subject: x86/cpu: Split Tremont based Atoms from the rest Split Tremont based Atoms from the rest to keep logical grouping. Signed-off-by: Andy Shevchenko Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/20190617115537.33309-1-andriy.shevchenko@linux.intel.com --- arch/x86/include/asm/intel-family.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 087de5d3b93a..c92a367a4a7a 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -74,6 +74,7 @@ #define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */ #define INTEL_FAM6_ATOM_GOLDMONT_X 0x5F /* Denverton */ #define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */ + #define INTEL_FAM6_ATOM_TREMONT_X 0x86 /* Jacobsville */ /* Xeon Phi */ -- cgit v1.2.3 From 761fdd5e3327db6c646a09bab5ad48cd42680cd2 Mon Sep 17 00:00:00 2001 From: Tony W Wang-oc Date: Tue, 18 Jun 2019 08:37:05 +0000 Subject: x86/cpu: Create Zhaoxin processors architecture support file Add x86 architecture support for new Zhaoxin processors. Carve out initialization code needed by Zhaoxin processors into a separate compilation unit. To identify Zhaoxin CPU, add a new vendor type X86_VENDOR_ZHAOXIN for system recognition. 
Signed-off-by: Tony W Wang-oc Signed-off-by: Thomas Gleixner Cc: "hpa@zytor.com" Cc: "gregkh@linuxfoundation.org" Cc: "rjw@rjwysocki.net" Cc: "lenb@kernel.org" Cc: David Wang Cc: "Cooper Yan(BJ-RD)" Cc: "Qiyuan Wang(BJ-RD)" Cc: "Herry Yang(BJ-RD)" Link: https://lkml.kernel.org/r/01042674b2f741b2aed1f797359bdffb@zhaoxin.com --- MAINTAINERS | 6 ++ arch/x86/Kconfig.cpu | 13 +++ arch/x86/include/asm/processor.h | 3 +- arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/zhaoxin.c | 167 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 189 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/cpu/zhaoxin.c (limited to 'arch/x86/include') diff --git a/MAINTAINERS b/MAINTAINERS index 57f496cff999..dfdefc6cb3a2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17477,6 +17477,12 @@ Q: https://patchwork.linuxtv.org/project/linux-media/list/ S: Maintained F: drivers/media/dvb-frontends/zd1301_demod* +ZHAOXIN PROCESSOR SUPPORT +M: Tony W Wang-oc +L: linux-kernel@vger.kernel.org +S: Maintained +F: arch/x86/kernel/cpu/zhaoxin.c + ZPOOL COMPRESSED PAGE STORAGE API M: Dan Streetman L: linux-mm@kvack.org diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 6adce15268bd..8e29c991ba3e 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -480,3 +480,16 @@ config CPU_SUP_UMC_32 CPU might render the kernel unbootable. If unsure, say N. + +config CPU_SUP_ZHAOXIN + default y + bool "Support Zhaoxin processors" if PROCESSOR_SELECT + help + This enables detection, tunings and quirks for Zhaoxin processors + + You need this enabled if you want your kernel to run on a + Zhaoxin CPU. Disabling this option on other types of CPUs + makes the kernel a tiny bit smaller. Disabling it on a Zhaoxin + CPU might render the kernel unbootable. + + If unsure, say N. 
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c34a35c78618..e57d2ca2ed87 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -144,7 +144,8 @@ enum cpuid_regs_idx { #define X86_VENDOR_TRANSMETA 7 #define X86_VENDOR_NSC 8 #define X86_VENDOR_HYGON 9 -#define X86_VENDOR_NUM 10 +#define X86_VENDOR_ZHAOXIN 10 +#define X86_VENDOR_NUM 11 #define X86_VENDOR_UNKNOWN 0xff diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 5102bf7c8192..a7d9a4cb3ab6 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -38,6 +38,7 @@ obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o +obj-$(CONFIG_CPU_SUP_ZHAOXIN) += zhaoxin.o obj-$(CONFIG_X86_MCE) += mce/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c new file mode 100644 index 000000000000..8e6f2f4b4afe --- /dev/null +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +#include + +#include "cpu.h" + +#define MSR_ZHAOXIN_FCR57 0x00001257 + +#define ACE_PRESENT (1 << 6) +#define ACE_ENABLED (1 << 7) +#define ACE_FCR (1 << 7) /* MSR_ZHAOXIN_FCR */ + +#define RNG_PRESENT (1 << 2) +#define RNG_ENABLED (1 << 3) +#define RNG_ENABLE (1 << 8) /* MSR_ZHAOXIN_RNG */ + +#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000 +#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000 +#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000 +#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001 +#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002 +#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020 + +static void init_zhaoxin_cap(struct cpuinfo_x86 *c) +{ + u32 lo, hi; + + /* Test for Extended Feature Flags presence */ + if (cpuid_eax(0xC0000000) >= 0xC0000001) { + u32 tmp = cpuid_edx(0xC0000001); + + /* Enable ACE unit, if present and disabled */ + if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) { + rdmsr(MSR_ZHAOXIN_FCR57, lo, hi); + /* Enable ACE unit */ + lo |= ACE_FCR; + wrmsr(MSR_ZHAOXIN_FCR57, lo, hi); + pr_info("CPU: Enabled ACE h/w crypto\n"); + } + + /* Enable RNG unit, if present and disabled */ + if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) { + rdmsr(MSR_ZHAOXIN_FCR57, lo, hi); + /* Enable RNG unit */ + lo |= RNG_ENABLE; + wrmsr(MSR_ZHAOXIN_FCR57, lo, hi); + pr_info("CPU: Enabled h/w RNG\n"); + } + + /* + * Store Extended Feature Flags as word 5 of the CPU + * capability bit array + */ + c->x86_capability[CPUID_C000_0001_EDX] = cpuid_edx(0xC0000001); + } + + if (c->x86 >= 0x6) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + cpu_detect_cache_sizes(c); +} + +static void early_init_zhaoxin(struct cpuinfo_x86 *c) +{ + if (c->x86 >= 0x6) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSENTER32); +#endif + if (c->x86_power & (1 << 8)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } + + if (c->cpuid_level >= 0x00000001) { + u32 eax, ebx, ecx, edx; + + cpuid(0x00000001, &eax, &ebx, &ecx, &edx); + /* + * If HTT (EDX[28]) is set EBX[16:23] contain the number of + * apicids which are reserved per package. Store the resulting + * shift value for the package management code. 
+ */ + if (edx & (1U << 28)) + c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff); + } + +} + +static void zhaoxin_detect_vmx_virtcap(struct cpuinfo_x86 *c) +{ + u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2; + + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high); + msr_ctl = vmx_msr_high | vmx_msr_low; + + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW) + set_cpu_cap(c, X86_FEATURE_TPR_SHADOW); + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI) + set_cpu_cap(c, X86_FEATURE_VNMI); + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) { + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, + vmx_msr_low, vmx_msr_high); + msr_ctl2 = vmx_msr_high | vmx_msr_low; + if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) && + (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)) + set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); + if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) + set_cpu_cap(c, X86_FEATURE_EPT); + if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID) + set_cpu_cap(c, X86_FEATURE_VPID); + } +} + +static void init_zhaoxin(struct cpuinfo_x86 *c) +{ + early_init_zhaoxin(c); + init_intel_cacheinfo(c); + detect_num_cpu_cores(c); +#ifdef CONFIG_X86_32 + detect_ht(c); +#endif + + if (c->cpuid_level > 9) { + unsigned int eax = cpuid_eax(10); + + /* + * Check for version and the number of counters + * Version(eax[7:0]) can't be 0; + * Counters(eax[15:8]) should be greater than 1; + */ + if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1)) + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); + } + + if (c->x86 >= 0x6) + init_zhaoxin_cap(c); +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); +#endif + + if (cpu_has(c, X86_FEATURE_VMX)) + zhaoxin_detect_vmx_virtcap(c); +} + +#ifdef CONFIG_X86_32 +static unsigned int +zhaoxin_size_cache(struct cpuinfo_x86 *c, unsigned int size) +{ + return size; +} +#endif + +static const struct cpu_dev zhaoxin_cpu_dev = { + .c_vendor = "zhaoxin", + .c_ident = { " Shanghai " }, + .c_early_init = early_init_zhaoxin, + .c_init = init_zhaoxin, +#ifdef CONFIG_X86_32 + .legacy_cache_size = zhaoxin_size_cache, +#endif + .c_x86_vendor = X86_VENDOR_ZHAOXIN, +}; + +cpu_dev_register(zhaoxin_cpu_dev); -- cgit v1.2.3 From 6dbbf5ec9e1e9f607a4c51266d0f9a63ba754b63 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 19 Jun 2019 18:33:54 -0700 Subject: x86/cpufeatures: Enumerate user wait instructions umonitor, umwait, and tpause are a set of user wait instructions. umonitor arms address monitoring hardware using an address. The address range is determined by using CPUID.0x5. A store to an address within the specified address range triggers the monitoring hardware to wake up the processor waiting in umwait. umwait instructs the processor to enter an implementation-dependent optimized state while monitoring a range of addresses. The optimized state may be either a light-weight power/performance optimized state (C0.1 state) or an improved power/performance optimized state (C0.2 state). tpause instructs the processor to enter an implementation-dependent optimized state C0.1 or C0.2 state and wake up when time-stamp counter reaches specified timeout. The three instructions may be executed at any privilege level. The instructions provide power saving method while waiting in user space. Additionally, they can allow a sibling hyperthread to make faster progress while this thread is waiting. One example of an application usage of umwait is when waiting for input data from another application, such as a user level multi-threaded packet processing engine. 
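As a rough illustration only (not part of this patch), a user-space wait loop built on the GCC/Clang waitpkg intrinsics could look like the sketch below; the flag variable, the TSC deadline and the -mwaitpkg build flag are assumptions made for the example, not something this patch defines.

#include <stdint.h>
#include <x86intrin.h>          /* _umonitor(), _umwait(), __rdtsc() */

volatile uint32_t flag;         /* written by another thread */

void wait_for_flag(void)
{
	while (!flag) {
		/* Arm address monitoring on the cache line holding 'flag' */
		_umonitor((void *)&flag);

		if (flag)       /* re-check to close the race window */
			break;

		/*
		 * Wait in the optimized state (control value 0 permits the
		 * deeper C0.2) until the monitored line is written or the
		 * TSC reaches the deadline, whichever comes first. The OS
		 * limit in IA32_UMWAIT_CONTROL may end the wait earlier.
		 */
		_umwait(0, __rdtsc() + 100000);
	}
}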
Availability of the user wait instructions is indicated by the presence of the CPUID feature flag WAITPKG CPUID.0x07.0x0:ECX[5]. Detailed information on the instructions and CPUID feature WAITPKG flag can be found in the latest Intel Architecture Instruction Set Extensions and Future Features Programming Reference and Intel 64 and IA-32 Architectures Software Developer's Manual. Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Reviewed-by: Ashok Raj Reviewed-by: Andy Lutomirski Cc: "Borislav Petkov" Cc: "H Peter Anvin" Cc: "Peter Zijlstra" Cc: "Tony Luck" Cc: "Ravi V Shankar" Link: https://lkml.kernel.org/r/1560994438-235698-2-git-send-email-fenghua.yu@intel.com --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 8ecd9fac97c3..998c2cc08363 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -330,6 +330,7 @@ #define X86_FEATURE_UMIP (16*32+ 2) /* User Mode Instruction Protection */ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +#define X86_FEATURE_WAITPKG (16*32+ 5) /* UMONITOR/UMWAIT/TPAUSE Instructions */ #define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ #define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ #define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ -- cgit v1.2.3 From bd688c69b7e6693de3bd78f38fd63f7850c2711e Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 19 Jun 2019 18:33:55 -0700 Subject: x86/umwait: Initialize umwait control values umwait or tpause allows the processor to enter a light-weight power/performance optimized state (C0.1 state) or an improved power/performance optimized state (C0.2 state) for a period specified by the instruction or until the system time limit or until a store to the monitored address range in umwait. IA32_UMWAIT_CONTROL MSR register allows the OS to enable/disable C0.2 on the processor and to set the maximum time the processor can reside in C0.1 or C0.2. By default C0.2 is enabled so the user wait instructions can enter the C0.2 state to save more power with slower wakeup time. Andy Lutomirski proposed to set the maximum umwait time to 100000 cycles by default. A quote from Andy: "What I want to avoid is the case where it works dramatically differently on NO_HZ_FULL systems as compared to everything else. Also, UMWAIT may behave a bit differently if the max timeout is hit, and I'd like that path to get exercised widely by making it happen even on default configs." A sysfs interface to adjust the time and the C0.2 enablement is provided in a follow up change. [ tglx: Renamed MSR_IA32_UMWAIT_CONTROL_MAX_TIME to MSR_IA32_UMWAIT_CONTROL_TIME_MASK because the constant is used as mask throughout the code. 
Massaged comments and changelog ] Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Reviewed-by: Ashok Raj Reviewed-by: Andy Lutomirski Cc: "Borislav Petkov" Cc: "H Peter Anvin" Cc: "Peter Zijlstra" Cc: "Tony Luck" Cc: "Ravi V Shankar" Link: https://lkml.kernel.org/r/1560994438-235698-3-git-send-email-fenghua.yu@intel.com --- arch/x86/include/asm/msr-index.h | 9 ++++++ arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/umwait.c | 62 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+) create mode 100644 arch/x86/kernel/cpu/umwait.c (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 979ef971cc78..6b4fc2788078 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -61,6 +61,15 @@ #define MSR_PLATFORM_INFO_CPUID_FAULT_BIT 31 #define MSR_PLATFORM_INFO_CPUID_FAULT BIT_ULL(MSR_PLATFORM_INFO_CPUID_FAULT_BIT) +#define MSR_IA32_UMWAIT_CONTROL 0xe1 +#define MSR_IA32_UMWAIT_CONTROL_C02_DISABLE BIT(0) +#define MSR_IA32_UMWAIT_CONTROL_RESERVED BIT(1) +/* + * The time field is bit[31:2], but representing a 32bit value with + * bit[1:0] zero. + */ +#define MSR_IA32_UMWAIT_CONTROL_TIME_MASK (~0x03U) + #define MSR_PKG_CST_CONFIG_CONTROL 0x000000e2 #define NHM_C3_AUTO_DEMOTE (1UL << 25) #define NHM_C1_AUTO_DEMOTE (1UL << 26) diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index a7d9a4cb3ab6..4b4eb06e117c 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -24,6 +24,7 @@ obj-y += match.o obj-y += bugs.o obj-y += aperfmperf.o obj-y += cpuid-deps.o +obj-y += umwait.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c new file mode 100644 index 000000000000..0a113c731df3 --- /dev/null +++ b/arch/x86/kernel/cpu/umwait.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include + +#define UMWAIT_C02_ENABLE 0 + +#define UMWAIT_CTRL_VAL(maxtime, c02_disable) \ + (((maxtime) & MSR_IA32_UMWAIT_CONTROL_TIME_MASK) | \ + ((c02_disable) & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE)) + +/* + * Cache IA32_UMWAIT_CONTROL MSR. This is a systemwide control. By default, + * umwait max time is 100000 in TSC-quanta and C0.2 is enabled + */ +static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE); + +/* Set IA32_UMWAIT_CONTROL MSR on this CPU to the current global setting. */ +static int umwait_cpu_online(unsigned int cpu) +{ + wrmsr(MSR_IA32_UMWAIT_CONTROL, umwait_control_cached, 0); + return 0; +} + +/* + * On resume, restore IA32_UMWAIT_CONTROL MSR on the boot processor which + * is the only active CPU at this time. The MSR is set up on the APs via the + * CPU hotplug callback. + * + * This function is invoked on resume from suspend and hibernation. On + * resume from suspend the restore should be not required, but we neither + * trust the firmware nor does it matter if the same value is written + * again. 
+ */ +static void umwait_syscore_resume(void) +{ + wrmsr(MSR_IA32_UMWAIT_CONTROL, umwait_control_cached, 0); +} + +static struct syscore_ops umwait_syscore_ops = { + .resume = umwait_syscore_resume, +}; + +static int __init umwait_init(void) +{ + int ret; + + if (!boot_cpu_has(X86_FEATURE_WAITPKG)) + return -ENODEV; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online", + umwait_cpu_online, NULL); + if (ret < 0) + return ret; + + register_syscore_ops(&umwait_syscore_ops); + + return 0; +} +device_initcall(umwait_init); -- cgit v1.2.3 From 049331f277fef1c3f2527c2c9afa1d285e9a1247 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Jul 2019 14:19:36 +0200 Subject: x86/fsgsbase: Revert FSGSBASE support The FSGSBASE series turned out to have serious bugs and there is still an open issue which is not fully understood yet. The confidence in those changes has become close to zero especially as the test cases which have been shipped with that series were obviously never run before sending the final series out to LKML. ./fsgsbase_64 >/dev/null Segmentation fault As the merge window is close, the only sane decision is to revert FSGSBASE support. The revert is necessary as this branch has been merged into perf/core already and rebasing all of that a few days before the merge window is not the most brilliant idea. I could definitely slap myself for not noticing the test case fail when merging that series, but TBH my expectations weren't that low back then. Won't happen again. Revert the following commits: 539bca535dec ("x86/entry/64: Fix and clean up paranoid_exit") 2c7b5ac5d5a9 ("Documentation/x86/64: Add documentation for GS/FS addressing mode") f987c955c745 ("x86/elf: Enumerate kernel FSGSBASE capability in AT_HWCAP2") 2032f1f96ee0 ("x86/cpu: Enable FSGSBASE on 64bit by default and add a chicken bit") 5bf0cab60ee2 ("x86/entry/64: Document GSBASE handling in the paranoid path") 708078f65721 ("x86/entry/64: Handle FSGSBASE enabled paranoid entry/exit") 79e1932fa3ce ("x86/entry/64: Introduce the FIND_PERCPU_BASE macro") 1d07316b1363 ("x86/entry/64: Switch CR3 before SWAPGS in paranoid entry") f60a83df4593 ("x86/process/64: Use FSGSBASE instructions on thread copy and ptrace") 1ab5f3f7fe3d ("x86/process/64: Use FSBSBASE in switch_to() if available") a86b4625138d ("x86/fsgsbase/64: Enable FSGSBASE instructions in helper functions") 8b71340d702e ("x86/fsgsbase/64: Add intrinsics for FSGSBASE instructions") b64ed19b93c3 ("x86/cpu: Add 'unsafe_fsgsbase' to enable CR4.FSGSBASE") Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Chang S. Bae Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Andi Kleen Cc: Ravi Shankar Cc: Dave Hansen Cc: H. 
Peter Anvin --- Documentation/admin-guide/kernel-parameters.txt | 2 - Documentation/x86/entry_64.rst | 9 -- Documentation/x86/x86_64/fsgs.rst | 199 ------------------------ Documentation/x86/x86_64/index.rst | 1 - arch/x86/entry/calling.h | 40 ----- arch/x86/entry/entry_64.S | 132 ++++------------ arch/x86/include/asm/fsgsbase.h | 45 ++---- arch/x86/include/asm/inst.h | 15 -- arch/x86/include/uapi/asm/hwcap2.h | 3 - arch/x86/kernel/cpu/common.c | 22 --- arch/x86/kernel/process_64.c | 119 ++------------ 11 files changed, 50 insertions(+), 537 deletions(-) delete mode 100644 Documentation/x86/x86_64/fsgs.rst (limited to 'arch/x86/include') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 35bc3c3574c6..138f6664b2e2 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2857,8 +2857,6 @@ no5lvl [X86-64] Disable 5-level paging mode. Forces kernel to use 4-level paging instead. - nofsgsbase [X86] Disables FSGSBASE instructions. - no_console_suspend [HW] Never suspend the console Disable suspending of consoles during suspend and diff --git a/Documentation/x86/entry_64.rst b/Documentation/x86/entry_64.rst index b87c1d816aea..a48b3f6ebbe8 100644 --- a/Documentation/x86/entry_64.rst +++ b/Documentation/x86/entry_64.rst @@ -108,12 +108,3 @@ We try to only use IST entries and the paranoid entry code for vectors that absolutely need the more expensive check for the GS base - and we generate all 'normal' entry points with the regular (faster) paranoid=0 variant. - -On a FSGSBASE system, however, user space can set GS without kernel -interaction. It means the value of GS base itself does not imply anything, -whether a kernel value or a user space value. So, there is no longer a safe -way to check whether the exception is entering from user mode or kernel -mode in the paranoid entry code path. So the GSBASE value needs to be read -out, saved and the kernel GSBASE value written. On exit the saved GSBASE -value needs to be restored unconditionally. The non paranoid entry/exit -code still uses SWAPGS unconditionally as the state is known. diff --git a/Documentation/x86/x86_64/fsgs.rst b/Documentation/x86/x86_64/fsgs.rst deleted file mode 100644 index 380c0b5ccca2..000000000000 --- a/Documentation/x86/x86_64/fsgs.rst +++ /dev/null @@ -1,199 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Using FS and GS segments in user space applications -=================================================== - -The x86 architecture supports segmentation. Instructions which access -memory can use segment register based addressing mode. The following -notation is used to address a byte within a segment: - - Segment-register:Byte-address - -The segment base address is added to the Byte-address to compute the -resulting virtual address which is accessed. This allows to access multiple -instances of data with the identical Byte-address, i.e. the same code. The -selection of a particular instance is purely based on the base-address in -the segment register. - -In 32-bit mode the CPU provides 6 segments, which also support segment -limits. The limits can be used to enforce address space protections. - -In 64-bit mode the CS/SS/DS/ES segments are ignored and the base address is -always 0 to provide a full 64bit address space. The FS and GS segments are -still functional in 64-bit mode. 
- -Common FS and GS usage ------------------------------- - -The FS segment is commonly used to address Thread Local Storage (TLS). FS -is usually managed by runtime code or a threading library. Variables -declared with the '__thread' storage class specifier are instantiated per -thread and the compiler emits the FS: address prefix for accesses to these -variables. Each thread has its own FS base address so common code can be -used without complex address offset calculations to access the per thread -instances. Applications should not use FS for other purposes when they use -runtimes or threading libraries which manage the per thread FS. - -The GS segment has no common use and can be used freely by -applications. GCC and Clang support GS based addressing via address space -identifiers. - -Reading and writing the FS/GS base address ------------------------------------------- - -There exist two mechanisms to read and write the FS/FS base address: - - - the arch_prctl() system call - - - the FSGSBASE instruction family - -Accessing FS/GS base with arch_prctl() --------------------------------------- - - The arch_prctl(2) based mechanism is available on all 64bit CPUs and all - kernel versions. - - Reading the base: - - arch_prctl(ARCH_GET_FS, &fsbase); - arch_prctl(ARCH_GET_GS, &gsbase); - - Writing the base: - - arch_prctl(ARCH_SET_FS, fsbase); - arch_prctl(ARCH_SET_GS, gsbase); - - The ARCH_SET_GS prctl may be disabled depending on kernel configuration - and security settings. - -Accessing FS/GS base with the FSGSBASE instructions ---------------------------------------------------- - - With the Ivy Bridge CPU generation Intel introduced a new set of - instructions to access the FS and GS base registers directly from user - space. These instructions are also supported on AMD Family 17H CPUs. The - following instructions are available: - - =============== =========================== - RDFSBASE %reg Read the FS base register - RDGSBASE %reg Read the GS base register - WRFSBASE %reg Write the FS base register - WRGSBASE %reg Write the GS base register - =============== =========================== - - The instructions avoid the overhead of the arch_prctl() syscall and allow - more flexible usage of the FS/GS addressing modes in user space - applications. This does not prevent conflicts between threading libraries - and runtimes which utilize FS and applications which want to use it for - their own purpose. - -FSGSBASE instructions enablement -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - The instructions are enumerated in CPUID leaf 7, bit 0 of EBX. If - available /proc/cpuinfo shows 'fsgsbase' in the flag entry of the CPUs. - - The availability of the instructions does not enable them - automatically. The kernel has to enable them explicitly in CR4. The - reason for this is that older kernels make assumptions about the values in - the GS register and enforce them when GS base is set via - arch_prctl(). Allowing user space to write arbitrary values to GS base - would violate these assumptions and cause malfunction. - - On kernels which do not enable FSGSBASE the execution of the FSGSBASE - instructions will fault with a #UD exception. - - The kernel provides reliable information about the enabled state in the - ELF AUX vector. If the HWCAP2_FSGSBASE bit is set in the AUX vector, the - kernel has FSGSBASE instructions enabled and applications can use them. 
- The following code example shows how this detection works:: - - #include - #include - - /* Will be eventually in asm/hwcap.h */ - #ifndef HWCAP2_FSGSBASE - #define HWCAP2_FSGSBASE (1 << 1) - #endif - - .... - - unsigned val = getauxval(AT_HWCAP2); - - if (val & HWCAP2_FSGSBASE) - printf("FSGSBASE enabled\n"); - -FSGSBASE instructions compiler support -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -GCC version 4.6.4 and newer provide instrinsics for the FSGSBASE -instructions. Clang supports them as well. - - =================== =========================== - _readfsbase_u64() Read the FS base register - _readfsbase_u64() Read the GS base register - _writefsbase_u64() Write the FS base register - _writegsbase_u64() Write the GS base register - =================== =========================== - -To utilize these instrinsics must be included in the source -code and the compiler option -mfsgsbase has to be added. - -Compiler support for FS/GS based addressing -------------------------------------------- - -GCC version 6 and newer provide support for FS/GS based addressing via -Named Address Spaces. GCC implements the following address space -identifiers for x86: - - ========= ==================================== - __seg_fs Variable is addressed relative to FS - __seg_gs Variable is addressed relative to GS - ========= ==================================== - -The preprocessor symbols __SEG_FS and __SEG_GS are defined when these -address spaces are supported. Code which implements fallback modes should -check whether these symbols are defined. Usage example:: - - #ifdef __SEG_GS - - long data0 = 0; - long data1 = 1; - - long __seg_gs *ptr; - - /* Check whether FSGSBASE is enabled by the kernel (HWCAP2_FSGSBASE) */ - .... - - /* Set GS to point to data0 */ - _writegsbase_u64(&data0); - - /* Access offset 0 of GS */ - ptr = 0; - printf("data0 = %ld\n", *ptr); - - /* Set GS to point to data1 */ - _writegsbase_u64(&data1); - /* ptr still addresses offset 0! 
*/ - printf("data1 = %ld\n", *ptr); - - -Clang does not provide the GCC address space identifiers, but it provides -address spaces via an attribute based mechanism in Clang 5 and newer -versions: - - ==================================== ===================================== - __attribute__((address_space(256)) Variable is addressed relative to GS - __attribute__((address_space(257)) Variable is addressed relative to FS - ==================================== ===================================== - -FS/GS based addressing with inline assembly -------------------------------------------- - -In case the compiler does not support address spaces, inline assembly can -be used for FS/GS based addressing mode:: - - mov %fs:offset, %reg - mov %gs:offset, %reg - - mov %reg, %fs:offset - mov %reg, %gs:offset diff --git a/Documentation/x86/x86_64/index.rst b/Documentation/x86/x86_64/index.rst index a56070fc8e77..d6eaaa5a35fc 100644 --- a/Documentation/x86/x86_64/index.rst +++ b/Documentation/x86/x86_64/index.rst @@ -14,4 +14,3 @@ x86_64 Support fake-numa-for-cpusets cpu-hotplug-spec machinecheck - fsgs diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index d3fbe2dc03ea..efb0d1b1f15f 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -6,7 +6,6 @@ #include #include #include -#include /* @@ -338,12 +337,6 @@ For 32-bit we have the following conventions - kernel is built with #endif .endm -.macro SAVE_AND_SET_GSBASE scratch_reg:req save_reg:req - rdgsbase \save_reg - GET_PERCPU_BASE \scratch_reg - wrgsbase \scratch_reg -.endm - #endif /* CONFIG_X86_64 */ .macro STACKLEAK_ERASE @@ -352,39 +345,6 @@ For 32-bit we have the following conventions - kernel is built with #endif .endm -#ifdef CONFIG_SMP - -/* - * CPU/node NR is loaded from the limit (size) field of a special segment - * descriptor entry in GDT. - */ -.macro LOAD_CPU_AND_NODE_SEG_LIMIT reg:req - movq $__CPUNODE_SEG, \reg - lsl \reg, \reg -.endm - -/* - * Fetch the per-CPU GSBASE value for this processor and put it in @reg. - * We normally use %gs for accessing per-CPU data, but we are setting up - * %gs here and obviously can not use %gs itself to access per-CPU data. - */ -.macro GET_PERCPU_BASE reg:req - ALTERNATIVE \ - "LOAD_CPU_AND_NODE_SEG_LIMIT \reg", \ - "RDPID \reg", \ - X86_FEATURE_RDPID - andq $VDSO_CPUNODE_MASK, \reg - movq __per_cpu_offset(, \reg, 8), \reg -.endm - -#else - -.macro GET_PERCPU_BASE reg:req - movq pcpu_unit_offsets(%rip), \reg -.endm - -#endif /* CONFIG_SMP */ - /* * This does 'call enter_from_user_mode' unless we can avoid it based on * kernel config or using the static jump infrastructure. diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 670306f588bf..3b7a0e8d3bc0 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -38,7 +38,6 @@ #include #include #include -#include #include #include "calling.h" @@ -948,6 +947,7 @@ ENTRY(\sym) addq $\ist_offset, CPU_TSS_IST(\shift_ist) .endif + /* these procedures expect "no swapgs" flag in ebx */ .if \paranoid jmp paranoid_exit .else @@ -1164,21 +1164,24 @@ idtentry machine_check do_mce has_error_code=0 paranoid=1 #endif /* - * Save all registers in pt_regs. Return GSBASE related information - * in EBX depending on the availability of the FSGSBASE instructions: - * - * FSGSBASE R/EBX - * N 0 -> SWAPGS on exit - * 1 -> no SWAPGS on exit - * - * Y GSBASE value at entry, must be restored in paranoid_exit + * Save all registers in pt_regs, and switch gs if needed. 
+ * Use slow, but surefire "are we in kernel?" check. + * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ ENTRY(paranoid_entry) UNWIND_HINT_FUNC cld PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 + movl $1, %ebx + movl $MSR_GS_BASE, %ecx + rdmsr + testl %edx, %edx + js 1f /* negative -> in kernel */ + SWAPGS + xorl %ebx, %ebx +1: /* * Always stash CR3 in %r14. This value will be restored, * verbatim, at exit. Needed if paranoid_entry interrupted @@ -1188,49 +1191,9 @@ ENTRY(paranoid_entry) * This is also why CS (stashed in the "iret frame" by the * hardware at entry) can not be used: this may be a return * to kernel code, but with a user CR3 value. - * - * Switching CR3 does not depend on kernel GSBASE so it can - * be done before switching to the kernel GSBASE. This is - * required for FSGSBASE because the kernel GSBASE has to - * be retrieved from a kernel internal table. */ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 - /* - * Handling GSBASE depends on the availability of FSGSBASE. - * - * Without FSGSBASE the kernel enforces that negative GSBASE - * values indicate kernel GSBASE. With FSGSBASE no assumptions - * can be made about the GSBASE value when entering from user - * space. - */ - ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE - - /* - * Read the current GSBASE and store it in in %rbx unconditionally, - * retrieve and set the current CPUs kernel GSBASE. The stored value - * has to be restored in paranoid_exit unconditionally. - */ - SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx - ret - -.Lparanoid_entry_checkgs: - /* EBX = 1 -> kernel GSBASE active, no restore required */ - movl $1, %ebx - /* - * The kernel-enforced convention is a negative GSBASE indicates - * a kernel value. No SWAPGS needed on entry and exit. - */ - movl $MSR_GS_BASE, %ecx - rdmsr - testl %edx, %edx - jns .Lparanoid_entry_swapgs - ret - -.Lparanoid_entry_swapgs: - SWAPGS - /* EBX = 0 -> SWAPGS required on exit */ - xorl %ebx, %ebx ret END(paranoid_entry) @@ -1241,48 +1204,28 @@ END(paranoid_entry) * * We may be returning to very strange contexts (e.g. very early * in syscall entry), so checking for preemption here would - * be complicated. Fortunately, there's no good reason to try - * to handle preemption here. - * - * R/EBX contains the GSBASE related information depending on the - * availability of the FSGSBASE instructions: + * be complicated. Fortunately, we there's no good reason + * to try to handle preemption here. * - * FSGSBASE R/EBX - * N 0 -> SWAPGS on exit - * 1 -> no SWAPGS on exit - * - * Y User space GSBASE, must be restored unconditionally + * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) - - /* - * The order of operations is important. IRQ tracing requires - * kernel GSBASE and CR3. RESTORE_CR3 requires kernel GS base. - * - * NB to anyone to tries to optimize this code: this code does - * not execute at all for exceptions coming from user mode. Those - * exceptions go through error_exit instead. - */ - TRACE_IRQS_IRETQ_DEBUG - RESTORE_CR3 scratch_reg=%rax save_reg=%r14 - - /* Handle the three GSBASE cases. 
*/ - ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE - - /* With FSGSBASE enabled, unconditionally restore GSBASE */ - wrgsbase %rbx - jmp restore_regs_and_return_to_kernel - -.Lparanoid_exit_checkgs: - /* On non-FSGSBASE systems, conditionally do SWAPGS */ - testl %ebx, %ebx - jnz restore_regs_and_return_to_kernel - - /* We are returning to a context with user GSBASE. */ + TRACE_IRQS_OFF_DEBUG + testl %ebx, %ebx /* swapgs needed? */ + jnz .Lparanoid_exit_no_swapgs + TRACE_IRQS_IRETQ + /* Always restore stashed CR3 value (see paranoid_entry) */ + RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 SWAPGS_UNSAFE_STACK - jmp restore_regs_and_return_to_kernel + jmp .Lparanoid_exit_restore +.Lparanoid_exit_no_swapgs: + TRACE_IRQS_IRETQ_DEBUG + /* Always restore stashed CR3 value (see paranoid_entry) */ + RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 +.Lparanoid_exit_restore: + jmp restore_regs_and_return_to_kernel END(paranoid_exit) /* @@ -1693,27 +1636,10 @@ end_repeat_nmi: /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 - /* - * The above invocation of paranoid_entry stored the GSBASE - * related information in R/EBX depending on the availability - * of FSGSBASE. - * - * If FSGSBASE is enabled, restore the saved GSBASE value - * unconditionally, otherwise take the conditional SWAPGS path. - */ - ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE - - wrgsbase %rbx - jmp nmi_restore - -nmi_no_fsgsbase: - /* EBX == 0 -> invoke SWAPGS */ - testl %ebx, %ebx + testl %ebx, %ebx /* swapgs needed? */ jnz nmi_restore - nmi_swapgs: SWAPGS_UNSAFE_STACK - nmi_restore: POP_REGS diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h index aefd53767a5d..bca4c743de77 100644 --- a/arch/x86/include/asm/fsgsbase.h +++ b/arch/x86/include/asm/fsgsbase.h @@ -19,63 +19,36 @@ extern unsigned long x86_gsbase_read_task(struct task_struct *task); extern void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase); extern void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase); -/* Must be protected by X86_FEATURE_FSGSBASE check. 
*/ +/* Helper functions for reading/writing FS/GS base */ -static __always_inline unsigned long rdfsbase(void) +static inline unsigned long x86_fsbase_read_cpu(void) { unsigned long fsbase; - asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory"); + rdmsrl(MSR_FS_BASE, fsbase); return fsbase; } -static __always_inline unsigned long rdgsbase(void) +static inline unsigned long x86_gsbase_read_cpu_inactive(void) { unsigned long gsbase; - asm volatile("rdgsbase %0" : "=r" (gsbase) :: "memory"); + rdmsrl(MSR_KERNEL_GS_BASE, gsbase); return gsbase; } -static __always_inline void wrfsbase(unsigned long fsbase) -{ - asm volatile("wrfsbase %0" :: "r" (fsbase) : "memory"); -} - -static __always_inline void wrgsbase(unsigned long gsbase) -{ - asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory"); -} - -#include - -/* Helper functions for reading/writing FS/GS base */ - -static inline unsigned long x86_fsbase_read_cpu(void) +static inline void x86_fsbase_write_cpu(unsigned long fsbase) { - unsigned long fsbase; - - if (static_cpu_has(X86_FEATURE_FSGSBASE)) - fsbase = rdfsbase(); - else - rdmsrl(MSR_FS_BASE, fsbase); - - return fsbase; + wrmsrl(MSR_FS_BASE, fsbase); } -static inline void x86_fsbase_write_cpu(unsigned long fsbase) +static inline void x86_gsbase_write_cpu_inactive(unsigned long gsbase) { - if (static_cpu_has(X86_FEATURE_FSGSBASE)) - wrfsbase(fsbase); - else - wrmsrl(MSR_FS_BASE, fsbase); + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); } -extern unsigned long x86_gsbase_read_cpu_inactive(void); -extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase); - #endif /* CONFIG_X86_64 */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h index d063841a17e3..f5a796da07f8 100644 --- a/arch/x86/include/asm/inst.h +++ b/arch/x86/include/asm/inst.h @@ -306,21 +306,6 @@ .endif MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2 .endm - -.macro RDPID opd - REG_TYPE rdpid_opd_type \opd - .if rdpid_opd_type == REG_TYPE_R64 - R64_NUM rdpid_opd \opd - .else - R32_NUM rdpid_opd \opd - .endif - .byte 0xf3 - .if rdpid_opd > 7 - PFX_REX rdpid_opd 0 - .endif - .byte 0x0f, 0xc7 - MODRM 0xc0 rdpid_opd 0x7 -.endm #endif #endif diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h index c5ce54e749f6..6ebaae90e207 100644 --- a/arch/x86/include/uapi/asm/hwcap2.h +++ b/arch/x86/include/uapi/asm/hwcap2.h @@ -5,7 +5,4 @@ /* MONITOR/MWAIT enabled in Ring 3 */ #define HWCAP2_RING3MWAIT (1 << 0) -/* Kernel allows FSGSBASE instructions available in Ring 3 */ -#define HWCAP2_FSGSBASE BIT(1) - #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 637c9117d5ae..dad20bc891d5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -366,22 +366,6 @@ out: cr4_clear_bits(X86_CR4_UMIP); } -static __init int x86_nofsgsbase_setup(char *arg) -{ - /* Require an exact match without trailing characters. */ - if (strlen(arg)) - return 0; - - /* Do not emit a message if the feature is not present. */ - if (!boot_cpu_has(X86_FEATURE_FSGSBASE)) - return 1; - - setup_clear_cpu_cap(X86_FEATURE_FSGSBASE); - pr_info("FSGSBASE disabled via kernel command line\n"); - return 1; -} -__setup("nofsgsbase", x86_nofsgsbase_setup); - /* * Protection Keys are not available in 32-bit mode. */ @@ -1386,12 +1370,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_smap(c); setup_umip(c); - /* Enable FSGSBASE instructions if available. 
*/ - if (cpu_has(c, X86_FEATURE_FSGSBASE)) { - cr4_set_bits(X86_CR4_FSGSBASE); - elf_hwcap2 |= HWCAP2_FSGSBASE; - } - /* * The vendor-specific functions might have changed features. * Now we do "generic changes." diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 8f239091c15d..250e4c4ac6d9 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -161,40 +161,6 @@ enum which_selector { GS }; -/* - * Out of line to be protected from kprobes. It is not used on Xen - * paravirt. When paravirt support is needed, it needs to be renamed - * with native_ prefix. - */ -static noinline unsigned long __rdgsbase_inactive(void) -{ - unsigned long gsbase; - - lockdep_assert_irqs_disabled(); - - native_swapgs(); - gsbase = rdgsbase(); - native_swapgs(); - - return gsbase; -} -NOKPROBE_SYMBOL(__rdgsbase_inactive); - -/* - * Out of line to be protected from kprobes. It is not used on Xen - * paravirt. When paravirt support is needed, it needs to be renamed - * with native_ prefix. - */ -static noinline void __wrgsbase_inactive(unsigned long gsbase) -{ - lockdep_assert_irqs_disabled(); - - native_swapgs(); - wrgsbase(gsbase); - native_swapgs(); -} -NOKPROBE_SYMBOL(__wrgsbase_inactive); - /* * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are * not available. The goal is to be reasonably fast on non-FSGSBASE systems. @@ -244,22 +210,8 @@ static __always_inline void save_fsgs(struct task_struct *task) { savesegment(fs, task->thread.fsindex); savesegment(gs, task->thread.gsindex); - if (static_cpu_has(X86_FEATURE_FSGSBASE)) { - unsigned long flags; - - /* - * If FSGSBASE is enabled, we can't make any useful guesses - * about the base, and user code expects us to save the current - * value. Fortunately, reading the base directly is efficient. - */ - task->thread.fsbase = rdfsbase(); - local_irq_save(flags); - task->thread.gsbase = __rdgsbase_inactive(); - local_irq_restore(flags); - } else { - save_base_legacy(task, task->thread.fsindex, FS); - save_base_legacy(task, task->thread.gsindex, GS); - } + save_base_legacy(task, task->thread.fsindex, FS); + save_base_legacy(task, task->thread.gsindex, GS); } #if IS_ENABLED(CONFIG_KVM) @@ -338,22 +290,10 @@ static __always_inline void load_seg_legacy(unsigned short prev_index, static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, struct thread_struct *next) { - if (static_cpu_has(X86_FEATURE_FSGSBASE)) { - /* Update the FS and GS selectors if they could have changed. */ - if (unlikely(prev->fsindex || next->fsindex)) - loadseg(FS, next->fsindex); - if (unlikely(prev->gsindex || next->gsindex)) - loadseg(GS, next->gsindex); - - /* Update the bases. */ - wrfsbase(next->fsbase); - __wrgsbase_inactive(next->gsbase); - } else { - load_seg_legacy(prev->fsindex, prev->fsbase, - next->fsindex, next->fsbase, FS); - load_seg_legacy(prev->gsindex, prev->gsbase, - next->gsindex, next->gsbase, GS); - } + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); } static unsigned long x86_fsgsbase_read_task(struct task_struct *task, @@ -399,46 +339,13 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task, return base; } -unsigned long x86_gsbase_read_cpu_inactive(void) -{ - unsigned long gsbase; - - if (static_cpu_has(X86_FEATURE_FSGSBASE)) { - unsigned long flags; - - /* Interrupts are disabled here. 
*/ - local_irq_save(flags); - gsbase = __rdgsbase_inactive(); - local_irq_restore(flags); - } else { - rdmsrl(MSR_KERNEL_GS_BASE, gsbase); - } - - return gsbase; -} - -void x86_gsbase_write_cpu_inactive(unsigned long gsbase) -{ - if (static_cpu_has(X86_FEATURE_FSGSBASE)) { - unsigned long flags; - - /* Interrupts are disabled here. */ - local_irq_save(flags); - __wrgsbase_inactive(gsbase); - local_irq_restore(flags); - } else { - wrmsrl(MSR_KERNEL_GS_BASE, gsbase); - } -} - unsigned long x86_fsbase_read_task(struct task_struct *task) { unsigned long fsbase; if (task == current) fsbase = x86_fsbase_read_cpu(); - else if (static_cpu_has(X86_FEATURE_FSGSBASE) || - (task->thread.fsindex == 0)) + else if (task->thread.fsindex == 0) fsbase = task->thread.fsbase; else fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex); @@ -452,8 +359,7 @@ unsigned long x86_gsbase_read_task(struct task_struct *task) if (task == current) gsbase = x86_gsbase_read_cpu_inactive(); - else if (static_cpu_has(X86_FEATURE_FSGSBASE) || - (task->thread.gsindex == 0)) + else if (task->thread.gsindex == 0) gsbase = task->thread.gsbase; else gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex); @@ -493,11 +399,10 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, p->thread.sp = (unsigned long) fork_frame; p->thread.io_bitmap_ptr = NULL; - save_fsgs(me); - p->thread.fsindex = me->thread.fsindex; - p->thread.fsbase = me->thread.fsbase; - p->thread.gsindex = me->thread.gsindex; - p->thread.gsbase = me->thread.gsbase; + savesegment(gs, p->thread.gsindex); + p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase; + savesegment(fs, p->thread.fsindex); + p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase; savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); -- cgit v1.2.3
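For reference, the detection example and the compiler intrinsics described in the fsgs.rst text removed above can be combined into one self-contained userspace sketch. This is illustrative only and not part of the kernel tree: it assumes glibc's getauxval(), GCC or Clang invoked with -mfsgsbase on x86-64, and it defines HWCAP2_FSGSBASE locally because this series also removes that definition from the uapi hwcap2.h. On a kernel that does not enable FSGSBASE for ring 3 (the state after this revert) the program only reports that the feature is unavailable::

  #include <stdio.h>
  #include <sys/auxv.h>
  #include <elf.h>
  #include <immintrin.h>

  /* Will be eventually in asm/hwcap.h */
  #ifndef HWCAP2_FSGSBASE
  #define HWCAP2_FSGSBASE (1 << 1)
  #endif

  int main(void)
  {
          unsigned long data = 42;

          /* The kernel advertises ring-3 FSGSBASE through AT_HWCAP2 */
          if (!(getauxval(AT_HWCAP2) & HWCAP2_FSGSBASE)) {
                  printf("FSGSBASE not enabled by the kernel\n");
                  return 1;
          }

          /* Point GS at a local variable and read the base back */
          _writegsbase_u64((unsigned long long)&data);
          printf("GS base = %#llx, &data = %p\n",
                 _readgsbase_u64(), (void *)&data);

  #ifdef __SEG_GS
          /* With GCC named address spaces, offset 0 of GS now holds data */
          {
                  unsigned long __seg_gs *ptr = 0;
                  printf("*GS:0 = %lu\n", *ptr);
          }
  #endif
          return 0;
  }

Built with, for example, "gcc -O2 -mfsgsbase fsgsbase-test.c" (file name hypothetical), the GS-relative access through the __seg_gs pointer prints the value of "data" once the kernel advertises HWCAP2_FSGSBASE.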