From 1b7aebf0487613033aff26420e32fa2076d52846 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Wed, 19 Jun 2019 10:32:53 -0400 Subject: x86/cacheinfo: Fix a -Wtype-limits warning cpuinfo_x86.x86_model is an unsigned type, so comparing against zero will generate a compilation warning: arch/x86/kernel/cpu/cacheinfo.c: In function 'cacheinfo_amd_init_llc_id': arch/x86/kernel/cpu/cacheinfo.c:662:19: warning: comparison is always true \ due to limited range of data type [-Wtype-limits] Remove the unnecessary lower bound check. [ bp: Massage. ] Fixes: 68091ee7ac3c ("x86/CPU/AMD: Calculate last level cache ID from number of sharing threads") Signed-off-by: Qian Cai Signed-off-by: Borislav Petkov Reviewed-by: Sean Christopherson Cc: "Gustavo A. R. Silva" Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Pu Wen Cc: Suravee Suthikulpanit Cc: Thomas Gleixner Cc: x86-ml Link: https://lkml.kernel.org/r/1560954773-11967-1-git-send-email-cai@lca.pw --- arch/x86/kernel/cpu/cacheinfo.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 395d46f78582..c7503be92f35 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -658,8 +658,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id) if (c->x86 < 0x17) { /* LLC is at the node level. */ per_cpu(cpu_llc_id, cpu) = node_id; - } else if (c->x86 == 0x17 && - c->x86_model >= 0 && c->x86_model <= 0x1F) { + } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) { /* * LLC is at the core complex level. * Core complex ID is ApicId[3] for these processors. -- cgit v1.2.3 From 45fc56e629caa451467e7664fbd4c797c434a6c4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 19 Jun 2019 17:24:34 +0200 Subject: x86/cpufeatures: Carve out CQM features retrieval ... into a separate function for better readability. Split out from a patch from Fenghua Yu to keep the mechanical, sole code movement separate for easy review. No functional changes. Signed-off-by: Borislav Petkov Cc: Fenghua Yu Cc: x86@kernel.org --- arch/x86/kernel/cpu/common.c | 60 ++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 27 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2c57fffebf9b..fe6ed9696467 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -801,6 +801,38 @@ static void init_speculation_control(struct cpuinfo_x86 *c) } } +static void init_cqm(struct cpuinfo_x86 *c) +{ + u32 eax, ebx, ecx, edx; + + /* Additional Intel-defined flags: level 0x0000000F */ + if (c->cpuid_level >= 0x0000000F) { + + /* QoS sub-leaf, EAX=0Fh, ECX=0 */ + cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx); + c->x86_capability[CPUID_F_0_EDX] = edx; + + if (cpu_has(c, X86_FEATURE_CQM_LLC)) { + /* will be overridden if occupancy monitoring exists */ + c->x86_cache_max_rmid = ebx; + + /* QoS sub-leaf, EAX=0Fh, ECX=1 */ + cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx); + c->x86_capability[CPUID_F_1_EDX] = edx; + + if ((cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) || + ((cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL)) || + (cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)))) { + c->x86_cache_max_rmid = ecx; + c->x86_cache_occ_scale = ebx; + } + } else { + c->x86_cache_max_rmid = -1; + c->x86_cache_occ_scale = -1; + } + } +} + void get_cpu_cap(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; @@ -832,33 +864,6 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_D_1_EAX] = eax; } - /* Additional Intel-defined flags: level 0x0000000F */ - if (c->cpuid_level >= 0x0000000F) { - - /* QoS sub-leaf, EAX=0Fh, ECX=0 */ - cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx); - c->x86_capability[CPUID_F_0_EDX] = edx; - - if (cpu_has(c, X86_FEATURE_CQM_LLC)) { - /* will be overridden if occupancy monitoring exists */ - c->x86_cache_max_rmid = ebx; - - /* QoS sub-leaf, EAX=0Fh, ECX=1 */ - cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx); - c->x86_capability[CPUID_F_1_EDX] = edx; - - if ((cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) || - ((cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL)) || - (cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)))) { - c->x86_cache_max_rmid = ecx; - c->x86_cache_occ_scale = ebx; - } - } else { - c->x86_cache_max_rmid = -1; - c->x86_cache_occ_scale = -1; - } - } - /* AMD-defined flags: level 0x80000001 */ eax = cpuid_eax(0x80000000); c->extended_cpuid_level = eax; @@ -889,6 +894,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) init_scattered_cpuid_features(c); init_speculation_control(c); + init_cqm(c); /* * Clear/Set all flags overridden by options, after probe. -- cgit v1.2.3 From acec0ce081de0c36459eea91647faf99296445a3 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 19 Jun 2019 18:51:09 +0200 Subject: x86/cpufeatures: Combine word 11 and 12 into a new scattered features word MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's a waste for the four X86_FEATURE_CQM_* feature bits to occupy two whole feature bits words. To better utilize feature words, re-define word 11 to host scattered features and move the four X86_FEATURE_CQM_* features into Linux defined word 11. More scattered features can be added in word 11 in the future. Rename leaf 11 in cpuid_leafs to CPUID_LNX_4 to reflect it's a Linux-defined leaf. Rename leaf 12 as CPUID_DUMMY which will be replaced by a meaningful name in the next patch when CPUID.7.1:EAX occupies world 12. Maximum number of RMID and cache occupancy scale are retrieved from CPUID.0xf.1 after scattered CQM features are enumerated. Carve out the code into a separate function. KVM doesn't support resctrl now. So it's safe to move the X86_FEATURE_CQM_* features to scattered features word 11 for KVM. Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Cc: Aaron Lewis Cc: Andy Lutomirski Cc: Babu Moger Cc: "Chang S. Bae" Cc: "Sean J Christopherson" Cc: Frederic Weisbecker Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Juergen Gross Cc: Konrad Rzeszutek Wilk Cc: kvm ML Cc: Masahiro Yamada Cc: Masami Hiramatsu Cc: Nadav Amit Cc: Paolo Bonzini Cc: Pavel Tatashin Cc: Peter Feiner Cc: "Peter Zijlstra (Intel)" Cc: "Radim Krčmář" Cc: "Rafael J. Wysocki" Cc: Ravi V Shankar Cc: Sherry Hurwitz Cc: Thomas Gleixner Cc: Thomas Lendacky Cc: x86 Link: https://lkml.kernel.org/r/1560794416-217638-2-git-send-email-fenghua.yu@intel.com --- arch/x86/include/asm/cpufeature.h | 4 ++-- arch/x86/include/asm/cpufeatures.h | 17 ++++++++++------- arch/x86/kernel/cpu/common.c | 38 +++++++++++++++----------------------- arch/x86/kernel/cpu/cpuid-deps.c | 3 +++ arch/x86/kernel/cpu/scattered.c | 4 ++++ arch/x86/kvm/cpuid.h | 2 -- 6 files changed, 34 insertions(+), 34 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 1d337c51f7e6..403f70c2e431 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -22,8 +22,8 @@ enum cpuid_leafs CPUID_LNX_3, CPUID_7_0_EBX, CPUID_D_1_EAX, - CPUID_F_0_EDX, - CPUID_F_1_EDX, + CPUID_LNX_4, + CPUID_DUMMY, CPUID_8000_0008_EBX, CPUID_6_EAX, CPUID_8000_000A_EDX, diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 1017b9c7dfe0..be858b86023a 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -271,13 +271,16 @@ #define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */ #define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */ -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (EDX), word 11 */ -#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ - -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (EDX), word 12 */ -#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring */ -#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ -#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ +/* + * Extended auxiliary flags: Linux defined - for features scattered in various + * CPUID levels like 0xf, etc. + * + * Reuse free bits when adding new feature flags! + */ +#define X86_FEATURE_CQM_LLC (11*32+ 0) /* LLC QoS if 1 */ +#define X86_FEATURE_CQM_OCCUP_LLC (11*32+ 1) /* LLC occupancy monitoring */ +#define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* LLC Total MBM monitoring */ +#define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index fe6ed9696467..efb114298cfb 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -803,33 +803,25 @@ static void init_speculation_control(struct cpuinfo_x86 *c) static void init_cqm(struct cpuinfo_x86 *c) { - u32 eax, ebx, ecx, edx; - - /* Additional Intel-defined flags: level 0x0000000F */ - if (c->cpuid_level >= 0x0000000F) { + if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { + c->x86_cache_max_rmid = -1; + c->x86_cache_occ_scale = -1; + return; + } - /* QoS sub-leaf, EAX=0Fh, ECX=0 */ - cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx); - c->x86_capability[CPUID_F_0_EDX] = edx; + /* will be overridden if occupancy monitoring exists */ + c->x86_cache_max_rmid = cpuid_ebx(0xf); - if (cpu_has(c, X86_FEATURE_CQM_LLC)) { - /* will be overridden if occupancy monitoring exists */ - c->x86_cache_max_rmid = ebx; + if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || + cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || + cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { + u32 eax, ebx, ecx, edx; - /* QoS sub-leaf, EAX=0Fh, ECX=1 */ - cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx); - c->x86_capability[CPUID_F_1_EDX] = edx; + /* QoS sub-leaf, EAX=0Fh, ECX=1 */ + cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx); - if ((cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) || - ((cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL)) || - (cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)))) { - c->x86_cache_max_rmid = ecx; - c->x86_cache_occ_scale = ebx; - } - } else { - c->x86_cache_max_rmid = -1; - c->x86_cache_occ_scale = -1; - } + c->x86_cache_max_rmid = ecx; + c->x86_cache_occ_scale = ebx; } } diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 2c0bd38a44ab..fa07a224e7b9 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -59,6 +59,9 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, + { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, {} }; diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 94aa1c72ca98..adf9b71386ef 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -26,6 +26,10 @@ struct cpuid_bit { static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, + { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, + { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, + { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 }, { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 9a327d5b6d1f..d78a61408243 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -47,8 +47,6 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX}, [CPUID_7_0_EBX] = { 7, 0, CPUID_EBX}, [CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX}, - [CPUID_F_0_EDX] = { 0xf, 0, CPUID_EDX}, - [CPUID_F_1_EDX] = { 0xf, 1, CPUID_EDX}, [CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX}, [CPUID_6_EAX] = { 6, 0, CPUID_EAX}, [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX}, -- cgit v1.2.3 From b302e4b176d00e1cbc80148c5d0aee36751f7480 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 17 Jun 2019 11:00:16 -0700 Subject: x86/cpufeatures: Enumerate the new AVX512 BFLOAT16 instructions AVX512 BFLOAT16 instructions support 16-bit BFLOAT16 floating-point format (BF16) for deep learning optimization. BF16 is a short version of 32-bit single-precision floating-point format (FP32) and has several advantages over 16-bit half-precision floating-point format (FP16). BF16 keeps FP32 accumulation after multiplication without loss of precision, offers more than enough range for deep learning training tasks, and doesn't need to handle hardware exception. AVX512 BFLOAT16 instructions are enumerated in CPUID.7.1:EAX[bit 5] AVX512_BF16. CPUID.7.1:EAX contains only feature bits. Reuse the currently empty word 12 as a pure features word to hold the feature bits including AVX512_BF16. Detailed information of the CPUID bit and AVX512 BFLOAT16 instructions can be found in the latest Intel Architecture Instruction Set Extensions and Future Features Programming Reference. [ bp: Check CPUID(7) subleaf validity before accessing subleaf 1. ] Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Cc: "Chang S. Bae" Cc: Frederic Weisbecker Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Masahiro Yamada Cc: Michael Ellerman Cc: Nadav Amit Cc: Paolo Bonzini Cc: Pavel Tatashin Cc: Peter Feiner Cc: Radim Krcmar Cc: "Rafael J. Wysocki" Cc: "Ravi V Shankar" Cc: Robert Hoo Cc: "Sean J Christopherson" Cc: Thomas Gleixner Cc: Thomas Lendacky Cc: x86 Link: https://lkml.kernel.org/r/1560794416-217638-3-git-send-email-fenghua.yu@intel.com --- arch/x86/include/asm/cpufeature.h | 2 +- arch/x86/include/asm/cpufeatures.h | 3 +++ arch/x86/kernel/cpu/common.c | 6 ++++++ arch/x86/kernel/cpu/cpuid-deps.c | 1 + 4 files changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 403f70c2e431..58acda503817 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -23,7 +23,7 @@ enum cpuid_leafs CPUID_7_0_EBX, CPUID_D_1_EAX, CPUID_LNX_4, - CPUID_DUMMY, + CPUID_7_1_EAX, CPUID_8000_0008_EBX, CPUID_6_EAX, CPUID_8000_000A_EDX, diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index be858b86023a..8ecd9fac97c3 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -282,6 +282,9 @@ #define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* LLC Total MBM monitoring */ #define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */ +/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ +#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ + /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index efb114298cfb..dad20bc891d5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -847,6 +847,12 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_7_0_EBX] = ebx; c->x86_capability[CPUID_7_ECX] = ecx; c->x86_capability[CPUID_7_EDX] = edx; + + /* Check valid sub-leaf index before accessing it */ + if (eax >= 1) { + cpuid_count(0x00000007, 1, &eax, &ebx, &ecx, &edx); + c->x86_capability[CPUID_7_1_EAX] = eax; + } } /* Extended state features: level 0x0000000d */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index fa07a224e7b9..a444028d8145 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -62,6 +62,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC }, { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, {} }; -- cgit v1.2.3 From b64ed19b93c368be0fb6acf05377e8e3a694c92b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 8 May 2019 03:02:18 -0700 Subject: x86/cpu: Add 'unsafe_fsgsbase' to enable CR4.FSGSBASE This is temporary. It will allow the next few patches to be tested incrementally. Setting unsafe_fsgsbase is a root hole. Don't do it. Signed-off-by: Andy Lutomirski Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Reviewed-by: Andi Kleen Reviewed-by: Andy Lutomirski Cc: Ravi Shankar Cc: Andrew Morton Cc: Randy Dunlap Cc: H. Peter Anvin Link: https://lkml.kernel.org/r/1557309753-24073-4-git-send-email-chang.seok.bae@intel.com --- Documentation/admin-guide/kernel-parameters.txt | 3 +++ arch/x86/kernel/cpu/common.c | 24 ++++++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'arch/x86/kernel/cpu') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 138f6664b2e2..b0fa5273b0fc 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2857,6 +2857,9 @@ no5lvl [X86-64] Disable 5-level paging mode. Forces kernel to use 4-level paging instead. + unsafe_fsgsbase [X86] Allow FSGSBASE instructions. This will be + replaced with a nofsgsbase flag. + no_console_suspend [HW] Never suspend the console Disable suspending of consoles during suspend and diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index dad20bc891d5..71defe2d1b7c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -366,6 +366,22 @@ out: cr4_clear_bits(X86_CR4_UMIP); } +/* + * Temporary hack: FSGSBASE is unsafe until a few kernel code paths are + * updated. This allows us to get the kernel ready incrementally. + * + * Once all the pieces are in place, these will go away and be replaced with + * a nofsgsbase chicken flag. + */ +static bool unsafe_fsgsbase; + +static __init int setup_unsafe_fsgsbase(char *arg) +{ + unsafe_fsgsbase = true; + return 1; +} +__setup("unsafe_fsgsbase", setup_unsafe_fsgsbase); + /* * Protection Keys are not available in 32-bit mode. */ @@ -1370,6 +1386,14 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_smap(c); setup_umip(c); + /* Enable FSGSBASE instructions if available. */ + if (cpu_has(c, X86_FEATURE_FSGSBASE)) { + if (unsafe_fsgsbase) + cr4_set_bits(X86_CR4_FSGSBASE); + else + clear_cpu_cap(c, X86_FEATURE_FSGSBASE); + } + /* * The vendor-specific functions might have changed features. * Now we do "generic changes." -- cgit v1.2.3 From 2032f1f96ee0da600633c6c627b9c0a2e0f8b8a6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 8 May 2019 03:02:31 -0700 Subject: x86/cpu: Enable FSGSBASE on 64bit by default and add a chicken bit Now that FSGSBASE is fully supported, remove unsafe_fsgsbase, enable FSGSBASE by default, and add nofsgsbase to disable it. Signed-off-by: Andy Lutomirski Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Reviewed-by: Andi Kleen Cc: Ravi Shankar Cc: H. Peter Anvin Link: https://lkml.kernel.org/r/1557309753-24073-17-git-send-email-chang.seok.bae@intel.com --- Documentation/admin-guide/kernel-parameters.txt | 3 +-- arch/x86/kernel/cpu/common.c | 32 +++++++++++-------------- 2 files changed, 15 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b0fa5273b0fc..35bc3c3574c6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2857,8 +2857,7 @@ no5lvl [X86-64] Disable 5-level paging mode. Forces kernel to use 4-level paging instead. - unsafe_fsgsbase [X86] Allow FSGSBASE instructions. This will be - replaced with a nofsgsbase flag. + nofsgsbase [X86] Disables FSGSBASE instructions. no_console_suspend [HW] Never suspend the console diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 71defe2d1b7c..1305f16b6105 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -366,21 +366,21 @@ out: cr4_clear_bits(X86_CR4_UMIP); } -/* - * Temporary hack: FSGSBASE is unsafe until a few kernel code paths are - * updated. This allows us to get the kernel ready incrementally. - * - * Once all the pieces are in place, these will go away and be replaced with - * a nofsgsbase chicken flag. - */ -static bool unsafe_fsgsbase; - -static __init int setup_unsafe_fsgsbase(char *arg) +static __init int x86_nofsgsbase_setup(char *arg) { - unsafe_fsgsbase = true; + /* Require an exact match without trailing characters. */ + if (strlen(arg)) + return 0; + + /* Do not emit a message if the feature is not present. */ + if (!boot_cpu_has(X86_FEATURE_FSGSBASE)) + return 1; + + setup_clear_cpu_cap(X86_FEATURE_FSGSBASE); + pr_info("FSGSBASE disabled via kernel command line\n"); return 1; } -__setup("unsafe_fsgsbase", setup_unsafe_fsgsbase); +__setup("nofsgsbase", x86_nofsgsbase_setup); /* * Protection Keys are not available in 32-bit mode. @@ -1387,12 +1387,8 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_umip(c); /* Enable FSGSBASE instructions if available. */ - if (cpu_has(c, X86_FEATURE_FSGSBASE)) { - if (unsafe_fsgsbase) - cr4_set_bits(X86_CR4_FSGSBASE); - else - clear_cpu_cap(c, X86_FEATURE_FSGSBASE); - } + if (cpu_has(c, X86_FEATURE_FSGSBASE)) + cr4_set_bits(X86_CR4_FSGSBASE); /* * The vendor-specific functions might have changed features. -- cgit v1.2.3 From f987c955c74501c9295a81372c7d363cbe07c8a6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 8 May 2019 03:02:32 -0700 Subject: x86/elf: Enumerate kernel FSGSBASE capability in AT_HWCAP2 The kernel needs to explicitly enable FSGSBASE. So, the application needs to know if it can safely use these instructions. Just looking at the CPUID bit is not enough because it may be running in a kernel that does not enable the instructions. One way for the application would be to just try and catch the SIGILL. But that is difficult to do in libraries which may not want to overwrite the signal handlers of the main application. Enumerate the enabled FSGSBASE capability in bit 1 of AT_HWCAP2 in the ELF aux vector. AT_HWCAP2 is already used by PPC for similar purposes. The application can access it open coded or by using the getauxval() function in newer versions of glibc. [ tglx: Massaged changelog ] Signed-off-by: Andi Kleen Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Ravi Shankar Cc: H. Peter Anvin Link: https://lkml.kernel.org/r/1557309753-24073-18-git-send-email-chang.seok.bae@intel.com --- arch/x86/include/uapi/asm/hwcap2.h | 3 +++ arch/x86/kernel/cpu/common.c | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h index 6ebaae90e207..c5ce54e749f6 100644 --- a/arch/x86/include/uapi/asm/hwcap2.h +++ b/arch/x86/include/uapi/asm/hwcap2.h @@ -5,4 +5,7 @@ /* MONITOR/MWAIT enabled in Ring 3 */ #define HWCAP2_RING3MWAIT (1 << 0) +/* Kernel allows FSGSBASE instructions available in Ring 3 */ +#define HWCAP2_FSGSBASE BIT(1) + #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 1305f16b6105..637c9117d5ae 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1387,8 +1387,10 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_umip(c); /* Enable FSGSBASE instructions if available. */ - if (cpu_has(c, X86_FEATURE_FSGSBASE)) + if (cpu_has(c, X86_FEATURE_FSGSBASE)) { cr4_set_bits(X86_CR4_FSGSBASE); + elf_hwcap2 |= HWCAP2_FSGSBASE; + } /* * The vendor-specific functions might have changed features. -- cgit v1.2.3 From 761fdd5e3327db6c646a09bab5ad48cd42680cd2 Mon Sep 17 00:00:00 2001 From: Tony W Wang-oc Date: Tue, 18 Jun 2019 08:37:05 +0000 Subject: x86/cpu: Create Zhaoxin processors architecture support file Add x86 architecture support for new Zhaoxin processors. Carve out initialization code needed by Zhaoxin processors into a separate compilation unit. To identify Zhaoxin CPU, add a new vendor type X86_VENDOR_ZHAOXIN for system recognition. Signed-off-by: Tony W Wang-oc Signed-off-by: Thomas Gleixner Cc: "hpa@zytor.com" Cc: "gregkh@linuxfoundation.org" Cc: "rjw@rjwysocki.net" Cc: "lenb@kernel.org" Cc: David Wang Cc: "Cooper Yan(BJ-RD)" Cc: "Qiyuan Wang(BJ-RD)" Cc: "Herry Yang(BJ-RD)" Link: https://lkml.kernel.org/r/01042674b2f741b2aed1f797359bdffb@zhaoxin.com --- MAINTAINERS | 6 ++ arch/x86/Kconfig.cpu | 13 +++ arch/x86/include/asm/processor.h | 3 +- arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/zhaoxin.c | 167 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 189 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/cpu/zhaoxin.c (limited to 'arch/x86/kernel/cpu') diff --git a/MAINTAINERS b/MAINTAINERS index 57f496cff999..dfdefc6cb3a2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17477,6 +17477,12 @@ Q: https://patchwork.linuxtv.org/project/linux-media/list/ S: Maintained F: drivers/media/dvb-frontends/zd1301_demod* +ZHAOXIN PROCESSOR SUPPORT +M: Tony W Wang-oc +L: linux-kernel@vger.kernel.org +S: Maintained +F: arch/x86/kernel/cpu/zhaoxin.c + ZPOOL COMPRESSED PAGE STORAGE API M: Dan Streetman L: linux-mm@kvack.org diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 6adce15268bd..8e29c991ba3e 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -480,3 +480,16 @@ config CPU_SUP_UMC_32 CPU might render the kernel unbootable. If unsure, say N. + +config CPU_SUP_ZHAOXIN + default y + bool "Support Zhaoxin processors" if PROCESSOR_SELECT + help + This enables detection, tunings and quirks for Zhaoxin processors + + You need this enabled if you want your kernel to run on a + Zhaoxin CPU. Disabling this option on other types of CPUs + makes the kernel a tiny bit smaller. Disabling it on a Zhaoxin + CPU might render the kernel unbootable. + + If unsure, say N. diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c34a35c78618..e57d2ca2ed87 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -144,7 +144,8 @@ enum cpuid_regs_idx { #define X86_VENDOR_TRANSMETA 7 #define X86_VENDOR_NSC 8 #define X86_VENDOR_HYGON 9 -#define X86_VENDOR_NUM 10 +#define X86_VENDOR_ZHAOXIN 10 +#define X86_VENDOR_NUM 11 #define X86_VENDOR_UNKNOWN 0xff diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 5102bf7c8192..a7d9a4cb3ab6 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -38,6 +38,7 @@ obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o +obj-$(CONFIG_CPU_SUP_ZHAOXIN) += zhaoxin.o obj-$(CONFIG_X86_MCE) += mce/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c new file mode 100644 index 000000000000..8e6f2f4b4afe --- /dev/null +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +#include + +#include "cpu.h" + +#define MSR_ZHAOXIN_FCR57 0x00001257 + +#define ACE_PRESENT (1 << 6) +#define ACE_ENABLED (1 << 7) +#define ACE_FCR (1 << 7) /* MSR_ZHAOXIN_FCR */ + +#define RNG_PRESENT (1 << 2) +#define RNG_ENABLED (1 << 3) +#define RNG_ENABLE (1 << 8) /* MSR_ZHAOXIN_RNG */ + +#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000 +#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000 +#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000 +#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001 +#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002 +#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020 + +static void init_zhaoxin_cap(struct cpuinfo_x86 *c) +{ + u32 lo, hi; + + /* Test for Extended Feature Flags presence */ + if (cpuid_eax(0xC0000000) >= 0xC0000001) { + u32 tmp = cpuid_edx(0xC0000001); + + /* Enable ACE unit, if present and disabled */ + if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) { + rdmsr(MSR_ZHAOXIN_FCR57, lo, hi); + /* Enable ACE unit */ + lo |= ACE_FCR; + wrmsr(MSR_ZHAOXIN_FCR57, lo, hi); + pr_info("CPU: Enabled ACE h/w crypto\n"); + } + + /* Enable RNG unit, if present and disabled */ + if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) { + rdmsr(MSR_ZHAOXIN_FCR57, lo, hi); + /* Enable RNG unit */ + lo |= RNG_ENABLE; + wrmsr(MSR_ZHAOXIN_FCR57, lo, hi); + pr_info("CPU: Enabled h/w RNG\n"); + } + + /* + * Store Extended Feature Flags as word 5 of the CPU + * capability bit array + */ + c->x86_capability[CPUID_C000_0001_EDX] = cpuid_edx(0xC0000001); + } + + if (c->x86 >= 0x6) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + cpu_detect_cache_sizes(c); +} + +static void early_init_zhaoxin(struct cpuinfo_x86 *c) +{ + if (c->x86 >= 0x6) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSENTER32); +#endif + if (c->x86_power & (1 << 8)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } + + if (c->cpuid_level >= 0x00000001) { + u32 eax, ebx, ecx, edx; + + cpuid(0x00000001, &eax, &ebx, &ecx, &edx); + /* + * If HTT (EDX[28]) is set EBX[16:23] contain the number of + * apicids which are reserved per package. Store the resulting + * shift value for the package management code. + */ + if (edx & (1U << 28)) + c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff); + } + +} + +static void zhaoxin_detect_vmx_virtcap(struct cpuinfo_x86 *c) +{ + u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2; + + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high); + msr_ctl = vmx_msr_high | vmx_msr_low; + + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW) + set_cpu_cap(c, X86_FEATURE_TPR_SHADOW); + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI) + set_cpu_cap(c, X86_FEATURE_VNMI); + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) { + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, + vmx_msr_low, vmx_msr_high); + msr_ctl2 = vmx_msr_high | vmx_msr_low; + if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) && + (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)) + set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); + if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) + set_cpu_cap(c, X86_FEATURE_EPT); + if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID) + set_cpu_cap(c, X86_FEATURE_VPID); + } +} + +static void init_zhaoxin(struct cpuinfo_x86 *c) +{ + early_init_zhaoxin(c); + init_intel_cacheinfo(c); + detect_num_cpu_cores(c); +#ifdef CONFIG_X86_32 + detect_ht(c); +#endif + + if (c->cpuid_level > 9) { + unsigned int eax = cpuid_eax(10); + + /* + * Check for version and the number of counters + * Version(eax[7:0]) can't be 0; + * Counters(eax[15:8]) should be greater than 1; + */ + if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1)) + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); + } + + if (c->x86 >= 0x6) + init_zhaoxin_cap(c); +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); +#endif + + if (cpu_has(c, X86_FEATURE_VMX)) + zhaoxin_detect_vmx_virtcap(c); +} + +#ifdef CONFIG_X86_32 +static unsigned int +zhaoxin_size_cache(struct cpuinfo_x86 *c, unsigned int size) +{ + return size; +} +#endif + +static const struct cpu_dev zhaoxin_cpu_dev = { + .c_vendor = "zhaoxin", + .c_ident = { " Shanghai " }, + .c_early_init = early_init_zhaoxin, + .c_init = init_zhaoxin, +#ifdef CONFIG_X86_32 + .legacy_cache_size = zhaoxin_size_cache, +#endif + .c_x86_vendor = X86_VENDOR_ZHAOXIN, +}; + +cpu_dev_register(zhaoxin_cpu_dev); -- cgit v1.2.3 From cc9e303c91f5c25c49a4312552841f4c23fa2b69 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 15 May 2019 09:59:00 +0300 Subject: x86/cpu: Disable frequency requests via aperfmperf IPI for nohz_full CPUs Since commit 7d5905dc14a8 ("x86 / CPU: Always show current CPU frequency in /proc/cpuinfo") open and read of /proc/cpuinfo sends IPI to all CPUs. Many applications read /proc/cpuinfo at the start for trivial reasons like counting cores or detecting cpu features. While sensitive workloads like DPDK network polling don't like any interrupts. Integrates this feature with cpu isolation and do not send IPIs to CPUs without housekeeping flag HK_FLAG_MISC (set by nohz_full). Code that requests cpu frequency like show_cpuinfo() falls back to the last frequency set by the cpufreq driver if this method returns 0. Signed-off-by: Konstantin Khlebnikov Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Len Brown Cc: Frederic Weisbecker Cc: "Rafael J. Wysocki" Cc: "Paul E. McKenney" Link: https://lkml.kernel.org/r/155790354043.1104.15333317408370209.stgit@buzz --- arch/x86/kernel/cpu/aperfmperf.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index e71a6ff8a67e..e2f319dc992d 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "cpu.h" @@ -85,6 +86,9 @@ unsigned int aperfmperf_get_khz(int cpu) if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return 0; + if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) + return 0; + aperfmperf_snapshot_cpu(cpu, ktime_get(), true); return per_cpu(samples.khz, cpu); } @@ -101,9 +105,12 @@ void arch_freq_prepare_all(void) if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return; - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { + if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) + continue; if (!aperfmperf_snapshot_cpu(cpu, now, false)) wait = true; + } if (wait) msleep(APERFMPERF_REFRESH_DELAY_MS); @@ -117,6 +124,9 @@ unsigned int arch_freq_get_on_cpu(int cpu) if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return 0; + if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) + return 0; + if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true)) return per_cpu(samples.khz, cpu); -- cgit v1.2.3 From bd688c69b7e6693de3bd78f38fd63f7850c2711e Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 19 Jun 2019 18:33:55 -0700 Subject: x86/umwait: Initialize umwait control values umwait or tpause allows the processor to enter a light-weight power/performance optimized state (C0.1 state) or an improved power/performance optimized state (C0.2 state) for a period specified by the instruction or until the system time limit or until a store to the monitored address range in umwait. IA32_UMWAIT_CONTROL MSR register allows the OS to enable/disable C0.2 on the processor and to set the maximum time the processor can reside in C0.1 or C0.2. By default C0.2 is enabled so the user wait instructions can enter the C0.2 state to save more power with slower wakeup time. Andy Lutomirski proposed to set the maximum umwait time to 100000 cycles by default. A quote from Andy: "What I want to avoid is the case where it works dramatically differently on NO_HZ_FULL systems as compared to everything else. Also, UMWAIT may behave a bit differently if the max timeout is hit, and I'd like that path to get exercised widely by making it happen even on default configs." A sysfs interface to adjust the time and the C0.2 enablement is provided in a follow up change. [ tglx: Renamed MSR_IA32_UMWAIT_CONTROL_MAX_TIME to MSR_IA32_UMWAIT_CONTROL_TIME_MASK because the constant is used as mask throughout the code. Massaged comments and changelog ] Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Reviewed-by: Ashok Raj Reviewed-by: Andy Lutomirski Cc: "Borislav Petkov" Cc: "H Peter Anvin" Cc: "Peter Zijlstra" Cc: "Tony Luck" Cc: "Ravi V Shankar" Link: https://lkml.kernel.org/r/1560994438-235698-3-git-send-email-fenghua.yu@intel.com --- arch/x86/include/asm/msr-index.h | 9 ++++++ arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/umwait.c | 62 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+) create mode 100644 arch/x86/kernel/cpu/umwait.c (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 979ef971cc78..6b4fc2788078 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -61,6 +61,15 @@ #define MSR_PLATFORM_INFO_CPUID_FAULT_BIT 31 #define MSR_PLATFORM_INFO_CPUID_FAULT BIT_ULL(MSR_PLATFORM_INFO_CPUID_FAULT_BIT) +#define MSR_IA32_UMWAIT_CONTROL 0xe1 +#define MSR_IA32_UMWAIT_CONTROL_C02_DISABLE BIT(0) +#define MSR_IA32_UMWAIT_CONTROL_RESERVED BIT(1) +/* + * The time field is bit[31:2], but representing a 32bit value with + * bit[1:0] zero. + */ +#define MSR_IA32_UMWAIT_CONTROL_TIME_MASK (~0x03U) + #define MSR_PKG_CST_CONFIG_CONTROL 0x000000e2 #define NHM_C3_AUTO_DEMOTE (1UL << 25) #define NHM_C1_AUTO_DEMOTE (1UL << 26) diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index a7d9a4cb3ab6..4b4eb06e117c 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -24,6 +24,7 @@ obj-y += match.o obj-y += bugs.o obj-y += aperfmperf.o obj-y += cpuid-deps.o +obj-y += umwait.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c new file mode 100644 index 000000000000..0a113c731df3 --- /dev/null +++ b/arch/x86/kernel/cpu/umwait.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include + +#define UMWAIT_C02_ENABLE 0 + +#define UMWAIT_CTRL_VAL(maxtime, c02_disable) \ + (((maxtime) & MSR_IA32_UMWAIT_CONTROL_TIME_MASK) | \ + ((c02_disable) & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE)) + +/* + * Cache IA32_UMWAIT_CONTROL MSR. This is a systemwide control. By default, + * umwait max time is 100000 in TSC-quanta and C0.2 is enabled + */ +static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE); + +/* Set IA32_UMWAIT_CONTROL MSR on this CPU to the current global setting. */ +static int umwait_cpu_online(unsigned int cpu) +{ + wrmsr(MSR_IA32_UMWAIT_CONTROL, umwait_control_cached, 0); + return 0; +} + +/* + * On resume, restore IA32_UMWAIT_CONTROL MSR on the boot processor which + * is the only active CPU at this time. The MSR is set up on the APs via the + * CPU hotplug callback. + * + * This function is invoked on resume from suspend and hibernation. On + * resume from suspend the restore should be not required, but we neither + * trust the firmware nor does it matter if the same value is written + * again. + */ +static void umwait_syscore_resume(void) +{ + wrmsr(MSR_IA32_UMWAIT_CONTROL, umwait_control_cached, 0); +} + +static struct syscore_ops umwait_syscore_ops = { + .resume = umwait_syscore_resume, +}; + +static int __init umwait_init(void) +{ + int ret; + + if (!boot_cpu_has(X86_FEATURE_WAITPKG)) + return -ENODEV; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online", + umwait_cpu_online, NULL); + if (ret < 0) + return ret; + + register_syscore_ops(&umwait_syscore_ops); + + return 0; +} +device_initcall(umwait_init); -- cgit v1.2.3 From ff4b353f2ef9dc8e396d7cb9572801e34a8c7374 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 19 Jun 2019 18:33:56 -0700 Subject: x86/umwait: Add sysfs interface to control umwait C0.2 state C0.2 state in umwait and tpause instructions can be enabled or disabled on a processor through IA32_UMWAIT_CONTROL MSR register. By default, C0.2 is enabled and the user wait instructions results in lower power consumption with slower wakeup time. But in real time systems which require faster wakeup time although power savings could be smaller, the administrator needs to disable C0.2 and all umwait invocations from user applications use C0.1. Create a sysfs interface which allows the administrator to control C0.2 state during run time. Andy Lutomirski suggested to turn off local irqs before writing the MSR to ensure the cached control value is not changed by a concurrent sysfs write from a different CPU via IPI. [ tglx: Simplified the update logic in the write function and got rid of all the convoluted type casts. Added a shared update function and made the namespace consistent. Moved the sysfs create invocation. Massaged changelog ] Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Reviewed-by: Ashok Raj Reviewed-by: Tony Luck Cc: "Borislav Petkov" Cc: "H Peter Anvin" Cc: "Andy Lutomirski" Cc: "Peter Zijlstra" Cc: "Ravi V Shankar" Link: https://lkml.kernel.org/r/1560994438-235698-4-git-send-email-fenghua.yu@intel.com --- arch/x86/kernel/cpu/umwait.c | 118 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 110 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c index 0a113c731df3..56149d630e35 100644 --- a/arch/x86/kernel/cpu/umwait.c +++ b/arch/x86/kernel/cpu/umwait.c @@ -7,8 +7,8 @@ #define UMWAIT_C02_ENABLE 0 -#define UMWAIT_CTRL_VAL(maxtime, c02_disable) \ - (((maxtime) & MSR_IA32_UMWAIT_CONTROL_TIME_MASK) | \ +#define UMWAIT_CTRL_VAL(max_time, c02_disable) \ + (((max_time) & MSR_IA32_UMWAIT_CONTROL_TIME_MASK) | \ ((c02_disable) & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE)) /* @@ -17,10 +17,38 @@ */ static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE); -/* Set IA32_UMWAIT_CONTROL MSR on this CPU to the current global setting. */ +/* + * Serialize access to umwait_control_cached and IA32_UMWAIT_CONTROL MSR in + * the sysfs write functions. + */ +static DEFINE_MUTEX(umwait_lock); + +static void umwait_update_control_msr(void * unused) +{ + lockdep_assert_irqs_disabled(); + wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0); +} + +/* + * The CPU hotplug callback sets the control MSR to the global control + * value. + * + * Disable interrupts so the read of umwait_control_cached and the WRMSR + * are protected against a concurrent sysfs write. Otherwise the sysfs + * write could update the cached value after it had been read on this CPU + * and issue the IPI before the old value had been written. The IPI would + * interrupt, write the new value and after return from IPI the previous + * value would be written by this CPU. + * + * With interrupts disabled the upcoming CPU either sees the new control + * value or the IPI is updating this CPU to the new control value after + * interrupts have been reenabled. + */ static int umwait_cpu_online(unsigned int cpu) { - wrmsr(MSR_IA32_UMWAIT_CONTROL, umwait_control_cached, 0); + local_irq_disable(); + umwait_update_control_msr(NULL); + local_irq_enable(); return 0; } @@ -36,15 +64,86 @@ static int umwait_cpu_online(unsigned int cpu) */ static void umwait_syscore_resume(void) { - wrmsr(MSR_IA32_UMWAIT_CONTROL, umwait_control_cached, 0); + umwait_update_control_msr(NULL); } static struct syscore_ops umwait_syscore_ops = { .resume = umwait_syscore_resume, }; +/* sysfs interface */ + +/* + * When bit 0 in IA32_UMWAIT_CONTROL MSR is 1, C0.2 is disabled. + * Otherwise, C0.2 is enabled. + */ +static inline bool umwait_ctrl_c02_enabled(u32 ctrl) +{ + return !(ctrl & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE); +} + +static inline u32 umwait_ctrl_max_time(u32 ctrl) +{ + return ctrl & MSR_IA32_UMWAIT_CONTROL_TIME_MASK; +} + +static inline void umwait_update_control(u32 maxtime, bool c02_enable) +{ + u32 ctrl = maxtime & MSR_IA32_UMWAIT_CONTROL_TIME_MASK; + + if (!c02_enable) + ctrl |= MSR_IA32_UMWAIT_CONTROL_C02_DISABLE; + + WRITE_ONCE(umwait_control_cached, ctrl); + /* Propagate to all CPUs */ + on_each_cpu(umwait_update_control_msr, NULL, 1); +} + +static ssize_t +enable_c02_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + u32 ctrl = READ_ONCE(umwait_control_cached); + + return sprintf(buf, "%d\n", umwait_ctrl_c02_enabled(ctrl)); +} + +static ssize_t enable_c02_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + bool c02_enable; + u32 ctrl; + int ret; + + ret = kstrtobool(buf, &c02_enable); + if (ret) + return ret; + + mutex_lock(&umwait_lock); + + ctrl = READ_ONCE(umwait_control_cached); + if (c02_enable != umwait_ctrl_c02_enabled(ctrl)) + umwait_update_control(ctrl, c02_enable); + + mutex_unlock(&umwait_lock); + + return count; +} +static DEVICE_ATTR_RW(enable_c02); + +static struct attribute *umwait_attrs[] = { + &dev_attr_enable_c02.attr, + NULL +}; + +static struct attribute_group umwait_attr_group = { + .attrs = umwait_attrs, + .name = "umwait_control", +}; + static int __init umwait_init(void) { + struct device *dev; int ret; if (!boot_cpu_has(X86_FEATURE_WAITPKG)) @@ -52,11 +151,14 @@ static int __init umwait_init(void) ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online", umwait_cpu_online, NULL); - if (ret < 0) - return ret; register_syscore_ops(&umwait_syscore_ops); - return 0; + /* + * Add umwait control interface. Ignore failure, so at least the + * default values are set up in case the machine manages to boot. + */ + dev = cpu_subsys.dev_root; + return sysfs_create_group(&dev->kobj, &umwait_attr_group); } device_initcall(umwait_init); -- cgit v1.2.3 From bd9a0c97e53c3d7a56b2751179903ddc5da42683 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 19 Jun 2019 18:33:57 -0700 Subject: x86/umwait: Add sysfs interface to control umwait maximum time IA32_UMWAIT_CONTROL[31:2] determines the maximum time in TSC-quanta that processor can stay in C0.1 or C0.2. A zero value means no maximum time. Each instruction sets its own deadline in the instruction's implicit input EDX:EAX value. The instruction wakes up if the time-stamp counter reaches or exceeds the specified deadline, or the umwait maximum time expires, or a store happens in the monitored address range in umwait. The administrator can write an unsigned 32-bit number to /sys/devices/system/cpu/umwait_control/max_time to change the default value. Note that a value of zero means there is no limit. The lower two bits of the value must be zero. [ tglx: Simplify the write function. Massage changelog ] Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Reviewed-by: Ashok Raj Reviewed-by: Tony Luck Cc: "Borislav Petkov" Cc: "H Peter Anvin" Cc: "Andy Lutomirski" Cc: "Peter Zijlstra" Cc: "Ravi V Shankar" Link: https://lkml.kernel.org/r/1560994438-235698-5-git-send-email-fenghua.yu@intel.com --- arch/x86/kernel/cpu/umwait.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c index 56149d630e35..6a204e7336c1 100644 --- a/arch/x86/kernel/cpu/umwait.c +++ b/arch/x86/kernel/cpu/umwait.c @@ -131,8 +131,44 @@ static ssize_t enable_c02_store(struct device *dev, } static DEVICE_ATTR_RW(enable_c02); +static ssize_t +max_time_show(struct device *kobj, struct device_attribute *attr, char *buf) +{ + u32 ctrl = READ_ONCE(umwait_control_cached); + + return sprintf(buf, "%u\n", umwait_ctrl_max_time(ctrl)); +} + +static ssize_t max_time_store(struct device *kobj, + struct device_attribute *attr, + const char *buf, size_t count) +{ + u32 max_time, ctrl; + int ret; + + ret = kstrtou32(buf, 0, &max_time); + if (ret) + return ret; + + /* bits[1:0] must be zero */ + if (max_time & ~MSR_IA32_UMWAIT_CONTROL_TIME_MASK) + return -EINVAL; + + mutex_lock(&umwait_lock); + + ctrl = READ_ONCE(umwait_control_cached); + if (max_time != umwait_ctrl_max_time(ctrl)) + umwait_update_control(max_time, umwait_ctrl_c02_enabled(ctrl)); + + mutex_unlock(&umwait_lock); + + return count; +} +static DEVICE_ATTR_RW(max_time); + static struct attribute *umwait_attrs[] = { &dev_attr_enable_c02.attr, + &dev_attr_max_time.attr, NULL }; -- cgit v1.2.3 From 1e03bff3600101bd9158d005e4313132e55bdec8 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 27 Jun 2019 19:35:36 -0700 Subject: x86/cpu/intel: Clear cache self-snoop capability in CPUs with known errata Processors which have self-snooping capability can handle conflicting memory type across CPUs by snooping its own cache. However, there exists CPU models in which having conflicting memory types still leads to unpredictable behavior, machine check errors, or hangs. Clear this feature on affected CPUs to prevent its use. Suggested-by: Alan Cox Signed-off-by: Ricardo Neri Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Cc: Tony Luck Cc: "H. Peter Anvin" Cc: Andy Shevchenko Cc: Andi Kleen Cc: Hans de Goede Cc: Greg Kroah-Hartman Cc: Jordan Borgner Cc: "Ravi V. Shankar" Cc: Mohammad Etemadi Cc: Ricardo Neri Cc: Andy Shevchenko Cc: Andi Kleen Cc: Peter Feiner Cc: "Rafael J. Wysocki" Link: https://lkml.kernel.org/r/1561689337-19390-2-git-send-email-ricardo.neri-calderon@linux.intel.com --- arch/x86/kernel/cpu/intel.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index f17c1a714779..8d6d92ebeb54 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -66,6 +66,32 @@ void check_mpx_erratum(struct cpuinfo_x86 *c) } } +/* + * Processors which have self-snooping capability can handle conflicting + * memory type across CPUs by snooping its own cache. However, there exists + * CPU models in which having conflicting memory types still leads to + * unpredictable behavior, machine check errors, or hangs. Clear this + * feature to prevent its use on machines with known erratas. + */ +static void check_memory_type_self_snoop_errata(struct cpuinfo_x86 *c) +{ + switch (c->x86_model) { + case INTEL_FAM6_CORE_YONAH: + case INTEL_FAM6_CORE2_MEROM: + case INTEL_FAM6_CORE2_MEROM_L: + case INTEL_FAM6_CORE2_PENRYN: + case INTEL_FAM6_CORE2_DUNNINGTON: + case INTEL_FAM6_NEHALEM: + case INTEL_FAM6_NEHALEM_G: + case INTEL_FAM6_NEHALEM_EP: + case INTEL_FAM6_NEHALEM_EX: + case INTEL_FAM6_WESTMERE: + case INTEL_FAM6_WESTMERE_EP: + case INTEL_FAM6_SANDYBRIDGE: + setup_clear_cpu_cap(X86_FEATURE_SELFSNOOP); + } +} + static bool ring3mwait_disabled __read_mostly; static int __init ring3mwait_disable(char *__unused) @@ -304,6 +330,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) } check_mpx_erratum(c); + check_memory_type_self_snoop_errata(c); /* * Get the number of SMT siblings early from the extended topology -- cgit v1.2.3 From fd329f276ecaad7a371d6f91b9bbea031d0c3440 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Thu, 27 Jun 2019 19:35:37 -0700 Subject: x86/mtrr: Skip cache flushes on CPUs with cache self-snooping Programming MTRR registers in multi-processor systems is a rather lengthy process. Furthermore, all processors must program these registers in lock step and with interrupts disabled; the process also involves flushing caches and TLBs twice. As a result, the process may take a considerable amount of time. On some platforms, this can lead to a large skew of the refined-jiffies clock source. Early when booting, if no other clock is available (e.g., booting with hpet=disabled), the refined-jiffies clock source is used to monitor the TSC clock source. If the skew of refined-jiffies is too large, Linux wrongly assumes that the TSC is unstable: clocksource: timekeeping watchdog on CPU1: Marking clocksource 'tsc-early' as unstable because the skew is too large: clocksource: 'refined-jiffies' wd_now: fffedc10 wd_last: fffedb90 mask: ffffffff clocksource: 'tsc-early' cs_now: 5eccfddebc cs_last: 5e7e3303d4 mask: ffffffffffffffff tsc: Marking TSC unstable due to clocksource watchdog As per measurements, around 98% of the time needed by the procedure to program MTRRs in multi-processor systems is spent flushing caches with wbinvd(). As per the Section 11.11.8 of the Intel 64 and IA 32 Architectures Software Developer's Manual, it is not necessary to flush caches if the CPU supports cache self-snooping. Thus, skipping the cache flushes can reduce by several tens of milliseconds the time needed to complete the programming of the MTRR registers: Platform Before After 104-core (208 Threads) Skylake 1437ms 28ms 2-core ( 4 Threads) Haswell 114ms 2ms Reported-by: Mohammad Etemadi Signed-off-by: Ricardo Neri Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Cc: Alan Cox Cc: Tony Luck Cc: "H. Peter Anvin" Cc: Andy Shevchenko Cc: Andi Kleen Cc: Hans de Goede Cc: Greg Kroah-Hartman Cc: Jordan Borgner Cc: "Ravi V. Shankar" Cc: Ricardo Neri Cc: Andy Shevchenko Cc: Andi Kleen Cc: Peter Feiner Cc: "Rafael J. Wysocki" Link: https://lkml.kernel.org/r/1561689337-19390-3-git-send-email-ricardo.neri-calderon@linux.intel.com --- arch/x86/kernel/cpu/mtrr/generic.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/cpu') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 9356c1c9024d..aa5c064a6a22 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -743,7 +743,15 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ cr0 = read_cr0() | X86_CR0_CD; write_cr0(cr0); - wbinvd(); + + /* + * Cache flushing is the most time-consuming step when programming + * the MTRRs. Fortunately, as per the Intel Software Development + * Manual, we can skip it if the processor supports cache self- + * snooping. + */ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + wbinvd(); /* Save value of CR4 and clear Page Global Enable (bit 7) */ if (boot_cpu_has(X86_FEATURE_PGE)) { @@ -760,7 +768,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Disable MTRRs, and set the default type to uncached */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); - wbinvd(); + + /* Again, only flush caches if we have to. */ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + wbinvd(); } static void post_set(void) __releases(set_atomicity_lock) -- cgit v1.2.3 From 049331f277fef1c3f2527c2c9afa1d285e9a1247 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Jul 2019 14:19:36 +0200 Subject: x86/fsgsbase: Revert FSGSBASE support The FSGSBASE series turned out to have serious bugs and there is still an open issue which is not fully understood yet. The confidence in those changes has become close to zero especially as the test cases which have been shipped with that series were obviously never run before sending the final series out to LKML. ./fsgsbase_64 >/dev/null Segmentation fault As the merge window is close, the only sane decision is to revert FSGSBASE support. The revert is necessary as this branch has been merged into perf/core already and rebasing all of that a few days before the merge window is not the most brilliant idea. I could definitely slap myself for not noticing the test case fail when merging that series, but TBH my expectations weren't that low back then. Won't happen again. Revert the following commits: 539bca535dec ("x86/entry/64: Fix and clean up paranoid_exit") 2c7b5ac5d5a9 ("Documentation/x86/64: Add documentation for GS/FS addressing mode") f987c955c745 ("x86/elf: Enumerate kernel FSGSBASE capability in AT_HWCAP2") 2032f1f96ee0 ("x86/cpu: Enable FSGSBASE on 64bit by default and add a chicken bit") 5bf0cab60ee2 ("x86/entry/64: Document GSBASE handling in the paranoid path") 708078f65721 ("x86/entry/64: Handle FSGSBASE enabled paranoid entry/exit") 79e1932fa3ce ("x86/entry/64: Introduce the FIND_PERCPU_BASE macro") 1d07316b1363 ("x86/entry/64: Switch CR3 before SWAPGS in paranoid entry") f60a83df4593 ("x86/process/64: Use FSGSBASE instructions on thread copy and ptrace") 1ab5f3f7fe3d ("x86/process/64: Use FSBSBASE in switch_to() if available") a86b4625138d ("x86/fsgsbase/64: Enable FSGSBASE instructions in helper functions") 8b71340d702e ("x86/fsgsbase/64: Add intrinsics for FSGSBASE instructions") b64ed19b93c3 ("x86/cpu: Add 'unsafe_fsgsbase' to enable CR4.FSGSBASE") Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Chang S. Bae Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Andi Kleen Cc: Ravi Shankar Cc: Dave Hansen Cc: H. Peter Anvin --- Documentation/admin-guide/kernel-parameters.txt | 2 - Documentation/x86/entry_64.rst | 9 -- Documentation/x86/x86_64/fsgs.rst | 199 ------------------------ Documentation/x86/x86_64/index.rst | 1 - arch/x86/entry/calling.h | 40 ----- arch/x86/entry/entry_64.S | 132 ++++------------ arch/x86/include/asm/fsgsbase.h | 45 ++---- arch/x86/include/asm/inst.h | 15 -- arch/x86/include/uapi/asm/hwcap2.h | 3 - arch/x86/kernel/cpu/common.c | 22 --- arch/x86/kernel/process_64.c | 119 ++------------ 11 files changed, 50 insertions(+), 537 deletions(-) delete mode 100644 Documentation/x86/x86_64/fsgs.rst (limited to 'arch/x86/kernel/cpu') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 35bc3c3574c6..138f6664b2e2 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2857,8 +2857,6 @@ no5lvl [X86-64] Disable 5-level paging mode. Forces kernel to use 4-level paging instead. - nofsgsbase [X86] Disables FSGSBASE instructions. - no_console_suspend [HW] Never suspend the console Disable suspending of consoles during suspend and diff --git a/Documentation/x86/entry_64.rst b/Documentation/x86/entry_64.rst index b87c1d816aea..a48b3f6ebbe8 100644 --- a/Documentation/x86/entry_64.rst +++ b/Documentation/x86/entry_64.rst @@ -108,12 +108,3 @@ We try to only use IST entries and the paranoid entry code for vectors that absolutely need the more expensive check for the GS base - and we generate all 'normal' entry points with the regular (faster) paranoid=0 variant. - -On a FSGSBASE system, however, user space can set GS without kernel -interaction. It means the value of GS base itself does not imply anything, -whether a kernel value or a user space value. So, there is no longer a safe -way to check whether the exception is entering from user mode or kernel -mode in the paranoid entry code path. So the GSBASE value needs to be read -out, saved and the kernel GSBASE value written. On exit the saved GSBASE -value needs to be restored unconditionally. The non paranoid entry/exit -code still uses SWAPGS unconditionally as the state is known. diff --git a/Documentation/x86/x86_64/fsgs.rst b/Documentation/x86/x86_64/fsgs.rst deleted file mode 100644 index 380c0b5ccca2..000000000000 --- a/Documentation/x86/x86_64/fsgs.rst +++ /dev/null @@ -1,199 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -Using FS and GS segments in user space applications -=================================================== - -The x86 architecture supports segmentation. Instructions which access -memory can use segment register based addressing mode. The following -notation is used to address a byte within a segment: - - Segment-register:Byte-address - -The segment base address is added to the Byte-address to compute the -resulting virtual address which is accessed. This allows to access multiple -instances of data with the identical Byte-address, i.e. the same code. The -selection of a particular instance is purely based on the base-address in -the segment register. - -In 32-bit mode the CPU provides 6 segments, which also support segment -limits. The limits can be used to enforce address space protections. - -In 64-bit mode the CS/SS/DS/ES segments are ignored and the base address is -always 0 to provide a full 64bit address space. The FS and GS segments are -still functional in 64-bit mode. - -Common FS and GS usage ------------------------------- - -The FS segment is commonly used to address Thread Local Storage (TLS). FS -is usually managed by runtime code or a threading library. Variables -declared with the '__thread' storage class specifier are instantiated per -thread and the compiler emits the FS: address prefix for accesses to these -variables. Each thread has its own FS base address so common code can be -used without complex address offset calculations to access the per thread -instances. Applications should not use FS for other purposes when they use -runtimes or threading libraries which manage the per thread FS. - -The GS segment has no common use and can be used freely by -applications. GCC and Clang support GS based addressing via address space -identifiers. - -Reading and writing the FS/GS base address ------------------------------------------- - -There exist two mechanisms to read and write the FS/FS base address: - - - the arch_prctl() system call - - - the FSGSBASE instruction family - -Accessing FS/GS base with arch_prctl() --------------------------------------- - - The arch_prctl(2) based mechanism is available on all 64bit CPUs and all - kernel versions. - - Reading the base: - - arch_prctl(ARCH_GET_FS, &fsbase); - arch_prctl(ARCH_GET_GS, &gsbase); - - Writing the base: - - arch_prctl(ARCH_SET_FS, fsbase); - arch_prctl(ARCH_SET_GS, gsbase); - - The ARCH_SET_GS prctl may be disabled depending on kernel configuration - and security settings. - -Accessing FS/GS base with the FSGSBASE instructions ---------------------------------------------------- - - With the Ivy Bridge CPU generation Intel introduced a new set of - instructions to access the FS and GS base registers directly from user - space. These instructions are also supported on AMD Family 17H CPUs. The - following instructions are available: - - =============== =========================== - RDFSBASE %reg Read the FS base register - RDGSBASE %reg Read the GS base register - WRFSBASE %reg Write the FS base register - WRGSBASE %reg Write the GS base register - =============== =========================== - - The instructions avoid the overhead of the arch_prctl() syscall and allow - more flexible usage of the FS/GS addressing modes in user space - applications. This does not prevent conflicts between threading libraries - and runtimes which utilize FS and applications which want to use it for - their own purpose. - -FSGSBASE instructions enablement -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - The instructions are enumerated in CPUID leaf 7, bit 0 of EBX. If - available /proc/cpuinfo shows 'fsgsbase' in the flag entry of the CPUs. - - The availability of the instructions does not enable them - automatically. The kernel has to enable them explicitly in CR4. The - reason for this is that older kernels make assumptions about the values in - the GS register and enforce them when GS base is set via - arch_prctl(). Allowing user space to write arbitrary values to GS base - would violate these assumptions and cause malfunction. - - On kernels which do not enable FSGSBASE the execution of the FSGSBASE - instructions will fault with a #UD exception. - - The kernel provides reliable information about the enabled state in the - ELF AUX vector. If the HWCAP2_FSGSBASE bit is set in the AUX vector, the - kernel has FSGSBASE instructions enabled and applications can use them. - The following code example shows how this detection works:: - - #include - #include - - /* Will be eventually in asm/hwcap.h */ - #ifndef HWCAP2_FSGSBASE - #define HWCAP2_FSGSBASE (1 << 1) - #endif - - .... - - unsigned val = getauxval(AT_HWCAP2); - - if (val & HWCAP2_FSGSBASE) - printf("FSGSBASE enabled\n"); - -FSGSBASE instructions compiler support -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -GCC version 4.6.4 and newer provide instrinsics for the FSGSBASE -instructions. Clang supports them as well. - - =================== =========================== - _readfsbase_u64() Read the FS base register - _readfsbase_u64() Read the GS base register - _writefsbase_u64() Write the FS base register - _writegsbase_u64() Write the GS base register - =================== =========================== - -To utilize these instrinsics must be included in the source -code and the compiler option -mfsgsbase has to be added. - -Compiler support for FS/GS based addressing -------------------------------------------- - -GCC version 6 and newer provide support for FS/GS based addressing via -Named Address Spaces. GCC implements the following address space -identifiers for x86: - - ========= ==================================== - __seg_fs Variable is addressed relative to FS - __seg_gs Variable is addressed relative to GS - ========= ==================================== - -The preprocessor symbols __SEG_FS and __SEG_GS are defined when these -address spaces are supported. Code which implements fallback modes should -check whether these symbols are defined. Usage example:: - - #ifdef __SEG_GS - - long data0 = 0; - long data1 = 1; - - long __seg_gs *ptr; - - /* Check whether FSGSBASE is enabled by the kernel (HWCAP2_FSGSBASE) */ - .... - - /* Set GS to point to data0 */ - _writegsbase_u64(&data0); - - /* Access offset 0 of GS */ - ptr = 0; - printf("data0 = %ld\n", *ptr); - - /* Set GS to point to data1 */ - _writegsbase_u64(&data1); - /* ptr still addresses offset 0! */ - printf("data1 = %ld\n", *ptr); - - -Clang does not provide the GCC address space identifiers, but it provides -address spaces via an attribute based mechanism in Clang 5 and newer -versions: - - ==================================== ===================================== - __attribute__((address_space(256)) Variable is addressed relative to GS - __attribute__((address_space(257)) Variable is addressed relative to FS - ==================================== ===================================== - -FS/GS based addressing with inline assembly -------------------------------------------- - -In case the compiler does not support address spaces, inline assembly can -be used for FS/GS based addressing mode:: - - mov %fs:offset, %reg - mov %gs:offset, %reg - - mov %reg, %fs:offset - mov %reg, %gs:offset diff --git a/Documentation/x86/x86_64/index.rst b/Documentation/x86/x86_64/index.rst index a56070fc8e77..d6eaaa5a35fc 100644 --- a/Documentation/x86/x86_64/index.rst +++ b/Documentation/x86/x86_64/index.rst @@ -14,4 +14,3 @@ x86_64 Support fake-numa-for-cpusets cpu-hotplug-spec machinecheck - fsgs diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index d3fbe2dc03ea..efb0d1b1f15f 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -6,7 +6,6 @@ #include #include #include -#include /* @@ -338,12 +337,6 @@ For 32-bit we have the following conventions - kernel is built with #endif .endm -.macro SAVE_AND_SET_GSBASE scratch_reg:req save_reg:req - rdgsbase \save_reg - GET_PERCPU_BASE \scratch_reg - wrgsbase \scratch_reg -.endm - #endif /* CONFIG_X86_64 */ .macro STACKLEAK_ERASE @@ -352,39 +345,6 @@ For 32-bit we have the following conventions - kernel is built with #endif .endm -#ifdef CONFIG_SMP - -/* - * CPU/node NR is loaded from the limit (size) field of a special segment - * descriptor entry in GDT. - */ -.macro LOAD_CPU_AND_NODE_SEG_LIMIT reg:req - movq $__CPUNODE_SEG, \reg - lsl \reg, \reg -.endm - -/* - * Fetch the per-CPU GSBASE value for this processor and put it in @reg. - * We normally use %gs for accessing per-CPU data, but we are setting up - * %gs here and obviously can not use %gs itself to access per-CPU data. - */ -.macro GET_PERCPU_BASE reg:req - ALTERNATIVE \ - "LOAD_CPU_AND_NODE_SEG_LIMIT \reg", \ - "RDPID \reg", \ - X86_FEATURE_RDPID - andq $VDSO_CPUNODE_MASK, \reg - movq __per_cpu_offset(, \reg, 8), \reg -.endm - -#else - -.macro GET_PERCPU_BASE reg:req - movq pcpu_unit_offsets(%rip), \reg -.endm - -#endif /* CONFIG_SMP */ - /* * This does 'call enter_from_user_mode' unless we can avoid it based on * kernel config or using the static jump infrastructure. diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 670306f588bf..3b7a0e8d3bc0 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -38,7 +38,6 @@ #include #include #include -#include #include #include "calling.h" @@ -948,6 +947,7 @@ ENTRY(\sym) addq $\ist_offset, CPU_TSS_IST(\shift_ist) .endif + /* these procedures expect "no swapgs" flag in ebx */ .if \paranoid jmp paranoid_exit .else @@ -1164,21 +1164,24 @@ idtentry machine_check do_mce has_error_code=0 paranoid=1 #endif /* - * Save all registers in pt_regs. Return GSBASE related information - * in EBX depending on the availability of the FSGSBASE instructions: - * - * FSGSBASE R/EBX - * N 0 -> SWAPGS on exit - * 1 -> no SWAPGS on exit - * - * Y GSBASE value at entry, must be restored in paranoid_exit + * Save all registers in pt_regs, and switch gs if needed. + * Use slow, but surefire "are we in kernel?" check. + * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ ENTRY(paranoid_entry) UNWIND_HINT_FUNC cld PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 + movl $1, %ebx + movl $MSR_GS_BASE, %ecx + rdmsr + testl %edx, %edx + js 1f /* negative -> in kernel */ + SWAPGS + xorl %ebx, %ebx +1: /* * Always stash CR3 in %r14. This value will be restored, * verbatim, at exit. Needed if paranoid_entry interrupted @@ -1188,49 +1191,9 @@ ENTRY(paranoid_entry) * This is also why CS (stashed in the "iret frame" by the * hardware at entry) can not be used: this may be a return * to kernel code, but with a user CR3 value. - * - * Switching CR3 does not depend on kernel GSBASE so it can - * be done before switching to the kernel GSBASE. This is - * required for FSGSBASE because the kernel GSBASE has to - * be retrieved from a kernel internal table. */ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 - /* - * Handling GSBASE depends on the availability of FSGSBASE. - * - * Without FSGSBASE the kernel enforces that negative GSBASE - * values indicate kernel GSBASE. With FSGSBASE no assumptions - * can be made about the GSBASE value when entering from user - * space. - */ - ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE - - /* - * Read the current GSBASE and store it in in %rbx unconditionally, - * retrieve and set the current CPUs kernel GSBASE. The stored value - * has to be restored in paranoid_exit unconditionally. - */ - SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx - ret - -.Lparanoid_entry_checkgs: - /* EBX = 1 -> kernel GSBASE active, no restore required */ - movl $1, %ebx - /* - * The kernel-enforced convention is a negative GSBASE indicates - * a kernel value. No SWAPGS needed on entry and exit. - */ - movl $MSR_GS_BASE, %ecx - rdmsr - testl %edx, %edx - jns .Lparanoid_entry_swapgs - ret - -.Lparanoid_entry_swapgs: - SWAPGS - /* EBX = 0 -> SWAPGS required on exit */ - xorl %ebx, %ebx ret END(paranoid_entry) @@ -1241,48 +1204,28 @@ END(paranoid_entry) * * We may be returning to very strange contexts (e.g. very early * in syscall entry), so checking for preemption here would - * be complicated. Fortunately, there's no good reason to try - * to handle preemption here. - * - * R/EBX contains the GSBASE related information depending on the - * availability of the FSGSBASE instructions: + * be complicated. Fortunately, we there's no good reason + * to try to handle preemption here. * - * FSGSBASE R/EBX - * N 0 -> SWAPGS on exit - * 1 -> no SWAPGS on exit - * - * Y User space GSBASE, must be restored unconditionally + * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) - - /* - * The order of operations is important. IRQ tracing requires - * kernel GSBASE and CR3. RESTORE_CR3 requires kernel GS base. - * - * NB to anyone to tries to optimize this code: this code does - * not execute at all for exceptions coming from user mode. Those - * exceptions go through error_exit instead. - */ - TRACE_IRQS_IRETQ_DEBUG - RESTORE_CR3 scratch_reg=%rax save_reg=%r14 - - /* Handle the three GSBASE cases. */ - ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE - - /* With FSGSBASE enabled, unconditionally restore GSBASE */ - wrgsbase %rbx - jmp restore_regs_and_return_to_kernel - -.Lparanoid_exit_checkgs: - /* On non-FSGSBASE systems, conditionally do SWAPGS */ - testl %ebx, %ebx - jnz restore_regs_and_return_to_kernel - - /* We are returning to a context with user GSBASE. */ + TRACE_IRQS_OFF_DEBUG + testl %ebx, %ebx /* swapgs needed? */ + jnz .Lparanoid_exit_no_swapgs + TRACE_IRQS_IRETQ + /* Always restore stashed CR3 value (see paranoid_entry) */ + RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 SWAPGS_UNSAFE_STACK - jmp restore_regs_and_return_to_kernel + jmp .Lparanoid_exit_restore +.Lparanoid_exit_no_swapgs: + TRACE_IRQS_IRETQ_DEBUG + /* Always restore stashed CR3 value (see paranoid_entry) */ + RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 +.Lparanoid_exit_restore: + jmp restore_regs_and_return_to_kernel END(paranoid_exit) /* @@ -1693,27 +1636,10 @@ end_repeat_nmi: /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 - /* - * The above invocation of paranoid_entry stored the GSBASE - * related information in R/EBX depending on the availability - * of FSGSBASE. - * - * If FSGSBASE is enabled, restore the saved GSBASE value - * unconditionally, otherwise take the conditional SWAPGS path. - */ - ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE - - wrgsbase %rbx - jmp nmi_restore - -nmi_no_fsgsbase: - /* EBX == 0 -> invoke SWAPGS */ - testl %ebx, %ebx + testl %ebx, %ebx /* swapgs needed? */ jnz nmi_restore - nmi_swapgs: SWAPGS_UNSAFE_STACK - nmi_restore: POP_REGS diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h index aefd53767a5d..bca4c743de77 100644 --- a/arch/x86/include/asm/fsgsbase.h +++ b/arch/x86/include/asm/fsgsbase.h @@ -19,63 +19,36 @@ extern unsigned long x86_gsbase_read_task(struct task_struct *task); extern void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase); extern void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase); -/* Must be protected by X86_FEATURE_FSGSBASE check. */ +/* Helper functions for reading/writing FS/GS base */ -static __always_inline unsigned long rdfsbase(void) +static inline unsigned long x86_fsbase_read_cpu(void) { unsigned long fsbase; - asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory"); + rdmsrl(MSR_FS_BASE, fsbase); return fsbase; } -static __always_inline unsigned long rdgsbase(void) +static inline unsigned long x86_gsbase_read_cpu_inactive(void) { unsigned long gsbase; - asm volatile("rdgsbase %0" : "=r" (gsbase) :: "memory"); + rdmsrl(MSR_KERNEL_GS_BASE, gsbase); return gsbase; } -static __always_inline void wrfsbase(unsigned long fsbase) -{ - asm volatile("wrfsbase %0" :: "r" (fsbase) : "memory"); -} - -static __always_inline void wrgsbase(unsigned long gsbase) -{ - asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory"); -} - -#include - -/* Helper functions for reading/writing FS/GS base */ - -static inline unsigned long x86_fsbase_read_cpu(void) +static inline void x86_fsbase_write_cpu(unsigned long fsbase) { - unsigned long fsbase; - - if (static_cpu_has(X86_FEATURE_FSGSBASE)) - fsbase = rdfsbase(); - else - rdmsrl(MSR_FS_BASE, fsbase); - - return fsbase; + wrmsrl(MSR_FS_BASE, fsbase); } -static inline void x86_fsbase_write_cpu(unsigned long fsbase) +static inline void x86_gsbase_write_cpu_inactive(unsigned long gsbase) { - if (static_cpu_has(X86_FEATURE_FSGSBASE)) - wrfsbase(fsbase); - else - wrmsrl(MSR_FS_BASE, fsbase); + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); } -extern unsigned long x86_gsbase_read_cpu_inactive(void); -extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase); - #endif /* CONFIG_X86_64 */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h index d063841a17e3..f5a796da07f8 100644 --- a/arch/x86/include/asm/inst.h +++ b/arch/x86/include/asm/inst.h @@ -306,21 +306,6 @@ .endif MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2 .endm - -.macro RDPID opd - REG_TYPE rdpid_opd_type \opd - .if rdpid_opd_type == REG_TYPE_R64 - R64_NUM rdpid_opd \opd - .else - R32_NUM rdpid_opd \opd - .endif - .byte 0xf3 - .if rdpid_opd > 7 - PFX_REX rdpid_opd 0 - .endif - .byte 0x0f, 0xc7 - MODRM 0xc0 rdpid_opd 0x7 -.endm #endif #endif diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h index c5ce54e749f6..6ebaae90e207 100644 --- a/arch/x86/include/uapi/asm/hwcap2.h +++ b/arch/x86/include/uapi/asm/hwcap2.h @@ -5,7 +5,4 @@ /* MONITOR/MWAIT enabled in Ring 3 */ #define HWCAP2_RING3MWAIT (1 << 0) -/* Kernel allows FSGSBASE instructions available in Ring 3 */ -#define HWCAP2_FSGSBASE BIT(1) - #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 637c9117d5ae..dad20bc891d5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -366,22 +366,6 @@ out: cr4_clear_bits(X86_CR4_UMIP); } -static __init int x86_nofsgsbase_setup(char *arg) -{ - /* Require an exact match without trailing characters. */ - if (strlen(arg)) - return 0; - - /* Do not emit a message if the feature is not present. */ - if (!boot_cpu_has(X86_FEATURE_FSGSBASE)) - return 1; - - setup_clear_cpu_cap(X86_FEATURE_FSGSBASE); - pr_info("FSGSBASE disabled via kernel command line\n"); - return 1; -} -__setup("nofsgsbase", x86_nofsgsbase_setup); - /* * Protection Keys are not available in 32-bit mode. */ @@ -1386,12 +1370,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_smap(c); setup_umip(c); - /* Enable FSGSBASE instructions if available. */ - if (cpu_has(c, X86_FEATURE_FSGSBASE)) { - cr4_set_bits(X86_CR4_FSGSBASE); - elf_hwcap2 |= HWCAP2_FSGSBASE; - } - /* * The vendor-specific functions might have changed features. * Now we do "generic changes." diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 8f239091c15d..250e4c4ac6d9 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -161,40 +161,6 @@ enum which_selector { GS }; -/* - * Out of line to be protected from kprobes. It is not used on Xen - * paravirt. When paravirt support is needed, it needs to be renamed - * with native_ prefix. - */ -static noinline unsigned long __rdgsbase_inactive(void) -{ - unsigned long gsbase; - - lockdep_assert_irqs_disabled(); - - native_swapgs(); - gsbase = rdgsbase(); - native_swapgs(); - - return gsbase; -} -NOKPROBE_SYMBOL(__rdgsbase_inactive); - -/* - * Out of line to be protected from kprobes. It is not used on Xen - * paravirt. When paravirt support is needed, it needs to be renamed - * with native_ prefix. - */ -static noinline void __wrgsbase_inactive(unsigned long gsbase) -{ - lockdep_assert_irqs_disabled(); - - native_swapgs(); - wrgsbase(gsbase); - native_swapgs(); -} -NOKPROBE_SYMBOL(__wrgsbase_inactive); - /* * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are * not available. The goal is to be reasonably fast on non-FSGSBASE systems. @@ -244,22 +210,8 @@ static __always_inline void save_fsgs(struct task_struct *task) { savesegment(fs, task->thread.fsindex); savesegment(gs, task->thread.gsindex); - if (static_cpu_has(X86_FEATURE_FSGSBASE)) { - unsigned long flags; - - /* - * If FSGSBASE is enabled, we can't make any useful guesses - * about the base, and user code expects us to save the current - * value. Fortunately, reading the base directly is efficient. - */ - task->thread.fsbase = rdfsbase(); - local_irq_save(flags); - task->thread.gsbase = __rdgsbase_inactive(); - local_irq_restore(flags); - } else { - save_base_legacy(task, task->thread.fsindex, FS); - save_base_legacy(task, task->thread.gsindex, GS); - } + save_base_legacy(task, task->thread.fsindex, FS); + save_base_legacy(task, task->thread.gsindex, GS); } #if IS_ENABLED(CONFIG_KVM) @@ -338,22 +290,10 @@ static __always_inline void load_seg_legacy(unsigned short prev_index, static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, struct thread_struct *next) { - if (static_cpu_has(X86_FEATURE_FSGSBASE)) { - /* Update the FS and GS selectors if they could have changed. */ - if (unlikely(prev->fsindex || next->fsindex)) - loadseg(FS, next->fsindex); - if (unlikely(prev->gsindex || next->gsindex)) - loadseg(GS, next->gsindex); - - /* Update the bases. */ - wrfsbase(next->fsbase); - __wrgsbase_inactive(next->gsbase); - } else { - load_seg_legacy(prev->fsindex, prev->fsbase, - next->fsindex, next->fsbase, FS); - load_seg_legacy(prev->gsindex, prev->gsbase, - next->gsindex, next->gsbase, GS); - } + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); } static unsigned long x86_fsgsbase_read_task(struct task_struct *task, @@ -399,46 +339,13 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task, return base; } -unsigned long x86_gsbase_read_cpu_inactive(void) -{ - unsigned long gsbase; - - if (static_cpu_has(X86_FEATURE_FSGSBASE)) { - unsigned long flags; - - /* Interrupts are disabled here. */ - local_irq_save(flags); - gsbase = __rdgsbase_inactive(); - local_irq_restore(flags); - } else { - rdmsrl(MSR_KERNEL_GS_BASE, gsbase); - } - - return gsbase; -} - -void x86_gsbase_write_cpu_inactive(unsigned long gsbase) -{ - if (static_cpu_has(X86_FEATURE_FSGSBASE)) { - unsigned long flags; - - /* Interrupts are disabled here. */ - local_irq_save(flags); - __wrgsbase_inactive(gsbase); - local_irq_restore(flags); - } else { - wrmsrl(MSR_KERNEL_GS_BASE, gsbase); - } -} - unsigned long x86_fsbase_read_task(struct task_struct *task) { unsigned long fsbase; if (task == current) fsbase = x86_fsbase_read_cpu(); - else if (static_cpu_has(X86_FEATURE_FSGSBASE) || - (task->thread.fsindex == 0)) + else if (task->thread.fsindex == 0) fsbase = task->thread.fsbase; else fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex); @@ -452,8 +359,7 @@ unsigned long x86_gsbase_read_task(struct task_struct *task) if (task == current) gsbase = x86_gsbase_read_cpu_inactive(); - else if (static_cpu_has(X86_FEATURE_FSGSBASE) || - (task->thread.gsindex == 0)) + else if (task->thread.gsindex == 0) gsbase = task->thread.gsbase; else gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex); @@ -493,11 +399,10 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, p->thread.sp = (unsigned long) fork_frame; p->thread.io_bitmap_ptr = NULL; - save_fsgs(me); - p->thread.fsindex = me->thread.fsindex; - p->thread.fsbase = me->thread.fsbase; - p->thread.gsindex = me->thread.gsindex; - p->thread.gsbase = me->thread.gsbase; + savesegment(gs, p->thread.gsindex); + p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase; + savesegment(fs, p->thread.fsindex); + p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase; savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); -- cgit v1.2.3