From 36ad35131adacc29b328b9c8b6277a8bf0d6fd5d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 27 Feb 2019 10:10:23 +0100
Subject: x86/speculation: Consolidate CPU whitelists

The CPU vulnerability whitelists have some overlap and there are more
whitelists coming along.

Use the driver_data field in the x86_cpu_id struct to denote the
whitelisted vulnerabilities and combine all whitelists into one.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jon Masters <jcm@redhat.com>
Tested-by: Jon Masters <jcm@redhat.com>
---
 arch/x86/kernel/cpu/common.c | 110 +++++++++++++++++++++++--------------------
 1 file changed, 60 insertions(+), 50 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cb28e98a0659..26ec15034f86 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -948,61 +948,72 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
 #endif
 }
 
-static const __initconst struct x86_cpu_id cpu_no_speculation[] = {
-	{ X86_VENDOR_INTEL,	6, INTEL_FAM6_ATOM_SALTWELL,	X86_FEATURE_ANY },
-	{ X86_VENDOR_INTEL,	6, INTEL_FAM6_ATOM_SALTWELL_TABLET,	X86_FEATURE_ANY },
-	{ X86_VENDOR_INTEL,	6, INTEL_FAM6_ATOM_BONNELL_MID,	X86_FEATURE_ANY },
-	{ X86_VENDOR_INTEL,	6, INTEL_FAM6_ATOM_SALTWELL_MID,	X86_FEATURE_ANY },
-	{ X86_VENDOR_INTEL,	6, INTEL_FAM6_ATOM_BONNELL,	X86_FEATURE_ANY },
-	{ X86_VENDOR_CENTAUR,	5 },
-	{ X86_VENDOR_INTEL,	5 },
-	{ X86_VENDOR_NSC,	5 },
-	{ X86_VENDOR_ANY,	4 },
+#define NO_SPECULATION	BIT(0)
+#define NO_MELTDOWN	BIT(1)
+#define NO_SSB		BIT(2)
+#define NO_L1TF		BIT(3)
+
+#define VULNWL(_vendor, _family, _model, _whitelist)	\
+	{ X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist }
+
+#define VULNWL_INTEL(model, whitelist)		\
+	VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist)
+
+#define VULNWL_AMD(family, whitelist)		\
+	VULNWL(AMD, family, X86_MODEL_ANY, whitelist)
+
+#define VULNWL_HYGON(family, whitelist)		\
+	VULNWL(HYGON, family, X86_MODEL_ANY, whitelist)
+
+static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
+	VULNWL(ANY,	4, X86_MODEL_ANY,	NO_SPECULATION),
+	VULNWL(CENTAUR,	5, X86_MODEL_ANY,	NO_SPECULATION),
+	VULNWL(INTEL,	5, X86_MODEL_ANY,	NO_SPECULATION),
+	VULNWL(NSC,	5, X86_MODEL_ANY,	NO_SPECULATION),
+
+	VULNWL_INTEL(ATOM_SALTWELL,		NO_SPECULATION),
+	VULNWL_INTEL(ATOM_SALTWELL_TABLET,	NO_SPECULATION),
+	VULNWL_INTEL(ATOM_SALTWELL_MID,		NO_SPECULATION),
+	VULNWL_INTEL(ATOM_BONNELL,		NO_SPECULATION),
+	VULNWL_INTEL(ATOM_BONNELL_MID,		NO_SPECULATION),
+
+	VULNWL_INTEL(ATOM_SILVERMONT,		NO_SSB | NO_L1TF),
+	VULNWL_INTEL(ATOM_SILVERMONT_X,		NO_SSB | NO_L1TF),
+	VULNWL_INTEL(ATOM_SILVERMONT_MID,	NO_SSB | NO_L1TF),
+	VULNWL_INTEL(ATOM_AIRMONT,		NO_SSB | NO_L1TF),
+	VULNWL_INTEL(XEON_PHI_KNL,		NO_SSB | NO_L1TF),
+	VULNWL_INTEL(XEON_PHI_KNM,		NO_SSB | NO_L1TF),
+
+	VULNWL_INTEL(CORE_YONAH,		NO_SSB),
+
+	VULNWL_INTEL(ATOM_AIRMONT_MID,		NO_L1TF),
+	VULNWL_INTEL(ATOM_GOLDMONT,		NO_L1TF),
+	VULNWL_INTEL(ATOM_GOLDMONT_X,		NO_L1TF),
+	VULNWL_INTEL(ATOM_GOLDMONT_PLUS,	NO_L1TF),
+
+	VULNWL_AMD(0x0f,		NO_MELTDOWN | NO_SSB | NO_L1TF),
+	VULNWL_AMD(0x10,		NO_MELTDOWN | NO_SSB | NO_L1TF),
+	VULNWL_AMD(0x11,		NO_MELTDOWN | NO_SSB | NO_L1TF),
+	VULNWL_AMD(0x12,		NO_MELTDOWN | NO_SSB | NO_L1TF),
+
+	/* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
+	VULNWL_AMD(X86_FAMILY_ANY,	NO_MELTDOWN | NO_L1TF),
+	VULNWL_HYGON(X86_FAMILY_ANY,	NO_MELTDOWN | NO_L1TF),
 	{}
 };
 
-static const __initconst struct x86_cpu_id cpu_no_meltdown[] = {
-	{ X86_VENDOR_AMD },
-	{ X86_VENDOR_HYGON },
-	{}
-};
-
-/* Only list CPUs which speculate but are non susceptible to SSB */
-static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = {
-	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT	},
-	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_AIRMONT		},
-	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT_X	},
-	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT_MID	},
-	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_CORE_YONAH		},
-	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_XEON_PHI_KNL		},
-	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_XEON_PHI_KNM		},
-	{ X86_VENDOR_AMD,	0x12,					},
-	{ X86_VENDOR_AMD,	0x11,					},
-	{ X86_VENDOR_AMD,	0x10,					},
-	{ X86_VENDOR_AMD,	0xf,					},
-	{}
-};
+static bool __init cpu_matches(unsigned long which)
+{
+	const struct x86_cpu_id *m = x86_match_cpu(cpu_vuln_whitelist);
 
-static const __initconst struct x86_cpu_id cpu_no_l1tf[] = {
-	/* in addition to cpu_no_speculation */
-	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT	},
-	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT_X	},
-	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_AIRMONT		},
-	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT_MID	},
-	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_AIRMONT_MID	},
-	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_GOLDMONT	},
-	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_GOLDMONT_X	},
-	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_GOLDMONT_PLUS	},
-	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_XEON_PHI_KNL		},
-	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_XEON_PHI_KNM		},
-	{}
-};
+	return m && !!(m->driver_data & which);
+}
 
 static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 {
 	u64 ia32_cap = 0;
 
-	if (x86_match_cpu(cpu_no_speculation))
+	if (cpu_matches(NO_SPECULATION))
 		return;
 
 	setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
@@ -1011,15 +1022,14 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 	if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
 		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
 
-	if (!x86_match_cpu(cpu_no_spec_store_bypass) &&
-	   !(ia32_cap & ARCH_CAP_SSB_NO) &&
+	if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) &&
 	   !cpu_has(c, X86_FEATURE_AMD_SSB_NO))
 		setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
 
 	if (ia32_cap & ARCH_CAP_IBRS_ALL)
 		setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
 
-	if (x86_match_cpu(cpu_no_meltdown))
+	if (cpu_matches(NO_MELTDOWN))
 		return;
 
 	/* Rogue Data Cache Load? No! */
@@ -1028,7 +1038,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 
 	setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
 
-	if (x86_match_cpu(cpu_no_l1tf))
+	if (cpu_matches(NO_L1TF))
 		return;
 
 	setup_force_cpu_bug(X86_BUG_L1TF);
-- 
cgit v1.2.3


From ed5194c2732c8084af9fd159c146ea92bf137128 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 18 Jan 2019 16:50:16 -0800
Subject: x86/speculation/mds: Add basic bug infrastructure for MDS

Microarchitectural Data Sampling (MDS), is a class of side channel attacks
on internal buffers in Intel CPUs. The variants are:

 - Microarchitectural Store Buffer Data Sampling (MSBDS) (CVE-2018-12126)
 - Microarchitectural Fill Buffer Data Sampling (MFBDS) (CVE-2018-12130)
 - Microarchitectural Load Port Data Sampling (MLPDS) (CVE-2018-12127)

MSBDS leaks Store Buffer Entries which can be speculatively forwarded to a
dependent load (store-to-load forwarding) as an optimization. The forward
can also happen to a faulting or assisting load operation for a different
memory address, which can be exploited under certain conditions. Store
buffers are partitioned between Hyper-Threads so cross thread forwarding is
not possible. But if a thread enters or exits a sleep state the store
buffer is repartitioned which can expose data from one thread to the other.

MFBDS leaks Fill Buffer Entries. Fill buffers are used internally to manage
L1 miss situations and to hold data which is returned or sent in response
to a memory or I/O operation. Fill buffers can forward data to a load
operation and also write data to the cache. When the fill buffer is
deallocated it can retain the stale data of the preceding operations which
can then be forwarded to a faulting or assisting load operation, which can
be exploited under certain conditions. Fill buffers are shared between
Hyper-Threads so cross thread leakage is possible.

MLDPS leaks Load Port Data. Load ports are used to perform load operations
from memory or I/O. The received data is then forwarded to the register
file or a subsequent operation. In some implementations the Load Port can
contain stale data from a previous operation which can be forwarded to
faulting or assisting loads under certain conditions, which again can be
exploited eventually. Load ports are shared between Hyper-Threads so cross
thread leakage is possible.

All variants have the same mitigation for single CPU thread case (SMT off),
so the kernel can treat them as one MDS issue.

Add the basic infrastructure to detect if the current CPU is affected by
MDS.

[ tglx: Rewrote changelog ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Jon Masters <jcm@redhat.com>
Tested-by: Jon Masters <jcm@redhat.com>
---
 arch/x86/include/asm/cpufeatures.h |  2 ++
 arch/x86/include/asm/msr-index.h   |  5 +++++
 arch/x86/kernel/cpu/common.c       | 25 ++++++++++++++++---------
 3 files changed, 23 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 6d6122524711..ae3f987b24f1 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -344,6 +344,7 @@
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */
 #define X86_FEATURE_AVX512_4VNNIW	(18*32+ 2) /* AVX-512 Neural Network Instructions */
 #define X86_FEATURE_AVX512_4FMAPS	(18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
+#define X86_FEATURE_MD_CLEAR		(18*32+10) /* VERW clears CPU buffers */
 #define X86_FEATURE_PCONFIG		(18*32+18) /* Intel PCONFIG */
 #define X86_FEATURE_SPEC_CTRL		(18*32+26) /* "" Speculation Control (IBRS + IBPB) */
 #define X86_FEATURE_INTEL_STIBP		(18*32+27) /* "" Single Thread Indirect Branch Predictors */
@@ -381,5 +382,6 @@
 #define X86_BUG_SPECTRE_V2		X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
 #define X86_BUG_SPEC_STORE_BYPASS	X86_BUG(17) /* CPU is affected by speculative store bypass attack */
 #define X86_BUG_L1TF			X86_BUG(18) /* CPU is affected by L1 Terminal Fault */
+#define X86_BUG_MDS			X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */
 
 #endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index e4074556c37b..e2d30636c03f 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -79,6 +79,11 @@
 						 * attack, so no Speculative Store Bypass
 						 * control required.
 						 */
+#define ARCH_CAP_MDS_NO			BIT(5)   /*
+						  * Not susceptible to
+						  * Microarchitectural Data
+						  * Sampling (MDS) vulnerabilities.
+						  */
 
 #define MSR_IA32_FLUSH_CMD		0x0000010b
 #define L1D_FLUSH			BIT(0)	/*
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 26ec15034f86..e34817bca504 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -952,6 +952,7 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
 #define NO_MELTDOWN	BIT(1)
 #define NO_SSB		BIT(2)
 #define NO_L1TF		BIT(3)
+#define NO_MDS		BIT(4)
 
 #define VULNWL(_vendor, _family, _model, _whitelist)	\
 	{ X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist }
@@ -971,6 +972,7 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
 	VULNWL(INTEL,	5, X86_MODEL_ANY,	NO_SPECULATION),
 	VULNWL(NSC,	5, X86_MODEL_ANY,	NO_SPECULATION),
 
+	/* Intel Family 6 */
 	VULNWL_INTEL(ATOM_SALTWELL,		NO_SPECULATION),
 	VULNWL_INTEL(ATOM_SALTWELL_TABLET,	NO_SPECULATION),
 	VULNWL_INTEL(ATOM_SALTWELL_MID,		NO_SPECULATION),
@@ -987,18 +989,20 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
 	VULNWL_INTEL(CORE_YONAH,		NO_SSB),
 
 	VULNWL_INTEL(ATOM_AIRMONT_MID,		NO_L1TF),
-	VULNWL_INTEL(ATOM_GOLDMONT,		NO_L1TF),
-	VULNWL_INTEL(ATOM_GOLDMONT_X,		NO_L1TF),
-	VULNWL_INTEL(ATOM_GOLDMONT_PLUS,	NO_L1TF),
 
-	VULNWL_AMD(0x0f,		NO_MELTDOWN | NO_SSB | NO_L1TF),
-	VULNWL_AMD(0x10,		NO_MELTDOWN | NO_SSB | NO_L1TF),
-	VULNWL_AMD(0x11,		NO_MELTDOWN | NO_SSB | NO_L1TF),
-	VULNWL_AMD(0x12,		NO_MELTDOWN | NO_SSB | NO_L1TF),
+	VULNWL_INTEL(ATOM_GOLDMONT,		NO_MDS | NO_L1TF),
+	VULNWL_INTEL(ATOM_GOLDMONT_X,		NO_MDS | NO_L1TF),
+	VULNWL_INTEL(ATOM_GOLDMONT_PLUS,	NO_MDS | NO_L1TF),
+
+	/* AMD Family 0xf - 0x12 */
+	VULNWL_AMD(0x0f,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS),
+	VULNWL_AMD(0x10,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS),
+	VULNWL_AMD(0x11,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS),
+	VULNWL_AMD(0x12,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS),
 
 	/* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
-	VULNWL_AMD(X86_FAMILY_ANY,	NO_MELTDOWN | NO_L1TF),
-	VULNWL_HYGON(X86_FAMILY_ANY,	NO_MELTDOWN | NO_L1TF),
+	VULNWL_AMD(X86_FAMILY_ANY,	NO_MELTDOWN | NO_L1TF | NO_MDS),
+	VULNWL_HYGON(X86_FAMILY_ANY,	NO_MELTDOWN | NO_L1TF | NO_MDS),
 	{}
 };
 
@@ -1029,6 +1033,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 	if (ia32_cap & ARCH_CAP_IBRS_ALL)
 		setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
 
+	if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO))
+		setup_force_cpu_bug(X86_BUG_MDS);
+
 	if (cpu_matches(NO_MELTDOWN))
 		return;
 
-- 
cgit v1.2.3


From e261f209c3666e842fd645a1e31f001c3a26def9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 1 Mar 2019 20:21:08 +0100
Subject: x86/speculation/mds: Add BUG_MSBDS_ONLY

This bug bit is set on CPUs which are only affected by Microarchitectural
Store Buffer Data Sampling (MSBDS) and not by any other MDS variant.

This is important because the Store Buffers are partitioned between
Hyper-Threads so cross thread forwarding is not possible. But if a thread
enters or exits a sleep state the store buffer is repartitioned which can
expose data from one thread to the other. This transition can be mitigated.

That means that for CPUs which are only affected by MSBDS SMT can be
enabled, if the CPU is not affected by other SMT sensitive vulnerabilities,
e.g. L1TF. The XEON PHI variants fall into that category. Also the
Silvermont/Airmont ATOMs, but for them it's not really relevant as they do
not support SMT, but mark them for completeness sake.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Jon Masters <jcm@redhat.com>
Tested-by: Jon Masters <jcm@redhat.com>
---
 arch/x86/include/asm/cpufeatures.h |  1 +
 arch/x86/kernel/cpu/common.c       | 20 ++++++++++++--------
 2 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index ae3f987b24f1..bdcea163850a 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -383,5 +383,6 @@
 #define X86_BUG_SPEC_STORE_BYPASS	X86_BUG(17) /* CPU is affected by speculative store bypass attack */
 #define X86_BUG_L1TF			X86_BUG(18) /* CPU is affected by L1 Terminal Fault */
 #define X86_BUG_MDS			X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */
+#define X86_BUG_MSBDS_ONLY		X86_BUG(20) /* CPU is only affected by the  MSDBS variant of BUG_MDS */
 
 #endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e34817bca504..132a63dc5a76 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -953,6 +953,7 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
 #define NO_SSB		BIT(2)
 #define NO_L1TF		BIT(3)
 #define NO_MDS		BIT(4)
+#define MSBDS_ONLY	BIT(5)
 
 #define VULNWL(_vendor, _family, _model, _whitelist)	\
 	{ X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist }
@@ -979,16 +980,16 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
 	VULNWL_INTEL(ATOM_BONNELL,		NO_SPECULATION),
 	VULNWL_INTEL(ATOM_BONNELL_MID,		NO_SPECULATION),
 
-	VULNWL_INTEL(ATOM_SILVERMONT,		NO_SSB | NO_L1TF),
-	VULNWL_INTEL(ATOM_SILVERMONT_X,		NO_SSB | NO_L1TF),
-	VULNWL_INTEL(ATOM_SILVERMONT_MID,	NO_SSB | NO_L1TF),
-	VULNWL_INTEL(ATOM_AIRMONT,		NO_SSB | NO_L1TF),
-	VULNWL_INTEL(XEON_PHI_KNL,		NO_SSB | NO_L1TF),
-	VULNWL_INTEL(XEON_PHI_KNM,		NO_SSB | NO_L1TF),
+	VULNWL_INTEL(ATOM_SILVERMONT,		NO_SSB | NO_L1TF | MSBDS_ONLY),
+	VULNWL_INTEL(ATOM_SILVERMONT_X,		NO_SSB | NO_L1TF | MSBDS_ONLY),
+	VULNWL_INTEL(ATOM_SILVERMONT_MID,	NO_SSB | NO_L1TF | MSBDS_ONLY),
+	VULNWL_INTEL(ATOM_AIRMONT,		NO_SSB | NO_L1TF | MSBDS_ONLY),
+	VULNWL_INTEL(XEON_PHI_KNL,		NO_SSB | NO_L1TF | MSBDS_ONLY),
+	VULNWL_INTEL(XEON_PHI_KNM,		NO_SSB | NO_L1TF | MSBDS_ONLY),
 
 	VULNWL_INTEL(CORE_YONAH,		NO_SSB),
 
-	VULNWL_INTEL(ATOM_AIRMONT_MID,		NO_L1TF),
+	VULNWL_INTEL(ATOM_AIRMONT_MID,		NO_L1TF | MSBDS_ONLY),
 
 	VULNWL_INTEL(ATOM_GOLDMONT,		NO_MDS | NO_L1TF),
 	VULNWL_INTEL(ATOM_GOLDMONT_X,		NO_MDS | NO_L1TF),
@@ -1033,8 +1034,11 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 	if (ia32_cap & ARCH_CAP_IBRS_ALL)
 		setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
 
-	if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO))
+	if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) {
 		setup_force_cpu_bug(X86_BUG_MDS);
+		if (cpu_matches(MSBDS_ONLY))
+			setup_force_cpu_bug(X86_BUG_MSBDS_ONLY);
+	}
 
 	if (cpu_matches(NO_MELTDOWN))
 		return;
-- 
cgit v1.2.3


From 04dcbdb8057827b043b3c71aa397c4c63e67d086 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 18 Feb 2019 23:42:51 +0100
Subject: x86/speculation/mds: Clear CPU buffers on exit to user

Add a static key which controls the invocation of the CPU buffer clear
mechanism on exit to user space and add the call into
prepare_exit_to_usermode() and do_nmi() right before actually returning.

Add documentation which kernel to user space transition this covers and
explain why some corner cases are not mitigated.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Jon Masters <jcm@redhat.com>
Tested-by: Jon Masters <jcm@redhat.com>
---
 Documentation/x86/mds.rst            | 52 ++++++++++++++++++++++++++++++++++++
 arch/x86/entry/common.c              |  3 +++
 arch/x86/include/asm/nospec-branch.h | 13 +++++++++
 arch/x86/kernel/cpu/bugs.c           |  3 +++
 arch/x86/kernel/nmi.c                |  4 +++
 arch/x86/kernel/traps.c              |  8 ++++++
 6 files changed, 83 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/Documentation/x86/mds.rst b/Documentation/x86/mds.rst
index 1096738d50f2..54d935bf283b 100644
--- a/Documentation/x86/mds.rst
+++ b/Documentation/x86/mds.rst
@@ -97,3 +97,55 @@ According to current knowledge additional mitigations inside the kernel
 itself are not required because the necessary gadgets to expose the leaked
 data cannot be controlled in a way which allows exploitation from malicious
 user space or VM guests.
+
+Mitigation points
+-----------------
+
+1. Return to user space
+^^^^^^^^^^^^^^^^^^^^^^^
+
+   When transitioning from kernel to user space the CPU buffers are flushed
+   on affected CPUs when the mitigation is not disabled on the kernel
+   command line. The migitation is enabled through the static key
+   mds_user_clear.
+
+   The mitigation is invoked in prepare_exit_to_usermode() which covers
+   most of the kernel to user space transitions. There are a few exceptions
+   which are not invoking prepare_exit_to_usermode() on return to user
+   space. These exceptions use the paranoid exit code.
+
+   - Non Maskable Interrupt (NMI):
+
+     Access to sensible data like keys, credentials in the NMI context is
+     mostly theoretical: The CPU can do prefetching or execute a
+     misspeculated code path and thereby fetching data which might end up
+     leaking through a buffer.
+
+     But for mounting other attacks the kernel stack address of the task is
+     already valuable information. So in full mitigation mode, the NMI is
+     mitigated on the return from do_nmi() to provide almost complete
+     coverage.
+
+   - Double fault (#DF):
+
+     A double fault is usually fatal, but the ESPFIX workaround, which can
+     be triggered from user space through modify_ldt(2) is a recoverable
+     double fault. #DF uses the paranoid exit path, so explicit mitigation
+     in the double fault handler is required.
+
+   - Machine Check Exception (#MC):
+
+     Another corner case is a #MC which hits between the CPU buffer clear
+     invocation and the actual return to user. As this still is in kernel
+     space it takes the paranoid exit path which does not clear the CPU
+     buffers. So the #MC handler repopulates the buffers to some
+     extent. Machine checks are not reliably controllable and the window is
+     extremly small so mitigation would just tick a checkbox that this
+     theoretical corner case is covered. To keep the amount of special
+     cases small, ignore #MC.
+
+   - Debug Exception (#DB):
+
+     This takes the paranoid exit path only when the INT1 breakpoint is in
+     kernel space. #DB on a user space address takes the regular exit path,
+     so no extra mitigation required.
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 7bc105f47d21..19f650d729f5 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -31,6 +31,7 @@
 #include <asm/vdso.h>
 #include <linux/uaccess.h>
 #include <asm/cpufeature.h>
+#include <asm/nospec-branch.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/syscalls.h>
@@ -212,6 +213,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
 #endif
 
 	user_enter_irqoff();
+
+	mds_user_clear_cpu_buffers();
 }
 
 #define SYSCALL_EXIT_WORK_FLAGS				\
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 67cb9b2082b1..65b747286d96 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -318,6 +318,8 @@ DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp);
 DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
 DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
 
+DECLARE_STATIC_KEY_FALSE(mds_user_clear);
+
 #include <asm/segment.h>
 
 /**
@@ -343,6 +345,17 @@ static inline void mds_clear_cpu_buffers(void)
 	asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc");
 }
 
+/**
+ * mds_user_clear_cpu_buffers - Mitigation for MDS vulnerability
+ *
+ * Clear CPU buffers if the corresponding static key is enabled
+ */
+static inline void mds_user_clear_cpu_buffers(void)
+{
+	if (static_branch_likely(&mds_user_clear))
+		mds_clear_cpu_buffers();
+}
+
 #endif /* __ASSEMBLY__ */
 
 /*
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 01874d54f4fd..dbb45014de1b 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -63,6 +63,9 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
 /* Control unconditional IBPB in switch_mm() */
 DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
 
+/* Control MDS CPU buffer clear before returning to user space */
+DEFINE_STATIC_KEY_FALSE(mds_user_clear);
+
 void __init check_bugs(void)
 {
 	identify_boot_cpu();
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 18bc9b51ac9b..086cf1d1d71d 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -34,6 +34,7 @@
 #include <asm/x86_init.h>
 #include <asm/reboot.h>
 #include <asm/cache.h>
+#include <asm/nospec-branch.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/nmi.h>
@@ -533,6 +534,9 @@ nmi_restart:
 		write_cr2(this_cpu_read(nmi_cr2));
 	if (this_cpu_dec_return(nmi_state))
 		goto nmi_restart;
+
+	if (user_mode(regs))
+		mds_user_clear_cpu_buffers();
 }
 NOKPROBE_SYMBOL(do_nmi);
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 9b7c4ca8f0a7..85fe1870f873 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -58,6 +58,7 @@
 #include <asm/alternative.h>
 #include <asm/fpu/xstate.h>
 #include <asm/trace/mpx.h>
+#include <asm/nospec-branch.h>
 #include <asm/mpx.h>
 #include <asm/vm86.h>
 #include <asm/umip.h>
@@ -366,6 +367,13 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 		regs->ip = (unsigned long)general_protection;
 		regs->sp = (unsigned long)&gpregs->orig_ax;
 
+		/*
+		 * This situation can be triggered by userspace via
+		 * modify_ldt(2) and the return does not take the regular
+		 * user space exit, so a CPU buffer clear is required when
+		 * MDS mitigation is enabled.
+		 */
+		mds_user_clear_cpu_buffers();
 		return;
 	}
 #endif
-- 
cgit v1.2.3


From 650b68a0622f933444a6d66936abb3103029413b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 27 Feb 2019 12:48:14 +0100
Subject: x86/kvm/vmx: Add MDS protection when L1D Flush is not active

CPUs which are affected by L1TF and MDS mitigate MDS with the L1D Flush on
VMENTER when updated microcode is installed.

If a CPU is not affected by L1TF or if the L1D Flush is not in use, then
MDS mitigation needs to be invoked explicitly.

For these cases, follow the host mitigation state and invoke the MDS
mitigation before VMENTER.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jon Masters <jcm@redhat.com>
Tested-by: Jon Masters <jcm@redhat.com>
---
 arch/x86/kernel/cpu/bugs.c | 1 +
 arch/x86/kvm/vmx/vmx.c     | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index dbb45014de1b..29ed8e8dfee2 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -65,6 +65,7 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
 
 /* Control MDS CPU buffer clear before returning to user space */
 DEFINE_STATIC_KEY_FALSE(mds_user_clear);
+EXPORT_SYMBOL_GPL(mds_user_clear);
 
 void __init check_bugs(void)
 {
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 30a6bcd735ec..544bd24a9c1e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6369,8 +6369,11 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
 	evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
 		(unsigned long)&current_evmcs->host_rsp : 0;
 
+	/* L1D Flush includes CPU buffer clear to mitigate MDS */
 	if (static_branch_unlikely(&vmx_l1d_should_flush))
 		vmx_l1d_flush(vcpu);
+	else if (static_branch_unlikely(&mds_user_clear))
+		mds_clear_cpu_buffers();
 
 	asm(
 		/* Store host registers */
-- 
cgit v1.2.3


From 07f07f55a29cb705e221eda7894dd67ab81ef343 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 18 Feb 2019 23:04:01 +0100
Subject: x86/speculation/mds: Conditionally clear CPU buffers on idle entry

Add a static key which controls the invocation of the CPU buffer clear
mechanism on idle entry. This is independent of other MDS mitigations
because the idle entry invocation to mitigate the potential leakage due to
store buffer repartitioning is only necessary on SMT systems.

Add the actual invocations to the different halt/mwait variants which
covers all usage sites. mwaitx is not patched as it's not available on
Intel CPUs.

The buffer clear is only invoked before entering the C-State to prevent
that stale data from the idling CPU is spilled to the Hyper-Thread sibling
after the Store buffer got repartitioned and all entries are available to
the non idle sibling.

When coming out of idle the store buffer is partitioned again so each
sibling has half of it available. Now CPU which returned from idle could be
speculatively exposed to contents of the sibling, but the buffers are
flushed either on exit to user space or on VMENTER.

When later on conditional buffer clearing is implemented on top of this,
then there is no action required either because before returning to user
space the context switch will set the condition flag which causes a flush
on the return to user path.

Note, that the buffer clearing on idle is only sensible on CPUs which are
solely affected by MSBDS and not any other variant of MDS because the other
MDS variants cannot be mitigated when SMT is enabled, so the buffer
clearing on idle would be a window dressing exercise.

This intentionally does not handle the case in the acpi/processor_idle
driver which uses the legacy IO port interface for C-State transitions for
two reasons:

 - The acpi/processor_idle driver was replaced by the intel_idle driver
   almost a decade ago. Anything Nehalem upwards supports it and defaults
   to that new driver.

 - The legacy IO port interface is likely to be used on older and therefore
   unaffected CPUs or on systems which do not receive microcode updates
   anymore, so there is no point in adding that.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Jon Masters <jcm@redhat.com>
Tested-by: Jon Masters <jcm@redhat.com>
---
 Documentation/x86/mds.rst            | 42 ++++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/irqflags.h      |  4 ++++
 arch/x86/include/asm/mwait.h         |  7 ++++++
 arch/x86/include/asm/nospec-branch.h | 12 +++++++++++
 arch/x86/kernel/cpu/bugs.c           |  3 +++
 5 files changed, 68 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/Documentation/x86/mds.rst b/Documentation/x86/mds.rst
index 54d935bf283b..87ce8ac9f36e 100644
--- a/Documentation/x86/mds.rst
+++ b/Documentation/x86/mds.rst
@@ -149,3 +149,45 @@ Mitigation points
      This takes the paranoid exit path only when the INT1 breakpoint is in
      kernel space. #DB on a user space address takes the regular exit path,
      so no extra mitigation required.
+
+
+2. C-State transition
+^^^^^^^^^^^^^^^^^^^^^
+
+   When a CPU goes idle and enters a C-State the CPU buffers need to be
+   cleared on affected CPUs when SMT is active. This addresses the
+   repartitioning of the store buffer when one of the Hyper-Threads enters
+   a C-State.
+
+   When SMT is inactive, i.e. either the CPU does not support it or all
+   sibling threads are offline CPU buffer clearing is not required.
+
+   The idle clearing is enabled on CPUs which are only affected by MSBDS
+   and not by any other MDS variant. The other MDS variants cannot be
+   protected against cross Hyper-Thread attacks because the Fill Buffer and
+   the Load Ports are shared. So on CPUs affected by other variants, the
+   idle clearing would be a window dressing exercise and is therefore not
+   activated.
+
+   The invocation is controlled by the static key mds_idle_clear which is
+   switched depending on the chosen mitigation mode and the SMT state of
+   the system.
+
+   The buffer clear is only invoked before entering the C-State to prevent
+   that stale data from the idling CPU from spilling to the Hyper-Thread
+   sibling after the store buffer got repartitioned and all entries are
+   available to the non idle sibling.
+
+   When coming out of idle the store buffer is partitioned again so each
+   sibling has half of it available. The back from idle CPU could be then
+   speculatively exposed to contents of the sibling. The buffers are
+   flushed either on exit to user space or on VMENTER so malicious code
+   in user space or the guest cannot speculatively access them.
+
+   The mitigation is hooked into all variants of halt()/mwait(), but does
+   not cover the legacy ACPI IO-Port mechanism because the ACPI idle driver
+   has been superseded by the intel_idle driver around 2010 and is
+   preferred on all affected CPUs which are expected to gain the MD_CLEAR
+   functionality in microcode. Aside of that the IO-Port mechanism is a
+   legacy interface which is only used on older systems which are either
+   not affected or do not receive microcode updates anymore.
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 058e40fed167..8a0e56e1dcc9 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -6,6 +6,8 @@
 
 #ifndef __ASSEMBLY__
 
+#include <asm/nospec-branch.h>
+
 /* Provide __cpuidle; we can't safely include <linux/cpu.h> */
 #define __cpuidle __attribute__((__section__(".cpuidle.text")))
 
@@ -54,11 +56,13 @@ static inline void native_irq_enable(void)
 
 static inline __cpuidle void native_safe_halt(void)
 {
+	mds_idle_clear_cpu_buffers();
 	asm volatile("sti; hlt": : :"memory");
 }
 
 static inline __cpuidle void native_halt(void)
 {
+	mds_idle_clear_cpu_buffers();
 	asm volatile("hlt": : :"memory");
 }
 
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 39a2fb29378a..eb0f80ce8524 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -6,6 +6,7 @@
 #include <linux/sched/idle.h>
 
 #include <asm/cpufeature.h>
+#include <asm/nospec-branch.h>
 
 #define MWAIT_SUBSTATE_MASK		0xf
 #define MWAIT_CSTATE_MASK		0xf
@@ -40,6 +41,8 @@ static inline void __monitorx(const void *eax, unsigned long ecx,
 
 static inline void __mwait(unsigned long eax, unsigned long ecx)
 {
+	mds_idle_clear_cpu_buffers();
+
 	/* "mwait %eax, %ecx;" */
 	asm volatile(".byte 0x0f, 0x01, 0xc9;"
 		     :: "a" (eax), "c" (ecx));
@@ -74,6 +77,8 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
 static inline void __mwaitx(unsigned long eax, unsigned long ebx,
 			    unsigned long ecx)
 {
+	/* No MDS buffer clear as this is AMD/HYGON only */
+
 	/* "mwaitx %eax, %ebx, %ecx;" */
 	asm volatile(".byte 0x0f, 0x01, 0xfb;"
 		     :: "a" (eax), "b" (ebx), "c" (ecx));
@@ -81,6 +86,8 @@ static inline void __mwaitx(unsigned long eax, unsigned long ebx,
 
 static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
 {
+	mds_idle_clear_cpu_buffers();
+
 	trace_hardirqs_on();
 	/* "mwait %eax, %ecx;" */
 	asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 65b747286d96..4e970390110f 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -319,6 +319,7 @@ DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
 DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
 
 DECLARE_STATIC_KEY_FALSE(mds_user_clear);
+DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
 
 #include <asm/segment.h>
 
@@ -356,6 +357,17 @@ static inline void mds_user_clear_cpu_buffers(void)
 		mds_clear_cpu_buffers();
 }
 
+/**
+ * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability
+ *
+ * Clear CPU buffers if the corresponding static key is enabled
+ */
+static inline void mds_idle_clear_cpu_buffers(void)
+{
+	if (static_branch_likely(&mds_idle_clear))
+		mds_clear_cpu_buffers();
+}
+
 #endif /* __ASSEMBLY__ */
 
 /*
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 29ed8e8dfee2..916995167301 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -66,6 +66,9 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
 /* Control MDS CPU buffer clear before returning to user space */
 DEFINE_STATIC_KEY_FALSE(mds_user_clear);
 EXPORT_SYMBOL_GPL(mds_user_clear);
+/* Control MDS CPU buffer clear before idling (halt, mwait) */
+DEFINE_STATIC_KEY_FALSE(mds_idle_clear);
+EXPORT_SYMBOL_GPL(mds_idle_clear);
 
 void __init check_bugs(void)
 {
-- 
cgit v1.2.3


From bc1241700acd82ec69fde98c5763ce51086269f8 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 18 Feb 2019 22:04:08 +0100
Subject: x86/speculation/mds: Add mitigation control for MDS

Now that the mitigations are in place, add a command line parameter to
control the mitigation, a mitigation selector function and a SMT update
mechanism.

This is the minimal straight forward initial implementation which just
provides an always on/off mode. The command line parameter is:

  mds=[full|off]

This is consistent with the existing mitigations for other speculative
hardware vulnerabilities.

The idle invocation is dynamically updated according to the SMT state of
the system similar to the dynamic update of the STIBP mitigation. The idle
mitigation is limited to CPUs which are only affected by MSBDS and not any
other variant, because the other variants cannot be mitigated on SMT
enabled systems.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jon Masters <jcm@redhat.com>
Tested-by: Jon Masters <jcm@redhat.com>
---
 Documentation/admin-guide/kernel-parameters.txt | 22 ++++++++
 arch/x86/include/asm/processor.h                |  5 ++
 arch/x86/kernel/cpu/bugs.c                      | 70 +++++++++++++++++++++++++
 3 files changed, 97 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 858b6c0b9a15..dddb024eb523 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2356,6 +2356,28 @@
 			Format: <first>,<last>
 			Specifies range of consoles to be captured by the MDA.
 
+	mds=		[X86,INTEL]
+			Control mitigation for the Micro-architectural Data
+			Sampling (MDS) vulnerability.
+
+			Certain CPUs are vulnerable to an exploit against CPU
+			internal buffers which can forward information to a
+			disclosure gadget under certain conditions.
+
+			In vulnerable processors, the speculatively
+			forwarded data can be used in a cache side channel
+			attack, to access data to which the attacker does
+			not have direct access.
+
+			This parameter controls the MDS mitigation. The
+			options are:
+
+			full    - Enable MDS mitigation on vulnerable CPUs
+			off     - Unconditionally disable MDS mitigation
+
+			Not specifying this option is equivalent to
+			mds=full.
+
 	mem=nn[KMG]	[KNL,BOOT] Force usage of a specific amount of memory
 			Amount of memory to be used when the kernel is not able
 			to see the whole system memory or for test.
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 33051436c864..1f0295783325 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -992,4 +992,9 @@ enum l1tf_mitigations {
 
 extern enum l1tf_mitigations l1tf_mitigation;
 
+enum mds_mitigations {
+	MDS_MITIGATION_OFF,
+	MDS_MITIGATION_FULL,
+};
+
 #endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 916995167301..c7b29d200d27 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -37,6 +37,7 @@
 static void __init spectre_v2_select_mitigation(void);
 static void __init ssb_select_mitigation(void);
 static void __init l1tf_select_mitigation(void);
+static void __init mds_select_mitigation(void);
 
 /* The base value of the SPEC_CTRL MSR that always has to be preserved. */
 u64 x86_spec_ctrl_base;
@@ -108,6 +109,8 @@ void __init check_bugs(void)
 
 	l1tf_select_mitigation();
 
+	mds_select_mitigation();
+
 #ifdef CONFIG_X86_32
 	/*
 	 * Check whether we are able to run this kernel safely on SMP.
@@ -213,6 +216,50 @@ static void x86_amd_ssb_disable(void)
 		wrmsrl(MSR_AMD64_LS_CFG, msrval);
 }
 
+#undef pr_fmt
+#define pr_fmt(fmt)	"MDS: " fmt
+
+/* Default mitigation for L1TF-affected CPUs */
+static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL;
+
+static const char * const mds_strings[] = {
+	[MDS_MITIGATION_OFF]	= "Vulnerable",
+	[MDS_MITIGATION_FULL]	= "Mitigation: Clear CPU buffers"
+};
+
+static void __init mds_select_mitigation(void)
+{
+	if (!boot_cpu_has_bug(X86_BUG_MDS)) {
+		mds_mitigation = MDS_MITIGATION_OFF;
+		return;
+	}
+
+	if (mds_mitigation == MDS_MITIGATION_FULL) {
+		if (boot_cpu_has(X86_FEATURE_MD_CLEAR))
+			static_branch_enable(&mds_user_clear);
+		else
+			mds_mitigation = MDS_MITIGATION_OFF;
+	}
+	pr_info("%s\n", mds_strings[mds_mitigation]);
+}
+
+static int __init mds_cmdline(char *str)
+{
+	if (!boot_cpu_has_bug(X86_BUG_MDS))
+		return 0;
+
+	if (!str)
+		return -EINVAL;
+
+	if (!strcmp(str, "off"))
+		mds_mitigation = MDS_MITIGATION_OFF;
+	else if (!strcmp(str, "full"))
+		mds_mitigation = MDS_MITIGATION_FULL;
+
+	return 0;
+}
+early_param("mds", mds_cmdline);
+
 #undef pr_fmt
 #define pr_fmt(fmt)     "Spectre V2 : " fmt
 
@@ -617,6 +664,26 @@ static void update_indir_branch_cond(void)
 		static_branch_disable(&switch_to_cond_stibp);
 }
 
+/* Update the static key controlling the MDS CPU buffer clear in idle */
+static void update_mds_branch_idle(void)
+{
+	/*
+	 * Enable the idle clearing if SMT is active on CPUs which are
+	 * affected only by MSBDS and not any other MDS variant.
+	 *
+	 * The other variants cannot be mitigated when SMT is enabled, so
+	 * clearing the buffers on idle just to prevent the Store Buffer
+	 * repartitioning leak would be a window dressing exercise.
+	 */
+	if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY))
+		return;
+
+	if (sched_smt_active())
+		static_branch_enable(&mds_idle_clear);
+	else
+		static_branch_disable(&mds_idle_clear);
+}
+
 void arch_smt_update(void)
 {
 	/* Enhanced IBRS implies STIBP. No update required. */
@@ -638,6 +705,9 @@ void arch_smt_update(void)
 		break;
 	}
 
+	if (mds_mitigation == MDS_MITIGATION_FULL)
+		update_mds_branch_idle();
+
 	mutex_unlock(&spec_ctrl_mutex);
 }
 
-- 
cgit v1.2.3


From 8a4b06d391b0a42a373808979b5028f5c84d9c6a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 18 Feb 2019 22:51:43 +0100
Subject: x86/speculation/mds: Add sysfs reporting for MDS

Add the sysfs reporting file for MDS. It exposes the vulnerability and
mitigation state similar to the existing files for the other speculative
hardware vulnerabilities.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jon Masters <jcm@redhat.com>
Tested-by: Jon Masters <jcm@redhat.com>
---
 Documentation/ABI/testing/sysfs-devices-system-cpu |  1 +
 arch/x86/kernel/cpu/bugs.c                         | 25 ++++++++++++++++++++++
 drivers/base/cpu.c                                 |  8 +++++++
 include/linux/cpu.h                                |  2 ++
 4 files changed, 36 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 9605dbd4b5b5..2db5c3407fd6 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -484,6 +484,7 @@ What:		/sys/devices/system/cpu/vulnerabilities
 		/sys/devices/system/cpu/vulnerabilities/spectre_v2
 		/sys/devices/system/cpu/vulnerabilities/spec_store_bypass
 		/sys/devices/system/cpu/vulnerabilities/l1tf
+		/sys/devices/system/cpu/vulnerabilities/mds
 Date:		January 2018
 Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
 Description:	Information about CPU vulnerabilities
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index c7b29d200d27..7ab16a6ed064 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1172,6 +1172,22 @@ static ssize_t l1tf_show_state(char *buf)
 }
 #endif
 
+static ssize_t mds_show_state(char *buf)
+{
+	if (!hypervisor_is_type(X86_HYPER_NATIVE)) {
+		return sprintf(buf, "%s; SMT Host state unknown\n",
+			       mds_strings[mds_mitigation]);
+	}
+
+	if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) {
+		return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
+			       sched_smt_active() ? "mitigated" : "disabled");
+	}
+
+	return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
+		       sched_smt_active() ? "vulnerable" : "disabled");
+}
+
 static char *stibp_state(void)
 {
 	if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
@@ -1238,6 +1254,10 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
 		if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV))
 			return l1tf_show_state(buf);
 		break;
+
+	case X86_BUG_MDS:
+		return mds_show_state(buf);
+
 	default:
 		break;
 	}
@@ -1269,4 +1289,9 @@ ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *b
 {
 	return cpu_show_common(dev, attr, buf, X86_BUG_L1TF);
 }
+
+ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return cpu_show_common(dev, attr, buf, X86_BUG_MDS);
+}
 #endif
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index eb9443d5bae1..2fd6ca1021c2 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -546,11 +546,18 @@ ssize_t __weak cpu_show_l1tf(struct device *dev,
 	return sprintf(buf, "Not affected\n");
 }
 
+ssize_t __weak cpu_show_mds(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "Not affected\n");
+}
+
 static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
 static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
 static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
 static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL);
 static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL);
+static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL);
 
 static struct attribute *cpu_root_vulnerabilities_attrs[] = {
 	&dev_attr_meltdown.attr,
@@ -558,6 +565,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = {
 	&dev_attr_spectre_v2.attr,
 	&dev_attr_spec_store_bypass.attr,
 	&dev_attr_l1tf.attr,
+	&dev_attr_mds.attr,
 	NULL
 };
 
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 5041357d0297..3c87ad888ed3 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -57,6 +57,8 @@ extern ssize_t cpu_show_spec_store_bypass(struct device *dev,
 					  struct device_attribute *attr, char *buf);
 extern ssize_t cpu_show_l1tf(struct device *dev,
 			     struct device_attribute *attr, char *buf);
+extern ssize_t cpu_show_mds(struct device *dev,
+			    struct device_attribute *attr, char *buf);
 
 extern __printf(4, 5)
 struct device *cpu_device_create(struct device *parent, void *drvdata,
-- 
cgit v1.2.3


From 22dd8365088b6403630b82423cf906491859b65e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 20 Feb 2019 09:40:40 +0100
Subject: x86/speculation/mds: Add mitigation mode VMWERV

In virtualized environments it can happen that the host has the microcode
update which utilizes the VERW instruction to clear CPU buffers, but the
hypervisor is not yet updated to expose the X86_FEATURE_MD_CLEAR CPUID bit
to guests.

Introduce an internal mitigation mode VMWERV which enables the invocation
of the CPU buffer clearing even if X86_FEATURE_MD_CLEAR is not set. If the
system has no updated microcode this results in a pointless execution of
the VERW instruction wasting a few CPU cycles. If the microcode is updated,
but not exposed to a guest then the CPU buffers will be cleared.

That said: Virtual Machines Will Eventually Receive Vaccine

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jon Masters <jcm@redhat.com>
Tested-by: Jon Masters <jcm@redhat.com>
---
 Documentation/x86/mds.rst        | 27 +++++++++++++++++++++++++++
 arch/x86/include/asm/processor.h |  1 +
 arch/x86/kernel/cpu/bugs.c       | 18 ++++++++++++------
 3 files changed, 40 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/Documentation/x86/mds.rst b/Documentation/x86/mds.rst
index 87ce8ac9f36e..3d6f943f1afb 100644
--- a/Documentation/x86/mds.rst
+++ b/Documentation/x86/mds.rst
@@ -93,11 +93,38 @@ The kernel provides a function to invoke the buffer clearing:
 The mitigation is invoked on kernel/userspace, hypervisor/guest and C-state
 (idle) transitions.
 
+As a special quirk to address virtualization scenarios where the host has
+the microcode updated, but the hypervisor does not (yet) expose the
+MD_CLEAR CPUID bit to guests, the kernel issues the VERW instruction in the
+hope that it might actually clear the buffers. The state is reflected
+accordingly.
+
 According to current knowledge additional mitigations inside the kernel
 itself are not required because the necessary gadgets to expose the leaked
 data cannot be controlled in a way which allows exploitation from malicious
 user space or VM guests.
 
+Kernel internal mitigation modes
+--------------------------------
+
+ ======= ============================================================
+ off      Mitigation is disabled. Either the CPU is not affected or
+          mds=off is supplied on the kernel command line
+
+ full     Mitigation is eanbled. CPU is affected and MD_CLEAR is
+          advertised in CPUID.
+
+ vmwerv	  Mitigation is enabled. CPU is affected and MD_CLEAR is not
+	  advertised in CPUID. That is mainly for virtualization
+	  scenarios where the host has the updated microcode but the
+	  hypervisor does not expose MD_CLEAR in CPUID. It's a best
+	  effort approach without guarantee.
+ ======= ============================================================
+
+If the CPU is affected and mds=off is not supplied on the kernel command
+line then the kernel selects the appropriate mitigation mode depending on
+the availability of the MD_CLEAR CPUID bit.
+
 Mitigation points
 -----------------
 
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 1f0295783325..aca1ef8cc79f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -995,6 +995,7 @@ extern enum l1tf_mitigations l1tf_mitigation;
 enum mds_mitigations {
 	MDS_MITIGATION_OFF,
 	MDS_MITIGATION_FULL,
+	MDS_MITIGATION_VMWERV,
 };
 
 #endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 7ab16a6ed064..95cda38c8785 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -224,7 +224,8 @@ static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL
 
 static const char * const mds_strings[] = {
 	[MDS_MITIGATION_OFF]	= "Vulnerable",
-	[MDS_MITIGATION_FULL]	= "Mitigation: Clear CPU buffers"
+	[MDS_MITIGATION_FULL]	= "Mitigation: Clear CPU buffers",
+	[MDS_MITIGATION_VMWERV]	= "Vulnerable: Clear CPU buffers attempted, no microcode",
 };
 
 static void __init mds_select_mitigation(void)
@@ -235,10 +236,9 @@ static void __init mds_select_mitigation(void)
 	}
 
 	if (mds_mitigation == MDS_MITIGATION_FULL) {
-		if (boot_cpu_has(X86_FEATURE_MD_CLEAR))
-			static_branch_enable(&mds_user_clear);
-		else
-			mds_mitigation = MDS_MITIGATION_OFF;
+		if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
+			mds_mitigation = MDS_MITIGATION_VMWERV;
+		static_branch_enable(&mds_user_clear);
 	}
 	pr_info("%s\n", mds_strings[mds_mitigation]);
 }
@@ -705,8 +705,14 @@ void arch_smt_update(void)
 		break;
 	}
 
-	if (mds_mitigation == MDS_MITIGATION_FULL)
+	switch (mds_mitigation) {
+	case MDS_MITIGATION_FULL:
+	case MDS_MITIGATION_VMWERV:
 		update_mds_branch_idle();
+		break;
+	case MDS_MITIGATION_OFF:
+		break;
+	}
 
 	mutex_unlock(&spec_ctrl_mutex);
 }
-- 
cgit v1.2.3


From 65fd4cb65b2dad97feb8330b6690445910b56d6a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 19 Feb 2019 11:10:49 +0100
Subject: Documentation: Move L1TF to separate directory

Move L!TF to a separate directory so the MDS stuff can be added at the
side. Otherwise the all hardware vulnerabilites have their own top level
entry. Should have done that right away.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jon Masters <jcm@redhat.com>
---
 Documentation/ABI/testing/sysfs-devices-system-cpu |   2 +-
 Documentation/admin-guide/hw-vuln/index.rst        |  12 +
 Documentation/admin-guide/hw-vuln/l1tf.rst         | 614 +++++++++++++++++++++
 Documentation/admin-guide/index.rst                |   6 +-
 Documentation/admin-guide/kernel-parameters.txt    |   2 +-
 Documentation/admin-guide/l1tf.rst                 | 614 ---------------------
 arch/x86/kernel/cpu/bugs.c                         |   2 +-
 arch/x86/kvm/vmx/vmx.c                             |   4 +-
 8 files changed, 633 insertions(+), 623 deletions(-)
 create mode 100644 Documentation/admin-guide/hw-vuln/index.rst
 create mode 100644 Documentation/admin-guide/hw-vuln/l1tf.rst
 delete mode 100644 Documentation/admin-guide/l1tf.rst

(limited to 'arch/x86/kernel')

diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 2db5c3407fd6..744c6d764b0c 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -498,7 +498,7 @@ Description:	Information about CPU vulnerabilities
 		"Mitigation: $M"  CPU is affected and mitigation $M is in effect
 
 		Details about the l1tf file can be found in
-		Documentation/admin-guide/l1tf.rst
+		Documentation/admin-guide/hw-vuln/l1tf.rst
 
 What:		/sys/devices/system/cpu/smt
 		/sys/devices/system/cpu/smt/active
diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst
new file mode 100644
index 000000000000..8ce2009f1981
--- /dev/null
+++ b/Documentation/admin-guide/hw-vuln/index.rst
@@ -0,0 +1,12 @@
+========================
+Hardware vulnerabilities
+========================
+
+This section describes CPU vulnerabilities and provides an overview of the
+possible mitigations along with guidance for selecting mitigations if they
+are configurable at compile, boot or run time.
+
+.. toctree::
+   :maxdepth: 1
+
+   l1tf
diff --git a/Documentation/admin-guide/hw-vuln/l1tf.rst b/Documentation/admin-guide/hw-vuln/l1tf.rst
new file mode 100644
index 000000000000..9af977384168
--- /dev/null
+++ b/Documentation/admin-guide/hw-vuln/l1tf.rst
@@ -0,0 +1,614 @@
+L1TF - L1 Terminal Fault
+========================
+
+L1 Terminal Fault is a hardware vulnerability which allows unprivileged
+speculative access to data which is available in the Level 1 Data Cache
+when the page table entry controlling the virtual address, which is used
+for the access, has the Present bit cleared or other reserved bits set.
+
+Affected processors
+-------------------
+
+This vulnerability affects a wide range of Intel processors. The
+vulnerability is not present on:
+
+   - Processors from AMD, Centaur and other non Intel vendors
+
+   - Older processor models, where the CPU family is < 6
+
+   - A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft,
+     Penwell, Pineview, Silvermont, Airmont, Merrifield)
+
+   - The Intel XEON PHI family
+
+   - Intel processors which have the ARCH_CAP_RDCL_NO bit set in the
+     IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected
+     by the Meltdown vulnerability either. These CPUs should become
+     available by end of 2018.
+
+Whether a processor is affected or not can be read out from the L1TF
+vulnerability file in sysfs. See :ref:`l1tf_sys_info`.
+
+Related CVEs
+------------
+
+The following CVE entries are related to the L1TF vulnerability:
+
+   =============  =================  ==============================
+   CVE-2018-3615  L1 Terminal Fault  SGX related aspects
+   CVE-2018-3620  L1 Terminal Fault  OS, SMM related aspects
+   CVE-2018-3646  L1 Terminal Fault  Virtualization related aspects
+   =============  =================  ==============================
+
+Problem
+-------
+
+If an instruction accesses a virtual address for which the relevant page
+table entry (PTE) has the Present bit cleared or other reserved bits set,
+then speculative execution ignores the invalid PTE and loads the referenced
+data if it is present in the Level 1 Data Cache, as if the page referenced
+by the address bits in the PTE was still present and accessible.
+
+While this is a purely speculative mechanism and the instruction will raise
+a page fault when it is retired eventually, the pure act of loading the
+data and making it available to other speculative instructions opens up the
+opportunity for side channel attacks to unprivileged malicious code,
+similar to the Meltdown attack.
+
+While Meltdown breaks the user space to kernel space protection, L1TF
+allows to attack any physical memory address in the system and the attack
+works across all protection domains. It allows an attack of SGX and also
+works from inside virtual machines because the speculation bypasses the
+extended page table (EPT) protection mechanism.
+
+
+Attack scenarios
+----------------
+
+1. Malicious user space
+^^^^^^^^^^^^^^^^^^^^^^^
+
+   Operating Systems store arbitrary information in the address bits of a
+   PTE which is marked non present. This allows a malicious user space
+   application to attack the physical memory to which these PTEs resolve.
+   In some cases user-space can maliciously influence the information
+   encoded in the address bits of the PTE, thus making attacks more
+   deterministic and more practical.
+
+   The Linux kernel contains a mitigation for this attack vector, PTE
+   inversion, which is permanently enabled and has no performance
+   impact. The kernel ensures that the address bits of PTEs, which are not
+   marked present, never point to cacheable physical memory space.
+
+   A system with an up to date kernel is protected against attacks from
+   malicious user space applications.
+
+2. Malicious guest in a virtual machine
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   The fact that L1TF breaks all domain protections allows malicious guest
+   OSes, which can control the PTEs directly, and malicious guest user
+   space applications, which run on an unprotected guest kernel lacking the
+   PTE inversion mitigation for L1TF, to attack physical host memory.
+
+   A special aspect of L1TF in the context of virtualization is symmetric
+   multi threading (SMT). The Intel implementation of SMT is called
+   HyperThreading. The fact that Hyperthreads on the affected processors
+   share the L1 Data Cache (L1D) is important for this. As the flaw allows
+   only to attack data which is present in L1D, a malicious guest running
+   on one Hyperthread can attack the data which is brought into the L1D by
+   the context which runs on the sibling Hyperthread of the same physical
+   core. This context can be host OS, host user space or a different guest.
+
+   If the processor does not support Extended Page Tables, the attack is
+   only possible, when the hypervisor does not sanitize the content of the
+   effective (shadow) page tables.
+
+   While solutions exist to mitigate these attack vectors fully, these
+   mitigations are not enabled by default in the Linux kernel because they
+   can affect performance significantly. The kernel provides several
+   mechanisms which can be utilized to address the problem depending on the
+   deployment scenario. The mitigations, their protection scope and impact
+   are described in the next sections.
+
+   The default mitigations and the rationale for choosing them are explained
+   at the end of this document. See :ref:`default_mitigations`.
+
+.. _l1tf_sys_info:
+
+L1TF system information
+-----------------------
+
+The Linux kernel provides a sysfs interface to enumerate the current L1TF
+status of the system: whether the system is vulnerable, and which
+mitigations are active. The relevant sysfs file is:
+
+/sys/devices/system/cpu/vulnerabilities/l1tf
+
+The possible values in this file are:
+
+  ===========================   ===============================
+  'Not affected'		The processor is not vulnerable
+  'Mitigation: PTE Inversion'	The host protection is active
+  ===========================   ===============================
+
+If KVM/VMX is enabled and the processor is vulnerable then the following
+information is appended to the 'Mitigation: PTE Inversion' part:
+
+  - SMT status:
+
+    =====================  ================
+    'VMX: SMT vulnerable'  SMT is enabled
+    'VMX: SMT disabled'    SMT is disabled
+    =====================  ================
+
+  - L1D Flush mode:
+
+    ================================  ====================================
+    'L1D vulnerable'		      L1D flushing is disabled
+
+    'L1D conditional cache flushes'   L1D flush is conditionally enabled
+
+    'L1D cache flushes'		      L1D flush is unconditionally enabled
+    ================================  ====================================
+
+The resulting grade of protection is discussed in the following sections.
+
+
+Host mitigation mechanism
+-------------------------
+
+The kernel is unconditionally protected against L1TF attacks from malicious
+user space running on the host.
+
+
+Guest mitigation mechanisms
+---------------------------
+
+.. _l1d_flush:
+
+1. L1D flush on VMENTER
+^^^^^^^^^^^^^^^^^^^^^^^
+
+   To make sure that a guest cannot attack data which is present in the L1D
+   the hypervisor flushes the L1D before entering the guest.
+
+   Flushing the L1D evicts not only the data which should not be accessed
+   by a potentially malicious guest, it also flushes the guest
+   data. Flushing the L1D has a performance impact as the processor has to
+   bring the flushed guest data back into the L1D. Depending on the
+   frequency of VMEXIT/VMENTER and the type of computations in the guest
+   performance degradation in the range of 1% to 50% has been observed. For
+   scenarios where guest VMEXIT/VMENTER are rare the performance impact is
+   minimal. Virtio and mechanisms like posted interrupts are designed to
+   confine the VMEXITs to a bare minimum, but specific configurations and
+   application scenarios might still suffer from a high VMEXIT rate.
+
+   The kernel provides two L1D flush modes:
+    - conditional ('cond')
+    - unconditional ('always')
+
+   The conditional mode avoids L1D flushing after VMEXITs which execute
+   only audited code paths before the corresponding VMENTER. These code
+   paths have been verified that they cannot expose secrets or other
+   interesting data to an attacker, but they can leak information about the
+   address space layout of the hypervisor.
+
+   Unconditional mode flushes L1D on all VMENTER invocations and provides
+   maximum protection. It has a higher overhead than the conditional
+   mode. The overhead cannot be quantified correctly as it depends on the
+   workload scenario and the resulting number of VMEXITs.
+
+   The general recommendation is to enable L1D flush on VMENTER. The kernel
+   defaults to conditional mode on affected processors.
+
+   **Note**, that L1D flush does not prevent the SMT problem because the
+   sibling thread will also bring back its data into the L1D which makes it
+   attackable again.
+
+   L1D flush can be controlled by the administrator via the kernel command
+   line and sysfs control files. See :ref:`mitigation_control_command_line`
+   and :ref:`mitigation_control_kvm`.
+
+.. _guest_confinement:
+
+2. Guest VCPU confinement to dedicated physical cores
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   To address the SMT problem, it is possible to make a guest or a group of
+   guests affine to one or more physical cores. The proper mechanism for
+   that is to utilize exclusive cpusets to ensure that no other guest or
+   host tasks can run on these cores.
+
+   If only a single guest or related guests run on sibling SMT threads on
+   the same physical core then they can only attack their own memory and
+   restricted parts of the host memory.
+
+   Host memory is attackable, when one of the sibling SMT threads runs in
+   host OS (hypervisor) context and the other in guest context. The amount
+   of valuable information from the host OS context depends on the context
+   which the host OS executes, i.e. interrupts, soft interrupts and kernel
+   threads. The amount of valuable data from these contexts cannot be
+   declared as non-interesting for an attacker without deep inspection of
+   the code.
+
+   **Note**, that assigning guests to a fixed set of physical cores affects
+   the ability of the scheduler to do load balancing and might have
+   negative effects on CPU utilization depending on the hosting
+   scenario. Disabling SMT might be a viable alternative for particular
+   scenarios.
+
+   For further information about confining guests to a single or to a group
+   of cores consult the cpusets documentation:
+
+   https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt
+
+.. _interrupt_isolation:
+
+3. Interrupt affinity
+^^^^^^^^^^^^^^^^^^^^^
+
+   Interrupts can be made affine to logical CPUs. This is not universally
+   true because there are types of interrupts which are truly per CPU
+   interrupts, e.g. the local timer interrupt. Aside of that multi queue
+   devices affine their interrupts to single CPUs or groups of CPUs per
+   queue without allowing the administrator to control the affinities.
+
+   Moving the interrupts, which can be affinity controlled, away from CPUs
+   which run untrusted guests, reduces the attack vector space.
+
+   Whether the interrupts with are affine to CPUs, which run untrusted
+   guests, provide interesting data for an attacker depends on the system
+   configuration and the scenarios which run on the system. While for some
+   of the interrupts it can be assumed that they won't expose interesting
+   information beyond exposing hints about the host OS memory layout, there
+   is no way to make general assumptions.
+
+   Interrupt affinity can be controlled by the administrator via the
+   /proc/irq/$NR/smp_affinity[_list] files. Limited documentation is
+   available at:
+
+   https://www.kernel.org/doc/Documentation/IRQ-affinity.txt
+
+.. _smt_control:
+
+4. SMT control
+^^^^^^^^^^^^^^
+
+   To prevent the SMT issues of L1TF it might be necessary to disable SMT
+   completely. Disabling SMT can have a significant performance impact, but
+   the impact depends on the hosting scenario and the type of workloads.
+   The impact of disabling SMT needs also to be weighted against the impact
+   of other mitigation solutions like confining guests to dedicated cores.
+
+   The kernel provides a sysfs interface to retrieve the status of SMT and
+   to control it. It also provides a kernel command line interface to
+   control SMT.
+
+   The kernel command line interface consists of the following options:
+
+     =========== ==========================================================
+     nosmt	 Affects the bring up of the secondary CPUs during boot. The
+		 kernel tries to bring all present CPUs online during the
+		 boot process. "nosmt" makes sure that from each physical
+		 core only one - the so called primary (hyper) thread is
+		 activated. Due to a design flaw of Intel processors related
+		 to Machine Check Exceptions the non primary siblings have
+		 to be brought up at least partially and are then shut down
+		 again.  "nosmt" can be undone via the sysfs interface.
+
+     nosmt=force Has the same effect as "nosmt" but it does not allow to
+		 undo the SMT disable via the sysfs interface.
+     =========== ==========================================================
+
+   The sysfs interface provides two files:
+
+   - /sys/devices/system/cpu/smt/control
+   - /sys/devices/system/cpu/smt/active
+
+   /sys/devices/system/cpu/smt/control:
+
+     This file allows to read out the SMT control state and provides the
+     ability to disable or (re)enable SMT. The possible states are:
+
+	==============  ===================================================
+	on		SMT is supported by the CPU and enabled. All
+			logical CPUs can be onlined and offlined without
+			restrictions.
+
+	off		SMT is supported by the CPU and disabled. Only
+			the so called primary SMT threads can be onlined
+			and offlined without restrictions. An attempt to
+			online a non-primary sibling is rejected
+
+	forceoff	Same as 'off' but the state cannot be controlled.
+			Attempts to write to the control file are rejected.
+
+	notsupported	The processor does not support SMT. It's therefore
+			not affected by the SMT implications of L1TF.
+			Attempts to write to the control file are rejected.
+	==============  ===================================================
+
+     The possible states which can be written into this file to control SMT
+     state are:
+
+     - on
+     - off
+     - forceoff
+
+   /sys/devices/system/cpu/smt/active:
+
+     This file reports whether SMT is enabled and active, i.e. if on any
+     physical core two or more sibling threads are online.
+
+   SMT control is also possible at boot time via the l1tf kernel command
+   line parameter in combination with L1D flush control. See
+   :ref:`mitigation_control_command_line`.
+
+5. Disabling EPT
+^^^^^^^^^^^^^^^^
+
+  Disabling EPT for virtual machines provides full mitigation for L1TF even
+  with SMT enabled, because the effective page tables for guests are
+  managed and sanitized by the hypervisor. Though disabling EPT has a
+  significant performance impact especially when the Meltdown mitigation
+  KPTI is enabled.
+
+  EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
+
+There is ongoing research and development for new mitigation mechanisms to
+address the performance impact of disabling SMT or EPT.
+
+.. _mitigation_control_command_line:
+
+Mitigation control on the kernel command line
+---------------------------------------------
+
+The kernel command line allows to control the L1TF mitigations at boot
+time with the option "l1tf=". The valid arguments for this option are:
+
+  ============  =============================================================
+  full		Provides all available mitigations for the L1TF
+		vulnerability. Disables SMT and enables all mitigations in
+		the hypervisors, i.e. unconditional L1D flushing
+
+		SMT control and L1D flush control via the sysfs interface
+		is still possible after boot.  Hypervisors will issue a
+		warning when the first VM is started in a potentially
+		insecure configuration, i.e. SMT enabled or L1D flush
+		disabled.
+
+  full,force	Same as 'full', but disables SMT and L1D flush runtime
+		control. Implies the 'nosmt=force' command line option.
+		(i.e. sysfs control of SMT is disabled.)
+
+  flush		Leaves SMT enabled and enables the default hypervisor
+		mitigation, i.e. conditional L1D flushing
+
+		SMT control and L1D flush control via the sysfs interface
+		is still possible after boot.  Hypervisors will issue a
+		warning when the first VM is started in a potentially
+		insecure configuration, i.e. SMT enabled or L1D flush
+		disabled.
+
+  flush,nosmt	Disables SMT and enables the default hypervisor mitigation,
+		i.e. conditional L1D flushing.
+
+		SMT control and L1D flush control via the sysfs interface
+		is still possible after boot.  Hypervisors will issue a
+		warning when the first VM is started in a potentially
+		insecure configuration, i.e. SMT enabled or L1D flush
+		disabled.
+
+  flush,nowarn	Same as 'flush', but hypervisors will not warn when a VM is
+		started in a potentially insecure configuration.
+
+  off		Disables hypervisor mitigations and doesn't emit any
+		warnings.
+		It also drops the swap size and available RAM limit restrictions
+		on both hypervisor and bare metal.
+
+  ============  =============================================================
+
+The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`.
+
+
+.. _mitigation_control_kvm:
+
+Mitigation control for KVM - module parameter
+-------------------------------------------------------------
+
+The KVM hypervisor mitigation mechanism, flushing the L1D cache when
+entering a guest, can be controlled with a module parameter.
+
+The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the
+following arguments:
+
+  ============  ==============================================================
+  always	L1D cache flush on every VMENTER.
+
+  cond		Flush L1D on VMENTER only when the code between VMEXIT and
+		VMENTER can leak host memory which is considered
+		interesting for an attacker. This still can leak host memory
+		which allows e.g. to determine the hosts address space layout.
+
+  never		Disables the mitigation
+  ============  ==============================================================
+
+The parameter can be provided on the kernel command line, as a module
+parameter when loading the modules and at runtime modified via the sysfs
+file:
+
+/sys/module/kvm_intel/parameters/vmentry_l1d_flush
+
+The default is 'cond'. If 'l1tf=full,force' is given on the kernel command
+line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush
+module parameter is ignored and writes to the sysfs file are rejected.
+
+
+Mitigation selection guide
+--------------------------
+
+1. No virtualization in use
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   The system is protected by the kernel unconditionally and no further
+   action is required.
+
+2. Virtualization with trusted guests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   If the guest comes from a trusted source and the guest OS kernel is
+   guaranteed to have the L1TF mitigations in place the system is fully
+   protected against L1TF and no further action is required.
+
+   To avoid the overhead of the default L1D flushing on VMENTER the
+   administrator can disable the flushing via the kernel command line and
+   sysfs control files. See :ref:`mitigation_control_command_line` and
+   :ref:`mitigation_control_kvm`.
+
+
+3. Virtualization with untrusted guests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+3.1. SMT not supported or disabled
+""""""""""""""""""""""""""""""""""
+
+  If SMT is not supported by the processor or disabled in the BIOS or by
+  the kernel, it's only required to enforce L1D flushing on VMENTER.
+
+  Conditional L1D flushing is the default behaviour and can be tuned. See
+  :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
+
+3.2. EPT not supported or disabled
+""""""""""""""""""""""""""""""""""
+
+  If EPT is not supported by the processor or disabled in the hypervisor,
+  the system is fully protected. SMT can stay enabled and L1D flushing on
+  VMENTER is not required.
+
+  EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
+
+3.3. SMT and EPT supported and active
+"""""""""""""""""""""""""""""""""""""
+
+  If SMT and EPT are supported and active then various degrees of
+  mitigations can be employed:
+
+  - L1D flushing on VMENTER:
+
+    L1D flushing on VMENTER is the minimal protection requirement, but it
+    is only potent in combination with other mitigation methods.
+
+    Conditional L1D flushing is the default behaviour and can be tuned. See
+    :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
+
+  - Guest confinement:
+
+    Confinement of guests to a single or a group of physical cores which
+    are not running any other processes, can reduce the attack surface
+    significantly, but interrupts, soft interrupts and kernel threads can
+    still expose valuable data to a potential attacker. See
+    :ref:`guest_confinement`.
+
+  - Interrupt isolation:
+
+    Isolating the guest CPUs from interrupts can reduce the attack surface
+    further, but still allows a malicious guest to explore a limited amount
+    of host physical memory. This can at least be used to gain knowledge
+    about the host address space layout. The interrupts which have a fixed
+    affinity to the CPUs which run the untrusted guests can depending on
+    the scenario still trigger soft interrupts and schedule kernel threads
+    which might expose valuable information. See
+    :ref:`interrupt_isolation`.
+
+The above three mitigation methods combined can provide protection to a
+certain degree, but the risk of the remaining attack surface has to be
+carefully analyzed. For full protection the following methods are
+available:
+
+  - Disabling SMT:
+
+    Disabling SMT and enforcing the L1D flushing provides the maximum
+    amount of protection. This mitigation is not depending on any of the
+    above mitigation methods.
+
+    SMT control and L1D flushing can be tuned by the command line
+    parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run
+    time with the matching sysfs control files. See :ref:`smt_control`,
+    :ref:`mitigation_control_command_line` and
+    :ref:`mitigation_control_kvm`.
+
+  - Disabling EPT:
+
+    Disabling EPT provides the maximum amount of protection as well. It is
+    not depending on any of the above mitigation methods. SMT can stay
+    enabled and L1D flushing is not required, but the performance impact is
+    significant.
+
+    EPT can be disabled in the hypervisor via the 'kvm-intel.ept'
+    parameter.
+
+3.4. Nested virtual machines
+""""""""""""""""""""""""""""
+
+When nested virtualization is in use, three operating systems are involved:
+the bare metal hypervisor, the nested hypervisor and the nested virtual
+machine.  VMENTER operations from the nested hypervisor into the nested
+guest will always be processed by the bare metal hypervisor. If KVM is the
+bare metal hypervisor it will:
+
+ - Flush the L1D cache on every switch from the nested hypervisor to the
+   nested virtual machine, so that the nested hypervisor's secrets are not
+   exposed to the nested virtual machine;
+
+ - Flush the L1D cache on every switch from the nested virtual machine to
+   the nested hypervisor; this is a complex operation, and flushing the L1D
+   cache avoids that the bare metal hypervisor's secrets are exposed to the
+   nested virtual machine;
+
+ - Instruct the nested hypervisor to not perform any L1D cache flush. This
+   is an optimization to avoid double L1D flushing.
+
+
+.. _default_mitigations:
+
+Default mitigations
+-------------------
+
+  The kernel default mitigations for vulnerable processors are:
+
+  - PTE inversion to protect against malicious user space. This is done
+    unconditionally and cannot be controlled. The swap storage is limited
+    to ~16TB.
+
+  - L1D conditional flushing on VMENTER when EPT is enabled for
+    a guest.
+
+  The kernel does not by default enforce the disabling of SMT, which leaves
+  SMT systems vulnerable when running untrusted guests with EPT enabled.
+
+  The rationale for this choice is:
+
+  - Force disabling SMT can break existing setups, especially with
+    unattended updates.
+
+  - If regular users run untrusted guests on their machine, then L1TF is
+    just an add on to other malware which might be embedded in an untrusted
+    guest, e.g. spam-bots or attacks on the local network.
+
+    There is no technical way to prevent a user from running untrusted code
+    on their machines blindly.
+
+  - It's technically extremely unlikely and from today's knowledge even
+    impossible that L1TF can be exploited via the most popular attack
+    mechanisms like JavaScript because these mechanisms have no way to
+    control PTEs. If this would be possible and not other mitigation would
+    be possible, then the default might be different.
+
+  - The administrators of cloud and hosting setups have to carefully
+    analyze the risk for their scenarios and make the appropriate
+    mitigation choices, which might even vary across their deployed
+    machines and also result in other changes of their overall setup.
+    There is no way for the kernel to provide a sensible default for this
+    kind of scenarios.
diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
index 0a491676685e..42247516962a 100644
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -17,14 +17,12 @@ etc.
    kernel-parameters
    devices
 
-This section describes CPU vulnerabilities and provides an overview of the
-possible mitigations along with guidance for selecting mitigations if they
-are configurable at compile, boot or run time.
+This section describes CPU vulnerabilities and their mitigations.
 
 .. toctree::
    :maxdepth: 1
 
-   l1tf
+   hw-vuln/index
 
 Here is a set of documents aimed at users who are trying to track down
 problems and bugs in particular.
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index dddb024eb523..9afcb240a673 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2114,7 +2114,7 @@
 
 			Default is 'flush'.
 
-			For details see: Documentation/admin-guide/l1tf.rst
+			For details see: Documentation/admin-guide/hw-vuln/l1tf.rst
 
 	l2cr=		[PPC]
 
diff --git a/Documentation/admin-guide/l1tf.rst b/Documentation/admin-guide/l1tf.rst
deleted file mode 100644
index 9af977384168..000000000000
--- a/Documentation/admin-guide/l1tf.rst
+++ /dev/null
@@ -1,614 +0,0 @@
-L1TF - L1 Terminal Fault
-========================
-
-L1 Terminal Fault is a hardware vulnerability which allows unprivileged
-speculative access to data which is available in the Level 1 Data Cache
-when the page table entry controlling the virtual address, which is used
-for the access, has the Present bit cleared or other reserved bits set.
-
-Affected processors
--------------------
-
-This vulnerability affects a wide range of Intel processors. The
-vulnerability is not present on:
-
-   - Processors from AMD, Centaur and other non Intel vendors
-
-   - Older processor models, where the CPU family is < 6
-
-   - A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft,
-     Penwell, Pineview, Silvermont, Airmont, Merrifield)
-
-   - The Intel XEON PHI family
-
-   - Intel processors which have the ARCH_CAP_RDCL_NO bit set in the
-     IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected
-     by the Meltdown vulnerability either. These CPUs should become
-     available by end of 2018.
-
-Whether a processor is affected or not can be read out from the L1TF
-vulnerability file in sysfs. See :ref:`l1tf_sys_info`.
-
-Related CVEs
-------------
-
-The following CVE entries are related to the L1TF vulnerability:
-
-   =============  =================  ==============================
-   CVE-2018-3615  L1 Terminal Fault  SGX related aspects
-   CVE-2018-3620  L1 Terminal Fault  OS, SMM related aspects
-   CVE-2018-3646  L1 Terminal Fault  Virtualization related aspects
-   =============  =================  ==============================
-
-Problem
--------
-
-If an instruction accesses a virtual address for which the relevant page
-table entry (PTE) has the Present bit cleared or other reserved bits set,
-then speculative execution ignores the invalid PTE and loads the referenced
-data if it is present in the Level 1 Data Cache, as if the page referenced
-by the address bits in the PTE was still present and accessible.
-
-While this is a purely speculative mechanism and the instruction will raise
-a page fault when it is retired eventually, the pure act of loading the
-data and making it available to other speculative instructions opens up the
-opportunity for side channel attacks to unprivileged malicious code,
-similar to the Meltdown attack.
-
-While Meltdown breaks the user space to kernel space protection, L1TF
-allows to attack any physical memory address in the system and the attack
-works across all protection domains. It allows an attack of SGX and also
-works from inside virtual machines because the speculation bypasses the
-extended page table (EPT) protection mechanism.
-
-
-Attack scenarios
-----------------
-
-1. Malicious user space
-^^^^^^^^^^^^^^^^^^^^^^^
-
-   Operating Systems store arbitrary information in the address bits of a
-   PTE which is marked non present. This allows a malicious user space
-   application to attack the physical memory to which these PTEs resolve.
-   In some cases user-space can maliciously influence the information
-   encoded in the address bits of the PTE, thus making attacks more
-   deterministic and more practical.
-
-   The Linux kernel contains a mitigation for this attack vector, PTE
-   inversion, which is permanently enabled and has no performance
-   impact. The kernel ensures that the address bits of PTEs, which are not
-   marked present, never point to cacheable physical memory space.
-
-   A system with an up to date kernel is protected against attacks from
-   malicious user space applications.
-
-2. Malicious guest in a virtual machine
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-   The fact that L1TF breaks all domain protections allows malicious guest
-   OSes, which can control the PTEs directly, and malicious guest user
-   space applications, which run on an unprotected guest kernel lacking the
-   PTE inversion mitigation for L1TF, to attack physical host memory.
-
-   A special aspect of L1TF in the context of virtualization is symmetric
-   multi threading (SMT). The Intel implementation of SMT is called
-   HyperThreading. The fact that Hyperthreads on the affected processors
-   share the L1 Data Cache (L1D) is important for this. As the flaw allows
-   only to attack data which is present in L1D, a malicious guest running
-   on one Hyperthread can attack the data which is brought into the L1D by
-   the context which runs on the sibling Hyperthread of the same physical
-   core. This context can be host OS, host user space or a different guest.
-
-   If the processor does not support Extended Page Tables, the attack is
-   only possible, when the hypervisor does not sanitize the content of the
-   effective (shadow) page tables.
-
-   While solutions exist to mitigate these attack vectors fully, these
-   mitigations are not enabled by default in the Linux kernel because they
-   can affect performance significantly. The kernel provides several
-   mechanisms which can be utilized to address the problem depending on the
-   deployment scenario. The mitigations, their protection scope and impact
-   are described in the next sections.
-
-   The default mitigations and the rationale for choosing them are explained
-   at the end of this document. See :ref:`default_mitigations`.
-
-.. _l1tf_sys_info:
-
-L1TF system information
------------------------
-
-The Linux kernel provides a sysfs interface to enumerate the current L1TF
-status of the system: whether the system is vulnerable, and which
-mitigations are active. The relevant sysfs file is:
-
-/sys/devices/system/cpu/vulnerabilities/l1tf
-
-The possible values in this file are:
-
-  ===========================   ===============================
-  'Not affected'		The processor is not vulnerable
-  'Mitigation: PTE Inversion'	The host protection is active
-  ===========================   ===============================
-
-If KVM/VMX is enabled and the processor is vulnerable then the following
-information is appended to the 'Mitigation: PTE Inversion' part:
-
-  - SMT status:
-
-    =====================  ================
-    'VMX: SMT vulnerable'  SMT is enabled
-    'VMX: SMT disabled'    SMT is disabled
-    =====================  ================
-
-  - L1D Flush mode:
-
-    ================================  ====================================
-    'L1D vulnerable'		      L1D flushing is disabled
-
-    'L1D conditional cache flushes'   L1D flush is conditionally enabled
-
-    'L1D cache flushes'		      L1D flush is unconditionally enabled
-    ================================  ====================================
-
-The resulting grade of protection is discussed in the following sections.
-
-
-Host mitigation mechanism
--------------------------
-
-The kernel is unconditionally protected against L1TF attacks from malicious
-user space running on the host.
-
-
-Guest mitigation mechanisms
----------------------------
-
-.. _l1d_flush:
-
-1. L1D flush on VMENTER
-^^^^^^^^^^^^^^^^^^^^^^^
-
-   To make sure that a guest cannot attack data which is present in the L1D
-   the hypervisor flushes the L1D before entering the guest.
-
-   Flushing the L1D evicts not only the data which should not be accessed
-   by a potentially malicious guest, it also flushes the guest
-   data. Flushing the L1D has a performance impact as the processor has to
-   bring the flushed guest data back into the L1D. Depending on the
-   frequency of VMEXIT/VMENTER and the type of computations in the guest
-   performance degradation in the range of 1% to 50% has been observed. For
-   scenarios where guest VMEXIT/VMENTER are rare the performance impact is
-   minimal. Virtio and mechanisms like posted interrupts are designed to
-   confine the VMEXITs to a bare minimum, but specific configurations and
-   application scenarios might still suffer from a high VMEXIT rate.
-
-   The kernel provides two L1D flush modes:
-    - conditional ('cond')
-    - unconditional ('always')
-
-   The conditional mode avoids L1D flushing after VMEXITs which execute
-   only audited code paths before the corresponding VMENTER. These code
-   paths have been verified that they cannot expose secrets or other
-   interesting data to an attacker, but they can leak information about the
-   address space layout of the hypervisor.
-
-   Unconditional mode flushes L1D on all VMENTER invocations and provides
-   maximum protection. It has a higher overhead than the conditional
-   mode. The overhead cannot be quantified correctly as it depends on the
-   workload scenario and the resulting number of VMEXITs.
-
-   The general recommendation is to enable L1D flush on VMENTER. The kernel
-   defaults to conditional mode on affected processors.
-
-   **Note**, that L1D flush does not prevent the SMT problem because the
-   sibling thread will also bring back its data into the L1D which makes it
-   attackable again.
-
-   L1D flush can be controlled by the administrator via the kernel command
-   line and sysfs control files. See :ref:`mitigation_control_command_line`
-   and :ref:`mitigation_control_kvm`.
-
-.. _guest_confinement:
-
-2. Guest VCPU confinement to dedicated physical cores
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-   To address the SMT problem, it is possible to make a guest or a group of
-   guests affine to one or more physical cores. The proper mechanism for
-   that is to utilize exclusive cpusets to ensure that no other guest or
-   host tasks can run on these cores.
-
-   If only a single guest or related guests run on sibling SMT threads on
-   the same physical core then they can only attack their own memory and
-   restricted parts of the host memory.
-
-   Host memory is attackable, when one of the sibling SMT threads runs in
-   host OS (hypervisor) context and the other in guest context. The amount
-   of valuable information from the host OS context depends on the context
-   which the host OS executes, i.e. interrupts, soft interrupts and kernel
-   threads. The amount of valuable data from these contexts cannot be
-   declared as non-interesting for an attacker without deep inspection of
-   the code.
-
-   **Note**, that assigning guests to a fixed set of physical cores affects
-   the ability of the scheduler to do load balancing and might have
-   negative effects on CPU utilization depending on the hosting
-   scenario. Disabling SMT might be a viable alternative for particular
-   scenarios.
-
-   For further information about confining guests to a single or to a group
-   of cores consult the cpusets documentation:
-
-   https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt
-
-.. _interrupt_isolation:
-
-3. Interrupt affinity
-^^^^^^^^^^^^^^^^^^^^^
-
-   Interrupts can be made affine to logical CPUs. This is not universally
-   true because there are types of interrupts which are truly per CPU
-   interrupts, e.g. the local timer interrupt. Aside of that multi queue
-   devices affine their interrupts to single CPUs or groups of CPUs per
-   queue without allowing the administrator to control the affinities.
-
-   Moving the interrupts, which can be affinity controlled, away from CPUs
-   which run untrusted guests, reduces the attack vector space.
-
-   Whether the interrupts with are affine to CPUs, which run untrusted
-   guests, provide interesting data for an attacker depends on the system
-   configuration and the scenarios which run on the system. While for some
-   of the interrupts it can be assumed that they won't expose interesting
-   information beyond exposing hints about the host OS memory layout, there
-   is no way to make general assumptions.
-
-   Interrupt affinity can be controlled by the administrator via the
-   /proc/irq/$NR/smp_affinity[_list] files. Limited documentation is
-   available at:
-
-   https://www.kernel.org/doc/Documentation/IRQ-affinity.txt
-
-.. _smt_control:
-
-4. SMT control
-^^^^^^^^^^^^^^
-
-   To prevent the SMT issues of L1TF it might be necessary to disable SMT
-   completely. Disabling SMT can have a significant performance impact, but
-   the impact depends on the hosting scenario and the type of workloads.
-   The impact of disabling SMT needs also to be weighted against the impact
-   of other mitigation solutions like confining guests to dedicated cores.
-
-   The kernel provides a sysfs interface to retrieve the status of SMT and
-   to control it. It also provides a kernel command line interface to
-   control SMT.
-
-   The kernel command line interface consists of the following options:
-
-     =========== ==========================================================
-     nosmt	 Affects the bring up of the secondary CPUs during boot. The
-		 kernel tries to bring all present CPUs online during the
-		 boot process. "nosmt" makes sure that from each physical
-		 core only one - the so called primary (hyper) thread is
-		 activated. Due to a design flaw of Intel processors related
-		 to Machine Check Exceptions the non primary siblings have
-		 to be brought up at least partially and are then shut down
-		 again.  "nosmt" can be undone via the sysfs interface.
-
-     nosmt=force Has the same effect as "nosmt" but it does not allow to
-		 undo the SMT disable via the sysfs interface.
-     =========== ==========================================================
-
-   The sysfs interface provides two files:
-
-   - /sys/devices/system/cpu/smt/control
-   - /sys/devices/system/cpu/smt/active
-
-   /sys/devices/system/cpu/smt/control:
-
-     This file allows to read out the SMT control state and provides the
-     ability to disable or (re)enable SMT. The possible states are:
-
-	==============  ===================================================
-	on		SMT is supported by the CPU and enabled. All
-			logical CPUs can be onlined and offlined without
-			restrictions.
-
-	off		SMT is supported by the CPU and disabled. Only
-			the so called primary SMT threads can be onlined
-			and offlined without restrictions. An attempt to
-			online a non-primary sibling is rejected
-
-	forceoff	Same as 'off' but the state cannot be controlled.
-			Attempts to write to the control file are rejected.
-
-	notsupported	The processor does not support SMT. It's therefore
-			not affected by the SMT implications of L1TF.
-			Attempts to write to the control file are rejected.
-	==============  ===================================================
-
-     The possible states which can be written into this file to control SMT
-     state are:
-
-     - on
-     - off
-     - forceoff
-
-   /sys/devices/system/cpu/smt/active:
-
-     This file reports whether SMT is enabled and active, i.e. if on any
-     physical core two or more sibling threads are online.
-
-   SMT control is also possible at boot time via the l1tf kernel command
-   line parameter in combination with L1D flush control. See
-   :ref:`mitigation_control_command_line`.
-
-5. Disabling EPT
-^^^^^^^^^^^^^^^^
-
-  Disabling EPT for virtual machines provides full mitigation for L1TF even
-  with SMT enabled, because the effective page tables for guests are
-  managed and sanitized by the hypervisor. Though disabling EPT has a
-  significant performance impact especially when the Meltdown mitigation
-  KPTI is enabled.
-
-  EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
-
-There is ongoing research and development for new mitigation mechanisms to
-address the performance impact of disabling SMT or EPT.
-
-.. _mitigation_control_command_line:
-
-Mitigation control on the kernel command line
----------------------------------------------
-
-The kernel command line allows to control the L1TF mitigations at boot
-time with the option "l1tf=". The valid arguments for this option are:
-
-  ============  =============================================================
-  full		Provides all available mitigations for the L1TF
-		vulnerability. Disables SMT and enables all mitigations in
-		the hypervisors, i.e. unconditional L1D flushing
-
-		SMT control and L1D flush control via the sysfs interface
-		is still possible after boot.  Hypervisors will issue a
-		warning when the first VM is started in a potentially
-		insecure configuration, i.e. SMT enabled or L1D flush
-		disabled.
-
-  full,force	Same as 'full', but disables SMT and L1D flush runtime
-		control. Implies the 'nosmt=force' command line option.
-		(i.e. sysfs control of SMT is disabled.)
-
-  flush		Leaves SMT enabled and enables the default hypervisor
-		mitigation, i.e. conditional L1D flushing
-
-		SMT control and L1D flush control via the sysfs interface
-		is still possible after boot.  Hypervisors will issue a
-		warning when the first VM is started in a potentially
-		insecure configuration, i.e. SMT enabled or L1D flush
-		disabled.
-
-  flush,nosmt	Disables SMT and enables the default hypervisor mitigation,
-		i.e. conditional L1D flushing.
-
-		SMT control and L1D flush control via the sysfs interface
-		is still possible after boot.  Hypervisors will issue a
-		warning when the first VM is started in a potentially
-		insecure configuration, i.e. SMT enabled or L1D flush
-		disabled.
-
-  flush,nowarn	Same as 'flush', but hypervisors will not warn when a VM is
-		started in a potentially insecure configuration.
-
-  off		Disables hypervisor mitigations and doesn't emit any
-		warnings.
-		It also drops the swap size and available RAM limit restrictions
-		on both hypervisor and bare metal.
-
-  ============  =============================================================
-
-The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`.
-
-
-.. _mitigation_control_kvm:
-
-Mitigation control for KVM - module parameter
--------------------------------------------------------------
-
-The KVM hypervisor mitigation mechanism, flushing the L1D cache when
-entering a guest, can be controlled with a module parameter.
-
-The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the
-following arguments:
-
-  ============  ==============================================================
-  always	L1D cache flush on every VMENTER.
-
-  cond		Flush L1D on VMENTER only when the code between VMEXIT and
-		VMENTER can leak host memory which is considered
-		interesting for an attacker. This still can leak host memory
-		which allows e.g. to determine the hosts address space layout.
-
-  never		Disables the mitigation
-  ============  ==============================================================
-
-The parameter can be provided on the kernel command line, as a module
-parameter when loading the modules and at runtime modified via the sysfs
-file:
-
-/sys/module/kvm_intel/parameters/vmentry_l1d_flush
-
-The default is 'cond'. If 'l1tf=full,force' is given on the kernel command
-line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush
-module parameter is ignored and writes to the sysfs file are rejected.
-
-
-Mitigation selection guide
---------------------------
-
-1. No virtualization in use
-^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-   The system is protected by the kernel unconditionally and no further
-   action is required.
-
-2. Virtualization with trusted guests
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-   If the guest comes from a trusted source and the guest OS kernel is
-   guaranteed to have the L1TF mitigations in place the system is fully
-   protected against L1TF and no further action is required.
-
-   To avoid the overhead of the default L1D flushing on VMENTER the
-   administrator can disable the flushing via the kernel command line and
-   sysfs control files. See :ref:`mitigation_control_command_line` and
-   :ref:`mitigation_control_kvm`.
-
-
-3. Virtualization with untrusted guests
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-3.1. SMT not supported or disabled
-""""""""""""""""""""""""""""""""""
-
-  If SMT is not supported by the processor or disabled in the BIOS or by
-  the kernel, it's only required to enforce L1D flushing on VMENTER.
-
-  Conditional L1D flushing is the default behaviour and can be tuned. See
-  :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
-
-3.2. EPT not supported or disabled
-""""""""""""""""""""""""""""""""""
-
-  If EPT is not supported by the processor or disabled in the hypervisor,
-  the system is fully protected. SMT can stay enabled and L1D flushing on
-  VMENTER is not required.
-
-  EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
-
-3.3. SMT and EPT supported and active
-"""""""""""""""""""""""""""""""""""""
-
-  If SMT and EPT are supported and active then various degrees of
-  mitigations can be employed:
-
-  - L1D flushing on VMENTER:
-
-    L1D flushing on VMENTER is the minimal protection requirement, but it
-    is only potent in combination with other mitigation methods.
-
-    Conditional L1D flushing is the default behaviour and can be tuned. See
-    :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
-
-  - Guest confinement:
-
-    Confinement of guests to a single or a group of physical cores which
-    are not running any other processes, can reduce the attack surface
-    significantly, but interrupts, soft interrupts and kernel threads can
-    still expose valuable data to a potential attacker. See
-    :ref:`guest_confinement`.
-
-  - Interrupt isolation:
-
-    Isolating the guest CPUs from interrupts can reduce the attack surface
-    further, but still allows a malicious guest to explore a limited amount
-    of host physical memory. This can at least be used to gain knowledge
-    about the host address space layout. The interrupts which have a fixed
-    affinity to the CPUs which run the untrusted guests can depending on
-    the scenario still trigger soft interrupts and schedule kernel threads
-    which might expose valuable information. See
-    :ref:`interrupt_isolation`.
-
-The above three mitigation methods combined can provide protection to a
-certain degree, but the risk of the remaining attack surface has to be
-carefully analyzed. For full protection the following methods are
-available:
-
-  - Disabling SMT:
-
-    Disabling SMT and enforcing the L1D flushing provides the maximum
-    amount of protection. This mitigation is not depending on any of the
-    above mitigation methods.
-
-    SMT control and L1D flushing can be tuned by the command line
-    parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run
-    time with the matching sysfs control files. See :ref:`smt_control`,
-    :ref:`mitigation_control_command_line` and
-    :ref:`mitigation_control_kvm`.
-
-  - Disabling EPT:
-
-    Disabling EPT provides the maximum amount of protection as well. It is
-    not depending on any of the above mitigation methods. SMT can stay
-    enabled and L1D flushing is not required, but the performance impact is
-    significant.
-
-    EPT can be disabled in the hypervisor via the 'kvm-intel.ept'
-    parameter.
-
-3.4. Nested virtual machines
-""""""""""""""""""""""""""""
-
-When nested virtualization is in use, three operating systems are involved:
-the bare metal hypervisor, the nested hypervisor and the nested virtual
-machine.  VMENTER operations from the nested hypervisor into the nested
-guest will always be processed by the bare metal hypervisor. If KVM is the
-bare metal hypervisor it will:
-
- - Flush the L1D cache on every switch from the nested hypervisor to the
-   nested virtual machine, so that the nested hypervisor's secrets are not
-   exposed to the nested virtual machine;
-
- - Flush the L1D cache on every switch from the nested virtual machine to
-   the nested hypervisor; this is a complex operation, and flushing the L1D
-   cache avoids that the bare metal hypervisor's secrets are exposed to the
-   nested virtual machine;
-
- - Instruct the nested hypervisor to not perform any L1D cache flush. This
-   is an optimization to avoid double L1D flushing.
-
-
-.. _default_mitigations:
-
-Default mitigations
--------------------
-
-  The kernel default mitigations for vulnerable processors are:
-
-  - PTE inversion to protect against malicious user space. This is done
-    unconditionally and cannot be controlled. The swap storage is limited
-    to ~16TB.
-
-  - L1D conditional flushing on VMENTER when EPT is enabled for
-    a guest.
-
-  The kernel does not by default enforce the disabling of SMT, which leaves
-  SMT systems vulnerable when running untrusted guests with EPT enabled.
-
-  The rationale for this choice is:
-
-  - Force disabling SMT can break existing setups, especially with
-    unattended updates.
-
-  - If regular users run untrusted guests on their machine, then L1TF is
-    just an add on to other malware which might be embedded in an untrusted
-    guest, e.g. spam-bots or attacks on the local network.
-
-    There is no technical way to prevent a user from running untrusted code
-    on their machines blindly.
-
-  - It's technically extremely unlikely and from today's knowledge even
-    impossible that L1TF can be exploited via the most popular attack
-    mechanisms like JavaScript because these mechanisms have no way to
-    control PTEs. If this would be possible and not other mitigation would
-    be possible, then the default might be different.
-
-  - The administrators of cloud and hosting setups have to carefully
-    analyze the risk for their scenarios and make the appropriate
-    mitigation choices, which might even vary across their deployed
-    machines and also result in other changes of their overall setup.
-    There is no way for the kernel to provide a sensible default for this
-    kind of scenarios.
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 95cda38c8785..373ae1dcd301 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1107,7 +1107,7 @@ static void __init l1tf_select_mitigation(void)
 		pr_info("You may make it effective by booting the kernel with mem=%llu parameter.\n",
 				half_pa);
 		pr_info("However, doing so will make a part of your RAM unusable.\n");
-		pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html might help you decide.\n");
+		pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html might help you decide.\n");
 		return;
 	}
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 544bd24a9c1e..b0597507bde7 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6801,8 +6801,8 @@ free_partial_vcpu:
 	return ERR_PTR(err);
 }
 
-#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
-#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
+#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
+#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
 
 static int vmx_vm_init(struct kvm *kvm)
 {
-- 
cgit v1.2.3


From 8db5da0b8618df79eceea99672e205d4a2a6309e Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.ibm.com>
Date: Sun, 27 Jan 2019 19:03:45 -0500
Subject: x86/ima: require signed kernel modules

Have the IMA architecture specific policy require signed kernel modules
on systems with secure boot mode enabled; and coordinate the different
signature verification methods, so only one signature is required.

Requiring appended kernel module signatures may be configured, enabled
on the boot command line, or with this patch enabled in secure boot
mode.  This patch defines set_module_sig_enforced().

To coordinate between appended kernel module signatures and IMA
signatures, only define an IMA MODULE_CHECK policy rule if
CONFIG_MODULE_SIG is not enabled.  A custom IMA policy may still define
and require an IMA signature.

Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Acked-by: Jessica Yu <jeyu@kernel.org>
---
 arch/x86/kernel/ima_arch.c | 9 ++++++++-
 include/linux/module.h     | 5 +++++
 kernel/module.c            | 5 +++++
 3 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c
index e47cd9390ab4..3fb9847f1cad 100644
--- a/arch/x86/kernel/ima_arch.c
+++ b/arch/x86/kernel/ima_arch.c
@@ -64,12 +64,19 @@ static const char * const sb_arch_rules[] = {
 	"appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig",
 #endif /* CONFIG_KEXEC_VERIFY_SIG */
 	"measure func=KEXEC_KERNEL_CHECK",
+#if !IS_ENABLED(CONFIG_MODULE_SIG)
+	"appraise func=MODULE_CHECK appraise_type=imasig",
+#endif
+	"measure func=MODULE_CHECK",
 	NULL
 };
 
 const char * const *arch_get_ima_policy(void)
 {
-	if (IS_ENABLED(CONFIG_IMA_ARCH_POLICY) && arch_ima_get_secureboot())
+	if (IS_ENABLED(CONFIG_IMA_ARCH_POLICY) && arch_ima_get_secureboot()) {
+		if (IS_ENABLED(CONFIG_MODULE_SIG))
+			set_module_sig_enforced();
 		return sb_arch_rules;
+	}
 	return NULL;
 }
diff --git a/include/linux/module.h b/include/linux/module.h
index 5bf5dcd91009..73ee2b10e816 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -676,6 +676,7 @@ static inline bool is_livepatch_module(struct module *mod)
 #endif /* CONFIG_LIVEPATCH */
 
 bool is_module_sig_enforced(void);
+void set_module_sig_enforced(void);
 
 #else /* !CONFIG_MODULES... */
 
@@ -796,6 +797,10 @@ static inline bool is_module_sig_enforced(void)
 	return false;
 }
 
+static inline void set_module_sig_enforced(void)
+{
+}
+
 /* Dereference module function descriptor */
 static inline
 void *dereference_module_function_descriptor(struct module *mod, void *ptr)
diff --git a/kernel/module.c b/kernel/module.c
index 0b9aa8ab89f0..985caa467aef 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -286,6 +286,11 @@ bool is_module_sig_enforced(void)
 }
 EXPORT_SYMBOL(is_module_sig_enforced);
 
+void set_module_sig_enforced(void)
+{
+	sig_enforce = true;
+}
+
 /* Block module loading/unloading? */
 int modules_disabled = 0;
 core_param(nomodule, modules_disabled, bint, 0);
-- 
cgit v1.2.3


From 86d35d4e7625f7c056d81316da107bd3a7564fb3 Mon Sep 17 00:00:00 2001
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Date: Tue, 26 Mar 2019 07:40:54 +0000
Subject: drm/i915: Split Pineview device info into desktop and mobile
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This allows the IS_PINEVIEW_<G|M> macros to be removed and avoid
duplication of device ids already defined in i915_pciids.h.

!IS_MOBILE check can be used in place of existing IS_PINEVIEW_G call
sites.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Suggested-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20190326074057.27833-2-tvrtko.ursulin@linux.intel.com
---
 arch/x86/kernel/early-quirks.c  |  3 ++-
 drivers/gpu/drm/i915/i915_drv.h |  2 --
 drivers/gpu/drm/i915/i915_pci.c | 12 ++++++++++--
 drivers/gpu/drm/i915/intel_pm.c |  4 ++--
 include/drm/i915_pciids.h       |  6 ++++--
 5 files changed, 18 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 50d5848bf22e..f91d3ed2df62 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -525,7 +525,8 @@ static const struct pci_device_id intel_early_ids[] __initconst = {
 	INTEL_I945G_IDS(&gen3_early_ops),
 	INTEL_I945GM_IDS(&gen3_early_ops),
 	INTEL_VLV_IDS(&gen6_early_ops),
-	INTEL_PINEVIEW_IDS(&gen3_early_ops),
+	INTEL_PINEVIEW_G_IDS(&gen3_early_ops),
+	INTEL_PINEVIEW_M_IDS(&gen3_early_ops),
 	INTEL_I965G_IDS(&gen3_early_ops),
 	INTEL_G33_IDS(&gen3_early_ops),
 	INTEL_I965GM_IDS(&gen3_early_ops),
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index f75600fa77c6..d1fd8f7d6f8a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2317,8 +2317,6 @@ static inline unsigned int i915_sg_segment_size(void)
 #define IS_G45(dev_priv)	IS_PLATFORM(dev_priv, INTEL_G45)
 #define IS_GM45(dev_priv)	IS_PLATFORM(dev_priv, INTEL_GM45)
 #define IS_G4X(dev_priv)	(IS_G45(dev_priv) || IS_GM45(dev_priv))
-#define IS_PINEVIEW_G(dev_priv)	(INTEL_DEVID(dev_priv) == 0xa001)
-#define IS_PINEVIEW_M(dev_priv)	(INTEL_DEVID(dev_priv) == 0xa011)
 #define IS_PINEVIEW(dev_priv)	IS_PLATFORM(dev_priv, INTEL_PINEVIEW)
 #define IS_G33(dev_priv)	IS_PLATFORM(dev_priv, INTEL_G33)
 #define IS_IRONLAKE_M(dev_priv)	(INTEL_DEVID(dev_priv) == 0x0046)
diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
index a7e1611af26d..716f2f95c57d 100644
--- a/drivers/gpu/drm/i915/i915_pci.c
+++ b/drivers/gpu/drm/i915/i915_pci.c
@@ -257,7 +257,14 @@ static const struct intel_device_info intel_g33_info = {
 	.display.has_overlay = 1,
 };
 
-static const struct intel_device_info intel_pineview_info = {
+static const struct intel_device_info intel_pineview_g_info = {
+	GEN3_FEATURES,
+	PLATFORM(INTEL_PINEVIEW),
+	.display.has_hotplug = 1,
+	.display.has_overlay = 1,
+};
+
+static const struct intel_device_info intel_pineview_m_info = {
 	GEN3_FEATURES,
 	PLATFORM(INTEL_PINEVIEW),
 	.is_mobile = 1,
@@ -761,7 +768,8 @@ static const struct pci_device_id pciidlist[] = {
 	INTEL_I965GM_IDS(&intel_i965gm_info),
 	INTEL_GM45_IDS(&intel_gm45_info),
 	INTEL_G45_IDS(&intel_g45_info),
-	INTEL_PINEVIEW_IDS(&intel_pineview_info),
+	INTEL_PINEVIEW_G_IDS(&intel_pineview_g_info),
+	INTEL_PINEVIEW_M_IDS(&intel_pineview_m_info),
 	INTEL_IRONLAKE_D_IDS(&intel_ironlake_d_info),
 	INTEL_IRONLAKE_M_IDS(&intel_ironlake_m_info),
 	INTEL_SNB_D_GT1_IDS(&intel_sandybridge_d_gt1_info),
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index 9a6eb2ef5f48..0e05ee1f3ea0 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -850,7 +850,7 @@ static void pineview_update_wm(struct intel_crtc *unused_crtc)
 	u32 reg;
 	unsigned int wm;
 
-	latency = intel_get_cxsr_latency(IS_PINEVIEW_G(dev_priv),
+	latency = intel_get_cxsr_latency(!IS_MOBILE(dev_priv),
 					 dev_priv->is_ddr3,
 					 dev_priv->fsb_freq,
 					 dev_priv->mem_freq);
@@ -9589,7 +9589,7 @@ void intel_init_pm(struct drm_i915_private *dev_priv)
 		dev_priv->display.initial_watermarks = g4x_initial_watermarks;
 		dev_priv->display.optimize_watermarks = g4x_optimize_watermarks;
 	} else if (IS_PINEVIEW(dev_priv)) {
-		if (!intel_get_cxsr_latency(IS_PINEVIEW_G(dev_priv),
+		if (!intel_get_cxsr_latency(!IS_MOBILE(dev_priv),
 					    dev_priv->is_ddr3,
 					    dev_priv->fsb_freq,
 					    dev_priv->mem_freq)) {
diff --git a/include/drm/i915_pciids.h b/include/drm/i915_pciids.h
index c7cdbfc4d033..cb9b5b35aa2c 100644
--- a/include/drm/i915_pciids.h
+++ b/include/drm/i915_pciids.h
@@ -108,8 +108,10 @@
 	INTEL_VGA_DEVICE(0x2e42, info), /* B43_G */ \
 	INTEL_VGA_DEVICE(0x2e92, info)	/* B43_G.1 */
 
-#define INTEL_PINEVIEW_IDS(info)			\
-	INTEL_VGA_DEVICE(0xa001, info),			\
+#define INTEL_PINEVIEW_G_IDS(info) \
+	INTEL_VGA_DEVICE(0xa001, info)
+
+#define INTEL_PINEVIEW_M_IDS(info) \
 	INTEL_VGA_DEVICE(0xa011, info)
 
 #define INTEL_IRONLAKE_D_IDS(info) \
-- 
cgit v1.2.3


From d53fef0be4a5f2f12219c231cdb6f838fbbf680c Mon Sep 17 00:00:00 2001
From: Rodrigo Vivi <rodrigo.vivi@intel.com>
Date: Fri, 15 Mar 2019 12:19:38 -0700
Subject: x86/gpu: add ElkhartLake to gen11 early quirks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Let's reserve EHL stolen memory for graphics.

ElkhartLake is a gen11 platform which is compatible with
ICL changes.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
Cc: José Roberto de Souza <jose.souza@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: José Roberto de Souza <jose.souza@intel.com>
Acked-by: Borislav Petkov <bp@suse.de>
Link: https://patchwork.freedesktop.org/patch/msgid/20190315191938.22211-2-rodrigo.vivi@intel.com
---
 arch/x86/kernel/early-quirks.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index f91d3ed2df62..6c4f01540833 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -548,6 +548,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = {
 	INTEL_GLK_IDS(&gen9_early_ops),
 	INTEL_CNL_IDS(&gen9_early_ops),
 	INTEL_ICL_11_IDS(&gen11_early_ops),
+	INTEL_EHL_IDS(&gen11_early_ops),
 };
 
 struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0);
-- 
cgit v1.2.3


From d71eb0ce109a124b0fa714832823b9452f2762cf Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Tue, 2 Apr 2019 09:59:33 -0500
Subject: x86/speculation/mds: Add mds=full,nosmt cmdline option

Add the mds=full,nosmt cmdline option.  This is like mds=full, but with
SMT disabled if the CPU is vulnerable.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Tyler Hicks <tyhicks@canonical.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
---
 Documentation/admin-guide/hw-vuln/mds.rst       |  3 +++
 Documentation/admin-guide/kernel-parameters.txt |  6 ++++--
 arch/x86/kernel/cpu/bugs.c                      | 10 ++++++++++
 3 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/Documentation/admin-guide/hw-vuln/mds.rst b/Documentation/admin-guide/hw-vuln/mds.rst
index 1de29d28903d..244ab47d1fb3 100644
--- a/Documentation/admin-guide/hw-vuln/mds.rst
+++ b/Documentation/admin-guide/hw-vuln/mds.rst
@@ -260,6 +260,9 @@ time with the option "mds=". The valid arguments for this option are:
 
 		It does not automatically disable SMT.
 
+  full,nosmt	The same as mds=full, with SMT disabled on vulnerable
+		CPUs.  This is the complete mitigation.
+
   off		Disables MDS mitigations completely.
 
   ============  =============================================================
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 7325319c2c23..8f04985d3122 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2372,8 +2372,10 @@
 			This parameter controls the MDS mitigation. The
 			options are:
 
-			full    - Enable MDS mitigation on vulnerable CPUs
-			off     - Unconditionally disable MDS mitigation
+			full       - Enable MDS mitigation on vulnerable CPUs
+			full,nosmt - Enable MDS mitigation and disable
+				     SMT on vulnerable CPUs
+			off        - Unconditionally disable MDS mitigation
 
 			Not specifying this option is equivalent to
 			mds=full.
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 373ae1dcd301..9f252082a83b 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -221,6 +221,7 @@ static void x86_amd_ssb_disable(void)
 
 /* Default mitigation for L1TF-affected CPUs */
 static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL;
+static bool mds_nosmt __ro_after_init = false;
 
 static const char * const mds_strings[] = {
 	[MDS_MITIGATION_OFF]	= "Vulnerable",
@@ -238,8 +239,13 @@ static void __init mds_select_mitigation(void)
 	if (mds_mitigation == MDS_MITIGATION_FULL) {
 		if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
 			mds_mitigation = MDS_MITIGATION_VMWERV;
+
 		static_branch_enable(&mds_user_clear);
+
+		if (mds_nosmt && !boot_cpu_has(X86_BUG_MSBDS_ONLY))
+			cpu_smt_disable(false);
 	}
+
 	pr_info("%s\n", mds_strings[mds_mitigation]);
 }
 
@@ -255,6 +261,10 @@ static int __init mds_cmdline(char *str)
 		mds_mitigation = MDS_MITIGATION_OFF;
 	else if (!strcmp(str, "full"))
 		mds_mitigation = MDS_MITIGATION_FULL;
+	else if (!strcmp(str, "full,nosmt")) {
+		mds_mitigation = MDS_MITIGATION_FULL;
+		mds_nosmt = true;
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From 7c3658b20194a5b3209a143f63bc9c643c6a3ae2 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Tue, 2 Apr 2019 10:00:14 -0500
Subject: x86/speculation: Move arch_smt_update() call to after mitigation
 decisions

arch_smt_update() now has a dependency on both Spectre v2 and MDS
mitigations.  Move its initial call to after all the mitigation decisions
have been made.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Tyler Hicks <tyhicks@canonical.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/x86/kernel/cpu/bugs.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 9f252082a83b..3f934ffef8cf 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -111,6 +111,8 @@ void __init check_bugs(void)
 
 	mds_select_mitigation();
 
+	arch_smt_update();
+
 #ifdef CONFIG_X86_32
 	/*
 	 * Check whether we are able to run this kernel safely on SMP.
@@ -638,9 +640,6 @@ specv2_set_mode:
 
 	/* Set up IBPB and STIBP depending on the general spectre V2 command */
 	spectre_v2_user_select_mitigation(cmd);
-
-	/* Enable STIBP if appropriate */
-	arch_smt_update();
 }
 
 static void update_stibp_msr(void * __unused)
-- 
cgit v1.2.3


From 39226ef02bfb43248b7db12a4fdccb39d95318e3 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Tue, 2 Apr 2019 10:00:51 -0500
Subject: x86/speculation/mds: Add SMT warning message

MDS is vulnerable with SMT.  Make that clear with a one-time printk
whenever SMT first gets enabled.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Tyler Hicks <tyhicks@canonical.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/x86/kernel/cpu/bugs.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 3f934ffef8cf..22a14d4b68a2 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -673,6 +673,9 @@ static void update_indir_branch_cond(void)
 		static_branch_disable(&switch_to_cond_stibp);
 }
 
+#undef pr_fmt
+#define pr_fmt(fmt) fmt
+
 /* Update the static key controlling the MDS CPU buffer clear in idle */
 static void update_mds_branch_idle(void)
 {
@@ -693,6 +696,8 @@ static void update_mds_branch_idle(void)
 		static_branch_disable(&mds_idle_clear);
 }
 
+#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n"
+
 void arch_smt_update(void)
 {
 	/* Enhanced IBRS implies STIBP. No update required. */
@@ -717,6 +722,8 @@ void arch_smt_update(void)
 	switch (mds_mitigation) {
 	case MDS_MITIGATION_FULL:
 	case MDS_MITIGATION_VMWERV:
+		if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY))
+			pr_warn_once(MDS_MSG_SMT);
 		update_mds_branch_idle();
 		break;
 	case MDS_MITIGATION_OFF:
@@ -1149,6 +1156,7 @@ static int __init l1tf_cmdline(char *str)
 early_param("l1tf", l1tf_cmdline);
 
 #undef pr_fmt
+#define pr_fmt(fmt) fmt
 
 #ifdef CONFIG_SYSFS
 
-- 
cgit v1.2.3


From 89833fab15d6017ba006a45b5af68caa067171a7 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Fri, 29 Mar 2019 22:46:51 +0100
Subject: x86/fpu: Fix __user annotations

In save_xstate_epilog(), use __user when type-casting userspace
pointers.

In setup_sigcontext() and x32_setup_rt_frame(), cast the userspace
pointers to 'unsigned long __user *' before writing into them. These
pointers are originally '__u32 __user *' or '__u64 __user *', causing
sparse to complain when a userspace pointer is written into them. The
casts are okay because the pointers always point to pointer-sized
values.

Thanks to Luc Van Oostenryck and Al Viro for explaining this to me.

Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Mukesh Ojha <mojha@codeaurora.org>
Cc: Qiaowei Ren <qiaowei.ren@intel.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190329214652.258477-3-jannh@google.com
---
 arch/x86/kernel/fpu/signal.c | 6 +++---
 arch/x86/kernel/signal.c     | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index f6a1d299627c..55b80de13ea5 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -92,13 +92,13 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
 		return err;
 
 	err |= __put_user(FP_XSTATE_MAGIC2,
-			  (__u32 *)(buf + fpu_user_xstate_size));
+			  (__u32 __user *)(buf + fpu_user_xstate_size));
 
 	/*
 	 * Read the xfeatures which we copied (directly from the cpu or
 	 * from the state in task struct) to the user buffers.
 	 */
-	err |= __get_user(xfeatures, (__u32 *)&x->header.xfeatures);
+	err |= __get_user(xfeatures, (__u32 __user *)&x->header.xfeatures);
 
 	/*
 	 * For legacy compatible, we always set FP/SSE bits in the bit
@@ -113,7 +113,7 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
 	 */
 	xfeatures |= XFEATURE_MASK_FPSSE;
 
-	err |= __put_user(xfeatures, (__u32 *)&x->header.xfeatures);
+	err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures);
 
 	return err;
 }
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 08dfd4c1a4f9..b419e1a1a0ce 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -206,7 +206,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
 		put_user_ex(regs->ss, &sc->ss);
 #endif /* CONFIG_X86_32 */
 
-		put_user_ex(fpstate, &sc->fpstate);
+		put_user_ex(fpstate, (unsigned long __user *)&sc->fpstate);
 
 		/* non-iBCS2 extensions.. */
 		put_user_ex(mask, &sc->oldmask);
@@ -569,7 +569,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig,
 			restorer = NULL;
 			err |= -EFAULT;
 		}
-		put_user_ex(restorer, &frame->pretcode);
+		put_user_ex(restorer, (unsigned long __user *)&frame->pretcode);
 	} put_user_catch(err);
 
 	err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
-- 
cgit v1.2.3


From 60574d1e05b094d222162260dd9cac49f4d0996a Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Mon, 11 Mar 2019 14:55:57 -0600
Subject: acpi: Create subtable parsing infrastructure

Parsing entries in an ACPI table had assumed a generic header
structure. There is no standard ACPI header, though, so less common
layouts with different field sizes required custom parsers to go through
their subtable entry list.

Create the infrastructure for adding different table types so parsing
the entries array may be more reused for all ACPI system tables and
the common code doesn't need to be duplicated.

Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Keith Busch <keith.busch@intel.com>
Tested-by: Brice Goglin <Brice.Goglin@inria.fr>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/arm64/kernel/acpi_numa.c                 |  2 +-
 arch/arm64/kernel/smp.c                       |  4 +-
 arch/ia64/kernel/acpi.c                       | 14 +++---
 arch/x86/kernel/acpi/boot.c                   | 36 +++++++-------
 drivers/acpi/numa.c                           | 16 +++----
 drivers/acpi/scan.c                           |  4 +-
 drivers/acpi/tables.c                         | 67 +++++++++++++++++++++++----
 drivers/irqchip/irq-gic-v2m.c                 |  2 +-
 drivers/irqchip/irq-gic-v3-its-pci-msi.c      |  2 +-
 drivers/irqchip/irq-gic-v3-its-platform-msi.c |  2 +-
 drivers/irqchip/irq-gic-v3-its.c              |  6 +--
 drivers/irqchip/irq-gic-v3.c                  | 10 ++--
 drivers/irqchip/irq-gic.c                     |  4 +-
 drivers/mailbox/pcc.c                         |  2 +-
 include/linux/acpi.h                          |  5 +-
 15 files changed, 113 insertions(+), 63 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/arm64/kernel/acpi_numa.c b/arch/arm64/kernel/acpi_numa.c
index eac1d0cc595c..7ff800045434 100644
--- a/arch/arm64/kernel/acpi_numa.c
+++ b/arch/arm64/kernel/acpi_numa.c
@@ -45,7 +45,7 @@ static inline int get_cpu_for_acpi_id(u32 uid)
 	return -EINVAL;
 }
 
-static int __init acpi_parse_gicc_pxm(struct acpi_subtable_header *header,
+static int __init acpi_parse_gicc_pxm(union acpi_subtable_headers *header,
 				      const unsigned long end)
 {
 	struct acpi_srat_gicc_affinity *pa;
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 824de7038967..bb4b3f07761a 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -586,7 +586,7 @@ acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor)
 }
 
 static int __init
-acpi_parse_gic_cpu_interface(struct acpi_subtable_header *header,
+acpi_parse_gic_cpu_interface(union acpi_subtable_headers *header,
 			     const unsigned long end)
 {
 	struct acpi_madt_generic_interrupt *processor;
@@ -595,7 +595,7 @@ acpi_parse_gic_cpu_interface(struct acpi_subtable_header *header,
 	if (BAD_MADT_GICC_ENTRY(processor, end))
 		return -EINVAL;
 
-	acpi_table_print_madt_entry(header);
+	acpi_table_print_madt_entry(&header->common);
 
 	acpi_map_gic_cpu_interface(processor);
 
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index 41eb281709da..1435e7a1a8cd 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -177,7 +177,7 @@ struct acpi_table_madt *acpi_madt __initdata;
 static u8 has_8259;
 
 static int __init
-acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
+acpi_parse_lapic_addr_ovr(union acpi_subtable_headers * header,
 			  const unsigned long end)
 {
 	struct acpi_madt_local_apic_override *lapic;
@@ -195,7 +195,7 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
 }
 
 static int __init
-acpi_parse_lsapic(struct acpi_subtable_header * header, const unsigned long end)
+acpi_parse_lsapic(union acpi_subtable_headers *header, const unsigned long end)
 {
 	struct acpi_madt_local_sapic *lsapic;
 
@@ -216,7 +216,7 @@ acpi_parse_lsapic(struct acpi_subtable_header * header, const unsigned long end)
 }
 
 static int __init
-acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end)
+acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long end)
 {
 	struct acpi_madt_local_apic_nmi *lacpi_nmi;
 
@@ -230,7 +230,7 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e
 }
 
 static int __init
-acpi_parse_iosapic(struct acpi_subtable_header * header, const unsigned long end)
+acpi_parse_iosapic(union acpi_subtable_headers * header, const unsigned long end)
 {
 	struct acpi_madt_io_sapic *iosapic;
 
@@ -245,7 +245,7 @@ acpi_parse_iosapic(struct acpi_subtable_header * header, const unsigned long end
 static unsigned int __initdata acpi_madt_rev;
 
 static int __init
-acpi_parse_plat_int_src(struct acpi_subtable_header * header,
+acpi_parse_plat_int_src(union acpi_subtable_headers * header,
 			const unsigned long end)
 {
 	struct acpi_madt_interrupt_source *plintsrc;
@@ -329,7 +329,7 @@ unsigned int get_cpei_target_cpu(void)
 }
 
 static int __init
-acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
+acpi_parse_int_src_ovr(union acpi_subtable_headers * header,
 		       const unsigned long end)
 {
 	struct acpi_madt_interrupt_override *p;
@@ -350,7 +350,7 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
 }
 
 static int __init
-acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end)
+acpi_parse_nmi_src(union acpi_subtable_headers * header, const unsigned long end)
 {
 	struct acpi_madt_nmi_source *nmi_src;
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 8dcbf6890714..9fc92e4539d8 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -197,7 +197,7 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled)
 }
 
 static int __init
-acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
+acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end)
 {
 	struct acpi_madt_local_x2apic *processor = NULL;
 #ifdef CONFIG_X86_X2APIC
@@ -210,7 +210,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
 	if (BAD_MADT_ENTRY(processor, end))
 		return -EINVAL;
 
-	acpi_table_print_madt_entry(header);
+	acpi_table_print_madt_entry(&header->common);
 
 #ifdef CONFIG_X86_X2APIC
 	apic_id = processor->local_apic_id;
@@ -242,7 +242,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
 }
 
 static int __init
-acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end)
+acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end)
 {
 	struct acpi_madt_local_apic *processor = NULL;
 
@@ -251,7 +251,7 @@ acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end)
 	if (BAD_MADT_ENTRY(processor, end))
 		return -EINVAL;
 
-	acpi_table_print_madt_entry(header);
+	acpi_table_print_madt_entry(&header->common);
 
 	/* Ignore invalid ID */
 	if (processor->id == 0xff)
@@ -272,7 +272,7 @@ acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end)
 }
 
 static int __init
-acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end)
+acpi_parse_sapic(union acpi_subtable_headers *header, const unsigned long end)
 {
 	struct acpi_madt_local_sapic *processor = NULL;
 
@@ -281,7 +281,7 @@ acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end)
 	if (BAD_MADT_ENTRY(processor, end))
 		return -EINVAL;
 
-	acpi_table_print_madt_entry(header);
+	acpi_table_print_madt_entry(&header->common);
 
 	acpi_register_lapic((processor->id << 8) | processor->eid,/* APIC ID */
 			    processor->processor_id, /* ACPI ID */
@@ -291,7 +291,7 @@ acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end)
 }
 
 static int __init
-acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
+acpi_parse_lapic_addr_ovr(union acpi_subtable_headers * header,
 			  const unsigned long end)
 {
 	struct acpi_madt_local_apic_override *lapic_addr_ovr = NULL;
@@ -301,7 +301,7 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
 	if (BAD_MADT_ENTRY(lapic_addr_ovr, end))
 		return -EINVAL;
 
-	acpi_table_print_madt_entry(header);
+	acpi_table_print_madt_entry(&header->common);
 
 	acpi_lapic_addr = lapic_addr_ovr->address;
 
@@ -309,7 +309,7 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
 }
 
 static int __init
-acpi_parse_x2apic_nmi(struct acpi_subtable_header *header,
+acpi_parse_x2apic_nmi(union acpi_subtable_headers *header,
 		      const unsigned long end)
 {
 	struct acpi_madt_local_x2apic_nmi *x2apic_nmi = NULL;
@@ -319,7 +319,7 @@ acpi_parse_x2apic_nmi(struct acpi_subtable_header *header,
 	if (BAD_MADT_ENTRY(x2apic_nmi, end))
 		return -EINVAL;
 
-	acpi_table_print_madt_entry(header);
+	acpi_table_print_madt_entry(&header->common);
 
 	if (x2apic_nmi->lint != 1)
 		printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
@@ -328,7 +328,7 @@ acpi_parse_x2apic_nmi(struct acpi_subtable_header *header,
 }
 
 static int __init
-acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end)
+acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long end)
 {
 	struct acpi_madt_local_apic_nmi *lapic_nmi = NULL;
 
@@ -337,7 +337,7 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e
 	if (BAD_MADT_ENTRY(lapic_nmi, end))
 		return -EINVAL;
 
-	acpi_table_print_madt_entry(header);
+	acpi_table_print_madt_entry(&header->common);
 
 	if (lapic_nmi->lint != 1)
 		printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
@@ -449,7 +449,7 @@ static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity,
 }
 
 static int __init
-acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
+acpi_parse_ioapic(union acpi_subtable_headers * header, const unsigned long end)
 {
 	struct acpi_madt_io_apic *ioapic = NULL;
 	struct ioapic_domain_cfg cfg = {
@@ -462,7 +462,7 @@ acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
 	if (BAD_MADT_ENTRY(ioapic, end))
 		return -EINVAL;
 
-	acpi_table_print_madt_entry(header);
+	acpi_table_print_madt_entry(&header->common);
 
 	/* Statically assign IRQ numbers for IOAPICs hosting legacy IRQs */
 	if (ioapic->global_irq_base < nr_legacy_irqs())
@@ -508,7 +508,7 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger,
 }
 
 static int __init
-acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
+acpi_parse_int_src_ovr(union acpi_subtable_headers * header,
 		       const unsigned long end)
 {
 	struct acpi_madt_interrupt_override *intsrc = NULL;
@@ -518,7 +518,7 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
 	if (BAD_MADT_ENTRY(intsrc, end))
 		return -EINVAL;
 
-	acpi_table_print_madt_entry(header);
+	acpi_table_print_madt_entry(&header->common);
 
 	if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
 		acpi_sci_ioapic_setup(intsrc->source_irq,
@@ -550,7 +550,7 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
 }
 
 static int __init
-acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end)
+acpi_parse_nmi_src(union acpi_subtable_headers * header, const unsigned long end)
 {
 	struct acpi_madt_nmi_source *nmi_src = NULL;
 
@@ -559,7 +559,7 @@ acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end
 	if (BAD_MADT_ENTRY(nmi_src, end))
 		return -EINVAL;
 
-	acpi_table_print_madt_entry(header);
+	acpi_table_print_madt_entry(&header->common);
 
 	/* TBD: Support nimsrc entries? */
 
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 867f6e3f2b4f..30995834ad70 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -339,7 +339,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
 }
 
 static int __init
-acpi_parse_x2apic_affinity(struct acpi_subtable_header *header,
+acpi_parse_x2apic_affinity(union acpi_subtable_headers *header,
 			   const unsigned long end)
 {
 	struct acpi_srat_x2apic_cpu_affinity *processor_affinity;
@@ -348,7 +348,7 @@ acpi_parse_x2apic_affinity(struct acpi_subtable_header *header,
 	if (!processor_affinity)
 		return -EINVAL;
 
-	acpi_table_print_srat_entry(header);
+	acpi_table_print_srat_entry(&header->common);
 
 	/* let architecture-dependent part to do it */
 	acpi_numa_x2apic_affinity_init(processor_affinity);
@@ -357,7 +357,7 @@ acpi_parse_x2apic_affinity(struct acpi_subtable_header *header,
 }
 
 static int __init
-acpi_parse_processor_affinity(struct acpi_subtable_header *header,
+acpi_parse_processor_affinity(union acpi_subtable_headers *header,
 			      const unsigned long end)
 {
 	struct acpi_srat_cpu_affinity *processor_affinity;
@@ -366,7 +366,7 @@ acpi_parse_processor_affinity(struct acpi_subtable_header *header,
 	if (!processor_affinity)
 		return -EINVAL;
 
-	acpi_table_print_srat_entry(header);
+	acpi_table_print_srat_entry(&header->common);
 
 	/* let architecture-dependent part to do it */
 	acpi_numa_processor_affinity_init(processor_affinity);
@@ -375,7 +375,7 @@ acpi_parse_processor_affinity(struct acpi_subtable_header *header,
 }
 
 static int __init
-acpi_parse_gicc_affinity(struct acpi_subtable_header *header,
+acpi_parse_gicc_affinity(union acpi_subtable_headers *header,
 			 const unsigned long end)
 {
 	struct acpi_srat_gicc_affinity *processor_affinity;
@@ -384,7 +384,7 @@ acpi_parse_gicc_affinity(struct acpi_subtable_header *header,
 	if (!processor_affinity)
 		return -EINVAL;
 
-	acpi_table_print_srat_entry(header);
+	acpi_table_print_srat_entry(&header->common);
 
 	/* let architecture-dependent part to do it */
 	acpi_numa_gicc_affinity_init(processor_affinity);
@@ -395,7 +395,7 @@ acpi_parse_gicc_affinity(struct acpi_subtable_header *header,
 static int __initdata parsed_numa_memblks;
 
 static int __init
-acpi_parse_memory_affinity(struct acpi_subtable_header * header,
+acpi_parse_memory_affinity(union acpi_subtable_headers * header,
 			   const unsigned long end)
 {
 	struct acpi_srat_mem_affinity *memory_affinity;
@@ -404,7 +404,7 @@ acpi_parse_memory_affinity(struct acpi_subtable_header * header,
 	if (!memory_affinity)
 		return -EINVAL;
 
-	acpi_table_print_srat_entry(header);
+	acpi_table_print_srat_entry(&header->common);
 
 	/* let architecture-dependent part to do it */
 	if (!acpi_numa_memory_affinity_init(memory_affinity))
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 446c959a8f08..f7771a3b4a3e 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -2241,10 +2241,10 @@ static struct acpi_probe_entry *ape;
 static int acpi_probe_count;
 static DEFINE_MUTEX(acpi_probe_mutex);
 
-static int __init acpi_match_madt(struct acpi_subtable_header *header,
+static int __init acpi_match_madt(union acpi_subtable_headers *header,
 				  const unsigned long end)
 {
-	if (!ape->subtable_valid || ape->subtable_valid(header, ape))
+	if (!ape->subtable_valid || ape->subtable_valid(&header->common, ape))
 		if (!ape->probe_subtbl(header, end))
 			acpi_probe_count++;
 
diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c
index 8fccbe49612a..7553774a22b7 100644
--- a/drivers/acpi/tables.c
+++ b/drivers/acpi/tables.c
@@ -49,6 +49,15 @@ static struct acpi_table_desc initial_tables[ACPI_MAX_TABLES] __initdata;
 
 static int acpi_apic_instance __initdata;
 
+enum acpi_subtable_type {
+	ACPI_SUBTABLE_COMMON,
+};
+
+struct acpi_subtable_entry {
+	union acpi_subtable_headers *hdr;
+	enum acpi_subtable_type type;
+};
+
 /*
  * Disable table checksum verification for the early stage due to the size
  * limitation of the current x86 early mapping implementation.
@@ -217,6 +226,42 @@ void acpi_table_print_madt_entry(struct acpi_subtable_header *header)
 	}
 }
 
+static unsigned long __init
+acpi_get_entry_type(struct acpi_subtable_entry *entry)
+{
+	switch (entry->type) {
+	case ACPI_SUBTABLE_COMMON:
+		return entry->hdr->common.type;
+	}
+	return 0;
+}
+
+static unsigned long __init
+acpi_get_entry_length(struct acpi_subtable_entry *entry)
+{
+	switch (entry->type) {
+	case ACPI_SUBTABLE_COMMON:
+		return entry->hdr->common.length;
+	}
+	return 0;
+}
+
+static unsigned long __init
+acpi_get_subtable_header_length(struct acpi_subtable_entry *entry)
+{
+	switch (entry->type) {
+	case ACPI_SUBTABLE_COMMON:
+		return sizeof(entry->hdr->common);
+	}
+	return 0;
+}
+
+static enum acpi_subtable_type __init
+acpi_get_subtable_type(char *id)
+{
+	return ACPI_SUBTABLE_COMMON;
+}
+
 /**
  * acpi_parse_entries_array - for each proc_num find a suitable subtable
  *
@@ -246,8 +291,8 @@ acpi_parse_entries_array(char *id, unsigned long table_size,
 		struct acpi_subtable_proc *proc, int proc_num,
 		unsigned int max_entries)
 {
-	struct acpi_subtable_header *entry;
-	unsigned long table_end;
+	struct acpi_subtable_entry entry;
+	unsigned long table_end, subtable_len, entry_len;
 	int count = 0;
 	int errs = 0;
 	int i;
@@ -270,19 +315,20 @@ acpi_parse_entries_array(char *id, unsigned long table_size,
 
 	/* Parse all entries looking for a match. */
 
-	entry = (struct acpi_subtable_header *)
+	entry.type = acpi_get_subtable_type(id);
+	entry.hdr = (union acpi_subtable_headers *)
 	    ((unsigned long)table_header + table_size);
+	subtable_len = acpi_get_subtable_header_length(&entry);
 
-	while (((unsigned long)entry) + sizeof(struct acpi_subtable_header) <
-	       table_end) {
+	while (((unsigned long)entry.hdr) + subtable_len  < table_end) {
 		if (max_entries && count >= max_entries)
 			break;
 
 		for (i = 0; i < proc_num; i++) {
-			if (entry->type != proc[i].id)
+			if (acpi_get_entry_type(&entry) != proc[i].id)
 				continue;
 			if (!proc[i].handler ||
-			     (!errs && proc[i].handler(entry, table_end))) {
+			     (!errs && proc[i].handler(entry.hdr, table_end))) {
 				errs++;
 				continue;
 			}
@@ -297,13 +343,14 @@ acpi_parse_entries_array(char *id, unsigned long table_size,
 		 * If entry->length is 0, break from this loop to avoid
 		 * infinite loop.
 		 */
-		if (entry->length == 0) {
+		entry_len = acpi_get_entry_length(&entry);
+		if (entry_len == 0) {
 			pr_err("[%4.4s:0x%02x] Invalid zero length\n", id, proc->id);
 			return -EINVAL;
 		}
 
-		entry = (struct acpi_subtable_header *)
-		    ((unsigned long)entry + entry->length);
+		entry.hdr = (union acpi_subtable_headers *)
+		    ((unsigned long)entry.hdr + entry_len);
 	}
 
 	if (max_entries && count > max_entries) {
diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c
index f5fe0100f9ff..de14e06fd9ec 100644
--- a/drivers/irqchip/irq-gic-v2m.c
+++ b/drivers/irqchip/irq-gic-v2m.c
@@ -446,7 +446,7 @@ static struct fwnode_handle *gicv2m_get_fwnode(struct device *dev)
 }
 
 static int __init
-acpi_parse_madt_msi(struct acpi_subtable_header *header,
+acpi_parse_madt_msi(union acpi_subtable_headers *header,
 		    const unsigned long end)
 {
 	int ret;
diff --git a/drivers/irqchip/irq-gic-v3-its-pci-msi.c b/drivers/irqchip/irq-gic-v3-its-pci-msi.c
index 8d6d009d1d58..c81d5b81da56 100644
--- a/drivers/irqchip/irq-gic-v3-its-pci-msi.c
+++ b/drivers/irqchip/irq-gic-v3-its-pci-msi.c
@@ -159,7 +159,7 @@ static int __init its_pci_of_msi_init(void)
 #ifdef CONFIG_ACPI
 
 static int __init
-its_pci_msi_parse_madt(struct acpi_subtable_header *header,
+its_pci_msi_parse_madt(union acpi_subtable_headers *header,
 		       const unsigned long end)
 {
 	struct acpi_madt_generic_translator *its_entry;
diff --git a/drivers/irqchip/irq-gic-v3-its-platform-msi.c b/drivers/irqchip/irq-gic-v3-its-platform-msi.c
index 7b8e87b493fe..9cdcda5bb3bd 100644
--- a/drivers/irqchip/irq-gic-v3-its-platform-msi.c
+++ b/drivers/irqchip/irq-gic-v3-its-platform-msi.c
@@ -117,7 +117,7 @@ static int __init its_pmsi_init_one(struct fwnode_handle *fwnode,
 
 #ifdef CONFIG_ACPI
 static int __init
-its_pmsi_parse_madt(struct acpi_subtable_header *header,
+its_pmsi_parse_madt(union acpi_subtable_headers *header,
 			const unsigned long end)
 {
 	struct acpi_madt_generic_translator *its_entry;
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 7577755bdcf4..128ac893d7e4 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -3830,13 +3830,13 @@ static int __init acpi_get_its_numa_node(u32 its_id)
 	return NUMA_NO_NODE;
 }
 
-static int __init gic_acpi_match_srat_its(struct acpi_subtable_header *header,
+static int __init gic_acpi_match_srat_its(union acpi_subtable_headers *header,
 					  const unsigned long end)
 {
 	return 0;
 }
 
-static int __init gic_acpi_parse_srat_its(struct acpi_subtable_header *header,
+static int __init gic_acpi_parse_srat_its(union acpi_subtable_headers *header,
 			 const unsigned long end)
 {
 	int node;
@@ -3903,7 +3903,7 @@ static int __init acpi_get_its_numa_node(u32 its_id) { return NUMA_NO_NODE; }
 static void __init acpi_its_srat_maps_free(void) { }
 #endif
 
-static int __init gic_acpi_parse_madt_its(struct acpi_subtable_header *header,
+static int __init gic_acpi_parse_madt_its(union acpi_subtable_headers *header,
 					  const unsigned long end)
 {
 	struct acpi_madt_generic_translator *its_entry;
diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index 15e55d327505..f44cd89cfc40 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -1593,7 +1593,7 @@ gic_acpi_register_redist(phys_addr_t phys_base, void __iomem *redist_base)
 }
 
 static int __init
-gic_acpi_parse_madt_redist(struct acpi_subtable_header *header,
+gic_acpi_parse_madt_redist(union acpi_subtable_headers *header,
 			   const unsigned long end)
 {
 	struct acpi_madt_generic_redistributor *redist =
@@ -1611,7 +1611,7 @@ gic_acpi_parse_madt_redist(struct acpi_subtable_header *header,
 }
 
 static int __init
-gic_acpi_parse_madt_gicc(struct acpi_subtable_header *header,
+gic_acpi_parse_madt_gicc(union acpi_subtable_headers *header,
 			 const unsigned long end)
 {
 	struct acpi_madt_generic_interrupt *gicc =
@@ -1653,14 +1653,14 @@ static int __init gic_acpi_collect_gicr_base(void)
 	return -ENODEV;
 }
 
-static int __init gic_acpi_match_gicr(struct acpi_subtable_header *header,
+static int __init gic_acpi_match_gicr(union acpi_subtable_headers *header,
 				  const unsigned long end)
 {
 	/* Subtable presence means that redist exists, that's it */
 	return 0;
 }
 
-static int __init gic_acpi_match_gicc(struct acpi_subtable_header *header,
+static int __init gic_acpi_match_gicc(union acpi_subtable_headers *header,
 				      const unsigned long end)
 {
 	struct acpi_madt_generic_interrupt *gicc =
@@ -1726,7 +1726,7 @@ static bool __init acpi_validate_gic_table(struct acpi_subtable_header *header,
 	return true;
 }
 
-static int __init gic_acpi_parse_virt_madt_gicc(struct acpi_subtable_header *header,
+static int __init gic_acpi_parse_virt_madt_gicc(union acpi_subtable_headers *header,
 						const unsigned long end)
 {
 	struct acpi_madt_generic_interrupt *gicc =
diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index fd3110c171ba..c6dbe5018972 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -1495,7 +1495,7 @@ static struct
 } acpi_data __initdata;
 
 static int __init
-gic_acpi_parse_madt_cpu(struct acpi_subtable_header *header,
+gic_acpi_parse_madt_cpu(union acpi_subtable_headers *header,
 			const unsigned long end)
 {
 	struct acpi_madt_generic_interrupt *processor;
@@ -1527,7 +1527,7 @@ gic_acpi_parse_madt_cpu(struct acpi_subtable_header *header,
 }
 
 /* The things you have to do to just *count* something... */
-static int __init acpi_dummy_func(struct acpi_subtable_header *header,
+static int __init acpi_dummy_func(union acpi_subtable_headers *header,
 				  const unsigned long end)
 {
 	return 0;
diff --git a/drivers/mailbox/pcc.c b/drivers/mailbox/pcc.c
index 256f18b67e8a..08a0a3517138 100644
--- a/drivers/mailbox/pcc.c
+++ b/drivers/mailbox/pcc.c
@@ -382,7 +382,7 @@ static const struct mbox_chan_ops pcc_chan_ops = {
  *
  * This gets called for each entry in the PCC table.
  */
-static int parse_pcc_subspace(struct acpi_subtable_header *header,
+static int parse_pcc_subspace(union acpi_subtable_headers *header,
 		const unsigned long end)
 {
 	struct acpi_pcct_subspace *ss = (struct acpi_pcct_subspace *) header;
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index d5dcebd7aad3..9494d42bf507 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -141,10 +141,13 @@ enum acpi_address_range_id {
 
 
 /* Table Handlers */
+union acpi_subtable_headers {
+	struct acpi_subtable_header common;
+};
 
 typedef int (*acpi_tbl_table_handler)(struct acpi_table_header *table);
 
-typedef int (*acpi_tbl_entry_handler)(struct acpi_subtable_header *header,
+typedef int (*acpi_tbl_entry_handler)(union acpi_subtable_headers *header,
 				      const unsigned long end);
 
 /* Debugger support */
-- 
cgit v1.2.3


From e43e2657fe77a37b13643e2469670ecdb0ba5e10 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 21 Dec 2018 14:32:02 +0100
Subject: x86/dma: Remove the x86_dma_fallback_dev hack

Now that we removed support for the NULL device argument in the DMA API,
there is no need to cater for that in the x86 code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/x86/include/asm/dma-mapping.h | 10 ----------
 arch/x86/kernel/amd_gart_64.c      |  6 ------
 arch/x86/kernel/pci-dma.c          | 20 --------------------
 kernel/dma/mapping.c               |  7 -------
 4 files changed, 43 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index ce4d176b3d13..6b15a24930e0 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -13,14 +13,7 @@
 #include <asm/swiotlb.h>
 #include <linux/dma-contiguous.h>
 
-#ifdef CONFIG_ISA
-# define ISA_DMA_BIT_MASK DMA_BIT_MASK(24)
-#else
-# define ISA_DMA_BIT_MASK DMA_BIT_MASK(32)
-#endif
-
 extern int iommu_merge;
-extern struct device x86_dma_fallback_dev;
 extern int panic_on_overflow;
 
 extern const struct dma_map_ops *dma_ops;
@@ -30,7 +23,4 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 	return dma_ops;
 }
 
-bool arch_dma_alloc_attrs(struct device **dev);
-#define arch_dma_alloc_attrs arch_dma_alloc_attrs
-
 #endif
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 2c0aa34af69c..bf7f13ea3c64 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -233,9 +233,6 @@ static dma_addr_t gart_map_page(struct device *dev, struct page *page,
 	unsigned long bus;
 	phys_addr_t paddr = page_to_phys(page) + offset;
 
-	if (!dev)
-		dev = &x86_dma_fallback_dev;
-
 	if (!need_iommu(dev, paddr, size))
 		return paddr;
 
@@ -392,9 +389,6 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
 	if (nents == 0)
 		return 0;
 
-	if (!dev)
-		dev = &x86_dma_fallback_dev;
-
 	out		= 0;
 	start		= 0;
 	start_sg	= sg;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index d460998ae828..dcd272dbd0a9 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -51,14 +51,6 @@ int iommu_pass_through __read_mostly;
 
 extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
 
-/* Dummy device used for NULL arguments (normally ISA). */
-struct device x86_dma_fallback_dev = {
-	.init_name = "fallback device",
-	.coherent_dma_mask = ISA_DMA_BIT_MASK,
-	.dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
-};
-EXPORT_SYMBOL(x86_dma_fallback_dev);
-
 void __init pci_iommu_alloc(void)
 {
 	struct iommu_table_entry *p;
@@ -77,18 +69,6 @@ void __init pci_iommu_alloc(void)
 	}
 }
 
-bool arch_dma_alloc_attrs(struct device **dev)
-{
-	if (!*dev)
-		*dev = &x86_dma_fallback_dev;
-
-	if (!is_device_dma_capable(*dev))
-		return false;
-	return true;
-
-}
-EXPORT_SYMBOL(arch_dma_alloc_attrs);
-
 /*
  * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
  * parameter documentation.
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index c000906348c9..685a53f2a793 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -238,10 +238,6 @@ u64 dma_get_required_mask(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(dma_get_required_mask);
 
-#ifndef arch_dma_alloc_attrs
-#define arch_dma_alloc_attrs(dev)	(true)
-#endif
-
 void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		gfp_t flag, unsigned long attrs)
 {
@@ -256,9 +252,6 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
 	/* let the implementation decide on the zone to allocate from: */
 	flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
 
-	if (!arch_dma_alloc_attrs(&dev))
-		return NULL;
-
 	if (dma_is_direct(ops))
 		cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs);
 	else if (ops->alloc)
-- 
cgit v1.2.3


From 39ea9baffda91df8bfee9b45610242a3191ea1ec Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 3 Apr 2019 18:41:30 +0200
Subject: x86/fpu: Remove fpu->initialized usage in __fpu__restore_sig()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is a preparation for the removal of the ->initialized member in the
fpu struct.

__fpu__restore_sig() is deactivating the FPU via fpu__drop() and then
setting manually ->initialized followed by fpu__restore(). The result is
that it is possible to manipulate fpu->state and the state of registers
won't be saved/restored on a context switch which would overwrite
fpu->state:

fpu__drop(fpu):
  ...
  fpu->initialized = 0;
  preempt_enable();

  <--- context switch

Don't access the fpu->state while the content is read from user space
and examined/sanitized. Use a temporary kmalloc() buffer for the
preparation of the FPU registers and once the state is considered okay,
load it. Should something go wrong, return with an error and without
altering the original FPU registers.

The removal of fpu__initialize() is a nop because fpu->initialized is
already set for the user task.

 [ bp: Massage a bit. ]

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190403164156.19645-2-bigeasy@linutronix.de
---
 arch/x86/include/asm/fpu/signal.h |  2 +-
 arch/x86/kernel/fpu/regset.c      |  5 ++---
 arch/x86/kernel/fpu/signal.c      | 40 +++++++++++++++------------------------
 3 files changed, 18 insertions(+), 29 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h
index 44bbc39a57b3..7fb516b6893a 100644
--- a/arch/x86/include/asm/fpu/signal.h
+++ b/arch/x86/include/asm/fpu/signal.h
@@ -22,7 +22,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig,
 
 extern void convert_from_fxsr(struct user_i387_ia32_struct *env,
 			      struct task_struct *tsk);
-extern void convert_to_fxsr(struct task_struct *tsk,
+extern void convert_to_fxsr(struct fxregs_state *fxsave,
 			    const struct user_i387_ia32_struct *env);
 
 unsigned long
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index bc02f5144b95..5dbc099178a8 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -269,11 +269,10 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
 		memcpy(&to[i], &from[i], sizeof(to[0]));
 }
 
-void convert_to_fxsr(struct task_struct *tsk,
+void convert_to_fxsr(struct fxregs_state *fxsave,
 		     const struct user_i387_ia32_struct *env)
 
 {
-	struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave;
 	struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
 	struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
 	int i;
@@ -350,7 +349,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
 
 	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
 	if (!ret)
-		convert_to_fxsr(target, &env);
+		convert_to_fxsr(&target->thread.fpu.state.fxsave, &env);
 
 	/*
 	 * update the header bit in the xsave header, indicating the
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 55b80de13ea5..7296a9bb78e7 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -207,11 +207,11 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
 }
 
 static inline void
-sanitize_restored_xstate(struct task_struct *tsk,
+sanitize_restored_xstate(union fpregs_state *state,
 			 struct user_i387_ia32_struct *ia32_env,
 			 u64 xfeatures, int fx_only)
 {
-	struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
+	struct xregs_state *xsave = &state->xsave;
 	struct xstate_header *header = &xsave->header;
 
 	if (use_xsave()) {
@@ -238,7 +238,7 @@ sanitize_restored_xstate(struct task_struct *tsk,
 		 */
 		xsave->i387.mxcsr &= mxcsr_feature_mask;
 
-		convert_to_fxsr(tsk, ia32_env);
+		convert_to_fxsr(&state->fxsave, ia32_env);
 	}
 }
 
@@ -284,8 +284,6 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 	if (!access_ok(buf, size))
 		return -EACCES;
 
-	fpu__initialize(fpu);
-
 	if (!static_cpu_has(X86_FEATURE_FPU))
 		return fpregs_soft_set(current, NULL,
 				       0, sizeof(struct user_i387_ia32_struct),
@@ -315,40 +313,32 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 		 * header. Validate and sanitize the copied state.
 		 */
 		struct user_i387_ia32_struct env;
+		union fpregs_state *state;
 		int err = 0;
+		void *tmp;
 
-		/*
-		 * Drop the current fpu which clears fpu->initialized. This ensures
-		 * that any context-switch during the copy of the new state,
-		 * avoids the intermediate state from getting restored/saved.
-		 * Thus avoiding the new restored state from getting corrupted.
-		 * We will be ready to restore/save the state only after
-		 * fpu->initialized is again set.
-		 */
-		fpu__drop(fpu);
+		tmp = kzalloc(sizeof(*state) + fpu_kernel_xstate_size + 64, GFP_KERNEL);
+		if (!tmp)
+			return -ENOMEM;
+		state = PTR_ALIGN(tmp, 64);
 
 		if (using_compacted_format()) {
-			err = copy_user_to_xstate(&fpu->state.xsave, buf_fx);
+			err = copy_user_to_xstate(&state->xsave, buf_fx);
 		} else {
-			err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size);
+			err = __copy_from_user(&state->xsave, buf_fx, state_size);
 
 			if (!err && state_size > offsetof(struct xregs_state, header))
-				err = validate_xstate_header(&fpu->state.xsave.header);
+				err = validate_xstate_header(&state->xsave.header);
 		}
 
 		if (err || __copy_from_user(&env, buf, sizeof(env))) {
-			fpstate_init(&fpu->state);
-			trace_x86_fpu_init_state(fpu);
 			err = -1;
 		} else {
-			sanitize_restored_xstate(tsk, &env, xfeatures, fx_only);
+			sanitize_restored_xstate(state, &env, xfeatures, fx_only);
+			copy_kernel_to_fpregs(state);
 		}
 
-		local_bh_disable();
-		fpu->initialized = 1;
-		fpu__restore(fpu);
-		local_bh_enable();
-
+		kfree(tmp);
 		return err;
 	} else {
 		/*
-- 
cgit v1.2.3


From 6dd677a044e606fd343e31c2108b13d74aec1ca5 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 3 Apr 2019 18:41:31 +0200
Subject: x86/fpu: Remove fpu__restore()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are no users of fpu__restore() so it is time to remove it. The
comment regarding fpu__restore() and TS bit is stale since commit

  b3b0870ef3ffe ("i387: do not preload FPU state at task switch time")

and has no meaning since.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Aubrey Li <aubrey.li@intel.com>
Cc: Babu Moger <Babu.Moger@amd.com>
Cc: "Chang S. Bae" <chang.seok.bae@intel.com>
Cc: Dmitry Safonov <dima@arista.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: linux-doc@vger.kernel.org
Cc: Nicolai Stange <nstange@suse.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190403164156.19645-3-bigeasy@linutronix.de
---
 Documentation/preempt-locking.txt   |  1 -
 arch/x86/include/asm/fpu/internal.h |  1 -
 arch/x86/kernel/fpu/core.c          | 24 ------------------------
 arch/x86/kernel/process_32.c        |  4 +---
 arch/x86/kernel/process_64.c        |  4 +---
 5 files changed, 2 insertions(+), 32 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/Documentation/preempt-locking.txt b/Documentation/preempt-locking.txt
index 509f5a422d57..dce336134e54 100644
--- a/Documentation/preempt-locking.txt
+++ b/Documentation/preempt-locking.txt
@@ -52,7 +52,6 @@ preemption must be disabled around such regions.
 
 Note, some FPU functions are already explicitly preempt safe.  For example,
 kernel_fpu_begin and kernel_fpu_end will disable and enable preemption.
-However, fpu__restore() must be called with preemption disabled.
 
 
 RULE #3: Lock acquire and release must be performed by same task
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index fb04a3ded7dd..75a1d5f712ee 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -28,7 +28,6 @@ extern void fpu__initialize(struct fpu *fpu);
 extern void fpu__prepare_read(struct fpu *fpu);
 extern void fpu__prepare_write(struct fpu *fpu);
 extern void fpu__save(struct fpu *fpu);
-extern void fpu__restore(struct fpu *fpu);
 extern int  fpu__restore_sig(void __user *buf, int ia32_frame);
 extern void fpu__drop(struct fpu *fpu);
 extern int  fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu);
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 2e5003fef51a..1d3ae7988f7f 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -303,30 +303,6 @@ void fpu__prepare_write(struct fpu *fpu)
 	}
 }
 
-/*
- * 'fpu__restore()' is called to copy FPU registers from
- * the FPU fpstate to the live hw registers and to activate
- * access to the hardware registers, so that FPU instructions
- * can be used afterwards.
- *
- * Must be called with kernel preemption disabled (for example
- * with local interrupts disabled, as it is in the case of
- * do_device_not_available()).
- */
-void fpu__restore(struct fpu *fpu)
-{
-	fpu__initialize(fpu);
-
-	/* Avoid __kernel_fpu_begin() right after fpregs_activate() */
-	kernel_fpu_disable();
-	trace_x86_fpu_before_restore(fpu);
-	fpregs_activate(fpu);
-	copy_kernel_to_fpregs(&fpu->state);
-	trace_x86_fpu_after_restore(fpu);
-	kernel_fpu_enable();
-}
-EXPORT_SYMBOL_GPL(fpu__restore);
-
 /*
  * Drops current FPU state: deactivates the fpregs and
  * the fpstate. NOTE: it still leaves previous contents
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index e471d8e6f0b2..7888a41a03cd 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -267,9 +267,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	/*
 	 * Leave lazy mode, flushing any hypercalls made here.
 	 * This must be done before restoring TLS segments so
-	 * the GDT and LDT are properly updated, and must be
-	 * done before fpu__restore(), so the TS bit is up
-	 * to date.
+	 * the GDT and LDT are properly updated.
 	 */
 	arch_end_context_switch(next_p);
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 6a62f4af9fcf..e1983b3a16c4 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -538,9 +538,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	/*
 	 * Leave lazy mode, flushing any hypercalls made here.  This
 	 * must be done after loading TLS entries in the GDT but before
-	 * loading segments that might reference them, and and it must
-	 * be done before fpu__restore(), so the TS bit is up to
-	 * date.
+	 * loading segments that might reference them.
 	 */
 	arch_end_context_switch(next_p);
 
-- 
cgit v1.2.3


From 60e528d6ce3f60a058bbb64f8acb2a07f84b172a Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 3 Apr 2019 18:41:32 +0200
Subject: x86/fpu: Remove preempt_disable() in fpu__clear()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The preempt_disable() section was introduced in commit

  a10b6a16cdad8 ("x86/fpu: Make the fpu state change in fpu__clear() scheduler-atomic")

and it was said to be temporary.

fpu__initialize() initializes the FPU struct to its initial value and
then sets ->initialized to 1. The last part is the important one.
The content of the state does not matter because it gets set via
copy_init_fpstate_to_fpregs().

A preemption here has little meaning because the registers will always be
set to the same content after copy_init_fpstate_to_fpregs(). A softirq
with a kernel_fpu_begin() could also force to save FPU's registers after
fpu__initialize() without changing the outcome here.

Remove the preempt_disable() section in fpu__clear(), preemption here
does not hurt.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: Nicolai Stange <nstange@suse.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190403164156.19645-4-bigeasy@linutronix.de
---
 arch/x86/kernel/fpu/core.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 1d3ae7988f7f..1940319268ae 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -366,11 +366,9 @@ void fpu__clear(struct fpu *fpu)
 	 * Make sure fpstate is cleared and initialized.
 	 */
 	if (static_cpu_has(X86_FEATURE_FPU)) {
-		preempt_disable();
 		fpu__initialize(fpu);
 		user_fpu_begin();
 		copy_init_fpstate_to_fpregs();
-		preempt_enable();
 	}
 }
 
-- 
cgit v1.2.3


From 88f5260a3bf9bfb276b5b4aac2e81587e425a1d7 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 3 Apr 2019 18:41:33 +0200
Subject: x86/fpu: Always init the state in fpu__clear()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fpu__clear() only initializes the state if the CPU has FPU support.
This initialisation is also required for FPU-less systems and takes
place in math_emulate(). Since fpu__initialize() only performs the
initialization if ->initialized is zero it does not matter that it
is invoked each time an opcode is emulated. It makes the removal of
->initialized easier if the struct is also initialized in the FPU-less
case at the same time.

Move fpu__initialize() before the FPU feature check so it is also
performed in the FPU-less case too.

 [ bp: Massage a bit. ]

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Aubrey Li <aubrey.li@intel.com>
Cc: Bill Metzenthen <billm@melbpc.org.au>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: Nicolai Stange <nstange@suse.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190403164156.19645-5-bigeasy@linutronix.de
---
 arch/x86/include/asm/fpu/internal.h | 1 -
 arch/x86/kernel/fpu/core.c          | 5 ++---
 arch/x86/math-emu/fpu_entry.c       | 3 ---
 3 files changed, 2 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 75a1d5f712ee..70ecb7c032cb 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -24,7 +24,6 @@
 /*
  * High level FPU state handling functions:
  */
-extern void fpu__initialize(struct fpu *fpu);
 extern void fpu__prepare_read(struct fpu *fpu);
 extern void fpu__prepare_write(struct fpu *fpu);
 extern void fpu__save(struct fpu *fpu);
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 1940319268ae..e43296854e37 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -223,7 +223,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
  * Activate the current task's in-memory FPU context,
  * if it has not been used before:
  */
-void fpu__initialize(struct fpu *fpu)
+static void fpu__initialize(struct fpu *fpu)
 {
 	WARN_ON_FPU(fpu != &current->thread.fpu);
 
@@ -236,7 +236,6 @@ void fpu__initialize(struct fpu *fpu)
 		fpu->initialized = 1;
 	}
 }
-EXPORT_SYMBOL_GPL(fpu__initialize);
 
 /*
  * This function must be called before we read a task's fpstate.
@@ -365,8 +364,8 @@ void fpu__clear(struct fpu *fpu)
 	/*
 	 * Make sure fpstate is cleared and initialized.
 	 */
+	fpu__initialize(fpu);
 	if (static_cpu_has(X86_FEATURE_FPU)) {
-		fpu__initialize(fpu);
 		user_fpu_begin();
 		copy_init_fpstate_to_fpregs();
 	}
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index 9e2ba7e667f6..a873da6b46d6 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -113,9 +113,6 @@ void math_emulate(struct math_emu_info *info)
 	unsigned long code_base = 0;
 	unsigned long code_limit = 0;	/* Initialized to stop compiler warnings */
 	struct desc_struct code_descriptor;
-	struct fpu *fpu = &current->thread.fpu;
-
-	fpu__initialize(fpu);
 
 #ifdef RE_ENTRANT_CHECKING
 	if (emulating) {
-- 
cgit v1.2.3


From fbcc9e0c37ba3186c41b5eb1aee2f7f3b711bc1e Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 3 Apr 2019 18:41:34 +0200
Subject: x86/fpu: Remove fpu->initialized usage in copy_fpstate_to_sigframe()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With lazy-FPU support the (now named variable) ->initialized was set to
true if the CPU's FPU registers were holding a valid state of the
FPU registers for the active process. If it was set to false then the
FPU state was saved in fpu->state and the FPU was deactivated.

With lazy-FPU gone, ->initialized is always true for user threads and
kernel threads never call this function so ->initialized is always true
in copy_fpstate_to_sigframe().

The using_compacted_format() check is also a leftover from the lazy-FPU
time. In the

  ->initialized == false

case copy_to_user() would copy the compacted buffer while userland would
expect the non-compacted format instead. So in order to save the FPU
state in the non-compacted form it issues XSAVE to save the *current*
FPU state.

If the FPU is not enabled, the attempt raises the FPU trap, the trap
restores the FPU contents and re-enables the FPU and XSAVE is invoked
again and succeeds.

*This* does not longer work since commit

  bef8b6da9522 ("x86/fpu: Handle #NM without FPU emulation as an error")

Remove the check for ->initialized because it is always true and remove
the false condition. Update the comment to reflect that the state is
always live.

 [ bp: Massage. ]

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190403164156.19645-6-bigeasy@linutronix.de
---
 arch/x86/kernel/fpu/signal.c | 35 ++++++++---------------------------
 1 file changed, 8 insertions(+), 27 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 7296a9bb78e7..c1a5999affa0 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -144,9 +144,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
  *	buf == buf_fx for 64-bit frames and 32-bit fsave frame.
  *	buf != buf_fx for 32-bit frames with fxstate.
  *
- * If the fpu, extended register state is live, save the state directly
- * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise,
- * copy the thread's fpu state to the user frame starting at 'buf_fx'.
+ * Save the state directly to the user frame pointed by the aligned pointer
+ * 'buf_fx'.
  *
  * If this is a 32-bit frame with fxstate, put a fsave header before
  * the aligned state at 'buf_fx'.
@@ -157,7 +156,6 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
 int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
 {
 	struct fpu *fpu = &current->thread.fpu;
-	struct xregs_state *xsave = &fpu->state.xsave;
 	struct task_struct *tsk = current;
 	int ia32_fxstate = (buf != buf_fx);
 
@@ -172,29 +170,12 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
 			sizeof(struct user_i387_ia32_struct), NULL,
 			(struct _fpstate_32 __user *) buf) ? -1 : 1;
 
-	if (fpu->initialized || using_compacted_format()) {
-		/* Save the live register state to the user directly. */
-		if (copy_fpregs_to_sigframe(buf_fx))
-			return -1;
-		/* Update the thread's fxstate to save the fsave header. */
-		if (ia32_fxstate)
-			copy_fxregs_to_kernel(fpu);
-	} else {
-		/*
-		 * It is a *bug* if kernel uses compacted-format for xsave
-		 * area and we copy it out directly to a signal frame. It
-		 * should have been handled above by saving the registers
-		 * directly.
-		 */
-		if (boot_cpu_has(X86_FEATURE_XSAVES)) {
-			WARN_ONCE(1, "x86/fpu: saving compacted-format xsave area to a signal frame!\n");
-			return -1;
-		}
-
-		fpstate_sanitize_xstate(fpu);
-		if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size))
-			return -1;
-	}
+	/* Save the live registers state to the user frame directly. */
+	if (copy_fpregs_to_sigframe(buf_fx))
+		return -1;
+	/* Update the thread's fxstate to save the fsave header. */
+	if (ia32_fxstate)
+		copy_fxregs_to_kernel(fpu);
 
 	/* Save the fsave header for the 32-bit frames. */
 	if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf))
-- 
cgit v1.2.3


From 39388e80f9b0c3788bfb6efe3054bdce0c3ead45 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 3 Apr 2019 18:41:35 +0200
Subject: x86/fpu: Don't save fxregs for ia32 frames in
 copy_fpstate_to_sigframe()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In commit

  72a671ced66db ("x86, fpu: Unify signal handling code paths for x86 and x86_64 kernels")

the 32bit and 64bit path of the signal delivery code were merged.

The 32bit version:

  int save_i387_xstate_ia32(void __user *buf)
  …
         if (cpu_has_xsave)
                 return save_i387_xsave(fp);
         if (cpu_has_fxsr)
                 return save_i387_fxsave(fp);

The 64bit version:

  int save_i387_xstate(void __user *buf)
  …
         if (user_has_fpu()) {
                 if (use_xsave())
                         err = xsave_user(buf);
                 else
                         err = fxsave_user(buf);

                 if (unlikely(err)) {
                         __clear_user(buf, xstate_size);
                         return err;

The merge:

  int save_xstate_sig(void __user *buf, void __user *buf_fx, int size)
  …
         if (user_has_fpu()) {
                 /* Save the live register state to the user directly. */
                 if (save_user_xstate(buf_fx))
                         return -1;
                 /* Update the thread's fxstate to save the fsave header. */
                 if (ia32_fxstate)
                         fpu_fxsave(&tsk->thread.fpu);

I don't think that we needed to save the FPU registers to ->thread.fpu
because the registers were stored in buf_fx. Today the state will be
restored from buf_fx after the signal was handled (I assume that this
was also the case with lazy-FPU).

Since commit

  66463db4fc560 ("x86, fpu: shift drop_init_fpu() from save_xstate_sig() to handle_signal()")

it is ensured that the signal handler starts with clear/fresh set of FPU
registers which means that the previous store is futile.

Remove the copy_fxregs_to_kernel() call because task's FPU state is
cleared later in handle_signal() via fpu__clear().

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190403164156.19645-7-bigeasy@linutronix.de
---
 arch/x86/kernel/fpu/signal.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index c1a5999affa0..34989d2a8893 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -155,7 +155,6 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
  */
 int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
 {
-	struct fpu *fpu = &current->thread.fpu;
 	struct task_struct *tsk = current;
 	int ia32_fxstate = (buf != buf_fx);
 
@@ -173,9 +172,6 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
 	/* Save the live registers state to the user frame directly. */
 	if (copy_fpregs_to_sigframe(buf_fx))
 		return -1;
-	/* Update the thread's fxstate to save the fsave header. */
-	if (ia32_fxstate)
-		copy_fxregs_to_kernel(fpu);
 
 	/* Save the fsave header for the 32-bit frames. */
 	if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf))
-- 
cgit v1.2.3


From 2722146eb78451b30e4717a267a3a2b44e4ad317 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 3 Apr 2019 18:41:36 +0200
Subject: x86/fpu: Remove fpu->initialized
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The struct fpu.initialized member is always set to one for user tasks
and zero for kernel tasks. This avoids saving/restoring the FPU
registers for kernel threads.

The ->initialized = 0 case for user tasks has been removed in previous
changes, for instance, by doing an explicit unconditional init at fork()
time for FPU-less systems which was otherwise delayed until the emulated
opcode.

The context switch code (switch_fpu_prepare() + switch_fpu_finish())
can't unconditionally save/restore registers for kernel threads. Not
only would it slow down the switch but also load a zeroed xcomp_bv for
XSAVES.

For kernel_fpu_begin() (+end) the situation is similar: EFI with runtime
services uses this before alternatives_patched is true. Which means that
this function is used too early and it wasn't the case before.

For those two cases, use current->mm to distinguish between user and
kernel thread. For kernel_fpu_begin() skip save/restore of the FPU
registers.

During the context switch into a kernel thread don't do anything. There
is no reason to save the FPU state of a kernel thread.

The reordering in __switch_to() is important because the current()
pointer needs to be valid before switch_fpu_finish() is invoked so ->mm
is seen of the new task instead the old one.

N.B.: fpu__save() doesn't need to check ->mm because it is called by
user tasks only.

 [ bp: Massage. ]

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Aubrey Li <aubrey.li@intel.com>
Cc: Babu Moger <Babu.Moger@amd.com>
Cc: "Chang S. Bae" <chang.seok.bae@intel.com>
Cc: Dmitry Safonov <dima@arista.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Nicolai Stange <nstange@suse.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190403164156.19645-8-bigeasy@linutronix.de
---
 arch/x86/ia32/ia32_signal.c         | 17 ++++-----
 arch/x86/include/asm/fpu/internal.h | 18 +++++-----
 arch/x86/include/asm/fpu/types.h    |  9 -----
 arch/x86/include/asm/trace/fpu.h    |  5 +--
 arch/x86/kernel/fpu/core.c          | 70 +++++++++++--------------------------
 arch/x86/kernel/fpu/init.c          |  2 --
 arch/x86/kernel/fpu/regset.c        | 19 +++-------
 arch/x86/kernel/fpu/xstate.c        |  2 --
 arch/x86/kernel/process_32.c        |  4 +--
 arch/x86/kernel/process_64.c        |  4 +--
 arch/x86/kernel/signal.c            | 17 ++++-----
 arch/x86/mm/pkeys.c                 |  7 +---
 12 files changed, 53 insertions(+), 121 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 321fe5f5d0e9..6eeb3249f22f 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -216,8 +216,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
 				 size_t frame_size,
 				 void __user **fpstate)
 {
-	struct fpu *fpu = &current->thread.fpu;
-	unsigned long sp;
+	unsigned long sp, fx_aligned, math_size;
 
 	/* Default to using normal stack */
 	sp = regs->sp;
@@ -231,15 +230,11 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
 		 ksig->ka.sa.sa_restorer)
 		sp = (unsigned long) ksig->ka.sa.sa_restorer;
 
-	if (fpu->initialized) {
-		unsigned long fx_aligned, math_size;
-
-		sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size);
-		*fpstate = (struct _fpstate_32 __user *) sp;
-		if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned,
-				    math_size) < 0)
-			return (void __user *) -1L;
-	}
+	sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size);
+	*fpstate = (struct _fpstate_32 __user *) sp;
+	if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned,
+				     math_size) < 0)
+		return (void __user *) -1L;
 
 	sp -= frame_size;
 	/* Align the stack pointer according to the i386 ABI,
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 70ecb7c032cb..04042eacc852 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -494,11 +494,14 @@ static inline void fpregs_activate(struct fpu *fpu)
  *
  *  - switch_fpu_finish() restores the new state as
  *    necessary.
+ *
+ * The FPU context is only stored/restored for a user task and
+ * ->mm is used to distinguish between kernel and user threads.
  */
 static inline void
 switch_fpu_prepare(struct fpu *old_fpu, int cpu)
 {
-	if (static_cpu_has(X86_FEATURE_FPU) && old_fpu->initialized) {
+	if (static_cpu_has(X86_FEATURE_FPU) && current->mm) {
 		if (!copy_fpregs_to_fpstate(old_fpu))
 			old_fpu->last_cpu = -1;
 		else
@@ -506,8 +509,7 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu)
 
 		/* But leave fpu_fpregs_owner_ctx! */
 		trace_x86_fpu_regs_deactivated(old_fpu);
-	} else
-		old_fpu->last_cpu = -1;
+	}
 }
 
 /*
@@ -520,12 +522,12 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu)
  */
 static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu)
 {
-	bool preload = static_cpu_has(X86_FEATURE_FPU) &&
-		       new_fpu->initialized;
+	if (static_cpu_has(X86_FEATURE_FPU)) {
+		if (!fpregs_state_valid(new_fpu, cpu)) {
+			if (current->mm)
+				copy_kernel_to_fpregs(&new_fpu->state);
+		}
 
-	if (preload) {
-		if (!fpregs_state_valid(new_fpu, cpu))
-			copy_kernel_to_fpregs(&new_fpu->state);
 		fpregs_activate(new_fpu);
 	}
 }
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 2e32e178e064..f098f6cab94b 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -293,15 +293,6 @@ struct fpu {
 	 */
 	unsigned int			last_cpu;
 
-	/*
-	 * @initialized:
-	 *
-	 * This flag indicates whether this context is initialized: if the task
-	 * is not running then we can restore from this context, if the task
-	 * is running then we should save into this context.
-	 */
-	unsigned char			initialized;
-
 	/*
 	 * @avx512_timestamp:
 	 *
diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h
index 069c04be1507..bd65f6ba950f 100644
--- a/arch/x86/include/asm/trace/fpu.h
+++ b/arch/x86/include/asm/trace/fpu.h
@@ -13,22 +13,19 @@ DECLARE_EVENT_CLASS(x86_fpu,
 
 	TP_STRUCT__entry(
 		__field(struct fpu *, fpu)
-		__field(bool, initialized)
 		__field(u64, xfeatures)
 		__field(u64, xcomp_bv)
 		),
 
 	TP_fast_assign(
 		__entry->fpu		= fpu;
-		__entry->initialized	= fpu->initialized;
 		if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
 			__entry->xfeatures = fpu->state.xsave.header.xfeatures;
 			__entry->xcomp_bv  = fpu->state.xsave.header.xcomp_bv;
 		}
 	),
-	TP_printk("x86/fpu: %p initialized: %d xfeatures: %llx xcomp_bv: %llx",
+	TP_printk("x86/fpu: %p xfeatures: %llx xcomp_bv: %llx",
 			__entry->fpu,
-			__entry->initialized,
 			__entry->xfeatures,
 			__entry->xcomp_bv
 	)
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index e43296854e37..97e27de2b7c0 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -101,7 +101,7 @@ static void __kernel_fpu_begin(void)
 
 	kernel_fpu_disable();
 
-	if (fpu->initialized) {
+	if (current->mm) {
 		/*
 		 * Ignore return value -- we don't care if reg state
 		 * is clobbered.
@@ -116,7 +116,7 @@ static void __kernel_fpu_end(void)
 {
 	struct fpu *fpu = &current->thread.fpu;
 
-	if (fpu->initialized)
+	if (current->mm)
 		copy_kernel_to_fpregs(&fpu->state);
 
 	kernel_fpu_enable();
@@ -147,11 +147,10 @@ void fpu__save(struct fpu *fpu)
 
 	preempt_disable();
 	trace_x86_fpu_before_save(fpu);
-	if (fpu->initialized) {
-		if (!copy_fpregs_to_fpstate(fpu)) {
-			copy_kernel_to_fpregs(&fpu->state);
-		}
-	}
+
+	if (!copy_fpregs_to_fpstate(fpu))
+		copy_kernel_to_fpregs(&fpu->state);
+
 	trace_x86_fpu_after_save(fpu);
 	preempt_enable();
 }
@@ -190,7 +189,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
 {
 	dst_fpu->last_cpu = -1;
 
-	if (!src_fpu->initialized || !static_cpu_has(X86_FEATURE_FPU))
+	if (!static_cpu_has(X86_FEATURE_FPU))
 		return 0;
 
 	WARN_ON_FPU(src_fpu != &current->thread.fpu);
@@ -227,14 +226,10 @@ static void fpu__initialize(struct fpu *fpu)
 {
 	WARN_ON_FPU(fpu != &current->thread.fpu);
 
-	if (!fpu->initialized) {
-		fpstate_init(&fpu->state);
-		trace_x86_fpu_init_state(fpu);
+	fpstate_init(&fpu->state);
+	trace_x86_fpu_init_state(fpu);
 
-		trace_x86_fpu_activate_state(fpu);
-		/* Safe to do for the current task: */
-		fpu->initialized = 1;
-	}
+	trace_x86_fpu_activate_state(fpu);
 }
 
 /*
@@ -247,32 +242,20 @@ static void fpu__initialize(struct fpu *fpu)
  *
  * - or it's called for stopped tasks (ptrace), in which case the
  *   registers were already saved by the context-switch code when
- *   the task scheduled out - we only have to initialize the registers
- *   if they've never been initialized.
+ *   the task scheduled out.
  *
  * If the task has used the FPU before then save it.
  */
 void fpu__prepare_read(struct fpu *fpu)
 {
-	if (fpu == &current->thread.fpu) {
+	if (fpu == &current->thread.fpu)
 		fpu__save(fpu);
-	} else {
-		if (!fpu->initialized) {
-			fpstate_init(&fpu->state);
-			trace_x86_fpu_init_state(fpu);
-
-			trace_x86_fpu_activate_state(fpu);
-			/* Safe to do for current and for stopped child tasks: */
-			fpu->initialized = 1;
-		}
-	}
 }
 
 /*
  * This function must be called before we write a task's fpstate.
  *
- * If the task has used the FPU before then invalidate any cached FPU registers.
- * If the task has not used the FPU before then initialize its fpstate.
+ * Invalidate any cached FPU registers.
  *
  * After this function call, after registers in the fpstate are
  * modified and the child task has woken up, the child task will
@@ -289,17 +272,8 @@ void fpu__prepare_write(struct fpu *fpu)
 	 */
 	WARN_ON_FPU(fpu == &current->thread.fpu);
 
-	if (fpu->initialized) {
-		/* Invalidate any cached state: */
-		__fpu_invalidate_fpregs_state(fpu);
-	} else {
-		fpstate_init(&fpu->state);
-		trace_x86_fpu_init_state(fpu);
-
-		trace_x86_fpu_activate_state(fpu);
-		/* Safe to do for stopped child tasks: */
-		fpu->initialized = 1;
-	}
+	/* Invalidate any cached state: */
+	__fpu_invalidate_fpregs_state(fpu);
 }
 
 /*
@@ -316,17 +290,13 @@ void fpu__drop(struct fpu *fpu)
 	preempt_disable();
 
 	if (fpu == &current->thread.fpu) {
-		if (fpu->initialized) {
-			/* Ignore delayed exceptions from user space */
-			asm volatile("1: fwait\n"
-				     "2:\n"
-				     _ASM_EXTABLE(1b, 2b));
-			fpregs_deactivate(fpu);
-		}
+		/* Ignore delayed exceptions from user space */
+		asm volatile("1: fwait\n"
+			     "2:\n"
+			     _ASM_EXTABLE(1b, 2b));
+		fpregs_deactivate(fpu);
 	}
 
-	fpu->initialized = 0;
-
 	trace_x86_fpu_dropped(fpu);
 
 	preempt_enable();
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 6abd83572b01..20d8fa7124c7 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -239,8 +239,6 @@ static void __init fpu__init_system_ctx_switch(void)
 
 	WARN_ON_FPU(!on_boot_cpu);
 	on_boot_cpu = 0;
-
-	WARN_ON_FPU(current->thread.fpu.initialized);
 }
 
 /*
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index 5dbc099178a8..d652b939ccfb 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -15,16 +15,12 @@
  */
 int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset)
 {
-	struct fpu *target_fpu = &target->thread.fpu;
-
-	return target_fpu->initialized ? regset->n : 0;
+	return regset->n;
 }
 
 int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset)
 {
-	struct fpu *target_fpu = &target->thread.fpu;
-
-	if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->initialized)
+	if (boot_cpu_has(X86_FEATURE_FXSR))
 		return regset->n;
 	else
 		return 0;
@@ -370,16 +366,9 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
 int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu)
 {
 	struct task_struct *tsk = current;
-	struct fpu *fpu = &tsk->thread.fpu;
-	int fpvalid;
-
-	fpvalid = fpu->initialized;
-	if (fpvalid)
-		fpvalid = !fpregs_get(tsk, NULL,
-				      0, sizeof(struct user_i387_ia32_struct),
-				      ufpu, NULL);
 
-	return fpvalid;
+	return !fpregs_get(tsk, NULL, 0, sizeof(struct user_i387_ia32_struct),
+			   ufpu, NULL);
 }
 EXPORT_SYMBOL(dump_fpu);
 
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index d7432c2b1051..8bfcc5b9e04b 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -892,8 +892,6 @@ const void *get_xsave_field_ptr(int xsave_state)
 {
 	struct fpu *fpu = &current->thread.fpu;
 
-	if (!fpu->initialized)
-		return NULL;
 	/*
 	 * fpu__save() takes the CPU's xstate registers
 	 * and saves them off to the 'fpu memory buffer.
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 7888a41a03cd..77d9eb43ccac 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -288,10 +288,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	if (prev->gs | next->gs)
 		lazy_load_gs(next->gs);
 
-	switch_fpu_finish(next_fpu, cpu);
-
 	this_cpu_write(current_task, next_p);
 
+	switch_fpu_finish(next_fpu, cpu);
+
 	/* Load the Intel cache allocation PQR MSR. */
 	resctrl_sched_in();
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index e1983b3a16c4..ffea7c557963 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -566,14 +566,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	x86_fsgsbase_load(prev, next);
 
-	switch_fpu_finish(next_fpu, cpu);
-
 	/*
 	 * Switch the PDA and FPU contexts.
 	 */
 	this_cpu_write(current_task, next_p);
 	this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
 
+	switch_fpu_finish(next_fpu, cpu);
+
 	/* Reload sp0. */
 	update_task_stack(next_p);
 
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index b419e1a1a0ce..87b327c6cb10 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -246,7 +246,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
 	unsigned long sp = regs->sp;
 	unsigned long buf_fx = 0;
 	int onsigstack = on_sig_stack(sp);
-	struct fpu *fpu = &current->thread.fpu;
+	int ret;
 
 	/* redzone */
 	if (IS_ENABLED(CONFIG_X86_64))
@@ -265,11 +265,9 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
 		sp = (unsigned long) ka->sa.sa_restorer;
 	}
 
-	if (fpu->initialized) {
-		sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32),
-					  &buf_fx, &math_size);
-		*fpstate = (void __user *)sp;
-	}
+	sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32),
+				  &buf_fx, &math_size);
+	*fpstate = (void __user *)sp;
 
 	sp = align_sigframe(sp - frame_size);
 
@@ -281,8 +279,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
 		return (void __user *)-1L;
 
 	/* save i387 and extended state */
-	if (fpu->initialized &&
-	    copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size) < 0)
+	ret = copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size);
+	if (ret < 0)
 		return (void __user *)-1L;
 
 	return (void __user *)sp;
@@ -763,8 +761,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 		/*
 		 * Ensure the signal handler starts with the new fpu state.
 		 */
-		if (fpu->initialized)
-			fpu__clear(fpu);
+		fpu__clear(fpu);
 	}
 	signal_setup_done(failed, ksig, stepping);
 }
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index 047a77f6a10c..05bb9a44eb1c 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -39,17 +39,12 @@ int __execute_only_pkey(struct mm_struct *mm)
 	 * dance to set PKRU if we do not need to.  Check it
 	 * first and assume that if the execute-only pkey is
 	 * write-disabled that we do not have to set it
-	 * ourselves.  We need preempt off so that nobody
-	 * can make fpregs inactive.
+	 * ourselves.
 	 */
-	preempt_disable();
 	if (!need_to_set_mm_pkey &&
-	    current->thread.fpu.initialized &&
 	    !__pkru_allows_read(read_pkru(), execute_only_pkey)) {
-		preempt_enable();
 		return execute_only_pkey;
 	}
-	preempt_enable();
 
 	/*
 	 * Set up PKRU so that it denies access for everything
-- 
cgit v1.2.3


From 0169f53e0d97bb675075506810494bd86b8c934e Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 3 Apr 2019 18:41:37 +0200
Subject: x86/fpu: Remove user_fpu_begin()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

user_fpu_begin() sets fpu_fpregs_owner_ctx to task's fpu struct. This is
always the case since there is no lazy FPU anymore.

fpu_fpregs_owner_ctx is used during context switch to decide if it needs
to load the saved registers or if the currently loaded registers are
valid. It could be skipped during a

  taskA -> kernel thread -> taskA

switch because the switch to the kernel thread would not alter the CPU's
sFPU tate.

Since this field is always updated during context switch and
never invalidated, setting it manually (in user context) makes no
difference. A kernel thread with kernel_fpu_begin() block could
set fpu_fpregs_owner_ctx to NULL but a kernel thread does not use
user_fpu_begin().

This is a leftover from the lazy-FPU time.

Remove user_fpu_begin(), it does not change fpu_fpregs_owner_ctx's
content.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Aubrey Li <aubrey.li@intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: Nicolai Stange <nstange@suse.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190403164156.19645-9-bigeasy@linutronix.de
---
 arch/x86/include/asm/fpu/internal.h | 17 -----------------
 arch/x86/kernel/fpu/core.c          |  4 +---
 arch/x86/kernel/fpu/signal.c        |  1 -
 3 files changed, 1 insertion(+), 21 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 04042eacc852..54f70cae2f15 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -532,23 +532,6 @@ static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu)
 	}
 }
 
-/*
- * Needs to be preemption-safe.
- *
- * NOTE! user_fpu_begin() must be used only immediately before restoring
- * the save state. It does not do any saving/restoring on its own. In
- * lazy FPU mode, it is just an optimization to avoid a #NM exception,
- * the task can lose the FPU right after preempt_enable().
- */
-static inline void user_fpu_begin(void)
-{
-	struct fpu *fpu = &current->thread.fpu;
-
-	preempt_disable();
-	fpregs_activate(fpu);
-	preempt_enable();
-}
-
 /*
  * MXCSR and XCR definitions:
  */
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 97e27de2b7c0..739ca3ae2bdc 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -335,10 +335,8 @@ void fpu__clear(struct fpu *fpu)
 	 * Make sure fpstate is cleared and initialized.
 	 */
 	fpu__initialize(fpu);
-	if (static_cpu_has(X86_FEATURE_FPU)) {
-		user_fpu_begin();
+	if (static_cpu_has(X86_FEATURE_FPU))
 		copy_init_fpstate_to_fpregs();
-	}
 }
 
 /*
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 34989d2a8893..155f4552413e 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -322,7 +322,6 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 		 * For 64-bit frames and 32-bit fsave frames, restore the user
 		 * state to the registers directly (with exceptions handled).
 		 */
-		user_fpu_begin();
 		if (copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only)) {
 			fpu__clear(fpu);
 			return -1;
-- 
cgit v1.2.3


From 07baeb04f37c952981d63359ff840118ce8f5434 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 3 Apr 2019 18:41:39 +0200
Subject: x86/fpu: Make __raw_xsave_addr() use a feature number instead of mask
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Most users of __raw_xsave_addr() use a feature number, shift it to a
mask and then __raw_xsave_addr() shifts it back to the feature number.

Make __raw_xsave_addr() use the feature number as an argument.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190403164156.19645-11-bigeasy@linutronix.de
---
 arch/x86/kernel/fpu/xstate.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 8bfcc5b9e04b..4f7f3c5d0d0c 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -805,20 +805,18 @@ void fpu__resume_cpu(void)
 }
 
 /*
- * Given an xstate feature mask, calculate where in the xsave
+ * Given an xstate feature nr, calculate where in the xsave
  * buffer the state is.  Callers should ensure that the buffer
  * is valid.
  */
-static void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask)
+static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
 {
-	int feature_nr = fls64(xstate_feature_mask) - 1;
-
-	if (!xfeature_enabled(feature_nr)) {
+	if (!xfeature_enabled(xfeature_nr)) {
 		WARN_ON_FPU(1);
 		return NULL;
 	}
 
-	return (void *)xsave + xstate_comp_offsets[feature_nr];
+	return (void *)xsave + xstate_comp_offsets[xfeature_nr];
 }
 /*
  * Given the xsave area and a state inside, this function returns the
@@ -840,6 +838,7 @@ static void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask
  */
 void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
 {
+	int xfeature_nr;
 	/*
 	 * Do we even *have* xsave state?
 	 */
@@ -867,7 +866,8 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
 	if (!(xsave->header.xfeatures & xstate_feature))
 		return NULL;
 
-	return __raw_xsave_addr(xsave, xstate_feature);
+	xfeature_nr = fls64(xstate_feature) - 1;
+	return __raw_xsave_addr(xsave, xfeature_nr);
 }
 EXPORT_SYMBOL_GPL(get_xsave_addr);
 
@@ -1014,7 +1014,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of
 		 * Copy only in-use xstates:
 		 */
 		if ((header.xfeatures >> i) & 1) {
-			void *src = __raw_xsave_addr(xsave, 1 << i);
+			void *src = __raw_xsave_addr(xsave, i);
 
 			offset = xstate_offsets[i];
 			size = xstate_sizes[i];
@@ -1100,7 +1100,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i
 		 * Copy only in-use xstates:
 		 */
 		if ((header.xfeatures >> i) & 1) {
-			void *src = __raw_xsave_addr(xsave, 1 << i);
+			void *src = __raw_xsave_addr(xsave, i);
 
 			offset = xstate_offsets[i];
 			size = xstate_sizes[i];
@@ -1157,7 +1157,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
 		u64 mask = ((u64)1 << i);
 
 		if (hdr.xfeatures & mask) {
-			void *dst = __raw_xsave_addr(xsave, 1 << i);
+			void *dst = __raw_xsave_addr(xsave, i);
 
 			offset = xstate_offsets[i];
 			size = xstate_sizes[i];
@@ -1211,7 +1211,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
 		u64 mask = ((u64)1 << i);
 
 		if (hdr.xfeatures & mask) {
-			void *dst = __raw_xsave_addr(xsave, 1 << i);
+			void *dst = __raw_xsave_addr(xsave, i);
 
 			offset = xstate_offsets[i];
 			size = xstate_sizes[i];
-- 
cgit v1.2.3


From abd16d68d65229e5acafdadc32704239131bf2ea Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 3 Apr 2019 18:41:40 +0200
Subject: x86/fpu: Use a feature number instead of mask in two more helpers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After changing the argument of __raw_xsave_addr() from a mask to
number Dave suggested to check if it makes sense to do the same for
get_xsave_addr(). As it turns out it does.

Only get_xsave_addr() needs the mask to check if the requested feature
is part of what is supported/saved and then uses the number again. The
shift operation is cheaper compared to fls64() (find last bit set).
Also, the feature number uses less opcode space compared to the mask. :)

Make the get_xsave_addr() argument a xfeature number instead of a mask
and fix up its callers.

Furthermore, use xfeature_nr and xfeature_mask consistently.

This results in the following changes to the kvm code:

  feature -> xfeature_mask
  index -> xfeature_nr

Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Rik van Riel <riel@surriel.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Siarhei Liakh <Siarhei.Liakh@concurrent-rt.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190403164156.19645-12-bigeasy@linutronix.de
---
 arch/x86/include/asm/fpu/xstate.h |  4 ++--
 arch/x86/kernel/fpu/xstate.c      | 22 ++++++++++------------
 arch/x86/kernel/traps.c           |  2 +-
 arch/x86/kvm/x86.c                | 28 ++++++++++++++--------------
 arch/x86/mm/mpx.c                 |  6 +++---
 5 files changed, 30 insertions(+), 32 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 48581988d78c..fbe41f808e5d 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -46,8 +46,8 @@ extern void __init update_regset_xstate_info(unsigned int size,
 					     u64 xstate_mask);
 
 void fpu__xstate_clear_all_cpu_caps(void);
-void *get_xsave_addr(struct xregs_state *xsave, int xstate);
-const void *get_xsave_field_ptr(int xstate_field);
+void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);
+const void *get_xsave_field_ptr(int xfeature_nr);
 int using_compacted_format(void);
 int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size);
 int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 4f7f3c5d0d0c..9c459fd1d38e 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -830,15 +830,14 @@ static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
  *
  * Inputs:
  *	xstate: the thread's storage area for all FPU data
- *	xstate_feature: state which is defined in xsave.h (e.g.
- *	XFEATURE_MASK_FP, XFEATURE_MASK_SSE, etc...)
+ *	xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
+ *	XFEATURE_SSE, etc...)
  * Output:
  *	address of the state in the xsave area, or NULL if the
  *	field is not present in the xsave buffer.
  */
-void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
+void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
 {
-	int xfeature_nr;
 	/*
 	 * Do we even *have* xsave state?
 	 */
@@ -850,11 +849,11 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
 	 * have not enabled.  Remember that pcntxt_mask is
 	 * what we write to the XCR0 register.
 	 */
-	WARN_ONCE(!(xfeatures_mask & xstate_feature),
+	WARN_ONCE(!(xfeatures_mask & BIT_ULL(xfeature_nr)),
 		  "get of unsupported state");
 	/*
 	 * This assumes the last 'xsave*' instruction to
-	 * have requested that 'xstate_feature' be saved.
+	 * have requested that 'xfeature_nr' be saved.
 	 * If it did not, we might be seeing and old value
 	 * of the field in the buffer.
 	 *
@@ -863,10 +862,9 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
 	 * or because the "init optimization" caused it
 	 * to not be saved.
 	 */
-	if (!(xsave->header.xfeatures & xstate_feature))
+	if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr)))
 		return NULL;
 
-	xfeature_nr = fls64(xstate_feature) - 1;
 	return __raw_xsave_addr(xsave, xfeature_nr);
 }
 EXPORT_SYMBOL_GPL(get_xsave_addr);
@@ -882,13 +880,13 @@ EXPORT_SYMBOL_GPL(get_xsave_addr);
  * Note that this only works on the current task.
  *
  * Inputs:
- *	@xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP,
- *	XFEATURE_MASK_SSE, etc...)
+ *	@xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
+ *	XFEATURE_SSE, etc...)
  * Output:
  *	address of the state in the xsave area or NULL if the state
  *	is not present or is in its 'init state'.
  */
-const void *get_xsave_field_ptr(int xsave_state)
+const void *get_xsave_field_ptr(int xfeature_nr)
 {
 	struct fpu *fpu = &current->thread.fpu;
 
@@ -898,7 +896,7 @@ const void *get_xsave_field_ptr(int xsave_state)
 	 */
 	fpu__save(fpu);
 
-	return get_xsave_addr(&fpu->state.xsave, xsave_state);
+	return get_xsave_addr(&fpu->state.xsave, xfeature_nr);
 }
 
 #ifdef CONFIG_ARCH_HAS_PKEYS
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index d26f9e9c3d83..8b6d03e55d2f 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -456,7 +456,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 	 * which is all zeros which indicates MPX was not
 	 * responsible for the exception.
 	 */
-	bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
+	bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR);
 	if (!bndcsr)
 		goto exit_trap;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 099b851dabaf..8022e7769b3a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3674,15 +3674,15 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
 	 */
 	valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
 	while (valid) {
-		u64 feature = valid & -valid;
-		int index = fls64(feature) - 1;
-		void *src = get_xsave_addr(xsave, feature);
+		u64 xfeature_mask = valid & -valid;
+		int xfeature_nr = fls64(xfeature_mask) - 1;
+		void *src = get_xsave_addr(xsave, xfeature_nr);
 
 		if (src) {
 			u32 size, offset, ecx, edx;
-			cpuid_count(XSTATE_CPUID, index,
+			cpuid_count(XSTATE_CPUID, xfeature_nr,
 				    &size, &offset, &ecx, &edx);
-			if (feature == XFEATURE_MASK_PKRU)
+			if (xfeature_nr == XFEATURE_PKRU)
 				memcpy(dest + offset, &vcpu->arch.pkru,
 				       sizeof(vcpu->arch.pkru));
 			else
@@ -3690,7 +3690,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
 
 		}
 
-		valid -= feature;
+		valid -= xfeature_mask;
 	}
 }
 
@@ -3717,22 +3717,22 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
 	 */
 	valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
 	while (valid) {
-		u64 feature = valid & -valid;
-		int index = fls64(feature) - 1;
-		void *dest = get_xsave_addr(xsave, feature);
+		u64 xfeature_mask = valid & -valid;
+		int xfeature_nr = fls64(xfeature_mask) - 1;
+		void *dest = get_xsave_addr(xsave, xfeature_nr);
 
 		if (dest) {
 			u32 size, offset, ecx, edx;
-			cpuid_count(XSTATE_CPUID, index,
+			cpuid_count(XSTATE_CPUID, xfeature_nr,
 				    &size, &offset, &ecx, &edx);
-			if (feature == XFEATURE_MASK_PKRU)
+			if (xfeature_nr == XFEATURE_PKRU)
 				memcpy(&vcpu->arch.pkru, src + offset,
 				       sizeof(vcpu->arch.pkru));
 			else
 				memcpy(dest, src + offset, size);
 		}
 
-		valid -= feature;
+		valid -= xfeature_mask;
 	}
 }
 
@@ -8850,11 +8850,11 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 		if (init_event)
 			kvm_put_guest_fpu(vcpu);
 		mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
-					XFEATURE_MASK_BNDREGS);
+					XFEATURE_BNDREGS);
 		if (mpx_state_buffer)
 			memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
 		mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
-					XFEATURE_MASK_BNDCSR);
+					XFEATURE_BNDCSR);
 		if (mpx_state_buffer)
 			memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
 		if (init_event)
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index c805db6236b4..59726aaf4671 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -142,7 +142,7 @@ int mpx_fault_info(struct mpx_fault_info *info, struct pt_regs *regs)
 		goto err_out;
 	}
 	/* get bndregs field from current task's xsave area */
-	bndregs = get_xsave_field_ptr(XFEATURE_MASK_BNDREGS);
+	bndregs = get_xsave_field_ptr(XFEATURE_BNDREGS);
 	if (!bndregs) {
 		err = -EINVAL;
 		goto err_out;
@@ -190,7 +190,7 @@ static __user void *mpx_get_bounds_dir(void)
 	 * The bounds directory pointer is stored in a register
 	 * only accessible if we first do an xsave.
 	 */
-	bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
+	bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR);
 	if (!bndcsr)
 		return MPX_INVALID_BOUNDS_DIR;
 
@@ -376,7 +376,7 @@ static int do_mpx_bt_fault(void)
 	const struct mpx_bndcsr *bndcsr;
 	struct mm_struct *mm = current->mm;
 
-	bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
+	bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR);
 	if (!bndcsr)
 		return -EINVAL;
 	/*
-- 
cgit v1.2.3


From 3d45ad9260c35c597706847e196aae8d966a574f Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.ibm.com>
Date: Wed, 3 Apr 2019 22:12:17 -0400
Subject: x86/ima: add missing include

As reported by 0-DAY kernel test infrastructure:
   arch/x86//kernel/ima_arch.c: In function 'arch_get_ima_policy':
>> arch/x86//kernel/ima_arch.c:78:4: error: implicit declaration of
function 'set_module_sig_enforced' [-Werror=implicit-function-declaration]

Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 arch/x86/kernel/ima_arch.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c
index 3fb9847f1cad..85de790583f9 100644
--- a/arch/x86/kernel/ima_arch.c
+++ b/arch/x86/kernel/ima_arch.c
@@ -3,6 +3,7 @@
  * Copyright (C) 2018 IBM Corporation
  */
 #include <linux/efi.h>
+#include <linux/module.h>
 #include <linux/ima.h>
 
 extern struct boot_params boot_params;
-- 
cgit v1.2.3


From 69277c98f5eef0d9839699b7825923c3985f665f Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 3 Apr 2019 18:41:46 +0200
Subject: x86/fpu: Always store the registers in copy_fpstate_to_sigframe()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

copy_fpstate_to_sigframe() stores the registers directly to user space.
This is okay because the FPU registers are valid and saving them
directly avoids saving them into kernel memory and making a copy.

However, this cannot be done anymore if the FPU registers are going
to be restored on the return to userland. It is possible that the FPU
registers will be invalidated in the middle of the save operation and
this should be done with disabled preemption / BH.

Save the FPU registers to the task's FPU struct and copy them to the
user memory later on.

Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190403164156.19645-18-bigeasy@linutronix.de
---
 arch/x86/kernel/fpu/signal.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 155f4552413e..8f23f5237218 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -144,8 +144,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
  *	buf == buf_fx for 64-bit frames and 32-bit fsave frame.
  *	buf != buf_fx for 32-bit frames with fxstate.
  *
- * Save the state directly to the user frame pointed by the aligned pointer
- * 'buf_fx'.
+ * Save the state to task's fpu->state and then copy it to the user frame
+ * pointed to by the aligned pointer 'buf_fx'.
  *
  * If this is a 32-bit frame with fxstate, put a fsave header before
  * the aligned state at 'buf_fx'.
@@ -155,6 +155,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
  */
 int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
 {
+	struct fpu *fpu = &current->thread.fpu;
+	struct xregs_state *xsave = &fpu->state.xsave;
 	struct task_struct *tsk = current;
 	int ia32_fxstate = (buf != buf_fx);
 
@@ -169,9 +171,16 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
 			sizeof(struct user_i387_ia32_struct), NULL,
 			(struct _fpstate_32 __user *) buf) ? -1 : 1;
 
-	/* Save the live registers state to the user frame directly. */
-	if (copy_fpregs_to_sigframe(buf_fx))
-		return -1;
+	copy_fpregs_to_fpstate(fpu);
+
+	if (using_compacted_format()) {
+		if (copy_xstate_to_user(buf_fx, xsave, 0, size))
+			return -1;
+	} else {
+		fpstate_sanitize_xstate(fpu);
+		if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size))
+			return -1;
+	}
 
 	/* Save the fsave header for the 32-bit frames. */
 	if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf))
-- 
cgit v1.2.3


From a352a3b7b7920212ee4c45a41500c66826318e92 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 3 Apr 2019 18:41:47 +0200
Subject: x86/fpu: Prepare copy_fpstate_to_sigframe() for TIF_NEED_FPU_LOAD
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The FPU registers need only to be saved if TIF_NEED_FPU_LOAD is not set.
Otherwise this has been already done and can be skipped.

 [ bp: Massage a bit. ]

Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190403164156.19645-19-bigeasy@linutronix.de
---
 arch/x86/kernel/fpu/signal.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 8f23f5237218..9b9dfdc96285 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -171,7 +171,17 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
 			sizeof(struct user_i387_ia32_struct), NULL,
 			(struct _fpstate_32 __user *) buf) ? -1 : 1;
 
-	copy_fpregs_to_fpstate(fpu);
+	/*
+	 * If we do not need to load the FPU registers at return to userspace
+	 * then the CPU has the current state and we need to save it. Otherwise,
+	 * it has already been done and we can skip it.
+	 */
+	fpregs_lock();
+	if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
+		copy_fpregs_to_fpstate(fpu);
+		set_thread_flag(TIF_NEED_FPU_LOAD);
+	}
+	fpregs_unlock();
 
 	if (using_compacted_format()) {
 		if (copy_xstate_to_user(buf_fx, xsave, 0, size))
-- 
cgit v1.2.3


From e0d3602f933367881bddfff310a744e6e61c284c Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 3 Apr 2019 18:41:49 +0200
Subject: x86/fpu: Inline copy_user_to_fpregs_zeroing()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Start refactoring __fpu__restore_sig() by inlining
copy_user_to_fpregs_zeroing(). The original function remains and will be
used to restore from userland memory if possible.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190403164156.19645-21-bigeasy@linutronix.de
---
 arch/x86/kernel/fpu/signal.c | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 9b9dfdc96285..c2ff43fbbd07 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -337,11 +337,29 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 		kfree(tmp);
 		return err;
 	} else {
+		int ret;
+
 		/*
 		 * For 64-bit frames and 32-bit fsave frames, restore the user
 		 * state to the registers directly (with exceptions handled).
 		 */
-		if (copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only)) {
+		if (use_xsave()) {
+			if ((unsigned long)buf_fx % 64 || fx_only) {
+				u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE;
+				copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
+				ret = copy_user_to_fxregs(buf_fx);
+			} else {
+				u64 init_bv = xfeatures_mask & ~xfeatures;
+				if (unlikely(init_bv))
+					copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
+				ret = copy_user_to_xregs(buf_fx, xfeatures);
+			}
+		} else if (use_fxsr()) {
+			ret = copy_user_to_fxregs(buf_fx);
+		} else
+			ret = copy_user_to_fregs(buf_fx);
+
+		if (ret) {
 			fpu__clear(fpu);
 			return -1;
 		}
-- 
cgit v1.2.3


From 926b21f37b072ae4c117052de45a975c6d468fec Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 3 Apr 2019 18:41:50 +0200
Subject: x86/fpu: Restore from kernel memory on the 64-bit path too
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 64-bit case (both 64-bit and 32-bit frames) loads the new state from
user memory.

However, doing this is not desired if the FPU state is going to be
restored on return to userland: it would be required to disable
preemption in order to avoid a context switch which would set
TIF_NEED_FPU_LOAD. If this happens before the restore operation then the
loaded registers would become volatile.

Furthermore, disabling preemption while accessing user memory requires
to disable the pagefault handler. An error during FXRSTOR would then
mean that either a page fault occurred (and it would have to be retried
with enabled page fault handler) or a #GP occurred because the xstate is
bogus (after all, the signal handler can modify it).

In order to avoid that mess, copy the FPU state from userland, validate
it and then load it. The copy_kernel_…() helpers are basically just
like the old helpers except that they operate on kernel memory and the
fault handler just sets the error value and the caller handles it.

copy_user_to_fpregs_zeroing() and its helpers remain and will be used
later for a fastpath optimisation.

 [ bp: Clarify commit message. ]

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Aubrey Li <aubrey.li@intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190403164156.19645-22-bigeasy@linutronix.de
---
 arch/x86/include/asm/fpu/internal.h | 43 +++++++++++++++++++++++++
 arch/x86/kernel/fpu/signal.c        | 62 +++++++++++++++++++++++++++++--------
 2 files changed, 92 insertions(+), 13 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index da75d7b3e37d..2cf04fbcba5d 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -121,6 +121,21 @@ extern void fpstate_sanitize_xstate(struct fpu *fpu);
 	err;								\
 })
 
+#define kernel_insn_err(insn, output, input...)				\
+({									\
+	int err;							\
+	asm volatile("1:" #insn "\n\t"					\
+		     "2:\n"						\
+		     ".section .fixup,\"ax\"\n"				\
+		     "3:  movl $-1,%[err]\n"				\
+		     "    jmp  2b\n"					\
+		     ".previous\n"					\
+		     _ASM_EXTABLE(1b, 3b)				\
+		     : [err] "=r" (err), output				\
+		     : "0"(0), input);					\
+	err;								\
+})
+
 #define kernel_insn(insn, output, input...)				\
 	asm volatile("1:" #insn "\n\t"					\
 		     "2:\n"						\
@@ -149,6 +164,14 @@ static inline void copy_kernel_to_fxregs(struct fxregs_state *fx)
 		kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
 }
 
+static inline int copy_kernel_to_fxregs_err(struct fxregs_state *fx)
+{
+	if (IS_ENABLED(CONFIG_X86_32))
+		return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+	else
+		return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
 static inline int copy_user_to_fxregs(struct fxregs_state __user *fx)
 {
 	if (IS_ENABLED(CONFIG_X86_32))
@@ -162,6 +185,11 @@ static inline void copy_kernel_to_fregs(struct fregs_state *fx)
 	kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
 }
 
+static inline int copy_kernel_to_fregs_err(struct fregs_state *fx)
+{
+	return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
 static inline int copy_user_to_fregs(struct fregs_state __user *fx)
 {
 	return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
@@ -361,6 +389,21 @@ static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask)
 	return err;
 }
 
+/*
+ * Restore xstate from kernel space xsave area, return an error code instead of
+ * an exception.
+ */
+static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask)
+{
+	u32 lmask = mask;
+	u32 hmask = mask >> 32;
+	int err;
+
+	XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
+
+	return err;
+}
+
 /*
  * These must be called with preempt disabled. Returns
  * 'true' if the FPU state is still intact and we can
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index c2ff43fbbd07..9ea1eaa4c9b1 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -234,7 +234,8 @@ sanitize_restored_xstate(union fpregs_state *state,
 		 */
 		xsave->i387.mxcsr &= mxcsr_feature_mask;
 
-		convert_to_fxsr(&state->fxsave, ia32_env);
+		if (ia32_env)
+			convert_to_fxsr(&state->fxsave, ia32_env);
 	}
 }
 
@@ -337,28 +338,63 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 		kfree(tmp);
 		return err;
 	} else {
+		union fpregs_state *state;
+		void *tmp;
 		int ret;
 
+		tmp = kzalloc(sizeof(*state) + fpu_kernel_xstate_size + 64, GFP_KERNEL);
+		if (!tmp)
+			return -ENOMEM;
+		state = PTR_ALIGN(tmp, 64);
+
 		/*
 		 * For 64-bit frames and 32-bit fsave frames, restore the user
 		 * state to the registers directly (with exceptions handled).
 		 */
-		if (use_xsave()) {
-			if ((unsigned long)buf_fx % 64 || fx_only) {
-				u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE;
-				copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
-				ret = copy_user_to_fxregs(buf_fx);
+		if ((unsigned long)buf_fx % 64)
+			fx_only = 1;
+
+		if (use_xsave() && !fx_only) {
+			u64 init_bv = xfeatures_mask & ~xfeatures;
+
+			if (using_compacted_format()) {
+				ret = copy_user_to_xstate(&state->xsave, buf_fx);
 			} else {
-				u64 init_bv = xfeatures_mask & ~xfeatures;
-				if (unlikely(init_bv))
-					copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
-				ret = copy_user_to_xregs(buf_fx, xfeatures);
+				ret = __copy_from_user(&state->xsave, buf_fx, state_size);
+
+				if (!ret && state_size > offsetof(struct xregs_state, header))
+					ret = validate_xstate_header(&state->xsave.header);
 			}
+			if (ret)
+				goto err_out;
+
+			sanitize_restored_xstate(state, NULL, xfeatures, fx_only);
+
+			if (unlikely(init_bv))
+				copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
+			ret = copy_kernel_to_xregs_err(&state->xsave, xfeatures);
+
 		} else if (use_fxsr()) {
-			ret = copy_user_to_fxregs(buf_fx);
-		} else
-			ret = copy_user_to_fregs(buf_fx);
+			ret = __copy_from_user(&state->fxsave, buf_fx, state_size);
+			if (ret)
+				goto err_out;
 
+			if (use_xsave()) {
+				u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE;
+				copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
+			}
+			state->fxsave.mxcsr &= mxcsr_feature_mask;
+
+			ret = copy_kernel_to_fxregs_err(&state->fxsave);
+		} else {
+			ret = __copy_from_user(&state->fsave, buf_fx, state_size);
+			if (ret)
+				goto err_out;
+			ret = copy_kernel_to_fregs_err(&state->fsave);
+		}
+
+err_out:
+		kfree(tmp);
 		if (ret) {
 			fpu__clear(fpu);
 			return -1;
-- 
cgit v1.2.3


From c2ff9e9a3d9d6c019394a22989a228d02970a8b1 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 3 Apr 2019 18:41:51 +0200
Subject: x86/fpu: Merge the two code paths in __fpu__restore_sig()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ia32_fxstate case (32bit with fxsr) and the other (64bit frames or
32bit frames without fxsr) restore both from kernel memory and sanitize
the content.

The !ia32_fxstate version restores missing xstates from "init state"
while the ia32_fxstate doesn't and skips it.

Merge the two code paths and keep the !ia32_fxstate one. Copy only the
user_i387_ia32_struct data structure in the ia32_fxstate.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190403164156.19645-23-bigeasy@linutronix.de
---
 arch/x86/kernel/fpu/signal.c | 139 +++++++++++++++++--------------------------
 1 file changed, 54 insertions(+), 85 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 9ea1eaa4c9b1..b13e86b29426 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -263,12 +263,17 @@ static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_
 
 static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 {
+	struct user_i387_ia32_struct *envp = NULL;
+	int state_size = fpu_kernel_xstate_size;
 	int ia32_fxstate = (buf != buf_fx);
 	struct task_struct *tsk = current;
 	struct fpu *fpu = &tsk->thread.fpu;
-	int state_size = fpu_kernel_xstate_size;
+	struct user_i387_ia32_struct env;
+	union fpregs_state *state;
 	u64 xfeatures = 0;
 	int fx_only = 0;
+	int ret = 0;
+	void *tmp;
 
 	ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
 			 IS_ENABLED(CONFIG_IA32_EMULATION));
@@ -303,105 +308,69 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 		}
 	}
 
+	tmp = kzalloc(sizeof(*state) + fpu_kernel_xstate_size + 64, GFP_KERNEL);
+	if (!tmp)
+		return -ENOMEM;
+	state = PTR_ALIGN(tmp, 64);
+
+	if ((unsigned long)buf_fx % 64)
+		fx_only = 1;
+
+	/*
+	 * For 32-bit frames with fxstate, copy the fxstate so it can be
+	 * reconstructed later.
+	 */
 	if (ia32_fxstate) {
-		/*
-		 * For 32-bit frames with fxstate, copy the user state to the
-		 * thread's fpu state, reconstruct fxstate from the fsave
-		 * header. Validate and sanitize the copied state.
-		 */
-		struct user_i387_ia32_struct env;
-		union fpregs_state *state;
-		int err = 0;
-		void *tmp;
+		ret = __copy_from_user(&env, buf, sizeof(env));
+		if (ret)
+			goto err_out;
+		envp = &env;
+	}
 
-		tmp = kzalloc(sizeof(*state) + fpu_kernel_xstate_size + 64, GFP_KERNEL);
-		if (!tmp)
-			return -ENOMEM;
-		state = PTR_ALIGN(tmp, 64);
+	if (use_xsave() && !fx_only) {
+		u64 init_bv = xfeatures_mask & ~xfeatures;
 
 		if (using_compacted_format()) {
-			err = copy_user_to_xstate(&state->xsave, buf_fx);
+			ret = copy_user_to_xstate(&state->xsave, buf_fx);
 		} else {
-			err = __copy_from_user(&state->xsave, buf_fx, state_size);
+			ret = __copy_from_user(&state->xsave, buf_fx, state_size);
 
-			if (!err && state_size > offsetof(struct xregs_state, header))
-				err = validate_xstate_header(&state->xsave.header);
+			if (!ret && state_size > offsetof(struct xregs_state, header))
+				ret = validate_xstate_header(&state->xsave.header);
 		}
+		if (ret)
+			goto err_out;
 
-		if (err || __copy_from_user(&env, buf, sizeof(env))) {
-			err = -1;
-		} else {
-			sanitize_restored_xstate(state, &env, xfeatures, fx_only);
-			copy_kernel_to_fpregs(state);
-		}
-
-		kfree(tmp);
-		return err;
-	} else {
-		union fpregs_state *state;
-		void *tmp;
-		int ret;
-
-		tmp = kzalloc(sizeof(*state) + fpu_kernel_xstate_size + 64, GFP_KERNEL);
-		if (!tmp)
-			return -ENOMEM;
-		state = PTR_ALIGN(tmp, 64);
-
-		/*
-		 * For 64-bit frames and 32-bit fsave frames, restore the user
-		 * state to the registers directly (with exceptions handled).
-		 */
-		if ((unsigned long)buf_fx % 64)
-			fx_only = 1;
-
-		if (use_xsave() && !fx_only) {
-			u64 init_bv = xfeatures_mask & ~xfeatures;
-
-			if (using_compacted_format()) {
-				ret = copy_user_to_xstate(&state->xsave, buf_fx);
-			} else {
-				ret = __copy_from_user(&state->xsave, buf_fx, state_size);
-
-				if (!ret && state_size > offsetof(struct xregs_state, header))
-					ret = validate_xstate_header(&state->xsave.header);
-			}
-			if (ret)
-				goto err_out;
-
-			sanitize_restored_xstate(state, NULL, xfeatures, fx_only);
-
-			if (unlikely(init_bv))
-				copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
-			ret = copy_kernel_to_xregs_err(&state->xsave, xfeatures);
+		sanitize_restored_xstate(state, envp, xfeatures, fx_only);
 
-		} else if (use_fxsr()) {
-			ret = __copy_from_user(&state->fxsave, buf_fx, state_size);
-			if (ret)
-				goto err_out;
+		if (unlikely(init_bv))
+			copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
+		ret = copy_kernel_to_xregs_err(&state->xsave, xfeatures);
 
-			if (use_xsave()) {
-				u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE;
-				copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
-			}
-			state->fxsave.mxcsr &= mxcsr_feature_mask;
+	} else if (use_fxsr()) {
+		ret = __copy_from_user(&state->fxsave, buf_fx, state_size);
+		if (ret)
+			goto err_out;
 
-			ret = copy_kernel_to_fxregs_err(&state->fxsave);
-		} else {
-			ret = __copy_from_user(&state->fsave, buf_fx, state_size);
-			if (ret)
-				goto err_out;
-			ret = copy_kernel_to_fregs_err(&state->fsave);
+		sanitize_restored_xstate(state, envp, xfeatures, fx_only);
+		if (use_xsave()) {
+			u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE;
+			copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
 		}
 
-err_out:
-		kfree(tmp);
-		if (ret) {
-			fpu__clear(fpu);
-			return -1;
-		}
+		ret = copy_kernel_to_fxregs_err(&state->fxsave);
+	} else {
+		ret = __copy_from_user(&state->fsave, buf_fx, state_size);
+		if (ret)
+			goto err_out;
+		ret = copy_kernel_to_fregs_err(&state->fsave);
 	}
 
-	return 0;
+err_out:
+	kfree(tmp);
+	if (ret)
+		fpu__clear(fpu);
+	return ret;
 }
 
 static inline int xstate_sigframe_size(void)
-- 
cgit v1.2.3


From 5f409e20b794565e2d60ad333e79334630a6c798 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 3 Apr 2019 18:41:52 +0200
Subject: x86/fpu: Defer FPU state load until return to userspace
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Defer loading of FPU state until return to userspace. This gives
the kernel the potential to skip loading FPU state for tasks that
stay in kernel mode, or for tasks that end up with repeated
invocations of kernel_fpu_begin() & kernel_fpu_end().

The fpregs_lock/unlock() section ensures that the registers remain
unchanged. Otherwise a context switch or a bottom half could save the
registers to its FPU context and the processor's FPU registers would
became random if modified at the same time.

KVM swaps the host/guest registers on entry/exit path. This flow has
been kept as is. First it ensures that the registers are loaded and then
saves the current (host) state before it loads the guest's registers. The
swap is done at the very end with disabled interrupts so it should not
change anymore before theg guest is entered. The read/save version seems
to be cheaper compared to memcpy() in a micro benchmark.

Each thread gets TIF_NEED_FPU_LOAD set as part of fork() / fpu__copy().
For kernel threads, this flag gets never cleared which avoids saving /
restoring the FPU state for kernel threads and during in-kernel usage of
the FPU registers.

 [
   bp: Correct and update commit message and fix checkpatch warnings.
   s/register/registers/ where it is used in plural.
   minor comment corrections.
   remove unused trace_x86_fpu_activate_state() TP.
 ]

Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Aubrey Li <aubrey.li@intel.com>
Cc: Babu Moger <Babu.Moger@amd.com>
Cc: "Chang S. Bae" <chang.seok.bae@intel.com>
Cc: Dmitry Safonov <dima@arista.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: Nicolai Stange <nstange@suse.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Waiman Long <longman@redhat.com>
Cc: x86-ml <x86@kernel.org>
Cc: Yi Wang <wang.yi59@zte.com.cn>
Link: https://lkml.kernel.org/r/20190403164156.19645-24-bigeasy@linutronix.de
---
 arch/x86/entry/common.c             |  10 +++-
 arch/x86/include/asm/fpu/api.h      |  22 +++++++-
 arch/x86/include/asm/fpu/internal.h |  27 +++++----
 arch/x86/include/asm/trace/fpu.h    |  10 ++--
 arch/x86/kernel/fpu/core.c          | 106 +++++++++++++++++++++++++++---------
 arch/x86/kernel/fpu/signal.c        |  49 ++++++++++-------
 arch/x86/kernel/process.c           |   2 +-
 arch/x86/kernel/process_32.c        |   5 +-
 arch/x86/kernel/process_64.c        |   5 +-
 arch/x86/kvm/x86.c                  |  20 +++++--
 10 files changed, 184 insertions(+), 72 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 7bc105f47d21..51beb8d29123 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -25,12 +25,13 @@
 #include <linux/uprobes.h>
 #include <linux/livepatch.h>
 #include <linux/syscalls.h>
+#include <linux/uaccess.h>
 
 #include <asm/desc.h>
 #include <asm/traps.h>
 #include <asm/vdso.h>
-#include <linux/uaccess.h>
 #include <asm/cpufeature.h>
+#include <asm/fpu/api.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/syscalls.h>
@@ -196,6 +197,13 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
 	if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
 		exit_to_usermode_loop(regs, cached_flags);
 
+	/* Reload ti->flags; we may have rescheduled above. */
+	cached_flags = READ_ONCE(ti->flags);
+
+	fpregs_assert_state_consistent();
+	if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD))
+		switch_fpu_return();
+
 #ifdef CONFIG_COMPAT
 	/*
 	 * Compat syscalls set TS_COMPAT.  Make sure we clear it before
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index 73e684160f35..b774c52e5411 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -10,7 +10,7 @@
 
 #ifndef _ASM_X86_FPU_API_H
 #define _ASM_X86_FPU_API_H
-#include <linux/preempt.h>
+#include <linux/bottom_half.h>
 
 /*
  * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It
@@ -22,17 +22,37 @@
 extern void kernel_fpu_begin(void);
 extern void kernel_fpu_end(void);
 extern bool irq_fpu_usable(void);
+extern void fpregs_mark_activate(void);
 
+/*
+ * Use fpregs_lock() while editing CPU's FPU registers or fpu->state.
+ * A context switch will (and softirq might) save CPU's FPU registers to
+ * fpu->state and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in
+ * a random state.
+ */
 static inline void fpregs_lock(void)
 {
 	preempt_disable();
+	local_bh_disable();
 }
 
 static inline void fpregs_unlock(void)
 {
+	local_bh_enable();
 	preempt_enable();
 }
 
+#ifdef CONFIG_X86_DEBUG_FPU
+extern void fpregs_assert_state_consistent(void);
+#else
+static inline void fpregs_assert_state_consistent(void) { }
+#endif
+
+/*
+ * Load the task FPU state before returning to userspace.
+ */
+extern void switch_fpu_return(void);
+
 /*
  * Query the presence of one or more xfeatures. Works on any legacy CPU as well.
  *
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 2cf04fbcba5d..0c8a5093647a 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -30,7 +30,7 @@ extern void fpu__prepare_write(struct fpu *fpu);
 extern void fpu__save(struct fpu *fpu);
 extern int  fpu__restore_sig(void __user *buf, int ia32_frame);
 extern void fpu__drop(struct fpu *fpu);
-extern int  fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu);
+extern int  fpu__copy(struct task_struct *dst, struct task_struct *src);
 extern void fpu__clear(struct fpu *fpu);
 extern int  fpu__exception_code(struct fpu *fpu, int trap_nr);
 extern int  dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate);
@@ -531,13 +531,20 @@ static inline void fpregs_activate(struct fpu *fpu)
 /*
  * Internal helper, do not use directly. Use switch_fpu_return() instead.
  */
-static inline void __fpregs_load_activate(struct fpu *fpu, int cpu)
+static inline void __fpregs_load_activate(void)
 {
+	struct fpu *fpu = &current->thread.fpu;
+	int cpu = smp_processor_id();
+
+	if (WARN_ON_ONCE(current->mm == NULL))
+		return;
+
 	if (!fpregs_state_valid(fpu, cpu)) {
-		if (current->mm)
-			copy_kernel_to_fpregs(&fpu->state);
+		copy_kernel_to_fpregs(&fpu->state);
 		fpregs_activate(fpu);
+		fpu->last_cpu = cpu;
 	}
+	clear_thread_flag(TIF_NEED_FPU_LOAD);
 }
 
 /*
@@ -548,8 +555,8 @@ static inline void __fpregs_load_activate(struct fpu *fpu, int cpu)
  *  - switch_fpu_prepare() saves the old state.
  *    This is done within the context of the old process.
  *
- *  - switch_fpu_finish() restores the new state as
- *    necessary.
+ *  - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state
+ *    will get loaded on return to userspace, or when the kernel needs it.
  *
  * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers
  * are saved in the current thread's FPU register state.
@@ -581,10 +588,10 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu)
  */
 
 /*
- * Set up the userspace FPU context for the new task, if the task
- * has used the FPU.
+ * Load PKRU from the FPU context if available. Delay loading of the
+ * complete FPU state until the return to userland.
  */
-static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu)
+static inline void switch_fpu_finish(struct fpu *new_fpu)
 {
 	u32 pkru_val = init_pkru_value;
 	struct pkru_state *pk;
@@ -592,7 +599,7 @@ static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu)
 	if (!static_cpu_has(X86_FEATURE_FPU))
 		return;
 
-	__fpregs_load_activate(new_fpu, cpu);
+	set_thread_flag(TIF_NEED_FPU_LOAD);
 
 	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
 		return;
diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h
index bd65f6ba950f..879b77792f94 100644
--- a/arch/x86/include/asm/trace/fpu.h
+++ b/arch/x86/include/asm/trace/fpu.h
@@ -13,19 +13,22 @@ DECLARE_EVENT_CLASS(x86_fpu,
 
 	TP_STRUCT__entry(
 		__field(struct fpu *, fpu)
+		__field(bool, load_fpu)
 		__field(u64, xfeatures)
 		__field(u64, xcomp_bv)
 		),
 
 	TP_fast_assign(
 		__entry->fpu		= fpu;
+		__entry->load_fpu	= test_thread_flag(TIF_NEED_FPU_LOAD);
 		if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
 			__entry->xfeatures = fpu->state.xsave.header.xfeatures;
 			__entry->xcomp_bv  = fpu->state.xsave.header.xcomp_bv;
 		}
 	),
-	TP_printk("x86/fpu: %p xfeatures: %llx xcomp_bv: %llx",
+	TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx",
 			__entry->fpu,
+			__entry->load_fpu,
 			__entry->xfeatures,
 			__entry->xcomp_bv
 	)
@@ -61,11 +64,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated,
 	TP_ARGS(fpu)
 );
 
-DEFINE_EVENT(x86_fpu, x86_fpu_activate_state,
-	TP_PROTO(struct fpu *fpu),
-	TP_ARGS(fpu)
-);
-
 DEFINE_EVENT(x86_fpu, x86_fpu_init_state,
 	TP_PROTO(struct fpu *fpu),
 	TP_ARGS(fpu)
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 739ca3ae2bdc..ce243f76bdb7 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -102,23 +102,20 @@ static void __kernel_fpu_begin(void)
 	kernel_fpu_disable();
 
 	if (current->mm) {
-		/*
-		 * Ignore return value -- we don't care if reg state
-		 * is clobbered.
-		 */
-		copy_fpregs_to_fpstate(fpu);
-	} else {
-		__cpu_invalidate_fpregs_state();
+		if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
+			set_thread_flag(TIF_NEED_FPU_LOAD);
+			/*
+			 * Ignore return value -- we don't care if reg state
+			 * is clobbered.
+			 */
+			copy_fpregs_to_fpstate(fpu);
+		}
 	}
+	__cpu_invalidate_fpregs_state();
 }
 
 static void __kernel_fpu_end(void)
 {
-	struct fpu *fpu = &current->thread.fpu;
-
-	if (current->mm)
-		copy_kernel_to_fpregs(&fpu->state);
-
 	kernel_fpu_enable();
 }
 
@@ -145,14 +142,17 @@ void fpu__save(struct fpu *fpu)
 {
 	WARN_ON_FPU(fpu != &current->thread.fpu);
 
-	preempt_disable();
+	fpregs_lock();
 	trace_x86_fpu_before_save(fpu);
 
-	if (!copy_fpregs_to_fpstate(fpu))
-		copy_kernel_to_fpregs(&fpu->state);
+	if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
+		if (!copy_fpregs_to_fpstate(fpu)) {
+			copy_kernel_to_fpregs(&fpu->state);
+		}
+	}
 
 	trace_x86_fpu_after_save(fpu);
-	preempt_enable();
+	fpregs_unlock();
 }
 EXPORT_SYMBOL_GPL(fpu__save);
 
@@ -185,8 +185,11 @@ void fpstate_init(union fpregs_state *state)
 }
 EXPORT_SYMBOL_GPL(fpstate_init);
 
-int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
+int fpu__copy(struct task_struct *dst, struct task_struct *src)
 {
+	struct fpu *dst_fpu = &dst->thread.fpu;
+	struct fpu *src_fpu = &src->thread.fpu;
+
 	dst_fpu->last_cpu = -1;
 
 	if (!static_cpu_has(X86_FEATURE_FPU))
@@ -201,16 +204,23 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
 	memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size);
 
 	/*
-	 * Save current FPU registers directly into the child
-	 * FPU context, without any memory-to-memory copying.
+	 * If the FPU registers are not current just memcpy() the state.
+	 * Otherwise save current FPU registers directly into the child's FPU
+	 * context, without any memory-to-memory copying.
 	 *
 	 * ( The function 'fails' in the FNSAVE case, which destroys
-	 *   register contents so we have to copy them back. )
+	 *   register contents so we have to load them back. )
 	 */
-	if (!copy_fpregs_to_fpstate(dst_fpu)) {
-		memcpy(&src_fpu->state, &dst_fpu->state, fpu_kernel_xstate_size);
-		copy_kernel_to_fpregs(&src_fpu->state);
-	}
+	fpregs_lock();
+	if (test_thread_flag(TIF_NEED_FPU_LOAD))
+		memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size);
+
+	else if (!copy_fpregs_to_fpstate(dst_fpu))
+		copy_kernel_to_fpregs(&dst_fpu->state);
+
+	fpregs_unlock();
+
+	set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD);
 
 	trace_x86_fpu_copy_src(src_fpu);
 	trace_x86_fpu_copy_dst(dst_fpu);
@@ -226,10 +236,9 @@ static void fpu__initialize(struct fpu *fpu)
 {
 	WARN_ON_FPU(fpu != &current->thread.fpu);
 
+	set_thread_flag(TIF_NEED_FPU_LOAD);
 	fpstate_init(&fpu->state);
 	trace_x86_fpu_init_state(fpu);
-
-	trace_x86_fpu_activate_state(fpu);
 }
 
 /*
@@ -308,6 +317,8 @@ void fpu__drop(struct fpu *fpu)
  */
 static inline void copy_init_fpstate_to_fpregs(void)
 {
+	fpregs_lock();
+
 	if (use_xsave())
 		copy_kernel_to_xregs(&init_fpstate.xsave, -1);
 	else if (static_cpu_has(X86_FEATURE_FXSR))
@@ -317,6 +328,9 @@ static inline void copy_init_fpstate_to_fpregs(void)
 
 	if (boot_cpu_has(X86_FEATURE_OSPKE))
 		copy_init_pkru_to_fpregs();
+
+	fpregs_mark_activate();
+	fpregs_unlock();
 }
 
 /*
@@ -339,6 +353,46 @@ void fpu__clear(struct fpu *fpu)
 		copy_init_fpstate_to_fpregs();
 }
 
+/*
+ * Load FPU context before returning to userspace.
+ */
+void switch_fpu_return(void)
+{
+	if (!static_cpu_has(X86_FEATURE_FPU))
+		return;
+
+	__fpregs_load_activate();
+}
+EXPORT_SYMBOL_GPL(switch_fpu_return);
+
+#ifdef CONFIG_X86_DEBUG_FPU
+/*
+ * If current FPU state according to its tracking (loaded FPU context on this
+ * CPU) is not valid then we must have TIF_NEED_FPU_LOAD set so the context is
+ * loaded on return to userland.
+ */
+void fpregs_assert_state_consistent(void)
+{
+	struct fpu *fpu = &current->thread.fpu;
+
+	if (test_thread_flag(TIF_NEED_FPU_LOAD))
+		return;
+
+	WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id()));
+}
+EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent);
+#endif
+
+void fpregs_mark_activate(void)
+{
+	struct fpu *fpu = &current->thread.fpu;
+
+	fpregs_activate(fpu);
+	fpu->last_cpu = smp_processor_id();
+	clear_thread_flag(TIF_NEED_FPU_LOAD);
+}
+EXPORT_SYMBOL_GPL(fpregs_mark_activate);
+
 /*
  * x87 math exception handling:
  */
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index b13e86b29426..6df1f15e0cd5 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -269,11 +269,9 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 	struct task_struct *tsk = current;
 	struct fpu *fpu = &tsk->thread.fpu;
 	struct user_i387_ia32_struct env;
-	union fpregs_state *state;
 	u64 xfeatures = 0;
 	int fx_only = 0;
 	int ret = 0;
-	void *tmp;
 
 	ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
 			 IS_ENABLED(CONFIG_IA32_EMULATION));
@@ -308,14 +306,18 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 		}
 	}
 
-	tmp = kzalloc(sizeof(*state) + fpu_kernel_xstate_size + 64, GFP_KERNEL);
-	if (!tmp)
-		return -ENOMEM;
-	state = PTR_ALIGN(tmp, 64);
+	/*
+	 * The current state of the FPU registers does not matter. By setting
+	 * TIF_NEED_FPU_LOAD unconditionally it is ensured that the our xstate
+	 * is not modified on context switch and that the xstate is considered
+	 * to be loaded again on return to userland (overriding last_cpu avoids
+	 * the optimisation).
+	 */
+	set_thread_flag(TIF_NEED_FPU_LOAD);
+	__fpu_invalidate_fpregs_state(fpu);
 
 	if ((unsigned long)buf_fx % 64)
 		fx_only = 1;
-
 	/*
 	 * For 32-bit frames with fxstate, copy the fxstate so it can be
 	 * reconstructed later.
@@ -331,43 +333,52 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 		u64 init_bv = xfeatures_mask & ~xfeatures;
 
 		if (using_compacted_format()) {
-			ret = copy_user_to_xstate(&state->xsave, buf_fx);
+			ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx);
 		} else {
-			ret = __copy_from_user(&state->xsave, buf_fx, state_size);
+			ret = __copy_from_user(&fpu->state.xsave, buf_fx, state_size);
 
 			if (!ret && state_size > offsetof(struct xregs_state, header))
-				ret = validate_xstate_header(&state->xsave.header);
+				ret = validate_xstate_header(&fpu->state.xsave.header);
 		}
 		if (ret)
 			goto err_out;
 
-		sanitize_restored_xstate(state, envp, xfeatures, fx_only);
+		sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only);
 
+		fpregs_lock();
 		if (unlikely(init_bv))
 			copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
-		ret = copy_kernel_to_xregs_err(&state->xsave, xfeatures);
+		ret = copy_kernel_to_xregs_err(&fpu->state.xsave, xfeatures);
 
 	} else if (use_fxsr()) {
-		ret = __copy_from_user(&state->fxsave, buf_fx, state_size);
-		if (ret)
+		ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size);
+		if (ret) {
+			ret = -EFAULT;
 			goto err_out;
+		}
+
+		sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only);
 
-		sanitize_restored_xstate(state, envp, xfeatures, fx_only);
+		fpregs_lock();
 		if (use_xsave()) {
 			u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE;
 			copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
 		}
 
-		ret = copy_kernel_to_fxregs_err(&state->fxsave);
+		ret = copy_kernel_to_fxregs_err(&fpu->state.fxsave);
 	} else {
-		ret = __copy_from_user(&state->fsave, buf_fx, state_size);
+		ret = __copy_from_user(&fpu->state.fsave, buf_fx, state_size);
 		if (ret)
 			goto err_out;
-		ret = copy_kernel_to_fregs_err(&state->fsave);
+
+		fpregs_lock();
+		ret = copy_kernel_to_fregs_err(&fpu->state.fsave);
 	}
+	if (!ret)
+		fpregs_mark_activate();
+	fpregs_unlock();
 
 err_out:
-	kfree(tmp);
 	if (ret)
 		fpu__clear(fpu);
 	return ret;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 58ac7be52c7a..b9b8e6eb74f2 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -101,7 +101,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 	dst->thread.vm86 = NULL;
 #endif
 
-	return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
+	return fpu__copy(dst, src);
 }
 
 /*
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 77d9eb43ccac..1bc47f3a4885 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -234,7 +234,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
 
-	switch_fpu_prepare(prev_fpu, cpu);
+	if (!test_thread_flag(TIF_NEED_FPU_LOAD))
+		switch_fpu_prepare(prev_fpu, cpu);
 
 	/*
 	 * Save away %gs. No need to save %fs, as it was saved on the
@@ -290,7 +291,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	this_cpu_write(current_task, next_p);
 
-	switch_fpu_finish(next_fpu, cpu);
+	switch_fpu_finish(next_fpu);
 
 	/* Load the Intel cache allocation PQR MSR. */
 	resctrl_sched_in();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ffea7c557963..37b2ecef041e 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -520,7 +520,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
 		     this_cpu_read(irq_count) != -1);
 
-	switch_fpu_prepare(prev_fpu, cpu);
+	if (!test_thread_flag(TIF_NEED_FPU_LOAD))
+		switch_fpu_prepare(prev_fpu, cpu);
 
 	/* We must save %fs and %gs before load_TLS() because
 	 * %fs and %gs may be cleared by load_TLS().
@@ -572,7 +573,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	this_cpu_write(current_task, next_p);
 	this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
 
-	switch_fpu_finish(next_fpu, cpu);
+	switch_fpu_finish(next_fpu);
 
 	/* Reload sp0. */
 	update_task_stack(next_p);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8022e7769b3a..e340c3c0cba3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7877,6 +7877,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		wait_lapic_expire(vcpu);
 	guest_enter_irqoff();
 
+	fpregs_assert_state_consistent();
+	if (test_thread_flag(TIF_NEED_FPU_LOAD))
+		switch_fpu_return();
+
 	if (unlikely(vcpu->arch.switch_db_regs)) {
 		set_debugreg(0, 7);
 		set_debugreg(vcpu->arch.eff_db[0], 0);
@@ -8137,22 +8141,30 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
 /* Swap (qemu) user FPU context for the guest FPU context. */
 static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 {
-	preempt_disable();
+	fpregs_lock();
+
 	copy_fpregs_to_fpstate(&current->thread.fpu);
 	/* PKRU is separately restored in kvm_x86_ops->run.  */
 	__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
 				~XFEATURE_MASK_PKRU);
-	preempt_enable();
+
+	fpregs_mark_activate();
+	fpregs_unlock();
+
 	trace_kvm_fpu(1);
 }
 
 /* When vcpu_run ends, restore user space FPU context. */
 static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
-	preempt_disable();
+	fpregs_lock();
+
 	copy_fpregs_to_fpstate(vcpu->arch.guest_fpu);
 	copy_kernel_to_fpregs(&current->thread.fpu.state);
-	preempt_enable();
+
+	fpregs_mark_activate();
+	fpregs_unlock();
+
 	++vcpu->stat.fpu_reload;
 	trace_kvm_fpu(0);
 }
-- 
cgit v1.2.3


From 1d731e731c4cd7cbd3b1aa295f0932e7610da82f Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 3 Apr 2019 18:41:53 +0200
Subject: x86/fpu: Add a fastpath to __fpu__restore_sig()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous commits refactor the restoration of the FPU registers so
that they can be loaded from in-kernel memory. This overhead can be
avoided if the load can be performed without a pagefault.

Attempt to restore FPU registers by invoking
copy_user_to_fpregs_zeroing(). If it fails try the slowpath which can
handle pagefaults.

 [ bp: Add a comment over the fastpath to be able to find one's way
   around the function. ]

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190403164156.19645-25-bigeasy@linutronix.de
---
 arch/x86/kernel/fpu/signal.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 6df1f15e0cd5..a1bd7be70206 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -242,10 +242,10 @@ sanitize_restored_xstate(union fpregs_state *state,
 /*
  * Restore the extended state if present. Otherwise, restore the FP/SSE state.
  */
-static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only)
+static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only)
 {
 	if (use_xsave()) {
-		if ((unsigned long)buf % 64 || fx_only) {
+		if (fx_only) {
 			u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE;
 			copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
 			return copy_user_to_fxregs(buf);
@@ -327,8 +327,27 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 		if (ret)
 			goto err_out;
 		envp = &env;
+	} else {
+		/*
+		 * Attempt to restore the FPU registers directly from user
+		 * memory. For that to succeed, the user access cannot cause
+		 * page faults. If it does, fall back to the slow path below,
+		 * going through the kernel buffer with the enabled pagefault
+		 * handler.
+		 */
+		fpregs_lock();
+		pagefault_disable();
+		ret = copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only);
+		pagefault_enable();
+		if (!ret) {
+			fpregs_mark_activate();
+			fpregs_unlock();
+			return 0;
+		}
+		fpregs_unlock();
 	}
 
+
 	if (use_xsave() && !fx_only) {
 		u64 init_bv = xfeatures_mask & ~xfeatures;
 
-- 
cgit v1.2.3


From da2f32fb8dc7cbd9433cb2e990693734b30a2465 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 3 Apr 2019 18:41:54 +0200
Subject: x86/fpu: Add a fastpath to copy_fpstate_to_sigframe()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Try to save the FPU registers directly to the userland stack frame if
the CPU holds the FPU registers for the current task. This has to be
done with the pagefault disabled because we can't fault (while the FPU
registers are locked) and therefore the operation might fail. If it
fails try the slowpath which can handle faults.

 [ bp: Massage a bit. ]

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190403164156.19645-26-bigeasy@linutronix.de
---
 arch/x86/kernel/fpu/signal.c | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index a1bd7be70206..3c3167576216 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -144,8 +144,10 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
  *	buf == buf_fx for 64-bit frames and 32-bit fsave frame.
  *	buf != buf_fx for 32-bit frames with fxstate.
  *
- * Save the state to task's fpu->state and then copy it to the user frame
- * pointed to by the aligned pointer 'buf_fx'.
+ * Try to save it directly to the user frame with disabled page fault handler.
+ * If this fails then do the slow path where the FPU state is first saved to
+ * task's fpu->state and then copy it to the user frame pointed to by the
+ * aligned pointer 'buf_fx'.
  *
  * If this is a 32-bit frame with fxstate, put a fsave header before
  * the aligned state at 'buf_fx'.
@@ -159,6 +161,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
 	struct xregs_state *xsave = &fpu->state.xsave;
 	struct task_struct *tsk = current;
 	int ia32_fxstate = (buf != buf_fx);
+	int ret = -EFAULT;
 
 	ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
 			 IS_ENABLED(CONFIG_IA32_EMULATION));
@@ -173,23 +176,30 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
 
 	/*
 	 * If we do not need to load the FPU registers at return to userspace
-	 * then the CPU has the current state and we need to save it. Otherwise,
-	 * it has already been done and we can skip it.
+	 * then the CPU has the current state. Try to save it directly to
+	 * userland's stack frame if it does not cause a pagefault. If it does,
+	 * try the slowpath.
 	 */
 	fpregs_lock();
 	if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
-		copy_fpregs_to_fpstate(fpu);
+		pagefault_disable();
+		ret = copy_fpregs_to_sigframe(buf_fx);
+		pagefault_enable();
+		if (ret)
+			copy_fpregs_to_fpstate(fpu);
 		set_thread_flag(TIF_NEED_FPU_LOAD);
 	}
 	fpregs_unlock();
 
-	if (using_compacted_format()) {
-		if (copy_xstate_to_user(buf_fx, xsave, 0, size))
-			return -1;
-	} else {
-		fpstate_sanitize_xstate(fpu);
-		if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size))
-			return -1;
+	if (ret) {
+		if (using_compacted_format()) {
+			if (copy_xstate_to_user(buf_fx, xsave, 0, size))
+				return -1;
+		} else {
+			fpstate_sanitize_xstate(fpu);
+			if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size))
+				return -1;
+		}
 	}
 
 	/* Save the fsave header for the 32-bit frames. */
-- 
cgit v1.2.3


From 06b251dff78704c7d122bd109384d970a7dbe94d Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 12 Apr 2019 20:16:15 +0200
Subject: x86/fpu: Restore regs in copy_fpstate_to_sigframe() in order to use
 the fastpath
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If a task is scheduled out and receives a signal then it won't be
able to take the fastpath because the registers aren't available. The
slowpath is more expensive compared to XRSTOR + XSAVE which usually
succeeds.

Here are some clock_gettime() numbers from a bigger box with AVX512
during bootup:

- __fpregs_load_activate() takes 140ns - 350ns. If it was the most recent
  FPU context on the CPU then the optimisation in __fpregs_load_activate()
  will skip the load (which was disabled during the test).

- copy_fpregs_to_sigframe() takes 200ns - 450ns if it succeeds. On a
  pagefault it is 1.8us - 3us usually in the 2.6us area.

- The slowpath takes 1.5us - 6us. Usually in the 2.6us area.

My testcases (including lat_sig) take the fastpath without
__fpregs_load_activate(). I expect this to be the majority.

Since the slowpath is in the >1us area it makes sense to load the
registers and attempt to save them directly. The direct save may fail
but should only happen on the first invocation or after fork() while the
page is read-only.

 [ bp: Massage a bit. ]

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190403164156.19645-27-bigeasy@linutronix.de
---
 arch/x86/kernel/fpu/signal.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 3c3167576216..7026f1c4e5e3 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -175,20 +175,21 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
 			(struct _fpstate_32 __user *) buf) ? -1 : 1;
 
 	/*
-	 * If we do not need to load the FPU registers at return to userspace
-	 * then the CPU has the current state. Try to save it directly to
-	 * userland's stack frame if it does not cause a pagefault. If it does,
-	 * try the slowpath.
+	 * Load the FPU registers if they are not valid for the current task.
+	 * With a valid FPU state we can attempt to save the state directly to
+	 * userland's stack frame which will likely succeed. If it does not, do
+	 * the slowpath.
 	 */
 	fpregs_lock();
-	if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
-		pagefault_disable();
-		ret = copy_fpregs_to_sigframe(buf_fx);
-		pagefault_enable();
-		if (ret)
-			copy_fpregs_to_fpstate(fpu);
-		set_thread_flag(TIF_NEED_FPU_LOAD);
-	}
+	if (test_thread_flag(TIF_NEED_FPU_LOAD))
+		__fpregs_load_activate();
+
+	pagefault_disable();
+	ret = copy_fpregs_to_sigframe(buf_fx);
+	pagefault_enable();
+	if (ret && !test_thread_flag(TIF_NEED_FPU_LOAD))
+		copy_fpregs_to_fpstate(fpu);
+	set_thread_flag(TIF_NEED_FPU_LOAD);
 	fpregs_unlock();
 
 	if (ret) {
-- 
cgit v1.2.3


From a5eff7259790d5314eff10563d6e59d358cce482 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 3 Apr 2019 18:41:56 +0200
Subject: x86/pkeys: Add PKRU value to init_fpstate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The task's initial PKRU value is set partly for fpu__clear()/
copy_init_pkru_to_fpregs(). It is not part of init_fpstate.xsave and
instead it is set explicitly.

If the user removes the PKRU state from XSAVE in the signal handler then
__fpu__restore_sig() will restore the missing bits from `init_fpstate'
and initialize the PKRU value to 0.

Add the `init_pkru_value' to `init_fpstate' so it is set to the init
value in such a case.

In theory copy_init_pkru_to_fpregs() could be removed because restoring
the PKRU at return-to-userland should be enough.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "Chang S. Bae" <chang.seok.bae@intel.com>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Pavel Tatashin <pasha.tatashin@oracle.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190403164156.19645-28-bigeasy@linutronix.de
---
 arch/x86/kernel/cpu/common.c | 5 +++++
 arch/x86/mm/pkeys.c          | 6 ++++++
 2 files changed, 11 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cb28e98a0659..352fa19e6311 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -372,6 +372,8 @@ static bool pku_disabled;
 
 static __always_inline void setup_pku(struct cpuinfo_x86 *c)
 {
+	struct pkru_state *pk;
+
 	/* check the boot processor, plus compile options for PKU: */
 	if (!cpu_feature_enabled(X86_FEATURE_PKU))
 		return;
@@ -382,6 +384,9 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c)
 		return;
 
 	cr4_set_bits(X86_CR4_PKE);
+	pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU);
+	if (pk)
+		pk->pkru = init_pkru_value;
 	/*
 	 * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE
 	 * cpuid bit to be set.  We need to ensure that we
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index 2ecbf4155f98..1dcfc91c8f0c 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -18,6 +18,7 @@
 
 #include <asm/cpufeature.h>             /* boot_cpu_has, ...            */
 #include <asm/mmu_context.h>            /* vma_pkey()                   */
+#include <asm/fpu/internal.h>		/* init_fpstate			*/
 
 int __execute_only_pkey(struct mm_struct *mm)
 {
@@ -161,6 +162,7 @@ static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf,
 static ssize_t init_pkru_write_file(struct file *file,
 		 const char __user *user_buf, size_t count, loff_t *ppos)
 {
+	struct pkru_state *pk;
 	char buf[32];
 	ssize_t len;
 	u32 new_init_pkru;
@@ -183,6 +185,10 @@ static ssize_t init_pkru_write_file(struct file *file,
 		return -EINVAL;
 
 	WRITE_ONCE(init_pkru_value, new_init_pkru);
+	pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU);
+	if (!pk)
+		return -EINVAL;
+	pk->pkru = new_init_pkru;
 	return count;
 }
 
-- 
cgit v1.2.3


From cae5ec342645746d617dd420d206e1588d47768a Mon Sep 17 00:00:00 2001
From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Date: Fri, 12 Apr 2019 17:50:57 -0400
Subject: x86/speculation/mds: Fix comment

s/L1TF/MDS/

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Tyler Hicks <tyhicks@canonical.com>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
---
 arch/x86/kernel/cpu/bugs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 22a14d4b68a2..0642505dda69 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -221,7 +221,7 @@ static void x86_amd_ssb_disable(void)
 #undef pr_fmt
 #define pr_fmt(fmt)	"MDS: " fmt
 
-/* Default mitigation for L1TF-affected CPUs */
+/* Default mitigation for MDS-affected CPUs */
 static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL;
 static bool mds_nosmt __ro_after_init = false;
 
-- 
cgit v1.2.3


From e2c3c94788b08891dcf3dbe608f9880523ecd71b Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 12 Apr 2019 17:50:58 -0400
Subject: x86/speculation/mds: Print SMT vulnerable on MSBDS with mitigations
 off

This code is only for CPUs which are affected by MSBDS, but are *not*
affected by the other two MDS issues.

For such CPUs, enabling the mds_idle_clear mitigation is enough to
mitigate SMT.

However if user boots with 'mds=off' and still has SMT enabled, we should
not report that SMT is mitigated:

$cat /sys//devices/system/cpu/vulnerabilities/mds
Vulnerable; SMT mitigated

But rather:
Vulnerable; SMT vulnerable

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Tyler Hicks <tyhicks@canonical.com>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lkml.kernel.org/r/20190412215118.294906495@localhost.localdomain
---
 arch/x86/kernel/cpu/bugs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 0642505dda69..6b8a55c7cebc 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1204,7 +1204,8 @@ static ssize_t mds_show_state(char *buf)
 
 	if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) {
 		return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
-			       sched_smt_active() ? "mitigated" : "disabled");
+			       (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" :
+			        sched_smt_active() ? "mitigated" : "disabled"));
 	}
 
 	return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
-- 
cgit v1.2.3


From 5c14068f87d04adc73ba3f41c2a303d3c3d1fa12 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Wed, 17 Apr 2019 16:39:02 -0500
Subject: x86/speculation/mds: Add 'mitigations=' support for MDS

Add MDS to the new 'mitigations=' cmdline option.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 Documentation/admin-guide/kernel-parameters.txt | 2 ++
 arch/x86/kernel/cpu/bugs.c                      | 5 +++--
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 9aa3543a8723..18cad2b0392a 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2556,6 +2556,7 @@
 					       spectre_v2_user=off [X86]
 					       spec_store_bypass_disable=off [X86,PPC]
 					       l1tf=off [X86]
+					       mds=off [X86]
 
 			auto (default)
 				Mitigate all CPU vulnerabilities, but leave SMT
@@ -2570,6 +2571,7 @@
 				if needed.  This is for users who always want to
 				be fully mitigated, even if it means losing SMT.
 				Equivalent to: l1tf=flush,nosmt [X86]
+					       mds=full,nosmt [X86]
 
 	mminit_loglevel=
 			[KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 3c5c3c3ba734..667c273a66d7 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -233,7 +233,7 @@ static const char * const mds_strings[] = {
 
 static void __init mds_select_mitigation(void)
 {
-	if (!boot_cpu_has_bug(X86_BUG_MDS)) {
+	if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) {
 		mds_mitigation = MDS_MITIGATION_OFF;
 		return;
 	}
@@ -244,7 +244,8 @@ static void __init mds_select_mitigation(void)
 
 		static_branch_enable(&mds_user_clear);
 
-		if (mds_nosmt && !boot_cpu_has(X86_BUG_MSBDS_ONLY))
+		if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) &&
+		    (mds_nosmt || cpu_mitigations_auto_nosmt()))
 			cpu_smt_disable(false);
 	}
 
-- 
cgit v1.2.3


From d9c9ce34ed5c892323cbf5b4f9a4c498e036316a Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 2 May 2019 19:11:39 +0200
Subject: x86/fpu: Fault-in user stack if copy_fpstate_to_sigframe() fails

In the compacted form, XSAVES may save only the XMM+SSE state but skip
FP (x87 state).

This is denoted by header->xfeatures = 6. The fastpath
(copy_fpregs_to_sigframe()) does that but _also_ initialises the FP
state (cwd to 0x37f, mxcsr as we do, remaining fields to 0).

The slowpath (copy_xstate_to_user()) leaves most of the FP
state untouched. Only mxcsr and mxcsr_flags are set due to
xfeatures_mxcsr_quirk(). Now that XFEATURE_MASK_FP is set
unconditionally, see

  04944b793e18 ("x86: xsave: set FP, SSE bits in the xsave header in the user sigcontext"),

on return from the signal, random garbage is loaded as the FP state.

Instead of utilizing copy_xstate_to_user(), fault-in the user memory
and retry the fast path. Ideally, the fast path succeeds on the second
attempt but may be retried again if the memory is swapped out due
to memory pressure. If the user memory can not be faulted-in then
get_user_pages() returns an error so we don't loop forever.

Fault in memory via get_user_pages_unlocked() so
copy_fpregs_to_sigframe() succeeds without a fault.

Fixes: 69277c98f5eef ("x86/fpu: Always store the registers in copy_fpstate_to_sigframe()")
Reported-by: Kurt Kanzenbach <kurt.kanzenbach@linutronix.de>
Suggested-by: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jann Horn <jannh@google.com>
Cc: "linux-mm@kvack.org" <linux-mm@kvack.org>
Cc: Qian Cai <cai@lca.pw>
Cc: Rik van Riel <riel@surriel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190502171139.mqtegctsg35cir2e@linutronix.de
---
 arch/x86/kernel/fpu/signal.c | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 7026f1c4e5e3..5a8d118bc423 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -157,11 +157,9 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
  */
 int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
 {
-	struct fpu *fpu = &current->thread.fpu;
-	struct xregs_state *xsave = &fpu->state.xsave;
 	struct task_struct *tsk = current;
 	int ia32_fxstate = (buf != buf_fx);
-	int ret = -EFAULT;
+	int ret;
 
 	ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
 			 IS_ENABLED(CONFIG_IA32_EMULATION));
@@ -174,11 +172,12 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
 			sizeof(struct user_i387_ia32_struct), NULL,
 			(struct _fpstate_32 __user *) buf) ? -1 : 1;
 
+retry:
 	/*
 	 * Load the FPU registers if they are not valid for the current task.
 	 * With a valid FPU state we can attempt to save the state directly to
-	 * userland's stack frame which will likely succeed. If it does not, do
-	 * the slowpath.
+	 * userland's stack frame which will likely succeed. If it does not,
+	 * resolve the fault in the user memory and try again.
 	 */
 	fpregs_lock();
 	if (test_thread_flag(TIF_NEED_FPU_LOAD))
@@ -187,20 +186,20 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
 	pagefault_disable();
 	ret = copy_fpregs_to_sigframe(buf_fx);
 	pagefault_enable();
-	if (ret && !test_thread_flag(TIF_NEED_FPU_LOAD))
-		copy_fpregs_to_fpstate(fpu);
-	set_thread_flag(TIF_NEED_FPU_LOAD);
 	fpregs_unlock();
 
 	if (ret) {
-		if (using_compacted_format()) {
-			if (copy_xstate_to_user(buf_fx, xsave, 0, size))
-				return -1;
-		} else {
-			fpstate_sanitize_xstate(fpu);
-			if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size))
-				return -1;
-		}
+		int aligned_size;
+		int nr_pages;
+
+		aligned_size = offset_in_page(buf_fx) + fpu_user_xstate_size;
+		nr_pages = DIV_ROUND_UP(aligned_size, PAGE_SIZE);
+
+		ret = get_user_pages_unlocked((unsigned long)buf_fx, nr_pages,
+					      NULL, FOLL_WRITE);
+		if (ret == nr_pages)
+			goto retry;
+		return -EFAULT;
 	}
 
 	/* Save the fsave header for the 32-bit frames. */
-- 
cgit v1.2.3


From c5bf68fe0c86a5835bd2e6aead1c49976360753f Mon Sep 17 00:00:00 2001
From: Kirill Smelkov <kirr@nexedi.com>
Date: Tue, 26 Mar 2019 23:51:19 +0300
Subject: *: convert stream-like files from nonseekable_open -> stream_open

Using scripts/coccinelle/api/stream_open.cocci added in 10dce8af3422
("fs: stream_open - opener for stream-like files so that read and write
can run simultaneously without deadlock"), search and convert to
stream_open all in-kernel nonseekable_open users for which read and
write actually do not depend on ppos and where there is no other methods
in file_operations which assume @offset access.

I've verified each generated change manually - that it is correct to convert -
and each other nonseekable_open instance left - that it is either not correct
to convert there, or that it is not converted due to current stream_open.cocci
limitations. The script also does not convert files that should be valid to
convert, but that currently have .llseek = noop_llseek or generic_file_llseek
for unknown reason despite file being opened with nonseekable_open (e.g.
drivers/input/mousedev.c)

Among cases converted 14 were potentially vulnerable to read vs write deadlock
(see details in 10dce8af3422):

	drivers/char/pcmcia/cm4000_cs.c:1685:7-23: ERROR: cm4000_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix.
	drivers/gnss/core.c:45:1-17: ERROR: gnss_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix.
	drivers/hid/uhid.c:635:1-17: ERROR: uhid_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix.
	drivers/infiniband/core/user_mad.c:988:1-17: ERROR: umad_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix.
	drivers/input/evdev.c:527:1-17: ERROR: evdev_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix.
	drivers/input/misc/uinput.c:401:1-17: ERROR: uinput_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix.
	drivers/isdn/capi/capi.c:963:8-24: ERROR: capi_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix.
	drivers/leds/uleds.c:77:1-17: ERROR: uleds_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix.
	drivers/media/rc/lirc_dev.c:198:1-17: ERROR: lirc_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix.
	drivers/s390/char/fs3270.c:488:1-17: ERROR: fs3270_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix.
	drivers/usb/misc/ldusb.c:310:1-17: ERROR: ld_usb_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix.
	drivers/xen/evtchn.c:667:8-24: ERROR: evtchn_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix.
	net/batman-adv/icmp_socket.c:80:1-17: ERROR: batadv_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix.
	net/rfkill/core.c:1146:8-24: ERROR: rfkill_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix.

and the rest were just safe to convert to stream_open because their read and
write do not use ppos at all and corresponding file_operations do not
have methods that assume @offset file access(*):

	arch/powerpc/platforms/52xx/mpc52xx_gpt.c:631:8-24: WARNING: mpc52xx_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	arch/powerpc/platforms/cell/spufs/file.c:591:8-24: WARNING: spufs_ibox_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
	arch/powerpc/platforms/cell/spufs/file.c:591:8-24: WARNING: spufs_ibox_stat_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
	arch/powerpc/platforms/cell/spufs/file.c:591:8-24: WARNING: spufs_mbox_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
	arch/powerpc/platforms/cell/spufs/file.c:591:8-24: WARNING: spufs_mbox_stat_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
	arch/powerpc/platforms/cell/spufs/file.c:591:8-24: WARNING: spufs_wbox_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	arch/powerpc/platforms/cell/spufs/file.c:591:8-24: WARNING: spufs_wbox_stat_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
	arch/um/drivers/harddog_kern.c:88:8-24: WARNING: harddog_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	arch/x86/kernel/cpu/microcode/core.c:430:33-49: WARNING: microcode_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/char/ds1620.c:215:8-24: WARNING: ds1620_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/char/dtlk.c:301:1-17: WARNING: dtlk_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/char/ipmi/ipmi_watchdog.c:840:9-25: WARNING: ipmi_wdog_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/char/pcmcia/scr24x_cs.c:95:8-24: WARNING: scr24x_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/char/tb0219.c:246:9-25: WARNING: tb0219_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/firewire/nosy.c:306:8-24: WARNING: nosy_ops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/hwmon/fschmd.c:840:8-24: WARNING: watchdog_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/hwmon/w83793.c:1344:8-24: WARNING: watchdog_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/infiniband/core/ucma.c:1747:8-24: WARNING: ucma_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/infiniband/core/ucm.c:1178:8-24: WARNING: ucm_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/infiniband/core/uverbs_main.c:1086:8-24: WARNING: uverbs_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/input/joydev.c:282:1-17: WARNING: joydev_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/pci/switch/switchtec.c:393:1-17: WARNING: switchtec_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/platform/chrome/cros_ec_debugfs.c:135:8-24: WARNING: cros_ec_console_log_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/rtc/rtc-ds1374.c:470:9-25: WARNING: ds1374_wdt_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/rtc/rtc-m41t80.c:805:9-25: WARNING: wdt_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/s390/char/tape_char.c:293:2-18: WARNING: tape_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/s390/char/zcore.c:194:8-24: WARNING: zcore_reipl_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/s390/crypto/zcrypt_api.c:528:8-24: WARNING: zcrypt_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/spi/spidev.c:594:1-17: WARNING: spidev_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/staging/pi433/pi433_if.c:974:1-17: WARNING: pi433_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/acquirewdt.c:203:8-24: WARNING: acq_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/advantechwdt.c:202:8-24: WARNING: advwdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/alim1535_wdt.c:252:8-24: WARNING: ali_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/alim7101_wdt.c:217:8-24: WARNING: wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/ar7_wdt.c:166:8-24: WARNING: ar7_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/at91rm9200_wdt.c:113:8-24: WARNING: at91wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/ath79_wdt.c:135:8-24: WARNING: ath79_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/bcm63xx_wdt.c:119:8-24: WARNING: bcm63xx_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/cpu5wdt.c:143:8-24: WARNING: cpu5wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/cpwd.c:397:8-24: WARNING: cpwd_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/eurotechwdt.c:319:8-24: WARNING: eurwdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/f71808e_wdt.c:528:8-24: WARNING: watchdog_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/gef_wdt.c:232:8-24: WARNING: gef_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/geodewdt.c:95:8-24: WARNING: geodewdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/ib700wdt.c:241:8-24: WARNING: ibwdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/ibmasr.c:326:8-24: WARNING: asr_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/indydog.c:80:8-24: WARNING: indydog_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/intel_scu_watchdog.c:307:8-24: WARNING: intel_scu_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/iop_wdt.c:104:8-24: WARNING: iop_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/it8712f_wdt.c:330:8-24: WARNING: it8712f_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/ixp4xx_wdt.c:68:8-24: WARNING: ixp4xx_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/ks8695_wdt.c:145:8-24: WARNING: ks8695wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/m54xx_wdt.c:88:8-24: WARNING: m54xx_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/machzwd.c:336:8-24: WARNING: zf_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/mixcomwd.c:153:8-24: WARNING: mixcomwd_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/mtx-1_wdt.c:121:8-24: WARNING: mtx1_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/mv64x60_wdt.c:136:8-24: WARNING: mv64x60_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/nuc900_wdt.c:134:8-24: WARNING: nuc900wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/nv_tco.c:164:8-24: WARNING: nv_tco_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/pc87413_wdt.c:289:8-24: WARNING: pc87413_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/pcwd.c:698:8-24: WARNING: pcwd_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/pcwd.c:737:8-24: WARNING: pcwd_temp_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/pcwd_pci.c:581:8-24: WARNING: pcipcwd_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/pcwd_pci.c:623:8-24: WARNING: pcipcwd_temp_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/pcwd_usb.c:488:8-24: WARNING: usb_pcwd_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/pcwd_usb.c:527:8-24: WARNING: usb_pcwd_temperature_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/pika_wdt.c:121:8-24: WARNING: pikawdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/pnx833x_wdt.c:119:8-24: WARNING: pnx833x_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/rc32434_wdt.c:153:8-24: WARNING: rc32434_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/rdc321x_wdt.c:145:8-24: WARNING: rdc321x_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/riowd.c:79:1-17: WARNING: riowd_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/sa1100_wdt.c:62:8-24: WARNING: sa1100dog_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/sbc60xxwdt.c:211:8-24: WARNING: wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/sbc7240_wdt.c:139:8-24: WARNING: wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/sbc8360.c:274:8-24: WARNING: sbc8360_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/sbc_epx_c3.c:81:8-24: WARNING: epx_c3_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/sbc_fitpc2_wdt.c:78:8-24: WARNING: fitpc2_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/sb_wdog.c:108:1-17: WARNING: sbwdog_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/sc1200wdt.c:181:8-24: WARNING: sc1200wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/sc520_wdt.c:261:8-24: WARNING: wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/sch311x_wdt.c:319:8-24: WARNING: sch311x_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/scx200_wdt.c:105:8-24: WARNING: scx200_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/smsc37b787_wdt.c:369:8-24: WARNING: wb_smsc_wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/w83877f_wdt.c:227:8-24: WARNING: wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/w83977f_wdt.c:301:8-24: WARNING: wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/wafer5823wdt.c:200:8-24: WARNING: wafwdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/watchdog_dev.c:828:8-24: WARNING: watchdog_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/wdrtas.c:379:8-24: WARNING: wdrtas_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/wdrtas.c:445:8-24: WARNING: wdrtas_temp_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/wdt285.c:104:1-17: WARNING: watchdog_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/wdt977.c:276:8-24: WARNING: wdt977_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/wdt.c:424:8-24: WARNING: wdt_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/wdt.c:484:8-24: WARNING: wdt_temp_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/wdt_pci.c:464:8-24: WARNING: wdtpci_fops: .write() has stream semantic; safe to change nonseekable_open -> stream_open.
	drivers/watchdog/wdt_pci.c:527:8-24: WARNING: wdtpci_temp_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
	net/batman-adv/log.c:105:1-17: WARNING: batadv_log_fops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
	sound/core/control.c:57:7-23: WARNING: snd_ctl_f_ops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.
	sound/core/rawmidi.c:385:7-23: WARNING: snd_rawmidi_f_ops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open.
	sound/core/seq/seq_clientmgr.c:310:7-23: WARNING: snd_seq_f_ops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open.
	sound/core/timer.c:1428:7-23: WARNING: snd_timer_f_ops: .read() has stream semantic; safe to change nonseekable_open -> stream_open.

One can also recheck/review the patch via generating it with explanation comments included via

	$ make coccicheck MODE=patch COCCI=scripts/coccinelle/api/stream_open.cocci SPFLAGS="-D explain"

(*) This second group also contains cases with read/write deadlocks that
stream_open.cocci don't yet detect, but which are still valid to convert to
stream_open since ppos is not used. For example drivers/pci/switch/switchtec.c
calls wait_for_completion_interruptible() in its .read, but stream_open.cocci
currently detects only "wait_event*" as blocking.

Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Yongzhi Pan <panyongzhi@gmail.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Tejun Heo <tj@kernel.org>
Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Julia Lawall <Julia.Lawall@lip6.fr>
Cc: Nikolaus Rath <Nikolaus@rath.org>
Cc: Han-Wen Nienhuys <hanwen@google.com>
Cc: Anatolij Gustschin <agust@denx.de>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "James R. Van Zandt" <jrv@vanzandt.mv.com>
Cc: Corey Minyard <minyard@acm.org>
Cc: Harald Welte <laforge@gnumonks.org>
Acked-by: Lubomir Rintel <lkundrak@v3.sk> [scr24x_cs]
Cc: Stefan Richter <stefanr@s5r6.in-berlin.de>
Cc: Johan Hovold <johan@kernel.org>
Cc: David Herrmann <dh.herrmann@googlemail.com>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Cc: Jean Delvare <jdelvare@suse.com>
Acked-by: Guenter Roeck <linux@roeck-us.net>	[watchdog/* hwmon/*]
Cc: Rudolf Marek <r.marek@assembler.cz>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Karsten Keil <isdn@linux-pingi.de>
Cc: Jacek Anaszewski <jacek.anaszewski@gmail.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Kurt Schwemmer <kurt.schwemmer@microsemi.com>
Acked-by: Logan Gunthorpe <logang@deltatee.com> [drivers/pci/switch/switchtec]
Acked-by: Bjorn Helgaas <bhelgaas@google.com> [drivers/pci/switch/switchtec]
Cc: Benson Leung <bleung@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com> [platform/chrome]
Cc: Alessandro Zummo <a.zummo@towertech.it>
Acked-by: Alexandre Belloni <alexandre.belloni@bootlin.com> [rtc/*]
Cc: Mark Brown <broonie@kernel.org>
Cc: Wim Van Sebroeck <wim@linux-watchdog.org>
Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: bcm-kernel-feedback-list@broadcom.com
Cc: Wan ZongShun <mcuos.com@gmail.com>
Cc: Zwane Mwaikambo <zwanem@gmail.com>
Cc: Marek Lindner <mareklindner@neomailbox.ch>
Cc: Simon Wunderlich <sw@simonwunderlich.de>
Cc: Antonio Quartulli <a@unstable.cc>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Jaroslav Kysela <perex@perex.cz>
Cc: Takashi Iwai <tiwai@suse.com>
Signed-off-by: Kirill Smelkov <kirr@nexedi.com>
---
 arch/powerpc/platforms/52xx/mpc52xx_gpt.c | 2 +-
 arch/powerpc/platforms/cell/spufs/file.c  | 2 +-
 arch/um/drivers/harddog_kern.c            | 2 +-
 arch/x86/kernel/cpu/microcode/core.c      | 2 +-
 drivers/char/ds1620.c                     | 2 +-
 drivers/char/dtlk.c                       | 2 +-
 drivers/char/ipmi/ipmi_watchdog.c         | 2 +-
 drivers/char/pcmcia/cm4000_cs.c           | 2 +-
 drivers/char/pcmcia/scr24x_cs.c           | 2 +-
 drivers/char/tb0219.c                     | 2 +-
 drivers/firewire/nosy.c                   | 2 +-
 drivers/gnss/core.c                       | 2 +-
 drivers/hid/uhid.c                        | 2 +-
 drivers/hwmon/fschmd.c                    | 2 +-
 drivers/hwmon/w83793.c                    | 2 +-
 drivers/infiniband/core/ucm.c             | 2 +-
 drivers/infiniband/core/ucma.c            | 2 +-
 drivers/infiniband/core/user_mad.c        | 2 +-
 drivers/infiniband/core/uverbs_main.c     | 2 +-
 drivers/input/evdev.c                     | 2 +-
 drivers/input/joydev.c                    | 2 +-
 drivers/input/misc/uinput.c               | 2 +-
 drivers/isdn/capi/capi.c                  | 2 +-
 drivers/leds/uleds.c                      | 2 +-
 drivers/media/rc/lirc_dev.c               | 2 +-
 drivers/pci/switch/switchtec.c            | 2 +-
 drivers/platform/chrome/cros_ec_debugfs.c | 2 +-
 drivers/rtc/rtc-ds1374.c                  | 2 +-
 drivers/rtc/rtc-m41t80.c                  | 2 +-
 drivers/s390/char/fs3270.c                | 2 +-
 drivers/s390/char/tape_char.c             | 2 +-
 drivers/s390/char/zcore.c                 | 2 +-
 drivers/s390/crypto/zcrypt_api.c          | 2 +-
 drivers/spi/spidev.c                      | 2 +-
 drivers/staging/pi433/pi433_if.c          | 2 +-
 drivers/usb/misc/ldusb.c                  | 2 +-
 drivers/watchdog/acquirewdt.c             | 2 +-
 drivers/watchdog/advantechwdt.c           | 2 +-
 drivers/watchdog/alim1535_wdt.c           | 2 +-
 drivers/watchdog/alim7101_wdt.c           | 2 +-
 drivers/watchdog/ar7_wdt.c                | 2 +-
 drivers/watchdog/at91rm9200_wdt.c         | 2 +-
 drivers/watchdog/ath79_wdt.c              | 2 +-
 drivers/watchdog/bcm63xx_wdt.c            | 2 +-
 drivers/watchdog/cpu5wdt.c                | 2 +-
 drivers/watchdog/cpwd.c                   | 2 +-
 drivers/watchdog/eurotechwdt.c            | 2 +-
 drivers/watchdog/f71808e_wdt.c            | 2 +-
 drivers/watchdog/gef_wdt.c                | 2 +-
 drivers/watchdog/geodewdt.c               | 2 +-
 drivers/watchdog/ib700wdt.c               | 2 +-
 drivers/watchdog/ibmasr.c                 | 2 +-
 drivers/watchdog/indydog.c                | 2 +-
 drivers/watchdog/intel_scu_watchdog.c     | 2 +-
 drivers/watchdog/iop_wdt.c                | 2 +-
 drivers/watchdog/it8712f_wdt.c            | 2 +-
 drivers/watchdog/ixp4xx_wdt.c             | 2 +-
 drivers/watchdog/ks8695_wdt.c             | 2 +-
 drivers/watchdog/m54xx_wdt.c              | 2 +-
 drivers/watchdog/machzwd.c                | 2 +-
 drivers/watchdog/mixcomwd.c               | 2 +-
 drivers/watchdog/mtx-1_wdt.c              | 2 +-
 drivers/watchdog/mv64x60_wdt.c            | 2 +-
 drivers/watchdog/nuc900_wdt.c             | 2 +-
 drivers/watchdog/nv_tco.c                 | 2 +-
 drivers/watchdog/pc87413_wdt.c            | 2 +-
 drivers/watchdog/pcwd.c                   | 4 ++--
 drivers/watchdog/pcwd_pci.c               | 4 ++--
 drivers/watchdog/pcwd_usb.c               | 4 ++--
 drivers/watchdog/pika_wdt.c               | 2 +-
 drivers/watchdog/pnx833x_wdt.c            | 2 +-
 drivers/watchdog/rc32434_wdt.c            | 2 +-
 drivers/watchdog/rdc321x_wdt.c            | 2 +-
 drivers/watchdog/riowd.c                  | 2 +-
 drivers/watchdog/sa1100_wdt.c             | 2 +-
 drivers/watchdog/sb_wdog.c                | 2 +-
 drivers/watchdog/sbc60xxwdt.c             | 2 +-
 drivers/watchdog/sbc7240_wdt.c            | 2 +-
 drivers/watchdog/sbc8360.c                | 2 +-
 drivers/watchdog/sbc_epx_c3.c             | 2 +-
 drivers/watchdog/sbc_fitpc2_wdt.c         | 2 +-
 drivers/watchdog/sc1200wdt.c              | 2 +-
 drivers/watchdog/sc520_wdt.c              | 2 +-
 drivers/watchdog/sch311x_wdt.c            | 2 +-
 drivers/watchdog/scx200_wdt.c             | 2 +-
 drivers/watchdog/smsc37b787_wdt.c         | 2 +-
 drivers/watchdog/w83877f_wdt.c            | 2 +-
 drivers/watchdog/w83977f_wdt.c            | 2 +-
 drivers/watchdog/wafer5823wdt.c           | 2 +-
 drivers/watchdog/watchdog_dev.c           | 2 +-
 drivers/watchdog/wdrtas.c                 | 4 ++--
 drivers/watchdog/wdt.c                    | 4 ++--
 drivers/watchdog/wdt285.c                 | 2 +-
 drivers/watchdog/wdt977.c                 | 2 +-
 drivers/watchdog/wdt_pci.c                | 4 ++--
 drivers/xen/evtchn.c                      | 2 +-
 net/batman-adv/icmp_socket.c              | 2 +-
 net/batman-adv/log.c                      | 2 +-
 net/rfkill/core.c                         | 2 +-
 sound/core/control.c                      | 2 +-
 sound/core/rawmidi.c                      | 2 +-
 sound/core/seq/seq_clientmgr.c            | 2 +-
 sound/core/timer.c                        | 2 +-
 103 files changed, 109 insertions(+), 109 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
index 17cf249b18ee..3cb2f07ce8eb 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
@@ -628,7 +628,7 @@ static int mpc52xx_wdt_open(struct inode *inode, struct file *file)
 	}
 
 	file->private_data = mpc52xx_gpt_wdt;
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int mpc52xx_wdt_release(struct inode *inode, struct file *file)
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index 48c2477e7e2a..bfb9ca99ac05 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -588,7 +588,7 @@ static int spufs_pipe_open(struct inode *inode, struct file *file)
 	struct spufs_inode_info *i = SPUFS_I(inode);
 	file->private_data = i->i_ctx;
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 /*
diff --git a/arch/um/drivers/harddog_kern.c b/arch/um/drivers/harddog_kern.c
index 6d381279b362..000cb69ba0bc 100644
--- a/arch/um/drivers/harddog_kern.c
+++ b/arch/um/drivers/harddog_kern.c
@@ -85,7 +85,7 @@ static int harddog_open(struct inode *inode, struct file *file)
 	timer_alive = 1;
 	spin_unlock(&lock);
 	mutex_unlock(&harddog_mutex);
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 err:
 	spin_unlock(&lock);
 	mutex_unlock(&harddog_mutex);
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 5260185cbf7b..639817729ed4 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -427,7 +427,7 @@ static int do_microcode_update(const void __user *buf, size_t size)
 
 static int microcode_open(struct inode *inode, struct file *file)
 {
-	return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM;
+	return capable(CAP_SYS_RAWIO) ? stream_open(inode, file) : -EPERM;
 }
 
 static ssize_t microcode_write(struct file *file, const char __user *buf,
diff --git a/drivers/char/ds1620.c b/drivers/char/ds1620.c
index a5ecf6dae02e..373f549525fe 100644
--- a/drivers/char/ds1620.c
+++ b/drivers/char/ds1620.c
@@ -212,7 +212,7 @@ static void ds1620_read_state(struct therm *therm)
 
 static int ds1620_open(struct inode *inode, struct file *file)
 {
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static ssize_t
diff --git a/drivers/char/dtlk.c b/drivers/char/dtlk.c
index 669c3311adc4..4fed8fafa0f0 100644
--- a/drivers/char/dtlk.c
+++ b/drivers/char/dtlk.c
@@ -302,7 +302,7 @@ static int dtlk_open(struct inode *inode, struct file *file)
 	case DTLK_MINOR:
 		if (dtlk_busy)
 			return -EBUSY;
-		return nonseekable_open(inode, file);
+		return stream_open(inode, file);
 
 	default:
 		return -ENXIO;
diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index 2924a4bc4a32..74c6d1f34132 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -837,7 +837,7 @@ static int ipmi_open(struct inode *ino, struct file *filep)
 		 * first heartbeat.
 		 */
 		ipmi_start_timer_on_heartbeat = 1;
-		return nonseekable_open(ino, filep);
+		return stream_open(ino, filep);
 
 	default:
 		return (-ENODEV);
diff --git a/drivers/char/pcmcia/cm4000_cs.c b/drivers/char/pcmcia/cm4000_cs.c
index 7a4eb86aedac..15bf585af5d3 100644
--- a/drivers/char/pcmcia/cm4000_cs.c
+++ b/drivers/char/pcmcia/cm4000_cs.c
@@ -1682,7 +1682,7 @@ static int cmm_open(struct inode *inode, struct file *filp)
 	link->open = 1;		/* only one open per device */
 
 	DEBUGP(2, dev, "<- cmm_open\n");
-	ret = nonseekable_open(inode, filp);
+	ret = stream_open(inode, filp);
 out:
 	mutex_unlock(&cmm_mutex);
 	return ret;
diff --git a/drivers/char/pcmcia/scr24x_cs.c b/drivers/char/pcmcia/scr24x_cs.c
index f6b43d9350f0..04b39c3596cc 100644
--- a/drivers/char/pcmcia/scr24x_cs.c
+++ b/drivers/char/pcmcia/scr24x_cs.c
@@ -92,7 +92,7 @@ static int scr24x_open(struct inode *inode, struct file *filp)
 	kref_get(&dev->refcnt);
 	filp->private_data = dev;
 
-	return nonseekable_open(inode, filp);
+	return stream_open(inode, filp);
 }
 
 static int scr24x_release(struct inode *inode, struct file *filp)
diff --git a/drivers/char/tb0219.c b/drivers/char/tb0219.c
index 7c19d9b22785..e8614ea843e2 100644
--- a/drivers/char/tb0219.c
+++ b/drivers/char/tb0219.c
@@ -243,7 +243,7 @@ static int tanbac_tb0219_open(struct inode *inode, struct file *file)
 	case 16 ... 23:
 	case 32 ... 39:
 	case 48 ... 55:
-		return nonseekable_open(inode, file);
+		return stream_open(inode, file);
 	default:
 		break;
 	}
diff --git a/drivers/firewire/nosy.c b/drivers/firewire/nosy.c
index a128dd1126ae..515e96db4391 100644
--- a/drivers/firewire/nosy.c
+++ b/drivers/firewire/nosy.c
@@ -303,7 +303,7 @@ nosy_open(struct inode *inode, struct file *file)
 
 	file->private_data = client;
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 fail:
 	kfree(client);
 	lynx_put(lynx);
diff --git a/drivers/gnss/core.c b/drivers/gnss/core.c
index 320cfca80d5f..e6f94501cb28 100644
--- a/drivers/gnss/core.c
+++ b/drivers/gnss/core.c
@@ -42,7 +42,7 @@ static int gnss_open(struct inode *inode, struct file *file)
 
 	get_device(&gdev->dev);
 
-	nonseekable_open(inode, file);
+	stream_open(inode, file);
 	file->private_data = gdev;
 
 	down_write(&gdev->rwsem);
diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c
index 840634e0f1e3..dbaead0a5371 100644
--- a/drivers/hid/uhid.c
+++ b/drivers/hid/uhid.c
@@ -632,7 +632,7 @@ static int uhid_char_open(struct inode *inode, struct file *file)
 	INIT_WORK(&uhid->worker, uhid_device_add_worker);
 
 	file->private_data = uhid;
-	nonseekable_open(inode, file);
+	stream_open(inode, file);
 
 	return 0;
 }
diff --git a/drivers/hwmon/fschmd.c b/drivers/hwmon/fschmd.c
index 042a166e1858..8fb54079fac8 100644
--- a/drivers/hwmon/fschmd.c
+++ b/drivers/hwmon/fschmd.c
@@ -837,7 +837,7 @@ static int watchdog_open(struct inode *inode, struct file *filp)
 	watchdog_trigger(data);
 	filp->private_data = data;
 
-	return nonseekable_open(inode, filp);
+	return stream_open(inode, filp);
 }
 
 static int watchdog_release(struct inode *inode, struct file *filp)
diff --git a/drivers/hwmon/w83793.c b/drivers/hwmon/w83793.c
index 0af0f6283b35..e94ae1bb3cf0 100644
--- a/drivers/hwmon/w83793.c
+++ b/drivers/hwmon/w83793.c
@@ -1341,7 +1341,7 @@ static int watchdog_open(struct inode *inode, struct file *filp)
 	/* Store pointer to data into filp's private data */
 	filp->private_data = data;
 
-	return nonseekable_open(inode, filp);
+	return stream_open(inode, filp);
 }
 
 static int watchdog_close(struct inode *inode, struct file *filp)
diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
index 7541fbaf58a3..65c3230f5663 100644
--- a/drivers/infiniband/core/ucm.c
+++ b/drivers/infiniband/core/ucm.c
@@ -1175,7 +1175,7 @@ static int ib_ucm_open(struct inode *inode, struct file *filp)
 	file->filp = filp;
 	file->device = container_of(inode->i_cdev, struct ib_ucm_device, cdev);
 
-	return nonseekable_open(inode, filp);
+	return stream_open(inode, filp);
 }
 
 static int ib_ucm_close(struct inode *inode, struct file *filp)
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 7468b26b8a01..140a338a135f 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -1744,7 +1744,7 @@ static int ucma_open(struct inode *inode, struct file *filp)
 	filp->private_data = file;
 	file->filp = filp;
 
-	return nonseekable_open(inode, filp);
+	return stream_open(inode, filp);
 }
 
 static int ucma_close(struct inode *inode, struct file *filp)
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index 02b7947ab215..b58b07c03cfb 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -985,7 +985,7 @@ static int ib_umad_open(struct inode *inode, struct file *filp)
 
 	list_add_tail(&file->port_list, &port->file_list);
 
-	nonseekable_open(inode, filp);
+	stream_open(inode, filp);
 out:
 	mutex_unlock(&port->file_mutex);
 	return ret;
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index c489f545baae..8b43dd96d3b2 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -1132,7 +1132,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
 
 	setup_ufile_idr_uobject(file);
 
-	return nonseekable_open(inode, filp);
+	return stream_open(inode, filp);
 
 err_module:
 	module_put(ib_dev->owner);
diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c
index f48369d6f3a0..f040d8881ff2 100644
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -524,7 +524,7 @@ static int evdev_open(struct inode *inode, struct file *file)
 		goto err_free_client;
 
 	file->private_data = client;
-	nonseekable_open(inode, file);
+	stream_open(inode, file);
 
 	return 0;
 
diff --git a/drivers/input/joydev.c b/drivers/input/joydev.c
index 4c1e427dfabb..d806f6be4788 100644
--- a/drivers/input/joydev.c
+++ b/drivers/input/joydev.c
@@ -279,7 +279,7 @@ static int joydev_open(struct inode *inode, struct file *file)
 		goto err_free_client;
 
 	file->private_data = client;
-	nonseekable_open(inode, file);
+	stream_open(inode, file);
 
 	return 0;
 
diff --git a/drivers/input/misc/uinput.c b/drivers/input/misc/uinput.c
index 26ec603fe220..1a6762fc38f9 100644
--- a/drivers/input/misc/uinput.c
+++ b/drivers/input/misc/uinput.c
@@ -398,7 +398,7 @@ static int uinput_open(struct inode *inode, struct file *file)
 	newdev->state = UIST_NEW_DEVICE;
 
 	file->private_data = newdev;
-	nonseekable_open(inode, file);
+	stream_open(inode, file);
 
 	return 0;
 }
diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c
index e1da70a9530c..3c3ad42f22bf 100644
--- a/drivers/isdn/capi/capi.c
+++ b/drivers/isdn/capi/capi.c
@@ -960,7 +960,7 @@ static int capi_open(struct inode *inode, struct file *file)
 	list_add_tail(&cdev->list, &capidev_list);
 	mutex_unlock(&capidev_list_lock);
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int capi_release(struct inode *inode, struct file *file)
diff --git a/drivers/leds/uleds.c b/drivers/leds/uleds.c
index 0c43bfac9598..08b6a769ff8f 100644
--- a/drivers/leds/uleds.c
+++ b/drivers/leds/uleds.c
@@ -74,7 +74,7 @@ static int uleds_open(struct inode *inode, struct file *file)
 	udev->state = ULEDS_STATE_UNKNOWN;
 
 	file->private_data = udev;
-	nonseekable_open(inode, file);
+	stream_open(inode, file);
 
 	return 0;
 }
diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c
index f862f1b7f996..92db1e83c192 100644
--- a/drivers/media/rc/lirc_dev.c
+++ b/drivers/media/rc/lirc_dev.c
@@ -195,7 +195,7 @@ static int ir_lirc_open(struct inode *inode, struct file *file)
 	list_add(&fh->list, &dev->lirc_fh);
 	spin_unlock_irqrestore(&dev->lirc_fh_lock, flags);
 
-	nonseekable_open(inode, file);
+	stream_open(inode, file);
 
 	return 0;
 out_kfifo:
diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
index e22766c79fe9..0f7b80144863 100644
--- a/drivers/pci/switch/switchtec.c
+++ b/drivers/pci/switch/switchtec.c
@@ -390,7 +390,7 @@ static int switchtec_dev_open(struct inode *inode, struct file *filp)
 		return PTR_ERR(stuser);
 
 	filp->private_data = stuser;
-	nonseekable_open(inode, filp);
+	stream_open(inode, filp);
 
 	dev_dbg(&stdev->dev, "%s: %p\n", __func__, stuser);
 
diff --git a/drivers/platform/chrome/cros_ec_debugfs.c b/drivers/platform/chrome/cros_ec_debugfs.c
index 71308766e891..2b8e8a01a739 100644
--- a/drivers/platform/chrome/cros_ec_debugfs.c
+++ b/drivers/platform/chrome/cros_ec_debugfs.c
@@ -132,7 +132,7 @@ static int cros_ec_console_log_open(struct inode *inode, struct file *file)
 {
 	file->private_data = inode->i_private;
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static ssize_t cros_ec_console_log_read(struct file *file, char __user *buf,
diff --git a/drivers/rtc/rtc-ds1374.c b/drivers/rtc/rtc-ds1374.c
index 38a2e9e684df..225a8df1d4e9 100644
--- a/drivers/rtc/rtc-ds1374.c
+++ b/drivers/rtc/rtc-ds1374.c
@@ -467,7 +467,7 @@ static int ds1374_wdt_open(struct inode *inode, struct file *file)
 		 */
 		wdt_is_open = 1;
 		mutex_unlock(&ds1374->mutex);
-		return nonseekable_open(inode, file);
+		return stream_open(inode, file);
 	}
 	return -ENODEV;
 }
diff --git a/drivers/rtc/rtc-m41t80.c b/drivers/rtc/rtc-m41t80.c
index ebf50b1540f2..dd5a8991f75b 100644
--- a/drivers/rtc/rtc-m41t80.c
+++ b/drivers/rtc/rtc-m41t80.c
@@ -802,7 +802,7 @@ static int wdt_open(struct inode *inode, struct file *file)
 		 */
 		wdt_is_open = 1;
 		mutex_unlock(&m41t80_rtc_mutex);
-		return nonseekable_open(inode, file);
+		return stream_open(inode, file);
 	}
 	return -ENODEV;
 }
diff --git a/drivers/s390/char/fs3270.c b/drivers/s390/char/fs3270.c
index 8b48ba9c598e..4c4683d8784a 100644
--- a/drivers/s390/char/fs3270.c
+++ b/drivers/s390/char/fs3270.c
@@ -486,7 +486,7 @@ fs3270_open(struct inode *inode, struct file *filp)
 		raw3270_del_view(&fp->view);
 		goto out;
 	}
-	nonseekable_open(inode, filp);
+	stream_open(inode, filp);
 	filp->private_data = fp;
 out:
 	mutex_unlock(&fs3270_mutex);
diff --git a/drivers/s390/char/tape_char.c b/drivers/s390/char/tape_char.c
index fc206c9d1c56..ea4253939555 100644
--- a/drivers/s390/char/tape_char.c
+++ b/drivers/s390/char/tape_char.c
@@ -290,7 +290,7 @@ tapechar_open (struct inode *inode, struct file *filp)
 	rc = tape_open(device);
 	if (rc == 0) {
 		filp->private_data = device;
-		nonseekable_open(inode, filp);
+		stream_open(inode, filp);
 	} else
 		tape_put_device(device);
 
diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c
index 76d3c50bf078..a57e1c55094f 100644
--- a/drivers/s390/char/zcore.c
+++ b/drivers/s390/char/zcore.c
@@ -191,7 +191,7 @@ static ssize_t zcore_reipl_write(struct file *filp, const char __user *buf,
 
 static int zcore_reipl_open(struct inode *inode, struct file *filp)
 {
-	return nonseekable_open(inode, filp);
+	return stream_open(inode, filp);
 }
 
 static int zcore_reipl_release(struct inode *inode, struct file *filp)
diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c
index 689c2af7026a..6bfdc69a13e7 100644
--- a/drivers/s390/crypto/zcrypt_api.c
+++ b/drivers/s390/crypto/zcrypt_api.c
@@ -525,7 +525,7 @@ static int zcrypt_open(struct inode *inode, struct file *filp)
 	filp->private_data = (void *) perms;
 
 	atomic_inc(&zcrypt_open_count);
-	return nonseekable_open(inode, filp);
+	return stream_open(inode, filp);
 }
 
 /**
diff --git a/drivers/spi/spidev.c b/drivers/spi/spidev.c
index b0c76e2626ce..7fd0d9943160 100644
--- a/drivers/spi/spidev.c
+++ b/drivers/spi/spidev.c
@@ -591,7 +591,7 @@ static int spidev_open(struct inode *inode, struct file *filp)
 
 	spidev->users++;
 	filp->private_data = spidev;
-	nonseekable_open(inode, filp);
+	stream_open(inode, filp);
 
 	mutex_unlock(&device_list_lock);
 	return 0;
diff --git a/drivers/staging/pi433/pi433_if.c b/drivers/staging/pi433/pi433_if.c
index b2314636dc89..2299a11b1878 100644
--- a/drivers/staging/pi433/pi433_if.c
+++ b/drivers/staging/pi433/pi433_if.c
@@ -971,7 +971,7 @@ static int pi433_open(struct inode *inode, struct file *filp)
 
 	/* instance data as context */
 	filp->private_data = instance;
-	nonseekable_open(inode, filp);
+	stream_open(inode, filp);
 
 	return 0;
 }
diff --git a/drivers/usb/misc/ldusb.c b/drivers/usb/misc/ldusb.c
index 006762b72ff5..6581774bdfa4 100644
--- a/drivers/usb/misc/ldusb.c
+++ b/drivers/usb/misc/ldusb.c
@@ -307,7 +307,7 @@ static int ld_usb_open(struct inode *inode, struct file *file)
 	int retval;
 	struct usb_interface *interface;
 
-	nonseekable_open(inode, file);
+	stream_open(inode, file);
 	subminor = iminor(inode);
 
 	interface = usb_find_interface(&ld_usb_driver, subminor);
diff --git a/drivers/watchdog/acquirewdt.c b/drivers/watchdog/acquirewdt.c
index d6210d946082..957d1255d4ca 100644
--- a/drivers/watchdog/acquirewdt.c
+++ b/drivers/watchdog/acquirewdt.c
@@ -200,7 +200,7 @@ static int acq_open(struct inode *inode, struct file *file)
 
 	/* Activate */
 	acq_keepalive();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int acq_close(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/advantechwdt.c b/drivers/watchdog/advantechwdt.c
index f61944369c1a..2766af292a71 100644
--- a/drivers/watchdog/advantechwdt.c
+++ b/drivers/watchdog/advantechwdt.c
@@ -199,7 +199,7 @@ static int advwdt_open(struct inode *inode, struct file *file)
 	 */
 
 	advwdt_ping();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int advwdt_close(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/alim1535_wdt.c b/drivers/watchdog/alim1535_wdt.c
index 60f0c2eb8531..39a07bb5f6d5 100644
--- a/drivers/watchdog/alim1535_wdt.c
+++ b/drivers/watchdog/alim1535_wdt.c
@@ -249,7 +249,7 @@ static int ali_open(struct inode *inode, struct file *file)
 
 	/* Activate */
 	ali_start();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 /*
diff --git a/drivers/watchdog/alim7101_wdt.c b/drivers/watchdog/alim7101_wdt.c
index 12f7ea62dddd..7e9884960eb9 100644
--- a/drivers/watchdog/alim7101_wdt.c
+++ b/drivers/watchdog/alim7101_wdt.c
@@ -214,7 +214,7 @@ static int fop_open(struct inode *inode, struct file *file)
 		return -EBUSY;
 	/* Good, fire up the show */
 	wdt_startup();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int fop_close(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/ar7_wdt.c b/drivers/watchdog/ar7_wdt.c
index ee1ab12ab04f..b9b2d06b3879 100644
--- a/drivers/watchdog/ar7_wdt.c
+++ b/drivers/watchdog/ar7_wdt.c
@@ -163,7 +163,7 @@ static int ar7_wdt_open(struct inode *inode, struct file *file)
 	ar7_wdt_enable_wdt();
 	expect_close = 0;
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int ar7_wdt_release(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/at91rm9200_wdt.c b/drivers/watchdog/at91rm9200_wdt.c
index b45fc0aee667..907a4545dee6 100644
--- a/drivers/watchdog/at91rm9200_wdt.c
+++ b/drivers/watchdog/at91rm9200_wdt.c
@@ -110,7 +110,7 @@ static int at91_wdt_open(struct inode *inode, struct file *file)
 		return -EBUSY;
 
 	at91_wdt_start();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 /*
diff --git a/drivers/watchdog/ath79_wdt.c b/drivers/watchdog/ath79_wdt.c
index e2209bf5fa8a..4f56b63f9691 100644
--- a/drivers/watchdog/ath79_wdt.c
+++ b/drivers/watchdog/ath79_wdt.c
@@ -132,7 +132,7 @@ static int ath79_wdt_open(struct inode *inode, struct file *file)
 	clear_bit(WDT_FLAGS_EXPECT_CLOSE, &wdt_flags);
 	ath79_wdt_enable();
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int ath79_wdt_release(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/bcm63xx_wdt.c b/drivers/watchdog/bcm63xx_wdt.c
index d3c1113e774c..e2af37c9a266 100644
--- a/drivers/watchdog/bcm63xx_wdt.c
+++ b/drivers/watchdog/bcm63xx_wdt.c
@@ -116,7 +116,7 @@ static int bcm63xx_wdt_open(struct inode *inode, struct file *file)
 		return -EBUSY;
 
 	bcm63xx_wdt_start();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int bcm63xx_wdt_release(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/cpu5wdt.c b/drivers/watchdog/cpu5wdt.c
index 6cfb102c397c..475360de6e9e 100644
--- a/drivers/watchdog/cpu5wdt.c
+++ b/drivers/watchdog/cpu5wdt.c
@@ -140,7 +140,7 @@ static int cpu5wdt_open(struct inode *inode, struct file *file)
 {
 	if (test_and_set_bit(0, &cpu5wdt_device.inuse))
 		return -EBUSY;
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int cpu5wdt_release(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/cpwd.c b/drivers/watchdog/cpwd.c
index 32156e199c51..b5b078bdebe6 100644
--- a/drivers/watchdog/cpwd.c
+++ b/drivers/watchdog/cpwd.c
@@ -394,7 +394,7 @@ static int cpwd_open(struct inode *inode, struct file *f)
 
 	mutex_unlock(&cpwd_mutex);
 
-	return nonseekable_open(inode, f);
+	return stream_open(inode, f);
 }
 
 static int cpwd_release(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/eurotechwdt.c b/drivers/watchdog/eurotechwdt.c
index 47f77a6fdfd6..89129e6fa9b6 100644
--- a/drivers/watchdog/eurotechwdt.c
+++ b/drivers/watchdog/eurotechwdt.c
@@ -316,7 +316,7 @@ static int eurwdt_open(struct inode *inode, struct file *file)
 	eurwdt_timeout = WDT_TIMEOUT;	/* initial timeout */
 	/* Activate the WDT */
 	eurwdt_activate_timer();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 /**
diff --git a/drivers/watchdog/f71808e_wdt.c b/drivers/watchdog/f71808e_wdt.c
index 9a1c761258ce..021c6ace9462 100644
--- a/drivers/watchdog/f71808e_wdt.c
+++ b/drivers/watchdog/f71808e_wdt.c
@@ -525,7 +525,7 @@ static int watchdog_open(struct inode *inode, struct file *file)
 		__module_get(THIS_MODULE);
 
 	watchdog.expect_close = 0;
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int watchdog_release(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/gef_wdt.c b/drivers/watchdog/gef_wdt.c
index 006e2348022c..26350b319505 100644
--- a/drivers/watchdog/gef_wdt.c
+++ b/drivers/watchdog/gef_wdt.c
@@ -229,7 +229,7 @@ static int gef_wdt_open(struct inode *inode, struct file *file)
 
 	gef_wdt_handler_enable();
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int gef_wdt_release(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/geodewdt.c b/drivers/watchdog/geodewdt.c
index 88e01238f01b..c5a727da6657 100644
--- a/drivers/watchdog/geodewdt.c
+++ b/drivers/watchdog/geodewdt.c
@@ -92,7 +92,7 @@ static int geodewdt_open(struct inode *inode, struct file *file)
 		__module_get(THIS_MODULE);
 
 	geodewdt_ping();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int geodewdt_release(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/ib700wdt.c b/drivers/watchdog/ib700wdt.c
index cc262284a6aa..30d6cec582af 100644
--- a/drivers/watchdog/ib700wdt.c
+++ b/drivers/watchdog/ib700wdt.c
@@ -238,7 +238,7 @@ static int ibwdt_open(struct inode *inode, struct file *file)
 
 	/* Activate */
 	ibwdt_ping();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int ibwdt_close(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/ibmasr.c b/drivers/watchdog/ibmasr.c
index 366b0474f278..897f7eda9e6a 100644
--- a/drivers/watchdog/ibmasr.c
+++ b/drivers/watchdog/ibmasr.c
@@ -323,7 +323,7 @@ static int asr_open(struct inode *inode, struct file *file)
 	asr_toggle();
 	asr_enable();
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int asr_release(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/indydog.c b/drivers/watchdog/indydog.c
index 5d20cdd30efe..5592b975fe3a 100644
--- a/drivers/watchdog/indydog.c
+++ b/drivers/watchdog/indydog.c
@@ -77,7 +77,7 @@ static int indydog_open(struct inode *inode, struct file *file)
 
 	pr_info("Started watchdog timer\n");
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int indydog_release(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/intel_scu_watchdog.c b/drivers/watchdog/intel_scu_watchdog.c
index 0caab6241eb7..3181a72c7ddf 100644
--- a/drivers/watchdog/intel_scu_watchdog.c
+++ b/drivers/watchdog/intel_scu_watchdog.c
@@ -304,7 +304,7 @@ static int intel_scu_open(struct inode *inode, struct file *file)
 	if (watchdog_device.driver_closed)
 		return -EPERM;
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int intel_scu_release(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/iop_wdt.c b/drivers/watchdog/iop_wdt.c
index b16013ffacc2..d910a7dec21b 100644
--- a/drivers/watchdog/iop_wdt.c
+++ b/drivers/watchdog/iop_wdt.c
@@ -101,7 +101,7 @@ static int iop_wdt_open(struct inode *inode, struct file *file)
 	clear_bit(WDT_OK_TO_CLOSE, &wdt_status);
 	wdt_enable();
 	set_bit(WDT_ENABLED, &wdt_status);
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static ssize_t iop_wdt_write(struct file *file, const char *data, size_t len,
diff --git a/drivers/watchdog/it8712f_wdt.c b/drivers/watchdog/it8712f_wdt.c
index 41b3979a9d87..b1567240a0e6 100644
--- a/drivers/watchdog/it8712f_wdt.c
+++ b/drivers/watchdog/it8712f_wdt.c
@@ -327,7 +327,7 @@ static int it8712f_wdt_open(struct inode *inode, struct file *file)
 	ret = it8712f_wdt_enable();
 	if (ret)
 		return ret;
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int it8712f_wdt_release(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/ixp4xx_wdt.c b/drivers/watchdog/ixp4xx_wdt.c
index f20cc53ff719..dd139cda936c 100644
--- a/drivers/watchdog/ixp4xx_wdt.c
+++ b/drivers/watchdog/ixp4xx_wdt.c
@@ -65,7 +65,7 @@ static int ixp4xx_wdt_open(struct inode *inode, struct file *file)
 
 	clear_bit(WDT_OK_TO_CLOSE, &wdt_status);
 	wdt_enable();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static ssize_t
diff --git a/drivers/watchdog/ks8695_wdt.c b/drivers/watchdog/ks8695_wdt.c
index 1e41818a44bc..0565cf30017b 100644
--- a/drivers/watchdog/ks8695_wdt.c
+++ b/drivers/watchdog/ks8695_wdt.c
@@ -142,7 +142,7 @@ static int ks8695_wdt_open(struct inode *inode, struct file *file)
 		return -EBUSY;
 
 	ks8695_wdt_start();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 /*
diff --git a/drivers/watchdog/m54xx_wdt.c b/drivers/watchdog/m54xx_wdt.c
index da6fa2b68074..752d03620f0a 100644
--- a/drivers/watchdog/m54xx_wdt.c
+++ b/drivers/watchdog/m54xx_wdt.c
@@ -85,7 +85,7 @@ static int m54xx_wdt_open(struct inode *inode, struct file *file)
 
 	clear_bit(WDT_OK_TO_CLOSE, &wdt_status);
 	wdt_enable();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static ssize_t m54xx_wdt_write(struct file *file, const char *data,
diff --git a/drivers/watchdog/machzwd.c b/drivers/watchdog/machzwd.c
index 88d823d87a4b..53759415cf06 100644
--- a/drivers/watchdog/machzwd.c
+++ b/drivers/watchdog/machzwd.c
@@ -333,7 +333,7 @@ static int zf_open(struct inode *inode, struct file *file)
 	if (nowayout)
 		__module_get(THIS_MODULE);
 	zf_timer_on();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int zf_close(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/mixcomwd.c b/drivers/watchdog/mixcomwd.c
index 3cc07447c655..ece56db0a379 100644
--- a/drivers/watchdog/mixcomwd.c
+++ b/drivers/watchdog/mixcomwd.c
@@ -150,7 +150,7 @@ static int mixcomwd_open(struct inode *inode, struct file *file)
 			mixcomwd_timer_alive = 0;
 		}
 	}
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int mixcomwd_release(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/mtx-1_wdt.c b/drivers/watchdog/mtx-1_wdt.c
index e028e0a2eca0..25a92857b217 100644
--- a/drivers/watchdog/mtx-1_wdt.c
+++ b/drivers/watchdog/mtx-1_wdt.c
@@ -118,7 +118,7 @@ static int mtx1_wdt_open(struct inode *inode, struct file *file)
 {
 	if (test_and_set_bit(0, &mtx1_wdt_device.inuse))
 		return -EBUSY;
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 
diff --git a/drivers/watchdog/mv64x60_wdt.c b/drivers/watchdog/mv64x60_wdt.c
index 315275d7bab6..c785f4f0a196 100644
--- a/drivers/watchdog/mv64x60_wdt.c
+++ b/drivers/watchdog/mv64x60_wdt.c
@@ -133,7 +133,7 @@ static int mv64x60_wdt_open(struct inode *inode, struct file *file)
 
 	mv64x60_wdt_handler_enable();
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int mv64x60_wdt_release(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/nuc900_wdt.c b/drivers/watchdog/nuc900_wdt.c
index 830bd04ff911..8a36350bab7b 100644
--- a/drivers/watchdog/nuc900_wdt.c
+++ b/drivers/watchdog/nuc900_wdt.c
@@ -131,7 +131,7 @@ static int nuc900_wdt_open(struct inode *inode, struct file *file)
 
 	nuc900_wdt_start();
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int nuc900_wdt_close(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/nv_tco.c b/drivers/watchdog/nv_tco.c
index a0fabf6f92b0..98d4f5371cf4 100644
--- a/drivers/watchdog/nv_tco.c
+++ b/drivers/watchdog/nv_tco.c
@@ -161,7 +161,7 @@ static int nv_tco_open(struct inode *inode, struct file *file)
 	/* Reload and activate timer */
 	tco_timer_keepalive();
 	tco_timer_start();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int nv_tco_release(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/pc87413_wdt.c b/drivers/watchdog/pc87413_wdt.c
index 2ffa39b46970..ca21d6c240a3 100644
--- a/drivers/watchdog/pc87413_wdt.c
+++ b/drivers/watchdog/pc87413_wdt.c
@@ -286,7 +286,7 @@ static int pc87413_open(struct inode *inode, struct file *file)
 
 	pr_info("Watchdog enabled. Timeout set to %d minute(s).\n", timeout);
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 /**
diff --git a/drivers/watchdog/pcwd.c b/drivers/watchdog/pcwd.c
index b72ce68eacd3..a3415cf07c98 100644
--- a/drivers/watchdog/pcwd.c
+++ b/drivers/watchdog/pcwd.c
@@ -695,7 +695,7 @@ static int pcwd_open(struct inode *inode, struct file *file)
 	/* Activate */
 	pcwd_start();
 	pcwd_keepalive();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int pcwd_close(struct inode *inode, struct file *file)
@@ -734,7 +734,7 @@ static int pcwd_temp_open(struct inode *inode, struct file *file)
 	if (!pcwd_private.supports_temp)
 		return -ENODEV;
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int pcwd_temp_close(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/pcwd_pci.c b/drivers/watchdog/pcwd_pci.c
index 1f78f0908621..5773d2591d3f 100644
--- a/drivers/watchdog/pcwd_pci.c
+++ b/drivers/watchdog/pcwd_pci.c
@@ -578,7 +578,7 @@ static int pcipcwd_open(struct inode *inode, struct file *file)
 	/* Activate */
 	pcipcwd_start();
 	pcipcwd_keepalive();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int pcipcwd_release(struct inode *inode, struct file *file)
@@ -620,7 +620,7 @@ static int pcipcwd_temp_open(struct inode *inode, struct file *file)
 	if (!pcipcwd_private.supports_temp)
 		return -ENODEV;
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int pcipcwd_temp_release(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/pcwd_usb.c b/drivers/watchdog/pcwd_usb.c
index 4d02f26156f9..5de6182dae33 100644
--- a/drivers/watchdog/pcwd_usb.c
+++ b/drivers/watchdog/pcwd_usb.c
@@ -485,7 +485,7 @@ static int usb_pcwd_open(struct inode *inode, struct file *file)
 	/* Activate */
 	usb_pcwd_start(usb_pcwd_device);
 	usb_pcwd_keepalive(usb_pcwd_device);
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int usb_pcwd_release(struct inode *inode, struct file *file)
@@ -524,7 +524,7 @@ static ssize_t usb_pcwd_temperature_read(struct file *file, char __user *data,
 
 static int usb_pcwd_temperature_open(struct inode *inode, struct file *file)
 {
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int usb_pcwd_temperature_release(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/pika_wdt.c b/drivers/watchdog/pika_wdt.c
index bb97f5b2f7eb..8938b3fb2b2d 100644
--- a/drivers/watchdog/pika_wdt.c
+++ b/drivers/watchdog/pika_wdt.c
@@ -118,7 +118,7 @@ static int pikawdt_open(struct inode *inode, struct file *file)
 
 	pikawdt_start();
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 /*
diff --git a/drivers/watchdog/pnx833x_wdt.c b/drivers/watchdog/pnx833x_wdt.c
index 882fdcb46ad1..312899f39fd2 100644
--- a/drivers/watchdog/pnx833x_wdt.c
+++ b/drivers/watchdog/pnx833x_wdt.c
@@ -116,7 +116,7 @@ static int pnx833x_wdt_open(struct inode *inode, struct file *file)
 
 	pr_info("Started watchdog timer\n");
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int pnx833x_wdt_release(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/rc32434_wdt.c b/drivers/watchdog/rc32434_wdt.c
index 3a75f3b53452..e74d5cf272ab 100644
--- a/drivers/watchdog/rc32434_wdt.c
+++ b/drivers/watchdog/rc32434_wdt.c
@@ -150,7 +150,7 @@ static int rc32434_wdt_open(struct inode *inode, struct file *file)
 	rc32434_wdt_start();
 	rc32434_wdt_ping();
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int rc32434_wdt_release(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/rdc321x_wdt.c b/drivers/watchdog/rdc321x_wdt.c
index a281aa84bfb1..4382e9556860 100644
--- a/drivers/watchdog/rdc321x_wdt.c
+++ b/drivers/watchdog/rdc321x_wdt.c
@@ -142,7 +142,7 @@ static int rdc321x_wdt_open(struct inode *inode, struct file *file)
 	if (test_and_set_bit(0, &rdc321x_wdt_device.inuse))
 		return -EBUSY;
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int rdc321x_wdt_release(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/riowd.c b/drivers/watchdog/riowd.c
index aba53424605e..f7f7a7a62022 100644
--- a/drivers/watchdog/riowd.c
+++ b/drivers/watchdog/riowd.c
@@ -76,7 +76,7 @@ static void riowd_writereg(struct riowd *p, u8 val, int index)
 
 static int riowd_open(struct inode *inode, struct file *filp)
 {
-	nonseekable_open(inode, filp);
+	stream_open(inode, filp);
 	return 0;
 }
 
diff --git a/drivers/watchdog/sa1100_wdt.c b/drivers/watchdog/sa1100_wdt.c
index d3be4f831db5..bfa035e1a75e 100644
--- a/drivers/watchdog/sa1100_wdt.c
+++ b/drivers/watchdog/sa1100_wdt.c
@@ -59,7 +59,7 @@ static int sa1100dog_open(struct inode *inode, struct file *file)
 	writel_relaxed(OSSR_M3, OSSR);
 	writel_relaxed(OWER_WME, OWER);
 	writel_relaxed(readl_relaxed(OIER) | OIER_E3, OIER);
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 /*
diff --git a/drivers/watchdog/sb_wdog.c b/drivers/watchdog/sb_wdog.c
index 3abae50773b8..0692d42e5c67 100644
--- a/drivers/watchdog/sb_wdog.c
+++ b/drivers/watchdog/sb_wdog.c
@@ -105,7 +105,7 @@ static const struct watchdog_info ident = {
  */
 static int sbwdog_open(struct inode *inode, struct file *file)
 {
-	nonseekable_open(inode, file);
+	stream_open(inode, file);
 	if (test_and_set_bit(0, &sbwdog_gate))
 		return -EBUSY;
 	__module_get(THIS_MODULE);
diff --git a/drivers/watchdog/sbc60xxwdt.c b/drivers/watchdog/sbc60xxwdt.c
index 72d15fd1f183..4d127a91cbdc 100644
--- a/drivers/watchdog/sbc60xxwdt.c
+++ b/drivers/watchdog/sbc60xxwdt.c
@@ -208,7 +208,7 @@ static int fop_open(struct inode *inode, struct file *file)
 
 	/* Good, fire up the show */
 	wdt_startup();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int fop_close(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/sbc7240_wdt.c b/drivers/watchdog/sbc7240_wdt.c
index 5f268add17ce..efc81b318939 100644
--- a/drivers/watchdog/sbc7240_wdt.c
+++ b/drivers/watchdog/sbc7240_wdt.c
@@ -136,7 +136,7 @@ static int fop_open(struct inode *inode, struct file *file)
 
 	wdt_enable();
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int fop_close(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/sbc8360.c b/drivers/watchdog/sbc8360.c
index da60560ca446..3396024e7b76 100644
--- a/drivers/watchdog/sbc8360.c
+++ b/drivers/watchdog/sbc8360.c
@@ -271,7 +271,7 @@ static int sbc8360_open(struct inode *inode, struct file *file)
 	/* Activate and ping once to start the countdown */
 	sbc8360_activate();
 	sbc8360_ping();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int sbc8360_close(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/sbc_epx_c3.c b/drivers/watchdog/sbc_epx_c3.c
index a1c502e0d8ec..783037ffd7d8 100644
--- a/drivers/watchdog/sbc_epx_c3.c
+++ b/drivers/watchdog/sbc_epx_c3.c
@@ -78,7 +78,7 @@ static int epx_c3_open(struct inode *inode, struct file *file)
 	epx_c3_alive = 1;
 	pr_info("Started watchdog timer\n");
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int epx_c3_release(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/sbc_fitpc2_wdt.c b/drivers/watchdog/sbc_fitpc2_wdt.c
index a517d8bae757..3822a60a8d2b 100644
--- a/drivers/watchdog/sbc_fitpc2_wdt.c
+++ b/drivers/watchdog/sbc_fitpc2_wdt.c
@@ -75,7 +75,7 @@ static int fitpc2_wdt_open(struct inode *inode, struct file *file)
 
 	wdt_enable();
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static ssize_t fitpc2_wdt_write(struct file *file, const char *data,
diff --git a/drivers/watchdog/sc1200wdt.c b/drivers/watchdog/sc1200wdt.c
index e035a4d4b299..3c2e9355410a 100644
--- a/drivers/watchdog/sc1200wdt.c
+++ b/drivers/watchdog/sc1200wdt.c
@@ -178,7 +178,7 @@ static int sc1200wdt_open(struct inode *inode, struct file *file)
 	sc1200wdt_start();
 	pr_info("Watchdog enabled, timeout = %d min(s)", timeout);
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 
diff --git a/drivers/watchdog/sc520_wdt.c b/drivers/watchdog/sc520_wdt.c
index 403542f9ed8d..44797414c886 100644
--- a/drivers/watchdog/sc520_wdt.c
+++ b/drivers/watchdog/sc520_wdt.c
@@ -258,7 +258,7 @@ static int fop_open(struct inode *inode, struct file *file)
 
 	/* Good, fire up the show */
 	wdt_startup();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int fop_close(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/sch311x_wdt.c b/drivers/watchdog/sch311x_wdt.c
index 814cdf539b0f..ed6e9fac5d74 100644
--- a/drivers/watchdog/sch311x_wdt.c
+++ b/drivers/watchdog/sch311x_wdt.c
@@ -316,7 +316,7 @@ static int sch311x_wdt_open(struct inode *inode, struct file *file)
 	 *	Activate
 	 */
 	sch311x_wdt_start();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int sch311x_wdt_close(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/scx200_wdt.c b/drivers/watchdog/scx200_wdt.c
index ec4063ebb41a..85f2d8e06cd0 100644
--- a/drivers/watchdog/scx200_wdt.c
+++ b/drivers/watchdog/scx200_wdt.c
@@ -102,7 +102,7 @@ static int scx200_wdt_open(struct inode *inode, struct file *file)
 		return -EBUSY;
 	scx200_wdt_enable();
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int scx200_wdt_release(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/smsc37b787_wdt.c b/drivers/watchdog/smsc37b787_wdt.c
index c768dcd53034..a22170775273 100644
--- a/drivers/watchdog/smsc37b787_wdt.c
+++ b/drivers/watchdog/smsc37b787_wdt.c
@@ -366,7 +366,7 @@ static int wb_smsc_wdt_open(struct inode *inode, struct file *file)
 	pr_info("Watchdog enabled. Timeout set to %d %s\n",
 		timeout, (unit == UNIT_SECOND) ? "second(s)" : "minute(s)");
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 /* close => shut off the timer */
diff --git a/drivers/watchdog/w83877f_wdt.c b/drivers/watchdog/w83877f_wdt.c
index db9b6488e388..8dd953f90680 100644
--- a/drivers/watchdog/w83877f_wdt.c
+++ b/drivers/watchdog/w83877f_wdt.c
@@ -224,7 +224,7 @@ static int fop_open(struct inode *inode, struct file *file)
 
 	/* Good, fire up the show */
 	wdt_startup();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int fop_close(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/w83977f_wdt.c b/drivers/watchdog/w83977f_wdt.c
index 672b61a7f9a3..184324c1edae 100644
--- a/drivers/watchdog/w83977f_wdt.c
+++ b/drivers/watchdog/w83977f_wdt.c
@@ -298,7 +298,7 @@ static int wdt_open(struct inode *inode, struct file *file)
 		__module_get(THIS_MODULE);
 
 	wdt_start();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int wdt_release(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/wafer5823wdt.c b/drivers/watchdog/wafer5823wdt.c
index 93c5b610e264..0a8073b419f8 100644
--- a/drivers/watchdog/wafer5823wdt.c
+++ b/drivers/watchdog/wafer5823wdt.c
@@ -197,7 +197,7 @@ static int wafwdt_open(struct inode *inode, struct file *file)
 	 *      Activate
 	 */
 	wafwdt_start();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int wafwdt_close(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c
index f6c24b22b37c..252a7c7b6592 100644
--- a/drivers/watchdog/watchdog_dev.c
+++ b/drivers/watchdog/watchdog_dev.c
@@ -825,7 +825,7 @@ static int watchdog_open(struct inode *inode, struct file *file)
 		kref_get(&wd_data->kref);
 
 	/* dev/watchdog is a virtual (and thus non-seekable) filesystem */
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 
 out_mod:
 	module_put(wd_data->wdd->ops->owner);
diff --git a/drivers/watchdog/wdrtas.c b/drivers/watchdog/wdrtas.c
index 0240c60d14e3..3c3ed512ce1e 100644
--- a/drivers/watchdog/wdrtas.c
+++ b/drivers/watchdog/wdrtas.c
@@ -376,7 +376,7 @@ static int wdrtas_open(struct inode *inode, struct file *file)
 	wdrtas_timer_start();
 	wdrtas_timer_keepalive();
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 /**
@@ -442,7 +442,7 @@ static ssize_t wdrtas_temp_read(struct file *file, char __user *buf,
  */
 static int wdrtas_temp_open(struct inode *inode, struct file *file)
 {
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 /**
diff --git a/drivers/watchdog/wdt.c b/drivers/watchdog/wdt.c
index e481fbbc4ae7..3d2f5ed60e88 100644
--- a/drivers/watchdog/wdt.c
+++ b/drivers/watchdog/wdt.c
@@ -421,7 +421,7 @@ static int wdt_open(struct inode *inode, struct file *file)
 	 *	Activate
 	 */
 	wdt_start();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 /**
@@ -481,7 +481,7 @@ static ssize_t wdt_temp_read(struct file *file, char __user *buf,
 
 static int wdt_temp_open(struct inode *inode, struct file *file)
 {
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 /**
diff --git a/drivers/watchdog/wdt285.c b/drivers/watchdog/wdt285.c
index ebbb183be618..68843e7f224d 100644
--- a/drivers/watchdog/wdt285.c
+++ b/drivers/watchdog/wdt285.c
@@ -101,7 +101,7 @@ static int watchdog_open(struct inode *inode, struct file *file)
 
 	ret = 0;
 #endif
-	nonseekable_open(inode, file);
+	stream_open(inode, file);
 	return ret;
 }
 
diff --git a/drivers/watchdog/wdt977.c b/drivers/watchdog/wdt977.c
index a8e6f87f60c9..59ed644dd4a9 100644
--- a/drivers/watchdog/wdt977.c
+++ b/drivers/watchdog/wdt977.c
@@ -273,7 +273,7 @@ static int wdt977_open(struct inode *inode, struct file *file)
 		__module_get(THIS_MODULE);
 
 	wdt977_start();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 static int wdt977_release(struct inode *inode, struct file *file)
diff --git a/drivers/watchdog/wdt_pci.c b/drivers/watchdog/wdt_pci.c
index 10e2cda0ee5a..ff3a41f47127 100644
--- a/drivers/watchdog/wdt_pci.c
+++ b/drivers/watchdog/wdt_pci.c
@@ -461,7 +461,7 @@ static int wdtpci_open(struct inode *inode, struct file *file)
 	 *	Activate
 	 */
 	wdtpci_start();
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 /**
@@ -524,7 +524,7 @@ static ssize_t wdtpci_temp_read(struct file *file, char __user *buf,
 
 static int wdtpci_temp_open(struct inode *inode, struct file *file)
 {
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 }
 
 /**
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index 6d1a5e58968f..f341b016672f 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -664,7 +664,7 @@ static int evtchn_open(struct inode *inode, struct file *filp)
 
 	filp->private_data = u;
 
-	return nonseekable_open(inode, filp);
+	return stream_open(inode, filp);
 }
 
 static int evtchn_release(struct inode *inode, struct file *filp)
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index 9859ababb82e..3ff32125f4b5 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -77,7 +77,7 @@ static int batadv_socket_open(struct inode *inode, struct file *file)
 
 	batadv_debugfs_deprecated(file, "");
 
-	nonseekable_open(inode, file);
+	stream_open(inode, file);
 
 	socket_client = kmalloc(sizeof(*socket_client), GFP_KERNEL);
 	if (!socket_client) {
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
index 3e610df8debf..e8ff13598c08 100644
--- a/net/batman-adv/log.c
+++ b/net/batman-adv/log.c
@@ -102,7 +102,7 @@ static int batadv_log_open(struct inode *inode, struct file *file)
 	batadv_debugfs_deprecated(file,
 				  "Use tracepoint batadv:batadv_dbg instead\n");
 
-	nonseekable_open(inode, file);
+	stream_open(inode, file);
 	file->private_data = inode->i_private;
 	return 0;
 }
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index abca57040f37..742e186bfadb 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -1143,7 +1143,7 @@ static int rfkill_fop_open(struct inode *inode, struct file *file)
 
 	file->private_data = data;
 
-	return nonseekable_open(inode, file);
+	return stream_open(inode, file);
 
  free:
 	mutex_unlock(&data->mtx);
diff --git a/sound/core/control.c b/sound/core/control.c
index fad7db402443..a5cc9a874062 100644
--- a/sound/core/control.c
+++ b/sound/core/control.c
@@ -54,7 +54,7 @@ static int snd_ctl_open(struct inode *inode, struct file *file)
 	struct snd_ctl_file *ctl;
 	int i, err;
 
-	err = nonseekable_open(inode, file);
+	err = stream_open(inode, file);
 	if (err < 0)
 		return err;
 
diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c
index c0690d1ecd55..4666bb366c0c 100644
--- a/sound/core/rawmidi.c
+++ b/sound/core/rawmidi.c
@@ -382,7 +382,7 @@ static int snd_rawmidi_open(struct inode *inode, struct file *file)
 	if ((file->f_flags & O_APPEND) && !(file->f_flags & O_NONBLOCK))
 		return -EINVAL;		/* invalid combination */
 
-	err = nonseekable_open(inode, file);
+	err = stream_open(inode, file);
 	if (err < 0)
 		return err;
 
diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c
index 38e7deab6384..a11bdc0350fc 100644
--- a/sound/core/seq/seq_clientmgr.c
+++ b/sound/core/seq/seq_clientmgr.c
@@ -307,7 +307,7 @@ static int snd_seq_open(struct inode *inode, struct file *file)
 	struct snd_seq_user_client *user;
 	int err;
 
-	err = nonseekable_open(inode, file);
+	err = stream_open(inode, file);
 	if (err < 0)
 		return err;
 
diff --git a/sound/core/timer.c b/sound/core/timer.c
index 61a0cec6e1f6..b842b61f66c2 100644
--- a/sound/core/timer.c
+++ b/sound/core/timer.c
@@ -1425,7 +1425,7 @@ static int snd_timer_user_open(struct inode *inode, struct file *file)
 	struct snd_timer_user *tu;
 	int err;
 
-	err = nonseekable_open(inode, file);
+	err = stream_open(inode, file);
 	if (err < 0)
 		return err;
 
-- 
cgit v1.2.3