From 08b79f62a1a3f6774cf27db4d12f3f938486c4f9 Mon Sep 17 00:00:00 2001
From: Manasi Navare <manasi.d.navare@intel.com>
Date: Fri, 20 Jan 2017 19:09:29 -0800
Subject: drm: Add definitions for DP compliance Video pattern tests

v4:
* Remove redundant single bit defs (Jani Nikula)
v3:
* Fix the conventions in bit definitions (Jani Nikula)
v2:
* Add all the other DP Complianec TEST register defs (Jani Nikula)
Cc: dri-devel@lists.freedesktop.org
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Daniel Vetter <daniel.vetter@intel.com>
Cc: Ville Syrjala <ville.syrjala@linux.intel.com>
Signed-off-by: Manasi Navare <manasi.d.navare@intel.com>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1484968170-12467-4-git-send-email-manasi.d.navare@intel.com
---
 include/drm/drm_dp_helper.h | 57 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

(limited to 'include')

diff --git a/include/drm/drm_dp_helper.h b/include/drm/drm_dp_helper.h
index 04681359a6f5..ba89295c8651 100644
--- a/include/drm/drm_dp_helper.h
+++ b/include/drm/drm_dp_helper.h
@@ -417,6 +417,63 @@
 #define DP_TEST_LANE_COUNT		    0x220
 
 #define DP_TEST_PATTERN			    0x221
+# define DP_NO_TEST_PATTERN                 0x0
+# define DP_COLOR_RAMP                      0x1
+# define DP_BLACK_AND_WHITE_VERTICAL_LINES  0x2
+# define DP_COLOR_SQUARE                    0x3
+
+#define DP_TEST_H_TOTAL_HI                  0x222
+#define DP_TEST_H_TOTAL_LO                  0x223
+
+#define DP_TEST_V_TOTAL_HI                  0x224
+#define DP_TEST_V_TOTAL_LO                  0x225
+
+#define DP_TEST_H_START_HI                  0x226
+#define DP_TEST_H_START_LO                  0x227
+
+#define DP_TEST_V_START_HI                  0x228
+#define DP_TEST_V_START_LO                  0x229
+
+#define DP_TEST_HSYNC_HI                    0x22A
+# define DP_TEST_HSYNC_POLARITY             (1 << 7)
+# define DP_TEST_HSYNC_WIDTH_HI_MASK        (127 << 0)
+#define DP_TEST_HSYNC_WIDTH_LO              0x22B
+
+#define DP_TEST_VSYNC_HI                    0x22C
+# define DP_TEST_VSYNC_POLARITY             (1 << 7)
+# define DP_TEST_VSYNC_WIDTH_HI_MASK        (127 << 0)
+#define DP_TEST_VSYNC_WIDTH_LO              0x22D
+
+#define DP_TEST_H_WIDTH_HI                  0x22E
+#define DP_TEST_H_WIDTH_LO                  0x22F
+
+#define DP_TEST_V_HEIGHT_HI                 0x230
+#define DP_TEST_V_HEIGHT_LO                 0x231
+
+#define DP_TEST_MISC0                       0x232
+# define DP_TEST_SYNC_CLOCK                 (1 << 0)
+# define DP_TEST_COLOR_FORMAT_MASK          (3 << 1)
+# define DP_TEST_COLOR_FORMAT_SHIFT         1
+# define DP_COLOR_FORMAT_RGB                (0 << 1)
+# define DP_COLOR_FORMAT_YCbCr422           (1 << 1)
+# define DP_COLOR_FORMAT_YCbCr444           (2 << 1)
+# define DP_TEST_DYNAMIC_RANGE_CEA          (1 << 3)
+# define DP_TEST_YCBCR_COEFFICIENTS         (1 << 4)
+# define DP_YCBCR_COEFFICIENTS_ITU601       (0 << 4)
+# define DP_YCBCR_COEFFICIENTS_ITU709       (1 << 4)
+# define DP_TEST_BIT_DEPTH_MASK             (7 << 5)
+# define DP_TEST_BIT_DEPTH_SHIFT            5
+# define DP_TEST_BIT_DEPTH_6                (0 << 5)
+# define DP_TEST_BIT_DEPTH_8                (1 << 5)
+# define DP_TEST_BIT_DEPTH_10               (2 << 5)
+# define DP_TEST_BIT_DEPTH_12               (3 << 5)
+# define DP_TEST_BIT_DEPTH_16               (4 << 5)
+
+#define DP_TEST_MISC1                       0x233
+# define DP_TEST_REFRESH_DENOMINATOR        (1 << 0)
+# define DP_TEST_INTERLACED                 (1 << 1)
+
+#define DP_TEST_REFRESH_RATE_NUMERATOR      0x234
 
 #define DP_TEST_CRC_R_CR		    0x240
 #define DP_TEST_CRC_G_Y			    0x242
-- 
cgit v1.2.3


From 2eebe4f2d5f4c91edc37801d828ba29edfbc7722 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Mon, 23 Jan 2017 11:42:59 +0200
Subject: drm/color: un-inline drm_color_lut_extract()

The function is not that big, but it's also not used for anything
performance critical. Make it a normal function.

As a side effect, this apparently makes sparse smarter about what it's
doing, and gets rid of the warning:

./include/drm/drm_color_mgmt.h:53:28: warning: shift too big (4294967295) for type unsigned long
./include/drm/drm_color_mgmt.h:53:28: warning: cast truncates bits from constant value (8000000000000000 becomes 0)

Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1485164579-16250-1-git-send-email-jani.nikula@intel.com
---
 drivers/gpu/drm/drm_color_mgmt.c | 24 ++++++++++++++++++++++++
 include/drm/drm_color_mgmt.h     | 27 ++-------------------------
 2 files changed, 26 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_color_mgmt.c b/drivers/gpu/drm/drm_color_mgmt.c
index 789b4c65cd69..5618f60c7690 100644
--- a/drivers/gpu/drm/drm_color_mgmt.c
+++ b/drivers/gpu/drm/drm_color_mgmt.c
@@ -87,6 +87,30 @@
  * "GAMMA_LUT" property above.
  */
 
+/**
+ * drm_color_lut_extract - clamp&round LUT entries
+ * @user_input: input value
+ * @bit_precision: number of bits the hw LUT supports
+ *
+ * Extract a degamma/gamma LUT value provided by user (in the form of
+ * &drm_color_lut entries) and round it to the precision supported by the
+ * hardware.
+ */
+uint32_t drm_color_lut_extract(uint32_t user_input, uint32_t bit_precision)
+{
+	uint32_t val = user_input;
+	uint32_t max = 0xffff >> (16 - bit_precision);
+
+	/* Round only if we're not using full precision. */
+	if (bit_precision < 16) {
+		val += 1UL << (16 - bit_precision - 1);
+		val >>= 16 - bit_precision;
+	}
+
+	return clamp_val(val, 0, max);
+}
+EXPORT_SYMBOL(drm_color_lut_extract);
+
 /**
  * drm_crtc_enable_color_mgmt - enable color management properties
  * @crtc: DRM CRTC
diff --git a/include/drm/drm_color_mgmt.h b/include/drm/drm_color_mgmt.h
index c767238ac9d5..bce4a532836d 100644
--- a/include/drm/drm_color_mgmt.h
+++ b/include/drm/drm_color_mgmt.h
@@ -25,6 +25,8 @@
 
 #include <linux/ctype.h>
 
+uint32_t drm_color_lut_extract(uint32_t user_input, uint32_t bit_precision);
+
 void drm_crtc_enable_color_mgmt(struct drm_crtc *crtc,
 				uint degamma_lut_size,
 				bool has_ctm,
@@ -33,29 +35,4 @@ void drm_crtc_enable_color_mgmt(struct drm_crtc *crtc,
 int drm_mode_crtc_set_gamma_size(struct drm_crtc *crtc,
 				 int gamma_size);
 
-/**
- * drm_color_lut_extract - clamp&round LUT entries
- * @user_input: input value
- * @bit_precision: number of bits the hw LUT supports
- *
- * Extract a degamma/gamma LUT value provided by user (in the form of
- * &drm_color_lut entries) and round it to the precision supported by the
- * hardware.
- */
-static inline uint32_t drm_color_lut_extract(uint32_t user_input,
-					     uint32_t bit_precision)
-{
-	uint32_t val = user_input;
-	uint32_t max = 0xffff >> (16 - bit_precision);
-
-	/* Round only if we're not using full precision. */
-	if (bit_precision < 16) {
-		val += 1UL << (16 - bit_precision - 1);
-		val >>= 16 - bit_precision;
-	}
-
-	return clamp_val(val, 0, max);
-}
-
-
 #endif
-- 
cgit v1.2.3


From 40f62bb75e354dddfd8d7b6d7b96c4ab93cb00f1 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Fri, 27 Jan 2017 16:11:29 +0200
Subject: Revert "drm/color: un-inline drm_color_lut_extract()"

This reverts commit 2eebe4f2d5f4c91edc37801d828ba29edfbc7722, because I
screwed up and applied it to the wrong branch.

Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/gpu/drm/drm_color_mgmt.c | 24 ------------------------
 include/drm/drm_color_mgmt.h     | 27 +++++++++++++++++++++++++--
 2 files changed, 25 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_color_mgmt.c b/drivers/gpu/drm/drm_color_mgmt.c
index 5618f60c7690..789b4c65cd69 100644
--- a/drivers/gpu/drm/drm_color_mgmt.c
+++ b/drivers/gpu/drm/drm_color_mgmt.c
@@ -87,30 +87,6 @@
  * "GAMMA_LUT" property above.
  */
 
-/**
- * drm_color_lut_extract - clamp&round LUT entries
- * @user_input: input value
- * @bit_precision: number of bits the hw LUT supports
- *
- * Extract a degamma/gamma LUT value provided by user (in the form of
- * &drm_color_lut entries) and round it to the precision supported by the
- * hardware.
- */
-uint32_t drm_color_lut_extract(uint32_t user_input, uint32_t bit_precision)
-{
-	uint32_t val = user_input;
-	uint32_t max = 0xffff >> (16 - bit_precision);
-
-	/* Round only if we're not using full precision. */
-	if (bit_precision < 16) {
-		val += 1UL << (16 - bit_precision - 1);
-		val >>= 16 - bit_precision;
-	}
-
-	return clamp_val(val, 0, max);
-}
-EXPORT_SYMBOL(drm_color_lut_extract);
-
 /**
  * drm_crtc_enable_color_mgmt - enable color management properties
  * @crtc: DRM CRTC
diff --git a/include/drm/drm_color_mgmt.h b/include/drm/drm_color_mgmt.h
index bce4a532836d..c767238ac9d5 100644
--- a/include/drm/drm_color_mgmt.h
+++ b/include/drm/drm_color_mgmt.h
@@ -25,8 +25,6 @@
 
 #include <linux/ctype.h>
 
-uint32_t drm_color_lut_extract(uint32_t user_input, uint32_t bit_precision);
-
 void drm_crtc_enable_color_mgmt(struct drm_crtc *crtc,
 				uint degamma_lut_size,
 				bool has_ctm,
@@ -35,4 +33,29 @@ void drm_crtc_enable_color_mgmt(struct drm_crtc *crtc,
 int drm_mode_crtc_set_gamma_size(struct drm_crtc *crtc,
 				 int gamma_size);
 
+/**
+ * drm_color_lut_extract - clamp&round LUT entries
+ * @user_input: input value
+ * @bit_precision: number of bits the hw LUT supports
+ *
+ * Extract a degamma/gamma LUT value provided by user (in the form of
+ * &drm_color_lut entries) and round it to the precision supported by the
+ * hardware.
+ */
+static inline uint32_t drm_color_lut_extract(uint32_t user_input,
+					     uint32_t bit_precision)
+{
+	uint32_t val = user_input;
+	uint32_t max = 0xffff >> (16 - bit_precision);
+
+	/* Round only if we're not using full precision. */
+	if (bit_precision < 16) {
+		val += 1UL << (16 - bit_precision - 1);
+		val >>= 16 - bit_precision;
+	}
+
+	return clamp_val(val, 0, max);
+}
+
+
 #endif
-- 
cgit v1.2.3


From f5b98461cb8167ba362ad9f74c41d126b7becea7 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Fri, 6 Jan 2017 19:32:01 +0100
Subject: random: use chacha20 for get_random_int/long

Now that our crng uses chacha20, we can rely on its speedy
characteristics for replacing MD5, while simultaneously achieving a
higher security guarantee. Before the idea was to use these functions if
you wanted random integers that aren't stupidly insecure but aren't
necessarily secure either, a vague gray zone, that hopefully was "good
enough" for its users. With chacha20, we can strengthen this claim,
since either we're using an rdrand-like instruction, or we're using the
same crng as /dev/urandom. And it's faster than what was before.

We could have chosen to replace this with a SipHash-derived function,
which might be slightly faster, but at the cost of having yet another
RNG construction in the kernel. By moving to chacha20, we have a single
RNG to analyze and verify, and we also already get good performance
improvements on all platforms.

Implementation-wise, rather than use a generic buffer for both
get_random_int/long and memcpy based on the size needs, we use a
specific buffer for 32-bit reads and for 64-bit reads. This way, we're
guaranteed to always have aligned accesses on all platforms. While
slightly more verbose in C, the assembly this generates is a lot
simpler than otherwise.

Finally, on 32-bit platforms where longs and ints are the same size,
we simply alias get_random_int to get_random_long.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Suggested-by: Theodore Ts'o <tytso@mit.edu>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 drivers/char/random.c  | 84 ++++++++++++++++++++++++++------------------------
 include/linux/random.h |  1 -
 init/main.c            |  1 -
 3 files changed, 43 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 9d147d456598..b800e5479b7d 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -2016,63 +2016,65 @@ struct ctl_table random_table[] = {
 };
 #endif 	/* CONFIG_SYSCTL */
 
-static u32 random_int_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned;
-
-int random_int_secret_init(void)
-{
-	get_random_bytes(random_int_secret, sizeof(random_int_secret));
-	return 0;
-}
-
-static DEFINE_PER_CPU(__u32 [MD5_DIGEST_WORDS], get_random_int_hash)
-		__aligned(sizeof(unsigned long));
+struct batched_entropy {
+	union {
+		unsigned long entropy_long[CHACHA20_BLOCK_SIZE / sizeof(unsigned long)];
+		unsigned int entropy_int[CHACHA20_BLOCK_SIZE / sizeof(unsigned int)];
+	};
+	unsigned int position;
+};
 
 /*
- * Get a random word for internal kernel use only. Similar to urandom but
- * with the goal of minimal entropy pool depletion. As a result, the random
- * value is not cryptographically secure but for several uses the cost of
- * depleting entropy is too high
+ * Get a random word for internal kernel use only. The quality of the random
+ * number is either as good as RDRAND or as good as /dev/urandom, with the
+ * goal of being quite fast and not depleting entropy.
  */
-unsigned int get_random_int(void)
+static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_long);
+unsigned long get_random_long(void)
 {
-	__u32 *hash;
-	unsigned int ret;
+	unsigned long ret;
+	struct batched_entropy *batch;
 
-	if (arch_get_random_int(&ret))
+	if (arch_get_random_long(&ret))
 		return ret;
 
-	hash = get_cpu_var(get_random_int_hash);
-
-	hash[0] += current->pid + jiffies + random_get_entropy();
-	md5_transform(hash, random_int_secret);
-	ret = hash[0];
-	put_cpu_var(get_random_int_hash);
-
+	batch = &get_cpu_var(batched_entropy_long);
+	if (batch->position % ARRAY_SIZE(batch->entropy_long) == 0) {
+		extract_crng((u8 *)batch->entropy_long);
+		batch->position = 0;
+	}
+	ret = batch->entropy_long[batch->position++];
+	put_cpu_var(batched_entropy_long);
 	return ret;
 }
-EXPORT_SYMBOL(get_random_int);
+EXPORT_SYMBOL(get_random_long);
 
-/*
- * Same as get_random_int(), but returns unsigned long.
- */
-unsigned long get_random_long(void)
+#if BITS_PER_LONG == 32
+unsigned int get_random_int(void)
 {
-	__u32 *hash;
-	unsigned long ret;
+	return get_random_long();
+}
+#else
+static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_int);
+unsigned int get_random_int(void)
+{
+	unsigned int ret;
+	struct batched_entropy *batch;
 
-	if (arch_get_random_long(&ret))
+	if (arch_get_random_int(&ret))
 		return ret;
 
-	hash = get_cpu_var(get_random_int_hash);
-
-	hash[0] += current->pid + jiffies + random_get_entropy();
-	md5_transform(hash, random_int_secret);
-	ret = *(unsigned long *)hash;
-	put_cpu_var(get_random_int_hash);
-
+	batch = &get_cpu_var(batched_entropy_int);
+	if (batch->position % ARRAY_SIZE(batch->entropy_int) == 0) {
+		extract_crng((u8 *)batch->entropy_int);
+		batch->position = 0;
+	}
+	ret = batch->entropy_int[batch->position++];
+	put_cpu_var(batched_entropy_int);
 	return ret;
 }
-EXPORT_SYMBOL(get_random_long);
+#endif
+EXPORT_SYMBOL(get_random_int);
 
 /**
  * randomize_page - Generate a random, page aligned address
diff --git a/include/linux/random.h b/include/linux/random.h
index 7bd2403e4fef..16ab429735a7 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -37,7 +37,6 @@ extern void get_random_bytes(void *buf, int nbytes);
 extern int add_random_ready_callback(struct random_ready_callback *rdy);
 extern void del_random_ready_callback(struct random_ready_callback *rdy);
 extern void get_random_bytes_arch(void *buf, int nbytes);
-extern int random_int_secret_init(void);
 
 #ifndef MODULE
 extern const struct file_operations random_fops, urandom_fops;
diff --git a/init/main.c b/init/main.c
index b0c9d6facef9..09beb7fc6e8c 100644
--- a/init/main.c
+++ b/init/main.c
@@ -879,7 +879,6 @@ static void __init do_basic_setup(void)
 	do_ctors();
 	usermodehelper_enable();
 	do_initcalls();
-	random_int_secret_init();
 }
 
 static void __init do_pre_smp_initcalls(void)
-- 
cgit v1.2.3


From c440408cf6901eeb2c09563397e24a9097907078 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Sun, 22 Jan 2017 16:34:08 +0100
Subject: random: convert get_random_int/long into get_random_u32/u64

Many times, when a user wants a random number, he wants a random number
of a guaranteed size. So, thinking of get_random_int and get_random_long
in terms of get_random_u32 and get_random_u64 makes it much easier to
achieve this. It also makes the code simpler.

On 32-bit platforms, get_random_int and get_random_long are both aliased
to get_random_u32. On 64-bit platforms, int->u32 and long->u64.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 drivers/char/random.c  | 55 +++++++++++++++++++++++++-------------------------
 include/linux/random.h | 17 ++++++++++++++--
 2 files changed, 42 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/drivers/char/random.c b/drivers/char/random.c
index b800e5479b7d..066ae125f2c8 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -2018,8 +2018,8 @@ struct ctl_table random_table[] = {
 
 struct batched_entropy {
 	union {
-		unsigned long entropy_long[CHACHA20_BLOCK_SIZE / sizeof(unsigned long)];
-		unsigned int entropy_int[CHACHA20_BLOCK_SIZE / sizeof(unsigned int)];
+		u64 entropy_u64[CHACHA20_BLOCK_SIZE / sizeof(u64)];
+		u32 entropy_u32[CHACHA20_BLOCK_SIZE / sizeof(u32)];
 	};
 	unsigned int position;
 };
@@ -2029,52 +2029,51 @@ struct batched_entropy {
  * number is either as good as RDRAND or as good as /dev/urandom, with the
  * goal of being quite fast and not depleting entropy.
  */
-static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_long);
-unsigned long get_random_long(void)
+static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u64);
+u64 get_random_u64(void)
 {
-	unsigned long ret;
+	u64 ret;
 	struct batched_entropy *batch;
 
-	if (arch_get_random_long(&ret))
+#if BITS_PER_LONG == 64
+	if (arch_get_random_long((unsigned long *)&ret))
 		return ret;
+#else
+	if (arch_get_random_long((unsigned long *)&ret) &&
+	    arch_get_random_long((unsigned long *)&ret + 1))
+	    return ret;
+#endif
 
-	batch = &get_cpu_var(batched_entropy_long);
-	if (batch->position % ARRAY_SIZE(batch->entropy_long) == 0) {
-		extract_crng((u8 *)batch->entropy_long);
+	batch = &get_cpu_var(batched_entropy_u64);
+	if (batch->position % ARRAY_SIZE(batch->entropy_u64) == 0) {
+		extract_crng((u8 *)batch->entropy_u64);
 		batch->position = 0;
 	}
-	ret = batch->entropy_long[batch->position++];
-	put_cpu_var(batched_entropy_long);
+	ret = batch->entropy_u64[batch->position++];
+	put_cpu_var(batched_entropy_u64);
 	return ret;
 }
-EXPORT_SYMBOL(get_random_long);
+EXPORT_SYMBOL(get_random_u64);
 
-#if BITS_PER_LONG == 32
-unsigned int get_random_int(void)
-{
-	return get_random_long();
-}
-#else
-static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_int);
-unsigned int get_random_int(void)
+static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u32);
+u32 get_random_u32(void)
 {
-	unsigned int ret;
+	u32 ret;
 	struct batched_entropy *batch;
 
 	if (arch_get_random_int(&ret))
 		return ret;
 
-	batch = &get_cpu_var(batched_entropy_int);
-	if (batch->position % ARRAY_SIZE(batch->entropy_int) == 0) {
-		extract_crng((u8 *)batch->entropy_int);
+	batch = &get_cpu_var(batched_entropy_u32);
+	if (batch->position % ARRAY_SIZE(batch->entropy_u32) == 0) {
+		extract_crng((u8 *)batch->entropy_u32);
 		batch->position = 0;
 	}
-	ret = batch->entropy_int[batch->position++];
-	put_cpu_var(batched_entropy_int);
+	ret = batch->entropy_u32[batch->position++];
+	put_cpu_var(batched_entropy_u32);
 	return ret;
 }
-#endif
-EXPORT_SYMBOL(get_random_int);
+EXPORT_SYMBOL(get_random_u32);
 
 /**
  * randomize_page - Generate a random, page aligned address
diff --git a/include/linux/random.h b/include/linux/random.h
index 16ab429735a7..ed5c3838780d 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -42,8 +42,21 @@ extern void get_random_bytes_arch(void *buf, int nbytes);
 extern const struct file_operations random_fops, urandom_fops;
 #endif
 
-unsigned int get_random_int(void);
-unsigned long get_random_long(void);
+u32 get_random_u32(void);
+u64 get_random_u64(void);
+static inline unsigned int get_random_int(void)
+{
+	return get_random_u32();
+}
+static inline unsigned long get_random_long(void)
+{
+#if BITS_PER_LONG == 64
+	return get_random_u64();
+#else
+	return get_random_u32();
+#endif
+}
+
 unsigned long randomize_page(unsigned long start, unsigned long range);
 
 u32 prandom_u32(void);
-- 
cgit v1.2.3


From 77ae9957897df86e627089688265e0db029dd0df Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 27 Jan 2017 09:40:07 +0000
Subject: drm/i915: Enable userspace to opt-out of implicit fencing

Userspace is faced with a dilemma. The kernel requires implicit fencing
to manage resource usage (we always must wait for the GPU to finish
before releasing its PTE) and for third parties. However, userspace may
wish to avoid this serialisation if it is either using explicit fencing
between parties and wants more fine-grained access to buffers (e.g. it
may partition the buffer between uses and track fences on ranges rather
than the implicit fences tracking the whole object). It follows that
userspace needs a mechanism to avoid the kernel's serialisation on its
implicit fences before execbuf execution.

The next question is whether this is an object, execbuf or context flag.
Hybrid users (such as using explicit EGL_ANDROID_native_sync fencing on
shared winsys buffers, but implicit fencing on internal surfaces)
require a per-object level flag. Given that this flag need to be only
set once for the lifetime of the object, this reduces the convenience of
having an execbuf or context level flag (and avoids having multiple
pieces of uABI controlling the same feature).

Incorrect use of this flag will result in rendering corruption and GPU
hangs - but will not result in use-after-free or similar resource
tracking issues.

Serious caveat: write ordering is not strictly correct after setting
this flag on a render target on multiple engines. This affects all
subsequent GEM operations (execbuf, set-domain, pread) and shared
dma-buf operations. A fix is possible - but costly (both in terms of
further ABI changes and runtime overhead).

Testcase: igt/gem_exec_async
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Acked-by: Chad Versace <chadversary@chromium.org>
Link: http://patchwork.freedesktop.org/patch/msgid/20170127094008.27489-1-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_drv.c            |  1 +
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  3 +++
 include/uapi/drm/i915_drm.h                | 29 ++++++++++++++++++++++++++++-
 3 files changed, 32 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 507a6f69426f..6c9c2037fadf 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -349,6 +349,7 @@ static int i915_getparam(struct drm_device *dev, void *data,
 	case I915_PARAM_HAS_EXEC_HANDLE_LUT:
 	case I915_PARAM_HAS_COHERENT_PHYS_GTT:
 	case I915_PARAM_HAS_EXEC_SOFTPIN:
+	case I915_PARAM_HAS_EXEC_ASYNC:
 		/* For the time being all of these are always true;
 		 * if some supported hardware does not have one of these
 		 * features this value needs to be provided from
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index c66e90571031..6fd60682bf93 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1111,6 +1111,9 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
 	list_for_each_entry(vma, vmas, exec_list) {
 		struct drm_i915_gem_object *obj = vma->obj;
 
+		if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
+			continue;
+
 		ret = i915_gem_request_await_object
 			(req, obj, obj->base.pending_write_domain);
 		if (ret)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 57093b455db6..6620b6ad76ed 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -397,6 +397,12 @@ typedef struct drm_i915_irq_wait {
 #define I915_PARAM_HAS_SCHEDULER	 41
 #define I915_PARAM_HUC_STATUS		 42
 
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports the ability to opt-out of
+ * synchronisation with implicit fencing on individual objects.
+ * See EXEC_OBJECT_ASYNC.
+ */
+#define I915_PARAM_HAS_EXEC_ASYNC	 43
+
 typedef struct drm_i915_getparam {
 	__s32 param;
 	/*
@@ -737,8 +743,29 @@ struct drm_i915_gem_exec_object2 {
 #define EXEC_OBJECT_SUPPORTS_48B_ADDRESS (1<<3)
 #define EXEC_OBJECT_PINNED		 (1<<4)
 #define EXEC_OBJECT_PAD_TO_SIZE		 (1<<5)
+/* The kernel implicitly tracks GPU activity on all GEM objects, and
+ * synchronises operations with outstanding rendering. This includes
+ * rendering on other devices if exported via dma-buf. However, sometimes
+ * this tracking is too coarse and the user knows better. For example,
+ * if the object is split into non-overlapping ranges shared between different
+ * clients or engines (i.e. suballocating objects), the implicit tracking
+ * by kernel assumes that each operation affects the whole object rather
+ * than an individual range, causing needless synchronisation between clients.
+ * The kernel will also forgo any CPU cache flushes prior to rendering from
+ * the object as the client is expected to be also handling such domain
+ * tracking.
+ *
+ * The kernel maintains the implicit tracking in order to manage resources
+ * used by the GPU - this flag only disables the synchronisation prior to
+ * rendering with this object in this execbuf.
+ *
+ * Opting out of implicit synhronisation requires the user to do its own
+ * explicit tracking to avoid rendering corruption. See, for example,
+ * I915_PARAM_HAS_EXEC_FENCE to order execbufs and execute them asynchronously.
+ */
+#define EXEC_OBJECT_ASYNC		(1<<6)
 /* All remaining bits are MBZ and RESERVED FOR FUTURE USE */
-#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_PAD_TO_SIZE<<1)
+#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_ASYNC<<1)
 	__u64 flags;
 
 	union {
-- 
cgit v1.2.3


From fec0445caa273209d2809760ac7c63e743d6f512 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 27 Jan 2017 09:40:08 +0000
Subject: drm/i915: Support explicit fencing for execbuf

Now that the user can opt-out of implicit fencing, we need to give them
back control over the fencing. We employ sync_file to wrap our
drm_i915_gem_request and provide an fd that userspace can merge with
other sync_file fds and pass back to the kernel to wait upon before
future execution.

Testcase: igt/gem_exec_fence
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Acked-by: Chad Versace <chadversary@chromium.org>
Link: http://patchwork.freedesktop.org/patch/msgid/20170127094008.27489-2-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/Kconfig               |  1 +
 drivers/gpu/drm/i915/i915_drv.c            |  3 +-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 54 +++++++++++++++++++++++++++---
 include/uapi/drm/i915_drm.h                | 36 +++++++++++++++++++-
 4 files changed, 87 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
index 183f5dc1c3f2..1ae0bb91ee60 100644
--- a/drivers/gpu/drm/i915/Kconfig
+++ b/drivers/gpu/drm/i915/Kconfig
@@ -19,6 +19,7 @@ config DRM_I915
 	select INPUT if ACPI
 	select ACPI_VIDEO if ACPI
 	select ACPI_BUTTON if ACPI
+	select SYNC_FILE
 	help
 	  Choose this option if you have a system that has "Intel Graphics
 	  Media Accelerator" or "HD Graphics" integrated graphics,
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 6c9c2037fadf..3aa5bf58cf32 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -350,6 +350,7 @@ static int i915_getparam(struct drm_device *dev, void *data,
 	case I915_PARAM_HAS_COHERENT_PHYS_GTT:
 	case I915_PARAM_HAS_EXEC_SOFTPIN:
 	case I915_PARAM_HAS_EXEC_ASYNC:
+	case I915_PARAM_HAS_EXEC_FENCE:
 		/* For the time being all of these are always true;
 		 * if some supported hardware does not have one of these
 		 * features this value needs to be provided from
@@ -2550,7 +2551,7 @@ static const struct drm_ioctl_desc i915_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(I915_HWS_ADDR, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
 	DRM_IOCTL_DEF_DRV(I915_GEM_INIT, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
 	DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER, i915_gem_execbuffer, DRM_AUTH),
-	DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER2, i915_gem_execbuffer2, DRM_AUTH|DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER2_WR, i915_gem_execbuffer2, DRM_AUTH|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(I915_GEM_PIN, i915_gem_reject_pin_ioctl, DRM_AUTH|DRM_ROOT_ONLY),
 	DRM_IOCTL_DEF_DRV(I915_GEM_UNPIN, i915_gem_reject_pin_ioctl, DRM_AUTH|DRM_ROOT_ONLY),
 	DRM_IOCTL_DEF_DRV(I915_GEM_BUSY, i915_gem_busy_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 6fd60682bf93..91c2393199a3 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -28,6 +28,7 @@
 
 #include <linux/dma_remapping.h>
 #include <linux/reservation.h>
+#include <linux/sync_file.h>
 #include <linux/uaccess.h>
 
 #include <drm/drmP.h>
@@ -1595,6 +1596,9 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	struct i915_execbuffer_params *params = &params_master;
 	const u32 ctx_id = i915_execbuffer2_get_context_id(*args);
 	u32 dispatch_flags;
+	struct dma_fence *in_fence = NULL;
+	struct sync_file *out_fence = NULL;
+	int out_fence_fd = -1;
 	int ret;
 	bool need_relocs;
 
@@ -1638,6 +1642,23 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		dispatch_flags |= I915_DISPATCH_RS;
 	}
 
+	if (args->flags & I915_EXEC_FENCE_IN) {
+		in_fence = sync_file_get_fence(lower_32_bits(args->rsvd2));
+		if (!in_fence) {
+			ret = -EINVAL;
+			goto pre_mutex_err;
+		}
+	}
+
+	if (args->flags & I915_EXEC_FENCE_OUT) {
+		out_fence_fd = get_unused_fd_flags(O_CLOEXEC);
+		if (out_fence_fd < 0) {
+			ret = out_fence_fd;
+			out_fence_fd = -1;
+			goto pre_mutex_err;
+		}
+	}
+
 	/* Take a local wakeref for preparing to dispatch the execbuf as
 	 * we expect to access the hardware fairly frequently in the
 	 * process. Upon first dispatch, we acquire another prolonged
@@ -1782,6 +1803,21 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		goto err_batch_unpin;
 	}
 
+	if (in_fence) {
+		ret = i915_gem_request_await_dma_fence(params->request,
+						       in_fence);
+		if (ret < 0)
+			goto err_request;
+	}
+
+	if (out_fence_fd != -1) {
+		out_fence = sync_file_create(&params->request->fence);
+		if (!out_fence) {
+			ret = -ENOMEM;
+			goto err_request;
+		}
+	}
+
 	/* Whilst this request exists, batch_obj will be on the
 	 * active_list, and so will hold the active reference. Only when this
 	 * request is retired will the the batch_obj be moved onto the
@@ -1809,6 +1845,16 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	ret = execbuf_submit(params, args, &eb->vmas);
 err_request:
 	__i915_add_request(params->request, ret == 0);
+	if (out_fence) {
+		if (ret == 0) {
+			fd_install(out_fence_fd, out_fence->file);
+			args->rsvd2 &= GENMASK_ULL(0, 31); /* keep in-fence */
+			args->rsvd2 |= (u64)out_fence_fd << 32;
+			out_fence_fd = -1;
+		} else {
+			fput(out_fence->file);
+		}
+	}
 
 err_batch_unpin:
 	/*
@@ -1830,6 +1876,9 @@ pre_mutex_err:
 	/* intel_gpu_busy should also get a ref, so it will free when the device
 	 * is really idle. */
 	intel_runtime_pm_put(dev_priv);
+	if (out_fence_fd != -1)
+		put_unused_fd(out_fence_fd);
+	dma_fence_put(in_fence);
 	return ret;
 }
 
@@ -1937,11 +1986,6 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data,
 		return -EINVAL;
 	}
 
-	if (args->rsvd2 != 0) {
-		DRM_DEBUG("dirty rvsd2 field\n");
-		return -EINVAL;
-	}
-
 	exec2_list = drm_malloc_gfp(args->buffer_count,
 				    sizeof(*exec2_list),
 				    GFP_TEMPORARY);
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 6620b6ad76ed..3554495bef13 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -246,6 +246,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_I915_OVERLAY_PUT_IMAGE	0x27
 #define DRM_I915_OVERLAY_ATTRS	0x28
 #define DRM_I915_GEM_EXECBUFFER2	0x29
+#define DRM_I915_GEM_EXECBUFFER2_WR	DRM_I915_GEM_EXECBUFFER2
 #define DRM_I915_GET_SPRITE_COLORKEY	0x2a
 #define DRM_I915_SET_SPRITE_COLORKEY	0x2b
 #define DRM_I915_GEM_WAIT	0x2c
@@ -280,6 +281,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_IOCTL_I915_GEM_INIT		DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_INIT, struct drm_i915_gem_init)
 #define DRM_IOCTL_I915_GEM_EXECBUFFER	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER, struct drm_i915_gem_execbuffer)
 #define DRM_IOCTL_I915_GEM_EXECBUFFER2	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2, struct drm_i915_gem_execbuffer2)
+#define DRM_IOCTL_I915_GEM_EXECBUFFER2_WR	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2_WR, struct drm_i915_gem_execbuffer2)
 #define DRM_IOCTL_I915_GEM_PIN		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_PIN, struct drm_i915_gem_pin)
 #define DRM_IOCTL_I915_GEM_UNPIN	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_UNPIN, struct drm_i915_gem_unpin)
 #define DRM_IOCTL_I915_GEM_BUSY		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_BUSY, struct drm_i915_gem_busy)
@@ -403,6 +405,13 @@ typedef struct drm_i915_irq_wait {
  */
 #define I915_PARAM_HAS_EXEC_ASYNC	 43
 
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports explicit fence support -
+ * both being able to pass in a sync_file fd to wait upon before executing,
+ * and being able to return a new sync_file fd that is signaled when the
+ * current request is complete. See I915_EXEC_FENCE_IN and I915_EXEC_FENCE_OUT.
+ */
+#define I915_PARAM_HAS_EXEC_FENCE	 44
+
 typedef struct drm_i915_getparam {
 	__s32 param;
 	/*
@@ -855,7 +864,32 @@ struct drm_i915_gem_execbuffer2 {
  */
 #define I915_EXEC_RESOURCE_STREAMER     (1<<15)
 
-#define __I915_EXEC_UNKNOWN_FLAGS -(I915_EXEC_RESOURCE_STREAMER<<1)
+/* Setting I915_EXEC_FENCE_IN implies that lower_32_bits(rsvd2) represent
+ * a sync_file fd to wait upon (in a nonblocking manner) prior to executing
+ * the batch.
+ *
+ * Returns -EINVAL if the sync_file fd cannot be found.
+ */
+#define I915_EXEC_FENCE_IN		(1<<16)
+
+/* Setting I915_EXEC_FENCE_OUT causes the ioctl to return a sync_file fd
+ * in the upper_32_bits(rsvd2) upon success. Ownership of the fd is given
+ * to the caller, and it should be close() after use. (The fd is a regular
+ * file descriptor and will be cleaned up on process termination. It holds
+ * a reference to the request, but nothing else.)
+ *
+ * The sync_file fd can be combined with other sync_file and passed either
+ * to execbuf using I915_EXEC_FENCE_IN, to atomic KMS ioctls (so that a flip
+ * will only occur after this request completes), or to other devices.
+ *
+ * Using I915_EXEC_FENCE_OUT requires use of
+ * DRM_IOCTL_I915_GEM_EXECBUFFER2_WR ioctl so that the result is written
+ * back to userspace. Failure to do so will cause the out-fence to always
+ * be reported as zero, and the real fence fd to be leaked.
+ */
+#define I915_EXEC_FENCE_OUT		(1<<17)
+
+#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_OUT<<1))
 
 #define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff)
 #define i915_execbuffer2_set_context_id(eb2, context) \
-- 
cgit v1.2.3


From 7e91c7df29b5e196de3dc6f086c8937973bd0b88 Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Date: Tue, 7 Feb 2017 19:58:02 +0200
Subject: swiotlb-xen: implement xen_swiotlb_dma_mmap callback

This function creates userspace mapping for the DMA-coherent memory.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Signed-off-by: Oleksandr Dmytryshyn <oleksandr.dmytryshyn@globallogic.com>
Signed-off-by: Andrii Anisov <andrii_anisov@epam.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad@kernel.org>
---
 arch/arm/xen/mm.c         |  1 +
 drivers/xen/swiotlb-xen.c | 19 +++++++++++++++++++
 include/xen/swiotlb-xen.h |  5 +++++
 3 files changed, 25 insertions(+)

(limited to 'include')

diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c
index bd62d94f8ac5..cd1684e4e864 100644
--- a/arch/arm/xen/mm.c
+++ b/arch/arm/xen/mm.c
@@ -198,6 +198,7 @@ static struct dma_map_ops xen_swiotlb_dma_ops = {
 	.unmap_page = xen_swiotlb_unmap_page,
 	.dma_supported = xen_swiotlb_dma_supported,
 	.set_dma_mask = xen_swiotlb_set_dma_mask,
+	.mmap = xen_swiotlb_dma_mmap,
 };
 
 int __init xen_mm_init(void)
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index f905d6eeb048..c7f61fc0572a 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -680,3 +680,22 @@ xen_swiotlb_set_dma_mask(struct device *dev, u64 dma_mask)
 	return 0;
 }
 EXPORT_SYMBOL_GPL(xen_swiotlb_set_dma_mask);
+
+/*
+ * Create userspace mapping for the DMA-coherent memory.
+ * This function should be called with the pages from the current domain only,
+ * passing pages mapped from other domains would lead to memory corruption.
+ */
+int
+xen_swiotlb_dma_mmap(struct device *dev, struct vm_area_struct *vma,
+		     void *cpu_addr, dma_addr_t dma_addr, size_t size,
+		     unsigned long attrs)
+{
+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
+	if (__generic_dma_ops(dev)->mmap)
+		return __generic_dma_ops(dev)->mmap(dev, vma, cpu_addr,
+						    dma_addr, size, attrs);
+#endif
+	return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_dma_mmap);
diff --git a/include/xen/swiotlb-xen.h b/include/xen/swiotlb-xen.h
index a0083be5d529..a315c876aaa9 100644
--- a/include/xen/swiotlb-xen.h
+++ b/include/xen/swiotlb-xen.h
@@ -55,4 +55,9 @@ xen_swiotlb_dma_supported(struct device *hwdev, u64 mask);
 
 extern int
 xen_swiotlb_set_dma_mask(struct device *dev, u64 dma_mask);
+
+extern int
+xen_swiotlb_dma_mmap(struct device *dev, struct vm_area_struct *vma,
+		     void *cpu_addr, dma_addr_t dma_addr, size_t size,
+		     unsigned long attrs);
 #endif /* __LINUX_SWIOTLB_XEN_H */
-- 
cgit v1.2.3


From 69369f52d28a34c84acb6f2a8a585e743441566a Mon Sep 17 00:00:00 2001
From: Andrii Anisov <andrii_anisov@epam.com>
Date: Tue, 7 Feb 2017 19:58:03 +0200
Subject: swiotlb-xen: implement xen_swiotlb_get_sgtable callback

Signed-off-by: Andrii Anisov <andrii_anisov@epam.com>
Signed-off-by: Stefano Stabellini <sstabellini@kernel.org>
Signed-off-by: Konrad Rzeszutek Wilk <konrad@kernel.org>
---
 arch/arm/xen/mm.c         |  1 +
 drivers/xen/swiotlb-xen.c | 28 ++++++++++++++++++++++++++++
 include/xen/swiotlb-xen.h |  6 ++++++
 3 files changed, 35 insertions(+)

(limited to 'include')

diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c
index cd1684e4e864..76ea48a614e1 100644
--- a/arch/arm/xen/mm.c
+++ b/arch/arm/xen/mm.c
@@ -199,6 +199,7 @@ static struct dma_map_ops xen_swiotlb_dma_ops = {
 	.dma_supported = xen_swiotlb_dma_supported,
 	.set_dma_mask = xen_swiotlb_set_dma_mask,
 	.mmap = xen_swiotlb_dma_mmap,
+	.get_sgtable = xen_swiotlb_get_sgtable,
 };
 
 int __init xen_mm_init(void)
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index c7f61fc0572a..23e30b4e1fb6 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -699,3 +699,31 @@ xen_swiotlb_dma_mmap(struct device *dev, struct vm_area_struct *vma,
 	return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size);
 }
 EXPORT_SYMBOL_GPL(xen_swiotlb_dma_mmap);
+
+/*
+ * This function should be called with the pages from the current domain only,
+ * passing pages mapped from other domains would lead to memory corruption.
+ */
+int
+xen_swiotlb_get_sgtable(struct device *dev, struct sg_table *sgt,
+			void *cpu_addr, dma_addr_t handle, size_t size,
+			unsigned long attrs)
+{
+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
+	if (__generic_dma_ops(dev)->get_sgtable) {
+#if 0
+	/*
+	 * This check verifies that the page belongs to the current domain and
+	 * is not one mapped from another domain.
+	 * This check is for debug only, and should not go to production build
+	 */
+		unsigned long bfn = PHYS_PFN(dma_to_phys(dev, handle));
+		BUG_ON (!page_is_ram(bfn));
+#endif
+		return __generic_dma_ops(dev)->get_sgtable(dev, sgt, cpu_addr,
+							   handle, size, attrs);
+	}
+#endif
+	return dma_common_get_sgtable(dev, sgt, cpu_addr, handle, size);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_get_sgtable);
diff --git a/include/xen/swiotlb-xen.h b/include/xen/swiotlb-xen.h
index a315c876aaa9..1f6d78f044b6 100644
--- a/include/xen/swiotlb-xen.h
+++ b/include/xen/swiotlb-xen.h
@@ -2,6 +2,7 @@
 #define __LINUX_SWIOTLB_XEN_H
 
 #include <linux/dma-direction.h>
+#include <linux/scatterlist.h>
 #include <linux/swiotlb.h>
 
 extern int xen_swiotlb_init(int verbose, bool early);
@@ -60,4 +61,9 @@ extern int
 xen_swiotlb_dma_mmap(struct device *dev, struct vm_area_struct *vma,
 		     void *cpu_addr, dma_addr_t dma_addr, size_t size,
 		     unsigned long attrs);
+
+extern int
+xen_swiotlb_get_sgtable(struct device *dev, struct sg_table *sgt,
+			void *cpu_addr, dma_addr_t handle, size_t size,
+			unsigned long attrs);
 #endif /* __LINUX_SWIOTLB_XEN_H */
-- 
cgit v1.2.3


From ca7a45ba6fb9e7ceca56d10b91db29c2f3451a2e Mon Sep 17 00:00:00 2001
From: Michał Winiarski <michal.winiarski@intel.com>
Date: Mon, 27 Feb 2017 12:22:56 +0100
Subject: drm/i915/skl: Add missing SKL ID
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Used by production device:
    Intel(R) Iris(TM) Graphics P555

Cc: <stable@vger.kernel.org>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Michał Winiarski <michal.winiarski@intel.com>
Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20170227112256.20060-1-michal.winiarski@intel.com
---
 include/drm/i915_pciids.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/drm/i915_pciids.h b/include/drm/i915_pciids.h
index a1dd21d6b723..466c71592a6f 100644
--- a/include/drm/i915_pciids.h
+++ b/include/drm/i915_pciids.h
@@ -265,7 +265,8 @@
 	INTEL_VGA_DEVICE(0x1923, info), /* ULT GT3 */ \
 	INTEL_VGA_DEVICE(0x1926, info), /* ULT GT3 */ \
 	INTEL_VGA_DEVICE(0x1927, info), /* ULT GT3 */ \
-	INTEL_VGA_DEVICE(0x192B, info)  /* Halo GT3 */ \
+	INTEL_VGA_DEVICE(0x192B, info), /* Halo GT3 */ \
+	INTEL_VGA_DEVICE(0x192D, info)  /* SRV GT3 */
 
 #define INTEL_SKL_GT4_IDS(info) \
 	INTEL_VGA_DEVICE(0x1932, info), /* DT GT4 */ \
-- 
cgit v1.2.3


From 6f8830f5bbab16e54f261de187f3df4644a5b977 Mon Sep 17 00:00:00 2001
From: Chris Leech <cleech@redhat.com>
Date: Mon, 27 Feb 2017 16:58:36 -0800
Subject: scsi: libiscsi: add lock around task lists to fix list corruption
 regression

There's a rather long standing regression from the commit "libiscsi:
Reduce locking contention in fast path"

Depending on iSCSI target behavior, it's possible to hit the case in
iscsi_complete_task where the task is still on a pending list
(!list_empty(&task->running)).  When that happens the task is removed
from the list while holding the session back_lock, but other task list
modification occur under the frwd_lock.  That leads to linked list
corruption and eventually a panicked system.

Rather than back out the session lock split entirely, in order to try
and keep some of the performance gains this patch adds another lock to
maintain the task lists integrity.

Major enterprise supported kernels have been backing out the lock split
for while now, thanks to the efforts at IBM where a lab setup has the
most reliable reproducer I've seen on this issue.  This patch has been
tested there successfully.

Signed-off-by: Chris Leech <cleech@redhat.com>
Fixes: 659743b02c41 ("[SCSI] libiscsi: Reduce locking contention in fast path")
Reported-by: Prashantha Subbarao <psubbara@us.ibm.com>
Reviewed-by: Guilherme G. Piccoli <gpiccoli@linux.vnet.ibm.com>
Cc: <stable@vger.kernel.org> # v3.15+
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/libiscsi.c | 26 +++++++++++++++++++++++++-
 include/scsi/libiscsi.h |  1 +
 2 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index 834d1212b6d5..3fca34a675af 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -560,8 +560,12 @@ static void iscsi_complete_task(struct iscsi_task *task, int state)
 	WARN_ON_ONCE(task->state == ISCSI_TASK_FREE);
 	task->state = state;
 
-	if (!list_empty(&task->running))
+	spin_lock_bh(&conn->taskqueuelock);
+	if (!list_empty(&task->running)) {
+		pr_debug_once("%s while task on list", __func__);
 		list_del_init(&task->running);
+	}
+	spin_unlock_bh(&conn->taskqueuelock);
 
 	if (conn->task == task)
 		conn->task = NULL;
@@ -783,7 +787,9 @@ __iscsi_conn_send_pdu(struct iscsi_conn *conn, struct iscsi_hdr *hdr,
 		if (session->tt->xmit_task(task))
 			goto free_task;
 	} else {
+		spin_lock_bh(&conn->taskqueuelock);
 		list_add_tail(&task->running, &conn->mgmtqueue);
+		spin_unlock_bh(&conn->taskqueuelock);
 		iscsi_conn_queue_work(conn);
 	}
 
@@ -1474,8 +1480,10 @@ void iscsi_requeue_task(struct iscsi_task *task)
 	 * this may be on the requeue list already if the xmit_task callout
 	 * is handling the r2ts while we are adding new ones
 	 */
+	spin_lock_bh(&conn->taskqueuelock);
 	if (list_empty(&task->running))
 		list_add_tail(&task->running, &conn->requeue);
+	spin_unlock_bh(&conn->taskqueuelock);
 	iscsi_conn_queue_work(conn);
 }
 EXPORT_SYMBOL_GPL(iscsi_requeue_task);
@@ -1512,22 +1520,26 @@ static int iscsi_data_xmit(struct iscsi_conn *conn)
 	 * only have one nop-out as a ping from us and targets should not
 	 * overflow us with nop-ins
 	 */
+	spin_lock_bh(&conn->taskqueuelock);
 check_mgmt:
 	while (!list_empty(&conn->mgmtqueue)) {
 		conn->task = list_entry(conn->mgmtqueue.next,
 					 struct iscsi_task, running);
 		list_del_init(&conn->task->running);
+		spin_unlock_bh(&conn->taskqueuelock);
 		if (iscsi_prep_mgmt_task(conn, conn->task)) {
 			/* regular RX path uses back_lock */
 			spin_lock_bh(&conn->session->back_lock);
 			__iscsi_put_task(conn->task);
 			spin_unlock_bh(&conn->session->back_lock);
 			conn->task = NULL;
+			spin_lock_bh(&conn->taskqueuelock);
 			continue;
 		}
 		rc = iscsi_xmit_task(conn);
 		if (rc)
 			goto done;
+		spin_lock_bh(&conn->taskqueuelock);
 	}
 
 	/* process pending command queue */
@@ -1535,19 +1547,24 @@ check_mgmt:
 		conn->task = list_entry(conn->cmdqueue.next, struct iscsi_task,
 					running);
 		list_del_init(&conn->task->running);
+		spin_unlock_bh(&conn->taskqueuelock);
 		if (conn->session->state == ISCSI_STATE_LOGGING_OUT) {
 			fail_scsi_task(conn->task, DID_IMM_RETRY);
+			spin_lock_bh(&conn->taskqueuelock);
 			continue;
 		}
 		rc = iscsi_prep_scsi_cmd_pdu(conn->task);
 		if (rc) {
 			if (rc == -ENOMEM || rc == -EACCES) {
+				spin_lock_bh(&conn->taskqueuelock);
 				list_add_tail(&conn->task->running,
 					      &conn->cmdqueue);
 				conn->task = NULL;
+				spin_unlock_bh(&conn->taskqueuelock);
 				goto done;
 			} else
 				fail_scsi_task(conn->task, DID_ABORT);
+			spin_lock_bh(&conn->taskqueuelock);
 			continue;
 		}
 		rc = iscsi_xmit_task(conn);
@@ -1558,6 +1575,7 @@ check_mgmt:
 		 * we need to check the mgmt queue for nops that need to
 		 * be sent to aviod starvation
 		 */
+		spin_lock_bh(&conn->taskqueuelock);
 		if (!list_empty(&conn->mgmtqueue))
 			goto check_mgmt;
 	}
@@ -1577,12 +1595,15 @@ check_mgmt:
 		conn->task = task;
 		list_del_init(&conn->task->running);
 		conn->task->state = ISCSI_TASK_RUNNING;
+		spin_unlock_bh(&conn->taskqueuelock);
 		rc = iscsi_xmit_task(conn);
 		if (rc)
 			goto done;
+		spin_lock_bh(&conn->taskqueuelock);
 		if (!list_empty(&conn->mgmtqueue))
 			goto check_mgmt;
 	}
+	spin_unlock_bh(&conn->taskqueuelock);
 	spin_unlock_bh(&conn->session->frwd_lock);
 	return -ENODATA;
 
@@ -1738,7 +1759,9 @@ int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc)
 			goto prepd_reject;
 		}
 	} else {
+		spin_lock_bh(&conn->taskqueuelock);
 		list_add_tail(&task->running, &conn->cmdqueue);
+		spin_unlock_bh(&conn->taskqueuelock);
 		iscsi_conn_queue_work(conn);
 	}
 
@@ -2896,6 +2919,7 @@ iscsi_conn_setup(struct iscsi_cls_session *cls_session, int dd_size,
 	INIT_LIST_HEAD(&conn->mgmtqueue);
 	INIT_LIST_HEAD(&conn->cmdqueue);
 	INIT_LIST_HEAD(&conn->requeue);
+	spin_lock_init(&conn->taskqueuelock);
 	INIT_WORK(&conn->xmitwork, iscsi_xmitworker);
 
 	/* allocate login_task used for the login/text sequences */
diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h
index b0e275de6dec..583875ea136a 100644
--- a/include/scsi/libiscsi.h
+++ b/include/scsi/libiscsi.h
@@ -196,6 +196,7 @@ struct iscsi_conn {
 	struct iscsi_task	*task;		/* xmit task in progress */
 
 	/* xmit */
+	spinlock_t		taskqueuelock;  /* protects the next three lists */
 	struct list_head	mgmtqueue;	/* mgmt (control) xmit queue */
 	struct list_head	cmdqueue;	/* data-path cmd queue */
 	struct list_head	requeue;	/* tasks needing another run */
-- 
cgit v1.2.3


From 8893cf6cb1cf56334c05120e23092dbfc9423ebb Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Wed, 1 Mar 2017 09:00:36 -0800
Subject: scsi: mpt3sas: Avoid sleeping in interrupt context

Commit 669f044170d8 ("scsi: srp_transport: Move queuecommand() wait code
to SCSI core") can make scsi_internal_device_block() sleep.  However,
the mpt3sas driver can call this function from an interrupt
handler. Hence add a second argument to scsi_internal_device_block()
that restores the old behavior of this function for the mpt3sas handler.

The call chain that triggered an "IRQ handler enabled interrupts"
complaint is as follows:

_base_interrupt()
-> _base_async_event()
   -> mpt3sas_scsih_event_callback()
      -> _scsih_check_topo_delete_events()
         -> _scsih_block_io_to_children_attached_directly()
            -> _scsih_block_io_device()
               -> _scsih_internal_device_block()
                  -> scsi_internal_device_block()

Reported-by: Omar Sandoval <osandov@osandov.com>
Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Omar Sandoval <osandov@osandov.com>
Cc: Hannes Reinecke <hare@suse.com>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Sathya Prakash <sathya.prakash@broadcom.com>
Cc: Chaitra P B <chaitra.basappa@broadcom.com>
Cc: Suganath Prabu Subramani <suganath-prabu.subramani@broadcom.com>
Cc: Sreekanth Reddy <Sreekanth.Reddy@broadcom.com>
Cc: <stable@vger.kernel.org> # v4.10+
Tested-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/mpt3sas/mpt3sas_base.h  |  3 ---
 drivers/scsi/mpt3sas/mpt3sas_scsih.c |  4 ++--
 drivers/scsi/scsi_lib.c              | 14 ++++++++++----
 drivers/scsi/scsi_priv.h             |  3 ---
 include/scsi/scsi_device.h           |  4 ++++
 5 files changed, 16 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.h b/drivers/scsi/mpt3sas/mpt3sas_base.h
index 7fe7e6ed595b..8981806fb13f 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_base.h
+++ b/drivers/scsi/mpt3sas/mpt3sas_base.h
@@ -1442,9 +1442,6 @@ void mpt3sas_transport_update_links(struct MPT3SAS_ADAPTER *ioc,
 	u64 sas_address, u16 handle, u8 phy_number, u8 link_rate);
 extern struct sas_function_template mpt3sas_transport_functions;
 extern struct scsi_transport_template *mpt3sas_transport_template;
-extern int scsi_internal_device_block(struct scsi_device *sdev);
-extern int scsi_internal_device_unblock(struct scsi_device *sdev,
-				enum scsi_device_state new_state);
 /* trigger data externs */
 void mpt3sas_send_trigger_data_event(struct MPT3SAS_ADAPTER *ioc,
 	struct SL_WH_TRIGGERS_EVENT_DATA_T *event_data);
diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
index 69c29c560575..919ba2bb15f1 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
@@ -2859,7 +2859,7 @@ _scsih_internal_device_block(struct scsi_device *sdev,
 	    sas_device_priv_data->sas_target->handle);
 	sas_device_priv_data->block = 1;
 
-	r = scsi_internal_device_block(sdev);
+	r = scsi_internal_device_block(sdev, false);
 	if (r == -EINVAL)
 		sdev_printk(KERN_WARNING, sdev,
 		    "device_block failed with return(%d) for handle(0x%04x)\n",
@@ -2895,7 +2895,7 @@ _scsih_internal_device_unblock(struct scsi_device *sdev,
 		    "performing a block followed by an unblock\n",
 		    r, sas_device_priv_data->sas_target->handle);
 		sas_device_priv_data->block = 1;
-		r = scsi_internal_device_block(sdev);
+		r = scsi_internal_device_block(sdev, false);
 		if (r)
 			sdev_printk(KERN_WARNING, sdev, "retried device_block "
 			    "failed with return(%d) for handle(0x%04x)\n",
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index f5e45a252485..f41e6b84a1bd 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2932,6 +2932,8 @@ EXPORT_SYMBOL(scsi_target_resume);
 /**
  * scsi_internal_device_block - internal function to put a device temporarily into the SDEV_BLOCK state
  * @sdev:	device to block
+ * @wait:	Whether or not to wait until ongoing .queuecommand() /
+ *		.queue_rq() calls have finished.
  *
  * Block request made by scsi lld's to temporarily stop all
  * scsi commands on the specified device. May sleep.
@@ -2949,7 +2951,7 @@ EXPORT_SYMBOL(scsi_target_resume);
  * remove the rport mutex lock and unlock calls from srp_queuecommand().
  */
 int
-scsi_internal_device_block(struct scsi_device *sdev)
+scsi_internal_device_block(struct scsi_device *sdev, bool wait)
 {
 	struct request_queue *q = sdev->request_queue;
 	unsigned long flags;
@@ -2969,12 +2971,16 @@ scsi_internal_device_block(struct scsi_device *sdev)
 	 * request queue. 
 	 */
 	if (q->mq_ops) {
-		blk_mq_quiesce_queue(q);
+		if (wait)
+			blk_mq_quiesce_queue(q);
+		else
+			blk_mq_stop_hw_queues(q);
 	} else {
 		spin_lock_irqsave(q->queue_lock, flags);
 		blk_stop_queue(q);
 		spin_unlock_irqrestore(q->queue_lock, flags);
-		scsi_wait_for_queuecommand(sdev);
+		if (wait)
+			scsi_wait_for_queuecommand(sdev);
 	}
 
 	return 0;
@@ -3036,7 +3042,7 @@ EXPORT_SYMBOL_GPL(scsi_internal_device_unblock);
 static void
 device_block(struct scsi_device *sdev, void *data)
 {
-	scsi_internal_device_block(sdev);
+	scsi_internal_device_block(sdev, true);
 }
 
 static int
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
index 99bfc985e190..f11bd102d6d5 100644
--- a/drivers/scsi/scsi_priv.h
+++ b/drivers/scsi/scsi_priv.h
@@ -188,8 +188,5 @@ static inline void scsi_dh_remove_device(struct scsi_device *sdev) { }
  */
 
 #define SCSI_DEVICE_BLOCK_MAX_TIMEOUT	600	/* units in seconds */
-extern int scsi_internal_device_block(struct scsi_device *sdev);
-extern int scsi_internal_device_unblock(struct scsi_device *sdev,
-					enum scsi_device_state new_state);
 
 #endif /* _SCSI_PRIV_H */
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index 6f22b39f1b0c..080c7ce9bae8 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -472,6 +472,10 @@ static inline int scsi_device_created(struct scsi_device *sdev)
 		sdev->sdev_state == SDEV_CREATED_BLOCK;
 }
 
+int scsi_internal_device_block(struct scsi_device *sdev, bool wait);
+int scsi_internal_device_unblock(struct scsi_device *sdev,
+				 enum scsi_device_state new_state);
+
 /* accessor functions for the SCSI parameters */
 static inline int scsi_device_sync(struct scsi_device *sdev)
 {
-- 
cgit v1.2.3


From cd8d860dcce906cd477be7d0648ba6f56a52eaa6 Mon Sep 17 00:00:00 2001
From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Date: Tue, 28 Feb 2017 11:32:22 -0500
Subject: jump_label: Fix anonymous union initialization

Pre-4.6 gcc do not allow direct static initialization of members of
anonymous structs/unions. After commit 3821fd35b58d ("jump_label:
Reduce the size of struct static_key") STATIC_KEY_INIT_{TRUE|FALSE}
definitions cannot be compiled with those older compilers.

Placing initializers inside curved brackets works around this problem.

Link: http://lkml.kernel.org/r/1488299542-30765-1-git-send-email-boris.ostrovsky@oracle.com

Fixes: 3821fd35b58d ("jump_label: Reduce the size of struct static_key")
Reviewed-by: Jason Baron <jbaron@akamai.com>
Compiled-by: Chris Mason <clm@fb.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/jump_label.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 680c98b2f41c..a7f90117cf7d 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -166,10 +166,10 @@ extern void static_key_disable(struct static_key *key);
  */
 #define STATIC_KEY_INIT_TRUE					\
 	{ .enabled = { 1 },					\
-	  .entries = (void *)JUMP_TYPE_TRUE }
+	  { .entries = (void *)JUMP_TYPE_TRUE } }
 #define STATIC_KEY_INIT_FALSE					\
 	{ .enabled = { 0 },					\
-	  .entries = (void *)JUMP_TYPE_FALSE }
+	  { .entries = (void *)JUMP_TYPE_FALSE } }
 
 #else  /* !HAVE_JUMP_LABEL */
 
-- 
cgit v1.2.3


From b17ef2ed624aa7c1f68ed11acd16ecbf80fe01d7 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 2 Mar 2017 17:28:45 -0500
Subject: jump_label: Add comment about initialization order for anonymous
 unions

Commit 3821fd35b58d ("jump_label: Reduce the size of struct static_key")
broke old compilers that could not handle static initialization of anonymous
unions. Boris fixed it with a patch that added brackets around the static
initializer. But this creates a dependency between those initializers and
the structure's order of its fields. Document this dependency in case new
fields are added to struct static_key in the future.

Noted-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Suggested-by: Chris Mason <clm@fb.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/jump_label.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index a7f90117cf7d..28e04a33535a 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -90,6 +90,13 @@ extern bool static_key_initialized;
 struct static_key {
 	atomic_t enabled;
 /*
+ * Note:
+ *   To make anonymous unions work with old compilers, the static
+ *   initialization of them requires brackets. This creates a dependency
+ *   on the order of the struct with the initializers. If any fields
+ *   are added, STATIC_KEY_INIT_TRUE and STATIC_KEY_INIT_FALSE may need
+ *   to be modified.
+ *
  * bit 0 => 1 if key is initially true
  *	    0 if initially false
  * bit 1 => 1 if points to struct static_key_mod
-- 
cgit v1.2.3


From bf7165cfa23695c51998231c4efa080fe1d3548d Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Wed, 28 Sep 2016 22:55:54 -0400
Subject: tracing: Add #undef to fix compile error

There are several trace include files that define TRACE_INCLUDE_FILE.

Include several of them in the same .c file (as I currently have in
some code I am working on), and the compile will blow up with a
"warning: "TRACE_INCLUDE_FILE" redefined #define TRACE_INCLUDE_FILE syscalls"

Every other include file in include/trace/events/ avoids that issue
by having a #undef TRACE_INCLUDE_FILE before the #define; syscalls.h
should have one, too.

Link: http://lkml.kernel.org/r/20160928225554.13bd7ac6@annuminas.surriel.com

Cc: stable@vger.kernel.org
Fixes: b8007ef74222 ("tracing: Separate raw syscall from syscall tracer")
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/trace/events/syscalls.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/trace/events/syscalls.h b/include/trace/events/syscalls.h
index 14e49c798135..b35533b94277 100644
--- a/include/trace/events/syscalls.h
+++ b/include/trace/events/syscalls.h
@@ -1,5 +1,6 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM raw_syscalls
+#undef TRACE_INCLUDE_FILE
 #define TRACE_INCLUDE_FILE syscalls
 
 #if !defined(_TRACE_EVENTS_SYSCALLS_H) || defined(TRACE_HEADER_MULTI_READ)
-- 
cgit v1.2.3


From 0878fff1f42c18e448ab5b8b4f6a3eb32365b5b6 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Sun, 5 Mar 2017 12:34:49 -0800
Subject: net: phy: Do not perform software reset for Generic PHY

The Generic PHY driver is a catch-all PHY driver and it should preserve
whatever prior initialization has been done by boot loader or firmware
agents. For specific PHY device configuration it is expected that a
specialized PHY driver would take over that role.

Resetting the generic PHY was a bad idea that has lead to several
complaints and downstream workarounds e.g: in OpenWrt/LEDE so restore
the behavior prior to 87aa9f9c61ad ("net: phy: consolidate PHY
reset in phy_init_hw()").

Reported-by: Felix Fietkau <nbd@nbd.name>
Fixes: 87aa9f9c61ad ("net: phy: consolidate PHY reset in phy_init_hw()")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy_device.c | 2 +-
 include/linux/phy.h          | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index daec6555f3b1..5198ccfa347f 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1864,7 +1864,7 @@ static struct phy_driver genphy_driver[] = {
 	.phy_id		= 0xffffffff,
 	.phy_id_mask	= 0xffffffff,
 	.name		= "Generic PHY",
-	.soft_reset	= genphy_soft_reset,
+	.soft_reset	= genphy_no_soft_reset,
 	.config_init	= genphy_config_init,
 	.features	= PHY_GBIT_FEATURES | SUPPORTED_MII |
 			  SUPPORTED_AUI | SUPPORTED_FIBRE |
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 772476028a65..43a774873aa9 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -837,6 +837,10 @@ int genphy_read_status(struct phy_device *phydev);
 int genphy_suspend(struct phy_device *phydev);
 int genphy_resume(struct phy_device *phydev);
 int genphy_soft_reset(struct phy_device *phydev);
+static inline int genphy_no_soft_reset(struct phy_device *phydev)
+{
+	return 0;
+}
 void phy_driver_unregister(struct phy_driver *drv);
 void phy_drivers_unregister(struct phy_driver *drv, int n);
 int phy_driver_register(struct phy_driver *new_driver, struct module *owner);
-- 
cgit v1.2.3


From 4dfc050571523ac2bc02cbf948dd47621f7dd83f Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Tue, 21 Feb 2017 11:32:47 +0000
Subject: KVM: arm/arm64: vgic-v3: Don't pretend to support IRQ/FIQ bypass

Our GICv3 emulation always presents ICC_SRE_EL1 with DIB/DFB set to
zero, which implies that there is a way to bypass the GIC and
inject raw IRQ/FIQ by driving the CPU pins.

Of course, we don't allow that when the GIC is configured, but
we fail to indicate that to the guest. The obvious fix is to
set these bits (and never let them being changed again).

Reported-by: Peter Maydell <peter.maydell@linaro.org>
Acked-by: Christoffer Dall <cdall@linaro.org>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/linux/irqchip/arm-gic-v3.h | 2 ++
 virt/kvm/arm/vgic/vgic-v3.c        | 5 ++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index 672cfef72fc8..97cbca19430d 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -373,6 +373,8 @@
 #define ICC_IGRPEN0_EL1_MASK		(1 << ICC_IGRPEN0_EL1_SHIFT)
 #define ICC_IGRPEN1_EL1_SHIFT		0
 #define ICC_IGRPEN1_EL1_MASK		(1 << ICC_IGRPEN1_EL1_SHIFT)
+#define ICC_SRE_EL1_DIB			(1U << 2)
+#define ICC_SRE_EL1_DFB			(1U << 1)
 #define ICC_SRE_EL1_SRE			(1U << 0)
 
 /*
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
index edc6ee2dc852..be0f4c3e0142 100644
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -229,10 +229,13 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu)
 	/*
 	 * If we are emulating a GICv3, we do it in an non-GICv2-compatible
 	 * way, so we force SRE to 1 to demonstrate this to the guest.
+	 * Also, we don't support any form of IRQ/FIQ bypass.
 	 * This goes with the spec allowing the value to be RAO/WI.
 	 */
 	if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
-		vgic_v3->vgic_sre = ICC_SRE_EL1_SRE;
+		vgic_v3->vgic_sre = (ICC_SRE_EL1_DIB |
+				     ICC_SRE_EL1_DFB |
+				     ICC_SRE_EL1_SRE);
 		vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE;
 	} else {
 		vgic_v3->vgic_sre = 0;
-- 
cgit v1.2.3


From b3e228473e6cec7cf83b4025b4570c8066ab2dd8 Mon Sep 17 00:00:00 2001
From: Mian Yousaf Kaukab <yousaf.kaukab@suse.com>
Date: Thu, 2 Mar 2017 16:11:47 +0100
Subject: irqdomain: Add empty irq_domain_check_msi_remap

Fix following build error for s390:
drivers/vfio/vfio_iommu_type1.c: In function 'vfio_iommu_type1_attach_group':
drivers/vfio/vfio_iommu_type1.c:1290:25: error: implicit declaration of function 'irq_domain_check_msi_remap'

Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Mian Yousaf Kaukab <yousaf.kaukab@suse.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/linux/irqdomain.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 188eced6813e..9f3616085423 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -524,6 +524,10 @@ static inline struct irq_domain *irq_find_matching_fwnode(
 {
 	return NULL;
 }
+static inline bool irq_domain_check_msi_remap(void)
+{
+	return false;
+}
 #endif /* !CONFIG_IRQ_DOMAIN */
 
 #endif /* _LINUX_IRQDOMAIN_H */
-- 
cgit v1.2.3


From 040757f738e13caaa9c5078bca79aa97e11dde88 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 5 Mar 2017 15:03:22 -0600
Subject: ucount: Remove the atomicity from ucount->count

Always increment/decrement ucount->count under the ucounts_lock.  The
increments are there already and moving the decrements there means the
locking logic of the code is simpler.  This simplification in the
locking logic fixes a race between put_ucounts and get_ucounts that
could result in a use-after-free because the count could go zero then
be found by get_ucounts and then be freed by put_ucounts.

A bug presumably this one was found by a combination of syzkaller and
KASAN.  JongWhan Kim reported the syzkaller failure and Dmitry Vyukov
spotted the race in the code.

Cc: stable@vger.kernel.org
Fixes: f6b2db1a3e8d ("userns: Make the count of user namespaces per user")
Reported-by: JongHwan Kim <zzoru007@gmail.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Andrei Vagin <avagin@gmail.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/user_namespace.h |  2 +-
 kernel/ucount.c                | 18 +++++++++++-------
 2 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index be765234c0a2..32354b4b4b2b 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -72,7 +72,7 @@ struct ucounts {
 	struct hlist_node node;
 	struct user_namespace *ns;
 	kuid_t uid;
-	atomic_t count;
+	int count;
 	atomic_t ucount[UCOUNT_COUNTS];
 };
 
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 62630a40ab3a..b4eeee03934f 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -144,7 +144,7 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
 
 		new->ns = ns;
 		new->uid = uid;
-		atomic_set(&new->count, 0);
+		new->count = 0;
 
 		spin_lock_irq(&ucounts_lock);
 		ucounts = find_ucounts(ns, uid, hashent);
@@ -155,8 +155,10 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
 			ucounts = new;
 		}
 	}
-	if (!atomic_add_unless(&ucounts->count, 1, INT_MAX))
+	if (ucounts->count == INT_MAX)
 		ucounts = NULL;
+	else
+		ucounts->count += 1;
 	spin_unlock_irq(&ucounts_lock);
 	return ucounts;
 }
@@ -165,13 +167,15 @@ static void put_ucounts(struct ucounts *ucounts)
 {
 	unsigned long flags;
 
-	if (atomic_dec_and_test(&ucounts->count)) {
-		spin_lock_irqsave(&ucounts_lock, flags);
+	spin_lock_irqsave(&ucounts_lock, flags);
+	ucounts->count -= 1;
+	if (!ucounts->count)
 		hlist_del_init(&ucounts->node);
-		spin_unlock_irqrestore(&ucounts_lock, flags);
+	else
+		ucounts = NULL;
+	spin_unlock_irqrestore(&ucounts_lock, flags);
 
-		kfree(ucounts);
-	}
+	kfree(ucounts);
 }
 
 static inline bool atomic_inc_below(atomic_t *v, int u)
-- 
cgit v1.2.3


From 7cc5e38f2f0b0b58a22a4c18a56348dd99a71270 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Sun, 12 Feb 2017 17:11:07 +0100
Subject: libceph: osd_request_timeout option

osd_request_timeout specifies how many seconds to wait for a response
from OSDs before returning -ETIMEDOUT from an OSD request.  0 (default)
means no limit.

osd_request_timeout is osdkeepalive-precise -- in-flight requests are
swept through every osdkeepalive seconds.  With ack vs commit behaviour
gone, abort_request() is really simple.

This is based on a patch from Artur Molchanov <artur.molchanov@synesis.ru>.

Tested-by: Artur Molchanov <artur.molchanov@synesis.ru>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Sage Weil <sage@redhat.com>
---
 include/linux/ceph/libceph.h    |  2 ++
 include/linux/ceph/osd_client.h |  1 +
 net/ceph/ceph_common.c          | 15 +++++++++++++++
 net/ceph/osd_client.c           | 36 +++++++++++++++++++++++++++++++++++-
 4 files changed, 53 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 1816c5e26581..88cd5dc8e238 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -48,6 +48,7 @@ struct ceph_options {
 	unsigned long mount_timeout;		/* jiffies */
 	unsigned long osd_idle_ttl;		/* jiffies */
 	unsigned long osd_keepalive_timeout;	/* jiffies */
+	unsigned long osd_request_timeout;	/* jiffies */
 
 	/*
 	 * any type that can't be simply compared or doesn't need need
@@ -68,6 +69,7 @@ struct ceph_options {
 #define CEPH_MOUNT_TIMEOUT_DEFAULT	msecs_to_jiffies(60 * 1000)
 #define CEPH_OSD_KEEPALIVE_DEFAULT	msecs_to_jiffies(5 * 1000)
 #define CEPH_OSD_IDLE_TTL_DEFAULT	msecs_to_jiffies(60 * 1000)
+#define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT 0  /* no timeout */
 
 #define CEPH_MONC_HUNT_INTERVAL		msecs_to_jiffies(3 * 1000)
 #define CEPH_MONC_PING_INTERVAL		msecs_to_jiffies(10 * 1000)
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 2ea0c282f3dc..c125b5d9e13c 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -189,6 +189,7 @@ struct ceph_osd_request {
 
 	/* internal */
 	unsigned long r_stamp;                /* jiffies, send or check time */
+	unsigned long r_start_stamp;          /* jiffies */
 	int r_attempts;
 	struct ceph_eversion r_replay_version; /* aka reassert_version */
 	u32 r_last_force_resend;
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 464e88599b9d..108533859a53 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -230,6 +230,7 @@ enum {
 	Opt_osdkeepalivetimeout,
 	Opt_mount_timeout,
 	Opt_osd_idle_ttl,
+	Opt_osd_request_timeout,
 	Opt_last_int,
 	/* int args above */
 	Opt_fsid,
@@ -256,6 +257,7 @@ static match_table_t opt_tokens = {
 	{Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
 	{Opt_mount_timeout, "mount_timeout=%d"},
 	{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
+	{Opt_osd_request_timeout, "osd_request_timeout=%d"},
 	/* int args above */
 	{Opt_fsid, "fsid=%s"},
 	{Opt_name, "name=%s"},
@@ -361,6 +363,7 @@ ceph_parse_options(char *options, const char *dev_name,
 	opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
 	opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
 	opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
+	opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
 
 	/* get mon ip(s) */
 	/* ip1[:port1][,ip2[:port2]...] */
@@ -473,6 +476,15 @@ ceph_parse_options(char *options, const char *dev_name,
 			}
 			opt->mount_timeout = msecs_to_jiffies(intval * 1000);
 			break;
+		case Opt_osd_request_timeout:
+			/* 0 is "wait forever" (i.e. infinite timeout) */
+			if (intval < 0 || intval > INT_MAX / 1000) {
+				pr_err("osd_request_timeout out of range\n");
+				err = -EINVAL;
+				goto out;
+			}
+			opt->osd_request_timeout = msecs_to_jiffies(intval * 1000);
+			break;
 
 		case Opt_share:
 			opt->flags &= ~CEPH_OPT_NOSHARE;
@@ -557,6 +569,9 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
 	if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
 		seq_printf(m, "osdkeepalivetimeout=%d,",
 		    jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000);
+	if (opt->osd_request_timeout != CEPH_OSD_REQUEST_TIMEOUT_DEFAULT)
+		seq_printf(m, "osd_request_timeout=%d,",
+			   jiffies_to_msecs(opt->osd_request_timeout) / 1000);
 
 	/* drop redundant comma */
 	if (m->count != pos)
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index b65bbf9f45eb..e15ea9e4c495 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1709,6 +1709,8 @@ static void account_request(struct ceph_osd_request *req)
 
 	req->r_flags |= CEPH_OSD_FLAG_ONDISK;
 	atomic_inc(&req->r_osdc->num_requests);
+
+	req->r_start_stamp = jiffies;
 }
 
 static void submit_request(struct ceph_osd_request *req, bool wrlocked)
@@ -1789,6 +1791,14 @@ static void cancel_request(struct ceph_osd_request *req)
 	ceph_osdc_put_request(req);
 }
 
+static void abort_request(struct ceph_osd_request *req, int err)
+{
+	dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
+
+	cancel_map_check(req);
+	complete_request(req, err);
+}
+
 static void check_pool_dne(struct ceph_osd_request *req)
 {
 	struct ceph_osd_client *osdc = req->r_osdc;
@@ -2487,6 +2497,7 @@ static void handle_timeout(struct work_struct *work)
 		container_of(work, struct ceph_osd_client, timeout_work.work);
 	struct ceph_options *opts = osdc->client->options;
 	unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
+	unsigned long expiry_cutoff = jiffies - opts->osd_request_timeout;
 	LIST_HEAD(slow_osds);
 	struct rb_node *n, *p;
 
@@ -2502,15 +2513,23 @@ static void handle_timeout(struct work_struct *work)
 		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
 		bool found = false;
 
-		for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
+		for (p = rb_first(&osd->o_requests); p; ) {
 			struct ceph_osd_request *req =
 			    rb_entry(p, struct ceph_osd_request, r_node);
 
+			p = rb_next(p); /* abort_request() */
+
 			if (time_before(req->r_stamp, cutoff)) {
 				dout(" req %p tid %llu on osd%d is laggy\n",
 				     req, req->r_tid, osd->o_osd);
 				found = true;
 			}
+			if (opts->osd_request_timeout &&
+			    time_before(req->r_start_stamp, expiry_cutoff)) {
+				pr_err_ratelimited("tid %llu on osd%d timeout\n",
+				       req->r_tid, osd->o_osd);
+				abort_request(req, -ETIMEDOUT);
+			}
 		}
 		for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
 			struct ceph_osd_linger_request *lreq =
@@ -2530,6 +2549,21 @@ static void handle_timeout(struct work_struct *work)
 			list_move_tail(&osd->o_keepalive_item, &slow_osds);
 	}
 
+	if (opts->osd_request_timeout) {
+		for (p = rb_first(&osdc->homeless_osd.o_requests); p; ) {
+			struct ceph_osd_request *req =
+			    rb_entry(p, struct ceph_osd_request, r_node);
+
+			p = rb_next(p); /* abort_request() */
+
+			if (time_before(req->r_start_stamp, expiry_cutoff)) {
+				pr_err_ratelimited("tid %llu on osd%d timeout\n",
+				       req->r_tid, osdc->homeless_osd.o_osd);
+				abort_request(req, -ETIMEDOUT);
+			}
+		}
+	}
+
 	if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds))
 		maybe_request_map(osdc);
 
-- 
cgit v1.2.3


From 8c71fff434e5ecf5ff27bd61db1bc9ac4c2b2a1b Mon Sep 17 00:00:00 2001
From: Kieran Bingham <kieran.bingham+renesas@ideasonboard.com>
Date: Fri, 3 Mar 2017 06:31:48 -0300
Subject: [media] v4l: vsp1: Adapt vsp1_du_setup_lif() interface to use a
 structure

The interface to configure the LIF in the VSP1 requires adapting the
function prototype for any changes. This makes extending the interface
difficult.

Change the function prototype to pass a structure which can be easily
extended.

This changes the means of disabling the pipeline, by now passing a NULL
configuration rather than passing either a 0 width or height.

[Fixed kerneldoc, made vsp1_du_setup_lif() cfg argument const]

Signed-off-by: Kieran Bingham <kieran.bingham+renesas@ideasonboard.com>
Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Acked-by: Dave Airlie <airlied@redhat.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 drivers/gpu/drm/rcar-du/rcar_du_vsp.c  |  8 ++++++--
 drivers/media/platform/vsp1/vsp1_drm.c | 33 ++++++++++++++++-----------------
 include/media/vsp1.h                   | 13 +++++++++++--
 3 files changed, 33 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/rcar-du/rcar_du_vsp.c b/drivers/gpu/drm/rcar-du/rcar_du_vsp.c
index 83ebd162f3ef..22da2d671297 100644
--- a/drivers/gpu/drm/rcar-du/rcar_du_vsp.c
+++ b/drivers/gpu/drm/rcar-du/rcar_du_vsp.c
@@ -32,6 +32,10 @@ void rcar_du_vsp_enable(struct rcar_du_crtc *crtc)
 {
 	const struct drm_display_mode *mode = &crtc->crtc.state->adjusted_mode;
 	struct rcar_du_device *rcdu = crtc->group->dev;
+	struct vsp1_du_lif_config cfg = {
+		.width = mode->hdisplay,
+		.height = mode->vdisplay,
+	};
 	struct rcar_du_plane_state state = {
 		.state = {
 			.crtc = &crtc->crtc,
@@ -66,12 +70,12 @@ void rcar_du_vsp_enable(struct rcar_du_crtc *crtc)
 	 */
 	crtc->group->need_restart = true;
 
-	vsp1_du_setup_lif(crtc->vsp->vsp, mode->hdisplay, mode->vdisplay);
+	vsp1_du_setup_lif(crtc->vsp->vsp, &cfg);
 }
 
 void rcar_du_vsp_disable(struct rcar_du_crtc *crtc)
 {
-	vsp1_du_setup_lif(crtc->vsp->vsp, 0, 0);
+	vsp1_du_setup_lif(crtc->vsp->vsp, NULL);
 }
 
 void rcar_du_vsp_atomic_begin(struct rcar_du_crtc *crtc)
diff --git a/drivers/media/platform/vsp1/vsp1_drm.c b/drivers/media/platform/vsp1/vsp1_drm.c
index b4b583f7137a..b4c0f10fc3b0 100644
--- a/drivers/media/platform/vsp1/vsp1_drm.c
+++ b/drivers/media/platform/vsp1/vsp1_drm.c
@@ -54,12 +54,11 @@ EXPORT_SYMBOL_GPL(vsp1_du_init);
 /**
  * vsp1_du_setup_lif - Setup the output part of the VSP pipeline
  * @dev: the VSP device
- * @width: output frame width in pixels
- * @height: output frame height in pixels
+ * @cfg: the LIF configuration
  *
- * Configure the output part of VSP DRM pipeline for the given frame @width and
- * @height. This sets up formats on the BRU source pad, the WPF0 sink and source
- * pads, and the LIF sink pad.
+ * Configure the output part of VSP DRM pipeline for the given frame @cfg.width
+ * and @cfg.height. This sets up formats on the BRU source pad, the WPF0 sink
+ * and source pads, and the LIF sink pad.
  *
  * As the media bus code on the BRU source pad is conditioned by the
  * configuration of the BRU sink 0 pad, we also set up the formats on all BRU
@@ -69,8 +68,7 @@ EXPORT_SYMBOL_GPL(vsp1_du_init);
  *
  * Return 0 on success or a negative error code on failure.
  */
-int vsp1_du_setup_lif(struct device *dev, unsigned int width,
-		      unsigned int height)
+int vsp1_du_setup_lif(struct device *dev, const struct vsp1_du_lif_config *cfg)
 {
 	struct vsp1_device *vsp1 = dev_get_drvdata(dev);
 	struct vsp1_pipeline *pipe = &vsp1->drm->pipe;
@@ -79,11 +77,8 @@ int vsp1_du_setup_lif(struct device *dev, unsigned int width,
 	unsigned int i;
 	int ret;
 
-	dev_dbg(vsp1->dev, "%s: configuring LIF with format %ux%u\n",
-		__func__, width, height);
-
-	if (width == 0 || height == 0) {
-		/* Zero width or height means the CRTC is being disabled, stop
+	if (!cfg) {
+		/* NULL configuration means the CRTC is being disabled, stop
 		 * the pipeline and turn the light off.
 		 */
 		ret = vsp1_pipeline_stop(pipe);
@@ -108,6 +103,9 @@ int vsp1_du_setup_lif(struct device *dev, unsigned int width,
 		return 0;
 	}
 
+	dev_dbg(vsp1->dev, "%s: configuring LIF with format %ux%u\n",
+		__func__, cfg->width, cfg->height);
+
 	/* Configure the format at the BRU sinks and propagate it through the
 	 * pipeline.
 	 */
@@ -117,8 +115,8 @@ int vsp1_du_setup_lif(struct device *dev, unsigned int width,
 	for (i = 0; i < bru->entity.source_pad; ++i) {
 		format.pad = i;
 
-		format.format.width = width;
-		format.format.height = height;
+		format.format.width = cfg->width;
+		format.format.height = cfg->height;
 		format.format.code = MEDIA_BUS_FMT_ARGB8888_1X32;
 		format.format.field = V4L2_FIELD_NONE;
 
@@ -133,8 +131,8 @@ int vsp1_du_setup_lif(struct device *dev, unsigned int width,
 	}
 
 	format.pad = bru->entity.source_pad;
-	format.format.width = width;
-	format.format.height = height;
+	format.format.width = cfg->width;
+	format.format.height = cfg->height;
 	format.format.code = MEDIA_BUS_FMT_ARGB8888_1X32;
 	format.format.field = V4L2_FIELD_NONE;
 
@@ -180,7 +178,8 @@ int vsp1_du_setup_lif(struct device *dev, unsigned int width,
 	/* Verify that the format at the output of the pipeline matches the
 	 * requested frame size and media bus code.
 	 */
-	if (format.format.width != width || format.format.height != height ||
+	if (format.format.width != cfg->width ||
+	    format.format.height != cfg->height ||
 	    format.format.code != MEDIA_BUS_FMT_ARGB8888_1X32) {
 		dev_dbg(vsp1->dev, "%s: format mismatch\n", __func__);
 		return -EPIPE;
diff --git a/include/media/vsp1.h b/include/media/vsp1.h
index 458b400373d4..38aac554dbba 100644
--- a/include/media/vsp1.h
+++ b/include/media/vsp1.h
@@ -20,8 +20,17 @@ struct device;
 
 int vsp1_du_init(struct device *dev);
 
-int vsp1_du_setup_lif(struct device *dev, unsigned int width,
-		      unsigned int height);
+/**
+ * struct vsp1_du_lif_config - VSP LIF configuration
+ * @width: output frame width
+ * @height: output frame height
+ */
+struct vsp1_du_lif_config {
+	unsigned int width;
+	unsigned int height;
+};
+
+int vsp1_du_setup_lif(struct device *dev, const struct vsp1_du_lif_config *cfg);
 
 struct vsp1_du_atomic_config {
 	u32 pixelformat;
-- 
cgit v1.2.3


From 62f8f4d9066c1c6f2474845d1ca7e2891f2ae3fd Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 5 Mar 2017 10:52:16 -0800
Subject: dccp: fix use-after-free in dccp_feat_activate_values

Dmitry reported crashes in DCCP stack [1]

Problem here is that when I got rid of listener spinlock, I missed the
fact that DCCP stores a complex state in struct dccp_request_sock,
while TCP does not.

Since multiple cpus could access it at the same time, we need to add
protection.

[1]
BUG: KASAN: use-after-free in dccp_feat_activate_values+0x967/0xab0
net/dccp/feat.c:1541 at addr ffff88003713be68
Read of size 8 by task syz-executor2/8457
CPU: 2 PID: 8457 Comm: syz-executor2 Not tainted 4.10.0-rc7+ #127
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
Call Trace:
 <IRQ>
 __dump_stack lib/dump_stack.c:15 [inline]
 dump_stack+0x292/0x398 lib/dump_stack.c:51
 kasan_object_err+0x1c/0x70 mm/kasan/report.c:162
 print_address_description mm/kasan/report.c:200 [inline]
 kasan_report_error mm/kasan/report.c:289 [inline]
 kasan_report.part.1+0x20e/0x4e0 mm/kasan/report.c:311
 kasan_report mm/kasan/report.c:332 [inline]
 __asan_report_load8_noabort+0x29/0x30 mm/kasan/report.c:332
 dccp_feat_activate_values+0x967/0xab0 net/dccp/feat.c:1541
 dccp_create_openreq_child+0x464/0x610 net/dccp/minisocks.c:121
 dccp_v6_request_recv_sock+0x1f6/0x1960 net/dccp/ipv6.c:457
 dccp_check_req+0x335/0x5a0 net/dccp/minisocks.c:186
 dccp_v6_rcv+0x69e/0x1d00 net/dccp/ipv6.c:711
 ip6_input_finish+0x46d/0x17a0 net/ipv6/ip6_input.c:279
 NF_HOOK include/linux/netfilter.h:257 [inline]
 ip6_input+0xdb/0x590 net/ipv6/ip6_input.c:322
 dst_input include/net/dst.h:507 [inline]
 ip6_rcv_finish+0x289/0x890 net/ipv6/ip6_input.c:69
 NF_HOOK include/linux/netfilter.h:257 [inline]
 ipv6_rcv+0x12ec/0x23d0 net/ipv6/ip6_input.c:203
 __netif_receive_skb_core+0x1ae5/0x3400 net/core/dev.c:4190
 __netif_receive_skb+0x2a/0x170 net/core/dev.c:4228
 process_backlog+0xe5/0x6c0 net/core/dev.c:4839
 napi_poll net/core/dev.c:5202 [inline]
 net_rx_action+0xe70/0x1900 net/core/dev.c:5267
 __do_softirq+0x2fb/0xb7d kernel/softirq.c:284
 do_softirq_own_stack+0x1c/0x30 arch/x86/entry/entry_64.S:902
 </IRQ>
 do_softirq.part.17+0x1e8/0x230 kernel/softirq.c:328
 do_softirq kernel/softirq.c:176 [inline]
 __local_bh_enable_ip+0x1f2/0x200 kernel/softirq.c:181
 local_bh_enable include/linux/bottom_half.h:31 [inline]
 rcu_read_unlock_bh include/linux/rcupdate.h:971 [inline]
 ip6_finish_output2+0xbb0/0x23d0 net/ipv6/ip6_output.c:123
 ip6_finish_output+0x302/0x960 net/ipv6/ip6_output.c:148
 NF_HOOK_COND include/linux/netfilter.h:246 [inline]
 ip6_output+0x1cb/0x8d0 net/ipv6/ip6_output.c:162
 ip6_xmit+0xcdf/0x20d0 include/net/dst.h:501
 inet6_csk_xmit+0x320/0x5f0 net/ipv6/inet6_connection_sock.c:179
 dccp_transmit_skb+0xb09/0x1120 net/dccp/output.c:141
 dccp_xmit_packet+0x215/0x760 net/dccp/output.c:280
 dccp_write_xmit+0x168/0x1d0 net/dccp/output.c:362
 dccp_sendmsg+0x79c/0xb10 net/dccp/proto.c:796
 inet_sendmsg+0x164/0x5b0 net/ipv4/af_inet.c:744
 sock_sendmsg_nosec net/socket.c:635 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:645
 SYSC_sendto+0x660/0x810 net/socket.c:1687
 SyS_sendto+0x40/0x50 net/socket.c:1655
 entry_SYSCALL_64_fastpath+0x1f/0xc2
RIP: 0033:0x4458b9
RSP: 002b:00007f8ceb77bb58 EFLAGS: 00000282 ORIG_RAX: 000000000000002c
RAX: ffffffffffffffda RBX: 0000000000000017 RCX: 00000000004458b9
RDX: 0000000000000023 RSI: 0000000020e60000 RDI: 0000000000000017
RBP: 00000000006e1b90 R08: 00000000200f9fe1 R09: 0000000000000020
R10: 0000000000008010 R11: 0000000000000282 R12: 00000000007080a8
R13: 0000000000000000 R14: 00007f8ceb77c9c0 R15: 00007f8ceb77c700
Object at ffff88003713be50, in cache kmalloc-64 size: 64
Allocated:
PID = 8446
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57
 save_stack+0x43/0xd0 mm/kasan/kasan.c:502
 set_track mm/kasan/kasan.c:514 [inline]
 kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:605
 kmem_cache_alloc_trace+0x82/0x270 mm/slub.c:2738
 kmalloc include/linux/slab.h:490 [inline]
 dccp_feat_entry_new+0x214/0x410 net/dccp/feat.c:467
 dccp_feat_push_change+0x38/0x220 net/dccp/feat.c:487
 __feat_register_sp+0x223/0x2f0 net/dccp/feat.c:741
 dccp_feat_propagate_ccid+0x22b/0x2b0 net/dccp/feat.c:949
 dccp_feat_server_ccid_dependencies+0x1b3/0x250 net/dccp/feat.c:1012
 dccp_make_response+0x1f1/0xc90 net/dccp/output.c:423
 dccp_v6_send_response+0x4ec/0xc20 net/dccp/ipv6.c:217
 dccp_v6_conn_request+0xaba/0x11b0 net/dccp/ipv6.c:377
 dccp_rcv_state_process+0x51e/0x1650 net/dccp/input.c:606
 dccp_v6_do_rcv+0x213/0x350 net/dccp/ipv6.c:632
 sk_backlog_rcv include/net/sock.h:893 [inline]
 __sk_receive_skb+0x36f/0xcc0 net/core/sock.c:479
 dccp_v6_rcv+0xba5/0x1d00 net/dccp/ipv6.c:742
 ip6_input_finish+0x46d/0x17a0 net/ipv6/ip6_input.c:279
 NF_HOOK include/linux/netfilter.h:257 [inline]
 ip6_input+0xdb/0x590 net/ipv6/ip6_input.c:322
 dst_input include/net/dst.h:507 [inline]
 ip6_rcv_finish+0x289/0x890 net/ipv6/ip6_input.c:69
 NF_HOOK include/linux/netfilter.h:257 [inline]
 ipv6_rcv+0x12ec/0x23d0 net/ipv6/ip6_input.c:203
 __netif_receive_skb_core+0x1ae5/0x3400 net/core/dev.c:4190
 __netif_receive_skb+0x2a/0x170 net/core/dev.c:4228
 process_backlog+0xe5/0x6c0 net/core/dev.c:4839
 napi_poll net/core/dev.c:5202 [inline]
 net_rx_action+0xe70/0x1900 net/core/dev.c:5267
 __do_softirq+0x2fb/0xb7d kernel/softirq.c:284
Freed:
PID = 15
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57
 save_stack+0x43/0xd0 mm/kasan/kasan.c:502
 set_track mm/kasan/kasan.c:514 [inline]
 kasan_slab_free+0x73/0xc0 mm/kasan/kasan.c:578
 slab_free_hook mm/slub.c:1355 [inline]
 slab_free_freelist_hook mm/slub.c:1377 [inline]
 slab_free mm/slub.c:2954 [inline]
 kfree+0xe8/0x2b0 mm/slub.c:3874
 dccp_feat_entry_destructor.part.4+0x48/0x60 net/dccp/feat.c:418
 dccp_feat_entry_destructor net/dccp/feat.c:416 [inline]
 dccp_feat_list_pop net/dccp/feat.c:541 [inline]
 dccp_feat_activate_values+0x57f/0xab0 net/dccp/feat.c:1543
 dccp_create_openreq_child+0x464/0x610 net/dccp/minisocks.c:121
 dccp_v6_request_recv_sock+0x1f6/0x1960 net/dccp/ipv6.c:457
 dccp_check_req+0x335/0x5a0 net/dccp/minisocks.c:186
 dccp_v6_rcv+0x69e/0x1d00 net/dccp/ipv6.c:711
 ip6_input_finish+0x46d/0x17a0 net/ipv6/ip6_input.c:279
 NF_HOOK include/linux/netfilter.h:257 [inline]
 ip6_input+0xdb/0x590 net/ipv6/ip6_input.c:322
 dst_input include/net/dst.h:507 [inline]
 ip6_rcv_finish+0x289/0x890 net/ipv6/ip6_input.c:69
 NF_HOOK include/linux/netfilter.h:257 [inline]
 ipv6_rcv+0x12ec/0x23d0 net/ipv6/ip6_input.c:203
 __netif_receive_skb_core+0x1ae5/0x3400 net/core/dev.c:4190
 __netif_receive_skb+0x2a/0x170 net/core/dev.c:4228
 process_backlog+0xe5/0x6c0 net/core/dev.c:4839
 napi_poll net/core/dev.c:5202 [inline]
 net_rx_action+0xe70/0x1900 net/core/dev.c:5267
 __do_softirq+0x2fb/0xb7d kernel/softirq.c:284
Memory state around the buggy address:
 ffff88003713bd00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
 ffff88003713bd80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>ffff88003713be00: fc fc fc fc fc fc fc fc fc fc fb fb fb fb fb fb
                                                          ^

Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Tested-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dccp.h |  1 +
 net/dccp/minisocks.c | 24 ++++++++++++++++--------
 2 files changed, 17 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index 61d042bbbf60..68449293c4b6 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -163,6 +163,7 @@ struct dccp_request_sock {
 	__u64			 dreq_isr;
 	__u64			 dreq_gsr;
 	__be32			 dreq_service;
+	spinlock_t		 dreq_lock;
 	struct list_head	 dreq_featneg;
 	__u32			 dreq_timestamp_echo;
 	__u32			 dreq_timestamp_time;
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index e267e6f4c9a5..abd07a443219 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -142,6 +142,13 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
 	struct dccp_request_sock *dreq = dccp_rsk(req);
 	bool own_req;
 
+	/* TCP/DCCP listeners became lockless.
+	 * DCCP stores complex state in its request_sock, so we need
+	 * a protection for them, now this code runs without being protected
+	 * by the parent (listener) lock.
+	 */
+	spin_lock_bh(&dreq->dreq_lock);
+
 	/* Check for retransmitted REQUEST */
 	if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) {
 
@@ -156,7 +163,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
 			inet_rtx_syn_ack(sk, req);
 		}
 		/* Network Duplicate, discard packet */
-		return NULL;
+		goto out;
 	}
 
 	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
@@ -182,20 +189,20 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
 
 	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
 							 req, &own_req);
-	if (!child)
-		goto listen_overflow;
-
-	return inet_csk_complete_hashdance(sk, child, req, own_req);
+	if (child) {
+		child = inet_csk_complete_hashdance(sk, child, req, own_req);
+		goto out;
+	}
 
-listen_overflow:
-	dccp_pr_debug("listen_overflow!\n");
 	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
 drop:
 	if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET)
 		req->rsk_ops->send_reset(sk, skb);
 
 	inet_csk_reqsk_queue_drop(sk, req);
-	return NULL;
+out:
+	spin_unlock_bh(&dreq->dreq_lock);
+	return child;
 }
 
 EXPORT_SYMBOL_GPL(dccp_check_req);
@@ -246,6 +253,7 @@ int dccp_reqsk_init(struct request_sock *req,
 {
 	struct dccp_request_sock *dreq = dccp_rsk(req);
 
+	spin_lock_init(&dreq->dreq_lock);
 	inet_rsk(req)->ir_rmt_port = dccp_hdr(skb)->dccph_sport;
 	inet_rsk(req)->ir_num	   = ntohs(dccp_hdr(skb)->dccph_dport);
 	inet_rsk(req)->acked	   = 0;
-- 
cgit v1.2.3


From c01228db4ba965986511a5b28c478bddd7e2726e Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 8 Mar 2017 17:48:34 +0100
Subject: Revert "scsi, block: fix duplicate bdi name registration crashes"

This reverts commit 0dba1314d4f81115dce711292ec7981d17231064. It causes
leaking of device numbers for SCSI when SCSI registers multiple gendisks
for one request_queue in succession. It can be easily reproduced using
Omar's script [1] on kernel with CONFIG_DEBUG_TEST_DRIVER_REMOVE.
Furthermore the protection provided by this commit is not needed anymore
as the problem it was fixing got also fixed by commit 165a5e22fafb
"block: Move bdi_unregister() to del_gendisk()".

[1]: http://marc.info/?l=linux-block&m=148554717109098&w=2

Signed-off-by: Jan Kara <jack@suse.cz>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       |  2 --
 block/genhd.c          | 21 ---------------------
 drivers/scsi/sd.c      | 41 ++++++++---------------------------------
 include/linux/blkdev.h |  1 -
 include/linux/genhd.h  |  8 --------
 5 files changed, 8 insertions(+), 65 deletions(-)

(limited to 'include')

diff --git a/block/blk-core.c b/block/blk-core.c
index 1086dac8724c..a76895c9776d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -578,8 +578,6 @@ void blk_cleanup_queue(struct request_queue *q)
 		q->queue_lock = &q->__queue_lock;
 	spin_unlock_irq(lock);
 
-	put_disk_devt(q->disk_devt);
-
 	/* @q is and will stay empty, shutdown and put */
 	blk_put_queue(q);
 }
diff --git a/block/genhd.c b/block/genhd.c
index 94f323842b52..a9c516a8b37d 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -572,20 +572,6 @@ exit:
 	disk_part_iter_exit(&piter);
 }
 
-void put_disk_devt(struct disk_devt *disk_devt)
-{
-	if (disk_devt && atomic_dec_and_test(&disk_devt->count))
-		disk_devt->release(disk_devt);
-}
-EXPORT_SYMBOL(put_disk_devt);
-
-void get_disk_devt(struct disk_devt *disk_devt)
-{
-	if (disk_devt)
-		atomic_inc(&disk_devt->count);
-}
-EXPORT_SYMBOL(get_disk_devt);
-
 /**
  * device_add_disk - add partitioning information to kernel list
  * @parent: parent device for the disk
@@ -626,13 +612,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
 
 	disk_alloc_events(disk);
 
-	/*
-	 * Take a reference on the devt and assign it to queue since it
-	 * must not be reallocated while the bdi is registered
-	 */
-	disk->queue->disk_devt = disk->disk_devt;
-	get_disk_devt(disk->disk_devt);
-
 	/* Register BDI before referencing it from bdev */
 	bdi = disk->queue->backing_dev_info;
 	bdi_register_owner(bdi, disk_to_dev(disk));
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index c7839f6c35cc..d277e8620e3e 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -3075,23 +3075,6 @@ static void sd_probe_async(void *data, async_cookie_t cookie)
 	put_device(&sdkp->dev);
 }
 
-struct sd_devt {
-	int idx;
-	struct disk_devt disk_devt;
-};
-
-static void sd_devt_release(struct disk_devt *disk_devt)
-{
-	struct sd_devt *sd_devt = container_of(disk_devt, struct sd_devt,
-			disk_devt);
-
-	spin_lock(&sd_index_lock);
-	ida_remove(&sd_index_ida, sd_devt->idx);
-	spin_unlock(&sd_index_lock);
-
-	kfree(sd_devt);
-}
-
 /**
  *	sd_probe - called during driver initialization and whenever a
  *	new scsi device is attached to the system. It is called once
@@ -3113,7 +3096,6 @@ static void sd_devt_release(struct disk_devt *disk_devt)
 static int sd_probe(struct device *dev)
 {
 	struct scsi_device *sdp = to_scsi_device(dev);
-	struct sd_devt *sd_devt;
 	struct scsi_disk *sdkp;
 	struct gendisk *gd;
 	int index;
@@ -3139,13 +3121,9 @@ static int sd_probe(struct device *dev)
 	if (!sdkp)
 		goto out;
 
-	sd_devt = kzalloc(sizeof(*sd_devt), GFP_KERNEL);
-	if (!sd_devt)
-		goto out_free;
-
 	gd = alloc_disk(SD_MINORS);
 	if (!gd)
-		goto out_free_devt;
+		goto out_free;
 
 	do {
 		if (!ida_pre_get(&sd_index_ida, GFP_KERNEL))
@@ -3161,11 +3139,6 @@ static int sd_probe(struct device *dev)
 		goto out_put;
 	}
 
-	atomic_set(&sd_devt->disk_devt.count, 1);
-	sd_devt->disk_devt.release = sd_devt_release;
-	sd_devt->idx = index;
-	gd->disk_devt = &sd_devt->disk_devt;
-
 	error = sd_format_disk_name("sd", index, gd->disk_name, DISK_NAME_LEN);
 	if (error) {
 		sdev_printk(KERN_WARNING, sdp, "SCSI disk (sd) name length exceeded.\n");
@@ -3205,12 +3178,11 @@ static int sd_probe(struct device *dev)
 	return 0;
 
  out_free_index:
-	put_disk_devt(&sd_devt->disk_devt);
-	sd_devt = NULL;
+	spin_lock(&sd_index_lock);
+	ida_remove(&sd_index_ida, index);
+	spin_unlock(&sd_index_lock);
  out_put:
 	put_disk(gd);
- out_free_devt:
-	kfree(sd_devt);
  out_free:
 	kfree(sdkp);
  out:
@@ -3271,7 +3243,10 @@ static void scsi_disk_release(struct device *dev)
 	struct scsi_disk *sdkp = to_scsi_disk(dev);
 	struct gendisk *disk = sdkp->disk;
 	
-	put_disk_devt(disk->disk_devt);
+	spin_lock(&sd_index_lock);
+	ida_remove(&sd_index_ida, sdkp->index);
+	spin_unlock(&sd_index_lock);
+
 	disk->private_data = NULL;
 	put_disk(disk);
 	put_device(&sdkp->device->sdev_gendev);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 796016e63c1d..5a7da607ca04 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -435,7 +435,6 @@ struct request_queue {
 	struct delayed_work	delay_work;
 
 	struct backing_dev_info	*backing_dev_info;
-	struct disk_devt	*disk_devt;
 
 	/*
 	 * The queue owner gets to use this for whatever they like.
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index a999d281a2f1..76f39754e7b0 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -167,13 +167,6 @@ struct blk_integrity {
 };
 
 #endif	/* CONFIG_BLK_DEV_INTEGRITY */
-struct disk_devt {
-	atomic_t count;
-	void (*release)(struct disk_devt *disk_devt);
-};
-
-void put_disk_devt(struct disk_devt *disk_devt);
-void get_disk_devt(struct disk_devt *disk_devt);
 
 struct gendisk {
 	/* major, first_minor and minors are input parameters only,
@@ -183,7 +176,6 @@ struct gendisk {
 	int first_minor;
 	int minors;                     /* maximum number of minors, =1 for
                                          * disks that can't be partitioned. */
-	struct disk_devt *disk_devt;
 
 	char disk_name[DISK_NAME_LEN];	/* name of major driver */
 	char *(*devnode)(struct gendisk *gd, umode_t *mode);
-- 
cgit v1.2.3


From bd0f9b356d00aa241ced36fb075a07041c28d3b8 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 7 Mar 2017 15:33:14 -0800
Subject: sched/headers: fix up header file dependency on
 <linux/sched/signal.h>

The scheduler header file split and cleanups ended up exposing a few
nasty header file dependencies, and in particular it showed how we in
<linux/wait.h> ended up depending on "signal_pending()", which now comes
from <linux/sched/signal.h>.

That's a very subtle and annoying dependency, which already caused a
semantic merge conflict (see commit e58bc927835a "Pull overlayfs updates
from Miklos Szeredi", which added that fixup in the merge commit).

It turns out that we can avoid this dependency _and_ improve code
generation by moving the guts of the fairly nasty helper #define
__wait_event_interruptible_locked() to out-of-line code.  The code that
includes the signal_pending() check is all in the slow-path where we
actually go to sleep waiting for the event anyway, so using a helper
function is the right thing to do.

Using a helper function is also what we already did for the non-locked
versions, see the "__wait_event*()" macros and the "prepare_to_wait*()"
set of helper functions.

We might want to try to unify all these macro games, we have a _lot_ of
subtly different wait-event loops.  But this is the minimal patch to fix
the annoying header dependency.

Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/wait.h | 31 ++++++++++---------------------
 kernel/sched/wait.c  | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index aacb1282d19a..db076ca7f11d 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -620,30 +620,19 @@ do {									\
 	__ret;								\
 })
 
+extern int do_wait_intr(wait_queue_head_t *, wait_queue_t *);
+extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_t *);
 
-#define __wait_event_interruptible_locked(wq, condition, exclusive, irq) \
+#define __wait_event_interruptible_locked(wq, condition, exclusive, fn) \
 ({									\
-	int __ret = 0;							\
+	int __ret;							\
 	DEFINE_WAIT(__wait);						\
 	if (exclusive)							\
 		__wait.flags |= WQ_FLAG_EXCLUSIVE;			\
 	do {								\
-		if (likely(list_empty(&__wait.task_list)))		\
-			__add_wait_queue_tail(&(wq), &__wait);		\
-		set_current_state(TASK_INTERRUPTIBLE);			\
-		if (signal_pending(current)) {				\
-			__ret = -ERESTARTSYS;				\
+		__ret = fn(&(wq), &__wait);				\
+		if (__ret)						\
 			break;						\
-		}							\
-		if (irq)						\
-			spin_unlock_irq(&(wq).lock);			\
-		else							\
-			spin_unlock(&(wq).lock);			\
-		schedule();						\
-		if (irq)						\
-			spin_lock_irq(&(wq).lock);			\
-		else							\
-			spin_lock(&(wq).lock);				\
 	} while (!(condition));						\
 	__remove_wait_queue(&(wq), &__wait);				\
 	__set_current_state(TASK_RUNNING);				\
@@ -676,7 +665,7 @@ do {									\
  */
 #define wait_event_interruptible_locked(wq, condition)			\
 	((condition)							\
-	 ? 0 : __wait_event_interruptible_locked(wq, condition, 0, 0))
+	 ? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr))
 
 /**
  * wait_event_interruptible_locked_irq - sleep until a condition gets true
@@ -703,7 +692,7 @@ do {									\
  */
 #define wait_event_interruptible_locked_irq(wq, condition)		\
 	((condition)							\
-	 ? 0 : __wait_event_interruptible_locked(wq, condition, 0, 1))
+	 ? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr_irq))
 
 /**
  * wait_event_interruptible_exclusive_locked - sleep exclusively until a condition gets true
@@ -734,7 +723,7 @@ do {									\
  */
 #define wait_event_interruptible_exclusive_locked(wq, condition)	\
 	((condition)							\
-	 ? 0 : __wait_event_interruptible_locked(wq, condition, 1, 0))
+	 ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr))
 
 /**
  * wait_event_interruptible_exclusive_locked_irq - sleep until a condition gets true
@@ -765,7 +754,7 @@ do {									\
  */
 #define wait_event_interruptible_exclusive_locked_irq(wq, condition)	\
 	((condition)							\
-	 ? 0 : __wait_event_interruptible_locked(wq, condition, 1, 1))
+	 ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr_irq))
 
 
 #define __wait_event_killable(wq, condition)				\
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 4d2ea6f25568..b8c84c6dee64 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -242,6 +242,45 @@ long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
 }
 EXPORT_SYMBOL(prepare_to_wait_event);
 
+/*
+ * Note! These two wait functions are entered with the
+ * wait-queue lock held (and interrupts off in the _irq
+ * case), so there is no race with testing the wakeup
+ * condition in the caller before they add the wait
+ * entry to the wake queue.
+ */
+int do_wait_intr(wait_queue_head_t *wq, wait_queue_t *wait)
+{
+	if (likely(list_empty(&wait->task_list)))
+		__add_wait_queue_tail(wq, wait);
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	if (signal_pending(current))
+		return -ERESTARTSYS;
+
+	spin_unlock(&wq->lock);
+	schedule();
+	spin_lock(&wq->lock);
+	return 0;
+}
+EXPORT_SYMBOL(do_wait_intr);
+
+int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_t *wait)
+{
+	if (likely(list_empty(&wait->task_list)))
+		__add_wait_queue_tail(wq, wait);
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	if (signal_pending(current))
+		return -ERESTARTSYS;
+
+	spin_unlock_irq(&wq->lock);
+	schedule();
+	spin_lock_irq(&wq->lock);
+	return 0;
+}
+EXPORT_SYMBOL(do_wait_intr_irq);
+
 /**
  * finish_wait - clean up after waiting in a queue
  * @q: waitqueue waited on
-- 
cgit v1.2.3


From 505a60e225606fbd3d2eadc31ff793d939ba66f1 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 9 Mar 2017 17:24:03 +0300
Subject: asm-generic: introduce 5level-fixup.h

We are going to switch core MM to 5-level paging abstraction.

This is preparation step which adds <asm-generic/5level-fixup.h>
As with 4level-fixup.h, the new header allows quickly make all
architectures compatible with 5-level paging in core MM.

In long run we would like to switch architectures to properly folded p4d
level by using <asm-generic/pgtable-nop4d.h>, but it requires more
changes to arch-specific code.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-generic/4level-fixup.h |  3 ++-
 include/asm-generic/5level-fixup.h | 41 ++++++++++++++++++++++++++++++++++++++
 include/linux/mm.h                 |  3 +++
 3 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 include/asm-generic/5level-fixup.h

(limited to 'include')

diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h
index 5bdab6bffd23..928fd66b1271 100644
--- a/include/asm-generic/4level-fixup.h
+++ b/include/asm-generic/4level-fixup.h
@@ -15,7 +15,6 @@
 	((unlikely(pgd_none(*(pud))) && __pmd_alloc(mm, pud, address))? \
  		NULL: pmd_offset(pud, address))
 
-#define pud_alloc(mm, pgd, address)	(pgd)
 #define pud_offset(pgd, start)		(pgd)
 #define pud_none(pud)			0
 #define pud_bad(pud)			0
@@ -35,4 +34,6 @@
 #undef  pud_addr_end
 #define pud_addr_end(addr, end)		(end)
 
+#include <asm-generic/5level-fixup.h>
+
 #endif
diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h
new file mode 100644
index 000000000000..b5ca82dc4175
--- /dev/null
+++ b/include/asm-generic/5level-fixup.h
@@ -0,0 +1,41 @@
+#ifndef _5LEVEL_FIXUP_H
+#define _5LEVEL_FIXUP_H
+
+#define __ARCH_HAS_5LEVEL_HACK
+#define __PAGETABLE_P4D_FOLDED
+
+#define P4D_SHIFT			PGDIR_SHIFT
+#define P4D_SIZE			PGDIR_SIZE
+#define P4D_MASK			PGDIR_MASK
+#define PTRS_PER_P4D			1
+
+#define p4d_t				pgd_t
+
+#define pud_alloc(mm, p4d, address) \
+	((unlikely(pgd_none(*(p4d))) && __pud_alloc(mm, p4d, address)) ? \
+		NULL : pud_offset(p4d, address))
+
+#define p4d_alloc(mm, pgd, address)	(pgd)
+#define p4d_offset(pgd, start)		(pgd)
+#define p4d_none(p4d)			0
+#define p4d_bad(p4d)			0
+#define p4d_present(p4d)		1
+#define p4d_ERROR(p4d)			do { } while (0)
+#define p4d_clear(p4d)			pgd_clear(p4d)
+#define p4d_val(p4d)			pgd_val(p4d)
+#define p4d_populate(mm, p4d, pud)	pgd_populate(mm, p4d, pud)
+#define p4d_page(p4d)			pgd_page(p4d)
+#define p4d_page_vaddr(p4d)		pgd_page_vaddr(p4d)
+
+#define __p4d(x)			__pgd(x)
+#define set_p4d(p4dp, p4d)		set_pgd(p4dp, p4d)
+
+#undef p4d_free_tlb
+#define p4d_free_tlb(tlb, x, addr)	do { } while (0)
+#define p4d_free(mm, x)			do { } while (0)
+#define __p4d_free_tlb(tlb, x, addr)	do { } while (0)
+
+#undef  p4d_addr_end
+#define p4d_addr_end(addr, end)		(end)
+
+#endif
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0d65dd72c0f4..be1fe264eb37 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1619,11 +1619,14 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address);
  * Remove it when 4level-fixup.h has been removed.
  */
 #if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK)
+
+#ifndef __ARCH_HAS_5LEVEL_HACK
 static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 {
 	return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))?
 		NULL: pud_offset(pgd, address);
 }
+#endif /* !__ARCH_HAS_5LEVEL_HACK */
 
 static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 {
-- 
cgit v1.2.3


From 30ec842660bd0d056d4a7028ac5bd4a82b113d4f Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 9 Mar 2017 17:24:04 +0300
Subject: asm-generic: introduce __ARCH_USE_5LEVEL_HACK

We are going to introduce <asm-generic/pgtable-nop4d.h> to provide
abstraction for properly (in opposite to 5level-fixup.h hack) folded
p4d level. The new header will be included from pgtable-nopud.h.

If an architecture uses <asm-generic/nop*d.h>, we cannot use
5level-fixup.h directly to quickly convert the architecture to 5-level
paging as it would conflict with pgtable-nop4d.h.

With this patch an architecture can define __ARCH_USE_5LEVEL_HACK before
inclusion <asm-genenric/nop*d.h> to use 5level-fixup.h.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-generic/pgtable-nop4d-hack.h | 62 ++++++++++++++++++++++++++++++++
 include/asm-generic/pgtable-nopud.h      |  5 +++
 2 files changed, 67 insertions(+)
 create mode 100644 include/asm-generic/pgtable-nop4d-hack.h

(limited to 'include')

diff --git a/include/asm-generic/pgtable-nop4d-hack.h b/include/asm-generic/pgtable-nop4d-hack.h
new file mode 100644
index 000000000000..752fb7511750
--- /dev/null
+++ b/include/asm-generic/pgtable-nop4d-hack.h
@@ -0,0 +1,62 @@
+#ifndef _PGTABLE_NOP4D_HACK_H
+#define _PGTABLE_NOP4D_HACK_H
+
+#ifndef __ASSEMBLY__
+#include <asm-generic/5level-fixup.h>
+
+#define __PAGETABLE_PUD_FOLDED
+
+/*
+ * Having the pud type consist of a pgd gets the size right, and allows
+ * us to conceptually access the pgd entry that this pud is folded into
+ * without casting.
+ */
+typedef struct { pgd_t pgd; } pud_t;
+
+#define PUD_SHIFT	PGDIR_SHIFT
+#define PTRS_PER_PUD	1
+#define PUD_SIZE	(1UL << PUD_SHIFT)
+#define PUD_MASK	(~(PUD_SIZE-1))
+
+/*
+ * The "pgd_xxx()" functions here are trivial for a folded two-level
+ * setup: the pud is never bad, and a pud always exists (as it's folded
+ * into the pgd entry)
+ */
+static inline int pgd_none(pgd_t pgd)		{ return 0; }
+static inline int pgd_bad(pgd_t pgd)		{ return 0; }
+static inline int pgd_present(pgd_t pgd)	{ return 1; }
+static inline void pgd_clear(pgd_t *pgd)	{ }
+#define pud_ERROR(pud)				(pgd_ERROR((pud).pgd))
+
+#define pgd_populate(mm, pgd, pud)		do { } while (0)
+/*
+ * (puds are folded into pgds so this doesn't get actually called,
+ * but the define is needed for a generic inline function.)
+ */
+#define set_pgd(pgdptr, pgdval)	set_pud((pud_t *)(pgdptr), (pud_t) { pgdval })
+
+static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
+{
+	return (pud_t *)pgd;
+}
+
+#define pud_val(x)				(pgd_val((x).pgd))
+#define __pud(x)				((pud_t) { __pgd(x) })
+
+#define pgd_page(pgd)				(pud_page((pud_t){ pgd }))
+#define pgd_page_vaddr(pgd)			(pud_page_vaddr((pud_t){ pgd }))
+
+/*
+ * allocating and freeing a pud is trivial: the 1-entry pud is
+ * inside the pgd, so has no extra memory associated with it.
+ */
+#define pud_alloc_one(mm, address)		NULL
+#define pud_free(mm, x)				do { } while (0)
+#define __pud_free_tlb(tlb, x, a)		do { } while (0)
+
+#undef  pud_addr_end
+#define pud_addr_end(addr, end)			(end)
+
+#endif /* __ASSEMBLY__ */
+#endif /* _PGTABLE_NOP4D_HACK_H */
diff --git a/include/asm-generic/pgtable-nopud.h b/include/asm-generic/pgtable-nopud.h
index 810431d8351b..5e49430a30a4 100644
--- a/include/asm-generic/pgtable-nopud.h
+++ b/include/asm-generic/pgtable-nopud.h
@@ -3,6 +3,10 @@
 
 #ifndef __ASSEMBLY__
 
+#ifdef __ARCH_USE_5LEVEL_HACK
+#include <asm-generic/pgtable-nop4d-hack.h>
+#else
+
 #define __PAGETABLE_PUD_FOLDED
 
 /*
@@ -58,4 +62,5 @@ static inline pud_t * pud_offset(pgd_t * pgd, unsigned long address)
 #define pud_addr_end(addr, end)			(end)
 
 #endif /* __ASSEMBLY__ */
+#endif /* !__ARCH_USE_5LEVEL_HACK */
 #endif /* _PGTABLE_NOPUD_H */
-- 
cgit v1.2.3


From 048456dcf2c56ad6f6248e2899dda92fb6a613f6 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 9 Mar 2017 17:24:06 +0300
Subject: asm-generic: introduce <asm-generic/pgtable-nop4d.h>

Like with pgtable-nopud.h for 4-level paging, this new header is base
for converting an architectures to properly folded p4d_t level.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-generic/pgtable-nop4d.h | 56 +++++++++++++++++++++++++++++++++++++
 include/asm-generic/pgtable-nopud.h | 43 ++++++++++++++--------------
 include/asm-generic/tlb.h           | 14 ++++++++--
 3 files changed, 89 insertions(+), 24 deletions(-)
 create mode 100644 include/asm-generic/pgtable-nop4d.h

(limited to 'include')

diff --git a/include/asm-generic/pgtable-nop4d.h b/include/asm-generic/pgtable-nop4d.h
new file mode 100644
index 000000000000..de364ecb8df6
--- /dev/null
+++ b/include/asm-generic/pgtable-nop4d.h
@@ -0,0 +1,56 @@
+#ifndef _PGTABLE_NOP4D_H
+#define _PGTABLE_NOP4D_H
+
+#ifndef __ASSEMBLY__
+
+#define __PAGETABLE_P4D_FOLDED
+
+typedef struct { pgd_t pgd; } p4d_t;
+
+#define P4D_SHIFT	PGDIR_SHIFT
+#define PTRS_PER_P4D	1
+#define P4D_SIZE	(1UL << P4D_SHIFT)
+#define P4D_MASK	(~(P4D_SIZE-1))
+
+/*
+ * The "pgd_xxx()" functions here are trivial for a folded two-level
+ * setup: the p4d is never bad, and a p4d always exists (as it's folded
+ * into the pgd entry)
+ */
+static inline int pgd_none(pgd_t pgd)		{ return 0; }
+static inline int pgd_bad(pgd_t pgd)		{ return 0; }
+static inline int pgd_present(pgd_t pgd)	{ return 1; }
+static inline void pgd_clear(pgd_t *pgd)	{ }
+#define p4d_ERROR(p4d)				(pgd_ERROR((p4d).pgd))
+
+#define pgd_populate(mm, pgd, p4d)		do { } while (0)
+/*
+ * (p4ds are folded into pgds so this doesn't get actually called,
+ * but the define is needed for a generic inline function.)
+ */
+#define set_pgd(pgdptr, pgdval)	set_p4d((p4d_t *)(pgdptr), (p4d_t) { pgdval })
+
+static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
+{
+	return (p4d_t *)pgd;
+}
+
+#define p4d_val(x)				(pgd_val((x).pgd))
+#define __p4d(x)				((p4d_t) { __pgd(x) })
+
+#define pgd_page(pgd)				(p4d_page((p4d_t){ pgd }))
+#define pgd_page_vaddr(pgd)			(p4d_page_vaddr((p4d_t){ pgd }))
+
+/*
+ * allocating and freeing a p4d is trivial: the 1-entry p4d is
+ * inside the pgd, so has no extra memory associated with it.
+ */
+#define p4d_alloc_one(mm, address)		NULL
+#define p4d_free(mm, x)				do { } while (0)
+#define __p4d_free_tlb(tlb, x, a)		do { } while (0)
+
+#undef  p4d_addr_end
+#define p4d_addr_end(addr, end)			(end)
+
+#endif /* __ASSEMBLY__ */
+#endif /* _PGTABLE_NOP4D_H */
diff --git a/include/asm-generic/pgtable-nopud.h b/include/asm-generic/pgtable-nopud.h
index 5e49430a30a4..c2b9b96d6268 100644
--- a/include/asm-generic/pgtable-nopud.h
+++ b/include/asm-generic/pgtable-nopud.h
@@ -6,53 +6,54 @@
 #ifdef __ARCH_USE_5LEVEL_HACK
 #include <asm-generic/pgtable-nop4d-hack.h>
 #else
+#include <asm-generic/pgtable-nop4d.h>
 
 #define __PAGETABLE_PUD_FOLDED
 
 /*
- * Having the pud type consist of a pgd gets the size right, and allows
- * us to conceptually access the pgd entry that this pud is folded into
+ * Having the pud type consist of a p4d gets the size right, and allows
+ * us to conceptually access the p4d entry that this pud is folded into
  * without casting.
  */
-typedef struct { pgd_t pgd; } pud_t;
+typedef struct { p4d_t p4d; } pud_t;
 
-#define PUD_SHIFT	PGDIR_SHIFT
+#define PUD_SHIFT	P4D_SHIFT
 #define PTRS_PER_PUD	1
 #define PUD_SIZE  	(1UL << PUD_SHIFT)
 #define PUD_MASK  	(~(PUD_SIZE-1))
 
 /*
- * The "pgd_xxx()" functions here are trivial for a folded two-level
+ * The "p4d_xxx()" functions here are trivial for a folded two-level
  * setup: the pud is never bad, and a pud always exists (as it's folded
- * into the pgd entry)
+ * into the p4d entry)
  */
-static inline int pgd_none(pgd_t pgd)		{ return 0; }
-static inline int pgd_bad(pgd_t pgd)		{ return 0; }
-static inline int pgd_present(pgd_t pgd)	{ return 1; }
-static inline void pgd_clear(pgd_t *pgd)	{ }
-#define pud_ERROR(pud)				(pgd_ERROR((pud).pgd))
+static inline int p4d_none(p4d_t p4d)		{ return 0; }
+static inline int p4d_bad(p4d_t p4d)		{ return 0; }
+static inline int p4d_present(p4d_t p4d)	{ return 1; }
+static inline void p4d_clear(p4d_t *p4d)	{ }
+#define pud_ERROR(pud)				(p4d_ERROR((pud).p4d))
 
-#define pgd_populate(mm, pgd, pud)		do { } while (0)
+#define p4d_populate(mm, p4d, pud)		do { } while (0)
 /*
- * (puds are folded into pgds so this doesn't get actually called,
+ * (puds are folded into p4ds so this doesn't get actually called,
  * but the define is needed for a generic inline function.)
  */
-#define set_pgd(pgdptr, pgdval)			set_pud((pud_t *)(pgdptr), (pud_t) { pgdval })
+#define set_p4d(p4dptr, p4dval)	set_pud((pud_t *)(p4dptr), (pud_t) { p4dval })
 
-static inline pud_t * pud_offset(pgd_t * pgd, unsigned long address)
+static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
 {
-	return (pud_t *)pgd;
+	return (pud_t *)p4d;
 }
 
-#define pud_val(x)				(pgd_val((x).pgd))
-#define __pud(x)				((pud_t) { __pgd(x) } )
+#define pud_val(x)				(p4d_val((x).p4d))
+#define __pud(x)				((pud_t) { __p4d(x) })
 
-#define pgd_page(pgd)				(pud_page((pud_t){ pgd }))
-#define pgd_page_vaddr(pgd)			(pud_page_vaddr((pud_t){ pgd }))
+#define p4d_page(p4d)				(pud_page((pud_t){ p4d }))
+#define p4d_page_vaddr(p4d)			(pud_page_vaddr((pud_t){ p4d }))
 
 /*
  * allocating and freeing a pud is trivial: the 1-entry pud is
- * inside the pgd, so has no extra memory associated with it.
+ * inside the p4d, so has no extra memory associated with it.
  */
 #define pud_alloc_one(mm, address)		NULL
 #define pud_free(mm, x)				do { } while (0)
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 4329bc6ef04b..8afa4335e5b2 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -270,6 +270,12 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
 		__pte_free_tlb(tlb, ptep, address);		\
 	} while (0)
 
+#define pmd_free_tlb(tlb, pmdp, address)			\
+	do {							\
+		__tlb_adjust_range(tlb, address, PAGE_SIZE);		\
+		__pmd_free_tlb(tlb, pmdp, address);		\
+	} while (0)
+
 #ifndef __ARCH_HAS_4LEVEL_HACK
 #define pud_free_tlb(tlb, pudp, address)			\
 	do {							\
@@ -278,11 +284,13 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
 	} while (0)
 #endif
 
-#define pmd_free_tlb(tlb, pmdp, address)			\
+#ifndef __ARCH_HAS_5LEVEL_HACK
+#define p4d_free_tlb(tlb, pudp, address)			\
 	do {							\
-		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
-		__pmd_free_tlb(tlb, pmdp, address);		\
+		__tlb_adjust_range(tlb, address, PAGE_SIZE);		\
+		__p4d_free_tlb(tlb, pudp, address);		\
 	} while (0)
+#endif
 
 #define tlb_migrate_finish(mm) do {} while (0)
 
-- 
cgit v1.2.3


From c2febafc67734a62196c1b9dfba926412d4077ba Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 9 Mar 2017 17:24:07 +0300
Subject: mm: convert generic code to 5-level paging

Convert all non-architecture-specific code to 5-level paging.

It's mostly mechanical adding handling one more page table level in
places where we deal with pud_t.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/misc/sgi-gru/grufault.c |   9 +-
 fs/userfaultfd.c                |   6 +-
 include/asm-generic/pgtable.h   |  48 +++++++++-
 include/linux/hugetlb.h         |   5 +-
 include/linux/kasan.h           |   1 +
 include/linux/mm.h              |  31 ++++--
 lib/ioremap.c                   |  39 +++++++-
 mm/gup.c                        |  46 +++++++--
 mm/huge_memory.c                |   7 +-
 mm/hugetlb.c                    |  29 +++---
 mm/kasan/kasan_init.c           |  44 ++++++++-
 mm/memory.c                     | 207 +++++++++++++++++++++++++++++++++-------
 mm/mlock.c                      |   1 +
 mm/mprotect.c                   |  26 ++++-
 mm/mremap.c                     |  13 ++-
 mm/page_vma_mapped.c            |   6 +-
 mm/pagewalk.c                   |  32 ++++++-
 mm/pgtable-generic.c            |   6 ++
 mm/rmap.c                       |   7 +-
 mm/sparse-vmemmap.c             |  22 ++++-
 mm/swapfile.c                   |  26 ++++-
 mm/userfaultfd.c                |  23 +++--
 mm/vmalloc.c                    |  81 ++++++++++++----
 23 files changed, 595 insertions(+), 120 deletions(-)

(limited to 'include')

diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c
index 6fb773dbcd0c..93be82fc338a 100644
--- a/drivers/misc/sgi-gru/grufault.c
+++ b/drivers/misc/sgi-gru/grufault.c
@@ -219,15 +219,20 @@ static int atomic_pte_lookup(struct vm_area_struct *vma, unsigned long vaddr,
 	int write, unsigned long *paddr, int *pageshift)
 {
 	pgd_t *pgdp;
-	pmd_t *pmdp;
+	p4d_t *p4dp;
 	pud_t *pudp;
+	pmd_t *pmdp;
 	pte_t pte;
 
 	pgdp = pgd_offset(vma->vm_mm, vaddr);
 	if (unlikely(pgd_none(*pgdp)))
 		goto err;
 
-	pudp = pud_offset(pgdp, vaddr);
+	p4dp = p4d_offset(pgdp, vaddr);
+	if (unlikely(p4d_none(*p4dp)))
+		goto err;
+
+	pudp = pud_offset(p4dp, vaddr);
 	if (unlikely(pud_none(*pudp)))
 		goto err;
 
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 973607df579d..02ce3944d0f5 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -267,6 +267,7 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
 {
 	struct mm_struct *mm = ctx->mm;
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd, _pmd;
 	pte_t *pte;
@@ -277,7 +278,10 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
 		goto out;
-	pud = pud_offset(pgd, address);
+	p4d = p4d_offset(pgd, address);
+	if (!p4d_present(*p4d))
+		goto out;
+	pud = pud_offset(p4d, address);
 	if (!pud_present(*pud))
 		goto out;
 	pmd = pmd_offset(pud, address);
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index f4ca23b158b3..1fad160f35de 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -10,9 +10,9 @@
 #include <linux/bug.h>
 #include <linux/errno.h>
 
-#if 4 - defined(__PAGETABLE_PUD_FOLDED) - defined(__PAGETABLE_PMD_FOLDED) != \
-	CONFIG_PGTABLE_LEVELS
-#error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{PUD,PMD}_FOLDED
+#if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \
+	defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS
+#error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{P4D,PUD,PMD}_FOLDED
 #endif
 
 /*
@@ -424,6 +424,13 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
 	(__boundary - 1 < (end) - 1)? __boundary: (end);		\
 })
 
+#ifndef p4d_addr_end
+#define p4d_addr_end(addr, end)						\
+({	unsigned long __boundary = ((addr) + P4D_SIZE) & P4D_MASK;	\
+	(__boundary - 1 < (end) - 1)? __boundary: (end);		\
+})
+#endif
+
 #ifndef pud_addr_end
 #define pud_addr_end(addr, end)						\
 ({	unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK;	\
@@ -444,6 +451,7 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
  * Do the tests inline, but report and clear the bad entry in mm/memory.c.
  */
 void pgd_clear_bad(pgd_t *);
+void p4d_clear_bad(p4d_t *);
 void pud_clear_bad(pud_t *);
 void pmd_clear_bad(pmd_t *);
 
@@ -458,6 +466,17 @@ static inline int pgd_none_or_clear_bad(pgd_t *pgd)
 	return 0;
 }
 
+static inline int p4d_none_or_clear_bad(p4d_t *p4d)
+{
+	if (p4d_none(*p4d))
+		return 1;
+	if (unlikely(p4d_bad(*p4d))) {
+		p4d_clear_bad(p4d);
+		return 1;
+	}
+	return 0;
+}
+
 static inline int pud_none_or_clear_bad(pud_t *pud)
 {
 	if (pud_none(*pud))
@@ -844,11 +863,30 @@ static inline int pmd_protnone(pmd_t pmd)
 #endif /* CONFIG_MMU */
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+
+#ifndef __PAGETABLE_P4D_FOLDED
+int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot);
+int p4d_clear_huge(p4d_t *p4d);
+#else
+static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
+{
+	return 0;
+}
+static inline int p4d_clear_huge(p4d_t *p4d)
+{
+	return 0;
+}
+#endif /* !__PAGETABLE_P4D_FOLDED */
+
 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
 int pud_clear_huge(pud_t *pud);
 int pmd_clear_huge(pmd_t *pmd);
 #else	/* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
+{
+	return 0;
+}
 static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
 {
 	return 0;
@@ -857,6 +895,10 @@ static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
 {
 	return 0;
 }
+static inline int p4d_clear_huge(p4d_t *p4d)
+{
+	return 0;
+}
 static inline int pud_clear_huge(pud_t *pud)
 {
 	return 0;
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 503099d8aada..b857fc8cc2ec 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -122,7 +122,7 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
 				pud_t *pud, int flags);
 int pmd_huge(pmd_t pmd);
-int pud_huge(pud_t pmd);
+int pud_huge(pud_t pud);
 unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 		unsigned long address, unsigned long end, pgprot_t newprot);
 
@@ -197,6 +197,9 @@ static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
 #ifndef pgd_huge
 #define pgd_huge(x)	0
 #endif
+#ifndef p4d_huge
+#define p4d_huge(x)	0
+#endif
 
 #ifndef pgd_write
 static inline int pgd_write(pgd_t pgd)
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index ceb3fe78a0d3..1c823bef4c15 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -18,6 +18,7 @@ extern unsigned char kasan_zero_page[PAGE_SIZE];
 extern pte_t kasan_zero_pte[PTRS_PER_PTE];
 extern pmd_t kasan_zero_pmd[PTRS_PER_PMD];
 extern pud_t kasan_zero_pud[PTRS_PER_PUD];
+extern p4d_t kasan_zero_p4d[PTRS_PER_P4D];
 
 void kasan_populate_zero_shadow(const void *shadow_start,
 				const void *shadow_end);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index be1fe264eb37..5f01c88f0800 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1560,14 +1560,24 @@ static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
 	return ptep;
 }
 
+#ifdef __PAGETABLE_P4D_FOLDED
+static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
+						unsigned long address)
+{
+	return 0;
+}
+#else
+int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
+#endif
+
 #ifdef __PAGETABLE_PUD_FOLDED
-static inline int __pud_alloc(struct mm_struct *mm, pgd_t *pgd,
+static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d,
 						unsigned long address)
 {
 	return 0;
 }
 #else
-int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
+int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address);
 #endif
 
 #if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU)
@@ -1621,10 +1631,18 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address);
 #if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK)
 
 #ifndef __ARCH_HAS_5LEVEL_HACK
-static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
+static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
+		unsigned long address)
+{
+	return (unlikely(pgd_none(*pgd)) && __p4d_alloc(mm, pgd, address)) ?
+		NULL : p4d_offset(pgd, address);
+}
+
+static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d,
+		unsigned long address)
 {
-	return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))?
-		NULL: pud_offset(pgd, address);
+	return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ?
+		NULL : pud_offset(p4d, address);
 }
 #endif /* !__ARCH_HAS_5LEVEL_HACK */
 
@@ -2388,7 +2406,8 @@ void sparse_mem_maps_populate_node(struct page **map_map,
 
 struct page *sparse_mem_map_populate(unsigned long pnum, int nid);
 pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
-pud_t *vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node);
+p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
+pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
 pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
 pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node);
 void *vmemmap_alloc_block(unsigned long size, int node);
diff --git a/lib/ioremap.c b/lib/ioremap.c
index a3e14ce92a56..4bb30206b942 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -14,6 +14,7 @@
 #include <asm/pgtable.h>
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+static int __read_mostly ioremap_p4d_capable;
 static int __read_mostly ioremap_pud_capable;
 static int __read_mostly ioremap_pmd_capable;
 static int __read_mostly ioremap_huge_disabled;
@@ -35,6 +36,11 @@ void __init ioremap_huge_init(void)
 	}
 }
 
+static inline int ioremap_p4d_enabled(void)
+{
+	return ioremap_p4d_capable;
+}
+
 static inline int ioremap_pud_enabled(void)
 {
 	return ioremap_pud_capable;
@@ -46,6 +52,7 @@ static inline int ioremap_pmd_enabled(void)
 }
 
 #else	/* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+static inline int ioremap_p4d_enabled(void) { return 0; }
 static inline int ioremap_pud_enabled(void) { return 0; }
 static inline int ioremap_pmd_enabled(void) { return 0; }
 #endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
@@ -94,14 +101,14 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
 	return 0;
 }
 
-static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr,
+static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
 		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
 {
 	pud_t *pud;
 	unsigned long next;
 
 	phys_addr -= addr;
-	pud = pud_alloc(&init_mm, pgd, addr);
+	pud = pud_alloc(&init_mm, p4d, addr);
 	if (!pud)
 		return -ENOMEM;
 	do {
@@ -120,6 +127,32 @@ static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr,
 	return 0;
 }
 
+static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,
+		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+	p4d_t *p4d;
+	unsigned long next;
+
+	phys_addr -= addr;
+	p4d = p4d_alloc(&init_mm, pgd, addr);
+	if (!p4d)
+		return -ENOMEM;
+	do {
+		next = p4d_addr_end(addr, end);
+
+		if (ioremap_p4d_enabled() &&
+		    ((next - addr) == P4D_SIZE) &&
+		    IS_ALIGNED(phys_addr + addr, P4D_SIZE)) {
+			if (p4d_set_huge(p4d, phys_addr + addr, prot))
+				continue;
+		}
+
+		if (ioremap_pud_range(p4d, addr, next, phys_addr + addr, prot))
+			return -ENOMEM;
+	} while (p4d++, addr = next, addr != end);
+	return 0;
+}
+
 int ioremap_page_range(unsigned long addr,
 		       unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
 {
@@ -135,7 +168,7 @@ int ioremap_page_range(unsigned long addr,
 	pgd = pgd_offset_k(addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, prot);
+		err = ioremap_p4d_range(pgd, addr, next, phys_addr+addr, prot);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
diff --git a/mm/gup.c b/mm/gup.c
index 9c047e951aa3..c74bad1bf6e8 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -226,6 +226,7 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
 			      unsigned int *page_mask)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 	spinlock_t *ptl;
@@ -243,8 +244,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
 	pgd = pgd_offset(mm, address);
 	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
 		return no_page_table(vma, flags);
-
-	pud = pud_offset(pgd, address);
+	p4d = p4d_offset(pgd, address);
+	if (p4d_none(*p4d))
+		return no_page_table(vma, flags);
+	BUILD_BUG_ON(p4d_huge(*p4d));
+	if (unlikely(p4d_bad(*p4d)))
+		return no_page_table(vma, flags);
+	pud = pud_offset(p4d, address);
 	if (pud_none(*pud))
 		return no_page_table(vma, flags);
 	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
@@ -325,6 +331,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
 		struct page **page)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
@@ -338,7 +345,9 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
 	else
 		pgd = pgd_offset_gate(mm, address);
 	BUG_ON(pgd_none(*pgd));
-	pud = pud_offset(pgd, address);
+	p4d = p4d_offset(pgd, address);
+	BUG_ON(p4d_none(*p4d));
+	pud = pud_offset(p4d, address);
 	BUG_ON(pud_none(*pud));
 	pmd = pmd_offset(pud, address);
 	if (pmd_none(*pmd))
@@ -1400,13 +1409,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 	return 1;
 }
 
-static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
 			 int write, struct page **pages, int *nr)
 {
 	unsigned long next;
 	pud_t *pudp;
 
-	pudp = pud_offset(&pgd, addr);
+	pudp = pud_offset(&p4d, addr);
 	do {
 		pud_t pud = READ_ONCE(*pudp);
 
@@ -1428,6 +1437,31 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
 	return 1;
 }
 
+static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
+			 int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	p4d_t *p4dp;
+
+	p4dp = p4d_offset(&pgd, addr);
+	do {
+		p4d_t p4d = READ_ONCE(*p4dp);
+
+		next = p4d_addr_end(addr, end);
+		if (p4d_none(p4d))
+			return 0;
+		BUILD_BUG_ON(p4d_huge(p4d));
+		if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
+			if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
+					 P4D_SHIFT, next, write, pages, nr))
+				return 0;
+		} else if (!gup_p4d_range(p4d, addr, next, write, pages, nr))
+			return 0;
+	} while (p4dp++, addr = next, addr != end);
+
+	return 1;
+}
+
 /*
  * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
  * the regular GUP. It will only return non-negative values.
@@ -1478,7 +1512,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
 					 PGDIR_SHIFT, next, write, pages, &nr))
 				break;
-		} else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+		} else if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
 			break;
 	} while (pgdp++, addr = next, addr != end);
 	local_irq_restore(flags);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d36b2af4d1bf..e4766de25709 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2048,6 +2048,7 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
 		bool freeze, struct page *page)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 
@@ -2055,7 +2056,11 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
 	if (!pgd_present(*pgd))
 		return;
 
-	pud = pud_offset(pgd, address);
+	p4d = p4d_offset(pgd, address);
+	if (!p4d_present(*p4d))
+		return;
+
+	pud = pud_offset(p4d, address);
 	if (!pud_present(*pud))
 		return;
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a7aa811b7d14..3d0aab9ee80d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4555,7 +4555,8 @@ out:
 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 {
 	pgd_t *pgd = pgd_offset(mm, *addr);
-	pud_t *pud = pud_offset(pgd, *addr);
+	p4d_t *p4d = p4d_offset(pgd, *addr);
+	pud_t *pud = pud_offset(p4d, *addr);
 
 	BUG_ON(page_count(virt_to_page(ptep)) == 0);
 	if (page_count(virt_to_page(ptep)) == 1)
@@ -4586,11 +4587,13 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
 			unsigned long addr, unsigned long sz)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pte_t *pte = NULL;
 
 	pgd = pgd_offset(mm, addr);
-	pud = pud_alloc(mm, pgd, addr);
+	p4d = p4d_offset(pgd, addr);
+	pud = pud_alloc(mm, p4d, addr);
 	if (pud) {
 		if (sz == PUD_SIZE) {
 			pte = (pte_t *)pud;
@@ -4610,18 +4613,22 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
-	pmd_t *pmd = NULL;
+	pmd_t *pmd;
 
 	pgd = pgd_offset(mm, addr);
-	if (pgd_present(*pgd)) {
-		pud = pud_offset(pgd, addr);
-		if (pud_present(*pud)) {
-			if (pud_huge(*pud))
-				return (pte_t *)pud;
-			pmd = pmd_offset(pud, addr);
-		}
-	}
+	if (!pgd_present(*pgd))
+		return NULL;
+	p4d = p4d_offset(pgd, addr);
+	if (!p4d_present(*p4d))
+		return NULL;
+	pud = pud_offset(p4d, addr);
+	if (!pud_present(*pud))
+		return NULL;
+	if (pud_huge(*pud))
+		return (pte_t *)pud;
+	pmd = pmd_offset(pud, addr);
 	return (pte_t *) pmd;
 }
 
diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c
index 31238dad85fb..b96a5f773d88 100644
--- a/mm/kasan/kasan_init.c
+++ b/mm/kasan/kasan_init.c
@@ -30,6 +30,9 @@
  */
 unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss;
 
+#if CONFIG_PGTABLE_LEVELS > 4
+p4d_t kasan_zero_p4d[PTRS_PER_P4D] __page_aligned_bss;
+#endif
 #if CONFIG_PGTABLE_LEVELS > 3
 pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss;
 #endif
@@ -82,10 +85,10 @@ static void __init zero_pmd_populate(pud_t *pud, unsigned long addr,
 	} while (pmd++, addr = next, addr != end);
 }
 
-static void __init zero_pud_populate(pgd_t *pgd, unsigned long addr,
+static void __init zero_pud_populate(p4d_t *p4d, unsigned long addr,
 				unsigned long end)
 {
-	pud_t *pud = pud_offset(pgd, addr);
+	pud_t *pud = pud_offset(p4d, addr);
 	unsigned long next;
 
 	do {
@@ -107,6 +110,23 @@ static void __init zero_pud_populate(pgd_t *pgd, unsigned long addr,
 	} while (pud++, addr = next, addr != end);
 }
 
+static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr,
+				unsigned long end)
+{
+	p4d_t *p4d = p4d_offset(pgd, addr);
+	unsigned long next;
+
+	do {
+		next = p4d_addr_end(addr, end);
+
+		if (p4d_none(*p4d)) {
+			p4d_populate(&init_mm, p4d,
+				early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+		}
+		zero_pud_populate(p4d, addr, next);
+	} while (p4d++, addr = next, addr != end);
+}
+
 /**
  * kasan_populate_zero_shadow - populate shadow memory region with
  *                               kasan_zero_page
@@ -125,6 +145,7 @@ void __init kasan_populate_zero_shadow(const void *shadow_start,
 		next = pgd_addr_end(addr, end);
 
 		if (IS_ALIGNED(addr, PGDIR_SIZE) && end - addr >= PGDIR_SIZE) {
+			p4d_t *p4d;
 			pud_t *pud;
 			pmd_t *pmd;
 
@@ -135,9 +156,22 @@ void __init kasan_populate_zero_shadow(const void *shadow_start,
 			 * 3,2 - level page tables where we don't have
 			 * puds,pmds, so pgd_populate(), pud_populate()
 			 * is noops.
+			 *
+			 * The ifndef is required to avoid build breakage.
+			 *
+			 * With 5level-fixup.h, pgd_populate() is not nop and
+			 * we reference kasan_zero_p4d. It's not defined
+			 * unless 5-level paging enabled.
+			 *
+			 * The ifndef can be dropped once all KASAN-enabled
+			 * architectures will switch to pgtable-nop4d.h.
 			 */
-			pgd_populate(&init_mm, pgd, lm_alias(kasan_zero_pud));
-			pud = pud_offset(pgd, addr);
+#ifndef __ARCH_HAS_5LEVEL_HACK
+			pgd_populate(&init_mm, pgd, lm_alias(kasan_zero_p4d));
+#endif
+			p4d = p4d_offset(pgd, addr);
+			p4d_populate(&init_mm, p4d, lm_alias(kasan_zero_pud));
+			pud = pud_offset(p4d, addr);
 			pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd));
 			pmd = pmd_offset(pud, addr);
 			pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte));
@@ -148,6 +182,6 @@ void __init kasan_populate_zero_shadow(const void *shadow_start,
 			pgd_populate(&init_mm, pgd,
 				early_alloc(PAGE_SIZE, NUMA_NO_NODE));
 		}
-		zero_pud_populate(pgd, addr, next);
+		zero_p4d_populate(pgd, addr, next);
 	} while (pgd++, addr = next, addr != end);
 }
diff --git a/mm/memory.c b/mm/memory.c
index a97a4cec2e1f..7f1c2163b3ce 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -445,7 +445,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 	mm_dec_nr_pmds(tlb->mm);
 }
 
-static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
 				unsigned long addr, unsigned long end,
 				unsigned long floor, unsigned long ceiling)
 {
@@ -454,7 +454,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 	unsigned long start;
 
 	start = addr;
-	pud = pud_offset(pgd, addr);
+	pud = pud_offset(p4d, addr);
 	do {
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
@@ -462,6 +462,39 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
 	} while (pud++, addr = next, addr != end);
 
+	start &= P4D_MASK;
+	if (start < floor)
+		return;
+	if (ceiling) {
+		ceiling &= P4D_MASK;
+		if (!ceiling)
+			return;
+	}
+	if (end - 1 > ceiling - 1)
+		return;
+
+	pud = pud_offset(p4d, start);
+	p4d_clear(p4d);
+	pud_free_tlb(tlb, pud, start);
+}
+
+static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
+				unsigned long addr, unsigned long end,
+				unsigned long floor, unsigned long ceiling)
+{
+	p4d_t *p4d;
+	unsigned long next;
+	unsigned long start;
+
+	start = addr;
+	p4d = p4d_offset(pgd, addr);
+	do {
+		next = p4d_addr_end(addr, end);
+		if (p4d_none_or_clear_bad(p4d))
+			continue;
+		free_pud_range(tlb, p4d, addr, next, floor, ceiling);
+	} while (p4d++, addr = next, addr != end);
+
 	start &= PGDIR_MASK;
 	if (start < floor)
 		return;
@@ -473,9 +506,9 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 	if (end - 1 > ceiling - 1)
 		return;
 
-	pud = pud_offset(pgd, start);
+	p4d = p4d_offset(pgd, start);
 	pgd_clear(pgd);
-	pud_free_tlb(tlb, pud, start);
+	p4d_free_tlb(tlb, p4d, start);
 }
 
 /*
@@ -539,7 +572,7 @@ void free_pgd_range(struct mmu_gather *tlb,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
+		free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
 	} while (pgd++, addr = next, addr != end);
 }
 
@@ -658,7 +691,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
 			  pte_t pte, struct page *page)
 {
 	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
-	pud_t *pud = pud_offset(pgd, addr);
+	p4d_t *p4d = p4d_offset(pgd, addr);
+	pud_t *pud = pud_offset(p4d, addr);
 	pmd_t *pmd = pmd_offset(pud, addr);
 	struct address_space *mapping;
 	pgoff_t index;
@@ -1023,16 +1057,16 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
 }
 
 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
+		p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma,
 		unsigned long addr, unsigned long end)
 {
 	pud_t *src_pud, *dst_pud;
 	unsigned long next;
 
-	dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
+	dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
 	if (!dst_pud)
 		return -ENOMEM;
-	src_pud = pud_offset(src_pgd, addr);
+	src_pud = pud_offset(src_p4d, addr);
 	do {
 		next = pud_addr_end(addr, end);
 		if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
@@ -1056,6 +1090,28 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
 	return 0;
 }
 
+static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
+		unsigned long addr, unsigned long end)
+{
+	p4d_t *src_p4d, *dst_p4d;
+	unsigned long next;
+
+	dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
+	if (!dst_p4d)
+		return -ENOMEM;
+	src_p4d = p4d_offset(src_pgd, addr);
+	do {
+		next = p4d_addr_end(addr, end);
+		if (p4d_none_or_clear_bad(src_p4d))
+			continue;
+		if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d,
+						vma, addr, next))
+			return -ENOMEM;
+	} while (dst_p4d++, src_p4d++, addr = next, addr != end);
+	return 0;
+}
+
 int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		struct vm_area_struct *vma)
 {
@@ -1111,7 +1167,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(src_pgd))
 			continue;
-		if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
+		if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd,
 					    vma, addr, next))) {
 			ret = -ENOMEM;
 			break;
@@ -1267,14 +1323,14 @@ next:
 }
 
 static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
-				struct vm_area_struct *vma, pgd_t *pgd,
+				struct vm_area_struct *vma, p4d_t *p4d,
 				unsigned long addr, unsigned long end,
 				struct zap_details *details)
 {
 	pud_t *pud;
 	unsigned long next;
 
-	pud = pud_offset(pgd, addr);
+	pud = pud_offset(p4d, addr);
 	do {
 		next = pud_addr_end(addr, end);
 		if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
@@ -1295,6 +1351,25 @@ next:
 	return addr;
 }
 
+static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
+				struct vm_area_struct *vma, pgd_t *pgd,
+				unsigned long addr, unsigned long end,
+				struct zap_details *details)
+{
+	p4d_t *p4d;
+	unsigned long next;
+
+	p4d = p4d_offset(pgd, addr);
+	do {
+		next = p4d_addr_end(addr, end);
+		if (p4d_none_or_clear_bad(p4d))
+			continue;
+		next = zap_pud_range(tlb, vma, p4d, addr, next, details);
+	} while (p4d++, addr = next, addr != end);
+
+	return addr;
+}
+
 void unmap_page_range(struct mmu_gather *tlb,
 			     struct vm_area_struct *vma,
 			     unsigned long addr, unsigned long end,
@@ -1310,7 +1385,7 @@ void unmap_page_range(struct mmu_gather *tlb,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		next = zap_pud_range(tlb, vma, pgd, addr, next, details);
+		next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
 	} while (pgd++, addr = next, addr != end);
 	tlb_end_vma(tlb, vma);
 }
@@ -1465,16 +1540,24 @@ EXPORT_SYMBOL_GPL(zap_vma_ptes);
 pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
 			spinlock_t **ptl)
 {
-	pgd_t *pgd = pgd_offset(mm, addr);
-	pud_t *pud = pud_alloc(mm, pgd, addr);
-	if (pud) {
-		pmd_t *pmd = pmd_alloc(mm, pud, addr);
-		if (pmd) {
-			VM_BUG_ON(pmd_trans_huge(*pmd));
-			return pte_alloc_map_lock(mm, pmd, addr, ptl);
-		}
-	}
-	return NULL;
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pgd = pgd_offset(mm, addr);
+	p4d = p4d_alloc(mm, pgd, addr);
+	if (!p4d)
+		return NULL;
+	pud = pud_alloc(mm, p4d, addr);
+	if (!pud)
+		return NULL;
+	pmd = pmd_alloc(mm, pud, addr);
+	if (!pmd)
+		return NULL;
+
+	VM_BUG_ON(pmd_trans_huge(*pmd));
+	return pte_alloc_map_lock(mm, pmd, addr, ptl);
 }
 
 /*
@@ -1740,7 +1823,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
 	return 0;
 }
 
-static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
+static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
 			unsigned long addr, unsigned long end,
 			unsigned long pfn, pgprot_t prot)
 {
@@ -1748,7 +1831,7 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
 	unsigned long next;
 
 	pfn -= addr >> PAGE_SHIFT;
-	pud = pud_alloc(mm, pgd, addr);
+	pud = pud_alloc(mm, p4d, addr);
 	if (!pud)
 		return -ENOMEM;
 	do {
@@ -1760,6 +1843,26 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
 	return 0;
 }
 
+static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
+			unsigned long addr, unsigned long end,
+			unsigned long pfn, pgprot_t prot)
+{
+	p4d_t *p4d;
+	unsigned long next;
+
+	pfn -= addr >> PAGE_SHIFT;
+	p4d = p4d_alloc(mm, pgd, addr);
+	if (!p4d)
+		return -ENOMEM;
+	do {
+		next = p4d_addr_end(addr, end);
+		if (remap_pud_range(mm, p4d, addr, next,
+				pfn + (addr >> PAGE_SHIFT), prot))
+			return -ENOMEM;
+	} while (p4d++, addr = next, addr != end);
+	return 0;
+}
+
 /**
  * remap_pfn_range - remap kernel memory to userspace
  * @vma: user vma to map to
@@ -1816,7 +1919,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	flush_cache_range(vma, addr, end);
 	do {
 		next = pgd_addr_end(addr, end);
-		err = remap_pud_range(mm, pgd, addr, next,
+		err = remap_p4d_range(mm, pgd, addr, next,
 				pfn + (addr >> PAGE_SHIFT), prot);
 		if (err)
 			break;
@@ -1932,7 +2035,7 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
 	return err;
 }
 
-static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
+static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
 				     unsigned long addr, unsigned long end,
 				     pte_fn_t fn, void *data)
 {
@@ -1940,7 +2043,7 @@ static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
 	unsigned long next;
 	int err;
 
-	pud = pud_alloc(mm, pgd, addr);
+	pud = pud_alloc(mm, p4d, addr);
 	if (!pud)
 		return -ENOMEM;
 	do {
@@ -1952,6 +2055,26 @@ static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
 	return err;
 }
 
+static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
+				     unsigned long addr, unsigned long end,
+				     pte_fn_t fn, void *data)
+{
+	p4d_t *p4d;
+	unsigned long next;
+	int err;
+
+	p4d = p4d_alloc(mm, pgd, addr);
+	if (!p4d)
+		return -ENOMEM;
+	do {
+		next = p4d_addr_end(addr, end);
+		err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
+		if (err)
+			break;
+	} while (p4d++, addr = next, addr != end);
+	return err;
+}
+
 /*
  * Scan a region of virtual memory, filling in page tables as necessary
  * and calling a provided function on each leaf page table.
@@ -1970,7 +2093,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 	pgd = pgd_offset(mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
+		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
@@ -3653,11 +3776,15 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 	};
 	struct mm_struct *mm = vma->vm_mm;
 	pgd_t *pgd;
+	p4d_t *p4d;
 	int ret;
 
 	pgd = pgd_offset(mm, address);
+	p4d = p4d_alloc(mm, pgd, address);
+	if (!p4d)
+		return VM_FAULT_OOM;
 
-	vmf.pud = pud_alloc(mm, pgd, address);
+	vmf.pud = pud_alloc(mm, p4d, address);
 	if (!vmf.pud)
 		return VM_FAULT_OOM;
 	if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
@@ -3784,7 +3911,7 @@ EXPORT_SYMBOL_GPL(handle_mm_fault);
  * Allocate page upper directory.
  * We've already handled the fast-path in-line.
  */
-int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
+int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
 {
 	pud_t *new = pud_alloc_one(mm, address);
 	if (!new)
@@ -3793,10 +3920,17 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 	smp_wmb(); /* See comment in __pte_alloc */
 
 	spin_lock(&mm->page_table_lock);
-	if (pgd_present(*pgd))		/* Another has populated it */
+#ifndef __ARCH_HAS_5LEVEL_HACK
+	if (p4d_present(*p4d))		/* Another has populated it */
+		pud_free(mm, new);
+	else
+		p4d_populate(mm, p4d, new);
+#else
+	if (pgd_present(*p4d))		/* Another has populated it */
 		pud_free(mm, new);
 	else
-		pgd_populate(mm, pgd, new);
+		pgd_populate(mm, p4d, new);
+#endif /* __ARCH_HAS_5LEVEL_HACK */
 	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
@@ -3839,6 +3973,7 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
 		pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *ptep;
@@ -3847,7 +3982,11 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
 	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
 		goto out;
 
-	pud = pud_offset(pgd, address);
+	p4d = p4d_offset(pgd, address);
+	if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
+		goto out;
+
+	pud = pud_offset(p4d, address);
 	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
 		goto out;
 
diff --git a/mm/mlock.c b/mm/mlock.c
index 1050511f8b2b..945edac46810 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -380,6 +380,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
 	pte = get_locked_pte(vma->vm_mm, start,	&ptl);
 	/* Make sure we do not cross the page table boundary */
 	end = pgd_addr_end(start, end);
+	end = p4d_addr_end(start, end);
 	end = pud_addr_end(start, end);
 	end = pmd_addr_end(start, end);
 
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 848e946b08e5..8edd0d576254 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -193,14 +193,14 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 }
 
 static inline unsigned long change_pud_range(struct vm_area_struct *vma,
-		pgd_t *pgd, unsigned long addr, unsigned long end,
+		p4d_t *p4d, unsigned long addr, unsigned long end,
 		pgprot_t newprot, int dirty_accountable, int prot_numa)
 {
 	pud_t *pud;
 	unsigned long next;
 	unsigned long pages = 0;
 
-	pud = pud_offset(pgd, addr);
+	pud = pud_offset(p4d, addr);
 	do {
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
@@ -212,6 +212,26 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma,
 	return pages;
 }
 
+static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
+		pgd_t *pgd, unsigned long addr, unsigned long end,
+		pgprot_t newprot, int dirty_accountable, int prot_numa)
+{
+	p4d_t *p4d;
+	unsigned long next;
+	unsigned long pages = 0;
+
+	p4d = p4d_offset(pgd, addr);
+	do {
+		next = p4d_addr_end(addr, end);
+		if (p4d_none_or_clear_bad(p4d))
+			continue;
+		pages += change_pud_range(vma, p4d, addr, next, newprot,
+				 dirty_accountable, prot_numa);
+	} while (p4d++, addr = next, addr != end);
+
+	return pages;
+}
+
 static unsigned long change_protection_range(struct vm_area_struct *vma,
 		unsigned long addr, unsigned long end, pgprot_t newprot,
 		int dirty_accountable, int prot_numa)
@@ -230,7 +250,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		pages += change_pud_range(vma, pgd, addr, next, newprot,
+		pages += change_p4d_range(vma, pgd, addr, next, newprot,
 				 dirty_accountable, prot_numa);
 	} while (pgd++, addr = next, addr != end);
 
diff --git a/mm/mremap.c b/mm/mremap.c
index 8233b0105c82..cd8a1b199ef9 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -32,6 +32,7 @@
 static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 
@@ -39,7 +40,11 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
 	if (pgd_none_or_clear_bad(pgd))
 		return NULL;
 
-	pud = pud_offset(pgd, addr);
+	p4d = p4d_offset(pgd, addr);
+	if (p4d_none_or_clear_bad(p4d))
+		return NULL;
+
+	pud = pud_offset(p4d, addr);
 	if (pud_none_or_clear_bad(pud))
 		return NULL;
 
@@ -54,11 +59,15 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
 			    unsigned long addr)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 
 	pgd = pgd_offset(mm, addr);
-	pud = pud_alloc(mm, pgd, addr);
+	p4d = p4d_alloc(mm, pgd, addr);
+	if (!p4d)
+		return NULL;
+	pud = pud_alloc(mm, p4d, addr);
 	if (!pud)
 		return NULL;
 
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index a23001a22c15..c4c9def8ffea 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -104,6 +104,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 	struct mm_struct *mm = pvmw->vma->vm_mm;
 	struct page *page = pvmw->page;
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 
 	/* The only possible pmd mapping has been handled on last iteration */
@@ -133,7 +134,10 @@ restart:
 	pgd = pgd_offset(mm, pvmw->address);
 	if (!pgd_present(*pgd))
 		return false;
-	pud = pud_offset(pgd, pvmw->address);
+	p4d = p4d_offset(pgd, pvmw->address);
+	if (!p4d_present(*p4d))
+		return false;
+	pud = pud_offset(p4d, pvmw->address);
 	if (!pud_present(*pud))
 		return false;
 	pvmw->pmd = pmd_offset(pud, pvmw->address);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 03761577ae86..60f7856e508f 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -69,14 +69,14 @@ again:
 	return err;
 }
 
-static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
 			  struct mm_walk *walk)
 {
 	pud_t *pud;
 	unsigned long next;
 	int err = 0;
 
-	pud = pud_offset(pgd, addr);
+	pud = pud_offset(p4d, addr);
 	do {
  again:
 		next = pud_addr_end(addr, end);
@@ -113,6 +113,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 	return err;
 }
 
+static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+			  struct mm_walk *walk)
+{
+	p4d_t *p4d;
+	unsigned long next;
+	int err = 0;
+
+	p4d = p4d_offset(pgd, addr);
+	do {
+		next = p4d_addr_end(addr, end);
+		if (p4d_none_or_clear_bad(p4d)) {
+			if (walk->pte_hole)
+				err = walk->pte_hole(addr, next, walk);
+			if (err)
+				break;
+			continue;
+		}
+		if (walk->pmd_entry || walk->pte_entry)
+			err = walk_pud_range(p4d, addr, next, walk);
+		if (err)
+			break;
+	} while (p4d++, addr = next, addr != end);
+
+	return err;
+}
+
 static int walk_pgd_range(unsigned long addr, unsigned long end,
 			  struct mm_walk *walk)
 {
@@ -131,7 +157,7 @@ static int walk_pgd_range(unsigned long addr, unsigned long end,
 			continue;
 		}
 		if (walk->pmd_entry || walk->pte_entry)
-			err = walk_pud_range(pgd, addr, next, walk);
+			err = walk_p4d_range(pgd, addr, next, walk);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 4ed5908c65b0..c99d9512a45b 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -22,6 +22,12 @@ void pgd_clear_bad(pgd_t *pgd)
 	pgd_clear(pgd);
 }
 
+void p4d_clear_bad(p4d_t *p4d)
+{
+	p4d_ERROR(*p4d);
+	p4d_clear(p4d);
+}
+
 void pud_clear_bad(pud_t *pud)
 {
 	pud_ERROR(*pud);
diff --git a/mm/rmap.c b/mm/rmap.c
index 2da487d6cea8..2984403a2424 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -684,6 +684,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd = NULL;
 	pmd_t pmde;
@@ -692,7 +693,11 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
 	if (!pgd_present(*pgd))
 		goto out;
 
-	pud = pud_offset(pgd, address);
+	p4d = p4d_offset(pgd, address);
+	if (!p4d_present(*p4d))
+		goto out;
+
+	pud = pud_offset(p4d, address);
 	if (!pud_present(*pud))
 		goto out;
 
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 574c67b663fe..a56c3989f773 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -196,9 +196,9 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
 	return pmd;
 }
 
-pud_t * __meminit vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node)
+pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
 {
-	pud_t *pud = pud_offset(pgd, addr);
+	pud_t *pud = pud_offset(p4d, addr);
 	if (pud_none(*pud)) {
 		void *p = vmemmap_alloc_block(PAGE_SIZE, node);
 		if (!p)
@@ -208,6 +208,18 @@ pud_t * __meminit vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node)
 	return pud;
 }
 
+p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
+{
+	p4d_t *p4d = p4d_offset(pgd, addr);
+	if (p4d_none(*p4d)) {
+		void *p = vmemmap_alloc_block(PAGE_SIZE, node);
+		if (!p)
+			return NULL;
+		p4d_populate(&init_mm, p4d, p);
+	}
+	return p4d;
+}
+
 pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
 {
 	pgd_t *pgd = pgd_offset_k(addr);
@@ -225,6 +237,7 @@ int __meminit vmemmap_populate_basepages(unsigned long start,
 {
 	unsigned long addr = start;
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
@@ -233,7 +246,10 @@ int __meminit vmemmap_populate_basepages(unsigned long start,
 		pgd = vmemmap_pgd_populate(addr, node);
 		if (!pgd)
 			return -ENOMEM;
-		pud = vmemmap_pud_populate(pgd, addr, node);
+		p4d = vmemmap_p4d_populate(pgd, addr, node);
+		if (!p4d)
+			return -ENOMEM;
+		pud = vmemmap_pud_populate(p4d, addr, node);
 		if (!pud)
 			return -ENOMEM;
 		pmd = vmemmap_pmd_populate(pud, addr, node);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 521ef9b6064f..178130880b90 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1517,7 +1517,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 	return 0;
 }
 
-static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
 				unsigned long addr, unsigned long end,
 				swp_entry_t entry, struct page *page)
 {
@@ -1525,7 +1525,7 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 	unsigned long next;
 	int ret;
 
-	pud = pud_offset(pgd, addr);
+	pud = pud_offset(p4d, addr);
 	do {
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
@@ -1537,6 +1537,26 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 	return 0;
 }
 
+static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
+				unsigned long addr, unsigned long end,
+				swp_entry_t entry, struct page *page)
+{
+	p4d_t *p4d;
+	unsigned long next;
+	int ret;
+
+	p4d = p4d_offset(pgd, addr);
+	do {
+		next = p4d_addr_end(addr, end);
+		if (p4d_none_or_clear_bad(p4d))
+			continue;
+		ret = unuse_pud_range(vma, p4d, addr, next, entry, page);
+		if (ret)
+			return ret;
+	} while (p4d++, addr = next, addr != end);
+	return 0;
+}
+
 static int unuse_vma(struct vm_area_struct *vma,
 				swp_entry_t entry, struct page *page)
 {
@@ -1560,7 +1580,7 @@ static int unuse_vma(struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
+		ret = unuse_p4d_range(vma, pgd, addr, next, entry, page);
 		if (ret)
 			return ret;
 	} while (pgd++, addr = next, addr != end);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 479e631d43c2..8bcb501bce60 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -128,19 +128,22 @@ out_unlock:
 static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
 {
 	pgd_t *pgd;
+	p4d_t *p4d;
 	pud_t *pud;
-	pmd_t *pmd = NULL;
 
 	pgd = pgd_offset(mm, address);
-	pud = pud_alloc(mm, pgd, address);
-	if (pud)
-		/*
-		 * Note that we didn't run this because the pmd was
-		 * missing, the *pmd may be already established and in
-		 * turn it may also be a trans_huge_pmd.
-		 */
-		pmd = pmd_alloc(mm, pud, address);
-	return pmd;
+	p4d = p4d_alloc(mm, pgd, address);
+	if (!p4d)
+		return NULL;
+	pud = pud_alloc(mm, p4d, address);
+	if (!pud)
+		return NULL;
+	/*
+	 * Note that we didn't run this because the pmd was
+	 * missing, the *pmd may be already established and in
+	 * turn it may also be a trans_huge_pmd.
+	 */
+	return pmd_alloc(mm, pud, address);
 }
 
 #ifdef CONFIG_HUGETLB_PAGE
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b4024d688f38..0dd80222b20b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -86,12 +86,12 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
 	} while (pmd++, addr = next, addr != end);
 }
 
-static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
+static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end)
 {
 	pud_t *pud;
 	unsigned long next;
 
-	pud = pud_offset(pgd, addr);
+	pud = pud_offset(p4d, addr);
 	do {
 		next = pud_addr_end(addr, end);
 		if (pud_clear_huge(pud))
@@ -102,6 +102,22 @@ static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
 	} while (pud++, addr = next, addr != end);
 }
 
+static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end)
+{
+	p4d_t *p4d;
+	unsigned long next;
+
+	p4d = p4d_offset(pgd, addr);
+	do {
+		next = p4d_addr_end(addr, end);
+		if (p4d_clear_huge(p4d))
+			continue;
+		if (p4d_none_or_clear_bad(p4d))
+			continue;
+		vunmap_pud_range(p4d, addr, next);
+	} while (p4d++, addr = next, addr != end);
+}
+
 static void vunmap_page_range(unsigned long addr, unsigned long end)
 {
 	pgd_t *pgd;
@@ -113,7 +129,7 @@ static void vunmap_page_range(unsigned long addr, unsigned long end)
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		vunmap_pud_range(pgd, addr, next);
+		vunmap_p4d_range(pgd, addr, next);
 	} while (pgd++, addr = next, addr != end);
 }
 
@@ -160,13 +176,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr,
 	return 0;
 }
 
-static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
+static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
 		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
 {
 	pud_t *pud;
 	unsigned long next;
 
-	pud = pud_alloc(&init_mm, pgd, addr);
+	pud = pud_alloc(&init_mm, p4d, addr);
 	if (!pud)
 		return -ENOMEM;
 	do {
@@ -177,6 +193,23 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
 	return 0;
 }
 
+static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
+		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
+{
+	p4d_t *p4d;
+	unsigned long next;
+
+	p4d = p4d_alloc(&init_mm, pgd, addr);
+	if (!p4d)
+		return -ENOMEM;
+	do {
+		next = p4d_addr_end(addr, end);
+		if (vmap_pud_range(p4d, addr, next, prot, pages, nr))
+			return -ENOMEM;
+	} while (p4d++, addr = next, addr != end);
+	return 0;
+}
+
 /*
  * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
  * will have pfns corresponding to the "pages" array.
@@ -196,7 +229,7 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end,
 	pgd = pgd_offset_k(addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
+		err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr);
 		if (err)
 			return err;
 	} while (pgd++, addr = next, addr != end);
@@ -237,6 +270,10 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
 	unsigned long addr = (unsigned long) vmalloc_addr;
 	struct page *page = NULL;
 	pgd_t *pgd = pgd_offset_k(addr);
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep, pte;
 
 	/*
 	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
@@ -244,21 +281,23 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
 	 */
 	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
 
-	if (!pgd_none(*pgd)) {
-		pud_t *pud = pud_offset(pgd, addr);
-		if (!pud_none(*pud)) {
-			pmd_t *pmd = pmd_offset(pud, addr);
-			if (!pmd_none(*pmd)) {
-				pte_t *ptep, pte;
-
-				ptep = pte_offset_map(pmd, addr);
-				pte = *ptep;
-				if (pte_present(pte))
-					page = pte_page(pte);
-				pte_unmap(ptep);
-			}
-		}
-	}
+	if (pgd_none(*pgd))
+		return NULL;
+	p4d = p4d_offset(pgd, addr);
+	if (p4d_none(*p4d))
+		return NULL;
+	pud = pud_offset(p4d, addr);
+	if (pud_none(*pud))
+		return NULL;
+	pmd = pmd_offset(pud, addr);
+	if (pmd_none(*pmd))
+		return NULL;
+
+	ptep = pte_offset_map(pmd, addr);
+	pte = *ptep;
+	if (pte_present(pte))
+		page = pte_page(pte);
+	pte_unmap(ptep);
 	return page;
 }
 EXPORT_SYMBOL(vmalloc_to_page);
-- 
cgit v1.2.3


From 745cb7f8a5de0805cade3de3991b7a95317c7c73 Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Tue, 7 Mar 2017 23:50:50 +0300
Subject: uapi: fix linux/packet_diag.h userspace compilation error

Replace MAX_ADDR_LEN with its numeric value to fix the following
linux/packet_diag.h userspace compilation error:

/usr/include/linux/packet_diag.h:67:17: error: 'MAX_ADDR_LEN' undeclared here (not in a function)
  __u8 pdmc_addr[MAX_ADDR_LEN];

This is not the first case in the UAPI where the numeric value
of MAX_ADDR_LEN is used instead of symbolic one, uapi/linux/if_link.h
already does the same:

$ grep MAX_ADDR_LEN include/uapi/linux/if_link.h
	__u8 mac[32]; /* MAX_ADDR_LEN */

There are no UAPI headers besides these two that use MAX_ADDR_LEN.

Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Acked-by: Pavel Emelyanov <xemul@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/packet_diag.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/packet_diag.h b/include/uapi/linux/packet_diag.h
index d08c63f3dd6f..0c5d5dd61b6a 100644
--- a/include/uapi/linux/packet_diag.h
+++ b/include/uapi/linux/packet_diag.h
@@ -64,7 +64,7 @@ struct packet_diag_mclist {
 	__u32	pdmc_count;
 	__u16	pdmc_type;
 	__u16	pdmc_alen;
-	__u8	pdmc_addr[MAX_ADDR_LEN];
+	__u8	pdmc_addr[32]; /* MAX_ADDR_LEN */
 };
 
 struct packet_diag_ring {
-- 
cgit v1.2.3


From 4fe8435909fddc97b81472026aa954e06dd192a5 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Tue, 7 Mar 2017 20:00:13 -0800
Subject: bpf: convert htab map to hlist_nulls

when all map elements are pre-allocated one cpu can delete and reuse htab_elem
while another cpu is still walking the hlist. In such case the lookup may
miss the element. Convert hlist to hlist_nulls to avoid such scenario.
When bucket lock is taken there is no need to take such precautions,
so only convert map_lookup and map_get_next to nulls.
The race window is extremely small and only reproducible with explicit
udelay() inside lookup_nulls_elem_raw()

Similar to hlist add hlist_nulls_for_each_entry_safe() and
hlist_nulls_entry_safe() helpers.

Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements")
Reported-by: Jonathan Perry <jonperry@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/list_nulls.h    |  5 +++
 include/linux/rculist_nulls.h | 14 +++++++
 kernel/bpf/hashtab.c          | 94 +++++++++++++++++++++++++++----------------
 3 files changed, 79 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
index b01fe1009084..87ff4f58a2f0 100644
--- a/include/linux/list_nulls.h
+++ b/include/linux/list_nulls.h
@@ -29,6 +29,11 @@ struct hlist_nulls_node {
 	((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls))
 
 #define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member)
+
+#define hlist_nulls_entry_safe(ptr, type, member) \
+	({ typeof(ptr) ____ptr = (ptr); \
+	   !is_a_nulls(____ptr) ? hlist_nulls_entry(____ptr, type, member) : NULL; \
+	})
 /**
  * ptr_is_a_nulls - Test if a ptr is a nulls
  * @ptr: ptr to be tested
diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index 4ae95f7e8597..a23a33153180 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -156,5 +156,19 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
 		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \
 		pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)))
 
+/**
+ * hlist_nulls_for_each_entry_safe -
+ *   iterate over list of given type safe against removal of list entry
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_nulls_node to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_nulls_node within the struct.
+ */
+#define hlist_nulls_for_each_entry_safe(tpos, pos, head, member)		\
+	for (({barrier();}),							\
+	     pos = rcu_dereference_raw(hlist_nulls_first_rcu(head));		\
+		(!is_a_nulls(pos)) &&						\
+		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member);	\
+		   pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)); 1; });)
 #endif
 #endif
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 63c86a7be2a1..afe5bab376c9 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -13,11 +13,12 @@
 #include <linux/bpf.h>
 #include <linux/jhash.h>
 #include <linux/filter.h>
+#include <linux/rculist_nulls.h>
 #include "percpu_freelist.h"
 #include "bpf_lru_list.h"
 
 struct bucket {
-	struct hlist_head head;
+	struct hlist_nulls_head head;
 	raw_spinlock_t lock;
 };
 
@@ -44,7 +45,7 @@ enum extra_elem_state {
 /* each htab element is struct htab_elem + key + value */
 struct htab_elem {
 	union {
-		struct hlist_node hash_node;
+		struct hlist_nulls_node hash_node;
 		struct {
 			void *padding;
 			union {
@@ -337,7 +338,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		goto free_htab;
 
 	for (i = 0; i < htab->n_buckets; i++) {
-		INIT_HLIST_HEAD(&htab->buckets[i].head);
+		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
 		raw_spin_lock_init(&htab->buckets[i].lock);
 	}
 
@@ -377,28 +378,52 @@ static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
 	return &htab->buckets[hash & (htab->n_buckets - 1)];
 }
 
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
 {
 	return &__select_bucket(htab, hash)->head;
 }
 
-static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
+/* this lookup function can only be called with bucket lock taken */
+static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
 					 void *key, u32 key_size)
 {
+	struct hlist_nulls_node *n;
 	struct htab_elem *l;
 
-	hlist_for_each_entry_rcu(l, head, hash_node)
+	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 		if (l->hash == hash && !memcmp(&l->key, key, key_size))
 			return l;
 
 	return NULL;
 }
 
+/* can be called without bucket lock. it will repeat the loop in
+ * the unlikely event when elements moved from one bucket into another
+ * while link list is being walked
+ */
+static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
+					       u32 hash, void *key,
+					       u32 key_size, u32 n_buckets)
+{
+	struct hlist_nulls_node *n;
+	struct htab_elem *l;
+
+again:
+	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
+		if (l->hash == hash && !memcmp(&l->key, key, key_size))
+			return l;
+
+	if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
+		goto again;
+
+	return NULL;
+}
+
 /* Called from syscall or from eBPF program */
 static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	struct htab_elem *l;
 	u32 hash, key_size;
 
@@ -411,7 +436,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
 
 	head = select_bucket(htab, hash);
 
-	l = lookup_elem_raw(head, hash, key, key_size);
+	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
 
 	return l;
 }
@@ -444,8 +469,9 @@ static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 {
 	struct bpf_htab *htab = (struct bpf_htab *)arg;
-	struct htab_elem *l, *tgt_l;
-	struct hlist_head *head;
+	struct htab_elem *l = NULL, *tgt_l;
+	struct hlist_nulls_head *head;
+	struct hlist_nulls_node *n;
 	unsigned long flags;
 	struct bucket *b;
 
@@ -455,9 +481,9 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 
 	raw_spin_lock_irqsave(&b->lock, flags);
 
-	hlist_for_each_entry_rcu(l, head, hash_node)
+	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 		if (l == tgt_l) {
-			hlist_del_rcu(&l->hash_node);
+			hlist_nulls_del_rcu(&l->hash_node);
 			break;
 		}
 
@@ -470,7 +496,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	struct htab_elem *l, *next_l;
 	u32 hash, key_size;
 	int i;
@@ -484,7 +510,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	head = select_bucket(htab, hash);
 
 	/* lookup the key */
-	l = lookup_elem_raw(head, hash, key, key_size);
+	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
 
 	if (!l) {
 		i = 0;
@@ -492,7 +518,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	}
 
 	/* key was found, get next key in the same bucket */
-	next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+	next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
 				  struct htab_elem, hash_node);
 
 	if (next_l) {
@@ -511,7 +537,7 @@ find_first_elem:
 		head = select_bucket(htab, i);
 
 		/* pick first element in the bucket */
-		next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+		next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
 					  struct htab_elem, hash_node);
 		if (next_l) {
 			/* if it's not empty, just return it */
@@ -676,7 +702,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new = NULL, *l_old;
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	unsigned long flags;
 	struct bucket *b;
 	u32 key_size, hash;
@@ -715,9 +741,9 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	/* add new element to the head of the list, so that
 	 * concurrent search will find it before old elem
 	 */
-	hlist_add_head_rcu(&l_new->hash_node, head);
+	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
 	if (l_old) {
-		hlist_del_rcu(&l_old->hash_node);
+		hlist_nulls_del_rcu(&l_old->hash_node);
 		free_htab_elem(htab, l_old);
 	}
 	ret = 0;
@@ -731,7 +757,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new, *l_old = NULL;
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	unsigned long flags;
 	struct bucket *b;
 	u32 key_size, hash;
@@ -772,10 +798,10 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 	/* add new element to the head of the list, so that
 	 * concurrent search will find it before old elem
 	 */
-	hlist_add_head_rcu(&l_new->hash_node, head);
+	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
 	if (l_old) {
 		bpf_lru_node_set_ref(&l_new->lru_node);
-		hlist_del_rcu(&l_old->hash_node);
+		hlist_nulls_del_rcu(&l_old->hash_node);
 	}
 	ret = 0;
 
@@ -796,7 +822,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new = NULL, *l_old;
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	unsigned long flags;
 	struct bucket *b;
 	u32 key_size, hash;
@@ -835,7 +861,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 			ret = PTR_ERR(l_new);
 			goto err;
 		}
-		hlist_add_head_rcu(&l_new->hash_node, head);
+		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
 	}
 	ret = 0;
 err:
@@ -849,7 +875,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new = NULL, *l_old;
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	unsigned long flags;
 	struct bucket *b;
 	u32 key_size, hash;
@@ -897,7 +923,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 	} else {
 		pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),
 				value, onallcpus);
-		hlist_add_head_rcu(&l_new->hash_node, head);
+		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
 		l_new = NULL;
 	}
 	ret = 0;
@@ -925,7 +951,7 @@ static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 static int htab_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	struct bucket *b;
 	struct htab_elem *l;
 	unsigned long flags;
@@ -945,7 +971,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 	l = lookup_elem_raw(head, hash, key, key_size);
 
 	if (l) {
-		hlist_del_rcu(&l->hash_node);
+		hlist_nulls_del_rcu(&l->hash_node);
 		free_htab_elem(htab, l);
 		ret = 0;
 	}
@@ -957,7 +983,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct hlist_head *head;
+	struct hlist_nulls_head *head;
 	struct bucket *b;
 	struct htab_elem *l;
 	unsigned long flags;
@@ -977,7 +1003,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 	l = lookup_elem_raw(head, hash, key, key_size);
 
 	if (l) {
-		hlist_del_rcu(&l->hash_node);
+		hlist_nulls_del_rcu(&l->hash_node);
 		ret = 0;
 	}
 
@@ -992,12 +1018,12 @@ static void delete_all_elements(struct bpf_htab *htab)
 	int i;
 
 	for (i = 0; i < htab->n_buckets; i++) {
-		struct hlist_head *head = select_bucket(htab, i);
-		struct hlist_node *n;
+		struct hlist_nulls_head *head = select_bucket(htab, i);
+		struct hlist_nulls_node *n;
 		struct htab_elem *l;
 
-		hlist_for_each_entry_safe(l, n, head, hash_node) {
-			hlist_del_rcu(&l->hash_node);
+		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
+			hlist_nulls_del_rcu(&l->hash_node);
 			if (l->state != HTAB_EXTRA_ELEM_USED)
 				htab_elem_free(htab, l);
 		}
-- 
cgit v1.2.3


From 8a1115ff6b6d90cf1066ec3a0c4e51276553eebe Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Thu, 9 Mar 2017 16:16:31 -0800
Subject: scripts/spelling.txt: add "disble(d)" pattern and fix typo instances

Fix typos and add the following to the scripts/spelling.txt:

  disble||disable
  disbled||disabled

I kept the TSL2563_INT_DISBLED in /drivers/iio/light/tsl2563.c
untouched.  The macro is not referenced at all, but this commit is
touching only comment blocks just in case.

Link: http://lkml.kernel.org/r/1481573103-11329-20-git-send-email-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/dev-tools/kcov.rst        | 2 +-
 arch/cris/arch-v32/drivers/cryptocop.c  | 2 +-
 arch/x86/kernel/ftrace.c                | 2 +-
 drivers/crypto/ux500/cryp/cryp.c        | 2 +-
 drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c  | 2 +-
 drivers/hv/channel.c                    | 2 +-
 drivers/isdn/hisax/st5481_b.c           | 2 +-
 drivers/mtd/spi-nor/spi-nor.c           | 2 +-
 drivers/net/ethernet/qlogic/qlge/qlge.h | 2 +-
 drivers/scsi/aic7xxx/aic79xx_core.c     | 2 +-
 drivers/usb/gadget/legacy/inode.c       | 3 +--
 drivers/usb/host/xhci.c                 | 4 ++--
 include/linux/regulator/machine.h       | 2 +-
 kernel/cgroup/cgroup.c                  | 2 +-
 kernel/events/core.c                    | 2 +-
 scripts/spelling.txt                    | 2 ++
 sound/soc/amd/acp-pcm-dma.c             | 2 +-
 17 files changed, 19 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/Documentation/dev-tools/kcov.rst b/Documentation/dev-tools/kcov.rst
index 2c41b713841f..44886c91e112 100644
--- a/Documentation/dev-tools/kcov.rst
+++ b/Documentation/dev-tools/kcov.rst
@@ -10,7 +10,7 @@ Note that kcov does not aim to collect as much coverage as possible. It aims
 to collect more or less stable coverage that is function of syscall inputs.
 To achieve this goal it does not collect coverage in soft/hard interrupts
 and instrumentation of some inherently non-deterministic parts of kernel is
-disbled (e.g. scheduler, locking).
+disabled (e.g. scheduler, locking).
 
 Usage
 -----
diff --git a/arch/cris/arch-v32/drivers/cryptocop.c b/arch/cris/arch-v32/drivers/cryptocop.c
index ae6903d7fdbe..14970f11bbf2 100644
--- a/arch/cris/arch-v32/drivers/cryptocop.c
+++ b/arch/cris/arch-v32/drivers/cryptocop.c
@@ -2086,7 +2086,7 @@ static void cryptocop_job_queue_close(void)
 		dma_in_cfg.en = regk_dma_no;
 		REG_WR(dma, IN_DMA_INST, rw_cfg, dma_in_cfg);
 
-		/* Disble the cryptocop. */
+		/* Disable the cryptocop. */
 		rw_cfg = REG_RD(strcop, regi_strcop, rw_cfg);
 		rw_cfg.en = 0;
 		REG_WR(strcop, regi_strcop, rw_cfg, rw_cfg);
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 8639bb2ae058..8f3d9cf26ff9 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -535,7 +535,7 @@ static void run_sync(void)
 {
 	int enable_irqs = irqs_disabled();
 
-	/* We may be called with interrupts disbled (on bootup). */
+	/* We may be called with interrupts disabled (on bootup). */
 	if (enable_irqs)
 		local_irq_enable();
 	on_each_cpu(do_sync_core, NULL, 1);
diff --git a/drivers/crypto/ux500/cryp/cryp.c b/drivers/crypto/ux500/cryp/cryp.c
index 43a0c8a26ab0..00a16ab601cb 100644
--- a/drivers/crypto/ux500/cryp/cryp.c
+++ b/drivers/crypto/ux500/cryp/cryp.c
@@ -82,7 +82,7 @@ void cryp_activity(struct cryp_device_data *device_data,
 void cryp_flush_inoutfifo(struct cryp_device_data *device_data)
 {
 	/*
-	 * We always need to disble the hardware before trying to flush the
+	 * We always need to disable the hardware before trying to flush the
 	 * FIFO. This is something that isn't written in the design
 	 * specification, but we have been informed by the hardware designers
 	 * that this must be done.
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
index 31375bdde6f1..011800f621c6 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
@@ -788,7 +788,7 @@ static int sdma_v3_0_start(struct amdgpu_device *adev)
 		}
 	}
 
-	/* disble sdma engine before programing it */
+	/* disable sdma engine before programing it */
 	sdma_v3_0_ctx_switch_enable(adev, false);
 	sdma_v3_0_enable(adev, false);
 
diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index 81a80c82f1bd..bd0d1988feb2 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -543,7 +543,7 @@ static int vmbus_close_internal(struct vmbus_channel *channel)
 	/*
 	 * In case a device driver's probe() fails (e.g.,
 	 * util_probe() -> vmbus_open() returns -ENOMEM) and the device is
-	 * rescinded later (e.g., we dynamically disble an Integrated Service
+	 * rescinded later (e.g., we dynamically disable an Integrated Service
 	 * in Hyper-V Manager), the driver's remove() invokes vmbus_close():
 	 * here we should skip most of the below cleanup work.
 	 */
diff --git a/drivers/isdn/hisax/st5481_b.c b/drivers/isdn/hisax/st5481_b.c
index 409849165838..f64a36007800 100644
--- a/drivers/isdn/hisax/st5481_b.c
+++ b/drivers/isdn/hisax/st5481_b.c
@@ -239,7 +239,7 @@ static void st5481B_mode(struct st5481_bcs *bcs, int mode)
 			}
 		}
 	} else {
-		// Disble B channel interrupts
+		// Disable B channel interrupts
 		st5481_usb_device_ctrl_msg(adapter, FFMSK_B1+(bcs->channel * 2), 0, NULL, NULL);
 
 		// Disable B channel FIFOs
diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c
index 1ae872bfc3ba..747645c74134 100644
--- a/drivers/mtd/spi-nor/spi-nor.c
+++ b/drivers/mtd/spi-nor/spi-nor.c
@@ -186,7 +186,7 @@ static inline int write_enable(struct spi_nor *nor)
 }
 
 /*
- * Send write disble instruction to the chip.
+ * Send write disable instruction to the chip.
  */
 static inline int write_disable(struct spi_nor *nor)
 {
diff --git a/drivers/net/ethernet/qlogic/qlge/qlge.h b/drivers/net/ethernet/qlogic/qlge/qlge.h
index 6d31f92ef2b6..90b3b46f85cc 100644
--- a/drivers/net/ethernet/qlogic/qlge/qlge.h
+++ b/drivers/net/ethernet/qlogic/qlge/qlge.h
@@ -1163,7 +1163,7 @@ struct ib_mac_iocb_rsp {
 	u8 opcode;		/* 0x20 */
 	u8 flags1;
 #define IB_MAC_IOCB_RSP_OI	0x01	/* Overide intr delay */
-#define IB_MAC_IOCB_RSP_I	0x02	/* Disble Intr Generation */
+#define IB_MAC_IOCB_RSP_I	0x02	/* Disable Intr Generation */
 #define IB_MAC_CSUM_ERR_MASK 0x1c	/* A mask to use for csum errs */
 #define IB_MAC_IOCB_RSP_TE	0x04	/* Checksum error */
 #define IB_MAC_IOCB_RSP_NU	0x08	/* No checksum rcvd */
diff --git a/drivers/scsi/aic7xxx/aic79xx_core.c b/drivers/scsi/aic7xxx/aic79xx_core.c
index 109e2c99e6c1..95d8f25cbcca 100644
--- a/drivers/scsi/aic7xxx/aic79xx_core.c
+++ b/drivers/scsi/aic7xxx/aic79xx_core.c
@@ -6278,7 +6278,7 @@ ahd_reset(struct ahd_softc *ahd, int reinit)
 		 * does not disable its parity logic prior to
 		 * the start of the reset.  This may cause a
 		 * parity error to be detected and thus a
-		 * spurious SERR or PERR assertion.  Disble
+		 * spurious SERR or PERR assertion.  Disable
 		 * PERR and SERR responses during the CHIPRST.
 		 */
 		mod_cmd = cmd & ~(PCIM_CMD_PERRESPEN|PCIM_CMD_SERRESPEN);
diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
index a2615d64d07c..79a2d8fba6b6 100644
--- a/drivers/usb/gadget/legacy/inode.c
+++ b/drivers/usb/gadget/legacy/inode.c
@@ -84,8 +84,7 @@ static int ep_open(struct inode *, struct file *);
 
 /* /dev/gadget/$CHIP represents ep0 and the whole device */
 enum ep0_state {
-	/* DISBLED is the initial state.
-	 */
+	/* DISABLED is the initial state. */
 	STATE_DEV_DISABLED = 0,
 
 	/* Only one open() of /dev/gadget/$CHIP; only one file tracks
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index 6d6c46000e56..50aee8b7718b 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -868,7 +868,7 @@ static void xhci_disable_port_wake_on_bits(struct xhci_hcd *xhci)
 
 	spin_lock_irqsave(&xhci->lock, flags);
 
-	/* disble usb3 ports Wake bits*/
+	/* disable usb3 ports Wake bits */
 	port_index = xhci->num_usb3_ports;
 	port_array = xhci->usb3_ports;
 	while (port_index--) {
@@ -879,7 +879,7 @@ static void xhci_disable_port_wake_on_bits(struct xhci_hcd *xhci)
 			writel(t2, port_array[port_index]);
 	}
 
-	/* disble usb2 ports Wake bits*/
+	/* disable usb2 ports Wake bits */
 	port_index = xhci->num_usb2_ports;
 	port_array = xhci->usb2_ports;
 	while (port_index--) {
diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h
index ad3e5158e586..c9f795e9a2ee 100644
--- a/include/linux/regulator/machine.h
+++ b/include/linux/regulator/machine.h
@@ -65,7 +65,7 @@ struct regulator_state {
 	int uV;	/* suspend voltage */
 	unsigned int mode; /* suspend regulator operating mode */
 	int enabled; /* is regulator enabled in this suspend state */
-	int disabled; /* is the regulator disbled in this suspend state */
+	int disabled; /* is the regulator disabled in this suspend state */
 };
 
 /**
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 0125589c7428..48851327a15e 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2669,7 +2669,7 @@ static bool css_visible(struct cgroup_subsys_state *css)
  *
  * Returns 0 on success, -errno on failure.  On failure, csses which have
  * been processed already aren't cleaned up.  The caller is responsible for
- * cleaning up with cgroup_apply_control_disble().
+ * cleaning up with cgroup_apply_control_disable().
  */
 static int cgroup_apply_control_enable(struct cgroup *cgrp)
 {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6f41548f2e32..a17ed56c8ce1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -998,7 +998,7 @@ list_update_cgroup_event(struct perf_event *event,
  */
 #define PERF_CPU_HRTIMER (1000 / HZ)
 /*
- * function must be called with interrupts disbled
+ * function must be called with interrupts disabled
  */
 static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
 {
diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index 0458b037c8a1..6dae4df472f6 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -372,6 +372,8 @@ disassocation||disassociation
 disapear||disappear
 disapeared||disappeared
 disappared||disappeared
+disble||disable
+disbled||disabled
 disconnet||disconnect
 discontinous||discontinuous
 dispertion||dispersion
diff --git a/sound/soc/amd/acp-pcm-dma.c b/sound/soc/amd/acp-pcm-dma.c
index ec1067a679da..08b1399d1da2 100644
--- a/sound/soc/amd/acp-pcm-dma.c
+++ b/sound/soc/amd/acp-pcm-dma.c
@@ -89,7 +89,7 @@ static void acp_reg_write(u32 val, void __iomem *acp_mmio, u32 reg)
 	writel(val, acp_mmio + (reg * 4));
 }
 
-/* Configure a given dma channel parameters - enable/disble,
+/* Configure a given dma channel parameters - enable/disable,
  * number of descriptors, priority
  */
 static void config_acp_dma_channel(void __iomem *acp_mmio, u8 ch_num,
-- 
cgit v1.2.3


From 505d3085d7120a9f4cd0d6ffaa876968854b3baa Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Thu, 9 Mar 2017 16:16:33 -0800
Subject: scripts/spelling.txt: add "overide" pattern and fix typo instances

Fix typos and add the following to the scripts/spelling.txt:

  overide||override

While we are here, fix the doubled "address" in the touched line
Documentation/devicetree/bindings/regulator/ti-abb-regulator.txt.

Also, fix the comment block style in the touched hunks in
drivers/media/dvb-frontends/drx39xyj/drx_driver.h.

Link: http://lkml.kernel.org/r/1481573103-11329-21-git-send-email-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/devicetree/bindings/regulator/ti-abb-regulator.txt | 2 +-
 drivers/block/paride/pcd.c                                       | 2 +-
 drivers/block/paride/pd.c                                        | 2 +-
 drivers/block/paride/pf.c                                        | 2 +-
 drivers/block/paride/pg.c                                        | 2 +-
 drivers/block/paride/pt.c                                        | 2 +-
 drivers/media/dvb-frontends/drx39xyj/drx_driver.h                | 8 +++-----
 drivers/net/ethernet/qlogic/qlge/qlge.h                          | 2 +-
 include/dt-bindings/sound/cs42l42.h                              | 2 +-
 include/net/irda/timer.h                                         | 2 +-
 kernel/trace/trace_stack.c                                       | 2 +-
 scripts/spelling.txt                                             | 1 +
 tools/lguest/lguest.c                                            | 2 +-
 tools/lib/bpf/Makefile                                           | 2 +-
 tools/lib/traceevent/Makefile                                    | 2 +-
 tools/lib/traceevent/event-parse.h                               | 2 +-
 16 files changed, 18 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/regulator/ti-abb-regulator.txt b/Documentation/devicetree/bindings/regulator/ti-abb-regulator.txt
index c3f6546ebac7..6a23ad9ac53a 100644
--- a/Documentation/devicetree/bindings/regulator/ti-abb-regulator.txt
+++ b/Documentation/devicetree/bindings/regulator/ti-abb-regulator.txt
@@ -45,7 +45,7 @@ Required Properties:
 Optional Properties:
 - reg-names: In addition to the required properties, the following are optional
   - "efuse-address"	- Contains efuse base address used to pick up ABB info.
-  - "ldo-address"	- Contains address of ABB LDO overide register address.
+  - "ldo-address"	- Contains address of ABB LDO override register.
 	"efuse-address" is required for this.
 - ti,ldovbb-vset-mask	- Required if ldo-address is set, mask for LDO override
 	register to provide override vset value.
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 10aed84244f5..939641d6e262 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -50,7 +50,7 @@
                         the slower the port i/o.  In some cases, setting
                         this to zero will speed up the device. (default -1)
                         
-            major       You may use this parameter to overide the
+            major       You may use this parameter to override the
                         default major number (46) that this driver
                         will use.  Be sure to change the device
                         name as well.
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 644ba0888bd4..9cfd2e06a649 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -61,7 +61,7 @@
                         first drive found.
 			
 
-            major       You may use this parameter to overide the
+            major       You may use this parameter to override the
                         default major number (45) that this driver
                         will use.  Be sure to change the device
                         name as well.
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index ed93e8badf56..14c5d32f5d8b 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -59,7 +59,7 @@
                         the slower the port i/o.  In some cases, setting
                         this to zero will speed up the device. (default -1)
 
-	    major	You may use this parameter to overide the
+	    major	You may use this parameter to override the
 			default major number (47) that this driver
 			will use.  Be sure to change the device
 			name as well.
diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c
index 5db955fe3a94..3b5882bfb736 100644
--- a/drivers/block/paride/pg.c
+++ b/drivers/block/paride/pg.c
@@ -84,7 +84,7 @@
 			the slower the port i/o.  In some cases, setting
 			this to zero will speed up the device. (default -1)
 
-	    major	You may use this parameter to overide the
+	    major	You may use this parameter to override the
 			default major number (97) that this driver
 			will use.  Be sure to change the device
 			name as well.
diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c
index 61fc6824299a..e815312a00ad 100644
--- a/drivers/block/paride/pt.c
+++ b/drivers/block/paride/pt.c
@@ -61,7 +61,7 @@
                         the slower the port i/o.  In some cases, setting
                         this to zero will speed up the device. (default -1)
 
-	    major	You may use this parameter to overide the
+	    major	You may use this parameter to override the
 			default major number (96) that this driver
 			will use.  Be sure to change the device
 			name as well.
diff --git a/drivers/media/dvb-frontends/drx39xyj/drx_driver.h b/drivers/media/dvb-frontends/drx39xyj/drx_driver.h
index 7a681d8202c7..4442e478db72 100644
--- a/drivers/media/dvb-frontends/drx39xyj/drx_driver.h
+++ b/drivers/media/dvb-frontends/drx39xyj/drx_driver.h
@@ -256,8 +256,7 @@ int drxbsp_tuner_default_i2c_write_read(struct tuner_instance *tuner,
 *
 * The actual DAP implementation may be restricted to only one of the modes.
 * A compiler warning or error will be generated if the DAP implementation
-* overides or cannot handle the mode defined below.
-*
+* overrides or cannot handle the mode defined below.
 */
 #ifndef DRXDAP_SINGLE_MASTER
 #define DRXDAP_SINGLE_MASTER 1
@@ -272,7 +271,7 @@ int drxbsp_tuner_default_i2c_write_read(struct tuner_instance *tuner,
 *
 * This maximum size may be restricted by the actual DAP implementation.
 * A compiler warning or error will be generated if the DAP implementation
-* overides or cannot handle the chunksize defined below.
+* overrides or cannot handle the chunksize defined below.
 *
 * Beware that the DAP uses  DRXDAP_MAX_WCHUNKSIZE to create a temporary data
 * buffer. Do not undefine or choose too large, unless your system is able to
@@ -292,8 +291,7 @@ int drxbsp_tuner_default_i2c_write_read(struct tuner_instance *tuner,
 *
 * This maximum size may be restricted by the actual DAP implementation.
 * A compiler warning or error will be generated if the DAP implementation
-* overides or cannot handle the chunksize defined below.
-*
+* overrides or cannot handle the chunksize defined below.
 */
 #ifndef DRXDAP_MAX_RCHUNKSIZE
 #define  DRXDAP_MAX_RCHUNKSIZE 60
diff --git a/drivers/net/ethernet/qlogic/qlge/qlge.h b/drivers/net/ethernet/qlogic/qlge/qlge.h
index 90b3b46f85cc..84ac50f92c9c 100644
--- a/drivers/net/ethernet/qlogic/qlge/qlge.h
+++ b/drivers/net/ethernet/qlogic/qlge/qlge.h
@@ -1162,7 +1162,7 @@ struct ob_mac_tso_iocb_rsp {
 struct ib_mac_iocb_rsp {
 	u8 opcode;		/* 0x20 */
 	u8 flags1;
-#define IB_MAC_IOCB_RSP_OI	0x01	/* Overide intr delay */
+#define IB_MAC_IOCB_RSP_OI	0x01	/* Override intr delay */
 #define IB_MAC_IOCB_RSP_I	0x02	/* Disable Intr Generation */
 #define IB_MAC_CSUM_ERR_MASK 0x1c	/* A mask to use for csum errs */
 #define IB_MAC_IOCB_RSP_TE	0x04	/* Checksum error */
diff --git a/include/dt-bindings/sound/cs42l42.h b/include/dt-bindings/sound/cs42l42.h
index 399a123aed58..db69d84ed7d1 100644
--- a/include/dt-bindings/sound/cs42l42.h
+++ b/include/dt-bindings/sound/cs42l42.h
@@ -20,7 +20,7 @@
 #define CS42L42_HPOUT_LOAD_1NF		0
 #define CS42L42_HPOUT_LOAD_10NF		1
 
-/* HPOUT Clamp to GND Overide */
+/* HPOUT Clamp to GND Override */
 #define CS42L42_HPOUT_CLAMP_EN		0
 #define CS42L42_HPOUT_CLAMP_DIS		1
 
diff --git a/include/net/irda/timer.h b/include/net/irda/timer.h
index cb2615ccf761..d784f242cf7b 100644
--- a/include/net/irda/timer.h
+++ b/include/net/irda/timer.h
@@ -59,7 +59,7 @@ struct lap_cb;
  *  Slot timer must never exceed 85 ms, and must always be at least 25 ms, 
  *  suggested to  75-85 msec by IrDA lite. This doesn't work with a lot of
  *  devices, and other stackes uses a lot more, so it's best we do it as well
- *  (Note : this is the default value and sysctl overides it - Jean II)
+ *  (Note : this is the default value and sysctl overrides it - Jean II)
  */
 #define SLOT_TIMEOUT            (90*HZ/1000)
 
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 1d68b5b7ad41..5fb1f2c87e6b 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -65,7 +65,7 @@ void stack_trace_print(void)
 }
 
 /*
- * When arch-specific code overides this function, the following
+ * When arch-specific code overrides this function, the following
  * data should be filled up, assuming stack_trace_max_lock is held to
  * prevent concurrent updates.
  *     stack_trace_index[]
diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index 6dae4df472f6..0545f5a8cabe 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -734,6 +734,7 @@ oustanding||outstanding
 overaall||overall
 overhread||overhead
 overlaping||overlapping
+overide||override
 overrided||overridden
 overriden||overridden
 overun||overrun
diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c
index 11c8d9bc762e..5d19fdf80292 100644
--- a/tools/lguest/lguest.c
+++ b/tools/lguest/lguest.c
@@ -1387,7 +1387,7 @@ static bool pci_data_iowrite(u16 port, u32 mask, u32 val)
 		/* Allow writing to any other BAR, or expansion ROM */
 		iowrite(portoff, val, mask, &d->config_words[reg]);
 		return true;
-		/* We let them overide latency timer and cacheline size */
+		/* We let them override latency timer and cacheline size */
 	} else if (&d->config_words[reg] == (void *)&d->config.cacheline_size) {
 		/* Only let them change the first two fields. */
 		if (mask == 0xFFFFFFFF)
diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index e2efddf10231..1f5300e56b44 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -132,7 +132,7 @@ else
   Q = @
 endif
 
-# Disable command line variables (CFLAGS) overide from top
+# Disable command line variables (CFLAGS) override from top
 # level Makefile (perf), otherwise build Makefile will get
 # the same command line setup.
 MAKEOVERRIDES=
diff --git a/tools/lib/traceevent/Makefile b/tools/lib/traceevent/Makefile
index 47076b15eebe..9b8555ea3459 100644
--- a/tools/lib/traceevent/Makefile
+++ b/tools/lib/traceevent/Makefile
@@ -135,7 +135,7 @@ else
   Q = @
 endif
 
-# Disable command line variables (CFLAGS) overide from top
+# Disable command line variables (CFLAGS) override from top
 # level Makefile (perf), otherwise build Makefile will get
 # the same command line setup.
 MAKEOVERRIDES=
diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h
index 66342804161c..0c03538df74c 100644
--- a/tools/lib/traceevent/event-parse.h
+++ b/tools/lib/traceevent/event-parse.h
@@ -140,7 +140,7 @@ struct pevent_plugin_option {
  *   struct pevent_plugin_option PEVENT_PLUGIN_OPTIONS[] = {
  *	{
  *		.name = "option-name",
- *		.plugin_alias = "overide-file-name", (optional)
+ *		.plugin_alias = "override-file-name", (optional)
  *		.description = "description of option to show users",
  *	},
  *	{
-- 
cgit v1.2.3


From dd0db88d8094a6d9d4d1fc5fcd56ab619f54ccf8 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 9 Mar 2017 16:16:49 -0800
Subject: userfaultfd: non-cooperative: rollback userfaultfd_exit

Patch series "userfaultfd non-cooperative further update for 4.11 merge
window".

Unfortunately I noticed one relevant bug in userfaultfd_exit while doing
more testing.  I've been doing testing before and this was also tested
by kbuild bot and exercised by the selftest, but this bug never
reproduced before.

I dropped userfaultfd_exit as result.  I dropped it because of
implementation difficulty in receiving signals in __mmput and because I
think -ENOSPC as result from the background UFFDIO_COPY should be enough
already.

Before I decided to remove userfaultfd_exit, I noticed userfaultfd_exit
wasn't exercised by the selftest and when I tried to exercise it, after
moving it to a more correct place in __mmput where it would make more
sense and where the vma list is stable, it resulted in the
event_wait_completion in D state.  So then I added the second patch to
be sure even if we call userfaultfd_event_wait_completion too late
during task exit(), we won't risk to generate tasks in D state.  The
same check exists in handle_userfault() for the same reason, except it
makes a difference there, while here is just a robustness check and it's
run under WARN_ON_ONCE.

While looking at the userfaultfd_event_wait_completion() function I
looked back at its callers too while at it and I think it's not ok to
stop executing dup_fctx on the fcs list because we relay on
userfaultfd_event_wait_completion to execute
userfaultfd_ctx_put(fctx->orig) which is paired against
userfaultfd_ctx_get(fctx->orig) in dup_userfault just before
list_add(fcs).  This change only takes care of fctx->orig but this area
also needs further review looking for similar problems in fctx->new.

The only patch that is urgent is the first because it's an use after
free during a SMP race condition that affects all processes if
CONFIG_USERFAULTFD=y.  Very hard to reproduce though and probably
impossible without SLUB poisoning enabled.

This patch (of 3):

I once reproduced this oops with the userfaultfd selftest, it's not
easily reproducible and it requires SLUB poisoning to reproduce.

    general protection fault: 0000 [#1] SMP
    Modules linked in:
    CPU: 2 PID: 18421 Comm: userfaultfd Tainted: G               ------------ T 3.10.0+ #15
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.1-0-g8891697-prebuilt.qemu-project.org 04/01/2014
    task: ffff8801f83b9440 ti: ffff8801f833c000 task.ti: ffff8801f833c000
    RIP: 0010:[<ffffffff81451299>]  [<ffffffff81451299>] userfaultfd_exit+0x29/0xa0
    RSP: 0018:ffff8801f833fe80  EFLAGS: 00010202
    RAX: ffff8801f833ffd8 RBX: 6b6b6b6b6b6b6b6b RCX: ffff8801f83b9440
    RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8800baf18600
    RBP: ffff8801f833fee8 R08: 0000000000000000 R09: 0000000000000001
    R10: 0000000000000000 R11: ffffffff8127ceb3 R12: 0000000000000000
    R13: ffff8800baf186b0 R14: ffff8801f83b99f8 R15: 00007faed746c700
    FS:  0000000000000000(0000) GS:ffff88023fc80000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
    CR2: 00007faf0966f028 CR3: 0000000001bc6000 CR4: 00000000000006e0
    DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
    DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
    Call Trace:
      do_exit+0x297/0xd10
      SyS_exit+0x17/0x20
      tracesys+0xdd/0xe2
    Code: 00 00 66 66 66 66 90 55 48 89 e5 41 54 53 48 83 ec 58 48 8b 1f 48 85 db 75 11 eb 73 66 0f 1f 44 00 00 48 8b 5b 10 48 85 db 74 64 <4c> 8b a3 b8 00 00 00 4d 85 e4 74 eb 41 f6 84 24 2c 01 00 00 80
    RIP  [<ffffffff81451299>] userfaultfd_exit+0x29/0xa0
     RSP <ffff8801f833fe80>
    ---[ end trace 9fecd6dcb442846a ]---

In the debugger I located the "mm" pointer in the stack and walking
mm->mmap->vm_next through the end shows the vma->vm_next list is fully
consistent and it is null terminated list as expected.  So this has to
be an SMP race condition where userfaultfd_exit was running while the
vma list was being modified by another CPU.

When userfaultfd_exit() run one of the ->vm_next pointers pointed to
SLAB_POISON (RBX is the vma pointer and is 0x6b6b..).

The reason is that it's not running in __mmput but while there are still
other threads running and it's not holding the mmap_sem (it can't as it
has to wait the even to be received by the manager).  So this is an use
after free that was happening for all processes.

One more implementation problem aside from the race condition:
userfaultfd_exit has really to check a flag in mm->flags before walking
the vma or it's going to slowdown the exit() path for regular tasks.

One more implementation problem: at that point signals can't be
delivered so it would also create a task in D state if the manager
doesn't read the event.

The major design issue: it overall looks superfluous as the manager can
check for -ENOSPC in the background transfer:

	if (mmget_not_zero(ctx->mm)) {
[..]
	} else {
		return -ENOSPC;
	}

It's safer to roll it back and re-introduce it later if at all.

[rppt@linux.vnet.ibm.com: documentation fixup after removal of UFFD_EVENT_EXIT]
  Link: http://lkml.kernel.org/r/1488345437-4364-1-git-send-email-rppt@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/20170224181957.19736-2-aarcange@redhat.com
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Acked-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/vm/userfaultfd.txt |  4 ----
 fs/userfaultfd.c                 | 28 ----------------------------
 include/linux/userfaultfd_k.h    |  6 ------
 include/uapi/linux/userfaultfd.h |  5 +----
 kernel/exit.c                    |  1 -
 5 files changed, 1 insertion(+), 43 deletions(-)

(limited to 'include')

diff --git a/Documentation/vm/userfaultfd.txt b/Documentation/vm/userfaultfd.txt
index 0e5543a920e5..bb2f945f87ab 100644
--- a/Documentation/vm/userfaultfd.txt
+++ b/Documentation/vm/userfaultfd.txt
@@ -172,10 +172,6 @@ the same read(2) protocol as for the page fault notifications. The
 manager has to explicitly enable these events by setting appropriate
 bits in uffdio_api.features passed to UFFDIO_API ioctl:
 
-UFFD_FEATURE_EVENT_EXIT - enable notification about exit() of the
-non-cooperative process. When the monitored process exits, the uffd
-manager will get UFFD_EVENT_EXIT.
-
 UFFD_FEATURE_EVENT_FORK - enable userfaultfd hooks for fork(). When
 this feature is enabled, the userfaultfd context of the parent process
 is duplicated into the newly created process. The manager receives
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index f62199b90fd0..16d0cc600fa9 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -775,34 +775,6 @@ void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
 	}
 }
 
-void userfaultfd_exit(struct mm_struct *mm)
-{
-	struct vm_area_struct *vma = mm->mmap;
-
-	/*
-	 * We can do the vma walk without locking because the caller
-	 * (exit_mm) knows it now has exclusive access
-	 */
-	while (vma) {
-		struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
-
-		if (ctx && (ctx->features & UFFD_FEATURE_EVENT_EXIT)) {
-			struct userfaultfd_wait_queue ewq;
-
-			userfaultfd_ctx_get(ctx);
-
-			msg_init(&ewq.msg);
-			ewq.msg.event = UFFD_EVENT_EXIT;
-
-			userfaultfd_event_wait_completion(ctx, &ewq);
-
-			ctx->features &= ~UFFD_FEATURE_EVENT_EXIT;
-		}
-
-		vma = vma->vm_next;
-	}
-}
-
 static int userfaultfd_release(struct inode *inode, struct file *file)
 {
 	struct userfaultfd_ctx *ctx = file->private_data;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 0468548acebf..f2b79bf4c895 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -72,8 +72,6 @@ extern int userfaultfd_unmap_prep(struct vm_area_struct *vma,
 extern void userfaultfd_unmap_complete(struct mm_struct *mm,
 				       struct list_head *uf);
 
-extern void userfaultfd_exit(struct mm_struct *mm);
-
 #else /* CONFIG_USERFAULTFD */
 
 /* mm helpers */
@@ -139,10 +137,6 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm,
 {
 }
 
-static inline void userfaultfd_exit(struct mm_struct *mm)
-{
-}
-
 #endif /* CONFIG_USERFAULTFD */
 
 #endif /* _LINUX_USERFAULTFD_K_H */
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index c055947c5c98..3b059530dac9 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -18,8 +18,7 @@
  * means the userland is reading).
  */
 #define UFFD_API ((__u64)0xAA)
-#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_EXIT |		\
-			   UFFD_FEATURE_EVENT_FORK |		\
+#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK |		\
 			   UFFD_FEATURE_EVENT_REMAP |		\
 			   UFFD_FEATURE_EVENT_REMOVE |	\
 			   UFFD_FEATURE_EVENT_UNMAP |		\
@@ -113,7 +112,6 @@ struct uffd_msg {
 #define UFFD_EVENT_REMAP	0x14
 #define UFFD_EVENT_REMOVE	0x15
 #define UFFD_EVENT_UNMAP	0x16
-#define UFFD_EVENT_EXIT		0x17
 
 /* flags for UFFD_EVENT_PAGEFAULT */
 #define UFFD_PAGEFAULT_FLAG_WRITE	(1<<0)	/* If this was a write fault */
@@ -163,7 +161,6 @@ struct uffdio_api {
 #define UFFD_FEATURE_MISSING_HUGETLBFS		(1<<4)
 #define UFFD_FEATURE_MISSING_SHMEM		(1<<5)
 #define UFFD_FEATURE_EVENT_UNMAP		(1<<6)
-#define UFFD_FEATURE_EVENT_EXIT			(1<<7)
 	__u64 features;
 
 	__u64 ioctls;
diff --git a/kernel/exit.c b/kernel/exit.c
index e126ebf2400c..516acdb0e0ec 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -554,7 +554,6 @@ static void exit_mm(void)
 	enter_lazy_tlb(mm, current);
 	task_unlock(current);
 	mm_update_next_owner(mm);
-	userfaultfd_exit(mm);
 	mmput(mm);
 	if (test_thread_flag(TIF_MEMDIE))
 		exit_oom_victim();
-- 
cgit v1.2.3


From cbfd0c1001bedb4b051cf4a1f5df24f1500381bc Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 9 Mar 2017 16:16:57 -0800
Subject: include/linux/fs.h: fix unsigned enum warning with gcc-4.2

With arm-linux-gcc-4.2, almost every file we build in the kernel ends up
with this warning:

  include/linux/fs.h:2648: warning: comparison of unsigned expression < 0 is always false

Later versions don't have this problem, but it's easy enough to work
around.

Link: http://lkml.kernel.org/r/20161216105634.235457-12-arnd@arndb.de
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Russell King <rmk+kernel@armlinux.org.uk>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index aad3fd0ff5f8..7251f7bb45e8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2678,7 +2678,7 @@ static const char * const kernel_read_file_str[] = {
 
 static inline const char *kernel_read_file_id_str(enum kernel_read_file_id id)
 {
-	if (id < 0 || id >= READING_MAX_ID)
+	if ((unsigned)id >= READING_MAX_ID)
 		return kernel_read_file_str[READING_UNKNOWN];
 
 	return kernel_read_file_str[id];
-- 
cgit v1.2.3


From ce9311cf95ad8baf044a014738d76973d93b739a Mon Sep 17 00:00:00 2001
From: Yisheng Xie <xieyisheng1@huawei.com>
Date: Thu, 9 Mar 2017 16:17:00 -0800
Subject: mm/vmstats: add thp_split_pud event for clarity

We added support for PUD-sized transparent hugepages, however we count
the event "thp split pud" into thp_split_pmd event.

To separate the event count of thp split pud from pmd, add a new event
named thp_split_pud.

Link: http://lkml.kernel.org/r/1488282380-5076-1-git-send-email-xieyisheng1@huawei.com
Signed-off-by: Yisheng Xie <xieyisheng1@huawei.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vm_event_item.h | 3 +++
 mm/huge_memory.c              | 2 +-
 mm/vmstat.c                   | 3 +++
 3 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 6aa1b6cb5828..a80b7b59cf33 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -79,6 +79,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		THP_SPLIT_PAGE_FAILED,
 		THP_DEFERRED_SPLIT_PAGE,
 		THP_SPLIT_PMD,
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+		THP_SPLIT_PUD,
+#endif
 		THP_ZERO_PAGE_ALLOC,
 		THP_ZERO_PAGE_ALLOC_FAILED,
 #endif
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d36b2af4d1bf..8f037e256c54 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1828,7 +1828,7 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
 	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
 	VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
 
-	count_vm_event(THP_SPLIT_PMD);
+	count_vm_event(THP_SPLIT_PUD);
 
 	pudp_huge_clear_flush_notify(vma, haddr, pud);
 }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 69f9aff39a2e..b1947f0cbee2 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1065,6 +1065,9 @@ const char * const vmstat_text[] = {
 	"thp_split_page_failed",
 	"thp_deferred_split_page",
 	"thp_split_pmd",
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+	"thp_split_pud",
+#endif
 	"thp_zero_page_alloc",
 	"thp_zero_page_alloc_failed",
 #endif
-- 
cgit v1.2.3


From 70ccb92fdd90b35bb6f9200093d4ffd6cb38156b Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 9 Mar 2017 16:17:11 -0800
Subject: userfaultfd: non-cooperative: userfaultfd_remove revalidate vma in
 MADV_DONTNEED

userfaultfd_remove() has to be execute before zapping the pagetables or
UFFDIO_COPY could keep filling pages after zap_page_range returned,
which would result in non zero data after a MADV_DONTNEED.

However userfaultfd_remove() may have to release the mmap_sem.  This was
handled correctly in MADV_REMOVE, but MADV_DONTNEED accessed a
potentially stale vma (the very vma passed to zap_page_range(vma, ...)).

The fix consists in revalidating the vma in case userfaultfd_remove()
had to release the mmap_sem.

This also optimizes away an unnecessary down_read/up_read in the
MADV_REMOVE case if UFFD_EVENT_FORK had to be delivered.

It all remains zero runtime cost in case CONFIG_USERFAULTFD=n as
userfaultfd_remove() will be defined as "true" at build time.

Link: http://lkml.kernel.org/r/20170302173738.18994-3-aarcange@redhat.com
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c              |  9 +++------
 include/linux/userfaultfd_k.h |  7 +++----
 mm/madvise.c                  | 44 ++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 47 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 2407249998c3..9fd5e51ffb31 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -695,8 +695,7 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
 	userfaultfd_event_wait_completion(ctx, &ewq);
 }
 
-void userfaultfd_remove(struct vm_area_struct *vma,
-			struct vm_area_struct **prev,
+bool userfaultfd_remove(struct vm_area_struct *vma,
 			unsigned long start, unsigned long end)
 {
 	struct mm_struct *mm = vma->vm_mm;
@@ -705,13 +704,11 @@ void userfaultfd_remove(struct vm_area_struct *vma,
 
 	ctx = vma->vm_userfaultfd_ctx.ctx;
 	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
-		return;
+		return true;
 
 	userfaultfd_ctx_get(ctx);
 	up_read(&mm->mmap_sem);
 
-	*prev = NULL; /* We wait for ACK w/o the mmap semaphore */
-
 	msg_init(&ewq.msg);
 
 	ewq.msg.event = UFFD_EVENT_REMOVE;
@@ -720,7 +717,7 @@ void userfaultfd_remove(struct vm_area_struct *vma,
 
 	userfaultfd_event_wait_completion(ctx, &ewq);
 
-	down_read(&mm->mmap_sem);
+	return false;
 }
 
 static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index f2b79bf4c895..48a3483dccb1 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -61,8 +61,7 @@ extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *,
 					unsigned long from, unsigned long to,
 					unsigned long len);
 
-extern void userfaultfd_remove(struct vm_area_struct *vma,
-			       struct vm_area_struct **prev,
+extern bool userfaultfd_remove(struct vm_area_struct *vma,
 			       unsigned long start,
 			       unsigned long end);
 
@@ -118,11 +117,11 @@ static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx,
 {
 }
 
-static inline void userfaultfd_remove(struct vm_area_struct *vma,
-				      struct vm_area_struct **prev,
+static inline bool userfaultfd_remove(struct vm_area_struct *vma,
 				      unsigned long start,
 				      unsigned long end)
 {
+	return true;
 }
 
 static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma,
diff --git a/mm/madvise.c b/mm/madvise.c
index dc5927c812d3..7a2abf0127ae 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -513,7 +513,43 @@ static long madvise_dontneed(struct vm_area_struct *vma,
 	if (!can_madv_dontneed_vma(vma))
 		return -EINVAL;
 
-	userfaultfd_remove(vma, prev, start, end);
+	if (!userfaultfd_remove(vma, start, end)) {
+		*prev = NULL; /* mmap_sem has been dropped, prev is stale */
+
+		down_read(&current->mm->mmap_sem);
+		vma = find_vma(current->mm, start);
+		if (!vma)
+			return -ENOMEM;
+		if (start < vma->vm_start) {
+			/*
+			 * This "vma" under revalidation is the one
+			 * with the lowest vma->vm_start where start
+			 * is also < vma->vm_end. If start <
+			 * vma->vm_start it means an hole materialized
+			 * in the user address space within the
+			 * virtual range passed to MADV_DONTNEED.
+			 */
+			return -ENOMEM;
+		}
+		if (!can_madv_dontneed_vma(vma))
+			return -EINVAL;
+		if (end > vma->vm_end) {
+			/*
+			 * Don't fail if end > vma->vm_end. If the old
+			 * vma was splitted while the mmap_sem was
+			 * released the effect of the concurrent
+			 * operation may not cause MADV_DONTNEED to
+			 * have an undefined result. There may be an
+			 * adjacent next vma that we'll walk
+			 * next. userfaultfd_remove() will generate an
+			 * UFFD_EVENT_REMOVE repetition on the
+			 * end-vma->vm_end range, but the manager can
+			 * handle a repetition fine.
+			 */
+			end = vma->vm_end;
+		}
+		VM_WARN_ON(start >= end);
+	}
 	zap_page_range(vma, start, end - start);
 	return 0;
 }
@@ -554,8 +590,10 @@ static long madvise_remove(struct vm_area_struct *vma,
 	 * mmap_sem.
 	 */
 	get_file(f);
-	userfaultfd_remove(vma, prev, start, end);
-	up_read(&current->mm->mmap_sem);
+	if (userfaultfd_remove(vma, start, end)) {
+		/* mmap_sem was not released by userfaultfd_remove() */
+		up_read(&current->mm->mmap_sem);
+	}
 	error = vfs_fallocate(f,
 				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 				offset, end - start);
-- 
cgit v1.2.3


From cdfbabfb2f0ce983fdaa42f20e5f7842178fc01e Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 9 Mar 2017 08:09:05 +0000
Subject: net: Work around lockdep limitation in sockets that use sockets

Lockdep issues a circular dependency warning when AFS issues an operation
through AF_RXRPC from a context in which the VFS/VM holds the mmap_sem.

The theory lockdep comes up with is as follows:

 (1) If the pagefault handler decides it needs to read pages from AFS, it
     calls AFS with mmap_sem held and AFS begins an AF_RXRPC call, but
     creating a call requires the socket lock:

	mmap_sem must be taken before sk_lock-AF_RXRPC

 (2) afs_open_socket() opens an AF_RXRPC socket and binds it.  rxrpc_bind()
     binds the underlying UDP socket whilst holding its socket lock.
     inet_bind() takes its own socket lock:

	sk_lock-AF_RXRPC must be taken before sk_lock-AF_INET

 (3) Reading from a TCP socket into a userspace buffer might cause a fault
     and thus cause the kernel to take the mmap_sem, but the TCP socket is
     locked whilst doing this:

	sk_lock-AF_INET must be taken before mmap_sem

However, lockdep's theory is wrong in this instance because it deals only
with lock classes and not individual locks.  The AF_INET lock in (2) isn't
really equivalent to the AF_INET lock in (3) as the former deals with a
socket entirely internal to the kernel that never sees userspace.  This is
a limitation in the design of lockdep.

Fix the general case by:

 (1) Double up all the locking keys used in sockets so that one set are
     used if the socket is created by userspace and the other set is used
     if the socket is created by the kernel.

 (2) Store the kern parameter passed to sk_alloc() in a variable in the
     sock struct (sk_kern_sock).  This informs sock_lock_init(),
     sock_init_data() and sk_clone_lock() as to the lock keys to be used.

     Note that the child created by sk_clone_lock() inherits the parent's
     kern setting.

 (3) Add a 'kern' parameter to ->accept() that is analogous to the one
     passed in to ->create() that distinguishes whether kernel_accept() or
     sys_accept4() was the caller and can be passed to sk_alloc().

     Note that a lot of accept functions merely dequeue an already
     allocated socket.  I haven't touched these as the new socket already
     exists before we get the parameter.

     Note also that there are a couple of places where I've made the accepted
     socket unconditionally kernel-based:

	irda_accept()
	rds_rcp_accept_one()
	tcp_accept_from_sock()

     because they follow a sock_create_kern() and accept off of that.

Whilst creating this, I noticed that lustre and ocfs don't create sockets
through sock_create_kern() and thus they aren't marked as for-kernel,
though they appear to be internal.  I wonder if these should do that so
that they use the new set of lock keys.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 crypto/af_alg.c                               |   9 ++-
 crypto/algif_hash.c                           |   9 ++-
 drivers/staging/lustre/lnet/lnet/lib-socket.c |   4 +-
 fs/dlm/lowcomms.c                             |   2 +-
 fs/ocfs2/cluster/tcp.c                        |   2 +-
 include/crypto/if_alg.h                       |   2 +-
 include/linux/net.h                           |   2 +-
 include/net/inet_common.h                     |   3 +-
 include/net/inet_connection_sock.h            |   2 +-
 include/net/sctp/structs.h                    |   3 +-
 include/net/sock.h                            |   9 ++-
 net/atm/svc.c                                 |   5 +-
 net/ax25/af_ax25.c                            |   3 +-
 net/bluetooth/l2cap_sock.c                    |   2 +-
 net/bluetooth/rfcomm/sock.c                   |   3 +-
 net/bluetooth/sco.c                           |   2 +-
 net/core/sock.c                               | 106 ++++++++++++++------------
 net/decnet/af_decnet.c                        |   5 +-
 net/ipv4/af_inet.c                            |   5 +-
 net/ipv4/inet_connection_sock.c               |   2 +-
 net/irda/af_irda.c                            |   5 +-
 net/iucv/af_iucv.c                            |   2 +-
 net/llc/af_llc.c                              |   4 +-
 net/netrom/af_netrom.c                        |   3 +-
 net/nfc/llcp_sock.c                           |   2 +-
 net/phonet/pep.c                              |   6 +-
 net/phonet/socket.c                           |   4 +-
 net/rds/tcp_listen.c                          |   2 +-
 net/rose/af_rose.c                            |   3 +-
 net/sctp/ipv6.c                               |   5 +-
 net/sctp/protocol.c                           |   5 +-
 net/sctp/socket.c                             |   4 +-
 net/smc/af_smc.c                              |   2 +-
 net/socket.c                                  |   4 +-
 net/tipc/socket.c                             |   8 +-
 net/unix/af_unix.c                            |   5 +-
 net/vmw_vsock/af_vsock.c                      |   3 +-
 net/x25/af_x25.c                              |   3 +-
 38 files changed, 142 insertions(+), 108 deletions(-)

(limited to 'include')

diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index f5e18c2a4852..690deca17c35 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -266,7 +266,7 @@ unlock:
 	return err;
 }
 
-int af_alg_accept(struct sock *sk, struct socket *newsock)
+int af_alg_accept(struct sock *sk, struct socket *newsock, bool kern)
 {
 	struct alg_sock *ask = alg_sk(sk);
 	const struct af_alg_type *type;
@@ -281,7 +281,7 @@ int af_alg_accept(struct sock *sk, struct socket *newsock)
 	if (!type)
 		goto unlock;
 
-	sk2 = sk_alloc(sock_net(sk), PF_ALG, GFP_KERNEL, &alg_proto, 0);
+	sk2 = sk_alloc(sock_net(sk), PF_ALG, GFP_KERNEL, &alg_proto, kern);
 	err = -ENOMEM;
 	if (!sk2)
 		goto unlock;
@@ -323,9 +323,10 @@ unlock:
 }
 EXPORT_SYMBOL_GPL(af_alg_accept);
 
-static int alg_accept(struct socket *sock, struct socket *newsock, int flags)
+static int alg_accept(struct socket *sock, struct socket *newsock, int flags,
+		      bool kern)
 {
-	return af_alg_accept(sock->sk, newsock);
+	return af_alg_accept(sock->sk, newsock, kern);
 }
 
 static const struct proto_ops alg_proto_ops = {
diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c
index 54fc90e8339c..5e92bd275ef3 100644
--- a/crypto/algif_hash.c
+++ b/crypto/algif_hash.c
@@ -239,7 +239,8 @@ unlock:
 	return err ?: len;
 }
 
-static int hash_accept(struct socket *sock, struct socket *newsock, int flags)
+static int hash_accept(struct socket *sock, struct socket *newsock, int flags,
+		       bool kern)
 {
 	struct sock *sk = sock->sk;
 	struct alg_sock *ask = alg_sk(sk);
@@ -260,7 +261,7 @@ static int hash_accept(struct socket *sock, struct socket *newsock, int flags)
 	if (err)
 		return err;
 
-	err = af_alg_accept(ask->parent, newsock);
+	err = af_alg_accept(ask->parent, newsock, kern);
 	if (err)
 		return err;
 
@@ -378,7 +379,7 @@ static int hash_recvmsg_nokey(struct socket *sock, struct msghdr *msg,
 }
 
 static int hash_accept_nokey(struct socket *sock, struct socket *newsock,
-			     int flags)
+			     int flags, bool kern)
 {
 	int err;
 
@@ -386,7 +387,7 @@ static int hash_accept_nokey(struct socket *sock, struct socket *newsock,
 	if (err)
 		return err;
 
-	return hash_accept(sock, newsock, flags);
+	return hash_accept(sock, newsock, flags, kern);
 }
 
 static struct proto_ops algif_hash_ops_nokey = {
diff --git a/drivers/staging/lustre/lnet/lnet/lib-socket.c b/drivers/staging/lustre/lnet/lnet/lib-socket.c
index b7b87ecefcdf..9fca8d225ee0 100644
--- a/drivers/staging/lustre/lnet/lnet/lib-socket.c
+++ b/drivers/staging/lustre/lnet/lnet/lib-socket.c
@@ -532,7 +532,7 @@ lnet_sock_accept(struct socket **newsockp, struct socket *sock)
 
 	newsock->ops = sock->ops;
 
-	rc = sock->ops->accept(sock, newsock, O_NONBLOCK);
+	rc = sock->ops->accept(sock, newsock, O_NONBLOCK, false);
 	if (rc == -EAGAIN) {
 		/* Nothing ready, so wait for activity */
 		init_waitqueue_entry(&wait, current);
@@ -540,7 +540,7 @@ lnet_sock_accept(struct socket **newsockp, struct socket *sock)
 		set_current_state(TASK_INTERRUPTIBLE);
 		schedule();
 		remove_wait_queue(sk_sleep(sock->sk), &wait);
-		rc = sock->ops->accept(sock, newsock, O_NONBLOCK);
+		rc = sock->ops->accept(sock, newsock, O_NONBLOCK, false);
 	}
 
 	if (rc)
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 7d398d300e97..9382db998ec9 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -743,7 +743,7 @@ static int tcp_accept_from_sock(struct connection *con)
 	newsock->type = con->sock->type;
 	newsock->ops = con->sock->ops;
 
-	result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK);
+	result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK, true);
 	if (result < 0)
 		goto accept_err;
 
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 4348027384f5..d0ab7e56d0b4 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1863,7 +1863,7 @@ static int o2net_accept_one(struct socket *sock, int *more)
 
 	new_sock->type = sock->type;
 	new_sock->ops = sock->ops;
-	ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
+	ret = sock->ops->accept(sock, new_sock, O_NONBLOCK, false);
 	if (ret < 0)
 		goto out;
 
diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h
index a2bfd7843f18..e2b9c6fe2714 100644
--- a/include/crypto/if_alg.h
+++ b/include/crypto/if_alg.h
@@ -73,7 +73,7 @@ int af_alg_unregister_type(const struct af_alg_type *type);
 
 int af_alg_release(struct socket *sock);
 void af_alg_release_parent(struct sock *sk);
-int af_alg_accept(struct sock *sk, struct socket *newsock);
+int af_alg_accept(struct sock *sk, struct socket *newsock, bool kern);
 
 int af_alg_make_sg(struct af_alg_sgl *sgl, struct iov_iter *iter, int len);
 void af_alg_free_sg(struct af_alg_sgl *sgl);
diff --git a/include/linux/net.h b/include/linux/net.h
index cd0c8bd0a1de..0620f5e18c96 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -146,7 +146,7 @@ struct proto_ops {
 	int		(*socketpair)(struct socket *sock1,
 				      struct socket *sock2);
 	int		(*accept)    (struct socket *sock,
-				      struct socket *newsock, int flags);
+				      struct socket *newsock, int flags, bool kern);
 	int		(*getname)   (struct socket *sock,
 				      struct sockaddr *addr,
 				      int *sockaddr_len, int peer);
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index b7952d55b9c0..f39ae697347f 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -20,7 +20,8 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 			  int addr_len, int flags, int is_sendmsg);
 int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
 		       int addr_len, int flags);
-int inet_accept(struct socket *sock, struct socket *newsock, int flags);
+int inet_accept(struct socket *sock, struct socket *newsock, int flags,
+		bool kern);
 int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size);
 ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
 		      size_t size, int flags);
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 826f198374f8..c7a577976bec 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -258,7 +258,7 @@ inet_csk_rto_backoff(const struct inet_connection_sock *icsk,
         return (unsigned long)min_t(u64, when, max_when);
 }
 
-struct sock *inet_csk_accept(struct sock *sk, int flags, int *err);
+struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern);
 
 int inet_csk_get_port(struct sock *sk, unsigned short snum);
 
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index a244db5e5ff7..07a0b128625a 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -476,7 +476,8 @@ struct sctp_pf {
 	int  (*send_verify) (struct sctp_sock *, union sctp_addr *);
 	int  (*supported_addrs)(const struct sctp_sock *, __be16 *);
 	struct sock *(*create_accept_sk) (struct sock *sk,
-					  struct sctp_association *asoc);
+					  struct sctp_association *asoc,
+					  bool kern);
 	int (*addr_to_user)(struct sctp_sock *sk, union sctp_addr *addr);
 	void (*to_sk_saddr)(union sctp_addr *, struct sock *sk);
 	void (*to_sk_daddr)(union sctp_addr *, struct sock *sk);
diff --git a/include/net/sock.h b/include/net/sock.h
index 5e5997654db6..03252d53975d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -236,6 +236,7 @@ struct sock_common {
   *	@sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN
   *	@sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
   *	@sk_lock:	synchronizer
+  *	@sk_kern_sock: True if sock is using kernel lock classes
   *	@sk_rcvbuf: size of receive buffer in bytes
   *	@sk_wq: sock wait queue and async head
   *	@sk_rx_dst: receive input route used by early demux
@@ -430,7 +431,8 @@ struct sock {
 #endif
 
 	kmemcheck_bitfield_begin(flags);
-	unsigned int		sk_padding : 2,
+	unsigned int		sk_padding : 1,
+				sk_kern_sock : 1,
 				sk_no_check_tx : 1,
 				sk_no_check_rx : 1,
 				sk_userlocks : 4,
@@ -1015,7 +1017,8 @@ struct proto {
 					int addr_len);
 	int			(*disconnect)(struct sock *sk, int flags);
 
-	struct sock *		(*accept)(struct sock *sk, int flags, int *err);
+	struct sock *		(*accept)(struct sock *sk, int flags, int *err,
+					  bool kern);
 
 	int			(*ioctl)(struct sock *sk, int cmd,
 					 unsigned long arg);
@@ -1573,7 +1576,7 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
 int sock_no_bind(struct socket *, struct sockaddr *, int);
 int sock_no_connect(struct socket *, struct sockaddr *, int, int);
 int sock_no_socketpair(struct socket *, struct socket *);
-int sock_no_accept(struct socket *, struct socket *, int);
+int sock_no_accept(struct socket *, struct socket *, int, bool);
 int sock_no_getname(struct socket *, struct sockaddr *, int *, int);
 unsigned int sock_no_poll(struct file *, struct socket *,
 			  struct poll_table_struct *);
diff --git a/net/atm/svc.c b/net/atm/svc.c
index db9794ec61d8..5589de7086af 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -318,7 +318,8 @@ out:
 	return error;
 }
 
-static int svc_accept(struct socket *sock, struct socket *newsock, int flags)
+static int svc_accept(struct socket *sock, struct socket *newsock, int flags,
+		      bool kern)
 {
 	struct sock *sk = sock->sk;
 	struct sk_buff *skb;
@@ -329,7 +330,7 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags)
 
 	lock_sock(sk);
 
-	error = svc_create(sock_net(sk), newsock, 0, 0);
+	error = svc_create(sock_net(sk), newsock, 0, kern);
 	if (error)
 		goto out;
 
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index a8e42cedf1db..b7c486752b3a 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1320,7 +1320,8 @@ out_release:
 	return err;
 }
 
-static int ax25_accept(struct socket *sock, struct socket *newsock, int flags)
+static int ax25_accept(struct socket *sock, struct socket *newsock, int flags,
+		       bool kern)
 {
 	struct sk_buff *skb;
 	struct sock *newsk;
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index f307b145ea54..507b80d59dec 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -301,7 +301,7 @@ done:
 }
 
 static int l2cap_sock_accept(struct socket *sock, struct socket *newsock,
-			     int flags)
+			     int flags, bool kern)
 {
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 	struct sock *sk = sock->sk, *nsk;
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index aa1a814ceddc..ac3c650cb234 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -471,7 +471,8 @@ done:
 	return err;
 }
 
-static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int flags)
+static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int flags,
+			      bool kern)
 {
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 	struct sock *sk = sock->sk, *nsk;
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index e4e9a2da1e7e..728e0c8dc8e7 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -627,7 +627,7 @@ done:
 }
 
 static int sco_sock_accept(struct socket *sock, struct socket *newsock,
-			   int flags)
+			   int flags, bool kern)
 {
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 	struct sock *sk = sock->sk, *ch;
diff --git a/net/core/sock.c b/net/core/sock.c
index f6fd79f33097..a96d5f7a5734 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -197,66 +197,55 @@ EXPORT_SYMBOL(sk_net_capable);
 
 /*
  * Each address family might have different locking rules, so we have
- * one slock key per address family:
+ * one slock key per address family and separate keys for internal and
+ * userspace sockets.
  */
 static struct lock_class_key af_family_keys[AF_MAX];
+static struct lock_class_key af_family_kern_keys[AF_MAX];
 static struct lock_class_key af_family_slock_keys[AF_MAX];
+static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 
 /*
  * Make lock validator output more readable. (we pre-construct these
  * strings build-time, so that runtime initialization of socket
  * locks is fast):
  */
+
+#define _sock_locks(x)						  \
+  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
+  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
+  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
+  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
+  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
+  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
+  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
+  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
+  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
+  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
+  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
+  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
+  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
+  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
+  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_MAX"
+
 static const char *const af_family_key_strings[AF_MAX+1] = {
-  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
-  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
-  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
-  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
-  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
-  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
-  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
-  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
-  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
-  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
-  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
-  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
-  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
-  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_KCM"      ,
-  "sk_lock-AF_QIPCRTR", "sk_lock-AF_SMC"     , "sk_lock-AF_MAX"
+	_sock_locks("sk_lock-")
 };
 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
-  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
-  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
-  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
-  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
-  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
-  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
-  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
-  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
-  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
-  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
-  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
-  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
-  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
-  "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_KCM"       ,
-  "slock-AF_QIPCRTR", "slock-AF_SMC"     , "slock-AF_MAX"
+	_sock_locks("slock-")
 };
 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
-  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
-  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
-  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
-  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
-  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
-  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
-  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
-  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
-  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
-  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
-  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
-  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
-  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
-  "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_KCM"      ,
-  "clock-AF_QIPCRTR", "clock-AF_SMC"     , "clock-AF_MAX"
+	_sock_locks("clock-")
+};
+
+static const char *const af_family_kern_key_strings[AF_MAX+1] = {
+	_sock_locks("k-sk_lock-")
+};
+static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
+	_sock_locks("k-slock-")
+};
+static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
+	_sock_locks("k-clock-")
 };
 
 /*
@@ -264,6 +253,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
  * so split the lock classes by using a per-AF key:
  */
 static struct lock_class_key af_callback_keys[AF_MAX];
+static struct lock_class_key af_kern_callback_keys[AF_MAX];
 
 /* Take into consideration the size of the struct sk_buff overhead in the
  * determination of these values, since that is non-constant across
@@ -1293,7 +1283,16 @@ lenout:
  */
 static inline void sock_lock_init(struct sock *sk)
 {
-	sock_lock_init_class_and_name(sk,
+	if (sk->sk_kern_sock)
+		sock_lock_init_class_and_name(
+			sk,
+			af_family_kern_slock_key_strings[sk->sk_family],
+			af_family_kern_slock_keys + sk->sk_family,
+			af_family_kern_key_strings[sk->sk_family],
+			af_family_kern_keys + sk->sk_family);
+	else
+		sock_lock_init_class_and_name(
+			sk,
 			af_family_slock_key_strings[sk->sk_family],
 			af_family_slock_keys + sk->sk_family,
 			af_family_key_strings[sk->sk_family],
@@ -1399,6 +1398,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		 * why we need sk_prot_creator -acme
 		 */
 		sk->sk_prot = sk->sk_prot_creator = prot;
+		sk->sk_kern_sock = kern;
 		sock_lock_init(sk);
 		sk->sk_net_refcnt = kern ? 0 : 1;
 		if (likely(sk->sk_net_refcnt))
@@ -2277,7 +2277,8 @@ int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
 }
 EXPORT_SYMBOL(sock_no_socketpair);
 
-int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
+int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
+		   bool kern)
 {
 	return -EOPNOTSUPP;
 }
@@ -2481,7 +2482,14 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	}
 
 	rwlock_init(&sk->sk_callback_lock);
-	lockdep_set_class_and_name(&sk->sk_callback_lock,
+	if (sk->sk_kern_sock)
+		lockdep_set_class_and_name(
+			&sk->sk_callback_lock,
+			af_kern_callback_keys + sk->sk_family,
+			af_family_kern_clock_key_strings[sk->sk_family]);
+	else
+		lockdep_set_class_and_name(
+			&sk->sk_callback_lock,
 			af_callback_keys + sk->sk_family,
 			af_family_clock_key_strings[sk->sk_family]);
 
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index e6e79eda9763..7de5b40a5d0d 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -1070,7 +1070,8 @@ static struct sk_buff *dn_wait_for_connect(struct sock *sk, long *timeo)
 	return skb == NULL ? ERR_PTR(err) : skb;
 }
 
-static int dn_accept(struct socket *sock, struct socket *newsock, int flags)
+static int dn_accept(struct socket *sock, struct socket *newsock, int flags,
+		     bool kern)
 {
 	struct sock *sk = sock->sk, *newsk;
 	struct sk_buff *skb = NULL;
@@ -1099,7 +1100,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags)
 
 	cb = DN_SKB_CB(skb);
 	sk->sk_ack_backlog--;
-	newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, 0);
+	newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, kern);
 	if (newsk == NULL) {
 		release_sock(sk);
 		kfree_skb(skb);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 5091f46826fa..6b1fc6e4278e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -689,11 +689,12 @@ EXPORT_SYMBOL(inet_stream_connect);
  *	Accept a pending connection. The TCP layer now gives BSD semantics.
  */
 
-int inet_accept(struct socket *sock, struct socket *newsock, int flags)
+int inet_accept(struct socket *sock, struct socket *newsock, int flags,
+		bool kern)
 {
 	struct sock *sk1 = sock->sk;
 	int err = -EINVAL;
-	struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);
+	struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err, kern);
 
 	if (!sk2)
 		goto do_err;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index b4d5980ade3b..5e313c1ac94f 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -424,7 +424,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
 /*
  * This will accept the next outstanding connection.
  */
-struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
+struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 81adc29a448d..8d77ad5cadaf 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -828,7 +828,8 @@ out:
  *    Wait for incoming connection
  *
  */
-static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
+static int irda_accept(struct socket *sock, struct socket *newsock, int flags,
+		       bool kern)
 {
 	struct sock *sk = sock->sk;
 	struct irda_sock *new, *self = irda_sk(sk);
@@ -836,7 +837,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
 	struct sk_buff *skb = NULL;
 	int err;
 
-	err = irda_create(sock_net(sk), newsock, sk->sk_protocol, 0);
+	err = irda_create(sock_net(sk), newsock, sk->sk_protocol, kern);
 	if (err)
 		return err;
 
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 89bbde1081ce..84de7b6326dc 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -938,7 +938,7 @@ done:
 
 /* Accept a pending connection */
 static int iucv_sock_accept(struct socket *sock, struct socket *newsock,
-			    int flags)
+			    int flags, bool kern)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	struct sock *sk = sock->sk, *nsk;
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 06186d608a27..cb4fff785cbf 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -641,11 +641,13 @@ static void llc_cmsg_rcv(struct msghdr *msg, struct sk_buff *skb)
  *	@sock: Socket which connections arrive on.
  *	@newsock: Socket to move incoming connection to.
  *	@flags: User specified operational flags.
+ *	@kern: If the socket is kernel internal
  *
  *	Accept a new incoming connection.
  *	Returns 0 upon success, negative otherwise.
  */
-static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags)
+static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags,
+			 bool kern)
 {
 	struct sock *sk = sock->sk, *newsk;
 	struct llc_sock *llc, *newllc;
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 4bbf4526b885..ebf16f7f9089 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -765,7 +765,8 @@ out_release:
 	return err;
 }
 
-static int nr_accept(struct socket *sock, struct socket *newsock, int flags)
+static int nr_accept(struct socket *sock, struct socket *newsock, int flags,
+		     bool kern)
 {
 	struct sk_buff *skb;
 	struct sock *newsk;
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index 879885b31cce..2ffb18e73df6 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -441,7 +441,7 @@ struct sock *nfc_llcp_accept_dequeue(struct sock *parent,
 }
 
 static int llcp_sock_accept(struct socket *sock, struct socket *newsock,
-			    int flags)
+			    int flags, bool kern)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	struct sock *sk = sock->sk, *new_sk;
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index 222bedcd9575..e81537991ddf 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -772,7 +772,8 @@ static void pep_sock_close(struct sock *sk, long timeout)
 	sock_put(sk);
 }
 
-static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp)
+static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp,
+				    bool kern)
 {
 	struct pep_sock *pn = pep_sk(sk), *newpn;
 	struct sock *newsk = NULL;
@@ -846,7 +847,8 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp)
 	}
 
 	/* Create a new to-be-accepted sock */
-	newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot, 0);
+	newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot,
+			 kern);
 	if (!newsk) {
 		pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL);
 		err = -ENOBUFS;
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index a6c8da3ee893..64634e3ec2fc 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -305,7 +305,7 @@ out:
 }
 
 static int pn_socket_accept(struct socket *sock, struct socket *newsock,
-				int flags)
+			    int flags, bool kern)
 {
 	struct sock *sk = sock->sk;
 	struct sock *newsk;
@@ -314,7 +314,7 @@ static int pn_socket_accept(struct socket *sock, struct socket *newsock,
 	if (unlikely(sk->sk_state != TCP_LISTEN))
 		return -EINVAL;
 
-	newsk = sk->sk_prot->accept(sk, flags, &err);
+	newsk = sk->sk_prot->accept(sk, flags, &err, kern);
 	if (!newsk)
 		return err;
 
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 2c69a508a693..507678853e6c 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -133,7 +133,7 @@ int rds_tcp_accept_one(struct socket *sock)
 
 	new_sock->type = sock->type;
 	new_sock->ops = sock->ops;
-	ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
+	ret = sock->ops->accept(sock, new_sock, O_NONBLOCK, true);
 	if (ret < 0)
 		goto out;
 
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index b8a1df2c9785..4a9729257023 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -871,7 +871,8 @@ out_release:
 	return err;
 }
 
-static int rose_accept(struct socket *sock, struct socket *newsock, int flags)
+static int rose_accept(struct socket *sock, struct socket *newsock, int flags,
+		       bool kern)
 {
 	struct sk_buff *skb;
 	struct sock *newsk;
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 063baac5b9fe..961ee59f696a 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -640,14 +640,15 @@ static sctp_scope_t sctp_v6_scope(union sctp_addr *addr)
 
 /* Create and initialize a new sk for the socket to be returned by accept(). */
 static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
-					     struct sctp_association *asoc)
+					     struct sctp_association *asoc,
+					     bool kern)
 {
 	struct sock *newsk;
 	struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
 	struct sctp6_sock *newsctp6sk;
 	struct ipv6_txoptions *opt;
 
-	newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, 0);
+	newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, kern);
 	if (!newsk)
 		goto out;
 
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 1b6d4574d2b0..989a900383b5 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -575,10 +575,11 @@ static int sctp_v4_is_ce(const struct sk_buff *skb)
 
 /* Create and initialize a new sk for the socket returned by accept(). */
 static struct sock *sctp_v4_create_accept_sk(struct sock *sk,
-					     struct sctp_association *asoc)
+					     struct sctp_association *asoc,
+					     bool kern)
 {
 	struct sock *newsk = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL,
-			sk->sk_prot, 0);
+			sk->sk_prot, kern);
 	struct inet_sock *newinet;
 
 	if (!newsk)
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 6f0a9be50f50..0f378ea2ae38 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4116,7 +4116,7 @@ static int sctp_disconnect(struct sock *sk, int flags)
  * descriptor will be returned from accept() to represent the newly
  * formed association.
  */
-static struct sock *sctp_accept(struct sock *sk, int flags, int *err)
+static struct sock *sctp_accept(struct sock *sk, int flags, int *err, bool kern)
 {
 	struct sctp_sock *sp;
 	struct sctp_endpoint *ep;
@@ -4151,7 +4151,7 @@ static struct sock *sctp_accept(struct sock *sk, int flags, int *err)
 	 */
 	asoc = list_entry(ep->asocs.next, struct sctp_association, asocs);
 
-	newsk = sp->pf->create_accept_sk(sk, asoc);
+	newsk = sp->pf->create_accept_sk(sk, asoc, kern);
 	if (!newsk) {
 		error = -ENOMEM;
 		goto out;
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 85837ab90e89..093803786eac 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -944,7 +944,7 @@ out:
 }
 
 static int smc_accept(struct socket *sock, struct socket *new_sock,
-		      int flags)
+		      int flags, bool kern)
 {
 	struct sock *sk = sock->sk, *nsk;
 	DECLARE_WAITQUEUE(wait, current);
diff --git a/net/socket.c b/net/socket.c
index e0757e648c0c..e034fe4164be 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1506,7 +1506,7 @@ SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
 	if (err)
 		goto out_fd;
 
-	err = sock->ops->accept(sock, newsock, sock->file->f_flags);
+	err = sock->ops->accept(sock, newsock, sock->file->f_flags, false);
 	if (err < 0)
 		goto out_fd;
 
@@ -3239,7 +3239,7 @@ int kernel_accept(struct socket *sock, struct socket **newsock, int flags)
 	if (err < 0)
 		goto done;
 
-	err = sock->ops->accept(sock, *newsock, flags);
+	err = sock->ops->accept(sock, *newsock, flags, true);
 	if (err < 0) {
 		sock_release(*newsock);
 		*newsock = NULL;
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 43e4045e72bc..7130e73bd42c 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -115,7 +115,8 @@ static void tipc_data_ready(struct sock *sk);
 static void tipc_write_space(struct sock *sk);
 static void tipc_sock_destruct(struct sock *sk);
 static int tipc_release(struct socket *sock);
-static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags);
+static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags,
+		       bool kern);
 static void tipc_sk_timeout(unsigned long data);
 static int tipc_sk_publish(struct tipc_sock *tsk, uint scope,
 			   struct tipc_name_seq const *seq);
@@ -2029,7 +2030,8 @@ static int tipc_wait_for_accept(struct socket *sock, long timeo)
  *
  * Returns 0 on success, errno otherwise
  */
-static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags)
+static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags,
+		       bool kern)
 {
 	struct sock *new_sk, *sk = sock->sk;
 	struct sk_buff *buf;
@@ -2051,7 +2053,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags)
 
 	buf = skb_peek(&sk->sk_receive_queue);
 
-	res = tipc_sk_create(sock_net(sock->sk), new_sock, 0, 0);
+	res = tipc_sk_create(sock_net(sock->sk), new_sock, 0, kern);
 	if (res)
 		goto exit;
 	security_sk_clone(sock->sk, new_sock->sk);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index ee37b390260a..928691c43408 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -636,7 +636,7 @@ static int unix_bind(struct socket *, struct sockaddr *, int);
 static int unix_stream_connect(struct socket *, struct sockaddr *,
 			       int addr_len, int flags);
 static int unix_socketpair(struct socket *, struct socket *);
-static int unix_accept(struct socket *, struct socket *, int);
+static int unix_accept(struct socket *, struct socket *, int, bool);
 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
 static unsigned int unix_dgram_poll(struct file *, struct socket *,
@@ -1402,7 +1402,8 @@ static void unix_sock_inherit_flags(const struct socket *old,
 		set_bit(SOCK_PASSSEC, &new->flags);
 }
 
-static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
+static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
+		       bool kern)
 {
 	struct sock *sk = sock->sk;
 	struct sock *tsk;
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 9192ead66751..9f770f33c100 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1250,7 +1250,8 @@ out:
 	return err;
 }
 
-static int vsock_accept(struct socket *sock, struct socket *newsock, int flags)
+static int vsock_accept(struct socket *sock, struct socket *newsock, int flags,
+			bool kern)
 {
 	struct sock *listener;
 	int err;
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index fd28a49dbe8f..8b911c29860e 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -852,7 +852,8 @@ static int x25_wait_for_data(struct sock *sk, long timeout)
 	return rc;
 }
 
-static int x25_accept(struct socket *sock, struct socket *newsock, int flags)
+static int x25_accept(struct socket *sock, struct socket *newsock, int flags,
+		      bool kern)
 {
 	struct sock *sk = sock->sk;
 	struct sock *newsk;
-- 
cgit v1.2.3


From 40c50c1fecdf012a3bf055ec813f0ef2eda2749c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 10 Mar 2017 13:17:18 +0100
Subject: kexec, x86/purgatory: Unbreak it and clean it up

The purgatory code defines global variables which are referenced via a
symbol lookup in the kexec code (core and arch).

A recent commit addressing sparse warnings made these static and thereby
broke kexec_file.

Why did this happen? Simply because the whole machinery is undocumented and
lacks any form of forward declarations. The variable names are unspecific
and lack a prefix, so adding forward declarations creates shadow variables
in the core code. Aside of that the code relies on magic constants and
duplicate struct definitions with no way to ensure that these things stay
in sync. The section placement of the purgatory variables happened by
chance and not by design.

Unbreak kexec and cleanup the mess:

 - Add proper forward declarations and document the usage
 - Use common struct definition
 - Use the proper common defines instead of magic constants
 - Add a purgatory_ prefix to have a proper name space
 - Use ARRAY_SIZE() instead of a homebrewn reimplementation
 - Add proper sections to the purgatory variables [ From Mike ]

Fixes: 72042a8c7b01 ("x86/purgatory: Make functions and variables static")
Reported-by: Mike Galbraith <<efault@gmx.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Nicholas Mc Guire <der.herr@hofr.at>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: "Tobin C. Harding" <me@tobin.cc>
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1703101315140.3681@nanos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/powerpc/purgatory/trampoline.S | 12 ++++++------
 arch/x86/include/asm/purgatory.h    | 20 ++++++++++++++++++++
 arch/x86/kernel/machine_kexec_64.c  |  9 ++++++---
 arch/x86/purgatory/purgatory.c      | 35 +++++++++++++++++------------------
 arch/x86/purgatory/purgatory.h      |  8 --------
 arch/x86/purgatory/setup-x86_64.S   |  2 +-
 arch/x86/purgatory/sha256.h         |  1 -
 include/linux/purgatory.h           | 23 +++++++++++++++++++++++
 kernel/kexec_file.c                 |  8 ++++----
 kernel/kexec_internal.h             |  6 +-----
 10 files changed, 78 insertions(+), 46 deletions(-)
 create mode 100644 arch/x86/include/asm/purgatory.h
 delete mode 100644 arch/x86/purgatory/purgatory.h
 create mode 100644 include/linux/purgatory.h

(limited to 'include')

diff --git a/arch/powerpc/purgatory/trampoline.S b/arch/powerpc/purgatory/trampoline.S
index f9760ccf4032..3696ea6c4826 100644
--- a/arch/powerpc/purgatory/trampoline.S
+++ b/arch/powerpc/purgatory/trampoline.S
@@ -116,13 +116,13 @@ dt_offset:
 
 	.data
 	.balign 8
-.globl sha256_digest
-sha256_digest:
+.globl purgatory_sha256_digest
+purgatory_sha256_digest:
 	.skip	32
-	.size sha256_digest, . - sha256_digest
+	.size purgatory_sha256_digest, . - purgatory_sha256_digest
 
 	.balign 8
-.globl sha_regions
-sha_regions:
+.globl purgatory_sha_regions
+purgatory_sha_regions:
 	.skip	8 * 2 * 16
-	.size sha_regions, . - sha_regions
+	.size purgatory_sha_regions, . - purgatory_sha_regions
diff --git a/arch/x86/include/asm/purgatory.h b/arch/x86/include/asm/purgatory.h
new file mode 100644
index 000000000000..d7da2729903d
--- /dev/null
+++ b/arch/x86/include/asm/purgatory.h
@@ -0,0 +1,20 @@
+#ifndef _ASM_X86_PURGATORY_H
+#define _ASM_X86_PURGATORY_H
+
+#ifndef __ASSEMBLY__
+#include <linux/purgatory.h>
+
+extern void purgatory(void);
+/*
+ * These forward declarations serve two purposes:
+ *
+ * 1) Make sparse happy when checking arch/purgatory
+ * 2) Document that these are required to be global so the symbol
+ *    lookup in kexec works
+ */
+extern unsigned long purgatory_backup_dest;
+extern unsigned long purgatory_backup_src;
+extern unsigned long purgatory_backup_sz;
+#endif	/* __ASSEMBLY__ */
+
+#endif /* _ASM_PURGATORY_H */
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 307b1f4543de..857cdbd02867 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -194,19 +194,22 @@ static int arch_update_purgatory(struct kimage *image)
 
 	/* Setup copying of backup region */
 	if (image->type == KEXEC_TYPE_CRASH) {
-		ret = kexec_purgatory_get_set_symbol(image, "backup_dest",
+		ret = kexec_purgatory_get_set_symbol(image,
+				"purgatory_backup_dest",
 				&image->arch.backup_load_addr,
 				sizeof(image->arch.backup_load_addr), 0);
 		if (ret)
 			return ret;
 
-		ret = kexec_purgatory_get_set_symbol(image, "backup_src",
+		ret = kexec_purgatory_get_set_symbol(image,
+				"purgatory_backup_src",
 				&image->arch.backup_src_start,
 				sizeof(image->arch.backup_src_start), 0);
 		if (ret)
 			return ret;
 
-		ret = kexec_purgatory_get_set_symbol(image, "backup_sz",
+		ret = kexec_purgatory_get_set_symbol(image,
+				"purgatory_backup_sz",
 				&image->arch.backup_src_sz,
 				sizeof(image->arch.backup_src_sz), 0);
 		if (ret)
diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c
index b6d5c8946e66..470edad96bb9 100644
--- a/arch/x86/purgatory/purgatory.c
+++ b/arch/x86/purgatory/purgatory.c
@@ -10,22 +10,19 @@
  * Version 2.  See the file COPYING for more details.
  */
 
+#include <linux/bug.h>
+#include <asm/purgatory.h>
+
 #include "sha256.h"
-#include "purgatory.h"
 #include "../boot/string.h"
 
-struct sha_region {
-	unsigned long start;
-	unsigned long len;
-};
-
-static unsigned long backup_dest;
-static unsigned long backup_src;
-static unsigned long backup_sz;
+unsigned long purgatory_backup_dest __section(.kexec-purgatory);
+unsigned long purgatory_backup_src __section(.kexec-purgatory);
+unsigned long purgatory_backup_sz __section(.kexec-purgatory);
 
-static u8 sha256_digest[SHA256_DIGEST_SIZE] = { 0 };
+u8 purgatory_sha256_digest[SHA256_DIGEST_SIZE] __section(.kexec-purgatory);
 
-struct sha_region sha_regions[16] = {};
+struct kexec_sha_region purgatory_sha_regions[KEXEC_SEGMENT_MAX] __section(.kexec-purgatory);
 
 /*
  * On x86, second kernel requries first 640K of memory to boot. Copy
@@ -34,26 +31,28 @@ struct sha_region sha_regions[16] = {};
  */
 static int copy_backup_region(void)
 {
-	if (backup_dest)
-		memcpy((void *)backup_dest, (void *)backup_src, backup_sz);
-
+	if (purgatory_backup_dest) {
+		memcpy((void *)purgatory_backup_dest,
+		       (void *)purgatory_backup_src, purgatory_backup_sz);
+	}
 	return 0;
 }
 
 static int verify_sha256_digest(void)
 {
-	struct sha_region *ptr, *end;
+	struct kexec_sha_region *ptr, *end;
 	u8 digest[SHA256_DIGEST_SIZE];
 	struct sha256_state sctx;
 
 	sha256_init(&sctx);
-	end = &sha_regions[sizeof(sha_regions)/sizeof(sha_regions[0])];
-	for (ptr = sha_regions; ptr < end; ptr++)
+	end = purgatory_sha_regions + ARRAY_SIZE(purgatory_sha_regions);
+
+	for (ptr = purgatory_sha_regions; ptr < end; ptr++)
 		sha256_update(&sctx, (uint8_t *)(ptr->start), ptr->len);
 
 	sha256_final(&sctx, digest);
 
-	if (memcmp(digest, sha256_digest, sizeof(digest)))
+	if (memcmp(digest, purgatory_sha256_digest, sizeof(digest)))
 		return 1;
 
 	return 0;
diff --git a/arch/x86/purgatory/purgatory.h b/arch/x86/purgatory/purgatory.h
deleted file mode 100644
index e2e365a6c192..000000000000
--- a/arch/x86/purgatory/purgatory.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef PURGATORY_H
-#define PURGATORY_H
-
-#ifndef __ASSEMBLY__
-extern void purgatory(void);
-#endif	/* __ASSEMBLY__ */
-
-#endif /* PURGATORY_H */
diff --git a/arch/x86/purgatory/setup-x86_64.S b/arch/x86/purgatory/setup-x86_64.S
index f90e9dfa90bb..dfae9b9e60b5 100644
--- a/arch/x86/purgatory/setup-x86_64.S
+++ b/arch/x86/purgatory/setup-x86_64.S
@@ -9,7 +9,7 @@
  * This source code is licensed under the GNU General Public License,
  * Version 2.  See the file COPYING for more details.
  */
-#include "purgatory.h"
+#include <asm/purgatory.h>
 
 	.text
 	.globl purgatory_start
diff --git a/arch/x86/purgatory/sha256.h b/arch/x86/purgatory/sha256.h
index bd15a4127735..2867d9825a57 100644
--- a/arch/x86/purgatory/sha256.h
+++ b/arch/x86/purgatory/sha256.h
@@ -10,7 +10,6 @@
 #ifndef SHA256_H
 #define SHA256_H
 
-
 #include <linux/types.h>
 #include <crypto/sha.h>
 
diff --git a/include/linux/purgatory.h b/include/linux/purgatory.h
new file mode 100644
index 000000000000..d60d4e278609
--- /dev/null
+++ b/include/linux/purgatory.h
@@ -0,0 +1,23 @@
+#ifndef _LINUX_PURGATORY_H
+#define _LINUX_PURGATORY_H
+
+#include <linux/types.h>
+#include <crypto/sha.h>
+#include <uapi/linux/kexec.h>
+
+struct kexec_sha_region {
+	unsigned long start;
+	unsigned long len;
+};
+
+/*
+ * These forward declarations serve two purposes:
+ *
+ * 1) Make sparse happy when checking arch/purgatory
+ * 2) Document that these are required to be global so the symbol
+ *    lookup in kexec works
+ */
+extern struct kexec_sha_region purgatory_sha_regions[KEXEC_SEGMENT_MAX];
+extern u8 purgatory_sha256_digest[SHA256_DIGEST_SIZE];
+
+#endif
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index b56a558e406d..b118735fea9d 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -614,13 +614,13 @@ static int kexec_calculate_store_digests(struct kimage *image)
 		ret = crypto_shash_final(desc, digest);
 		if (ret)
 			goto out_free_digest;
-		ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
-						sha_regions, sha_region_sz, 0);
+		ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha_regions",
+						     sha_regions, sha_region_sz, 0);
 		if (ret)
 			goto out_free_digest;
 
-		ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
-						digest, SHA256_DIGEST_SIZE, 0);
+		ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha256_digest",
+						     digest, SHA256_DIGEST_SIZE, 0);
 		if (ret)
 			goto out_free_digest;
 	}
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 4cef7e4706b0..799a8a452187 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -15,11 +15,7 @@ int kimage_is_destination_range(struct kimage *image,
 extern struct mutex kexec_mutex;
 
 #ifdef CONFIG_KEXEC_FILE
-struct kexec_sha_region {
-	unsigned long start;
-	unsigned long len;
-};
-
+#include <linux/purgatory.h>
 void kimage_file_post_load_cleanup(struct kimage *image);
 #else /* CONFIG_KEXEC_FILE */
 static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
-- 
cgit v1.2.3


From c962cff17dfa11f4a8227ac16de2b28aea3312e4 Mon Sep 17 00:00:00 2001
From: Dou Liyang <douly.fnst@cn.fujitsu.com>
Date: Fri, 3 Mar 2017 16:02:23 +0800
Subject: Revert "x86/acpi: Set persistent cpuid <-> nodeid mapping when
 booting"

Revert: dc6db24d2476 ("x86/acpi: Set persistent cpuid <-> nodeid mapping when booting")

The mapping of "cpuid <-> nodeid" is established at boot time via ACPI
tables to keep associations of workqueues and other node related items
consistent across cpu hotplug.

But, ACPI tables are unreliable and failures with that boot time mapping
have been reported on machines where the ACPI table and the physical
information which is retrieved at actual hotplug is inconsistent.

Revert the mapping implementation so it can be replaced with a less error
prone approach.

Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
Tested-by: Xiaolong Ye <xiaolong.ye@intel.com>
Cc: rjw@rjwysocki.net
Cc: linux-acpi@vger.kernel.org
Cc: guzheng1@huawei.com
Cc: izumi.taku@jp.fujitsu.com
Cc: lenb@kernel.org
Link: http://lkml.kernel.org/r/1488528147-2279-2-git-send-email-douly.fnst@cn.fujitsu.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/acpi/boot.c   |  2 +-
 drivers/acpi/acpi_processor.c |  5 ---
 drivers/acpi/bus.c            |  1 -
 drivers/acpi/processor_core.c | 73 -------------------------------------------
 include/linux/acpi.h          |  3 --
 5 files changed, 1 insertion(+), 83 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index ae32838cac5f..f6b0e87d2388 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -710,7 +710,7 @@ static void __init acpi_set_irq_model_ioapic(void)
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
 #include <acpi/processor.h>
 
-int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
 {
 #ifdef CONFIG_ACPI_NUMA
 	int nid;
diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c
index 4467a8089ab8..5d208a99d0c9 100644
--- a/drivers/acpi/acpi_processor.c
+++ b/drivers/acpi/acpi_processor.c
@@ -182,11 +182,6 @@ int __weak arch_register_cpu(int cpu)
 
 void __weak arch_unregister_cpu(int cpu) {}
 
-int __weak acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
-{
-	return -ENODEV;
-}
-
 static int acpi_processor_hotadd_init(struct acpi_processor *pr)
 {
 	unsigned long long sta;
diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 80cb5eb75b63..34fbe027e73a 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -1249,7 +1249,6 @@ static int __init acpi_init(void)
 	acpi_wakeup_device_init();
 	acpi_debugger_init();
 	acpi_setup_sb_notify_handler();
-	acpi_set_processor_mapping();
 	return 0;
 }
 
diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c
index 611a5585a902..a84386204659 100644
--- a/drivers/acpi/processor_core.c
+++ b/drivers/acpi/processor_core.c
@@ -278,79 +278,6 @@ int acpi_get_cpuid(acpi_handle handle, int type, u32 acpi_id)
 }
 EXPORT_SYMBOL_GPL(acpi_get_cpuid);
 
-#ifdef CONFIG_ACPI_HOTPLUG_CPU
-static bool __init
-map_processor(acpi_handle handle, phys_cpuid_t *phys_id, int *cpuid)
-{
-	int type, id;
-	u32 acpi_id;
-	acpi_status status;
-	acpi_object_type acpi_type;
-	unsigned long long tmp;
-	union acpi_object object = { 0 };
-	struct acpi_buffer buffer = { sizeof(union acpi_object), &object };
-
-	status = acpi_get_type(handle, &acpi_type);
-	if (ACPI_FAILURE(status))
-		return false;
-
-	switch (acpi_type) {
-	case ACPI_TYPE_PROCESSOR:
-		status = acpi_evaluate_object(handle, NULL, NULL, &buffer);
-		if (ACPI_FAILURE(status))
-			return false;
-		acpi_id = object.processor.proc_id;
-
-		/* validate the acpi_id */
-		if(acpi_processor_validate_proc_id(acpi_id))
-			return false;
-		break;
-	case ACPI_TYPE_DEVICE:
-		status = acpi_evaluate_integer(handle, "_UID", NULL, &tmp);
-		if (ACPI_FAILURE(status))
-			return false;
-		acpi_id = tmp;
-		break;
-	default:
-		return false;
-	}
-
-	type = (acpi_type == ACPI_TYPE_DEVICE) ? 1 : 0;
-
-	*phys_id = __acpi_get_phys_id(handle, type, acpi_id, false);
-	id = acpi_map_cpuid(*phys_id, acpi_id);
-
-	if (id < 0)
-		return false;
-	*cpuid = id;
-	return true;
-}
-
-static acpi_status __init
-set_processor_node_mapping(acpi_handle handle, u32 lvl, void *context,
-			   void **rv)
-{
-	phys_cpuid_t phys_id;
-	int cpu_id;
-
-	if (!map_processor(handle, &phys_id, &cpu_id))
-		return AE_ERROR;
-
-	acpi_map_cpu2node(handle, cpu_id, phys_id);
-	return AE_OK;
-}
-
-void __init acpi_set_processor_mapping(void)
-{
-	/* Set persistent cpu <-> node mapping for all processors. */
-	acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
-			    ACPI_UINT32_MAX, set_processor_node_mapping,
-			    NULL, NULL, NULL);
-}
-#else
-void __init acpi_set_processor_mapping(void) {}
-#endif /* CONFIG_ACPI_HOTPLUG_CPU */
-
 #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
 static int get_ioapic_id(struct acpi_subtable_header *entry, u32 gsi_base,
 			 u64 *phys_addr, int *ioapic_id)
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 673acda012af..63a7519b00cc 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -294,11 +294,8 @@ bool acpi_processor_validate_proc_id(int proc_id);
 int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id,
 		 int *pcpu);
 int acpi_unmap_cpu(int cpu);
-int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid);
 #endif /* CONFIG_ACPI_HOTPLUG_CPU */
 
-void acpi_set_processor_mapping(void);
-
 #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
 int acpi_get_ioapic_id(acpi_handle handle, u32 gsi_base, u64 *phys_addr);
 #endif
-- 
cgit v1.2.3


From a77d6cd968497792e072b74dff45b891ba778ddb Mon Sep 17 00:00:00 2001
From: Dou Liyang <douly.fnst@cn.fujitsu.com>
Date: Fri, 3 Mar 2017 16:02:27 +0800
Subject: acpi/processor: Check for duplicate processor ids at hotplug time

The check for duplicate processor ids happens at boot time based on the
ACPI table contents, but the final sanity checks for a processor happen
at hotplug time.

At hotplug time, where the physical information is available, which might
differ from the ACPI table information, a check for duplicate processor
ids is missing.

Add it to the hotplug checks and rename the function so it better
reflects its purpose.

Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
Tested-by: Xiaolong Ye <xiaolong.ye@intel.com>
Cc: rjw@rjwysocki.net
Cc: linux-acpi@vger.kernel.org
Cc: guzheng1@huawei.com
Cc: izumi.taku@jp.fujitsu.com
Cc: lenb@kernel.org
Link: http://lkml.kernel.org/r/1488528147-2279-6-git-send-email-douly.fnst@cn.fujitsu.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/acpi/acpi_processor.c | 13 ++++++++++---
 include/linux/acpi.h          |  2 +-
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c
index 9a98d7e00200..0143135b3abe 100644
--- a/drivers/acpi/acpi_processor.c
+++ b/drivers/acpi/acpi_processor.c
@@ -280,6 +280,13 @@ static int acpi_processor_get_info(struct acpi_device *device)
 		pr->acpi_id = value;
 	}
 
+	if (acpi_duplicate_processor_id(pr->acpi_id)) {
+		dev_err(&device->dev,
+			"Failed to get unique processor _UID (0x%x)\n",
+			pr->acpi_id);
+		return -ENODEV;
+	}
+
 	pr->phys_id = acpi_get_phys_id(pr->handle, device_declaration,
 					pr->acpi_id);
 	if (invalid_phys_cpuid(pr->phys_id))
@@ -580,7 +587,7 @@ static struct acpi_scan_handler processor_container_handler = {
 static int nr_unique_ids __initdata;
 
 /* The number of the duplicate processor IDs */
-static int nr_duplicate_ids __initdata;
+static int nr_duplicate_ids;
 
 /* Used to store the unique processor IDs */
 static int unique_processor_ids[] __initdata = {
@@ -588,7 +595,7 @@ static int unique_processor_ids[] __initdata = {
 };
 
 /* Used to store the duplicate processor IDs */
-static int duplicate_processor_ids[] __initdata = {
+static int duplicate_processor_ids[] = {
 	[0 ... NR_CPUS - 1] = -1,
 };
 
@@ -679,7 +686,7 @@ void __init acpi_processor_check_duplicates(void)
 						NULL, NULL);
 }
 
-bool __init acpi_processor_validate_proc_id(int proc_id)
+bool acpi_duplicate_processor_id(int proc_id)
 {
 	int i;
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 63a7519b00cc..9b05886f9773 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -287,7 +287,7 @@ static inline bool invalid_phys_cpuid(phys_cpuid_t phys_id)
 }
 
 /* Validate the processor object's proc_id */
-bool acpi_processor_validate_proc_id(int proc_id);
+bool acpi_duplicate_processor_id(int proc_id);
 
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
 /* Arch dependent functions for cpu hotplug support */
-- 
cgit v1.2.3


From 65869a47f3488253f5fd88cc4f14e0a4e2601a55 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 11 Mar 2017 16:55:49 +0100
Subject: bpf: improve read-only handling

Improve bpf_{prog,jit_binary}_{un,}lock_ro() by throwing a
one-time warning in case of an error when the image couldn't
be set read-only, and also mark struct bpf_prog as locked when
bpf_prog_lock_ro() was called.

Reason for the latter is that bpf_prog_unlock_ro() is called from
various places including error paths, and we shouldn't mess with
page attributes when really not needed.

For bpf_jit_binary_unlock_ro() this is not needed as jited flag
implicitly indicates this, thus for archs with ARCH_HAS_SET_MEMORY
we're guaranteed to have a previously locked image. Overall, this
should also help us to identify any further potential issues with
set_memory_*() helpers.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0c167fdee5f7..fbf7b39e8103 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -409,6 +409,7 @@ struct bpf_prog {
 	u16			pages;		/* Number of allocated pages */
 	kmemcheck_bitfield_begin(meta);
 	u16			jited:1,	/* Is our filter JIT'ed? */
+				locked:1,	/* Program image locked? */
 				gpl_compatible:1, /* Is filter GPL compatible? */
 				cb_access:1,	/* Is control block accessed? */
 				dst_needed:1,	/* Do we need dst entry? */
@@ -554,22 +555,29 @@ static inline bool bpf_prog_was_classic(const struct bpf_prog *prog)
 #ifdef CONFIG_ARCH_HAS_SET_MEMORY
 static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
 {
-	set_memory_ro((unsigned long)fp, fp->pages);
+	fp->locked = 1;
+	WARN_ON_ONCE(set_memory_ro((unsigned long)fp, fp->pages));
 }
 
 static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
 {
-	set_memory_rw((unsigned long)fp, fp->pages);
+	if (fp->locked) {
+		WARN_ON_ONCE(set_memory_rw((unsigned long)fp, fp->pages));
+		/* In case set_memory_rw() fails, we want to be the first
+		 * to crash here instead of some random place later on.
+		 */
+		fp->locked = 0;
+	}
 }
 
 static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
 {
-	set_memory_ro((unsigned long)hdr, hdr->pages);
+	WARN_ON_ONCE(set_memory_ro((unsigned long)hdr, hdr->pages));
 }
 
 static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr)
 {
-	set_memory_rw((unsigned long)hdr, hdr->pages);
+	WARN_ON_ONCE(set_memory_rw((unsigned long)hdr, hdr->pages));
 }
 #else
 static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
-- 
cgit v1.2.3


From 337ba7fbf0fbc12242359c4af114878618b90951 Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Sat, 25 Feb 2017 16:29:50 +0300
Subject: uapi: fix drm/omap_drm.h userspace compilation errors

Consistently use types from linux/types.h like in other uapi drm/*_drm.h
header files to fix the following drm/omap_drm.h userspace compilation
errors:

/usr/include/drm/omap_drm.h:36:2: error: unknown type name 'uint64_t'
  uint64_t param;   /* in */
/usr/include/drm/omap_drm.h:37:2: error: unknown type name 'uint64_t'
  uint64_t value;   /* in (set_param), out (get_param) */
/usr/include/drm/omap_drm.h:56:2: error: unknown type name 'uint32_t'
  uint32_t bytes;  /* (for non-tiled formats) */
/usr/include/drm/omap_drm.h:58:3: error: unknown type name 'uint16_t'
   uint16_t width;
/usr/include/drm/omap_drm.h:59:3: error: unknown type name 'uint16_t'
   uint16_t height;
/usr/include/drm/omap_drm.h:65:2: error: unknown type name 'uint32_t'
  uint32_t flags;   /* in */
/usr/include/drm/omap_drm.h:66:2: error: unknown type name 'uint32_t'
  uint32_t handle;  /* out */
/usr/include/drm/omap_drm.h:67:2: error: unknown type name 'uint32_t'
  uint32_t __pad;
/usr/include/drm/omap_drm.h:77:2: error: unknown type name 'uint32_t'
  uint32_t handle;  /* buffer handle (in) */
/usr/include/drm/omap_drm.h:78:2: error: unknown type name 'uint32_t'
  uint32_t op;   /* mask of omap_gem_op (in) */
/usr/include/drm/omap_drm.h:82:2: error: unknown type name 'uint32_t'
  uint32_t handle;  /* buffer handle (in) */
/usr/include/drm/omap_drm.h:83:2: error: unknown type name 'uint32_t'
  uint32_t op;   /* mask of omap_gem_op (in) */
/usr/include/drm/omap_drm.h:88:2: error: unknown type name 'uint32_t'
  uint32_t nregions;
/usr/include/drm/omap_drm.h:89:2: error: unknown type name 'uint32_t'
  uint32_t __pad;
/usr/include/drm/omap_drm.h:93:2: error: unknown type name 'uint32_t'
  uint32_t handle;  /* buffer handle (in) */
/usr/include/drm/omap_drm.h:94:2: error: unknown type name 'uint32_t'
  uint32_t pad;
/usr/include/drm/omap_drm.h:95:2: error: unknown type name 'uint64_t'
  uint64_t offset;  /* mmap offset (out) */
/usr/include/drm/omap_drm.h:102:2: error: unknown type name 'uint32_t'
  uint32_t size;   /* virtual size for mmap'ing (out) */
/usr/include/drm/omap_drm.h:103:2: error: unknown type name 'uint32_t'
  uint32_t __pad;

Fixes: ef6503e89194 ("drm: Kbuild: add omap_drm.h to the installed headers")
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Signed-off-by: Tomi Valkeinen <tomi.valkeinen@ti.com>
---
 include/uapi/drm/omap_drm.h | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/uapi/drm/omap_drm.h b/include/uapi/drm/omap_drm.h
index 407cb55df6ac..7fb97863c945 100644
--- a/include/uapi/drm/omap_drm.h
+++ b/include/uapi/drm/omap_drm.h
@@ -33,8 +33,8 @@ extern "C" {
 #define OMAP_PARAM_CHIPSET_ID	1	/* ie. 0x3430, 0x4430, etc */
 
 struct drm_omap_param {
-	uint64_t param;			/* in */
-	uint64_t value;			/* in (set_param), out (get_param) */
+	__u64 param;			/* in */
+	__u64 value;			/* in (set_param), out (get_param) */
 };
 
 #define OMAP_BO_SCANOUT		0x00000001	/* scanout capable (phys contiguous) */
@@ -53,18 +53,18 @@ struct drm_omap_param {
 #define OMAP_BO_TILED		(OMAP_BO_TILED_8 | OMAP_BO_TILED_16 | OMAP_BO_TILED_32)
 
 union omap_gem_size {
-	uint32_t bytes;		/* (for non-tiled formats) */
+	__u32 bytes;		/* (for non-tiled formats) */
 	struct {
-		uint16_t width;
-		uint16_t height;
+		__u16 width;
+		__u16 height;
 	} tiled;		/* (for tiled formats) */
 };
 
 struct drm_omap_gem_new {
 	union omap_gem_size size;	/* in */
-	uint32_t flags;			/* in */
-	uint32_t handle;		/* out */
-	uint32_t __pad;
+	__u32 flags;			/* in */
+	__u32 handle;			/* out */
+	__u32 __pad;
 };
 
 /* mask of operations: */
@@ -74,33 +74,33 @@ enum omap_gem_op {
 };
 
 struct drm_omap_gem_cpu_prep {
-	uint32_t handle;		/* buffer handle (in) */
-	uint32_t op;			/* mask of omap_gem_op (in) */
+	__u32 handle;			/* buffer handle (in) */
+	__u32 op;			/* mask of omap_gem_op (in) */
 };
 
 struct drm_omap_gem_cpu_fini {
-	uint32_t handle;		/* buffer handle (in) */
-	uint32_t op;			/* mask of omap_gem_op (in) */
+	__u32 handle;			/* buffer handle (in) */
+	__u32 op;			/* mask of omap_gem_op (in) */
 	/* TODO maybe here we pass down info about what regions are touched
 	 * by sw so we can be clever about cache ops?  For now a placeholder,
 	 * set to zero and we just do full buffer flush..
 	 */
-	uint32_t nregions;
-	uint32_t __pad;
+	__u32 nregions;
+	__u32 __pad;
 };
 
 struct drm_omap_gem_info {
-	uint32_t handle;		/* buffer handle (in) */
-	uint32_t pad;
-	uint64_t offset;		/* mmap offset (out) */
+	__u32 handle;			/* buffer handle (in) */
+	__u32 pad;
+	__u64 offset;			/* mmap offset (out) */
 	/* note: in case of tiled buffers, the user virtual size can be
 	 * different from the physical size (ie. how many pages are needed
 	 * to back the object) which is returned in DRM_IOCTL_GEM_OPEN..
 	 * This size here is the one that should be used if you want to
 	 * mmap() the buffer:
 	 */
-	uint32_t size;			/* virtual size for mmap'ing (out) */
-	uint32_t __pad;
+	__u32 size;			/* virtual size for mmap'ing (out) */
+	__u32 __pad;
 };
 
 #define DRM_OMAP_GET_PARAM		0x00
-- 
cgit v1.2.3


From f9bb7acb9b19a40dbd9d1b89380335dc8e23925f Mon Sep 17 00:00:00 2001
From: Philipp Zabel <p.zabel@pengutronix.de>
Date: Fri, 24 Feb 2017 18:23:55 +0100
Subject: gpu: ipu-v3: add unsynchronised DP channel disabling

When disabling the foreground DP channel during a modeset, the DC is
already disabled without waiting for end of frame. There is no reason
to wait for a frame boundary before updating the DP registers in that
case.
Add support to apply updates immediately. No functional changes, yet.

Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
Reviewed-by: Lucas Stach <l.stach@pengutronix.de>
---
 drivers/gpu/drm/imx/ipuv3-plane.c |  2 +-
 drivers/gpu/ipu-v3/ipu-common.c   |  8 +++++---
 drivers/gpu/ipu-v3/ipu-dp.c       | 12 ++++++------
 drivers/gpu/ipu-v3/ipu-prv.h      |  7 ++++++-
 include/video/imx-ipu-v3.h        |  2 +-
 5 files changed, 19 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/imx/ipuv3-plane.c b/drivers/gpu/drm/imx/ipuv3-plane.c
index 24819c9c3640..55991d46ced5 100644
--- a/drivers/gpu/drm/imx/ipuv3-plane.c
+++ b/drivers/gpu/drm/imx/ipuv3-plane.c
@@ -181,7 +181,7 @@ static int ipu_disable_plane(struct drm_plane *plane)
 	ipu_idmac_wait_busy(ipu_plane->ipu_ch, 50);
 
 	if (ipu_plane->dp)
-		ipu_dp_disable_channel(ipu_plane->dp);
+		ipu_dp_disable_channel(ipu_plane->dp, true);
 	ipu_idmac_disable_channel(ipu_plane->ipu_ch);
 	ipu_dmfc_disable_channel(ipu_plane->dmfc);
 	if (ipu_plane->dp)
diff --git a/drivers/gpu/ipu-v3/ipu-common.c b/drivers/gpu/ipu-v3/ipu-common.c
index 8368e6f766ee..8a32ed25a1c2 100644
--- a/drivers/gpu/ipu-v3/ipu-common.c
+++ b/drivers/gpu/ipu-v3/ipu-common.c
@@ -51,15 +51,17 @@ int ipu_get_num(struct ipu_soc *ipu)
 }
 EXPORT_SYMBOL_GPL(ipu_get_num);
 
-void ipu_srm_dp_sync_update(struct ipu_soc *ipu)
+void ipu_srm_dp_update(struct ipu_soc *ipu, bool sync)
 {
 	u32 val;
 
 	val = ipu_cm_read(ipu, IPU_SRM_PRI2);
-	val |= 0x8;
+	val &= ~DP_S_SRM_MODE_MASK;
+	val |= sync ? DP_S_SRM_MODE_NEXT_FRAME :
+		      DP_S_SRM_MODE_NOW;
 	ipu_cm_write(ipu, val, IPU_SRM_PRI2);
 }
-EXPORT_SYMBOL_GPL(ipu_srm_dp_sync_update);
+EXPORT_SYMBOL_GPL(ipu_srm_dp_update);
 
 enum ipu_color_space ipu_drm_fourcc_to_colorspace(u32 drm_fourcc)
 {
diff --git a/drivers/gpu/ipu-v3/ipu-dp.c b/drivers/gpu/ipu-v3/ipu-dp.c
index 98686edbcdbb..0e09c98248a0 100644
--- a/drivers/gpu/ipu-v3/ipu-dp.c
+++ b/drivers/gpu/ipu-v3/ipu-dp.c
@@ -112,7 +112,7 @@ int ipu_dp_set_global_alpha(struct ipu_dp *dp, bool enable,
 		writel(reg & ~DP_COM_CONF_GWAM, flow->base + DP_COM_CONF);
 	}
 
-	ipu_srm_dp_sync_update(priv->ipu);
+	ipu_srm_dp_update(priv->ipu, true);
 
 	mutex_unlock(&priv->mutex);
 
@@ -127,7 +127,7 @@ int ipu_dp_set_window_pos(struct ipu_dp *dp, u16 x_pos, u16 y_pos)
 
 	writel((x_pos << 16) | y_pos, flow->base + DP_FG_POS);
 
-	ipu_srm_dp_sync_update(priv->ipu);
+	ipu_srm_dp_update(priv->ipu, true);
 
 	return 0;
 }
@@ -207,7 +207,7 @@ int ipu_dp_setup_channel(struct ipu_dp *dp,
 					flow->out_cs, DP_COM_CONF_CSC_DEF_FG);
 	}
 
-	ipu_srm_dp_sync_update(priv->ipu);
+	ipu_srm_dp_update(priv->ipu, true);
 
 	mutex_unlock(&priv->mutex);
 
@@ -247,7 +247,7 @@ int ipu_dp_enable_channel(struct ipu_dp *dp)
 	reg |= DP_COM_CONF_FG_EN;
 	writel(reg, flow->base + DP_COM_CONF);
 
-	ipu_srm_dp_sync_update(priv->ipu);
+	ipu_srm_dp_update(priv->ipu, true);
 
 	mutex_unlock(&priv->mutex);
 
@@ -255,7 +255,7 @@ int ipu_dp_enable_channel(struct ipu_dp *dp)
 }
 EXPORT_SYMBOL_GPL(ipu_dp_enable_channel);
 
-void ipu_dp_disable_channel(struct ipu_dp *dp)
+void ipu_dp_disable_channel(struct ipu_dp *dp, bool sync)
 {
 	struct ipu_flow *flow = to_flow(dp);
 	struct ipu_dp_priv *priv = flow->priv;
@@ -275,7 +275,7 @@ void ipu_dp_disable_channel(struct ipu_dp *dp)
 	writel(reg, flow->base + DP_COM_CONF);
 
 	writel(0, flow->base + DP_FG_POS);
-	ipu_srm_dp_sync_update(priv->ipu);
+	ipu_srm_dp_update(priv->ipu, sync);
 
 	if (ipu_idmac_channel_busy(priv->ipu, IPUV3_CHANNEL_MEM_BG_SYNC))
 		ipu_wait_interrupt(priv->ipu, IPU_IRQ_DP_SF_END, 50);
diff --git a/drivers/gpu/ipu-v3/ipu-prv.h b/drivers/gpu/ipu-v3/ipu-prv.h
index 22e47b68b14a..285595702ee0 100644
--- a/drivers/gpu/ipu-v3/ipu-prv.h
+++ b/drivers/gpu/ipu-v3/ipu-prv.h
@@ -75,6 +75,11 @@ struct ipu_soc;
 #define IPU_INT_CTRL(n)		IPU_CM_REG(0x003C + 4 * (n))
 #define IPU_INT_STAT(n)		IPU_CM_REG(0x0200 + 4 * (n))
 
+/* SRM_PRI2 */
+#define DP_S_SRM_MODE_MASK		(0x3 << 3)
+#define DP_S_SRM_MODE_NOW		(0x3 << 3)
+#define DP_S_SRM_MODE_NEXT_FRAME	(0x1 << 3)
+
 /* FS_PROC_FLOW1 */
 #define FS_PRPENC_ROT_SRC_SEL_MASK	(0xf << 0)
 #define FS_PRPENC_ROT_SRC_SEL_ENC		(0x7 << 0)
@@ -215,7 +220,7 @@ static inline void ipu_idmac_write(struct ipu_soc *ipu, u32 value,
 	writel(value, ipu->idmac_reg + offset);
 }
 
-void ipu_srm_dp_sync_update(struct ipu_soc *ipu);
+void ipu_srm_dp_update(struct ipu_soc *ipu, bool sync);
 
 int ipu_module_enable(struct ipu_soc *ipu, u32 mask);
 int ipu_module_disable(struct ipu_soc *ipu, u32 mask);
diff --git a/include/video/imx-ipu-v3.h b/include/video/imx-ipu-v3.h
index 53cd07ccaa4c..899d2b00ad6d 100644
--- a/include/video/imx-ipu-v3.h
+++ b/include/video/imx-ipu-v3.h
@@ -300,7 +300,7 @@ struct ipu_dp *ipu_dp_get(struct ipu_soc *ipu, unsigned int flow);
 void ipu_dp_put(struct ipu_dp *);
 int ipu_dp_enable(struct ipu_soc *ipu);
 int ipu_dp_enable_channel(struct ipu_dp *dp);
-void ipu_dp_disable_channel(struct ipu_dp *dp);
+void ipu_dp_disable_channel(struct ipu_dp *dp, bool sync);
 void ipu_dp_disable(struct ipu_soc *ipu);
 int ipu_dp_setup_channel(struct ipu_dp *dp,
 		enum ipu_color_space in, enum ipu_color_space out);
-- 
cgit v1.2.3


From ba2b5277dc52cc96944d9765281bdf1e12681f66 Mon Sep 17 00:00:00 2001
From: Philipp Zabel <p.zabel@pengutronix.de>
Date: Fri, 9 Jan 2015 11:05:13 +0100
Subject: drm: add RGB formats with separate alpha plane

Some hardware can read the alpha components separately and then
conditionally fetch color components only for non-zero alpha values.
This patch adds fourcc definitions for two-plane RGB formats with an
8-bit alpha channel on a second plane.

Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 drivers/gpu/drm/drm_fourcc.c  |  8 ++++++++
 include/uapi/drm/drm_fourcc.h | 14 ++++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_fourcc.c b/drivers/gpu/drm/drm_fourcc.c
index 90d2cc8da8eb..92bf3306d4b3 100644
--- a/drivers/gpu/drm/drm_fourcc.c
+++ b/drivers/gpu/drm/drm_fourcc.c
@@ -132,6 +132,8 @@ const struct drm_format_info *__drm_format_info(u32 format)
 		{ .format = DRM_FORMAT_XBGR8888,	.depth = 24, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1 },
 		{ .format = DRM_FORMAT_RGBX8888,	.depth = 24, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1 },
 		{ .format = DRM_FORMAT_BGRX8888,	.depth = 24, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1 },
+		{ .format = DRM_FORMAT_RGB565_A8,	.depth = 24, .num_planes = 2, .cpp = { 2, 1, 0 }, .hsub = 1, .vsub = 1 },
+		{ .format = DRM_FORMAT_BGR565_A8,	.depth = 24, .num_planes = 2, .cpp = { 2, 1, 0 }, .hsub = 1, .vsub = 1 },
 		{ .format = DRM_FORMAT_XRGB2101010,	.depth = 30, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1 },
 		{ .format = DRM_FORMAT_XBGR2101010,	.depth = 30, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1 },
 		{ .format = DRM_FORMAT_RGBX1010102,	.depth = 30, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1 },
@@ -144,6 +146,12 @@ const struct drm_format_info *__drm_format_info(u32 format)
 		{ .format = DRM_FORMAT_ABGR8888,	.depth = 32, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1 },
 		{ .format = DRM_FORMAT_RGBA8888,	.depth = 32, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1 },
 		{ .format = DRM_FORMAT_BGRA8888,	.depth = 32, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1 },
+		{ .format = DRM_FORMAT_RGB888_A8,	.depth = 32, .num_planes = 2, .cpp = { 3, 1, 0 }, .hsub = 1, .vsub = 1 },
+		{ .format = DRM_FORMAT_BGR888_A8,	.depth = 32, .num_planes = 2, .cpp = { 3, 1, 0 }, .hsub = 1, .vsub = 1 },
+		{ .format = DRM_FORMAT_XRGB8888_A8,	.depth = 32, .num_planes = 2, .cpp = { 4, 1, 0 }, .hsub = 1, .vsub = 1 },
+		{ .format = DRM_FORMAT_XBGR8888_A8,	.depth = 32, .num_planes = 2, .cpp = { 4, 1, 0 }, .hsub = 1, .vsub = 1 },
+		{ .format = DRM_FORMAT_RGBX8888_A8,	.depth = 32, .num_planes = 2, .cpp = { 4, 1, 0 }, .hsub = 1, .vsub = 1 },
+		{ .format = DRM_FORMAT_BGRX8888_A8,	.depth = 32, .num_planes = 2, .cpp = { 4, 1, 0 }, .hsub = 1, .vsub = 1 },
 		{ .format = DRM_FORMAT_YUV410,		.depth = 0,  .num_planes = 3, .cpp = { 1, 1, 1 }, .hsub = 4, .vsub = 4 },
 		{ .format = DRM_FORMAT_YVU410,		.depth = 0,  .num_planes = 3, .cpp = { 1, 1, 1 }, .hsub = 4, .vsub = 4 },
 		{ .format = DRM_FORMAT_YUV411,		.depth = 0,  .num_planes = 3, .cpp = { 1, 1, 1 }, .hsub = 4, .vsub = 1 },
diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h
index ef20abb8119b..995c8f9c692f 100644
--- a/include/uapi/drm/drm_fourcc.h
+++ b/include/uapi/drm/drm_fourcc.h
@@ -113,6 +113,20 @@ extern "C" {
 
 #define DRM_FORMAT_AYUV		fourcc_code('A', 'Y', 'U', 'V') /* [31:0] A:Y:Cb:Cr 8:8:8:8 little endian */
 
+/*
+ * 2 plane RGB + A
+ * index 0 = RGB plane, same format as the corresponding non _A8 format has
+ * index 1 = A plane, [7:0] A
+ */
+#define DRM_FORMAT_XRGB8888_A8	fourcc_code('X', 'R', 'A', '8')
+#define DRM_FORMAT_XBGR8888_A8	fourcc_code('X', 'B', 'A', '8')
+#define DRM_FORMAT_RGBX8888_A8	fourcc_code('R', 'X', 'A', '8')
+#define DRM_FORMAT_BGRX8888_A8	fourcc_code('B', 'X', 'A', '8')
+#define DRM_FORMAT_RGB888_A8	fourcc_code('R', '8', 'A', '8')
+#define DRM_FORMAT_BGR888_A8	fourcc_code('B', '8', 'A', '8')
+#define DRM_FORMAT_RGB565_A8	fourcc_code('R', '5', 'A', '8')
+#define DRM_FORMAT_BGR565_A8	fourcc_code('B', '5', 'A', '8')
+
 /*
  * 2 plane YCbCr
  * index 0 = Y plane, [7:0] Y
-- 
cgit v1.2.3


From e72db3b1e50febff493bc5e3fb57da9cb8cc3d91 Mon Sep 17 00:00:00 2001
From: Philipp Zabel <p.zabel@pengutronix.de>
Date: Fri, 9 Jan 2015 11:03:13 +0100
Subject: gpu: ipu-v3: add support for separate alpha channels

The IPUv3 can read 8-bit alpha values from a separate IDMAC channel driven
by the Alpha Transparency Controller (ATC) for the graphics IDMAC channels.
This allows to reduce memory bandwidth via a conditional read mechanism or
to support planar YUV formats with alpha transparency.

Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 drivers/gpu/ipu-v3/ipu-common.c |  6 +++++
 drivers/gpu/ipu-v3/ipu-cpmem.c  | 57 +++++++++++++++++++++++++++++++++++++++++
 include/video/imx-ipu-v3.h      | 22 ++++++++++++++++
 3 files changed, 85 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/ipu-v3/ipu-common.c b/drivers/gpu/ipu-v3/ipu-common.c
index 8a32ed25a1c2..448043c051e9 100644
--- a/drivers/gpu/ipu-v3/ipu-common.c
+++ b/drivers/gpu/ipu-v3/ipu-common.c
@@ -83,6 +83,12 @@ enum ipu_color_space ipu_drm_fourcc_to_colorspace(u32 drm_fourcc)
 	case DRM_FORMAT_ABGR8888:
 	case DRM_FORMAT_RGBA8888:
 	case DRM_FORMAT_BGRA8888:
+	case DRM_FORMAT_RGB565_A8:
+	case DRM_FORMAT_BGR565_A8:
+	case DRM_FORMAT_RGB888_A8:
+	case DRM_FORMAT_BGR888_A8:
+	case DRM_FORMAT_RGBX8888_A8:
+	case DRM_FORMAT_BGRX8888_A8:
 		return IPUV3_COLORSPACE_RGB;
 	case DRM_FORMAT_YUYV:
 	case DRM_FORMAT_UYVY:
diff --git a/drivers/gpu/ipu-v3/ipu-cpmem.c b/drivers/gpu/ipu-v3/ipu-cpmem.c
index b72f725e00b5..114160dfc3ad 100644
--- a/drivers/gpu/ipu-v3/ipu-cpmem.c
+++ b/drivers/gpu/ipu-v3/ipu-cpmem.c
@@ -537,6 +537,43 @@ static const struct ipu_rgb def_bgra_16 = {
 #define UV2_OFFSET(pix, x, y)	((pix->width * pix->height) +	\
 				 (pix->width * y) + (x))
 
+#define NUM_ALPHA_CHANNELS	7
+
+/* See Table 37-12. Alpha channels mapping. */
+static int ipu_channel_albm(int ch_num)
+{
+	switch (ch_num) {
+	case IPUV3_CHANNEL_G_MEM_IC_PRP_VF:	return 0;
+	case IPUV3_CHANNEL_G_MEM_IC_PP:		return 1;
+	case IPUV3_CHANNEL_MEM_FG_SYNC:		return 2;
+	case IPUV3_CHANNEL_MEM_FG_ASYNC:	return 3;
+	case IPUV3_CHANNEL_MEM_BG_SYNC:		return 4;
+	case IPUV3_CHANNEL_MEM_BG_ASYNC:	return 5;
+	case IPUV3_CHANNEL_MEM_VDI_PLANE1_COMB: return 6;
+	default:
+		return -EINVAL;
+	}
+}
+
+static void ipu_cpmem_set_separate_alpha(struct ipuv3_channel *ch)
+{
+	struct ipu_soc *ipu = ch->ipu;
+	int albm;
+	u32 val;
+
+	albm = ipu_channel_albm(ch->num);
+	if (albm < 0)
+		return;
+
+	ipu_ch_param_write_field(ch, IPU_FIELD_ALU, 1);
+	ipu_ch_param_write_field(ch, IPU_FIELD_ALBM, albm);
+	ipu_ch_param_write_field(ch, IPU_FIELD_CRE, 1);
+
+	val = ipu_idmac_read(ipu, IDMAC_SEP_ALPHA);
+	val |= BIT(ch->num);
+	ipu_idmac_write(ipu, val, IDMAC_SEP_ALPHA);
+}
+
 int ipu_cpmem_set_fmt(struct ipuv3_channel *ch, u32 drm_fourcc)
 {
 	switch (drm_fourcc) {
@@ -599,22 +636,28 @@ int ipu_cpmem_set_fmt(struct ipuv3_channel *ch, u32 drm_fourcc)
 		break;
 	case DRM_FORMAT_RGBA8888:
 	case DRM_FORMAT_RGBX8888:
+	case DRM_FORMAT_RGBX8888_A8:
 		ipu_cpmem_set_format_rgb(ch, &def_rgbx_32);
 		break;
 	case DRM_FORMAT_BGRA8888:
 	case DRM_FORMAT_BGRX8888:
+	case DRM_FORMAT_BGRX8888_A8:
 		ipu_cpmem_set_format_rgb(ch, &def_bgrx_32);
 		break;
 	case DRM_FORMAT_BGR888:
+	case DRM_FORMAT_BGR888_A8:
 		ipu_cpmem_set_format_rgb(ch, &def_bgr_24);
 		break;
 	case DRM_FORMAT_RGB888:
+	case DRM_FORMAT_RGB888_A8:
 		ipu_cpmem_set_format_rgb(ch, &def_rgb_24);
 		break;
 	case DRM_FORMAT_RGB565:
+	case DRM_FORMAT_RGB565_A8:
 		ipu_cpmem_set_format_rgb(ch, &def_rgb_16);
 		break;
 	case DRM_FORMAT_BGR565:
+	case DRM_FORMAT_BGR565_A8:
 		ipu_cpmem_set_format_rgb(ch, &def_bgr_16);
 		break;
 	case DRM_FORMAT_ARGB1555:
@@ -636,6 +679,20 @@ int ipu_cpmem_set_fmt(struct ipuv3_channel *ch, u32 drm_fourcc)
 		return -EINVAL;
 	}
 
+	switch (drm_fourcc) {
+	case DRM_FORMAT_RGB565_A8:
+	case DRM_FORMAT_BGR565_A8:
+	case DRM_FORMAT_RGB888_A8:
+	case DRM_FORMAT_BGR888_A8:
+	case DRM_FORMAT_RGBX8888_A8:
+	case DRM_FORMAT_BGRX8888_A8:
+		ipu_ch_param_write_field(ch, IPU_FIELD_WID3, 7);
+		ipu_cpmem_set_separate_alpha(ch);
+		break;
+	default:
+		break;
+	}
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(ipu_cpmem_set_fmt);
diff --git a/include/video/imx-ipu-v3.h b/include/video/imx-ipu-v3.h
index 899d2b00ad6d..6af74f0cf161 100644
--- a/include/video/imx-ipu-v3.h
+++ b/include/video/imx-ipu-v3.h
@@ -161,6 +161,28 @@ enum ipu_channel_irq {
 #define IPUV3_CHANNEL_MEM_BG_ASYNC_ALPHA	52
 #define IPUV3_NUM_CHANNELS			64
 
+static inline int ipu_channel_alpha_channel(int ch_num)
+{
+	switch (ch_num) {
+	case IPUV3_CHANNEL_G_MEM_IC_PRP_VF:
+		return IPUV3_CHANNEL_G_MEM_IC_PRP_VF_ALPHA;
+	case IPUV3_CHANNEL_G_MEM_IC_PP:
+		return IPUV3_CHANNEL_G_MEM_IC_PP_ALPHA;
+	case IPUV3_CHANNEL_MEM_FG_SYNC:
+		return IPUV3_CHANNEL_MEM_FG_SYNC_ALPHA;
+	case IPUV3_CHANNEL_MEM_FG_ASYNC:
+		return IPUV3_CHANNEL_MEM_FG_ASYNC_ALPHA;
+	case IPUV3_CHANNEL_MEM_BG_SYNC:
+		return IPUV3_CHANNEL_MEM_BG_SYNC_ALPHA;
+	case IPUV3_CHANNEL_MEM_BG_ASYNC:
+		return IPUV3_CHANNEL_MEM_BG_ASYNC_ALPHA;
+	case IPUV3_CHANNEL_MEM_VDI_PLANE1_COMB:
+		return IPUV3_CHANNEL_MEM_VDI_PLANE1_COMB_ALPHA;
+	default:
+		return -EINVAL;
+	}
+}
+
 int ipu_map_irq(struct ipu_soc *ipu, int irq);
 int ipu_idmac_channel_irq(struct ipu_soc *ipu, struct ipuv3_channel *channel,
 		enum ipu_channel_irq irq);
-- 
cgit v1.2.3


From ea9c260514c15f14d43a4c099646c44238dccf1e Mon Sep 17 00:00:00 2001
From: Lucas Stach <l.stach@pengutronix.de>
Date: Wed, 8 Mar 2017 12:13:16 +0100
Subject: gpu: ipu-v3: add driver for Prefetch Resolve Gasket

This adds support for the i.MX6 QUadPlus PRG unit. It glues together the
IPU and the PRE units.

Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
v4: add missing ipu_soc->prg_priv
---
 drivers/gpu/ipu-v3/Makefile     |   2 +-
 drivers/gpu/ipu-v3/ipu-common.c |   1 +
 drivers/gpu/ipu-v3/ipu-prg.c    | 424 ++++++++++++++++++++++++++++++++++++++++
 drivers/gpu/ipu-v3/ipu-prv.h    |   6 +
 include/video/imx-ipu-v3.h      |  15 ++
 5 files changed, 447 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/ipu-v3/ipu-prg.c

(limited to 'include')

diff --git a/drivers/gpu/ipu-v3/Makefile b/drivers/gpu/ipu-v3/Makefile
index 8ae90de46b4d..1ab9bceee755 100644
--- a/drivers/gpu/ipu-v3/Makefile
+++ b/drivers/gpu/ipu-v3/Makefile
@@ -2,4 +2,4 @@ obj-$(CONFIG_IMX_IPUV3_CORE) += imx-ipu-v3.o
 
 imx-ipu-v3-objs := ipu-common.o ipu-cpmem.o ipu-csi.o ipu-dc.o ipu-di.o \
 		ipu-dp.o ipu-dmfc.o ipu-ic.o ipu-image-convert.o \
-		ipu-pre.o ipu-smfc.o ipu-vdi.o
+		ipu-pre.o ipu-prg.o ipu-smfc.o ipu-vdi.o
diff --git a/drivers/gpu/ipu-v3/ipu-common.c b/drivers/gpu/ipu-v3/ipu-common.c
index 7ae1b9739a7f..4c8453989746 100644
--- a/drivers/gpu/ipu-v3/ipu-common.c
+++ b/drivers/gpu/ipu-v3/ipu-common.c
@@ -1530,6 +1530,7 @@ static struct platform_driver imx_ipu_driver = {
 
 static struct platform_driver * const drivers[] = {
 	&ipu_pre_drv,
+	&ipu_prg_drv,
 	&imx_ipu_driver,
 };
 
diff --git a/drivers/gpu/ipu-v3/ipu-prg.c b/drivers/gpu/ipu-v3/ipu-prg.c
new file mode 100644
index 000000000000..caca57febbd6
--- /dev/null
+++ b/drivers/gpu/ipu-v3/ipu-prg.c
@@ -0,0 +1,424 @@
+/*
+ * Copyright (c) 2016-2017 Lucas Stach, Pengutronix
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <drm/drm_fourcc.h>
+#include <linux/clk.h>
+#include <linux/err.h>
+#include <linux/mfd/syscon.h>
+#include <linux/mfd/syscon/imx6q-iomuxc-gpr.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <video/imx-ipu-v3.h>
+
+#include "ipu-prv.h"
+
+#define IPU_PRG_CTL				0x00
+#define  IPU_PRG_CTL_BYPASS(i)			(1 << (0 + i))
+#define  IPU_PRG_CTL_SOFT_ARID_MASK		0x3
+#define  IPU_PRG_CTL_SOFT_ARID_SHIFT(i)		(8 + i * 2)
+#define  IPU_PRG_CTL_SOFT_ARID(i, v)		((v & 0x3) << (8 + 2 * i))
+#define  IPU_PRG_CTL_SO(i)			(1 << (16 + i))
+#define  IPU_PRG_CTL_VFLIP(i)			(1 << (19 + i))
+#define  IPU_PRG_CTL_BLOCK_MODE(i)		(1 << (22 + i))
+#define  IPU_PRG_CTL_CNT_LOAD_EN(i)		(1 << (25 + i))
+#define  IPU_PRG_CTL_SOFTRST			(1 << 30)
+#define  IPU_PRG_CTL_SHADOW_EN			(1 << 31)
+
+#define IPU_PRG_STATUS				0x04
+#define  IPU_PRG_STATUS_BUFFER0_READY(i)	(1 << (0 + i * 2))
+#define  IPU_PRG_STATUS_BUFFER1_READY(i)	(1 << (1 + i * 2))
+
+#define IPU_PRG_QOS				0x08
+#define  IPU_PRG_QOS_ARID_MASK			0xf
+#define  IPU_PRG_QOS_ARID_SHIFT(i)		(0 + i * 4)
+
+#define IPU_PRG_REG_UPDATE			0x0c
+#define  IPU_PRG_REG_UPDATE_REG_UPDATE		(1 << 0)
+
+#define IPU_PRG_STRIDE(i)			(0x10 + i * 0x4)
+#define  IPU_PRG_STRIDE_STRIDE_MASK		0x3fff
+
+#define IPU_PRG_CROP_LINE			0x1c
+
+#define IPU_PRG_THD				0x20
+
+#define IPU_PRG_BADDR(i)			(0x24 + i * 0x4)
+
+#define IPU_PRG_OFFSET(i)			(0x30 + i * 0x4)
+
+#define IPU_PRG_ILO(i)				(0x3c + i * 0x4)
+
+#define IPU_PRG_HEIGHT(i)			(0x48 + i * 0x4)
+#define  IPU_PRG_HEIGHT_PRE_HEIGHT_MASK		0xfff
+#define  IPU_PRG_HEIGHT_PRE_HEIGHT_SHIFT	0
+#define  IPU_PRG_HEIGHT_IPU_HEIGHT_MASK		0xfff
+#define  IPU_PRG_HEIGHT_IPU_HEIGHT_SHIFT	16
+
+struct ipu_prg_channel {
+	bool			enabled;
+	int			used_pre;
+};
+
+struct ipu_prg {
+	struct list_head	list;
+	struct device		*dev;
+	int			id;
+
+	void __iomem		*regs;
+	struct clk		*clk_ipg, *clk_axi;
+	struct regmap		*iomuxc_gpr;
+	struct ipu_pre		*pres[3];
+
+	struct ipu_prg_channel	chan[3];
+};
+
+static DEFINE_MUTEX(ipu_prg_list_mutex);
+static LIST_HEAD(ipu_prg_list);
+
+struct ipu_prg *
+ipu_prg_lookup_by_phandle(struct device *dev, const char *name, int ipu_id)
+{
+	struct device_node *prg_node = of_parse_phandle(dev->of_node,
+							name, 0);
+	struct ipu_prg *prg;
+
+	mutex_lock(&ipu_prg_list_mutex);
+	list_for_each_entry(prg, &ipu_prg_list, list) {
+		if (prg_node == prg->dev->of_node) {
+			mutex_unlock(&ipu_prg_list_mutex);
+			device_link_add(dev, prg->dev, DL_FLAG_AUTOREMOVE);
+			prg->id = ipu_id;
+			return prg;
+		}
+	}
+	mutex_unlock(&ipu_prg_list_mutex);
+
+	return NULL;
+}
+
+int ipu_prg_max_active_channels(void)
+{
+	return ipu_pre_get_available_count();
+}
+EXPORT_SYMBOL_GPL(ipu_prg_max_active_channels);
+
+bool ipu_prg_present(struct ipu_soc *ipu)
+{
+	if (ipu->prg_priv)
+		return true;
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(ipu_prg_present);
+
+bool ipu_prg_format_supported(struct ipu_soc *ipu, uint32_t format,
+			      uint64_t modifier)
+{
+	const struct drm_format_info *info = drm_format_info(format);
+
+	if (info->num_planes != 1)
+		return false;
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(ipu_prg_format_supported);
+
+int ipu_prg_enable(struct ipu_soc *ipu)
+{
+	struct ipu_prg *prg = ipu->prg_priv;
+	int ret;
+
+	if (!prg)
+		return 0;
+
+	ret = clk_prepare_enable(prg->clk_axi);
+	if (ret)
+		goto fail_disable_ipg;
+
+	return 0;
+
+fail_disable_ipg:
+	clk_disable_unprepare(prg->clk_ipg);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ipu_prg_enable);
+
+void ipu_prg_disable(struct ipu_soc *ipu)
+{
+	struct ipu_prg *prg = ipu->prg_priv;
+
+	if (!prg)
+		return;
+
+	clk_disable_unprepare(prg->clk_axi);
+}
+EXPORT_SYMBOL_GPL(ipu_prg_disable);
+
+/*
+ * The channel configuartion functions below are not thread safe, as they
+ * must be only called from the atomic commit path in the DRM driver, which
+ * is properly serialized.
+ */
+static int ipu_prg_ipu_to_prg_chan(int ipu_chan)
+{
+	/*
+	 * This isn't clearly documented in the RM, but IPU to PRG channel
+	 * assignment is fixed, as only with this mapping the control signals
+	 * match up.
+	 */
+	switch (ipu_chan) {
+	case IPUV3_CHANNEL_MEM_BG_SYNC:
+		return 0;
+	case IPUV3_CHANNEL_MEM_FG_SYNC:
+		return 1;
+	case IPUV3_CHANNEL_MEM_DC_SYNC:
+		return 2;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int ipu_prg_get_pre(struct ipu_prg *prg, int prg_chan)
+{
+	int i, ret;
+
+	/* channel 0 is special as it is hardwired to one of the PREs */
+	if (prg_chan == 0) {
+		ret = ipu_pre_get(prg->pres[0]);
+		if (ret)
+			goto fail;
+		prg->chan[prg_chan].used_pre = 0;
+		return 0;
+	}
+
+	for (i = 1; i < 3; i++) {
+		ret = ipu_pre_get(prg->pres[i]);
+		if (!ret) {
+			u32 val, mux;
+			int shift;
+
+			prg->chan[prg_chan].used_pre = i;
+
+			/* configure the PRE to PRG channel mux */
+			shift = (i == 1) ? 12 : 14;
+			mux = (prg->id << 1) | (prg_chan - 1);
+			regmap_update_bits(prg->iomuxc_gpr, IOMUXC_GPR5,
+					   0x3 << shift, mux << shift);
+
+			/* check other mux, must not point to same channel */
+			shift = (i == 1) ? 14 : 12;
+			regmap_read(prg->iomuxc_gpr, IOMUXC_GPR5, &val);
+			if (((val >> shift) & 0x3) == mux) {
+				regmap_update_bits(prg->iomuxc_gpr, IOMUXC_GPR5,
+						   0x3 << shift,
+						   (mux ^ 0x1) << shift);
+			}
+
+			return 0;
+		}
+	}
+
+fail:
+	dev_err(prg->dev, "could not get PRE for PRG chan %d", prg_chan);
+	return ret;
+}
+
+static void ipu_prg_put_pre(struct ipu_prg *prg, int prg_chan)
+{
+	struct ipu_prg_channel *chan = &prg->chan[prg_chan];
+
+	ipu_pre_put(prg->pres[chan->used_pre]);
+	chan->used_pre = -1;
+}
+
+void ipu_prg_channel_disable(struct ipuv3_channel *ipu_chan)
+{
+	int prg_chan = ipu_prg_ipu_to_prg_chan(ipu_chan->num);
+	struct ipu_prg *prg = ipu_chan->ipu->prg_priv;
+	struct ipu_prg_channel *chan = &prg->chan[prg_chan];
+	u32 val;
+
+	if (!chan->enabled || prg_chan < 0)
+		return;
+
+	clk_prepare_enable(prg->clk_ipg);
+
+	val = readl(prg->regs + IPU_PRG_CTL);
+	val |= IPU_PRG_CTL_BYPASS(prg_chan);
+	writel(val, prg->regs + IPU_PRG_CTL);
+
+	val = IPU_PRG_REG_UPDATE_REG_UPDATE;
+	writel(val, prg->regs + IPU_PRG_REG_UPDATE);
+
+	clk_disable_unprepare(prg->clk_ipg);
+
+	ipu_prg_put_pre(prg, prg_chan);
+
+	chan->enabled = false;
+}
+EXPORT_SYMBOL_GPL(ipu_prg_channel_disable);
+
+int ipu_prg_channel_configure(struct ipuv3_channel *ipu_chan,
+			      unsigned int axi_id, unsigned int width,
+			      unsigned int height, unsigned int stride,
+			      u32 format, unsigned long *eba)
+{
+	int prg_chan = ipu_prg_ipu_to_prg_chan(ipu_chan->num);
+	struct ipu_prg *prg = ipu_chan->ipu->prg_priv;
+	struct ipu_prg_channel *chan = &prg->chan[prg_chan];
+	u32 val;
+	int ret;
+
+	if (prg_chan < 0)
+		return prg_chan;
+
+	if (chan->enabled) {
+		ipu_pre_update(prg->pres[chan->used_pre], *eba);
+		return 0;
+	}
+
+	ret = ipu_prg_get_pre(prg, prg_chan);
+	if (ret)
+		return ret;
+
+	ipu_pre_configure(prg->pres[chan->used_pre],
+			  width, height, stride, format, *eba);
+
+
+	ret = clk_prepare_enable(prg->clk_ipg);
+	if (ret) {
+		ipu_prg_put_pre(prg, prg_chan);
+		return ret;
+	}
+
+	val = (stride - 1) & IPU_PRG_STRIDE_STRIDE_MASK;
+	writel(val, prg->regs + IPU_PRG_STRIDE(prg_chan));
+
+	val = ((height & IPU_PRG_HEIGHT_PRE_HEIGHT_MASK) <<
+	       IPU_PRG_HEIGHT_PRE_HEIGHT_SHIFT) |
+	      ((height & IPU_PRG_HEIGHT_IPU_HEIGHT_MASK) <<
+	       IPU_PRG_HEIGHT_IPU_HEIGHT_SHIFT);
+	writel(val, prg->regs + IPU_PRG_HEIGHT(prg_chan));
+
+	val = ipu_pre_get_baddr(prg->pres[chan->used_pre]);
+	*eba = val;
+	writel(val, prg->regs + IPU_PRG_BADDR(prg_chan));
+
+	val = readl(prg->regs + IPU_PRG_CTL);
+	/* counter load enable */
+	val |= IPU_PRG_CTL_CNT_LOAD_EN(prg_chan);
+	/* config AXI ID */
+	val &= ~(IPU_PRG_CTL_SOFT_ARID_MASK <<
+		 IPU_PRG_CTL_SOFT_ARID_SHIFT(prg_chan));
+	val |= IPU_PRG_CTL_SOFT_ARID(prg_chan, axi_id);
+	/* enable channel */
+	val &= ~IPU_PRG_CTL_BYPASS(prg_chan);
+	writel(val, prg->regs + IPU_PRG_CTL);
+
+	val = IPU_PRG_REG_UPDATE_REG_UPDATE;
+	writel(val, prg->regs + IPU_PRG_REG_UPDATE);
+
+	clk_disable_unprepare(prg->clk_ipg);
+
+	chan->enabled = true;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ipu_prg_channel_configure);
+
+static int ipu_prg_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct resource *res;
+	struct ipu_prg *prg;
+	u32 val;
+	int i, ret;
+
+	prg = devm_kzalloc(dev, sizeof(*prg), GFP_KERNEL);
+	if (!prg)
+		return -ENOMEM;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	prg->regs = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(prg->regs))
+		return PTR_ERR(prg->regs);
+
+
+	prg->clk_ipg = devm_clk_get(dev, "ipg");
+	if (IS_ERR(prg->clk_ipg))
+		return PTR_ERR(prg->clk_ipg);
+
+	prg->clk_axi = devm_clk_get(dev, "axi");
+	if (IS_ERR(prg->clk_axi))
+		return PTR_ERR(prg->clk_axi);
+
+	prg->iomuxc_gpr =
+		syscon_regmap_lookup_by_compatible("fsl,imx6q-iomuxc-gpr");
+	if (IS_ERR(prg->iomuxc_gpr))
+		return PTR_ERR(prg->iomuxc_gpr);
+
+	for (i = 0; i < 3; i++) {
+		prg->pres[i] = ipu_pre_lookup_by_phandle(dev, "fsl,pres", i);
+		if (!prg->pres[i])
+			return -EPROBE_DEFER;
+	}
+
+	ret = clk_prepare_enable(prg->clk_ipg);
+	if (ret)
+		return ret;
+
+	/* init to free running mode */
+	val = readl(prg->regs + IPU_PRG_CTL);
+	val |= IPU_PRG_CTL_SHADOW_EN;
+	writel(val, prg->regs + IPU_PRG_CTL);
+
+	/* disable address threshold */
+	writel(0xffffffff, prg->regs + IPU_PRG_THD);
+
+	clk_disable_unprepare(prg->clk_ipg);
+
+	prg->dev = dev;
+	platform_set_drvdata(pdev, prg);
+	mutex_lock(&ipu_prg_list_mutex);
+	list_add(&prg->list, &ipu_prg_list);
+	mutex_unlock(&ipu_prg_list_mutex);
+
+	return 0;
+}
+
+static int ipu_prg_remove(struct platform_device *pdev)
+{
+	struct ipu_prg *prg = platform_get_drvdata(pdev);
+
+	mutex_lock(&ipu_prg_list_mutex);
+	list_del(&prg->list);
+	mutex_unlock(&ipu_prg_list_mutex);
+
+	return 0;
+}
+
+static const struct of_device_id ipu_prg_dt_ids[] = {
+	{ .compatible = "fsl,imx6qp-prg", },
+	{ /* sentinel */ },
+};
+
+struct platform_driver ipu_prg_drv = {
+	.probe		= ipu_prg_probe,
+	.remove		= ipu_prg_remove,
+	.driver		= {
+		.name	= "imx-ipu-prg",
+		.of_match_table = ipu_prg_dt_ids,
+	},
+};
diff --git a/drivers/gpu/ipu-v3/ipu-prv.h b/drivers/gpu/ipu-v3/ipu-prv.h
index 262efdff1dba..ca2a223a0d1e 100644
--- a/drivers/gpu/ipu-v3/ipu-prv.h
+++ b/drivers/gpu/ipu-v3/ipu-prv.h
@@ -174,6 +174,7 @@ struct ipu_vdi;
 struct ipu_image_convert_priv;
 struct ipu_smfc_priv;
 struct ipu_pre;
+struct ipu_prg;
 
 struct ipu_devtype;
 
@@ -208,6 +209,7 @@ struct ipu_soc {
 	struct ipu_vdi          *vdi_priv;
 	struct ipu_image_convert_priv *image_convert_priv;
 	struct ipu_smfc_priv	*smfc_priv;
+	struct ipu_prg		*prg_priv;
 };
 
 static inline u32 ipu_idmac_read(struct ipu_soc *ipu, unsigned offset)
@@ -276,6 +278,10 @@ void ipu_pre_configure(struct ipu_pre *pre, unsigned int width,
 		       unsigned int stride, u32 format, unsigned int bufaddr);
 void ipu_pre_update(struct ipu_pre *pre, unsigned int bufaddr);
 
+struct ipu_prg *ipu_prg_lookup_by_phandle(struct device *dev, const char *name,
+					  int ipu_id);
+
 extern struct platform_driver ipu_pre_drv;
+extern struct platform_driver ipu_prg_drv;
 
 #endif				/* __IPU_PRV_H__ */
diff --git a/include/video/imx-ipu-v3.h b/include/video/imx-ipu-v3.h
index 6af74f0cf161..8cb07680fb41 100644
--- a/include/video/imx-ipu-v3.h
+++ b/include/video/imx-ipu-v3.h
@@ -330,6 +330,21 @@ int ipu_dp_set_window_pos(struct ipu_dp *, u16 x_pos, u16 y_pos);
 int ipu_dp_set_global_alpha(struct ipu_dp *dp, bool enable, u8 alpha,
 		bool bg_chan);
 
+/*
+ * IPU Prefetch Resolve Gasket (prg) functions
+ */
+int ipu_prg_max_active_channels(void);
+bool ipu_prg_present(struct ipu_soc *ipu);
+bool ipu_prg_format_supported(struct ipu_soc *ipu, uint32_t format,
+			      uint64_t modifier);
+int ipu_prg_enable(struct ipu_soc *ipu);
+void ipu_prg_disable(struct ipu_soc *ipu);
+void ipu_prg_channel_disable(struct ipuv3_channel *ipu_chan);
+int ipu_prg_channel_configure(struct ipuv3_channel *ipu_chan,
+			      unsigned int axi_id,  unsigned int width,
+			      unsigned int height, unsigned int stride,
+			      u32 format, unsigned long *eba);
+
 /*
  * IPU CMOS Sensor Interface (csi) functions
  */
-- 
cgit v1.2.3


From 5be9b730b09c45c358bbfe7f51d254e306cccc07 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Thu, 16 Mar 2017 16:40:21 -0700
Subject: kasan: add a prototype of task_struct to avoid warning

Add a prototype of task_struct to fix below warning on arm64.

  In file included from arch/arm64/kernel/probes/kprobes.c:19:0:
  include/linux/kasan.h:81:132: error: 'struct task_struct' declared inside parameter list will not be visible outside of this definition or declaration [-Werror]
   static inline void kasan_unpoison_task_stack(struct task_struct *task) {}

As same as other types (kmem_cache, page, and vm_struct) this adds a
prototype of task_struct data structure on top of kasan.h.

[arnd] A related warning was fixed before, but now appears in a
different line in the same file in v4.11-rc2.  The patch from Masami
Hiramatsu still seems appropriate, so let's take his version.

Fixes: 71af2ed5eeea ("kasan, sched/headers: Remove <linux/sched.h> from <linux/kasan.h>")
Link: https://patchwork.kernel.org/patch/9569839/
Link: http://lkml.kernel.org/r/20170313141517.3397802-1-arnd@arndb.de
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Acked-by: Alexander Potapenko <glider@google.com>
Acked-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 1c823bef4c15..5734480c9590 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -6,6 +6,7 @@
 struct kmem_cache;
 struct page;
 struct vm_struct;
+struct task_struct;
 
 #ifdef CONFIG_KASAN
 
-- 
cgit v1.2.3


From 15c9e10d9ad4d41d076148bbff1de7f659f68852 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 16 Mar 2017 16:40:33 -0700
Subject: drivers core: remove assert_held_device_hotplug()

The last caller of assert_held_device_hotplug() is gone, so remove it again.

Link: http://lkml.kernel.org/r/20170314125226.16779-3-heiko.carstens@de.ibm.com
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Sebastian Ott <sebott@linux.vnet.ibm.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/core.c    | 5 -----
 include/linux/device.h | 1 -
 2 files changed, 6 deletions(-)

(limited to 'include')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 684bda4d14a1..6bb60fb6a30b 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -639,11 +639,6 @@ int lock_device_hotplug_sysfs(void)
 	return restart_syscall();
 }
 
-void assert_held_device_hotplug(void)
-{
-	lockdep_assert_held(&device_hotplug_lock);
-}
-
 #ifdef CONFIG_BLOCK
 static inline int device_is_not_partition(struct device *dev)
 {
diff --git a/include/linux/device.h b/include/linux/device.h
index 30c4570e928d..9ef518af5515 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1140,7 +1140,6 @@ static inline bool device_supports_offline(struct device *dev)
 extern void lock_device_hotplug(void);
 extern void unlock_device_hotplug(void);
 extern int lock_device_hotplug_sysfs(void);
-void assert_held_device_hotplug(void);
 extern int device_offline(struct device *dev);
 extern int device_online(struct device *dev);
 extern void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode);
-- 
cgit v1.2.3


From 92a0256e9acf20ee3f0add61b5aa774bfdf214cc Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Mon, 13 Mar 2017 11:28:10 +0000
Subject: drm/i915: Add i810/i815 pci-ids for completeness

To improve our historical record and to simplify userspace that wants to
include i915_pciids.h as its canonical breakdown of PCI IDs and their
respective generations, include the gen1 ids for i810 and i815.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: http://patchwork.freedesktop.org/patch/msgid/20170313112810.4202-1-chris@chris-wilson.co.uk
Acked-by: Jani Nikula <jani.nikula@intel.com>
---
 include/drm/i915_pciids.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/drm/i915_pciids.h b/include/drm/i915_pciids.h
index 466c71592a6f..27e0dbaa6c0e 100644
--- a/include/drm/i915_pciids.h
+++ b/include/drm/i915_pciids.h
@@ -47,6 +47,14 @@
 	0x030000, 0xff0000,			\
 	(unsigned long) info }
 
+#define INTEL_I810_IDS(info)					\
+	INTEL_VGA_DEVICE(0x7121, info), /* I810 */		\
+	INTEL_VGA_DEVICE(0x7123, info), /* I810_DC100 */	\
+	INTEL_VGA_DEVICE(0x7125, info)  /* I810_E */
+
+#define INTEL_I815_IDS(info)					\
+	INTEL_VGA_DEVICE(0x1132, info)  /* I815*/
+
 #define INTEL_I830_IDS(info)				\
 	INTEL_VGA_DEVICE(0x3577, info)
 
-- 
cgit v1.2.3


From 530c6891b1220cba780b6c18f4691d85a3435080 Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Wed, 1 Mar 2017 23:13:24 -0600
Subject: target: allow ALUA setup for some passthrough backends

This patch allows passthrough backends to use the core/base LIO
ALUA setup and state checks, but still handle the execution of
commands.

This will allow the target_core_user module to execute STPG and RTPG
in userspace, and not have to duplicate the ALUA state checks, path
information (needed so we can check if command is executable on
specific paths) and setup (rtslib sets/updates the configfs ALUA
interface like it does for iblock or file).

For STPG, the target_core_user userspace daemon, tcmu-runner will
still execute the STPG, and to update the core/base LIO state it
will use the existing configfs interface. For RTPG, tcmu-runner
will loop over configfs and/or cache the state.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_alua.c    | 9 +++++----
 drivers/target/target_core_pscsi.c   | 3 ++-
 drivers/target/target_core_tpg.c     | 3 ++-
 include/target/target_core_backend.h | 7 ++++++-
 4 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/target/target_core_alua.c b/drivers/target/target_core_alua.c
index f5e330099bfc..a41bbb8087cf 100644
--- a/drivers/target/target_core_alua.c
+++ b/drivers/target/target_core_alua.c
@@ -691,7 +691,7 @@ target_alua_state_check(struct se_cmd *cmd)
 
 	if (dev->se_hba->hba_flags & HBA_FLAGS_INTERNAL_USE)
 		return 0;
-	if (dev->transport->transport_flags & TRANSPORT_FLAG_PASSTHROUGH)
+	if (dev->transport->transport_flags & TRANSPORT_FLAG_PASSTHROUGH_ALUA)
 		return 0;
 
 	/*
@@ -1973,7 +1973,7 @@ ssize_t core_alua_store_tg_pt_gp_info(
 	unsigned char buf[TG_PT_GROUP_NAME_BUF];
 	int move = 0;
 
-	if (dev->transport->transport_flags & TRANSPORT_FLAG_PASSTHROUGH ||
+	if (dev->transport->transport_flags & TRANSPORT_FLAG_PASSTHROUGH_ALUA ||
 	    (dev->se_hba->hba_flags & HBA_FLAGS_INTERNAL_USE))
 		return -ENODEV;
 
@@ -2230,7 +2230,7 @@ ssize_t core_alua_store_offline_bit(
 	unsigned long tmp;
 	int ret;
 
-	if (dev->transport->transport_flags & TRANSPORT_FLAG_PASSTHROUGH ||
+	if (dev->transport->transport_flags & TRANSPORT_FLAG_PASSTHROUGH_ALUA ||
 	    (dev->se_hba->hba_flags & HBA_FLAGS_INTERNAL_USE))
 		return -ENODEV;
 
@@ -2316,7 +2316,8 @@ ssize_t core_alua_store_secondary_write_metadata(
 
 int core_setup_alua(struct se_device *dev)
 {
-	if (!(dev->transport->transport_flags & TRANSPORT_FLAG_PASSTHROUGH) &&
+	if (!(dev->transport->transport_flags &
+	     TRANSPORT_FLAG_PASSTHROUGH_ALUA) &&
 	    !(dev->se_hba->hba_flags & HBA_FLAGS_INTERNAL_USE)) {
 		struct t10_alua_lu_gp_member *lu_gp_mem;
 
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index 44d92f23a3f0..94cda7991e80 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -1080,7 +1080,8 @@ static void pscsi_req_done(struct request *req, int uptodate)
 static const struct target_backend_ops pscsi_ops = {
 	.name			= "pscsi",
 	.owner			= THIS_MODULE,
-	.transport_flags	= TRANSPORT_FLAG_PASSTHROUGH,
+	.transport_flags	= TRANSPORT_FLAG_PASSTHROUGH |
+				  TRANSPORT_FLAG_PASSTHROUGH_ALUA,
 	.attach_hba		= pscsi_attach_hba,
 	.detach_hba		= pscsi_detach_hba,
 	.pmode_enable_hba	= pscsi_pmode_enable_hba,
diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c
index c0dbfa016575..6fb191914f45 100644
--- a/drivers/target/target_core_tpg.c
+++ b/drivers/target/target_core_tpg.c
@@ -602,7 +602,8 @@ int core_tpg_add_lun(
 	if (ret)
 		goto out_kill_ref;
 
-	if (!(dev->transport->transport_flags & TRANSPORT_FLAG_PASSTHROUGH) &&
+	if (!(dev->transport->transport_flags &
+	     TRANSPORT_FLAG_PASSTHROUGH_ALUA) &&
 	    !(dev->se_hba->hba_flags & HBA_FLAGS_INTERNAL_USE))
 		target_attach_tg_pt_gp(lun, dev->t10_alua.default_tg_pt_gp);
 
diff --git a/include/target/target_core_backend.h b/include/target/target_core_backend.h
index b54b98dc2d4a..1b0f447ce850 100644
--- a/include/target/target_core_backend.h
+++ b/include/target/target_core_backend.h
@@ -4,7 +4,12 @@
 #include <linux/types.h>
 #include <target/target_core_base.h>
 
-#define TRANSPORT_FLAG_PASSTHROUGH		1
+#define TRANSPORT_FLAG_PASSTHROUGH		0x1
+/*
+ * ALUA commands, state checks and setup operations are handled by the
+ * backend module.
+ */
+#define TRANSPORT_FLAG_PASSTHROUGH_ALUA		0x2
 
 struct request_queue;
 struct scatterlist;
-- 
cgit v1.2.3


From d7175373f2745ed4abe5b388d5aabd06304f801e Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Thu, 2 Mar 2017 04:59:48 -0600
Subject: target: fix ALUA transition timeout handling

The implicit transition time tells initiators the min time
to wait before timing out a transition. We currently schedule
the transition to occur in tg_pt_gp_implicit_trans_secs
seconds so there is no room for delays. If
core_alua_do_transition_tg_pt_work->core_alua_update_tpg_primary_metadata
needs to write out info to a remote file, then the initiator can
easily time out the operation.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_alua.c | 23 ++++++++---------------
 include/target/target_core_base.h |  2 +-
 2 files changed, 9 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/target/target_core_alua.c b/drivers/target/target_core_alua.c
index 58bf5e6350ac..594807cd92cb 100644
--- a/drivers/target/target_core_alua.c
+++ b/drivers/target/target_core_alua.c
@@ -1013,7 +1013,7 @@ static void core_alua_queue_state_change_ua(struct t10_alua_tg_pt_gp *tg_pt_gp)
 static void core_alua_do_transition_tg_pt_work(struct work_struct *work)
 {
 	struct t10_alua_tg_pt_gp *tg_pt_gp = container_of(work,
-		struct t10_alua_tg_pt_gp, tg_pt_gp_transition_work.work);
+		struct t10_alua_tg_pt_gp, tg_pt_gp_transition_work);
 	struct se_device *dev = tg_pt_gp->tg_pt_gp_dev;
 	bool explicit = (tg_pt_gp->tg_pt_gp_alua_access_status ==
 			 ALUA_STATUS_ALTERED_BY_EXPLICIT_STPG);
@@ -1076,13 +1076,12 @@ static int core_alua_do_transition_tg_pt(
 	/*
 	 * Flush any pending transitions
 	 */
-	if (!explicit && tg_pt_gp->tg_pt_gp_implicit_trans_secs &&
-	    atomic_read(&tg_pt_gp->tg_pt_gp_alua_access_state) ==
+	if (!explicit && atomic_read(&tg_pt_gp->tg_pt_gp_alua_access_state) ==
 	    ALUA_ACCESS_STATE_TRANSITION) {
 		/* Just in case */
 		tg_pt_gp->tg_pt_gp_alua_pending_state = new_state;
 		tg_pt_gp->tg_pt_gp_transition_complete = &wait;
-		flush_delayed_work(&tg_pt_gp->tg_pt_gp_transition_work);
+		flush_work(&tg_pt_gp->tg_pt_gp_transition_work);
 		wait_for_completion(&wait);
 		tg_pt_gp->tg_pt_gp_transition_complete = NULL;
 		return 0;
@@ -1117,15 +1116,9 @@ static int core_alua_do_transition_tg_pt(
 	atomic_inc(&tg_pt_gp->tg_pt_gp_ref_cnt);
 	spin_unlock(&dev->t10_alua.tg_pt_gps_lock);
 
-	if (!explicit && tg_pt_gp->tg_pt_gp_implicit_trans_secs) {
-		unsigned long transition_tmo;
-
-		transition_tmo = tg_pt_gp->tg_pt_gp_implicit_trans_secs * HZ;
-		schedule_delayed_work(&tg_pt_gp->tg_pt_gp_transition_work,
-				      transition_tmo);
-	} else {
+	schedule_work(&tg_pt_gp->tg_pt_gp_transition_work);
+	if (explicit) {
 		tg_pt_gp->tg_pt_gp_transition_complete = &wait;
-		schedule_delayed_work(&tg_pt_gp->tg_pt_gp_transition_work, 0);
 		wait_for_completion(&wait);
 		tg_pt_gp->tg_pt_gp_transition_complete = NULL;
 	}
@@ -1696,8 +1689,8 @@ struct t10_alua_tg_pt_gp *core_alua_allocate_tg_pt_gp(struct se_device *dev,
 	mutex_init(&tg_pt_gp->tg_pt_gp_md_mutex);
 	spin_lock_init(&tg_pt_gp->tg_pt_gp_lock);
 	atomic_set(&tg_pt_gp->tg_pt_gp_ref_cnt, 0);
-	INIT_DELAYED_WORK(&tg_pt_gp->tg_pt_gp_transition_work,
-			  core_alua_do_transition_tg_pt_work);
+	INIT_WORK(&tg_pt_gp->tg_pt_gp_transition_work,
+		  core_alua_do_transition_tg_pt_work);
 	tg_pt_gp->tg_pt_gp_dev = dev;
 	atomic_set(&tg_pt_gp->tg_pt_gp_alua_access_state,
 		ALUA_ACCESS_STATE_ACTIVE_OPTIMIZED);
@@ -1805,7 +1798,7 @@ void core_alua_free_tg_pt_gp(
 	dev->t10_alua.alua_tg_pt_gps_counter--;
 	spin_unlock(&dev->t10_alua.tg_pt_gps_lock);
 
-	flush_delayed_work(&tg_pt_gp->tg_pt_gp_transition_work);
+	flush_work(&tg_pt_gp->tg_pt_gp_transition_work);
 
 	/*
 	 * Allow a struct t10_alua_tg_pt_gp_member * referenced by
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index 37c274e61acc..4b784b6e21c0 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -299,7 +299,7 @@ struct t10_alua_tg_pt_gp {
 	struct list_head tg_pt_gp_lun_list;
 	struct se_lun *tg_pt_gp_alua_lun;
 	struct se_node_acl *tg_pt_gp_alua_nacl;
-	struct delayed_work tg_pt_gp_transition_work;
+	struct work_struct tg_pt_gp_transition_work;
 	struct completion *tg_pt_gp_transition_complete;
 };
 
-- 
cgit v1.2.3