From e7e0c3e26587749b62d17b9dd0532874186c77f7 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Sun, 3 Apr 2016 16:15:00 -0300
Subject: [media] videobuf2-core: Check user space planes array in dqbuf

The number of planes in videobuf2 is specific to a buffer. In order to
verify that the planes array provided by the user is long enough, a new
vb2_buf_op is required.

Call __verify_planes_array() when the dequeued buffer is known. Return an
error to the caller if there was one, otherwise remove the buffer from the
done list.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Acked-by: Hans Verkuil <hans.verkuil@cisco.com>
Cc: stable@vger.kernel.org # for v4.4 and later
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
---
 include/media/videobuf2-core.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h
index 8a0f55b6c2ba..5342ff4d748f 100644
--- a/include/media/videobuf2-core.h
+++ b/include/media/videobuf2-core.h
@@ -375,6 +375,9 @@ struct vb2_ops {
 /**
  * struct vb2_ops - driver-specific callbacks
  *
+ * @verify_planes_array: Verify that a given user space structure contains
+ *			enough planes for the buffer. This is called
+ *			for each dequeued buffer.
  * @fill_user_buffer:	given a vb2_buffer fill in the userspace structure.
  *			For V4L2 this is a struct v4l2_buffer.
  * @fill_vb2_buffer:	given a userspace structure, fill in the vb2_buffer.
@@ -384,6 +387,7 @@ struct vb2_ops {
  *			the vb2_buffer struct.
  */
 struct vb2_buf_ops {
+	int (*verify_planes_array)(struct vb2_buffer *vb, const void *pb);
 	void (*fill_user_buffer)(struct vb2_buffer *vb, void *pb);
 	int (*fill_vb2_buffer)(struct vb2_buffer *vb, const void *pb,
 				struct vb2_plane *planes);
-- 
cgit v1.2.3


From 75dd602a5198a6e5f75534db52b6e6fbaabb33d1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 30 Mar 2016 11:36:59 +0200
Subject: lockdep: Fix lock_chain::base size

lock_chain::base is used to store an index into the chain_hlocks[]
array, however that array contains more elements than can be indexed
using the u16.

Change the lock_chain structure to use a bitfield to encode the data
it needs and add BUILD_BUG_ON() assertions to check the fields are
wide enough.

Also, for DEBUG_LOCKDEP, assert that we don't run out of elements of
that array; as that would wreck the collision detectoring.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alfredo Alvarez Fernandez <alfredoalvarezfernandez@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sedat Dilek <sedat.dilek@gmail.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20160330093659.GS3408@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/lockdep.h       |  8 +++++---
 kernel/locking/lockdep.c      | 24 +++++++++++++++++++++++-
 kernel/locking/lockdep_proc.c |  2 ++
 3 files changed, 30 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index d026b190c530..d10ef06971b5 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -196,9 +196,11 @@ struct lock_list {
  * We record lock dependency chains, so that we can cache them:
  */
 struct lock_chain {
-	u8				irq_context;
-	u8				depth;
-	u16				base;
+	/* see BUILD_BUG_ON()s in lookup_chain_cache() */
+	unsigned int			irq_context :  2,
+					depth       :  6,
+					base	    : 24;
+	/* 4 byte hole */
 	struct hlist_node		entry;
 	u64				chain_key;
 };
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index beb06f604420..78c1c0ee6dc1 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2176,15 +2176,37 @@ cache_hit:
 	chain->irq_context = hlock->irq_context;
 	i = get_first_held_lock(curr, hlock);
 	chain->depth = curr->lockdep_depth + 1 - i;
+
+	BUILD_BUG_ON((1UL << 24) <= ARRAY_SIZE(chain_hlocks));
+	BUILD_BUG_ON((1UL << 6)  <= ARRAY_SIZE(curr->held_locks));
+	BUILD_BUG_ON((1UL << 8*sizeof(chain_hlocks[0])) <= ARRAY_SIZE(lock_classes));
+
 	if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
 		chain->base = nr_chain_hlocks;
-		nr_chain_hlocks += chain->depth;
 		for (j = 0; j < chain->depth - 1; j++, i++) {
 			int lock_id = curr->held_locks[i].class_idx - 1;
 			chain_hlocks[chain->base + j] = lock_id;
 		}
 		chain_hlocks[chain->base + j] = class - lock_classes;
 	}
+
+	if (nr_chain_hlocks < MAX_LOCKDEP_CHAIN_HLOCKS)
+		nr_chain_hlocks += chain->depth;
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+	/*
+	 * Important for check_no_collision().
+	 */
+	if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) {
+		if (debug_locks_off_graph_unlock())
+			return 0;
+
+		print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!");
+		dump_stack();
+		return 0;
+	}
+#endif
+
 	hlist_add_head_rcu(&chain->entry, hash_head);
 	debug_atomic_inc(chain_lookup_misses);
 	inc_chains();
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index dbb61a302548..a0f61effad25 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -141,6 +141,8 @@ static int lc_show(struct seq_file *m, void *v)
 	int i;
 
 	if (v == SEQ_START_TOKEN) {
+		if (nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)
+			seq_printf(m, "(buggered) ");
 		seq_printf(m, "all lock chains:\n");
 		return 0;
 	}
-- 
cgit v1.2.3


From 3020ca711871fdaf0c15c8bab677a6bc302e28fe Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Fri, 22 Apr 2016 04:00:50 -0300
Subject: [media] v4l2-dv-timings.h: fix polarity for 4k formats

The VSync polarity was negative instead of positive for the 4k CEA formats.
I probably copy-and-pasted these from the DMT 4k format, which does have a
negative VSync polarity.

Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Reported-by: Martin Bugge <marbugge@cisco.com>
Cc: <stable@vger.kernel.org>      # for v4.1 and up
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
---
 include/uapi/linux/v4l2-dv-timings.h | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/v4l2-dv-timings.h b/include/uapi/linux/v4l2-dv-timings.h
index c039f1d68a09..086168e18ca8 100644
--- a/include/uapi/linux/v4l2-dv-timings.h
+++ b/include/uapi/linux/v4l2-dv-timings.h
@@ -183,7 +183,8 @@
 
 #define V4L2_DV_BT_CEA_3840X2160P24 { \
 	.type = V4L2_DV_BT_656_1120, \
-	V4L2_INIT_BT_TIMINGS(3840, 2160, 0, V4L2_DV_HSYNC_POS_POL, \
+	V4L2_INIT_BT_TIMINGS(3840, 2160, 0, \
+		V4L2_DV_HSYNC_POS_POL | V4L2_DV_VSYNC_POS_POL, \
 		297000000, 1276, 88, 296, 8, 10, 72, 0, 0, 0, \
 		V4L2_DV_BT_STD_CEA861, \
 		V4L2_DV_FL_CAN_REDUCE_FPS | V4L2_DV_FL_IS_CE_VIDEO) \
@@ -191,14 +192,16 @@
 
 #define V4L2_DV_BT_CEA_3840X2160P25 { \
 	.type = V4L2_DV_BT_656_1120, \
-	V4L2_INIT_BT_TIMINGS(3840, 2160, 0, V4L2_DV_HSYNC_POS_POL, \
+	V4L2_INIT_BT_TIMINGS(3840, 2160, 0, \
+		V4L2_DV_HSYNC_POS_POL | V4L2_DV_VSYNC_POS_POL, \
 		297000000, 1056, 88, 296, 8, 10, 72, 0, 0, 0, \
 		V4L2_DV_BT_STD_CEA861, V4L2_DV_FL_IS_CE_VIDEO) \
 }
 
 #define V4L2_DV_BT_CEA_3840X2160P30 { \
 	.type = V4L2_DV_BT_656_1120, \
-	V4L2_INIT_BT_TIMINGS(3840, 2160, 0, V4L2_DV_HSYNC_POS_POL, \
+	V4L2_INIT_BT_TIMINGS(3840, 2160, 0, \
+		V4L2_DV_HSYNC_POS_POL | V4L2_DV_VSYNC_POS_POL, \
 		297000000, 176, 88, 296, 8, 10, 72, 0, 0, 0, \
 		V4L2_DV_BT_STD_CEA861, \
 		V4L2_DV_FL_CAN_REDUCE_FPS | V4L2_DV_FL_IS_CE_VIDEO) \
@@ -206,14 +209,16 @@
 
 #define V4L2_DV_BT_CEA_3840X2160P50 { \
 	.type = V4L2_DV_BT_656_1120, \
-	V4L2_INIT_BT_TIMINGS(3840, 2160, 0, V4L2_DV_HSYNC_POS_POL, \
+	V4L2_INIT_BT_TIMINGS(3840, 2160, 0, \
+		V4L2_DV_HSYNC_POS_POL | V4L2_DV_VSYNC_POS_POL, \
 		594000000, 1056, 88, 296, 8, 10, 72, 0, 0, 0, \
 		V4L2_DV_BT_STD_CEA861, V4L2_DV_FL_IS_CE_VIDEO) \
 }
 
 #define V4L2_DV_BT_CEA_3840X2160P60 { \
 	.type = V4L2_DV_BT_656_1120, \
-	V4L2_INIT_BT_TIMINGS(3840, 2160, 0, V4L2_DV_HSYNC_POS_POL, \
+	V4L2_INIT_BT_TIMINGS(3840, 2160, 0, \
+		V4L2_DV_HSYNC_POS_POL | V4L2_DV_VSYNC_POS_POL, \
 		594000000, 176, 88, 296, 8, 10, 72, 0, 0, 0, \
 		V4L2_DV_BT_STD_CEA861, \
 		V4L2_DV_FL_CAN_REDUCE_FPS | V4L2_DV_FL_IS_CE_VIDEO) \
@@ -221,7 +226,8 @@
 
 #define V4L2_DV_BT_CEA_4096X2160P24 { \
 	.type = V4L2_DV_BT_656_1120, \
-	V4L2_INIT_BT_TIMINGS(4096, 2160, 0, V4L2_DV_HSYNC_POS_POL, \
+	V4L2_INIT_BT_TIMINGS(4096, 2160, 0, \
+		V4L2_DV_HSYNC_POS_POL | V4L2_DV_VSYNC_POS_POL, \
 		297000000, 1020, 88, 296, 8, 10, 72, 0, 0, 0, \
 		V4L2_DV_BT_STD_CEA861, \
 		V4L2_DV_FL_CAN_REDUCE_FPS | V4L2_DV_FL_IS_CE_VIDEO) \
@@ -229,14 +235,16 @@
 
 #define V4L2_DV_BT_CEA_4096X2160P25 { \
 	.type = V4L2_DV_BT_656_1120, \
-	V4L2_INIT_BT_TIMINGS(4096, 2160, 0, V4L2_DV_HSYNC_POS_POL, \
+	V4L2_INIT_BT_TIMINGS(4096, 2160, 0, \
+		V4L2_DV_HSYNC_POS_POL | V4L2_DV_VSYNC_POS_POL, \
 		297000000, 968, 88, 128, 8, 10, 72, 0, 0, 0, \
 		V4L2_DV_BT_STD_CEA861, V4L2_DV_FL_IS_CE_VIDEO) \
 }
 
 #define V4L2_DV_BT_CEA_4096X2160P30 { \
 	.type = V4L2_DV_BT_656_1120, \
-	V4L2_INIT_BT_TIMINGS(4096, 2160, 0, V4L2_DV_HSYNC_POS_POL, \
+	V4L2_INIT_BT_TIMINGS(4096, 2160, 0, \
+		V4L2_DV_HSYNC_POS_POL | V4L2_DV_VSYNC_POS_POL, \
 		297000000, 88, 88, 128, 8, 10, 72, 0, 0, 0, \
 		V4L2_DV_BT_STD_CEA861, \
 		V4L2_DV_FL_CAN_REDUCE_FPS | V4L2_DV_FL_IS_CE_VIDEO) \
@@ -244,14 +252,16 @@
 
 #define V4L2_DV_BT_CEA_4096X2160P50 { \
 	.type = V4L2_DV_BT_656_1120, \
-	V4L2_INIT_BT_TIMINGS(4096, 2160, 0, V4L2_DV_HSYNC_POS_POL, \
+	V4L2_INIT_BT_TIMINGS(4096, 2160, 0, \
+		V4L2_DV_HSYNC_POS_POL | V4L2_DV_VSYNC_POS_POL, \
 		594000000, 968, 88, 128, 8, 10, 72, 0, 0, 0, \
 		V4L2_DV_BT_STD_CEA861, V4L2_DV_FL_IS_CE_VIDEO) \
 }
 
 #define V4L2_DV_BT_CEA_4096X2160P60 { \
 	.type = V4L2_DV_BT_656_1120, \
-	V4L2_INIT_BT_TIMINGS(4096, 2160, 0, V4L2_DV_HSYNC_POS_POL, \
+	V4L2_INIT_BT_TIMINGS(4096, 2160, 0, \
+		V4L2_DV_HSYNC_POS_POL | V4L2_DV_VSYNC_POS_POL, \
 		594000000, 88, 88, 128, 8, 10, 72, 0, 0, 0, \
 		V4L2_DV_BT_STD_CEA861, \
 		V4L2_DV_FL_CAN_REDUCE_FPS | V4L2_DV_FL_IS_CE_VIDEO) \
-- 
cgit v1.2.3


From b93876845c5e30a92964eeb088d9d2e024118022 Mon Sep 17 00:00:00 2001
From: Ricardo Ribalda <ricardo.ribalda@gmail.com>
Date: Mon, 25 Apr 2016 06:04:45 -0300
Subject: [media] media: vb2: Fix regression on poll() for RW mode

When using a device is read/write mode, vb2 does not handle properly the
first select/poll operation.

The reason for this, is that when this code has been refactored, some of
the operations have changed their order, and now fileio emulator is not
started.

The reintroduced check to the core is enabled by a quirk flag, that
avoids this check by other subsystems like DVB.

Fixes: 49d8ab9feaf2 ("media] media: videobuf2: Separate vb2_poll()")

Reported-by: Dimitrios Katsaros <patcherwork@gmail.com>
Cc: Junghak Sung <jh1009.sung@samsung.com>
Cc: <stable@vger.kernel.org>      # for v4.5 and up
Signed-off-by: Ricardo Ribalda Delgado <ricardo.ribalda@gmail.com>
Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
---
 drivers/media/v4l2-core/videobuf2-core.c | 10 ++++++++++
 drivers/media/v4l2-core/videobuf2-v4l2.c | 14 ++++++--------
 include/media/videobuf2-core.h           |  4 ++++
 3 files changed, 20 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/media/v4l2-core/videobuf2-core.c b/drivers/media/v4l2-core/videobuf2-core.c
index 2169544883e8..9fbcb67a9ee6 100644
--- a/drivers/media/v4l2-core/videobuf2-core.c
+++ b/drivers/media/v4l2-core/videobuf2-core.c
@@ -2297,6 +2297,16 @@ unsigned int vb2_core_poll(struct vb2_queue *q, struct file *file,
 	if (!vb2_is_streaming(q) || q->error)
 		return POLLERR;
 
+	/*
+	 * If this quirk is set and QBUF hasn't been called yet then
+	 * return POLLERR as well. This only affects capture queues, output
+	 * queues will always initialize waiting_for_buffers to false.
+	 * This quirk is set by V4L2 for backwards compatibility reasons.
+	 */
+	if (q->quirk_poll_must_check_waiting_for_buffers &&
+	    q->waiting_for_buffers && (req_events & (POLLIN | POLLRDNORM)))
+		return POLLERR;
+
 	/*
 	 * For output streams you can call write() as long as there are fewer
 	 * buffers queued than there are buffers available.
diff --git a/drivers/media/v4l2-core/videobuf2-v4l2.c b/drivers/media/v4l2-core/videobuf2-v4l2.c
index 8da7470ca364..7f366f1b0377 100644
--- a/drivers/media/v4l2-core/videobuf2-v4l2.c
+++ b/drivers/media/v4l2-core/videobuf2-v4l2.c
@@ -771,6 +771,12 @@ int vb2_queue_init(struct vb2_queue *q)
 	q->is_output = V4L2_TYPE_IS_OUTPUT(q->type);
 	q->copy_timestamp = (q->timestamp_flags & V4L2_BUF_FLAG_TIMESTAMP_MASK)
 			== V4L2_BUF_FLAG_TIMESTAMP_COPY;
+	/*
+	 * For compatibility with vb1: if QBUF hasn't been called yet, then
+	 * return POLLERR as well. This only affects capture queues, output
+	 * queues will always initialize waiting_for_buffers to false.
+	 */
+	q->quirk_poll_must_check_waiting_for_buffers = true;
 
 	return vb2_core_queue_init(q);
 }
@@ -824,14 +830,6 @@ unsigned int vb2_poll(struct vb2_queue *q, struct file *file, poll_table *wait)
 			poll_wait(file, &fh->wait, wait);
 	}
 
-	/*
-	 * For compatibility with vb1: if QBUF hasn't been called yet, then
-	 * return POLLERR as well. This only affects capture queues, output
-	 * queues will always initialize waiting_for_buffers to false.
-	 */
-	if (q->waiting_for_buffers && (req_events & (POLLIN | POLLRDNORM)))
-		return POLLERR;
-
 	return res | vb2_core_poll(q, file, wait);
 }
 EXPORT_SYMBOL_GPL(vb2_poll);
diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h
index 5342ff4d748f..88e3ab496e8f 100644
--- a/include/media/videobuf2-core.h
+++ b/include/media/videobuf2-core.h
@@ -404,6 +404,9 @@ struct vb2_buf_ops {
  * @fileio_read_once:		report EOF after reading the first buffer
  * @fileio_write_immediately:	queue buffer after each write() call
  * @allow_zero_bytesused:	allow bytesused == 0 to be passed to the driver
+ * @quirk_poll_must_check_waiting_for_buffers: Return POLLERR at poll when QBUF
+ *              has not been called. This is a vb1 idiom that has been adopted
+ *              also by vb2.
  * @lock:	pointer to a mutex that protects the vb2_queue struct. The
  *		driver can set this to a mutex to let the v4l2 core serialize
  *		the queuing ioctls. If the driver wants to handle locking
@@ -467,6 +470,7 @@ struct vb2_queue {
 	unsigned			fileio_read_once:1;
 	unsigned			fileio_write_immediately:1;
 	unsigned			allow_zero_bytesused:1;
+	unsigned		   quirk_poll_must_check_waiting_for_buffers:1;
 
 	struct mutex			*lock;
 	void				*owner;
-- 
cgit v1.2.3


From 6c1ea260f89709e0021d2c59f8fd2a104b5b1123 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 11 Apr 2016 19:34:49 +0200
Subject: libceph: make authorizer destruction independent of ceph_auth_client

Starting the kernel client with cephx disabled and then enabling cephx
and restarting userspace daemons can result in a crash:

    [262671.478162] BUG: unable to handle kernel paging request at ffffebe000000000
    [262671.531460] IP: [<ffffffff811cd04a>] kfree+0x5a/0x130
    [262671.584334] PGD 0
    [262671.635847] Oops: 0000 [#1] SMP
    [262672.055841] CPU: 22 PID: 2961272 Comm: kworker/22:2 Not tainted 4.2.0-34-generic #39~14.04.1-Ubuntu
    [262672.162338] Hardware name: Dell Inc. PowerEdge R720/068CDY, BIOS 2.4.3 07/09/2014
    [262672.268937] Workqueue: ceph-msgr con_work [libceph]
    [262672.322290] task: ffff88081c2d0dc0 ti: ffff880149ae8000 task.ti: ffff880149ae8000
    [262672.428330] RIP: 0010:[<ffffffff811cd04a>]  [<ffffffff811cd04a>] kfree+0x5a/0x130
    [262672.535880] RSP: 0018:ffff880149aeba58  EFLAGS: 00010286
    [262672.589486] RAX: 000001e000000000 RBX: 0000000000000012 RCX: ffff8807e7461018
    [262672.695980] RDX: 000077ff80000000 RSI: ffff88081af2be04 RDI: 0000000000000012
    [262672.803668] RBP: ffff880149aeba78 R08: 0000000000000000 R09: 0000000000000000
    [262672.912299] R10: ffffebe000000000 R11: ffff880819a60e78 R12: ffff8800aec8df40
    [262673.021769] R13: ffffffffc035f70f R14: ffff8807e5b138e0 R15: ffff880da9785840
    [262673.131722] FS:  0000000000000000(0000) GS:ffff88081fac0000(0000) knlGS:0000000000000000
    [262673.245377] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    [262673.303281] CR2: ffffebe000000000 CR3: 0000000001c0d000 CR4: 00000000001406e0
    [262673.417556] Stack:
    [262673.472943]  ffff880149aeba88 ffff88081af2be04 ffff8800aec8df40 ffff88081af2be04
    [262673.583767]  ffff880149aeba98 ffffffffc035f70f ffff880149aebac8 ffff8800aec8df00
    [262673.694546]  ffff880149aebac8 ffffffffc035c89e ffff8807e5b138e0 ffff8805b047f800
    [262673.805230] Call Trace:
    [262673.859116]  [<ffffffffc035f70f>] ceph_x_destroy_authorizer+0x1f/0x50 [libceph]
    [262673.968705]  [<ffffffffc035c89e>] ceph_auth_destroy_authorizer+0x3e/0x60 [libceph]
    [262674.078852]  [<ffffffffc0352805>] put_osd+0x45/0x80 [libceph]
    [262674.134249]  [<ffffffffc035290e>] remove_osd+0xae/0x140 [libceph]
    [262674.189124]  [<ffffffffc0352aa3>] __reset_osd+0x103/0x150 [libceph]
    [262674.243749]  [<ffffffffc0354703>] kick_requests+0x223/0x460 [libceph]
    [262674.297485]  [<ffffffffc03559e2>] ceph_osdc_handle_map+0x282/0x5e0 [libceph]
    [262674.350813]  [<ffffffffc035022e>] dispatch+0x4e/0x720 [libceph]
    [262674.403312]  [<ffffffffc034bd91>] try_read+0x3d1/0x1090 [libceph]
    [262674.454712]  [<ffffffff810ab7c2>] ? dequeue_entity+0x152/0x690
    [262674.505096]  [<ffffffffc034cb1b>] con_work+0xcb/0x1300 [libceph]
    [262674.555104]  [<ffffffff8108fb3e>] process_one_work+0x14e/0x3d0
    [262674.604072]  [<ffffffff810901ea>] worker_thread+0x11a/0x470
    [262674.652187]  [<ffffffff810900d0>] ? rescuer_thread+0x310/0x310
    [262674.699022]  [<ffffffff810957a2>] kthread+0xd2/0xf0
    [262674.744494]  [<ffffffff810956d0>] ? kthread_create_on_node+0x1c0/0x1c0
    [262674.789543]  [<ffffffff817bd81f>] ret_from_fork+0x3f/0x70
    [262674.834094]  [<ffffffff810956d0>] ? kthread_create_on_node+0x1c0/0x1c0

What happens is the following:

    (1) new MON session is established
    (2) old "none" ac is destroyed
    (3) new "cephx" ac is constructed
    ...
    (4) old OSD session (w/ "none" authorizer) is put
          ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer)

osd->o_auth.authorizer in the "none" case is just a bare pointer into
ac, which contains a single static copy for all services.  By the time
we get to (4), "none" ac, freed in (2), is long gone.  On top of that,
a new vtable installed in (3) points us at ceph_x_destroy_authorizer(),
so we end up trying to destroy a "none" authorizer with a "cephx"
destructor operating on invalid memory!

To fix this, decouple authorizer destruction from ac and do away with
a single static "none" authorizer by making a copy for each OSD or MDS
session.  Authorizers themselves are independent of ac and so there is
no reason for destroy_authorizer() to be an ac op.  Make it an op on
the authorizer itself by turning ceph_authorizer into a real struct.

Fixes: http://tracker.ceph.com/issues/15447

Reported-by: Alan Zhang <alan.zhang@linux.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Sage Weil <sage@redhat.com>
---
 fs/ceph/mds_client.c            |  6 ++--
 include/linux/ceph/auth.h       | 10 +++---
 include/linux/ceph/osd_client.h |  1 -
 net/ceph/auth.c                 |  8 ++---
 net/ceph/auth_none.c            | 71 ++++++++++++++++++++++-------------------
 net/ceph/auth_none.h            |  3 +-
 net/ceph/auth_x.c               | 21 ++++++------
 net/ceph/auth_x.h               |  1 +
 net/ceph/osd_client.c           |  6 ++--
 9 files changed, 62 insertions(+), 65 deletions(-)

(limited to 'include')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 541ead4d8965..85b8517f17a0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -386,9 +386,7 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
 	     atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
 	if (atomic_dec_and_test(&s->s_ref)) {
 		if (s->s_auth.authorizer)
-			ceph_auth_destroy_authorizer(
-				s->s_mdsc->fsc->client->monc.auth,
-				s->s_auth.authorizer);
+			ceph_auth_destroy_authorizer(s->s_auth.authorizer);
 		kfree(s);
 	}
 }
@@ -3900,7 +3898,7 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
 	struct ceph_auth_handshake *auth = &s->s_auth;
 
 	if (force_new && auth->authorizer) {
-		ceph_auth_destroy_authorizer(ac, auth->authorizer);
+		ceph_auth_destroy_authorizer(auth->authorizer);
 		auth->authorizer = NULL;
 	}
 	if (!auth->authorizer) {
diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h
index 260d78b587c4..1563265d2097 100644
--- a/include/linux/ceph/auth.h
+++ b/include/linux/ceph/auth.h
@@ -12,9 +12,12 @@
  */
 
 struct ceph_auth_client;
-struct ceph_authorizer;
 struct ceph_msg;
 
+struct ceph_authorizer {
+	void (*destroy)(struct ceph_authorizer *);
+};
+
 struct ceph_auth_handshake {
 	struct ceph_authorizer *authorizer;
 	void *authorizer_buf;
@@ -62,8 +65,6 @@ struct ceph_auth_client_ops {
 				 struct ceph_auth_handshake *auth);
 	int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
 				       struct ceph_authorizer *a, size_t len);
-	void (*destroy_authorizer)(struct ceph_auth_client *ac,
-				   struct ceph_authorizer *a);
 	void (*invalidate_authorizer)(struct ceph_auth_client *ac,
 				      int peer_type);
 
@@ -112,8 +113,7 @@ extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
 extern int ceph_auth_create_authorizer(struct ceph_auth_client *ac,
 				       int peer_type,
 				       struct ceph_auth_handshake *auth);
-extern void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac,
-					 struct ceph_authorizer *a);
+void ceph_auth_destroy_authorizer(struct ceph_authorizer *a);
 extern int ceph_auth_update_authorizer(struct ceph_auth_client *ac,
 				       int peer_type,
 				       struct ceph_auth_handshake *a);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 4343df806710..cbf460927c42 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -16,7 +16,6 @@ struct ceph_msg;
 struct ceph_snap_context;
 struct ceph_osd_request;
 struct ceph_osd_client;
-struct ceph_authorizer;
 
 /*
  * completion callback for async writepages
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index 6b923bcaa2a4..2bc5965fdd1e 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -293,13 +293,9 @@ int ceph_auth_create_authorizer(struct ceph_auth_client *ac,
 }
 EXPORT_SYMBOL(ceph_auth_create_authorizer);
 
-void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac,
-				  struct ceph_authorizer *a)
+void ceph_auth_destroy_authorizer(struct ceph_authorizer *a)
 {
-	mutex_lock(&ac->mutex);
-	if (ac->ops && ac->ops->destroy_authorizer)
-		ac->ops->destroy_authorizer(ac, a);
-	mutex_unlock(&ac->mutex);
+	a->destroy(a);
 }
 EXPORT_SYMBOL(ceph_auth_destroy_authorizer);
 
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
index 8c93fa8d81bc..5f836f02ae36 100644
--- a/net/ceph/auth_none.c
+++ b/net/ceph/auth_none.c
@@ -16,7 +16,6 @@ static void reset(struct ceph_auth_client *ac)
 	struct ceph_auth_none_info *xi = ac->private;
 
 	xi->starting = true;
-	xi->built_authorizer = false;
 }
 
 static void destroy(struct ceph_auth_client *ac)
@@ -39,6 +38,27 @@ static int should_authenticate(struct ceph_auth_client *ac)
 	return xi->starting;
 }
 
+static int ceph_auth_none_build_authorizer(struct ceph_auth_client *ac,
+					   struct ceph_none_authorizer *au)
+{
+	void *p = au->buf;
+	void *const end = p + sizeof(au->buf);
+	int ret;
+
+	ceph_encode_8_safe(&p, end, 1, e_range);
+	ret = ceph_entity_name_encode(ac->name, &p, end);
+	if (ret < 0)
+		return ret;
+
+	ceph_encode_64_safe(&p, end, ac->global_id, e_range);
+	au->buf_len = p - (void *)au->buf;
+	dout("%s built authorizer len %d\n", __func__, au->buf_len);
+	return 0;
+
+e_range:
+	return -ERANGE;
+}
+
 static int build_request(struct ceph_auth_client *ac, void *buf, void *end)
 {
 	return 0;
@@ -57,32 +77,32 @@ static int handle_reply(struct ceph_auth_client *ac, int result,
 	return result;
 }
 
+static void ceph_auth_none_destroy_authorizer(struct ceph_authorizer *a)
+{
+	kfree(a);
+}
+
 /*
- * build an 'authorizer' with our entity_name and global_id.  we can
- * reuse a single static copy since it is identical for all services
- * we connect to.
+ * build an 'authorizer' with our entity_name and global_id.  it is
+ * identical for all services we connect to.
  */
 static int ceph_auth_none_create_authorizer(
 	struct ceph_auth_client *ac, int peer_type,
 	struct ceph_auth_handshake *auth)
 {
-	struct ceph_auth_none_info *ai = ac->private;
-	struct ceph_none_authorizer *au = &ai->au;
-	void *p, *end;
+	struct ceph_none_authorizer *au;
 	int ret;
 
-	if (!ai->built_authorizer) {
-		p = au->buf;
-		end = p + sizeof(au->buf);
-		ceph_encode_8(&p, 1);
-		ret = ceph_entity_name_encode(ac->name, &p, end - 8);
-		if (ret < 0)
-			goto bad;
-		ceph_decode_need(&p, end, sizeof(u64), bad2);
-		ceph_encode_64(&p, ac->global_id);
-		au->buf_len = p - (void *)au->buf;
-		ai->built_authorizer = true;
-		dout("built authorizer len %d\n", au->buf_len);
+	au = kmalloc(sizeof(*au), GFP_NOFS);
+	if (!au)
+		return -ENOMEM;
+
+	au->base.destroy = ceph_auth_none_destroy_authorizer;
+
+	ret = ceph_auth_none_build_authorizer(ac, au);
+	if (ret) {
+		kfree(au);
+		return ret;
 	}
 
 	auth->authorizer = (struct ceph_authorizer *) au;
@@ -92,17 +112,6 @@ static int ceph_auth_none_create_authorizer(
 	auth->authorizer_reply_buf_len = sizeof (au->reply_buf);
 
 	return 0;
-
-bad2:
-	ret = -ERANGE;
-bad:
-	return ret;
-}
-
-static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
-				      struct ceph_authorizer *a)
-{
-	/* nothing to do */
 }
 
 static const struct ceph_auth_client_ops ceph_auth_none_ops = {
@@ -114,7 +123,6 @@ static const struct ceph_auth_client_ops ceph_auth_none_ops = {
 	.build_request = build_request,
 	.handle_reply = handle_reply,
 	.create_authorizer = ceph_auth_none_create_authorizer,
-	.destroy_authorizer = ceph_auth_none_destroy_authorizer,
 };
 
 int ceph_auth_none_init(struct ceph_auth_client *ac)
@@ -127,7 +135,6 @@ int ceph_auth_none_init(struct ceph_auth_client *ac)
 		return -ENOMEM;
 
 	xi->starting = true;
-	xi->built_authorizer = false;
 
 	ac->protocol = CEPH_AUTH_NONE;
 	ac->private = xi;
diff --git a/net/ceph/auth_none.h b/net/ceph/auth_none.h
index 059a3ce4b53f..62021535ae4a 100644
--- a/net/ceph/auth_none.h
+++ b/net/ceph/auth_none.h
@@ -12,6 +12,7 @@
  */
 
 struct ceph_none_authorizer {
+	struct ceph_authorizer base;
 	char buf[128];
 	int buf_len;
 	char reply_buf[0];
@@ -19,8 +20,6 @@ struct ceph_none_authorizer {
 
 struct ceph_auth_none_info {
 	bool starting;
-	bool built_authorizer;
-	struct ceph_none_authorizer au;   /* we only need one; it's static */
 };
 
 int ceph_auth_none_init(struct ceph_auth_client *ac);
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index 9e43a315e662..a0905f04bd13 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -565,6 +565,14 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
 	return -EAGAIN;
 }
 
+static void ceph_x_destroy_authorizer(struct ceph_authorizer *a)
+{
+	struct ceph_x_authorizer *au = (void *)a;
+
+	ceph_x_authorizer_cleanup(au);
+	kfree(au);
+}
+
 static int ceph_x_create_authorizer(
 	struct ceph_auth_client *ac, int peer_type,
 	struct ceph_auth_handshake *auth)
@@ -581,6 +589,8 @@ static int ceph_x_create_authorizer(
 	if (!au)
 		return -ENOMEM;
 
+	au->base.destroy = ceph_x_destroy_authorizer;
+
 	ret = ceph_x_build_authorizer(ac, th, au);
 	if (ret) {
 		kfree(au);
@@ -643,16 +653,6 @@ static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
 	return ret;
 }
 
-static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
-				      struct ceph_authorizer *a)
-{
-	struct ceph_x_authorizer *au = (void *)a;
-
-	ceph_x_authorizer_cleanup(au);
-	kfree(au);
-}
-
-
 static void ceph_x_reset(struct ceph_auth_client *ac)
 {
 	struct ceph_x_info *xi = ac->private;
@@ -770,7 +770,6 @@ static const struct ceph_auth_client_ops ceph_x_ops = {
 	.create_authorizer = ceph_x_create_authorizer,
 	.update_authorizer = ceph_x_update_authorizer,
 	.verify_authorizer_reply = ceph_x_verify_authorizer_reply,
-	.destroy_authorizer = ceph_x_destroy_authorizer,
 	.invalidate_authorizer = ceph_x_invalidate_authorizer,
 	.reset =  ceph_x_reset,
 	.destroy = ceph_x_destroy,
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
index 40b1a3cf7397..21a5af904bae 100644
--- a/net/ceph/auth_x.h
+++ b/net/ceph/auth_x.h
@@ -26,6 +26,7 @@ struct ceph_x_ticket_handler {
 
 
 struct ceph_x_authorizer {
+	struct ceph_authorizer base;
 	struct ceph_crypto_key session_key;
 	struct ceph_buffer *buf;
 	unsigned int service;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 32355d9d0103..40a53a70efdf 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1087,10 +1087,8 @@ static void put_osd(struct ceph_osd *osd)
 	dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
 	     atomic_read(&osd->o_ref) - 1);
 	if (atomic_dec_and_test(&osd->o_ref)) {
-		struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
-
 		if (osd->o_auth.authorizer)
-			ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer);
+			ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
 		kfree(osd);
 	}
 }
@@ -2984,7 +2982,7 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
 	struct ceph_auth_handshake *auth = &o->o_auth;
 
 	if (force_new && auth->authorizer) {
-		ceph_auth_destroy_authorizer(ac, auth->authorizer);
+		ceph_auth_destroy_authorizer(auth->authorizer);
 		auth->authorizer = NULL;
 	}
 	if (!auth->authorizer) {
-- 
cgit v1.2.3


From 5cf1cacb49aee39c3e02ae87068fc3c6430659b0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 21 Apr 2016 19:06:48 -0400
Subject: cgroup, cpuset: replace cpuset_post_attach_flush() with
 cgroup_subsys->post_attach callback

Since e93ad19d0564 ("cpuset: make mm migration asynchronous"), cpuset
kicks off asynchronous NUMA node migration if necessary during task
migration and flushes it from cpuset_post_attach_flush() which is
called at the end of __cgroup_procs_write().  This is to avoid
performing migration with cgroup_threadgroup_rwsem write-locked which
can lead to deadlock through dependency on kworker creation.

memcg has a similar issue with charge moving, so let's convert it to
an official callback rather than the current one-off cpuset specific
function.  This patch adds cgroup_subsys->post_attach callback and
makes cpuset register cpuset_post_attach_flush() as its ->post_attach.

The conversion is mostly one-to-one except that the new callback is
called under cgroup_mutex.  This is to guarantee that no other
migration operations are started before ->post_attach callbacks are
finished.  cgroup_mutex is one of the outermost mutex in the system
and has never been and shouldn't be a problem.  We can add specialized
synchronization around __cgroup_procs_write() but I don't think
there's any noticeable benefit.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: <stable@vger.kernel.org> # 4.4+ prerequisite for the next patch
---
 include/linux/cgroup-defs.h | 1 +
 include/linux/cpuset.h      | 6 ------
 kernel/cgroup.c             | 7 +++++--
 kernel/cpuset.c             | 4 ++--
 4 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 3e39ae5bc799..5b17de62c962 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -444,6 +444,7 @@ struct cgroup_subsys {
 	int (*can_attach)(struct cgroup_taskset *tset);
 	void (*cancel_attach)(struct cgroup_taskset *tset);
 	void (*attach)(struct cgroup_taskset *tset);
+	void (*post_attach)(void);
 	int (*can_fork)(struct task_struct *task);
 	void (*cancel_fork)(struct task_struct *task);
 	void (*fork)(struct task_struct *task);
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index fea160ee5803..85a868ccb493 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -137,8 +137,6 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 	task_unlock(current);
 }
 
-extern void cpuset_post_attach_flush(void);
-
 #else /* !CONFIG_CPUSETS */
 
 static inline bool cpusets_enabled(void) { return false; }
@@ -245,10 +243,6 @@ static inline bool read_mems_allowed_retry(unsigned int seq)
 	return false;
 }
 
-static inline void cpuset_post_attach_flush(void)
-{
-}
-
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 671dc05c0b0f..909a7d31ffd3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2825,9 +2825,10 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 				    size_t nbytes, loff_t off, bool threadgroup)
 {
 	struct task_struct *tsk;
+	struct cgroup_subsys *ss;
 	struct cgroup *cgrp;
 	pid_t pid;
-	int ret;
+	int ssid, ret;
 
 	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
 		return -EINVAL;
@@ -2875,8 +2876,10 @@ out_unlock_rcu:
 	rcu_read_unlock();
 out_unlock_threadgroup:
 	percpu_up_write(&cgroup_threadgroup_rwsem);
+	for_each_subsys(ss, ssid)
+		if (ss->post_attach)
+			ss->post_attach();
 	cgroup_kn_unlock(of->kn);
-	cpuset_post_attach_flush();
 	return ret ?: nbytes;
 }
 
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 00ab5c2b7c5b..1902956baba1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -58,7 +58,6 @@
 #include <asm/uaccess.h>
 #include <linux/atomic.h>
 #include <linux/mutex.h>
-#include <linux/workqueue.h>
 #include <linux/cgroup.h>
 #include <linux/wait.h>
 
@@ -1016,7 +1015,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 	}
 }
 
-void cpuset_post_attach_flush(void)
+static void cpuset_post_attach(void)
 {
 	flush_workqueue(cpuset_migrate_mm_wq);
 }
@@ -2087,6 +2086,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
 	.can_attach	= cpuset_can_attach,
 	.cancel_attach	= cpuset_cancel_attach,
 	.attach		= cpuset_attach,
+	.post_attach	= cpuset_post_attach,
 	.bind		= cpuset_bind,
 	.legacy_cftypes	= files,
 	.early_init	= true,
-- 
cgit v1.2.3


From bb03ed216370cb021f377f923471e56d1de3ff5d Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Thu, 21 Apr 2016 16:39:17 +0200
Subject: ALSA: hda - Update BCLK also at hotplug for i915 HSW/BDW

The recent bug report suggests that BCLK setup for i915 HSW/BDW needs
to be updated at each HDMI hotplug, not only at initialization and
resume.  That is, we need to update HSW_EM4 and HSW_EM5 registers at
ELD notification, too.  Otherwise the HDMI audio may be out of sync
and played in a wrong pitch.

However, the HDA codec driver has no access to the controller
registers, and currently the code managing these registers is in
hda_intel.c, i.e. local to the controller driver.  For allowing the
explicit BCLK update from the codec driver, as in this patch, the
former haswell_set_bclk() in hda_intel.c is moved to hdac_i915.c and
exposed as snd_hdac_i915_set_bclk().  This is called from both the HDA
controller driver and intel_pin_eld_notify() in HDMI codec driver.

Along with this change, snd_hdac_get_display_clk() gets dropped as
it's no longer used.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91410
Cc: <stable@vger.kernel.org> # v4.5+
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/hda_i915.h   |  5 ++--
 sound/hda/hdac_i915.c      | 62 ++++++++++++++++++++++++++++++++++++++--------
 sound/pci/hda/hda_intel.c  | 56 +++--------------------------------------
 sound/pci/hda/patch_hdmi.c |  1 +
 4 files changed, 58 insertions(+), 66 deletions(-)

(limited to 'include')

diff --git a/include/sound/hda_i915.h b/include/sound/hda_i915.h
index fa341fcb5829..f5842bcd9c94 100644
--- a/include/sound/hda_i915.h
+++ b/include/sound/hda_i915.h
@@ -9,7 +9,7 @@
 #ifdef CONFIG_SND_HDA_I915
 int snd_hdac_set_codec_wakeup(struct hdac_bus *bus, bool enable);
 int snd_hdac_display_power(struct hdac_bus *bus, bool enable);
-int snd_hdac_get_display_clk(struct hdac_bus *bus);
+void snd_hdac_i915_set_bclk(struct hdac_bus *bus);
 int snd_hdac_sync_audio_rate(struct hdac_bus *bus, hda_nid_t nid, int rate);
 int snd_hdac_acomp_get_eld(struct hdac_bus *bus, hda_nid_t nid,
 			   bool *audio_enabled, char *buffer, int max_bytes);
@@ -25,9 +25,8 @@ static inline int snd_hdac_display_power(struct hdac_bus *bus, bool enable)
 {
 	return 0;
 }
-static inline int snd_hdac_get_display_clk(struct hdac_bus *bus)
+static inline void snd_hdac_i915_set_bclk(struct hdac_bus *bus)
 {
-	return 0;
 }
 static inline int snd_hdac_sync_audio_rate(struct hdac_bus *bus, hda_nid_t nid,
 					   int rate)
diff --git a/sound/hda/hdac_i915.c b/sound/hda/hdac_i915.c
index 54babe1c0b16..607bbeaebddf 100644
--- a/sound/hda/hdac_i915.c
+++ b/sound/hda/hdac_i915.c
@@ -20,6 +20,7 @@
 #include <sound/core.h>
 #include <sound/hdaudio.h>
 #include <sound/hda_i915.h>
+#include <sound/hda_register.h>
 
 static struct i915_audio_component *hdac_acomp;
 
@@ -97,26 +98,65 @@ int snd_hdac_display_power(struct hdac_bus *bus, bool enable)
 }
 EXPORT_SYMBOL_GPL(snd_hdac_display_power);
 
+#define CONTROLLER_IN_GPU(pci) (((pci)->device == 0x0a0c) || \
+				((pci)->device == 0x0c0c) || \
+				((pci)->device == 0x0d0c) || \
+				((pci)->device == 0x160c))
+
 /**
- * snd_hdac_get_display_clk - Get CDCLK in kHz
+ * snd_hdac_i915_set_bclk - Reprogram BCLK for HSW/BDW
  * @bus: HDA core bus
  *
- * This function is supposed to be used only by a HD-audio controller
- * driver that needs the interaction with i915 graphics.
+ * Intel HSW/BDW display HDA controller is in GPU. Both its power and link BCLK
+ * depends on GPU. Two Extended Mode registers EM4 (M value) and EM5 (N Value)
+ * are used to convert CDClk (Core Display Clock) to 24MHz BCLK:
+ * BCLK = CDCLK * M / N
+ * The values will be lost when the display power well is disabled and need to
+ * be restored to avoid abnormal playback speed.
  *
- * This function queries CDCLK value in kHz from the graphics driver and
- * returns the value.  A negative code is returned in error.
+ * Call this function at initializing and changing power well, as well as
+ * at ELD notifier for the hotplug.
  */
-int snd_hdac_get_display_clk(struct hdac_bus *bus)
+void snd_hdac_i915_set_bclk(struct hdac_bus *bus)
 {
 	struct i915_audio_component *acomp = bus->audio_component;
+	struct pci_dev *pci = to_pci_dev(bus->dev);
+	int cdclk_freq;
+	unsigned int bclk_m, bclk_n;
+
+	if (!acomp || !acomp->ops || !acomp->ops->get_cdclk_freq)
+		return; /* only for i915 binding */
+	if (!CONTROLLER_IN_GPU(pci))
+		return; /* only HSW/BDW */
+
+	cdclk_freq = acomp->ops->get_cdclk_freq(acomp->dev);
+	switch (cdclk_freq) {
+	case 337500:
+		bclk_m = 16;
+		bclk_n = 225;
+		break;
+
+	case 450000:
+	default: /* default CDCLK 450MHz */
+		bclk_m = 4;
+		bclk_n = 75;
+		break;
+
+	case 540000:
+		bclk_m = 4;
+		bclk_n = 90;
+		break;
+
+	case 675000:
+		bclk_m = 8;
+		bclk_n = 225;
+		break;
+	}
 
-	if (!acomp || !acomp->ops)
-		return -ENODEV;
-
-	return acomp->ops->get_cdclk_freq(acomp->dev);
+	snd_hdac_chip_writew(bus, HSW_EM4, bclk_m);
+	snd_hdac_chip_writew(bus, HSW_EM5, bclk_n);
 }
-EXPORT_SYMBOL_GPL(snd_hdac_get_display_clk);
+EXPORT_SYMBOL_GPL(snd_hdac_i915_set_bclk);
 
 /* There is a fixed mapping between audio pin node and display port
  * on current Intel platforms:
diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c
index 637b8a0e2a91..9a0d1445ca5c 100644
--- a/sound/pci/hda/hda_intel.c
+++ b/sound/pci/hda/hda_intel.c
@@ -857,50 +857,6 @@ static int param_set_xint(const char *val, const struct kernel_param *kp)
 #define azx_del_card_list(chip) /* NOP */
 #endif /* CONFIG_PM */
 
-/* Intel HSW/BDW display HDA controller is in GPU. Both its power and link BCLK
- * depends on GPU. Two Extended Mode registers EM4 (M value) and EM5 (N Value)
- * are used to convert CDClk (Core Display Clock) to 24MHz BCLK:
- * BCLK = CDCLK * M / N
- * The values will be lost when the display power well is disabled and need to
- * be restored to avoid abnormal playback speed.
- */
-static void haswell_set_bclk(struct hda_intel *hda)
-{
-	struct azx *chip = &hda->chip;
-	int cdclk_freq;
-	unsigned int bclk_m, bclk_n;
-
-	if (!hda->need_i915_power)
-		return;
-
-	cdclk_freq = snd_hdac_get_display_clk(azx_bus(chip));
-	switch (cdclk_freq) {
-	case 337500:
-		bclk_m = 16;
-		bclk_n = 225;
-		break;
-
-	case 450000:
-	default: /* default CDCLK 450MHz */
-		bclk_m = 4;
-		bclk_n = 75;
-		break;
-
-	case 540000:
-		bclk_m = 4;
-		bclk_n = 90;
-		break;
-
-	case 675000:
-		bclk_m = 8;
-		bclk_n = 225;
-		break;
-	}
-
-	azx_writew(chip, HSW_EM4, bclk_m);
-	azx_writew(chip, HSW_EM5, bclk_n);
-}
-
 #if defined(CONFIG_PM_SLEEP) || defined(SUPPORT_VGA_SWITCHEROO)
 /*
  * power management
@@ -958,7 +914,7 @@ static int azx_resume(struct device *dev)
 	if (chip->driver_caps & AZX_DCAPS_I915_POWERWELL
 		&& hda->need_i915_power) {
 		snd_hdac_display_power(azx_bus(chip), true);
-		haswell_set_bclk(hda);
+		snd_hdac_i915_set_bclk(azx_bus(chip));
 	}
 	if (chip->msi)
 		if (pci_enable_msi(pci) < 0)
@@ -1058,7 +1014,7 @@ static int azx_runtime_resume(struct device *dev)
 		bus = azx_bus(chip);
 		if (hda->need_i915_power) {
 			snd_hdac_display_power(bus, true);
-			haswell_set_bclk(hda);
+			snd_hdac_i915_set_bclk(bus);
 		} else {
 			/* toggle codec wakeup bit for STATESTS read */
 			snd_hdac_set_codec_wakeup(bus, true);
@@ -1796,12 +1752,8 @@ static int azx_first_init(struct azx *chip)
 	/* initialize chip */
 	azx_init_pci(chip);
 
-	if (chip->driver_caps & AZX_DCAPS_I915_POWERWELL) {
-		struct hda_intel *hda;
-
-		hda = container_of(chip, struct hda_intel, chip);
-		haswell_set_bclk(hda);
-	}
+	if (chip->driver_caps & AZX_DCAPS_I915_POWERWELL)
+		snd_hdac_i915_set_bclk(bus);
 
 	hda_intel_init_chip(chip, (probe_only[dev] & 2) == 0);
 
diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c
index 40933aa33afe..1483f85999ec 100644
--- a/sound/pci/hda/patch_hdmi.c
+++ b/sound/pci/hda/patch_hdmi.c
@@ -2232,6 +2232,7 @@ static void intel_pin_eld_notify(void *audio_ptr, int port)
 	if (atomic_read(&(codec)->core.in_pm))
 		return;
 
+	snd_hdac_i915_set_bclk(&codec->bus->core);
 	check_presence_and_report(codec, pin_nid);
 }
 
-- 
cgit v1.2.3


From 986ef95ecdd3eb6fa29433e68faa94c7624083be Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Thu, 31 Mar 2016 19:03:25 +0300
Subject: IB/mlx5: Expose correct max_sge_rd limit

mlx5 devices (Connect-IB, ConnectX-4, ConnectX-4-LX) has a limitation
where rdma read work queue entries cannot exceed 512 bytes.
A rdma_read wqe needs to fit in 512 bytes:
- wqe control segment (16 bytes)
- rdma segment (16 bytes)
- scatter elements (16 bytes each)

So max_sge_rd should be: (512 - 16 - 16) / 16 = 30.

Cc: linux-stable@vger.kernel.org
Reported-by: Christoph Hellwig <hch@lst.de>
Tested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagig@grimberg.me>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/mlx5/main.c |  2 +-
 include/linux/mlx5/device.h       | 11 +++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 5acf346e048e..049754f6e638 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -530,7 +530,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 		     sizeof(struct mlx5_wqe_ctrl_seg)) /
 		     sizeof(struct mlx5_wqe_data_seg);
 	props->max_sge = min(max_rq_sg, max_sq_sg);
-	props->max_sge_rd = props->max_sge;
+	props->max_sge_rd	   = MLX5_MAX_SGE_RD;
 	props->max_cq		   = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
 	props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1;
 	props->max_mr		   = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 8156e3c9239c..b3575f392492 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -392,6 +392,17 @@ enum {
 	MLX5_CAP_OFF_CMDIF_CSUM		= 46,
 };
 
+enum {
+	/*
+	 * Max wqe size for rdma read is 512 bytes, so this
+	 * limits our max_sge_rd as the wqe needs to fit:
+	 * - ctrl segment (16 bytes)
+	 * - rdma segment (16 bytes)
+	 * - scatter elements (16 bytes each)
+	 */
+	MLX5_MAX_SGE_RD	= (512 - 16 - 16) / 16
+};
+
 struct mlx5_inbox_hdr {
 	__be16		opcode;
 	u8		rsvd[4];
-- 
cgit v1.2.3


From e6bd18f57aad1a2d1ef40e646d03ed0f2515c9e3 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Date: Sun, 10 Apr 2016 19:13:13 -0600
Subject: IB/security: Restrict use of the write() interface

The drivers/infiniband stack uses write() as a replacement for
bi-directional ioctl().  This is not safe. There are ways to
trigger write calls that result in the return structure that
is normally written to user space being shunted off to user
specified kernel memory instead.

For the immediate repair, detect and deny suspicious accesses to
the write API.

For long term, update the user space libraries and the kernel API
to something that doesn't present the same security vulnerabilities
(likely a structured ioctl() interface).

The impacted uAPI interfaces are generally only available if
hardware from drivers/infiniband is installed in the system.

Reported-by: Jann Horn <jann@thejh.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
[ Expanded check to all known write() entry points ]
Cc: stable@vger.kernel.org
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/core/ucm.c            |  4 ++++
 drivers/infiniband/core/ucma.c           |  3 +++
 drivers/infiniband/core/uverbs_main.c    |  5 +++++
 drivers/infiniband/hw/qib/qib_file_ops.c |  5 +++++
 drivers/staging/rdma/hfi1/TODO           |  2 +-
 drivers/staging/rdma/hfi1/file_ops.c     |  6 ++++++
 include/rdma/ib.h                        | 16 ++++++++++++++++
 7 files changed, 40 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
index 4a9aa0433b07..7713ef089c3c 100644
--- a/drivers/infiniband/core/ucm.c
+++ b/drivers/infiniband/core/ucm.c
@@ -48,6 +48,7 @@
 
 #include <asm/uaccess.h>
 
+#include <rdma/ib.h>
 #include <rdma/ib_cm.h>
 #include <rdma/ib_user_cm.h>
 #include <rdma/ib_marshall.h>
@@ -1103,6 +1104,9 @@ static ssize_t ib_ucm_write(struct file *filp, const char __user *buf,
 	struct ib_ucm_cmd_hdr hdr;
 	ssize_t result;
 
+	if (WARN_ON_ONCE(!ib_safe_file_access(filp)))
+		return -EACCES;
+
 	if (len < sizeof(hdr))
 		return -EINVAL;
 
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index dd3bcceadfde..c0f3826abb30 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -1574,6 +1574,9 @@ static ssize_t ucma_write(struct file *filp, const char __user *buf,
 	struct rdma_ucm_cmd_hdr hdr;
 	ssize_t ret;
 
+	if (WARN_ON_ONCE(!ib_safe_file_access(filp)))
+		return -EACCES;
+
 	if (len < sizeof(hdr))
 		return -EINVAL;
 
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 28ba2cc81535..31f422a70623 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -48,6 +48,8 @@
 
 #include <asm/uaccess.h>
 
+#include <rdma/ib.h>
+
 #include "uverbs.h"
 
 MODULE_AUTHOR("Roland Dreier");
@@ -709,6 +711,9 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 	int srcu_key;
 	ssize_t ret;
 
+	if (WARN_ON_ONCE(!ib_safe_file_access(filp)))
+		return -EACCES;
+
 	if (count < sizeof hdr)
 		return -EINVAL;
 
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index e449e394963f..24f4a782e0f4 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -45,6 +45,8 @@
 #include <linux/export.h>
 #include <linux/uio.h>
 
+#include <rdma/ib.h>
+
 #include "qib.h"
 #include "qib_common.h"
 #include "qib_user_sdma.h"
@@ -2067,6 +2069,9 @@ static ssize_t qib_write(struct file *fp, const char __user *data,
 	ssize_t ret = 0;
 	void *dest;
 
+	if (WARN_ON_ONCE(!ib_safe_file_access(fp)))
+		return -EACCES;
+
 	if (count < sizeof(cmd.type)) {
 		ret = -EINVAL;
 		goto bail;
diff --git a/drivers/staging/rdma/hfi1/TODO b/drivers/staging/rdma/hfi1/TODO
index 05de0dad8762..4c6f1d7d2eaf 100644
--- a/drivers/staging/rdma/hfi1/TODO
+++ b/drivers/staging/rdma/hfi1/TODO
@@ -3,4 +3,4 @@ July, 2015
 - Remove unneeded file entries in sysfs
 - Remove software processing of IB protocol and place in library for use
   by qib, ipath (if still present), hfi1, and eventually soft-roce
-
+- Replace incorrect uAPI
diff --git a/drivers/staging/rdma/hfi1/file_ops.c b/drivers/staging/rdma/hfi1/file_ops.c
index 541529589736..c1c5bf82addb 100644
--- a/drivers/staging/rdma/hfi1/file_ops.c
+++ b/drivers/staging/rdma/hfi1/file_ops.c
@@ -49,6 +49,8 @@
 #include <linux/vmalloc.h>
 #include <linux/io.h>
 
+#include <rdma/ib.h>
+
 #include "hfi.h"
 #include "pio.h"
 #include "device.h"
@@ -190,6 +192,10 @@ static ssize_t hfi1_file_write(struct file *fp, const char __user *data,
 	int uctxt_required = 1;
 	int must_be_root = 0;
 
+	/* FIXME: This interface cannot continue out of staging */
+	if (WARN_ON_ONCE(!ib_safe_file_access(fp)))
+		return -EACCES;
+
 	if (count < sizeof(cmd)) {
 		ret = -EINVAL;
 		goto bail;
diff --git a/include/rdma/ib.h b/include/rdma/ib.h
index cf8f9e700e48..a6b93706b0fc 100644
--- a/include/rdma/ib.h
+++ b/include/rdma/ib.h
@@ -34,6 +34,7 @@
 #define _RDMA_IB_H
 
 #include <linux/types.h>
+#include <linux/sched.h>
 
 struct ib_addr {
 	union {
@@ -86,4 +87,19 @@ struct sockaddr_ib {
 	__u64			sib_scope_id;
 };
 
+/*
+ * The IB interfaces that use write() as bi-directional ioctl() are
+ * fundamentally unsafe, since there are lots of ways to trigger "write()"
+ * calls from various contexts with elevated privileges. That includes the
+ * traditional suid executable error message writes, but also various kernel
+ * interfaces that can write to file descriptors.
+ *
+ * This function provides protection for the legacy API by restricting the
+ * calling context.
+ */
+static inline bool ib_safe_file_access(struct file *filp)
+{
+	return filp->f_cred == current_cred() && segment_eq(get_fs(), USER_DS);
+}
+
 #endif /* _RDMA_IB_H */
-- 
cgit v1.2.3


From 7b7483409f09c15f30ac43242ead1ab3061e1f59 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Mon, 25 Apr 2016 15:13:17 -0300
Subject: net: fix net_gso_ok for new GSO types.

Fix casting in net_gso_ok. Otherwise the shift on
gso_type << NETIF_F_GSO_SHIFT may hit the 32th bit and make it look like
a INT_MIN, which is then promoted from signed to uint64 which is
0xffffffff80000000, resulting in wrong behavior when it is and'ed with
the feature itself, as in:

This test app:
#include <stdio.h>
#include <stdint.h>

int main(int argc, char **argv)
{
	uint64_t feature1;
	uint64_t feature2;
	int gso_type = 1 << 15;

	feature1 = gso_type << 16;
	feature2 = (uint64_t)gso_type << 16;
	printf("%lx %lx\n", feature1, feature2);

	return 0;
}

Gives:
ffffffff80000000 80000000

So that this:
   return (features & feature) == feature;
Actually works on more bits than expected and invalid ones.

Fix is to promote it earlier.

Issue noted while rebasing SCTP GSO patch but posting separetely as
someone else may experience this meanwhile.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8395308a2445..b3c46b019ac1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4004,7 +4004,7 @@ netdev_features_t netif_skb_features(struct sk_buff *skb);
 
 static inline bool net_gso_ok(netdev_features_t features, int gso_type)
 {
-	netdev_features_t feature = gso_type << NETIF_F_GSO_SHIFT;
+	netdev_features_t feature = (netdev_features_t)gso_type << NETIF_F_GSO_SHIFT;
 
 	/* check flags correspondence */
 	BUILD_BUG_ON(SKB_GSO_TCPV4   != (NETIF_F_TSO >> NETIF_F_GSO_SHIFT));
-- 
cgit v1.2.3


From 92117d8443bc5afacc8d5ba82e541946310f106e Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 27 Apr 2016 18:56:20 -0700
Subject: bpf: fix refcnt overflow

On a system with >32Gbyte of phyiscal memory and infinite RLIMIT_MEMLOCK,
the malicious application may overflow 32-bit bpf program refcnt.
It's also possible to overflow map refcnt on 1Tb system.
Impose 32k hard limit which means that the same bpf program or
map cannot be shared by more than 32k processes.

Fixes: 1be7f75d1668 ("bpf: enable non-root eBPF programs")
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h   |  3 ++-
 kernel/bpf/inode.c    |  7 ++++---
 kernel/bpf/syscall.c  | 24 ++++++++++++++++++++----
 kernel/bpf/verifier.c | 11 +++++++----
 4 files changed, 33 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 21ee41b92e8a..f1d5c5acc8dd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -171,12 +171,13 @@ void bpf_register_prog_type(struct bpf_prog_type_list *tl);
 void bpf_register_map_type(struct bpf_map_type_list *tl);
 
 struct bpf_prog *bpf_prog_get(u32 ufd);
+struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog);
 void bpf_prog_put(struct bpf_prog *prog);
 void bpf_prog_put_rcu(struct bpf_prog *prog);
 
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
 struct bpf_map *__bpf_map_get(struct fd f);
-void bpf_map_inc(struct bpf_map *map, bool uref);
+struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref);
 void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
 int bpf_map_precharge_memlock(u32 pages);
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index f2ece3c174a5..8f94ca1860cf 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -31,10 +31,10 @@ static void *bpf_any_get(void *raw, enum bpf_type type)
 {
 	switch (type) {
 	case BPF_TYPE_PROG:
-		atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt);
+		raw = bpf_prog_inc(raw);
 		break;
 	case BPF_TYPE_MAP:
-		bpf_map_inc(raw, true);
+		raw = bpf_map_inc(raw, true);
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -297,7 +297,8 @@ static void *bpf_obj_do_get(const struct filename *pathname,
 		goto out;
 
 	raw = bpf_any_get(inode->i_private, *type);
-	touch_atime(&path);
+	if (!IS_ERR(raw))
+		touch_atime(&path);
 
 	path_put(&path);
 	return raw;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index adc5e4bd74f8..cf5e9f7ad13a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -218,11 +218,18 @@ struct bpf_map *__bpf_map_get(struct fd f)
 	return f.file->private_data;
 }
 
-void bpf_map_inc(struct bpf_map *map, bool uref)
+/* prog's and map's refcnt limit */
+#define BPF_MAX_REFCNT 32768
+
+struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref)
 {
-	atomic_inc(&map->refcnt);
+	if (atomic_inc_return(&map->refcnt) > BPF_MAX_REFCNT) {
+		atomic_dec(&map->refcnt);
+		return ERR_PTR(-EBUSY);
+	}
 	if (uref)
 		atomic_inc(&map->usercnt);
+	return map;
 }
 
 struct bpf_map *bpf_map_get_with_uref(u32 ufd)
@@ -234,7 +241,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
 	if (IS_ERR(map))
 		return map;
 
-	bpf_map_inc(map, true);
+	map = bpf_map_inc(map, true);
 	fdput(f);
 
 	return map;
@@ -658,6 +665,15 @@ static struct bpf_prog *__bpf_prog_get(struct fd f)
 	return f.file->private_data;
 }
 
+struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+{
+	if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) {
+		atomic_dec(&prog->aux->refcnt);
+		return ERR_PTR(-EBUSY);
+	}
+	return prog;
+}
+
 /* called by sockets/tracing/seccomp before attaching program to an event
  * pairs with bpf_prog_put()
  */
@@ -670,7 +686,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
 	if (IS_ERR(prog))
 		return prog;
 
-	atomic_inc(&prog->aux->refcnt);
+	prog = bpf_prog_inc(prog);
 	fdput(f);
 
 	return prog;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index db2574e7b8b0..89bcaa0966da 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2049,15 +2049,18 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
 				return -E2BIG;
 			}
 
-			/* remember this map */
-			env->used_maps[env->used_map_cnt++] = map;
-
 			/* hold the map. If the program is rejected by verifier,
 			 * the map will be released by release_maps() or it
 			 * will be used by the valid program until it's unloaded
 			 * and all maps are released in free_bpf_prog_info()
 			 */
-			bpf_map_inc(map, false);
+			map = bpf_map_inc(map, false);
+			if (IS_ERR(map)) {
+				fdput(f);
+				return PTR_ERR(map);
+			}
+			env->used_maps[env->used_map_cnt++] = map;
+
 			fdput(f);
 next_insn:
 			insn++;
-- 
cgit v1.2.3


From 66ee95d16a7f1b7b4f1dd74a2d81c6e19dc29a14 Mon Sep 17 00:00:00 2001
From: Steve Capper <steve.capper@arm.com>
Date: Thu, 28 Apr 2016 16:18:24 -0700
Subject: mm: exclude HugeTLB pages from THP page_mapped() logic

HugeTLB pages cannot be split, so we use the compound_mapcount to track
rmaps.

Currently page_mapped() will check the compound_mapcount, but will also
go through the constituent pages of a THP compound page and query the
individual _mapcount's too.

Unfortunately, page_mapped() does not distinguish between HugeTLB and
THP compound pages and assumes that a compound page always needs to have
HPAGE_PMD_NR pages querying.

For most cases when dealing with HugeTLB this is just inefficient, but
for scenarios where the HugeTLB page size is less than the pmd block
size (e.g.  when using contiguous bit on ARM) this can lead to crashes.

This patch adjusts the page_mapped function such that we skip the
unnecessary THP reference checks for HugeTLB pages.

Fixes: e1534ae95004 ("mm: differentiate page_mapped() from page_mapcount() for compound pages")
Signed-off-by: Steve Capper <steve.capper@arm.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a55e5be0894f..79b6c18d0a38 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1031,6 +1031,8 @@ static inline bool page_mapped(struct page *page)
 	page = compound_head(page);
 	if (atomic_read(compound_mapcount_ptr(page)) >= 0)
 		return true;
+	if (PageHuge(page))
+		return false;
 	for (i = 0; i < hpage_nr_pages(page); i++) {
 		if (atomic_read(&page[i]._mapcount) >= 0)
 			return true;
-- 
cgit v1.2.3


From aa88b68c3b1dce8bc3fd54c8a7372a777ff265cd Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 28 Apr 2016 16:18:27 -0700
Subject: thp: keep huge zero page pinned until tlb flush

Andrea has found[1] a race condition on MMU-gather based TLB flush vs
split_huge_page() or shrinker which frees huge zero under us (patch 1/2
and 2/2 respectively).

With new THP refcounting, we don't need patch 1/2: mmu_gather keeps the
page pinned until flush is complete and the pin prevents the page from
being split under us.

We still need patch 2/2.  This is simplified version of Andrea's patch.
We don't need fancy encoding.

[1] http://lkml.kernel.org/r/1447938052-22165-1-git-send-email-aarcange@redhat.com

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/huge_mm.h | 5 +++++
 mm/huge_memory.c        | 6 +++---
 mm/swap.c               | 5 +++++
 3 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7008623e24b1..d7b9e5346fba 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -152,6 +152,7 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
 }
 
 struct page *get_huge_zero_page(void);
+void put_huge_zero_page(void);
 
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
@@ -208,6 +209,10 @@ static inline bool is_huge_zero_page(struct page *page)
 	return false;
 }
 
+static inline void put_huge_zero_page(void)
+{
+	BUILD_BUG();
+}
 
 static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
 		unsigned long addr, pmd_t *pmd, int flags)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 86f9f8b82f8e..5346de05f471 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -232,7 +232,7 @@ retry:
 	return READ_ONCE(huge_zero_page);
 }
 
-static void put_huge_zero_page(void)
+void put_huge_zero_page(void)
 {
 	/*
 	 * Counter should never go to zero here. Only shrinker can put
@@ -1684,12 +1684,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	if (vma_is_dax(vma)) {
 		spin_unlock(ptl);
 		if (is_huge_zero_pmd(orig_pmd))
-			put_huge_zero_page();
+			tlb_remove_page(tlb, pmd_page(orig_pmd));
 	} else if (is_huge_zero_pmd(orig_pmd)) {
 		pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
 		atomic_long_dec(&tlb->mm->nr_ptes);
 		spin_unlock(ptl);
-		put_huge_zero_page();
+		tlb_remove_page(tlb, pmd_page(orig_pmd));
 	} else {
 		struct page *page = pmd_page(orig_pmd);
 		page_remove_rmap(page, true);
diff --git a/mm/swap.c b/mm/swap.c
index a0bc206b4ac6..03aacbcb013f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -728,6 +728,11 @@ void release_pages(struct page **pages, int nr, bool cold)
 			zone = NULL;
 		}
 
+		if (is_huge_zero_page(page)) {
+			put_huge_zero_page();
+			continue;
+		}
+
 		page = compound_head(page);
 		if (!put_page_testzero(page))
 			continue;
-- 
cgit v1.2.3


From 28093f9f34cedeaea0f481c58446d9dac6dd620f Mon Sep 17 00:00:00 2001
From: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Date: Thu, 28 Apr 2016 16:18:35 -0700
Subject: numa: fix /proc/<pid>/numa_maps for THP

In gather_pte_stats() a THP pmd is cast into a pte, which is wrong
because the layouts may differ depending on the architecture.  On s390
this will lead to inaccurate numa_maps accounting in /proc because of
misguided pte_present() and pte_dirty() checks on the fake pte.

On other architectures pte_present() and pte_dirty() may work by chance,
but there may be an issue with direct-access (dax) mappings w/o
underlying struct pages when HAVE_PTE_SPECIAL is set and THP is
available.  In vm_normal_page() the fake pte will be checked with
pte_special() and because there is no "special" bit in a pmd, this will
always return false and the VM_PFNMAP | VM_MIXEDMAP checking will be
skipped.  On dax mappings w/o struct pages, an invalid struct page
pointer would then be returned that can crash the kernel.

This patch fixes the numa_maps THP handling by introducing new "_pmd"
variants of the can_gather_numa_stats() and vm_normal_page() functions.

Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Cc: <stable@vger.kernel.org>	[4.3+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 33 ++++++++++++++++++++++++++++++---
 include/linux/mm.h |  2 ++
 mm/memory.c        | 40 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 72 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 229cb546bee0..541583510cfb 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1518,6 +1518,32 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
 	return page;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
+					      struct vm_area_struct *vma,
+					      unsigned long addr)
+{
+	struct page *page;
+	int nid;
+
+	if (!pmd_present(pmd))
+		return NULL;
+
+	page = vm_normal_page_pmd(vma, addr, pmd);
+	if (!page)
+		return NULL;
+
+	if (PageReserved(page))
+		return NULL;
+
+	nid = page_to_nid(page);
+	if (!node_isset(nid, node_states[N_MEMORY]))
+		return NULL;
+
+	return page;
+}
+#endif
+
 static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 		unsigned long end, struct mm_walk *walk)
 {
@@ -1527,14 +1553,14 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 	pte_t *orig_pte;
 	pte_t *pte;
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	ptl = pmd_trans_huge_lock(pmd, vma);
 	if (ptl) {
-		pte_t huge_pte = *(pte_t *)pmd;
 		struct page *page;
 
-		page = can_gather_numa_stats(huge_pte, vma, addr);
+		page = can_gather_numa_stats_pmd(*pmd, vma, addr);
 		if (page)
-			gather_stats(page, md, pte_dirty(huge_pte),
+			gather_stats(page, md, pmd_dirty(*pmd),
 				     HPAGE_PMD_SIZE/PAGE_SIZE);
 		spin_unlock(ptl);
 		return 0;
@@ -1542,6 +1568,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 
 	if (pmd_trans_unstable(pmd))
 		return 0;
+#endif
 	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 	do {
 		struct page *page = can_gather_numa_stats(*pte, vma, addr);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 79b6c18d0a38..864d7221de84 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1140,6 +1140,8 @@ struct zap_details {
 
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 		pte_t pte);
+struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
+				pmd_t pmd);
 
 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 		unsigned long size);
diff --git a/mm/memory.c b/mm/memory.c
index 93897f23cc11..305537fc8640 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -789,6 +789,46 @@ out:
 	return pfn_to_page(pfn);
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
+				pmd_t pmd)
+{
+	unsigned long pfn = pmd_pfn(pmd);
+
+	/*
+	 * There is no pmd_special() but there may be special pmds, e.g.
+	 * in a direct-access (dax) mapping, so let's just replicate the
+	 * !HAVE_PTE_SPECIAL case from vm_normal_page() here.
+	 */
+	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+		if (vma->vm_flags & VM_MIXEDMAP) {
+			if (!pfn_valid(pfn))
+				return NULL;
+			goto out;
+		} else {
+			unsigned long off;
+			off = (addr - vma->vm_start) >> PAGE_SHIFT;
+			if (pfn == vma->vm_pgoff + off)
+				return NULL;
+			if (!is_cow_mapping(vma->vm_flags))
+				return NULL;
+		}
+	}
+
+	if (is_zero_pfn(pfn))
+		return NULL;
+	if (unlikely(pfn > highest_memmap_pfn))
+		return NULL;
+
+	/*
+	 * NOTE! We still have PageReserved() pages in the page tables.
+	 * eg. VDSO mappings can cause them to exist.
+	 */
+out:
+	return pfn_to_page(pfn);
+}
+#endif
+
 /*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
-- 
cgit v1.2.3


From 2c94b53738549d81dc7464a32117d1f5112c64d3 Mon Sep 17 00:00:00 2001
From: Tim Bingham <tbingham@akamai.com>
Date: Fri, 29 Apr 2016 13:30:23 -0400
Subject: net: Implement net_dbg_ratelimited() for CONFIG_DYNAMIC_DEBUG case

Prior to commit d92cff89a0c8 ("net_dbg_ratelimited: turn into no-op
when !DEBUG") the implementation of net_dbg_ratelimited() was buggy
for both the DEBUG and CONFIG_DYNAMIC_DEBUG cases.

The bug was that net_ratelimit() was being called and, despite
returning true, nothing was being printed to the console. This
resulted in messages like the following -

"net_ratelimit: %d callbacks suppressed"

with no other output nearby.

After commit d92cff89a0c8 ("net_dbg_ratelimited: turn into no-op when
!DEBUG") the bug is fixed for the DEBUG case. However, there's no
output at all for CONFIG_DYNAMIC_DEBUG case.

This patch restores debug output (if enabled) for the
CONFIG_DYNAMIC_DEBUG case.

Add a definition of net_dbg_ratelimited() for the CONFIG_DYNAMIC_DEBUG
case. The implementation takes care to check that dynamic debugging is
enabled before calling net_ratelimit().

Fixes: d92cff89a0c8 ("net_dbg_ratelimited: turn into no-op when !DEBUG")
Signed-off-by: Tim Bingham <tbingham@akamai.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/net.h b/include/linux/net.h
index 49175e4ced11..f840d77c6c31 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -246,7 +246,15 @@ do {								\
 	net_ratelimited_function(pr_warn, fmt, ##__VA_ARGS__)
 #define net_info_ratelimited(fmt, ...)				\
 	net_ratelimited_function(pr_info, fmt, ##__VA_ARGS__)
-#if defined(DEBUG)
+#if defined(CONFIG_DYNAMIC_DEBUG)
+#define net_dbg_ratelimited(fmt, ...)					\
+do {									\
+	DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt);			\
+	if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT) &&	\
+	    net_ratelimit())						\
+		__dynamic_pr_debug(&descriptor, fmt, ##__VA_ARGS__);	\
+} while (0)
+#elif defined(DEBUG)
 #define net_dbg_ratelimited(fmt, ...)				\
 	net_ratelimited_function(pr_debug, fmt, ##__VA_ARGS__)
 #else
-- 
cgit v1.2.3


From 689de1d6ca95b3b5bd8ee446863bf81a4883ea25 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 2 May 2016 12:46:42 -0700
Subject: Minimal fix-up of bad hashing behavior of hash_64()

This is a fairly minimal fixup to the horribly bad behavior of hash_64()
with certain input patterns.

In particular, because the multiplicative value used for the 64-bit hash
was intentionally bit-sparse (so that the multiply could be done with
shifts and adds on architectures without hardware multipliers), some
bits did not get spread out very much.  In particular, certain fairly
common bit ranges in the input (roughly bits 12-20: commonly with the
most information in them when you hash things like byte offsets in files
or memory that have block factors that mean that the low bits are often
zero) would not necessarily show up much in the result.

There's a bigger patch-series brewing to fix up things more completely,
but this is the fairly minimal fix for the 64-bit hashing problem.  It
simply picks a much better constant multiplier, spreading the bits out a
lot better.

NOTE! For 32-bit architectures, the bad old hash_64() remains the same
for now, since 64-bit multiplies are expensive.  The bigger hashing
cleanup will replace the 32-bit case with something better.

The new constants were picked by George Spelvin who wrote that bigger
cleanup series.  I just picked out the constants and part of the comment
from that series.

Cc: stable@vger.kernel.org
Cc: George Spelvin <linux@horizon.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hash.h | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/hash.h b/include/linux/hash.h
index 1afde47e1528..79c52fa81cac 100644
--- a/include/linux/hash.h
+++ b/include/linux/hash.h
@@ -32,12 +32,28 @@
 #error Wordsize not 32 or 64
 #endif
 
+/*
+ * The above primes are actively bad for hashing, since they are
+ * too sparse. The 32-bit one is mostly ok, the 64-bit one causes
+ * real problems. Besides, the "prime" part is pointless for the
+ * multiplicative hash.
+ *
+ * Although a random odd number will do, it turns out that the golden
+ * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice
+ * properties.
+ *
+ * These are the negative, (1 - phi) = (phi^2) = (3 - sqrt(5))/2.
+ * (See Knuth vol 3, section 6.4, exercise 9.)
+ */
+#define GOLDEN_RATIO_32 0x61C88647
+#define GOLDEN_RATIO_64 0x61C8864680B583EBull
+
 static __always_inline u64 hash_64(u64 val, unsigned int bits)
 {
 	u64 hash = val;
 
-#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64
-	hash = hash * GOLDEN_RATIO_PRIME_64;
+#if BITS_PER_LONG == 64
+	hash = hash * GOLDEN_RATIO_64;
 #else
 	/*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
 	u64 n = hash;
-- 
cgit v1.2.3


From af67eb9e7e1ab37880459f83153d34b3c42b0075 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Mon, 2 May 2016 09:25:16 -0700
Subject: vxlan: Add checksum check to the features check function

We need to perform an additional check on the inner headers to determine if
we can offload the checksum for them.  Previously this check didn't occur
so we would generate an invalid frame in the case of an IPv6 header
encapsulated inside of an IPv4 tunnel.  To fix this I added a secondary
check to vxlan_features_check so that we can verify that we can offload the
inner checksum.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_ether.h | 5 +++++
 include/net/vxlan.h      | 4 +++-
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h
index d5569734f672..548fd535fd02 100644
--- a/include/linux/if_ether.h
+++ b/include/linux/if_ether.h
@@ -28,6 +28,11 @@ static inline struct ethhdr *eth_hdr(const struct sk_buff *skb)
 	return (struct ethhdr *)skb_mac_header(skb);
 }
 
+static inline struct ethhdr *inner_eth_hdr(const struct sk_buff *skb)
+{
+	return (struct ethhdr *)skb_inner_mac_header(skb);
+}
+
 int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr);
 
 extern ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len);
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 73ed2e951c02..35437c779da8 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -252,7 +252,9 @@ static inline netdev_features_t vxlan_features_check(struct sk_buff *skb,
 	    (skb->inner_protocol_type != ENCAP_TYPE_ETHER ||
 	     skb->inner_protocol != htons(ETH_P_TEB) ||
 	     (skb_inner_mac_header(skb) - skb_transport_header(skb) !=
-	      sizeof(struct udphdr) + sizeof(struct vxlanhdr))))
+	      sizeof(struct udphdr) + sizeof(struct vxlanhdr)) ||
+	     (skb->ip_summed != CHECKSUM_NONE &&
+	      !can_checksum_protocol(features, inner_eth_hdr(skb)->h_proto))))
 		return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
 
 	return features;
-- 
cgit v1.2.3